Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 mm updates from Ingo Molnar:
 "Lots of changes in this cycle:

   - Lots of CPA (change page attribute) optimizations and related
     cleanups (Thomas Gleixner, Peter Zijlstra)

   - Make lazy TLB mode even lazier (Rik van Riel)

   - Fault handler cleanups and improvements (Dave Hansen)

   - kdump, vmcore: Enable kdumping encrypted memory with AMD SME
     enabled (Lianbo Jiang)

   - Clean up VM layout documentation (Baoquan He, Ingo Molnar)

   - ... plus misc other fixes and enhancements"

* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (51 commits)
  x86/stackprotector: Remove the call to boot_init_stack_canary() from cpu_startup_entry()
  x86/mm: Kill stray kernel fault handling comment
  x86/mm: Do not warn about PCI BIOS W+X mappings
  resource: Clean it up a bit
  resource: Fix find_next_iomem_res() iteration issue
  resource: Include resource end in walk_*() interfaces
  x86/kexec: Correct KEXEC_BACKUP_SRC_END off-by-one error
  x86/mm: Remove spurious fault pkey check
  x86/mm/vsyscall: Consider vsyscall page part of user address space
  x86/mm: Add vsyscall address helper
  x86/mm: Fix exception table comments
  x86/mm: Add clarifying comments for user addr space
  x86/mm: Break out user address space handling
  x86/mm: Break out kernel address space handling
  x86/mm: Clarify hardware vs. software "error_code"
  x86/mm/tlb: Make lazy TLB mode lazier
  x86/mm/tlb: Add freed_tables element to flush_tlb_info
  x86/mm/tlb: Add freed_tables argument to flush_tlb_mm_range
  smp,cpumask: introduce on_each_cpu_cond_mask
  smp: use __cpumask_set_cpu in on_each_cpu_cond
  ...
commit 99792e0cea
|
@ -1,55 +1,124 @@
|
|||
====================================================
|
||||
Complete virtual memory map with 4-level page tables
|
||||
====================================================
|
||||
|
||||
Virtual memory map with 4 level page tables:
|
||||
Notes:
|
||||
|
||||
0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm
|
||||
hole caused by [47:63] sign extension
|
||||
ffff800000000000 - ffff87ffffffffff (=43 bits) guard hole, reserved for hypervisor
|
||||
ffff880000000000 - ffffc7ffffffffff (=64 TB) direct mapping of all phys. memory
|
||||
ffffc80000000000 - ffffc8ffffffffff (=40 bits) hole
|
||||
ffffc90000000000 - ffffe8ffffffffff (=45 bits) vmalloc/ioremap space
|
||||
ffffe90000000000 - ffffe9ffffffffff (=40 bits) hole
|
||||
ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
|
||||
... unused hole ...
|
||||
ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
|
||||
... unused hole ...
|
||||
vaddr_end for KASLR
|
||||
fffffe0000000000 - fffffe7fffffffff (=39 bits) cpu_entry_area mapping
|
||||
fffffe8000000000 - fffffeffffffffff (=39 bits) LDT remap for PTI
|
||||
ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
|
||||
... unused hole ...
|
||||
ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
|
||||
... unused hole ...
|
||||
ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0
|
||||
ffffffffa0000000 - fffffffffeffffff (1520 MB) module mapping space
|
||||
[fixmap start] - ffffffffff5fffff kernel-internal fixmap range
|
||||
ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI
|
||||
ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
|
||||
- Negative addresses such as "-23 TB" are absolute addresses in bytes, counted down
|
||||
from the top of the 64-bit address space. It's easier to understand the layout
|
||||
when seen both in absolute addresses and in distance-from-top notation.
|
||||
|
||||
Virtual memory map with 5 level page tables:
|
||||
For example 0xffffe90000000000 == -23 TB, it's 23 TB lower than the top of the
|
||||
64-bit address space (ffffffffffffffff).
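As an illustration, the distance-from-top form is just the address reinterpreted
as a signed 64-bit value. A minimal standalone user-space sketch (not part of the
kernel sources) that prints both forms:

        #include <stdio.h>
        #include <stdint.h>

        int main(void)
        {
                uint64_t vaddr = 0xffffe90000000000ULL;
                /* Read the address as a signed distance below 2^64 (the "top"). */
                int64_t offset = (int64_t)vaddr;

                printf("%#llx == %lld TB from the top\n",
                       (unsigned long long)vaddr,
                       (long long)(offset / (1LL << 40)));      /* prints -23 */
                return 0;
        }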
|
||||
|
||||
0000000000000000 - 00ffffffffffffff (=56 bits) user space, different per mm
|
||||
hole caused by [56:63] sign extension
|
||||
ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor
|
||||
ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory
|
||||
ff90000000000000 - ff9fffffffffffff (=52 bits) LDT remap for PTI
|
||||
ffa0000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space (12800 TB)
|
||||
ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole
|
||||
ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
|
||||
... unused hole ...
|
||||
ffdf000000000000 - fffffc0000000000 (=53 bits) kasan shadow memory (8PB)
|
||||
... unused hole ...
|
||||
vaddr_end for KASLR
|
||||
fffffe0000000000 - fffffe7fffffffff (=39 bits) cpu_entry_area mapping
|
||||
... unused hole ...
|
||||
ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
|
||||
... unused hole ...
|
||||
ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
|
||||
... unused hole ...
|
||||
ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0
|
||||
ffffffffa0000000 - fffffffffeffffff (1520 MB) module mapping space
|
||||
[fixmap start] - ffffffffff5fffff kernel-internal fixmap range
|
||||
ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI
|
||||
ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
|
||||
Note that as we get closer to the top of the address space, the notation changes
|
||||
from TB to GB and then MB/KB.
|
||||
|
||||
- "16M TB" might look weird at first sight, but it's an easier to visualize size
|
||||
notation than "16 EB", which few will recognize at first sight as 16 exabytes.
|
||||
It also nicely shows how incredibly large the 64-bit address space is.
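For reference, a tiny standalone sketch (plain user-space arithmetic, not kernel
code) that prints the full 64-bit span in both notations:

        #include <stdio.h>

        int main(void)
        {
                /* 2^64 bytes = 2^24 TB ("16M TB") = 2^4 EB (16 exabytes). */
                printf("64-bit space: %llu TB = %llu EB\n",
                       1ULL << (64 - 40), 1ULL << (64 - 60));
                return 0;
        }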
|
||||
|
||||
========================================================================================================================
|
||||
Start addr | Offset | End addr | Size | VM area description
|
||||
========================================================================================================================
|
||||
| | | |
|
||||
0000000000000000 | 0 | 00007fffffffffff | 128 TB | user-space virtual memory, different per mm
|
||||
__________________|____________|__________________|_________|___________________________________________________________
|
||||
| | | |
|
||||
0000800000000000 | +128 TB | ffff7fffffffffff | ~16M TB | ... huge, almost 64 bits wide hole of non-canonical
|
||||
| | | | virtual memory addresses up to the -128 TB
|
||||
| | | | starting offset of kernel mappings.
|
||||
__________________|____________|__________________|_________|___________________________________________________________
|
||||
|
|
||||
| Kernel-space virtual memory, shared between all processes:
|
||||
____________________________________________________________|___________________________________________________________
|
||||
| | | |
|
||||
ffff800000000000 | -128 TB | ffff87ffffffffff | 8 TB | ... guard hole, also reserved for hypervisor
|
||||
ffff880000000000 | -120 TB | ffffc7ffffffffff | 64 TB | direct mapping of all physical memory (page_offset_base)
|
||||
ffffc80000000000 | -56 TB | ffffc8ffffffffff | 1 TB | ... unused hole
|
||||
ffffc90000000000 | -55 TB | ffffe8ffffffffff | 32 TB | vmalloc/ioremap space (vmalloc_base)
|
||||
ffffe90000000000 | -23 TB | ffffe9ffffffffff | 1 TB | ... unused hole
|
||||
ffffea0000000000 | -22 TB | ffffeaffffffffff | 1 TB | virtual memory map (vmemmap_base)
|
||||
ffffeb0000000000 | -21 TB | ffffebffffffffff | 1 TB | ... unused hole
|
||||
ffffec0000000000 | -20 TB | fffffbffffffffff | 16 TB | KASAN shadow memory
|
||||
fffffc0000000000 | -4 TB | fffffdffffffffff | 2 TB | ... unused hole
|
||||
| | | | vaddr_end for KASLR
|
||||
fffffe0000000000 | -2 TB | fffffe7fffffffff | 0.5 TB | cpu_entry_area mapping
|
||||
fffffe8000000000 | -1.5 TB | fffffeffffffffff | 0.5 TB | LDT remap for PTI
|
||||
ffffff0000000000 | -1 TB | ffffff7fffffffff | 0.5 TB | %esp fixup stacks
|
||||
__________________|____________|__________________|_________|____________________________________________________________
|
||||
|
|
||||
| Identical layout to the 47-bit one from here on:
|
||||
____________________________________________________________|____________________________________________________________
|
||||
| | | |
|
||||
ffffff8000000000 | -512 GB | ffffffeeffffffff | 444 GB | ... unused hole
|
||||
ffffffef00000000 | -68 GB | fffffffeffffffff | 64 GB | EFI region mapping space
|
||||
ffffffff00000000 | -4 GB | ffffffff7fffffff | 2 GB | ... unused hole
|
||||
ffffffff80000000 | -2 GB | ffffffff9fffffff | 512 MB | kernel text mapping, mapped to physical address 0
|
||||
ffffffff80000000 |-2048 MB | | |
|
||||
ffffffffa0000000 |-1536 MB | fffffffffeffffff | 1520 MB | module mapping space
|
||||
ffffffffff000000 | -16 MB | | |
|
||||
FIXADDR_START | ~-11 MB | ffffffffff5fffff | ~0.5 MB | kernel-internal fixmap range, variable size and offset
|
||||
ffffffffff600000 | -10 MB | ffffffffff600fff | 4 kB | legacy vsyscall ABI
|
||||
ffffffffffe00000 | -2 MB | ffffffffffffffff | 2 MB | ... unused hole
|
||||
__________________|____________|__________________|_________|___________________________________________________________
|
||||
|
||||
|
||||
====================================================
|
||||
Complete virtual memory map with 5-level page tables
|
||||
====================================================
|
||||
|
||||
Notes:
|
||||
|
||||
- With 56-bit addresses, user-space memory gets expanded by a factor of 512x,
|
||||
from 0.125 PB to 64 PB. All kernel mappings shift down to the -64 PB starting
|
||||
offset and many of the regions expand to support the much larger physical
|
||||
memory supported.
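A quick standalone check of those numbers (user-space arithmetic only, not
kernel code):

        #include <stdio.h>

        int main(void)
        {
                unsigned long long user47 = 1ULL << 47;  /* 4-level user space */
                unsigned long long user56 = 1ULL << 56;  /* 5-level user space */

                /* prints: 128 TB -> 64 PB, factor 512 */
                printf("%llu TB -> %llu PB, factor %llu\n",
                       user47 >> 40, user56 >> 50, user56 / user47);
                return 0;
        }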
|
||||
|
||||
========================================================================================================================
|
||||
Start addr | Offset | End addr | Size | VM area description
|
||||
========================================================================================================================
|
||||
| | | |
|
||||
0000000000000000 | 0 | 00ffffffffffffff | 64 PB | user-space virtual memory, different per mm
|
||||
__________________|____________|__________________|_________|___________________________________________________________
|
||||
| | | |
|
||||
0000800000000000 | +64 PB | ffff7fffffffffff | ~16K PB | ... huge, still almost 64 bits wide hole of non-canonical
|
||||
| | | | virtual memory addresses up to the -64 PB
|
||||
| | | | starting offset of kernel mappings.
|
||||
__________________|____________|__________________|_________|___________________________________________________________
|
||||
|
|
||||
| Kernel-space virtual memory, shared between all processes:
|
||||
____________________________________________________________|___________________________________________________________
|
||||
| | | |
|
||||
ff00000000000000 | -64 PB | ff0fffffffffffff | 4 PB | ... guard hole, also reserved for hypervisor
|
||||
ff10000000000000 | -60 PB | ff8fffffffffffff | 32 PB | direct mapping of all physical memory (page_offset_base)
|
||||
ff90000000000000 | -28 PB | ff9fffffffffffff | 4 PB | LDT remap for PTI
|
||||
ffa0000000000000 | -24 PB | ffd1ffffffffffff | 12.5 PB | vmalloc/ioremap space (vmalloc_base)
|
||||
ffd2000000000000 | -11.5 PB | ffd3ffffffffffff | 0.5 PB | ... unused hole
|
||||
ffd4000000000000 | -11 PB | ffd5ffffffffffff | 0.5 PB | virtual memory map (vmemmap_base)
|
||||
ffd6000000000000 | -10.5 PB | ffdeffffffffffff | 2.25 PB | ... unused hole
|
||||
ffdf000000000000 | -8.25 PB | fffffdffffffffff | ~8 PB | KASAN shadow memory
|
||||
fffffc0000000000 | -4 TB | fffffdffffffffff | 2 TB | ... unused hole
|
||||
| | | | vaddr_end for KASLR
|
||||
fffffe0000000000 | -2 TB | fffffe7fffffffff | 0.5 TB | cpu_entry_area mapping
|
||||
fffffe8000000000 | -1.5 TB | fffffeffffffffff | 0.5 TB | ... unused hole
|
||||
ffffff0000000000 | -1 TB | ffffff7fffffffff | 0.5 TB | %esp fixup stacks
|
||||
__________________|____________|__________________|_________|____________________________________________________________
|
||||
|
|
||||
| Identical layout to the 47-bit one from here on:
|
||||
____________________________________________________________|____________________________________________________________
|
||||
| | | |
|
||||
ffffff8000000000 | -512 GB | ffffffeeffffffff | 444 GB | ... unused hole
|
||||
ffffffef00000000 | -68 GB | fffffffeffffffff | 64 GB | EFI region mapping space
|
||||
ffffffff00000000 | -4 GB | ffffffff7fffffff | 2 GB | ... unused hole
|
||||
ffffffff80000000 | -2 GB | ffffffff9fffffff | 512 MB | kernel text mapping, mapped to physical address 0
|
||||
ffffffff80000000 |-2048 MB | | |
|
||||
ffffffffa0000000 |-1536 MB | fffffffffeffffff | 1520 MB | module mapping space
|
||||
ffffffffff000000 | -16 MB | | |
|
||||
FIXADDR_START | ~-11 MB | ffffffffff5fffff | ~0.5 MB | kernel-internal fixmap range, variable size and offset
|
||||
ffffffffff600000 | -10 MB | ffffffffff600fff | 4 kB | legacy vsyscall ABI
|
||||
ffffffffffe00000 | -2 MB | ffffffffffffffff | 2 MB | ... unused hole
|
||||
__________________|____________|__________________|_________|___________________________________________________________
|
||||
|
||||
Architecture defines a 64-bit virtual address. Implementations can support
|
||||
less. Currently supported are 48- and 57-bit virtual addresses. Bits 63
through to the most-significant implemented bit are sign extended.
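A minimal sketch of the resulting canonicality rule (standalone code; the
is_canonical() helper and the va_bits parameter are illustrative names, not
kernel interfaces): an address is canonical when every bit above the
most-significant implemented bit is a copy of that bit.

        #include <stdbool.h>
        #include <stdint.h>

        /* va_bits is 48 for 4-level paging, 57 for 5-level paging. */
        static bool is_canonical(uint64_t vaddr, unsigned int va_bits)
        {
                uint64_t top = vaddr >> (va_bits - 1);  /* bit (va_bits-1) and above */

                return top == 0 || top == (UINT64_MAX >> (va_bits - 1));
        }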
@ -1487,6 +1487,14 @@ config X86_DIRECT_GBPAGES
|
|||
supports them), so don't confuse the user by printing
|
||||
that we have them enabled.
|
||||
|
||||
config X86_CPA_STATISTICS
|
||||
bool "Enable statistic for Change Page Attribute"
|
||||
depends on DEBUG_FS
|
||||
---help---
|
||||
Expose statistics about the Change Page Attribute mechanism, which
|
||||
helps to determine the effectiveness of preserving large and huge
|
||||
page mappings when mapping protections are changed.
|
||||
|
||||
config ARCH_HAS_MEM_ENCRYPT
|
||||
def_bool y
|
||||
@ -187,11 +187,12 @@ extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size)
|
|||
#define ioremap_nocache ioremap_nocache
|
||||
extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size);
|
||||
#define ioremap_uc ioremap_uc
|
||||
|
||||
extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
|
||||
#define ioremap_cache ioremap_cache
|
||||
extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, unsigned long prot_val);
|
||||
#define ioremap_prot ioremap_prot
|
||||
extern void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size);
|
||||
#define ioremap_encrypted ioremap_encrypted
|
||||
|
||||
/**
|
||||
* ioremap - map bus memory into CPU space
@ -67,7 +67,7 @@ struct kimage;
|
|||
|
||||
/* Memory to backup during crash kdump */
|
||||
#define KEXEC_BACKUP_SRC_START (0UL)
|
||||
#define KEXEC_BACKUP_SRC_END (640 * 1024UL) /* 640K */
|
||||
#define KEXEC_BACKUP_SRC_END (640 * 1024UL - 1) /* 640K */
|
||||
|
||||
/*
|
||||
* CPU does not save ss and sp on stack if execution is already
@ -59,13 +59,16 @@
|
|||
#endif
|
||||
|
||||
/*
|
||||
* Kernel image size is limited to 1GiB due to the fixmap living in the
|
||||
* next 1GiB (see level2_kernel_pgt in arch/x86/kernel/head_64.S). Use
|
||||
* 512MiB by default, leaving 1.5GiB for modules once the page tables
|
||||
* are fully set up. If kernel ASLR is configured, it can extend the
|
||||
* kernel page table mapping, reducing the size of the modules area.
|
||||
* Maximum kernel image size is limited to 1 GiB, due to the fixmap living
|
||||
* in the next 1 GiB (see level2_kernel_pgt in arch/x86/kernel/head_64.S).
|
||||
*
|
||||
* On KASLR use 1 GiB by default, leaving 1 GiB for modules once the
|
||||
* page tables are fully set up.
|
||||
*
|
||||
* If KASLR is disabled we can shrink it to 0.5 GiB and increase the size
|
||||
* of the modules area to 1.5 GiB.
|
||||
*/
|
||||
#if defined(CONFIG_RANDOMIZE_BASE)
|
||||
#ifdef CONFIG_RANDOMIZE_BASE
|
||||
#define KERNEL_IMAGE_SIZE (1024 * 1024 * 1024)
|
||||
#else
|
||||
#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024)
|
@ -6,16 +6,23 @@
|
|||
#define tlb_end_vma(tlb, vma) do { } while (0)
|
||||
#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
|
||||
|
||||
#define tlb_flush(tlb) \
|
||||
{ \
|
||||
if (!tlb->fullmm && !tlb->need_flush_all) \
|
||||
flush_tlb_mm_range(tlb->mm, tlb->start, tlb->end, 0UL); \
|
||||
else \
|
||||
flush_tlb_mm_range(tlb->mm, 0UL, TLB_FLUSH_ALL, 0UL); \
|
||||
}
|
||||
static inline void tlb_flush(struct mmu_gather *tlb);
|
||||
|
||||
#include <asm-generic/tlb.h>
|
||||
|
||||
static inline void tlb_flush(struct mmu_gather *tlb)
|
||||
{
|
||||
unsigned long start = 0UL, end = TLB_FLUSH_ALL;
|
||||
unsigned int stride_shift = tlb_get_unmap_shift(tlb);
|
||||
|
||||
if (!tlb->fullmm && !tlb->need_flush_all) {
|
||||
start = tlb->start;
|
||||
end = tlb->end;
|
||||
}
|
||||
|
||||
flush_tlb_mm_range(tlb->mm, start, end, stride_shift, tlb->freed_tables);
|
||||
}
|
||||
|
||||
/*
|
||||
* While x86 architecture in general requires an IPI to perform TLB
|
||||
* shootdown, enablement code for several hypervisors overrides
|
||||
@ -148,22 +148,6 @@ static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
|
|||
#define __flush_tlb_one_user(addr) __native_flush_tlb_one_user(addr)
|
||||
#endif
|
||||
|
||||
static inline bool tlb_defer_switch_to_init_mm(void)
|
||||
{
|
||||
/*
|
||||
* If we have PCID, then switching to init_mm is reasonably
|
||||
* fast. If we don't have PCID, then switching to init_mm is
|
||||
* quite slow, so we try to defer it in the hopes that we can
|
||||
* avoid it entirely. The latter approach runs the risk of
|
||||
* receiving otherwise unnecessary IPIs.
|
||||
*
|
||||
* This choice is just a heuristic. The tlb code can handle this
|
||||
* function returning true or false regardless of whether we have
|
||||
* PCID.
|
||||
*/
|
||||
return !static_cpu_has(X86_FEATURE_PCID);
|
||||
}
|
||||
|
||||
struct tlb_context {
|
||||
u64 ctx_id;
|
||||
u64 tlb_gen;
|
||||
|
@ -547,23 +531,30 @@ struct flush_tlb_info {
|
|||
unsigned long start;
|
||||
unsigned long end;
|
||||
u64 new_tlb_gen;
|
||||
unsigned int stride_shift;
|
||||
bool freed_tables;
|
||||
};
|
||||
|
||||
#define local_flush_tlb() __flush_tlb()
|
||||
|
||||
#define flush_tlb_mm(mm) flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL)
|
||||
#define flush_tlb_mm(mm) \
|
||||
flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL, true)
|
||||
|
||||
#define flush_tlb_range(vma, start, end) \
|
||||
flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags)
|
||||
#define flush_tlb_range(vma, start, end) \
|
||||
flush_tlb_mm_range((vma)->vm_mm, start, end, \
|
||||
((vma)->vm_flags & VM_HUGETLB) \
|
||||
? huge_page_shift(hstate_vma(vma)) \
|
||||
: PAGE_SHIFT, false)
|
||||
|
||||
extern void flush_tlb_all(void);
|
||||
extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
|
||||
unsigned long end, unsigned long vmflag);
|
||||
unsigned long end, unsigned int stride_shift,
|
||||
bool freed_tables);
|
||||
extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);
|
||||
|
||||
static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
|
||||
{
|
||||
flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, VM_NONE);
|
||||
flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, PAGE_SHIFT, false);
|
||||
}
|
||||
|
||||
void native_flush_tlb_others(const struct cpumask *cpumask,
|
||||
@ -11,8 +11,38 @@
|
|||
#include <linux/uaccess.h>
|
||||
#include <linux/io.h>
|
||||
|
||||
static ssize_t __copy_oldmem_page(unsigned long pfn, char *buf, size_t csize,
|
||||
unsigned long offset, int userbuf,
|
||||
bool encrypted)
|
||||
{
|
||||
void *vaddr;
|
||||
|
||||
if (!csize)
|
||||
return 0;
|
||||
|
||||
if (encrypted)
|
||||
vaddr = (__force void *)ioremap_encrypted(pfn << PAGE_SHIFT, PAGE_SIZE);
|
||||
else
|
||||
vaddr = (__force void *)ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE);
|
||||
|
||||
if (!vaddr)
|
||||
return -ENOMEM;
|
||||
|
||||
if (userbuf) {
|
||||
if (copy_to_user((void __user *)buf, vaddr + offset, csize)) {
|
||||
iounmap((void __iomem *)vaddr);
|
||||
return -EFAULT;
|
||||
}
|
||||
} else
|
||||
memcpy(buf, vaddr + offset, csize);
|
||||
|
||||
set_iounmap_nonlazy();
|
||||
iounmap((void __iomem *)vaddr);
|
||||
return csize;
|
||||
}
|
||||
|
||||
/**
|
||||
* copy_oldmem_page - copy one page from "oldmem"
|
||||
* copy_oldmem_page - copy one page of memory
|
||||
* @pfn: page frame number to be copied
|
||||
* @buf: target memory address for the copy; this can be in kernel address
|
||||
* space or user address space (see @userbuf)
|
||||
|
@ -21,30 +51,22 @@
|
|||
* @userbuf: if set, @buf is in user address space, use copy_to_user(),
|
||||
* otherwise @buf is in kernel address space, use memcpy().
|
||||
*
|
||||
* Copy a page from "oldmem". For this page, there is no pte mapped
|
||||
* in the current kernel. We stitch up a pte, similar to kmap_atomic.
|
||||
* Copy a page from the old kernel's memory. For this page, there is no pte
|
||||
* mapped in the current kernel. We stitch up a pte, similar to kmap_atomic.
|
||||
*/
|
||||
ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
|
||||
size_t csize, unsigned long offset, int userbuf)
|
||||
ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize,
|
||||
unsigned long offset, int userbuf)
|
||||
{
|
||||
void *vaddr;
|
||||
|
||||
if (!csize)
|
||||
return 0;
|
||||
|
||||
vaddr = ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE);
|
||||
if (!vaddr)
|
||||
return -ENOMEM;
|
||||
|
||||
if (userbuf) {
|
||||
if (copy_to_user(buf, vaddr + offset, csize)) {
|
||||
iounmap(vaddr);
|
||||
return -EFAULT;
|
||||
}
|
||||
} else
|
||||
memcpy(buf, vaddr + offset, csize);
|
||||
|
||||
set_iounmap_nonlazy();
|
||||
iounmap(vaddr);
|
||||
return csize;
|
||||
return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* copy_oldmem_page_encrypted - same as copy_oldmem_page() above but ioremap the
|
||||
* memory with the encryption mask set to accommodate kdump on SME-enabled
|
||||
* machines.
|
||||
*/
|
||||
ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize,
|
||||
unsigned long offset, int userbuf)
|
||||
{
|
||||
return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, true);
|
||||
}
|
||||
|
|
|
@ -273,7 +273,7 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
|
|||
map_ldt_struct_to_user(mm);
|
||||
|
||||
va = (unsigned long)ldt_slot_va(slot);
|
||||
flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0);
|
||||
flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, PAGE_SHIFT, false);
|
||||
|
||||
ldt->slot = slot;
|
||||
return 0;
|
||||
|
|
|
@ -199,7 +199,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
|
|||
pte_unmap_unlock(pte, ptl);
|
||||
out:
|
||||
up_write(&mm->mmap_sem);
|
||||
flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, 0UL);
|
||||
flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, PAGE_SHIFT, false);
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -19,7 +19,9 @@
|
|||
#include <linux/sched.h>
|
||||
#include <linux/seq_file.h>
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/pci.h>
|
||||
|
||||
#include <asm/e820/types.h>
|
||||
#include <asm/pgtable.h>
|
||||
|
||||
/*
|
||||
|
@ -241,6 +243,29 @@ static unsigned long normalize_addr(unsigned long u)
|
|||
return (signed long)(u << shift) >> shift;
|
||||
}
|
||||
|
||||
static void note_wx(struct pg_state *st)
|
||||
{
|
||||
unsigned long npages;
|
||||
|
||||
npages = (st->current_address - st->start_address) / PAGE_SIZE;
|
||||
|
||||
#ifdef CONFIG_PCI_BIOS
|
||||
/*
|
||||
* If PCI BIOS is enabled, the PCI BIOS area is forced to WX.
|
||||
* Inform about it, but avoid the warning.
|
||||
*/
|
||||
if (pcibios_enabled && st->start_address >= PAGE_OFFSET + BIOS_BEGIN &&
|
||||
st->current_address <= PAGE_OFFSET + BIOS_END) {
|
||||
pr_warn_once("x86/mm: PCI BIOS W+X mapping %lu pages\n", npages);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
/* Account the WX pages */
|
||||
st->wx_pages += npages;
|
||||
WARN_ONCE(1, "x86/mm: Found insecure W+X mapping at address %pS\n",
|
||||
(void *)st->start_address);
|
||||
}
|
||||
|
||||
/*
|
||||
* This function gets called on a break in a continuous series
|
||||
* of PTE entries; the next one is different so we need to
|
||||
|
@ -276,14 +301,8 @@ static void note_page(struct seq_file *m, struct pg_state *st,
|
|||
unsigned long delta;
|
||||
int width = sizeof(unsigned long) * 2;
|
||||
|
||||
if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX)) {
|
||||
WARN_ONCE(1,
|
||||
"x86/mm: Found insecure W+X mapping at address %p/%pS\n",
|
||||
(void *)st->start_address,
|
||||
(void *)st->start_address);
|
||||
st->wx_pages += (st->current_address -
|
||||
st->start_address) / PAGE_SIZE;
|
||||
}
|
||||
if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX))
|
||||
note_wx(st);
|
||||
|
||||
/*
|
||||
* Now print the actual finished series
|
||||
|
|
|
@ -851,6 +851,15 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
|
|||
show_opcodes(regs, loglvl);
|
||||
}
|
||||
|
||||
/*
|
||||
* The (legacy) vsyscall page is the lone page in the kernel portion
|
||||
* of the address space that has user-accessible permissions.
|
||||
*/
|
||||
static bool is_vsyscall_vaddr(unsigned long vaddr)
|
||||
{
|
||||
return unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR);
|
||||
}
|
||||
|
||||
static void
|
||||
__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
|
||||
unsigned long address, u32 *pkey, int si_code)
|
||||
|
@ -874,18 +883,6 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
|
|||
if (is_errata100(regs, address))
|
||||
return;
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
/*
|
||||
* Instruction fetch faults in the vsyscall page might need
|
||||
* emulation.
|
||||
*/
|
||||
if (unlikely((error_code & X86_PF_INSTR) &&
|
||||
((address & ~0xfff) == VSYSCALL_ADDR))) {
|
||||
if (emulate_vsyscall(regs, address))
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* To avoid leaking information about the kernel page table
|
||||
* layout, pretend that user-mode accesses to kernel addresses
|
||||
|
@ -1043,19 +1040,13 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
|
|||
}
|
||||
}
|
||||
|
||||
static int spurious_fault_check(unsigned long error_code, pte_t *pte)
|
||||
static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte)
|
||||
{
|
||||
if ((error_code & X86_PF_WRITE) && !pte_write(*pte))
|
||||
return 0;
|
||||
|
||||
if ((error_code & X86_PF_INSTR) && !pte_exec(*pte))
|
||||
return 0;
|
||||
/*
|
||||
* Note: We do not do lazy flushing on protection key
|
||||
* changes, so no spurious fault will ever set X86_PF_PK.
|
||||
*/
|
||||
if ((error_code & X86_PF_PK))
|
||||
return 1;
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
@ -1082,7 +1073,7 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)
|
|||
* (Optional Invalidation).
|
||||
*/
|
||||
static noinline int
|
||||
spurious_fault(unsigned long error_code, unsigned long address)
|
||||
spurious_kernel_fault(unsigned long error_code, unsigned long address)
|
||||
{
|
||||
pgd_t *pgd;
|
||||
p4d_t *p4d;
|
||||
|
@ -1113,27 +1104,27 @@ spurious_fault(unsigned long error_code, unsigned long address)
|
|||
return 0;
|
||||
|
||||
if (p4d_large(*p4d))
|
||||
return spurious_fault_check(error_code, (pte_t *) p4d);
|
||||
return spurious_kernel_fault_check(error_code, (pte_t *) p4d);
|
||||
|
||||
pud = pud_offset(p4d, address);
|
||||
if (!pud_present(*pud))
|
||||
return 0;
|
||||
|
||||
if (pud_large(*pud))
|
||||
return spurious_fault_check(error_code, (pte_t *) pud);
|
||||
return spurious_kernel_fault_check(error_code, (pte_t *) pud);
|
||||
|
||||
pmd = pmd_offset(pud, address);
|
||||
if (!pmd_present(*pmd))
|
||||
return 0;
|
||||
|
||||
if (pmd_large(*pmd))
|
||||
return spurious_fault_check(error_code, (pte_t *) pmd);
|
||||
return spurious_kernel_fault_check(error_code, (pte_t *) pmd);
|
||||
|
||||
pte = pte_offset_kernel(pmd, address);
|
||||
if (!pte_present(*pte))
|
||||
return 0;
|
||||
|
||||
ret = spurious_fault_check(error_code, pte);
|
||||
ret = spurious_kernel_fault_check(error_code, pte);
|
||||
if (!ret)
|
||||
return 0;
|
||||
|
||||
|
@ -1141,12 +1132,12 @@ spurious_fault(unsigned long error_code, unsigned long address)
|
|||
* Make sure we have permissions in PMD.
|
||||
* If not, then there's a bug in the page tables:
|
||||
*/
|
||||
ret = spurious_fault_check(error_code, (pte_t *) pmd);
|
||||
ret = spurious_kernel_fault_check(error_code, (pte_t *) pmd);
|
||||
WARN_ONCE(!ret, "PMD has incorrect permission bits\n");
|
||||
|
||||
return ret;
|
||||
}
|
||||
NOKPROBE_SYMBOL(spurious_fault);
|
||||
NOKPROBE_SYMBOL(spurious_kernel_fault);
|
||||
|
||||
int show_unhandled_signals = 1;
|
||||
|
||||
|
@ -1193,6 +1184,14 @@ access_error(unsigned long error_code, struct vm_area_struct *vma)
|
|||
|
||||
static int fault_in_kernel_space(unsigned long address)
|
||||
{
|
||||
/*
|
||||
* On 64-bit systems, the vsyscall page is at an address above
|
||||
* TASK_SIZE_MAX, but is not considered part of the kernel
|
||||
* address space.
|
||||
*/
|
||||
if (IS_ENABLED(CONFIG_X86_64) && is_vsyscall_vaddr(address))
|
||||
return false;
|
||||
|
||||
return address >= TASK_SIZE_MAX;
|
||||
}
|
||||
|
||||
|
@ -1214,14 +1213,71 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs)
|
|||
}
|
||||
|
||||
/*
|
||||
* This routine handles page faults. It determines the address,
|
||||
* and the problem, and then passes it off to one of the appropriate
|
||||
* routines.
|
||||
* Called for all faults where 'address' is part of the kernel address
|
||||
* space. Might get called for faults that originate from *code* that
|
||||
* ran in userspace or the kernel.
|
||||
*/
|
||||
static noinline void
|
||||
__do_page_fault(struct pt_regs *regs, unsigned long error_code,
|
||||
unsigned long address)
|
||||
static void
|
||||
do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
|
||||
unsigned long address)
|
||||
{
|
||||
/*
|
||||
* Protection keys exceptions only happen on user pages. We
|
||||
* have no user pages in the kernel portion of the address
|
||||
* space, so do not expect them here.
|
||||
*/
|
||||
WARN_ON_ONCE(hw_error_code & X86_PF_PK);
|
||||
|
||||
/*
|
||||
* We can fault-in kernel-space virtual memory on-demand. The
|
||||
* 'reference' page table is init_mm.pgd.
|
||||
*
|
||||
* NOTE! We MUST NOT take any locks for this case. We may
|
||||
* be in an interrupt or a critical region, and should
|
||||
* only copy the information from the master page table,
|
||||
* nothing more.
|
||||
*
|
||||
* Before doing this on-demand faulting, ensure that the
|
||||
* fault is not any of the following:
|
||||
* 1. A fault on a PTE with a reserved bit set.
|
||||
* 2. A fault caused by a user-mode access. (Do not demand-
|
||||
* fault kernel memory due to user-mode accesses).
|
||||
* 3. A fault caused by a page-level protection violation.
|
||||
* (A demand fault would be on a non-present page which
|
||||
* would have X86_PF_PROT==0).
|
||||
*/
|
||||
if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
|
||||
if (vmalloc_fault(address) >= 0)
|
||||
return;
|
||||
}
|
||||
|
||||
/* Was the fault spurious, caused by lazy TLB invalidation? */
|
||||
if (spurious_kernel_fault(hw_error_code, address))
|
||||
return;
|
||||
|
||||
/* kprobes don't want to hook the spurious faults: */
|
||||
if (kprobes_fault(regs))
|
||||
return;
|
||||
|
||||
/*
|
||||
* Note, despite being a "bad area", there are quite a few
|
||||
* acceptable reasons to get here, such as erratum fixups
|
||||
* and handling kernel code that can fault, like get_user().
|
||||
*
|
||||
* Don't take the mm semaphore here. If we fixup a prefetch
|
||||
* fault we could otherwise deadlock:
|
||||
*/
|
||||
bad_area_nosemaphore(regs, hw_error_code, address, NULL);
|
||||
}
|
||||
NOKPROBE_SYMBOL(do_kern_addr_fault);
|
||||
|
||||
/* Handle faults in the user portion of the address space */
|
||||
static inline
|
||||
void do_user_addr_fault(struct pt_regs *regs,
|
||||
unsigned long hw_error_code,
|
||||
unsigned long address)
|
||||
{
|
||||
unsigned long sw_error_code;
|
||||
struct vm_area_struct *vma;
|
||||
struct task_struct *tsk;
|
||||
struct mm_struct *mm;
|
||||
|
@ -1232,55 +1288,23 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
|
|||
tsk = current;
|
||||
mm = tsk->mm;
|
||||
|
||||
prefetchw(&mm->mmap_sem);
|
||||
|
||||
if (unlikely(kmmio_fault(regs, address)))
|
||||
return;
|
||||
|
||||
/*
|
||||
* We fault-in kernel-space virtual memory on-demand. The
|
||||
* 'reference' page table is init_mm.pgd.
|
||||
*
|
||||
* NOTE! We MUST NOT take any locks for this case. We may
|
||||
* be in an interrupt or a critical region, and should
|
||||
* only copy the information from the master page table,
|
||||
* nothing more.
|
||||
*
|
||||
* This verifies that the fault happens in kernel space
|
||||
* (error_code & 4) == 0, and that the fault was not a
|
||||
* protection error (error_code & 9) == 0.
|
||||
*/
|
||||
if (unlikely(fault_in_kernel_space(address))) {
|
||||
if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
|
||||
if (vmalloc_fault(address) >= 0)
|
||||
return;
|
||||
}
|
||||
|
||||
/* Can handle a stale RO->RW TLB: */
|
||||
if (spurious_fault(error_code, address))
|
||||
return;
|
||||
|
||||
/* kprobes don't want to hook the spurious faults: */
|
||||
if (kprobes_fault(regs))
|
||||
return;
|
||||
/*
|
||||
* Don't take the mm semaphore here. If we fixup a prefetch
|
||||
* fault we could otherwise deadlock:
|
||||
*/
|
||||
bad_area_nosemaphore(regs, error_code, address, NULL);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
/* kprobes don't want to hook the spurious faults: */
|
||||
if (unlikely(kprobes_fault(regs)))
|
||||
return;
|
||||
|
||||
if (unlikely(error_code & X86_PF_RSVD))
|
||||
pgtable_bad(regs, error_code, address);
|
||||
/*
|
||||
* Reserved bits are never expected to be set on
|
||||
* entries in the user portion of the page tables.
|
||||
*/
|
||||
if (unlikely(hw_error_code & X86_PF_RSVD))
|
||||
pgtable_bad(regs, hw_error_code, address);
|
||||
|
||||
if (unlikely(smap_violation(error_code, regs))) {
|
||||
bad_area_nosemaphore(regs, error_code, address, NULL);
|
||||
/*
|
||||
* Check for invalid kernel (supervisor) access to user
|
||||
* pages in the user address space.
|
||||
*/
|
||||
if (unlikely(smap_violation(hw_error_code, regs))) {
|
||||
bad_area_nosemaphore(regs, hw_error_code, address, NULL);
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -1289,10 +1313,17 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
|
|||
* in a region with pagefaults disabled then we must not take the fault
|
||||
*/
|
||||
if (unlikely(faulthandler_disabled() || !mm)) {
|
||||
bad_area_nosemaphore(regs, error_code, address, NULL);
|
||||
bad_area_nosemaphore(regs, hw_error_code, address, NULL);
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* hw_error_code is literally the "page fault error code" passed to
|
||||
* the kernel directly from the hardware. But, we will shortly be
|
||||
* modifying it in software, so give it a new name.
|
||||
*/
|
||||
sw_error_code = hw_error_code;
|
||||
|
||||
/*
|
||||
* It's safe to allow irq's after cr2 has been saved and the
|
||||
* vmalloc fault has been handled.
|
||||
|
@ -1302,7 +1333,26 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
|
|||
*/
|
||||
if (user_mode(regs)) {
|
||||
local_irq_enable();
|
||||
error_code |= X86_PF_USER;
|
||||
/*
|
||||
* Up to this point, X86_PF_USER set in hw_error_code
|
||||
* indicated a user-mode access. But, after this,
|
||||
* X86_PF_USER in sw_error_code will indicate either
|
||||
* that, *or* an implicit kernel(supervisor)-mode access
|
||||
* which originated from user mode.
|
||||
*/
|
||||
if (!(hw_error_code & X86_PF_USER)) {
|
||||
/*
|
||||
* The CPU was in user mode, but the CPU says
|
||||
* the fault was not a user-mode access.
|
||||
* Must be an implicit kernel-mode access,
|
||||
* which we do not expect to happen in the
|
||||
* user address space.
|
||||
*/
|
||||
pr_warn_once("kernel-mode error from user-mode: %lx\n",
|
||||
hw_error_code);
|
||||
|
||||
sw_error_code |= X86_PF_USER;
|
||||
}
|
||||
flags |= FAULT_FLAG_USER;
|
||||
} else {
|
||||
if (regs->flags & X86_EFLAGS_IF)
|
||||
|
@ -1311,31 +1361,49 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
|
|||
|
||||
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
|
||||
|
||||
if (error_code & X86_PF_WRITE)
|
||||
if (sw_error_code & X86_PF_WRITE)
|
||||
flags |= FAULT_FLAG_WRITE;
|
||||
if (error_code & X86_PF_INSTR)
|
||||
if (sw_error_code & X86_PF_INSTR)
|
||||
flags |= FAULT_FLAG_INSTRUCTION;
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
/*
|
||||
* When running in the kernel we expect faults to occur only to
|
||||
* addresses in user space. All other faults represent errors in
|
||||
* the kernel and should generate an OOPS. Unfortunately, in the
|
||||
* case of an erroneous fault occurring in a code path which already
|
||||
* holds mmap_sem we will deadlock attempting to validate the fault
|
||||
* against the address space. Luckily the kernel only validly
|
||||
* references user space from well defined areas of code, which are
|
||||
* listed in the exceptions table.
|
||||
* Instruction fetch faults in the vsyscall page might need
|
||||
* emulation. The vsyscall page is at a high address
|
||||
* (>PAGE_OFFSET), but is considered to be part of the user
|
||||
* address space.
|
||||
*
|
||||
* As the vast majority of faults will be valid we will only perform
|
||||
* the source reference check when there is a possibility of a
|
||||
* deadlock. Attempt to lock the address space, if we cannot we then
|
||||
* validate the source. If this is invalid we can skip the address
|
||||
* space check, thus avoiding the deadlock:
|
||||
* The vsyscall page does not have a "real" VMA, so do this
|
||||
* emulation before we go searching for VMAs.
|
||||
*/
|
||||
if ((sw_error_code & X86_PF_INSTR) && is_vsyscall_vaddr(address)) {
|
||||
if (emulate_vsyscall(regs, address))
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Kernel-mode access to the user address space should only occur
|
||||
* on well-defined single instructions listed in the exception
|
||||
* tables. But, an erroneous kernel fault occurring outside one of
|
||||
* those areas which also holds mmap_sem might deadlock attempting
|
||||
* to validate the fault against the address space.
|
||||
*
|
||||
* Only do the expensive exception table search when we might be at
|
||||
* risk of a deadlock. This happens if we
|
||||
* 1. Failed to acquire mmap_sem, and
|
||||
* 2. The access did not originate in userspace. Note: either the
|
||||
* hardware or earlier page fault code may set X86_PF_USER
|
||||
* in sw_error_code.
|
||||
*/
|
||||
if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
|
||||
if (!(error_code & X86_PF_USER) &&
|
||||
if (!(sw_error_code & X86_PF_USER) &&
|
||||
!search_exception_tables(regs->ip)) {
|
||||
bad_area_nosemaphore(regs, error_code, address, NULL);
|
||||
/*
|
||||
* Fault from code in kernel from
|
||||
* which we do not expect faults.
|
||||
*/
|
||||
bad_area_nosemaphore(regs, sw_error_code, address, NULL);
|
||||
return;
|
||||
}
|
||||
retry:
|
||||
|
@ -1351,16 +1419,16 @@ retry:
|
|||
|
||||
vma = find_vma(mm, address);
|
||||
if (unlikely(!vma)) {
|
||||
bad_area(regs, error_code, address);
|
||||
bad_area(regs, sw_error_code, address);
|
||||
return;
|
||||
}
|
||||
if (likely(vma->vm_start <= address))
|
||||
goto good_area;
|
||||
if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
|
||||
bad_area(regs, error_code, address);
|
||||
bad_area(regs, sw_error_code, address);
|
||||
return;
|
||||
}
|
||||
if (error_code & X86_PF_USER) {
|
||||
if (sw_error_code & X86_PF_USER) {
|
||||
/*
|
||||
* Accessing the stack below %sp is always a bug.
|
||||
* The large cushion allows instructions like enter
|
||||
|
@ -1368,12 +1436,12 @@ retry:
|
|||
* 32 pointers and then decrements %sp by 65535.)
|
||||
*/
|
||||
if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
|
||||
bad_area(regs, error_code, address);
|
||||
bad_area(regs, sw_error_code, address);
|
||||
return;
|
||||
}
|
||||
}
|
||||
if (unlikely(expand_stack(vma, address))) {
|
||||
bad_area(regs, error_code, address);
|
||||
bad_area(regs, sw_error_code, address);
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -1382,8 +1450,8 @@ retry:
|
|||
* we can handle it..
|
||||
*/
|
||||
good_area:
|
||||
if (unlikely(access_error(error_code, vma))) {
|
||||
bad_area_access_error(regs, error_code, address, vma);
|
||||
if (unlikely(access_error(sw_error_code, vma))) {
|
||||
bad_area_access_error(regs, sw_error_code, address, vma);
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -1425,13 +1493,13 @@ good_area:
|
|||
return;
|
||||
|
||||
/* Not returning to user mode? Handle exceptions or die: */
|
||||
no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
|
||||
no_context(regs, sw_error_code, address, SIGBUS, BUS_ADRERR);
|
||||
return;
|
||||
}
|
||||
|
||||
up_read(&mm->mmap_sem);
|
||||
if (unlikely(fault & VM_FAULT_ERROR)) {
|
||||
mm_fault_error(regs, error_code, address, &pkey, fault);
|
||||
mm_fault_error(regs, sw_error_code, address, &pkey, fault);
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -1449,6 +1517,28 @@ good_area:
|
|||
|
||||
check_v8086_mode(regs, address, tsk);
|
||||
}
|
||||
NOKPROBE_SYMBOL(do_user_addr_fault);
|
||||
|
||||
/*
|
||||
* This routine handles page faults. It determines the address,
|
||||
* and the problem, and then passes it off to one of the appropriate
|
||||
* routines.
|
||||
*/
|
||||
static noinline void
|
||||
__do_page_fault(struct pt_regs *regs, unsigned long hw_error_code,
|
||||
unsigned long address)
|
||||
{
|
||||
prefetchw(&current->mm->mmap_sem);
|
||||
|
||||
if (unlikely(kmmio_fault(regs, address)))
|
||||
return;
|
||||
|
||||
/* Was the fault on kernel-controlled part of the address space? */
|
||||
if (unlikely(fault_in_kernel_space(address)))
|
||||
do_kern_addr_fault(regs, hw_error_code, address);
|
||||
else
|
||||
do_user_addr_fault(regs, hw_error_code, address);
|
||||
}
|
||||
NOKPROBE_SYMBOL(__do_page_fault);
|
||||
|
||||
static nokprobe_inline void
|
||||
@ -923,34 +923,19 @@ static void mark_nxdata_nx(void)
|
|||
void mark_rodata_ro(void)
|
||||
{
|
||||
unsigned long start = PFN_ALIGN(_text);
|
||||
unsigned long size = PFN_ALIGN(_etext) - start;
|
||||
unsigned long size = (unsigned long)__end_rodata - start;
|
||||
|
||||
set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
|
||||
printk(KERN_INFO "Write protecting the kernel text: %luk\n",
|
||||
pr_info("Write protecting kernel text and read-only data: %luk\n",
|
||||
size >> 10);
|
||||
|
||||
kernel_set_to_readonly = 1;
|
||||
|
||||
#ifdef CONFIG_CPA_DEBUG
|
||||
printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
|
||||
start, start+size);
|
||||
set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT);
|
||||
|
||||
printk(KERN_INFO "Testing CPA: write protecting again\n");
|
||||
set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
|
||||
#endif
|
||||
|
||||
start += size;
|
||||
size = (unsigned long)__end_rodata - start;
|
||||
set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
|
||||
printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
|
||||
size >> 10);
|
||||
|
||||
#ifdef CONFIG_CPA_DEBUG
|
||||
printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, start + size);
|
||||
pr_info("Testing CPA: Reverting %lx-%lx\n", start, start + size);
|
||||
set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT);
|
||||
|
||||
printk(KERN_INFO "Testing CPA: write protecting again\n");
|
||||
pr_info("Testing CPA: write protecting again\n");
|
||||
set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
|
||||
#endif
|
||||
mark_nxdata_nx();
|
||||
|
|
|
@ -131,7 +131,8 @@ static void __ioremap_check_mem(resource_size_t addr, unsigned long size,
|
|||
* caller shouldn't need to know that small detail.
|
||||
*/
|
||||
static void __iomem *__ioremap_caller(resource_size_t phys_addr,
|
||||
unsigned long size, enum page_cache_mode pcm, void *caller)
|
||||
unsigned long size, enum page_cache_mode pcm,
|
||||
void *caller, bool encrypted)
|
||||
{
|
||||
unsigned long offset, vaddr;
|
||||
resource_size_t last_addr;
|
||||
|
@ -199,7 +200,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
|
|||
* resulting mapping.
|
||||
*/
|
||||
prot = PAGE_KERNEL_IO;
|
||||
if (sev_active() && mem_flags.desc_other)
|
||||
if ((sev_active() && mem_flags.desc_other) || encrypted)
|
||||
prot = pgprot_encrypted(prot);
|
||||
|
||||
switch (pcm) {
|
||||
|
@ -291,7 +292,7 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
|
|||
enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC_MINUS;
|
||||
|
||||
return __ioremap_caller(phys_addr, size, pcm,
|
||||
__builtin_return_address(0));
|
||||
__builtin_return_address(0), false);
|
||||
}
|
||||
EXPORT_SYMBOL(ioremap_nocache);
|
||||
|
||||
|
@ -324,7 +325,7 @@ void __iomem *ioremap_uc(resource_size_t phys_addr, unsigned long size)
|
|||
enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC;
|
||||
|
||||
return __ioremap_caller(phys_addr, size, pcm,
|
||||
__builtin_return_address(0));
|
||||
__builtin_return_address(0), false);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(ioremap_uc);
|
||||
|
||||
|
@ -341,7 +342,7 @@ EXPORT_SYMBOL_GPL(ioremap_uc);
|
|||
void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size)
|
||||
{
|
||||
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC,
|
||||
__builtin_return_address(0));
|
||||
__builtin_return_address(0), false);
|
||||
}
|
||||
EXPORT_SYMBOL(ioremap_wc);
|
||||
|
||||
|
@ -358,14 +359,21 @@ EXPORT_SYMBOL(ioremap_wc);
|
|||
void __iomem *ioremap_wt(resource_size_t phys_addr, unsigned long size)
|
||||
{
|
||||
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WT,
|
||||
__builtin_return_address(0));
|
||||
__builtin_return_address(0), false);
|
||||
}
|
||||
EXPORT_SYMBOL(ioremap_wt);
|
||||
|
||||
void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size)
|
||||
{
|
||||
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB,
|
||||
__builtin_return_address(0), true);
|
||||
}
|
||||
EXPORT_SYMBOL(ioremap_encrypted);
|
||||
|
||||
void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
|
||||
{
|
||||
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB,
|
||||
__builtin_return_address(0));
|
||||
__builtin_return_address(0), false);
|
||||
}
|
||||
EXPORT_SYMBOL(ioremap_cache);
|
||||
|
||||
|
@ -374,7 +382,7 @@ void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
|
|||
{
|
||||
return __ioremap_caller(phys_addr, size,
|
||||
pgprot2cachemode(__pgprot(prot_val)),
|
||||
__builtin_return_address(0));
|
||||
__builtin_return_address(0), false);
|
||||
}
|
||||
EXPORT_SYMBOL(ioremap_prot);
|
||||
|
||||
@ -37,11 +37,20 @@ struct cpa_data {
|
|||
unsigned long numpages;
|
||||
int flags;
|
||||
unsigned long pfn;
|
||||
unsigned force_split : 1;
|
||||
unsigned force_split : 1,
|
||||
force_static_prot : 1;
|
||||
int curpage;
|
||||
struct page **pages;
|
||||
};
|
||||
|
||||
enum cpa_warn {
|
||||
CPA_CONFLICT,
|
||||
CPA_PROTECT,
|
||||
CPA_DETECT,
|
||||
};
|
||||
|
||||
static const int cpa_warn_level = CPA_PROTECT;
|
||||
|
||||
/*
|
||||
* Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings)
|
||||
* using cpa_lock. So that we don't allow any other cpu, with stale large tlb
|
||||
|
@ -94,6 +103,87 @@ void arch_report_meminfo(struct seq_file *m)
|
|||
static inline void split_page_count(int level) { }
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_X86_CPA_STATISTICS
|
||||
|
||||
static unsigned long cpa_1g_checked;
|
||||
static unsigned long cpa_1g_sameprot;
|
||||
static unsigned long cpa_1g_preserved;
|
||||
static unsigned long cpa_2m_checked;
|
||||
static unsigned long cpa_2m_sameprot;
|
||||
static unsigned long cpa_2m_preserved;
|
||||
static unsigned long cpa_4k_install;
|
||||
|
||||
static inline void cpa_inc_1g_checked(void)
|
||||
{
|
||||
cpa_1g_checked++;
|
||||
}
|
||||
|
||||
static inline void cpa_inc_2m_checked(void)
|
||||
{
|
||||
cpa_2m_checked++;
|
||||
}
|
||||
|
||||
static inline void cpa_inc_4k_install(void)
|
||||
{
|
||||
cpa_4k_install++;
|
||||
}
|
||||
|
||||
static inline void cpa_inc_lp_sameprot(int level)
|
||||
{
|
||||
if (level == PG_LEVEL_1G)
|
||||
cpa_1g_sameprot++;
|
||||
else
|
||||
cpa_2m_sameprot++;
|
||||
}
|
||||
|
||||
static inline void cpa_inc_lp_preserved(int level)
|
||||
{
|
||||
if (level == PG_LEVEL_1G)
|
||||
cpa_1g_preserved++;
|
||||
else
|
||||
cpa_2m_preserved++;
|
||||
}
|
||||
|
||||
static int cpastats_show(struct seq_file *m, void *p)
|
||||
{
|
||||
seq_printf(m, "1G pages checked: %16lu\n", cpa_1g_checked);
|
||||
seq_printf(m, "1G pages sameprot: %16lu\n", cpa_1g_sameprot);
|
||||
seq_printf(m, "1G pages preserved: %16lu\n", cpa_1g_preserved);
|
||||
seq_printf(m, "2M pages checked: %16lu\n", cpa_2m_checked);
|
||||
seq_printf(m, "2M pages sameprot: %16lu\n", cpa_2m_sameprot);
|
||||
seq_printf(m, "2M pages preserved: %16lu\n", cpa_2m_preserved);
|
||||
seq_printf(m, "4K pages set-checked: %16lu\n", cpa_4k_install);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int cpastats_open(struct inode *inode, struct file *file)
|
||||
{
|
||||
return single_open(file, cpastats_show, NULL);
|
||||
}
|
||||
|
||||
static const struct file_operations cpastats_fops = {
|
||||
.open = cpastats_open,
|
||||
.read = seq_read,
|
||||
.llseek = seq_lseek,
|
||||
.release = single_release,
|
||||
};
|
||||
|
||||
static int __init cpa_stats_init(void)
|
||||
{
|
||||
debugfs_create_file("cpa_stats", S_IRUSR, arch_debugfs_dir, NULL,
|
||||
&cpastats_fops);
|
||||
return 0;
|
||||
}
|
||||
late_initcall(cpa_stats_init);
|
||||
#else
|
||||
static inline void cpa_inc_1g_checked(void) { }
|
||||
static inline void cpa_inc_2m_checked(void) { }
|
||||
static inline void cpa_inc_4k_install(void) { }
|
||||
static inline void cpa_inc_lp_sameprot(int level) { }
|
||||
static inline void cpa_inc_lp_preserved(int level) { }
|
||||
#endif
|
||||
|
||||
|
||||
static inline int
|
||||
within(unsigned long addr, unsigned long start, unsigned long end)
|
||||
{
|
||||
|
@ -195,14 +285,20 @@ static void cpa_flush_all(unsigned long cache)
|
|||
on_each_cpu(__cpa_flush_all, (void *) cache, 1);
|
||||
}
|
||||
|
||||
static void __cpa_flush_range(void *arg)
|
||||
static bool __cpa_flush_range(unsigned long start, int numpages, int cache)
|
||||
{
|
||||
/*
|
||||
* We could optimize that further and do individual per page
|
||||
* tlb invalidates for a low number of pages. Caveat: we must
|
||||
* flush the high aliases on 64bit as well.
|
||||
*/
|
||||
__flush_tlb_all();
|
||||
BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
|
||||
|
||||
WARN_ON(PAGE_ALIGN(start) != start);
|
||||
|
||||
if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) {
|
||||
cpa_flush_all(cache);
|
||||
return true;
|
||||
}
|
||||
|
||||
flush_tlb_kernel_range(start, start + PAGE_SIZE * numpages);
|
||||
|
||||
return !cache;
|
||||
}
|
||||
|
||||
static void cpa_flush_range(unsigned long start, int numpages, int cache)
|
||||
|
@ -210,12 +306,7 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
|
|||
unsigned int i, level;
|
||||
unsigned long addr;
|
||||
|
||||
BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
|
||||
WARN_ON(PAGE_ALIGN(start) != start);
|
||||
|
||||
on_each_cpu(__cpa_flush_range, NULL, 1);
|
||||
|
||||
if (!cache)
|
||||
if (__cpa_flush_range(start, numpages, cache))
|
||||
return;
|
||||
|
||||
/*
|
||||
|
@ -235,30 +326,13 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
|
|||
}
|
||||
}
|
||||
|
||||
static void cpa_flush_array(unsigned long *start, int numpages, int cache,
|
||||
static void cpa_flush_array(unsigned long baddr, unsigned long *start,
|
||||
int numpages, int cache,
|
||||
int in_flags, struct page **pages)
|
||||
{
|
||||
unsigned int i, level;
|
||||
#ifdef CONFIG_PREEMPT
|
||||
/*
|
||||
* Avoid wbinvd() because it causes latencies on all CPUs,
|
||||
* regardless of any CPU isolation that may be in effect.
|
||||
*
|
||||
* This should be extended for CAT enabled systems independent of
|
||||
* PREEMPT because wbinvd() does not respect the CAT partitions and
|
||||
* this is exposed to unprivileged users through the graphics
|
||||
* subsystem.
|
||||
*/
|
||||
unsigned long do_wbinvd = 0;
|
||||
#else
|
||||
unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */
|
||||
#endif
|
||||
|
||||
BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
|
||||
|
||||
on_each_cpu(__cpa_flush_all, (void *) do_wbinvd, 1);
|
||||
|
||||
if (!cache || do_wbinvd)
|
||||
if (__cpa_flush_range(baddr, numpages, cache))
|
||||
return;
|
||||
|
||||
/*
|
||||
|
@ -286,84 +360,179 @@ static void cpa_flush_array(unsigned long *start, int numpages, int cache,
|
|||
}
|
||||
}
|
||||
|
||||
static bool overlaps(unsigned long r1_start, unsigned long r1_end,
|
||||
unsigned long r2_start, unsigned long r2_end)
|
||||
{
|
||||
return (r1_start <= r2_end && r1_end >= r2_start) ||
|
||||
(r2_start <= r1_end && r2_end >= r1_start);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_PCI_BIOS
|
||||
/*
|
||||
* The BIOS area between 640k and 1Mb needs to be executable for PCI BIOS
|
||||
* based config access (CONFIG_PCI_GOBIOS) support.
|
||||
*/
|
||||
#define BIOS_PFN PFN_DOWN(BIOS_BEGIN)
|
||||
#define BIOS_PFN_END PFN_DOWN(BIOS_END - 1)
|
||||
|
||||
static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn)
|
||||
{
|
||||
if (pcibios_enabled && overlaps(spfn, epfn, BIOS_PFN, BIOS_PFN_END))
|
||||
return _PAGE_NX;
|
||||
return 0;
|
||||
}
|
||||
#else
|
||||
static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* The .rodata section needs to be read-only. Using the pfn catches all
|
||||
* aliases. This also includes __ro_after_init, so do not enforce until
|
||||
* kernel_set_to_readonly is true.
|
||||
*/
|
||||
static pgprotval_t protect_rodata(unsigned long spfn, unsigned long epfn)
|
||||
{
|
||||
unsigned long epfn_ro, spfn_ro = PFN_DOWN(__pa_symbol(__start_rodata));
|
||||
|
||||
/*
|
||||
* Note: __end_rodata is at page aligned and not inclusive, so
|
||||
* subtract 1 to get the last enforced PFN in the rodata area.
|
||||
*/
|
||||
epfn_ro = PFN_DOWN(__pa_symbol(__end_rodata)) - 1;
|
||||
|
||||
if (kernel_set_to_readonly && overlaps(spfn, epfn, spfn_ro, epfn_ro))
|
||||
return _PAGE_RW;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Protect kernel text against becoming non executable by forbidding
|
||||
* _PAGE_NX. This protects only the high kernel mapping (_text -> _etext)
|
||||
* out of which the kernel actually executes. Do not protect the low
|
||||
* mapping.
|
||||
*
|
||||
* This does not cover __inittext since that is gone after boot.
|
||||
*/
|
||||
static pgprotval_t protect_kernel_text(unsigned long start, unsigned long end)
|
||||
{
|
||||
unsigned long t_end = (unsigned long)_etext - 1;
|
||||
unsigned long t_start = (unsigned long)_text;
|
||||
|
||||
if (overlaps(start, end, t_start, t_end))
|
||||
return _PAGE_NX;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#if defined(CONFIG_X86_64)
|
||||
/*
|
||||
* Once the kernel maps the text as RO (kernel_set_to_readonly is set),
|
||||
* kernel text mappings for the large page aligned text, rodata sections
|
||||
* will be always read-only. For the kernel identity mappings covering the
|
||||
* holes caused by this alignment can be anything that user asks.
|
||||
*
|
||||
* This will preserve the large page mappings for kernel text/data at no
|
||||
* extra cost.
|
||||
*/
|
||||
static pgprotval_t protect_kernel_text_ro(unsigned long start,
|
||||
unsigned long end)
|
||||
{
|
||||
unsigned long t_end = (unsigned long)__end_rodata_hpage_align - 1;
|
||||
unsigned long t_start = (unsigned long)_text;
|
||||
unsigned int level;
|
||||
|
||||
if (!kernel_set_to_readonly || !overlaps(start, end, t_start, t_end))
|
||||
return 0;
|
||||
/*
|
||||
* Don't enforce the !RW mapping for the kernel text mapping, if
|
||||
* the current mapping is already using small page mapping. No
|
||||
* need to work hard to preserve large page mappings in this case.
|
||||
*
|
||||
* This also fixes the Linux Xen paravirt guest boot failure caused
|
||||
* by unexpected read-only mappings for kernel identity
|
||||
* mappings. In this paravirt guest case, the kernel text mapping
|
||||
* and the kernel identity mapping share the same page-table pages,
|
||||
* so the protections for kernel text and identity mappings have to
|
||||
* be the same.
|
||||
*/
|
||||
if (lookup_address(start, &level) && (level != PG_LEVEL_4K))
|
||||
return _PAGE_RW;
|
||||
return 0;
|
||||
}
|
||||
#else
|
||||
static pgprotval_t protect_kernel_text_ro(unsigned long start,
|
||||
unsigned long end)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline bool conflicts(pgprot_t prot, pgprotval_t val)
{
	return (pgprot_val(prot) & ~val) != pgprot_val(prot);
}

static inline void check_conflict(int warnlvl, pgprot_t prot, pgprotval_t val,
				  unsigned long start, unsigned long end,
				  unsigned long pfn, const char *txt)
{
	static const char *lvltxt[] = {
		[CPA_CONFLICT]	= "conflict",
		[CPA_PROTECT]	= "protect",
		[CPA_DETECT]	= "detect",
	};

	if (warnlvl > cpa_warn_level || !conflicts(prot, val))
		return;

	pr_warn("CPA %8s %10s: 0x%016lx - 0x%016lx PFN %lx req %016llx prevent %016llx\n",
		lvltxt[warnlvl], txt, start, end, pfn, (unsigned long long)pgprot_val(prot),
		(unsigned long long)val);
}
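
For readers unfamiliar with the pgprot plumbing, the conflict test is plain bit arithmetic: a request conflicts if it keeps any bit that the checkers marked as forbidden. A small self-contained sketch of that test (a plain unsigned long stands in for pgprot_t, and the RW bit value is hard-coded here rather than taken from the kernel headers):

#include <stdio.h>

typedef unsigned long pgprotval_t;

#define MY_PAGE_RW 0x2UL		/* hard-coded stand-in for _PAGE_RW */

/* True if 'prot' still contains a bit that 'forbidden' says must be cleared. */
static int conflicts(pgprotval_t prot, pgprotval_t forbidden)
{
	return (prot & ~forbidden) != prot;
}

int main(void)
{
	pgprotval_t req = MY_PAGE_RW;		/* caller asks for a writable mapping */
	pgprotval_t forbidden = MY_PAGE_RW;	/* e.g. the range overlaps .rodata    */

	printf("conflict: %d\n", conflicts(req, forbidden));	/* 1 */
	printf("fixed up: %#lx\n", req & ~forbidden);		/* 0: RW stripped */
	return 0;
}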
|
||||
|
||||
/*
|
||||
* Certain areas of memory on x86 require very specific protection flags,
|
||||
* for example the BIOS area or kernel text. Callers don't always get this
|
||||
* right (again, ioremap() on BIOS memory is not uncommon) so this function
|
||||
* checks and fixes these known static required protection bits.
|
||||
*/
|
||||
static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
|
||||
unsigned long pfn)
|
||||
static inline pgprot_t static_protections(pgprot_t prot, unsigned long start,
|
||||
unsigned long pfn, unsigned long npg,
|
||||
int warnlvl)
|
||||
{
|
||||
pgprot_t forbidden = __pgprot(0);
|
||||
pgprotval_t forbidden, res;
|
||||
unsigned long end;
|
||||
|
||||
/*
|
||||
* The BIOS area between 640k and 1Mb needs to be executable for
|
||||
* PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
|
||||
* There is no point in checking RW/NX conflicts when the requested
|
||||
* mapping is setting the page !PRESENT.
|
||||
*/
|
||||
#ifdef CONFIG_PCI_BIOS
|
||||
if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
|
||||
pgprot_val(forbidden) |= _PAGE_NX;
|
||||
#endif
|
||||
if (!(pgprot_val(prot) & _PAGE_PRESENT))
|
||||
return prot;
|
||||
|
||||
/*
|
||||
* The kernel text needs to be executable for obvious reasons
|
||||
* Does not cover __inittext since that is gone later on. On
|
||||
* 64bit we do not enforce !NX on the low mapping
|
||||
*/
|
||||
if (within(address, (unsigned long)_text, (unsigned long)_etext))
|
||||
pgprot_val(forbidden) |= _PAGE_NX;
|
||||
/* Operate on the virtual address */
|
||||
end = start + npg * PAGE_SIZE - 1;
|
||||
|
||||
/*
|
||||
* The .rodata section needs to be read-only. Using the pfn
|
||||
* catches all aliases. This also includes __ro_after_init,
|
||||
* so do not enforce until kernel_set_to_readonly is true.
|
||||
*/
|
||||
if (kernel_set_to_readonly &&
|
||||
within(pfn, __pa_symbol(__start_rodata) >> PAGE_SHIFT,
|
||||
__pa_symbol(__end_rodata) >> PAGE_SHIFT))
|
||||
pgprot_val(forbidden) |= _PAGE_RW;
|
||||
res = protect_kernel_text(start, end);
|
||||
check_conflict(warnlvl, prot, res, start, end, pfn, "Text NX");
|
||||
forbidden = res;
|
||||
|
||||
#if defined(CONFIG_X86_64)
|
||||
/*
|
||||
* Once the kernel maps the text as RO (kernel_set_to_readonly is set),
|
||||
* kernel text mappings for the large page aligned text, rodata sections
|
||||
* will be always read-only. For the kernel identity mappings covering
|
||||
* the holes caused by this alignment can be anything that user asks.
|
||||
*
|
||||
* This will preserve the large page mappings for kernel text/data
|
||||
* at no extra cost.
|
||||
*/
|
||||
if (kernel_set_to_readonly &&
|
||||
within(address, (unsigned long)_text,
|
||||
(unsigned long)__end_rodata_hpage_align)) {
|
||||
unsigned int level;
|
||||
res = protect_kernel_text_ro(start, end);
|
||||
check_conflict(warnlvl, prot, res, start, end, pfn, "Text RO");
|
||||
forbidden |= res;
|
||||
|
||||
/*
|
||||
* Don't enforce the !RW mapping for the kernel text mapping,
|
||||
* if the current mapping is already using small page mapping.
|
||||
* No need to work hard to preserve large page mappings in this
|
||||
* case.
|
||||
*
|
||||
* This also fixes the Linux Xen paravirt guest boot failure
|
||||
* (because of unexpected read-only mappings for kernel identity
|
||||
* mappings). In this paravirt guest case, the kernel text
|
||||
* mapping and the kernel identity mapping share the same
|
||||
* page-table pages. Thus we can't really use different
|
||||
* protections for the kernel text and identity mappings. Also,
|
||||
* these shared mappings are made of small page mappings.
|
||||
* Thus this don't enforce !RW mapping for small page kernel
|
||||
* text mapping logic will help Linux Xen parvirt guest boot
|
||||
* as well.
|
||||
*/
|
||||
if (lookup_address(address, &level) && (level != PG_LEVEL_4K))
|
||||
pgprot_val(forbidden) |= _PAGE_RW;
|
||||
}
|
||||
#endif
|
||||
/* Check the PFN directly */
|
||||
res = protect_pci_bios(pfn, pfn + npg - 1);
|
||||
check_conflict(warnlvl, prot, res, start, end, pfn, "PCIBIOS NX");
|
||||
forbidden |= res;
|
||||
|
||||
prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
|
||||
res = protect_rodata(pfn, pfn + npg - 1);
|
||||
check_conflict(warnlvl, prot, res, start, end, pfn, "Rodata RO");
|
||||
forbidden |= res;
|
||||
|
||||
return prot;
|
||||
return __pgprot(pgprot_val(prot) & ~forbidden);
|
||||
}
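
Putting the pieces together, the reworked static_protections() is an accumulate-then-mask sequence: every checker reports the protection bits its range forbids, the reports are OR-ed into one mask, and the requested pgprot is cleaned up once at the end. A compressed, userspace-only sketch of that shape (the checkers below are toys, not the kernel helpers):

#include <stdio.h>

typedef unsigned long pgprotval_t;

#define MY_PAGE_RW 0x2UL
#define MY_PAGE_NX 0x8000000000000000UL

/* Toy checkers standing in for protect_kernel_text()/protect_rodata()/... */
static pgprotval_t toy_protect_text(unsigned long start, unsigned long end)
{
	return MY_PAGE_NX;	/* pretend the range overlaps kernel text */
}

static pgprotval_t toy_protect_rodata(unsigned long pfn, unsigned long npg)
{
	return 0;		/* pretend the range misses .rodata */
}

static pgprotval_t toy_static_protections(pgprotval_t prot, unsigned long start,
					  unsigned long end, unsigned long pfn,
					  unsigned long npg)
{
	pgprotval_t forbidden = 0;

	forbidden |= toy_protect_text(start, end);
	forbidden |= toy_protect_rodata(pfn, npg);

	return prot & ~forbidden;
}

int main(void)
{
	pgprotval_t req = MY_PAGE_RW | MY_PAGE_NX;
	pgprotval_t res = toy_static_protections(req, 0, 0x1000, 0, 1);

	printf("%#lx -> %#lx\n", req, res);	/* NX stripped, RW kept */
	return 0;
}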
|
||||
|
||||
/*
|
||||
|
@ -421,18 +590,18 @@ pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
|
|||
*/
|
||||
pte_t *lookup_address(unsigned long address, unsigned int *level)
|
||||
{
|
||||
return lookup_address_in_pgd(pgd_offset_k(address), address, level);
|
||||
return lookup_address_in_pgd(pgd_offset_k(address), address, level);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(lookup_address);
|
||||
|
||||
static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
|
||||
unsigned int *level)
|
||||
{
|
||||
if (cpa->pgd)
|
||||
if (cpa->pgd)
|
||||
return lookup_address_in_pgd(cpa->pgd + pgd_index(address),
|
||||
address, level);
|
||||
|
||||
return lookup_address(address, level);
|
||||
return lookup_address(address, level);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -549,40 +718,35 @@ static pgprot_t pgprot_clear_protnone_bits(pgprot_t prot)
|
|||
return prot;
|
||||
}
|
||||
|
||||
static int
|
||||
try_preserve_large_page(pte_t *kpte, unsigned long address,
|
||||
struct cpa_data *cpa)
|
||||
static int __should_split_large_page(pte_t *kpte, unsigned long address,
|
||||
struct cpa_data *cpa)
|
||||
{
|
||||
unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn, old_pfn;
|
||||
unsigned long numpages, pmask, psize, lpaddr, pfn, old_pfn;
|
||||
pgprot_t old_prot, new_prot, req_prot, chk_prot;
|
||||
pte_t new_pte, old_pte, *tmp;
|
||||
pgprot_t old_prot, new_prot, req_prot;
|
||||
int i, do_split = 1;
|
||||
enum pg_level level;
|
||||
|
||||
if (cpa->force_split)
|
||||
return 1;
|
||||
|
||||
spin_lock(&pgd_lock);
|
||||
/*
|
||||
* Check for races, another CPU might have split this page
|
||||
* up already:
|
||||
*/
|
||||
tmp = _lookup_address_cpa(cpa, address, &level);
|
||||
if (tmp != kpte)
|
||||
goto out_unlock;
|
||||
return 1;
|
||||
|
||||
switch (level) {
|
||||
case PG_LEVEL_2M:
|
||||
old_prot = pmd_pgprot(*(pmd_t *)kpte);
|
||||
old_pfn = pmd_pfn(*(pmd_t *)kpte);
|
||||
cpa_inc_2m_checked();
|
||||
break;
|
||||
case PG_LEVEL_1G:
|
||||
old_prot = pud_pgprot(*(pud_t *)kpte);
|
||||
old_pfn = pud_pfn(*(pud_t *)kpte);
|
||||
cpa_inc_1g_checked();
|
||||
break;
|
||||
default:
|
||||
do_split = -EINVAL;
|
||||
goto out_unlock;
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
psize = page_level_size(level);
|
||||
|
@ -592,8 +756,8 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
|
|||
* Calculate the number of pages, which fit into this large
|
||||
* page starting at address:
|
||||
*/
|
||||
nextpage_addr = (address + psize) & pmask;
|
||||
numpages = (nextpage_addr - address) >> PAGE_SHIFT;
|
||||
lpaddr = (address + psize) & pmask;
|
||||
numpages = (lpaddr - address) >> PAGE_SHIFT;
|
||||
if (numpages < cpa->numpages)
|
||||
cpa->numpages = numpages;
|
||||
|
||||
|
@ -620,71 +784,142 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
|
|||
pgprot_val(req_prot) |= _PAGE_PSE;
|
||||
|
||||
/*
|
||||
* old_pfn points to the large page base pfn. So we need
|
||||
* to add the offset of the virtual address:
|
||||
* old_pfn points to the large page base pfn. So we need to add the
|
||||
* offset of the virtual address:
|
||||
*/
|
||||
pfn = old_pfn + ((address & (psize - 1)) >> PAGE_SHIFT);
|
||||
cpa->pfn = pfn;
|
||||
|
||||
new_prot = static_protections(req_prot, address, pfn);
|
||||
/*
|
||||
* Calculate the large page base address and the number of 4K pages
|
||||
* in the large page
|
||||
*/
|
||||
lpaddr = address & pmask;
|
||||
numpages = psize >> PAGE_SHIFT;
|
||||
|
||||
/*
|
||||
* We need to check the full range, whether
|
||||
* static_protection() requires a different pgprot for one of
|
||||
* the pages in the range we try to preserve:
|
||||
* Sanity check that the existing mapping is correct versus the static
|
||||
* protections. static_protections() guards against !PRESENT, so no
|
||||
* extra conditional required here.
|
||||
*/
|
||||
addr = address & pmask;
|
||||
pfn = old_pfn;
|
||||
for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) {
|
||||
pgprot_t chk_prot = static_protections(req_prot, addr, pfn);
|
||||
chk_prot = static_protections(old_prot, lpaddr, old_pfn, numpages,
|
||||
CPA_CONFLICT);
|
||||
|
||||
if (pgprot_val(chk_prot) != pgprot_val(new_prot))
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
/*
|
||||
* If there are no changes, return. maxpages has been updated
|
||||
* above:
|
||||
*/
|
||||
if (pgprot_val(new_prot) == pgprot_val(old_prot)) {
|
||||
do_split = 0;
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
/*
|
||||
* We need to change the attributes. Check, whether we can
|
||||
* change the large page in one go. We request a split, when
|
||||
* the address is not aligned and the number of pages is
|
||||
* smaller than the number of pages in the large page. Note
|
||||
* that we limited the number of possible pages already to
|
||||
* the number of pages in the large page.
|
||||
*/
|
||||
if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) {
|
||||
if (WARN_ON_ONCE(pgprot_val(chk_prot) != pgprot_val(old_prot))) {
|
||||
/*
|
||||
* The address is aligned and the number of pages
|
||||
* covers the full page.
|
||||
* Split the large page and tell the split code to
|
||||
* enforce static protections.
|
||||
*/
|
||||
new_pte = pfn_pte(old_pfn, new_prot);
|
||||
__set_pmd_pte(kpte, address, new_pte);
|
||||
cpa->flags |= CPA_FLUSHTLB;
|
||||
do_split = 0;
|
||||
cpa->force_static_prot = 1;
|
||||
return 1;
|
||||
}
|
||||
|
||||
out_unlock:
|
||||
/*
|
||||
* Optimization: If the requested pgprot is the same as the current
|
||||
* pgprot, then the large page can be preserved and no updates are
|
||||
* required independent of alignment and length of the requested
|
||||
* range. The above already established that the current pgprot is
|
||||
* correct, which in consequence makes the requested pgprot correct
|
||||
* as well if it is the same. The static protection scan below will
|
||||
* not come to a different conclusion.
|
||||
*/
|
||||
if (pgprot_val(req_prot) == pgprot_val(old_prot)) {
|
||||
cpa_inc_lp_sameprot(level);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* If the requested range does not cover the full page, split it up
|
||||
*/
|
||||
if (address != lpaddr || cpa->numpages != numpages)
|
||||
return 1;
|
||||
|
||||
/*
|
||||
* Check whether the requested pgprot is conflicting with a static
|
||||
* protection requirement in the large page.
|
||||
*/
|
||||
new_prot = static_protections(req_prot, lpaddr, old_pfn, numpages,
|
||||
CPA_DETECT);
|
||||
|
||||
/*
|
||||
* If there is a conflict, split the large page.
|
||||
*
|
||||
* There used to be a 4k wise evaluation trying really hard to
|
||||
* preserve the large pages, but experimentation has shown, that this
|
||||
* does not help at all. There might be corner cases which would
|
||||
* preserve one large page occasionally, but it's really not worth the
|
||||
* extra code and cycles for the common case.
|
||||
*/
|
||||
if (pgprot_val(req_prot) != pgprot_val(new_prot))
|
||||
return 1;
|
||||
|
||||
/* All checks passed. Update the large page mapping. */
|
||||
new_pte = pfn_pte(old_pfn, new_prot);
|
||||
__set_pmd_pte(kpte, address, new_pte);
|
||||
cpa->flags |= CPA_FLUSHTLB;
|
||||
cpa_inc_lp_preserved(level);
|
||||
return 0;
|
||||
}
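
Once the protection checks are out of the way, the preserve-or-split decision above comes down to simple alignment arithmetic: the large page can only be kept if the requested range starts at the large page boundary and covers all of it. A standalone sketch of just that arithmetic (it deliberately ignores the pgprot comparison and the static-protection scan):

#include <stdio.h>

#define PAGE_SHIFT	12

/*
 * Return 1 if a request of 'numpages' 4k pages starting at 'address' forces
 * the enclosing large page of size 'psize' to be split, 0 if the large page
 * can be rewritten in place.
 */
static int must_split(unsigned long address, unsigned long numpages,
		      unsigned long psize)
{
	unsigned long pmask = ~(psize - 1);
	unsigned long lpaddr = address & pmask;			/* large page base address    */
	unsigned long lp_numpages = psize >> PAGE_SHIFT;	/* 4k pages in the large page */

	return address != lpaddr || numpages != lp_numpages;
}

int main(void)
{
	unsigned long psize = 2UL << 20;	/* 2M large page */

	printf("%d\n", must_split(0x40000000UL, 512, psize));	/* 0: full, aligned 2M range    */
	printf("%d\n", must_split(0x40001000UL, 16, psize));	/* 1: partial range, must split */
	return 0;
}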
|
||||
|
||||
static int should_split_large_page(pte_t *kpte, unsigned long address,
				   struct cpa_data *cpa)
{
	int do_split;

	if (cpa->force_split)
		return 1;

	spin_lock(&pgd_lock);
	do_split = __should_split_large_page(kpte, address, cpa);
	spin_unlock(&pgd_lock);

	return do_split;
}
|
||||
|
||||
static void split_set_pte(struct cpa_data *cpa, pte_t *pte, unsigned long pfn,
|
||||
pgprot_t ref_prot, unsigned long address,
|
||||
unsigned long size)
|
||||
{
|
||||
unsigned int npg = PFN_DOWN(size);
|
||||
pgprot_t prot;
|
||||
|
||||
/*
|
||||
* If should_split_large_page() discovered an inconsistent mapping,
|
||||
* remove the invalid protection in the split mapping.
|
||||
*/
|
||||
if (!cpa->force_static_prot)
|
||||
goto set;
|
||||
|
||||
prot = static_protections(ref_prot, address, pfn, npg, CPA_PROTECT);
|
||||
|
||||
if (pgprot_val(prot) == pgprot_val(ref_prot))
|
||||
goto set;
|
||||
|
||||
/*
|
||||
* If this is splitting a PMD, fix it up. PUD splits cannot be
|
||||
* fixed trivially as that would require to rescan the newly
|
||||
* installed PMD mappings after returning from split_large_page()
|
||||
* so an eventual further split can allocate the necessary PTE
|
||||
* pages. Warn for now and revisit it in case this actually
|
||||
* happens.
|
||||
*/
|
||||
if (size == PAGE_SIZE)
|
||||
ref_prot = prot;
|
||||
else
|
||||
pr_warn_once("CPA: Cannot fixup static protections for PUD split\n");
|
||||
set:
|
||||
set_pte(pte, pfn_pte(pfn, ref_prot));
|
||||
}
|
||||
|
||||
static int
|
||||
__split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
|
||||
struct page *base)
|
||||
{
|
||||
unsigned long lpaddr, lpinc, ref_pfn, pfn, pfninc = 1;
|
||||
pte_t *pbase = (pte_t *)page_address(base);
|
||||
unsigned long ref_pfn, pfn, pfninc = 1;
|
||||
unsigned int i, level;
|
||||
pte_t *tmp;
|
||||
pgprot_t ref_prot;
|
||||
pte_t *tmp;
|
||||
|
||||
spin_lock(&pgd_lock);
|
||||
/*
|
||||
|
@ -707,15 +942,17 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
|
|||
* PAT bit to correct position.
|
||||
*/
|
||||
ref_prot = pgprot_large_2_4k(ref_prot);
|
||||
|
||||
ref_pfn = pmd_pfn(*(pmd_t *)kpte);
|
||||
lpaddr = address & PMD_MASK;
|
||||
lpinc = PAGE_SIZE;
|
||||
break;
|
||||
|
||||
case PG_LEVEL_1G:
|
||||
ref_prot = pud_pgprot(*(pud_t *)kpte);
|
||||
ref_pfn = pud_pfn(*(pud_t *)kpte);
|
||||
pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
|
||||
|
||||
lpaddr = address & PUD_MASK;
|
||||
lpinc = PMD_SIZE;
|
||||
/*
|
||||
* Clear the PSE flags if the PRESENT flag is not set
|
||||
* otherwise pmd_present/pmd_huge will return true
|
||||
|
@ -736,8 +973,8 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
|
|||
* Get the target pfn from the original entry:
|
||||
*/
|
||||
pfn = ref_pfn;
|
||||
for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
|
||||
set_pte(&pbase[i], pfn_pte(pfn, ref_prot));
|
||||
for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc, lpaddr += lpinc)
|
||||
split_set_pte(cpa, pbase + i, pfn, ref_prot, lpaddr, lpinc);
|
||||
|
||||
if (virt_addr_valid(address)) {
|
||||
unsigned long pfn = PFN_DOWN(__pa(address));
|
||||
|
@ -756,14 +993,24 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
|
|||
__set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE)));
|
||||
|
||||
/*
|
||||
* Intel Atom errata AAH41 workaround.
|
||||
* Do a global flush tlb after splitting the large page
|
||||
* and before we do the actual change page attribute in the PTE.
|
||||
*
|
||||
* The real fix should be in hw or in a microcode update, but
|
||||
* we also probabilistically try to reduce the window of having
|
||||
* a large TLB mixed with 4K TLBs while instruction fetches are
|
||||
* going on.
|
||||
* Without this, we violate the TLB application note, that says:
|
||||
* "The TLBs may contain both ordinary and large-page
|
||||
* translations for a 4-KByte range of linear addresses. This
|
||||
* may occur if software modifies the paging structures so that
|
||||
* the page size used for the address range changes. If the two
|
||||
* translations differ with respect to page frame or attributes
|
||||
* (e.g., permissions), processor behavior is undefined and may
|
||||
* be implementation-specific."
|
||||
*
|
||||
* We do this global tlb flush inside the cpa_lock, so that we
|
||||
* don't allow any other cpu, with stale tlb entries change the
|
||||
* page attribute in parallel, that also falls into the
|
||||
* just split large page entry.
|
||||
*/
|
||||
__flush_tlb_all();
|
||||
flush_tlb_all();
|
||||
spin_unlock(&pgd_lock);
|
||||
|
||||
return 0;
|
||||
|
@ -1247,7 +1494,9 @@ repeat:
|
|||
pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
|
||||
pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
|
||||
|
||||
new_prot = static_protections(new_prot, address, pfn);
|
||||
cpa_inc_4k_install();
|
||||
new_prot = static_protections(new_prot, address, pfn, 1,
|
||||
CPA_PROTECT);
|
||||
|
||||
new_prot = pgprot_clear_protnone_bits(new_prot);
|
||||
|
||||
|
@ -1273,7 +1522,7 @@ repeat:
|
|||
* Check, whether we can keep the large page intact
|
||||
* and just change the pte:
|
||||
*/
|
||||
do_split = try_preserve_large_page(kpte, address, cpa);
|
||||
do_split = should_split_large_page(kpte, address, cpa);
|
||||
/*
|
||||
* When the range fits into the existing large page,
|
||||
* return. cp->numpages and cpa->tlbflush have been updated in
|
||||
|
@ -1286,28 +1535,8 @@ repeat:
|
|||
* We have to split the large page:
|
||||
*/
|
||||
err = split_large_page(cpa, kpte, address);
|
||||
if (!err) {
|
||||
/*
|
||||
* Do a global flush tlb after splitting the large page
|
||||
* and before we do the actual change page attribute in the PTE.
|
||||
*
|
||||
* With out this, we violate the TLB application note, that says
|
||||
* "The TLBs may contain both ordinary and large-page
|
||||
* translations for a 4-KByte range of linear addresses. This
|
||||
* may occur if software modifies the paging structures so that
|
||||
* the page size used for the address range changes. If the two
|
||||
* translations differ with respect to page frame or attributes
|
||||
* (e.g., permissions), processor behavior is undefined and may
|
||||
* be implementation-specific."
|
||||
*
|
||||
* We do this global tlb flush inside the cpa_lock, so that we
|
||||
* don't allow any other cpu, with stale tlb entries change the
|
||||
* page attribute in parallel, that also falls into the
|
||||
* just split large page entry.
|
||||
*/
|
||||
flush_tlb_all();
|
||||
if (!err)
|
||||
goto repeat;
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
|
@ -1529,19 +1758,19 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
|
|||
cache = !!pgprot2cachemode(mask_set);
|
||||
|
||||
/*
|
||||
* On success we use CLFLUSH, when the CPU supports it to
|
||||
* avoid the WBINVD. If the CPU does not support it and in the
|
||||
* error case we fall back to cpa_flush_all (which uses
|
||||
* WBINVD):
|
||||
* On error; flush everything to be sure.
|
||||
*/
|
||||
if (!ret && boot_cpu_has(X86_FEATURE_CLFLUSH)) {
|
||||
if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
|
||||
cpa_flush_array(addr, numpages, cache,
|
||||
cpa.flags, pages);
|
||||
} else
|
||||
cpa_flush_range(baddr, numpages, cache);
|
||||
} else
|
||||
if (ret) {
|
||||
cpa_flush_all(cache);
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
|
||||
cpa_flush_array(baddr, addr, numpages, cache,
|
||||
cpa.flags, pages);
|
||||
} else {
|
||||
cpa_flush_range(baddr, numpages, cache);
|
||||
}
|
||||
|
||||
out:
|
||||
return ret;
|
||||
|
@ -1856,10 +2085,7 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
|
|||
/*
|
||||
* Before changing the encryption attribute, we need to flush caches.
|
||||
*/
|
||||
if (static_cpu_has(X86_FEATURE_CLFLUSH))
|
||||
cpa_flush_range(start, numpages, 1);
|
||||
else
|
||||
cpa_flush_all(1);
|
||||
cpa_flush_range(start, numpages, 1);
|
||||
|
||||
ret = __change_page_attr_set_clr(&cpa, 1);
|
||||
|
||||
|
@ -1870,10 +2096,7 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
|
|||
* in case TLB flushing gets optimized in the cpa_flush_range()
|
||||
* path use the same logic as above.
|
||||
*/
|
||||
if (static_cpu_has(X86_FEATURE_CLFLUSH))
|
||||
cpa_flush_range(start, numpages, 0);
|
||||
else
|
||||
cpa_flush_all(0);
|
||||
cpa_flush_range(start, numpages, 0);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
|
|
@ -185,8 +185,11 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
|
|||
{
|
||||
struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
|
||||
u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
|
||||
bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy);
|
||||
unsigned cpu = smp_processor_id();
|
||||
u64 next_tlb_gen;
|
||||
bool need_flush;
|
||||
u16 new_asid;
|
||||
|
||||
/*
|
||||
* NB: The scheduler will call us with prev == next when switching
|
||||
|
@ -240,20 +243,41 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
|
|||
next->context.ctx_id);
|
||||
|
||||
/*
|
||||
* We don't currently support having a real mm loaded without
|
||||
* our cpu set in mm_cpumask(). We have all the bookkeeping
|
||||
* in place to figure out whether we would need to flush
|
||||
* if our cpu were cleared in mm_cpumask(), but we don't
|
||||
* currently use it.
|
||||
* Even in lazy TLB mode, the CPU should stay set in the
|
||||
* mm_cpumask. The TLB shootdown code can figure out from
* cpu_tlbstate.is_lazy whether or not to send an IPI.
|
||||
*/
|
||||
if (WARN_ON_ONCE(real_prev != &init_mm &&
|
||||
!cpumask_test_cpu(cpu, mm_cpumask(next))))
|
||||
cpumask_set_cpu(cpu, mm_cpumask(next));
|
||||
|
||||
return;
|
||||
/*
|
||||
* If the CPU is not in lazy TLB mode, we are just switching
|
||||
* from one thread in a process to another thread in the same
|
||||
* process. No TLB flush required.
|
||||
*/
|
||||
if (!was_lazy)
|
||||
return;
|
||||
|
||||
/*
|
||||
* Read the tlb_gen to check whether a flush is needed.
|
||||
* If the TLB is up to date, just use it.
|
||||
* The barrier synchronizes with the tlb_gen increment in
|
||||
* the TLB shootdown code.
|
||||
*/
|
||||
smp_mb();
|
||||
next_tlb_gen = atomic64_read(&next->context.tlb_gen);
|
||||
if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) ==
|
||||
next_tlb_gen)
|
||||
return;
|
||||
|
||||
/*
|
||||
* TLB contents went out of date while we were in lazy
|
||||
* mode. Fall through to the TLB switching code below.
|
||||
*/
|
||||
new_asid = prev_asid;
|
||||
need_flush = true;
|
||||
} else {
|
||||
u16 new_asid;
|
||||
bool need_flush;
|
||||
u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);
|
||||
|
||||
/*
|
||||
|
@ -308,46 +332,48 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
|
|||
/* Let nmi_uaccess_okay() know that we're changing CR3. */
|
||||
this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
|
||||
barrier();
|
||||
|
||||
if (need_flush) {
|
||||
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
|
||||
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
|
||||
load_new_mm_cr3(next->pgd, new_asid, true);
|
||||
|
||||
/*
|
||||
* NB: This gets called via leave_mm() in the idle path
|
||||
* where RCU functions differently. Tracing normally
|
||||
* uses RCU, so we need to use the _rcuidle variant.
|
||||
*
|
||||
* (There is no good reason for this. The idle code should
|
||||
* be rearranged to call this before rcu_idle_enter().)
|
||||
*/
|
||||
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
|
||||
} else {
|
||||
/* The new ASID is already up to date. */
|
||||
load_new_mm_cr3(next->pgd, new_asid, false);
|
||||
|
||||
/* See above wrt _rcuidle. */
|
||||
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Record last user mm's context id, so we can avoid
|
||||
* flushing branch buffer with IBPB if we switch back
|
||||
* to the same user.
|
||||
*/
|
||||
if (next != &init_mm)
|
||||
this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
|
||||
|
||||
/* Make sure we write CR3 before loaded_mm. */
|
||||
barrier();
|
||||
|
||||
this_cpu_write(cpu_tlbstate.loaded_mm, next);
|
||||
this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
|
||||
}
|
||||
|
||||
load_mm_cr4(next);
|
||||
switch_ldt(real_prev, next);
|
||||
if (need_flush) {
|
||||
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
|
||||
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
|
||||
load_new_mm_cr3(next->pgd, new_asid, true);
|
||||
|
||||
/*
|
||||
* NB: This gets called via leave_mm() in the idle path
|
||||
* where RCU functions differently. Tracing normally
|
||||
* uses RCU, so we need to use the _rcuidle variant.
|
||||
*
|
||||
* (There is no good reason for this. The idle code should
|
||||
* be rearranged to call this before rcu_idle_enter().)
|
||||
*/
|
||||
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
|
||||
} else {
|
||||
/* The new ASID is already up to date. */
|
||||
load_new_mm_cr3(next->pgd, new_asid, false);
|
||||
|
||||
/* See above wrt _rcuidle. */
|
||||
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Record last user mm's context id, so we can avoid
|
||||
* flushing branch buffer with IBPB if we switch back
|
||||
* to the same user.
|
||||
*/
|
||||
if (next != &init_mm)
|
||||
this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
|
||||
|
||||
/* Make sure we write CR3 before loaded_mm. */
|
||||
barrier();
|
||||
|
||||
this_cpu_write(cpu_tlbstate.loaded_mm, next);
|
||||
this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
|
||||
|
||||
if (next != real_prev) {
|
||||
load_mm_cr4(next);
|
||||
switch_ldt(real_prev, next);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -368,20 +394,7 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
|
|||
if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
|
||||
return;
|
||||
|
||||
if (tlb_defer_switch_to_init_mm()) {
|
||||
/*
|
||||
* There's a significant optimization that may be possible
|
||||
* here. We have accurate enough TLB flush tracking that we
|
||||
* don't need to maintain coherence of TLB per se when we're
|
||||
* lazy. We do, however, need to maintain coherence of
|
||||
* paging-structure caches. We could, in principle, leave our
|
||||
* old mm loaded and only switch to init_mm when
|
||||
* tlb_remove_page() happens.
|
||||
*/
|
||||
this_cpu_write(cpu_tlbstate.is_lazy, true);
|
||||
} else {
|
||||
switch_mm(NULL, &init_mm, NULL);
|
||||
}
|
||||
this_cpu_write(cpu_tlbstate.is_lazy, true);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -468,6 +481,9 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
|
|||
* paging-structure cache to avoid speculatively reading
|
||||
* garbage into our TLB. Since switching to init_mm is barely
|
||||
* slower than a minimal flush, just switch to init_mm.
|
||||
*
|
||||
* This should be rare, with native_flush_tlb_others skipping
|
||||
* IPIs to lazy TLB mode CPUs.
|
||||
*/
|
||||
switch_mm_irqs_off(NULL, &init_mm, NULL);
|
||||
return;
|
||||
|
@ -528,17 +544,16 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
|
|||
f->new_tlb_gen == local_tlb_gen + 1 &&
|
||||
f->new_tlb_gen == mm_tlb_gen) {
|
||||
/* Partial flush */
|
||||
unsigned long addr;
|
||||
unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT;
|
||||
unsigned long nr_invalidate = (f->end - f->start) >> f->stride_shift;
|
||||
unsigned long addr = f->start;
|
||||
|
||||
addr = f->start;
|
||||
while (addr < f->end) {
|
||||
__flush_tlb_one_user(addr);
|
||||
addr += PAGE_SIZE;
|
||||
addr += 1UL << f->stride_shift;
|
||||
}
|
||||
if (local)
|
||||
count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages);
|
||||
trace_tlb_flush(reason, nr_pages);
|
||||
count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_invalidate);
|
||||
trace_tlb_flush(reason, nr_invalidate);
|
||||
} else {
|
||||
/* Full flush. */
|
||||
local_flush_tlb();
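
The new stride_shift field is what makes the partial flush above cheaper for huge-page ranges: instead of one invalidation per 4k page, the loop steps by the mapping size the caller announced. A userspace sketch of the arithmetic (the kernel computes the same count as (f->end - f->start) >> f->stride_shift):

#include <stdio.h>

#define PAGE_SHIFT	12
#define PMD_SHIFT	21	/* 2M mappings on x86-64 */

/* Count the invalidations a ranged flush issues for a given stride. */
static unsigned long nr_invalidate(unsigned long start, unsigned long end,
				   unsigned int stride_shift)
{
	unsigned long n = 0;

	for (unsigned long addr = start; addr < end; addr += 1UL << stride_shift)
		n++;
	return n;
}

int main(void)
{
	unsigned long start = 0x7f0000000000UL;
	unsigned long end = start + (4UL << 20);	/* 4M range */

	printf("%lu\n", nr_invalidate(start, end, PAGE_SHIFT));	/* 1024 4k invalidations */
	printf("%lu\n", nr_invalidate(start, end, PMD_SHIFT));	/* 2 2M invalidations    */
	return 0;
}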
|
||||
|
@ -571,6 +586,11 @@ static void flush_tlb_func_remote(void *info)
|
|||
flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
|
||||
}
|
||||
|
||||
static bool tlb_is_not_lazy(int cpu, void *data)
|
||||
{
|
||||
return !per_cpu(cpu_tlbstate.is_lazy, cpu);
|
||||
}
|
||||
|
||||
void native_flush_tlb_others(const struct cpumask *cpumask,
|
||||
const struct flush_tlb_info *info)
|
||||
{
|
||||
|
@ -606,8 +626,23 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
|
|||
(void *)info, 1);
|
||||
return;
|
||||
}
|
||||
smp_call_function_many(cpumask, flush_tlb_func_remote,
|
||||
|
||||
	/*
	 * If no page tables were freed, we can skip sending IPIs to
	 * CPUs in lazy TLB mode. They will flush the TLB themselves
	 * at the next context switch.
	 *
	 * However, if page tables are getting freed, we need to send the
	 * IPI everywhere, to prevent CPUs in lazy TLB mode from tripping
	 * up on the new contents of what used to be page tables, while
	 * doing a speculative memory access.
	 */
	if (info->freed_tables)
		smp_call_function_many(cpumask, flush_tlb_func_remote,
					(void *)info, 1);
	else
		on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func_remote,
				(void *)info, 1, GFP_ATOMIC, cpumask);
}
|
||||
|
||||
/*
|
||||
|
@ -623,12 +658,15 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
|
|||
static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
|
||||
|
||||
void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
|
||||
unsigned long end, unsigned long vmflag)
|
||||
unsigned long end, unsigned int stride_shift,
|
||||
bool freed_tables)
|
||||
{
|
||||
int cpu;
|
||||
|
||||
struct flush_tlb_info info __aligned(SMP_CACHE_BYTES) = {
|
||||
.mm = mm,
|
||||
.stride_shift = stride_shift,
|
||||
.freed_tables = freed_tables,
|
||||
};
|
||||
|
||||
cpu = get_cpu();
|
||||
|
@ -638,8 +676,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
|
|||
|
||||
/* Should we flush just the requested range? */
|
||||
if ((end != TLB_FLUSH_ALL) &&
|
||||
!(vmflag & VM_HUGETLB) &&
|
||||
((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) {
|
||||
((end - start) >> stride_shift) <= tlb_single_page_flush_ceiling) {
|
||||
info.start = start;
|
||||
info.end = end;
|
||||
} else {
|
||||
|
|
|
@ -22,6 +22,7 @@
|
|||
#include <linux/tick.h>
|
||||
#include <linux/nmi.h>
|
||||
#include <linux/cpuhotplug.h>
|
||||
#include <linux/stackprotector.h>
|
||||
|
||||
#include <asm/paravirt.h>
|
||||
#include <asm/desc.h>
|
||||
|
@ -88,6 +89,7 @@ static void cpu_bringup(void)
|
|||
asmlinkage __visible void cpu_bringup_and_idle(void)
|
||||
{
|
||||
cpu_bringup();
|
||||
boot_init_stack_canary();
|
||||
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
|
||||
}
|
||||
|
||||
|
|
|
@ -902,12 +902,22 @@ static bool copy_device_table(void)
|
|||
}
|
||||
}
|
||||
|
||||
old_devtb_phys = entry & PAGE_MASK;
|
||||
/*
|
||||
* When SME is enabled in the first kernel, the entry includes the
|
||||
* memory encryption mask(sme_me_mask), we must remove the memory
|
||||
* encryption mask to obtain the true physical address in kdump kernel.
|
||||
*/
|
||||
old_devtb_phys = __sme_clr(entry) & PAGE_MASK;
|
||||
|
||||
if (old_devtb_phys >= 0x100000000ULL) {
|
||||
pr_err("The address of old device table is above 4G, not trustworthy!\n");
|
||||
return false;
|
||||
}
|
||||
old_devtb = memremap(old_devtb_phys, dev_table_size, MEMREMAP_WB);
|
||||
old_devtb = (sme_active() && is_kdump_kernel())
|
||||
? (__force void *)ioremap_encrypted(old_devtb_phys,
|
||||
dev_table_size)
|
||||
: memremap(old_devtb_phys, dev_table_size, MEMREMAP_WB);
|
||||
|
||||
if (!old_devtb)
|
||||
return false;
|
||||
|
||||
|
|
|
@ -24,6 +24,8 @@
|
|||
#include <linux/vmalloc.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/uaccess.h>
|
||||
#include <linux/mem_encrypt.h>
|
||||
#include <asm/pgtable.h>
|
||||
#include <asm/io.h>
|
||||
#include "internal.h"
|
||||
|
||||
|
@ -98,7 +100,8 @@ static int pfn_is_ram(unsigned long pfn)
|
|||
|
||||
/* Reads a page from the oldmem device from given offset. */
|
||||
static ssize_t read_from_oldmem(char *buf, size_t count,
|
||||
u64 *ppos, int userbuf)
|
||||
u64 *ppos, int userbuf,
|
||||
bool encrypted)
|
||||
{
|
||||
unsigned long pfn, offset;
|
||||
size_t nr_bytes;
|
||||
|
@ -120,8 +123,15 @@ static ssize_t read_from_oldmem(char *buf, size_t count,
|
|||
if (pfn_is_ram(pfn) == 0)
|
||||
memset(buf, 0, nr_bytes);
|
||||
else {
|
||||
tmp = copy_oldmem_page(pfn, buf, nr_bytes,
|
||||
offset, userbuf);
|
||||
if (encrypted)
|
||||
tmp = copy_oldmem_page_encrypted(pfn, buf,
|
||||
nr_bytes,
|
||||
offset,
|
||||
userbuf);
|
||||
else
|
||||
tmp = copy_oldmem_page(pfn, buf, nr_bytes,
|
||||
offset, userbuf);
|
||||
|
||||
if (tmp < 0)
|
||||
return tmp;
|
||||
}
|
||||
|
@ -155,7 +165,7 @@ void __weak elfcorehdr_free(unsigned long long addr)
|
|||
*/
|
||||
ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos)
|
||||
{
|
||||
return read_from_oldmem(buf, count, ppos, 0);
|
||||
return read_from_oldmem(buf, count, ppos, 0, false);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -163,7 +173,7 @@ ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos)
|
|||
*/
|
||||
ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos)
|
||||
{
|
||||
return read_from_oldmem(buf, count, ppos, 0);
|
||||
return read_from_oldmem(buf, count, ppos, 0, sme_active());
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -173,9 +183,20 @@ int __weak remap_oldmem_pfn_range(struct vm_area_struct *vma,
|
|||
unsigned long from, unsigned long pfn,
|
||||
unsigned long size, pgprot_t prot)
|
||||
{
|
||||
prot = pgprot_encrypted(prot);
|
||||
return remap_pfn_range(vma, from, pfn, size, prot);
|
||||
}
|
||||
|
||||
/*
|
||||
* Architectures which support memory encryption override this.
|
||||
*/
|
||||
ssize_t __weak
|
||||
copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize,
|
||||
unsigned long offset, int userbuf)
|
||||
{
|
||||
return copy_oldmem_page(pfn, buf, csize, offset, userbuf);
|
||||
}
|
||||
|
||||
/*
|
||||
* Copy to either kernel or user space
|
||||
*/
|
||||
|
@ -351,7 +372,8 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos,
|
|||
m->offset + m->size - *fpos,
|
||||
buflen);
|
||||
start = m->paddr + *fpos - m->offset;
|
||||
tmp = read_from_oldmem(buffer, tsz, &start, userbuf);
|
||||
tmp = read_from_oldmem(buffer, tsz, &start,
|
||||
userbuf, sme_active());
|
||||
if (tmp < 0)
|
||||
return tmp;
|
||||
buflen -= tsz;
|
||||
|
|
|
@ -26,6 +26,10 @@ extern int remap_oldmem_pfn_range(struct vm_area_struct *vma,
|
|||
|
||||
extern ssize_t copy_oldmem_page(unsigned long, char *, size_t,
|
||||
unsigned long, int);
|
||||
extern ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf,
|
||||
size_t csize, unsigned long offset,
|
||||
int userbuf);
|
||||
|
||||
void vmcore_cleanup(void);
|
||||
|
||||
/* Architecture code defines this if there are other possible ELF
|
||||
|
|
|
@ -53,6 +53,10 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
|
|||
smp_call_func_t func, void *info, bool wait,
|
||||
gfp_t gfp_flags);
|
||||
|
||||
void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info),
|
||||
smp_call_func_t func, void *info, bool wait,
|
||||
gfp_t gfp_flags, const struct cpumask *mask);
|
||||
|
||||
int smp_call_function_single_async(int cpu, call_single_data_t *csd);
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
|
|
|
@ -471,6 +471,10 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
|
|||
}
|
||||
}
|
||||
|
||||
/* Ensure that these pages are decrypted if SME is enabled. */
|
||||
if (pages)
|
||||
arch_kexec_post_alloc_pages(page_address(pages), 1 << order, 0);
|
||||
|
||||
return pages;
|
||||
}
|
||||
|
||||
|
@ -867,6 +871,7 @@ static int kimage_load_crash_segment(struct kimage *image,
|
|||
result = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
arch_kexec_post_alloc_pages(page_address(page), 1, 0);
|
||||
ptr = kmap(page);
|
||||
ptr += maddr & ~PAGE_MASK;
|
||||
mchunk = min_t(size_t, mbytes,
|
||||
|
@ -884,6 +889,7 @@ static int kimage_load_crash_segment(struct kimage *image,
|
|||
result = copy_from_user(ptr, buf, uchunk);
|
||||
kexec_flush_icache_page(page);
|
||||
kunmap(page);
|
||||
arch_kexec_pre_free_pages(page_address(page), 1);
|
||||
if (result) {
|
||||
result = -EFAULT;
|
||||
goto out;
|
||||
|
|
|
@ -318,33 +318,34 @@ int release_resource(struct resource *old)
|
|||
|
||||
EXPORT_SYMBOL(release_resource);
|
||||
|
||||
/*
|
||||
* Finds the lowest iomem resource existing within [res->start.res->end).
|
||||
* The caller must specify res->start, res->end, res->flags, and optionally
|
||||
* desc. If found, returns 0, res is overwritten, if not found, returns -1.
|
||||
* This function walks the whole tree and not just first level children until
|
||||
* and unless first_level_children_only is true.
|
||||
/**
 * Finds the lowest iomem resource that covers part of [start..end]. The
 * caller must specify start, end, flags, and desc (which may be
 * IORES_DESC_NONE).
 *
 * If a resource is found, returns 0 and *res is overwritten with the part
 * of the resource that's within [start..end]; if none is found, returns
 * -1.
 *
 * This function walks the whole tree and not just first level children
 * unless @first_lvl is true.
 */
|
||||
static int find_next_iomem_res(struct resource *res, unsigned long desc,
|
||||
bool first_level_children_only)
|
||||
static int find_next_iomem_res(resource_size_t start, resource_size_t end,
|
||||
unsigned long flags, unsigned long desc,
|
||||
bool first_lvl, struct resource *res)
|
||||
{
|
||||
resource_size_t start, end;
|
||||
struct resource *p;
|
||||
bool sibling_only = false;
|
||||
|
||||
BUG_ON(!res);
|
||||
if (!res)
|
||||
return -EINVAL;
|
||||
|
||||
start = res->start;
|
||||
end = res->end;
|
||||
BUG_ON(start >= end);
|
||||
|
||||
if (first_level_children_only)
|
||||
sibling_only = true;
|
||||
if (start >= end)
|
||||
return -EINVAL;
|
||||
|
||||
read_lock(&resource_lock);
|
||||
|
||||
for (p = iomem_resource.child; p; p = next_resource(p, sibling_only)) {
|
||||
if ((p->flags & res->flags) != res->flags)
|
||||
for (p = iomem_resource.child; p; p = next_resource(p, first_lvl)) {
|
||||
if ((p->flags & flags) != flags)
|
||||
continue;
|
||||
if ((desc != IORES_DESC_NONE) && (desc != p->desc))
|
||||
continue;
|
||||
|
@ -352,45 +353,43 @@ static int find_next_iomem_res(struct resource *res, unsigned long desc,
|
|||
p = NULL;
|
||||
break;
|
||||
}
|
||||
if ((p->end >= start) && (p->start < end))
|
||||
if ((p->end >= start) && (p->start <= end))
|
||||
break;
|
||||
}
|
||||
|
||||
read_unlock(&resource_lock);
|
||||
if (!p)
|
||||
return -1;
|
||||
|
||||
/* copy data */
|
||||
if (res->start < p->start)
|
||||
res->start = p->start;
|
||||
if (res->end > p->end)
|
||||
res->end = p->end;
|
||||
res->start = max(start, p->start);
|
||||
res->end = min(end, p->end);
|
||||
res->flags = p->flags;
|
||||
res->desc = p->desc;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int __walk_iomem_res_desc(struct resource *res, unsigned long desc,
|
||||
bool first_level_children_only,
|
||||
void *arg,
|
||||
static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end,
|
||||
unsigned long flags, unsigned long desc,
|
||||
bool first_lvl, void *arg,
|
||||
int (*func)(struct resource *, void *))
|
||||
{
|
||||
u64 orig_end = res->end;
|
||||
struct resource res;
|
||||
int ret = -1;
|
||||
|
||||
while ((res->start < res->end) &&
|
||||
!find_next_iomem_res(res, desc, first_level_children_only)) {
|
||||
ret = (*func)(res, arg);
|
||||
while (start < end &&
|
||||
!find_next_iomem_res(start, end, flags, desc, first_lvl, &res)) {
|
||||
ret = (*func)(&res, arg);
|
||||
if (ret)
|
||||
break;
|
||||
|
||||
res->start = res->end + 1;
|
||||
res->end = orig_end;
|
||||
start = res.end + 1;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
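
The walk helpers now carry the cursor in plain start/end variables and rely on find_next_iomem_res() clamping its result to [start..end] inclusive, so advancing with start = res.end + 1 no longer skips a resource that ends exactly at the window boundary. A self-contained sketch of that clamp-and-advance iteration over a sorted toy resource list (inclusive ranges, as in struct resource):

#include <stdio.h>

struct res { unsigned long start, end; };

static const struct res resources[] = {
	{ 0x1000, 0x1fff },
	{ 0x3000, 0x5fff },
	{ 0x9000, 0x9fff },
};

/* Find the lowest resource overlapping [start..end], clamped to that window. */
static int find_next_res(unsigned long start, unsigned long end, struct res *out)
{
	for (unsigned long i = 0; i < sizeof(resources) / sizeof(resources[0]); i++) {
		const struct res *p = &resources[i];

		if (p->end >= start && p->start <= end) {
			out->start = p->start > start ? p->start : start;
			out->end = p->end < end ? p->end : end;
			return 0;
		}
	}
	return -1;
}

int main(void)
{
	unsigned long start = 0x1800, end = 0x9fff;
	struct res r;

	while (start < end && !find_next_res(start, end, &r)) {
		printf("[%#lx..%#lx]\n", r.start, r.end);
		start = r.end + 1;	/* continue right after the piece just visited */
	}
	return 0;
}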
|
||||
|
||||
/*
|
||||
/**
|
||||
* Walks through iomem resources and calls func() with matching resource
|
||||
* ranges. This walks through whole tree and not just first level children.
|
||||
* All the memory ranges which overlap start,end and also match flags and
|
||||
|
@ -407,13 +406,7 @@ static int __walk_iomem_res_desc(struct resource *res, unsigned long desc,
|
|||
int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start,
|
||||
u64 end, void *arg, int (*func)(struct resource *, void *))
|
||||
{
|
||||
struct resource res;
|
||||
|
||||
res.start = start;
|
||||
res.end = end;
|
||||
res.flags = flags;
|
||||
|
||||
return __walk_iomem_res_desc(&res, desc, false, arg, func);
|
||||
return __walk_iomem_res_desc(start, end, flags, desc, false, arg, func);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(walk_iomem_res_desc);
|
||||
|
||||
|
@ -425,15 +418,11 @@ EXPORT_SYMBOL_GPL(walk_iomem_res_desc);
|
|||
* ranges.
|
||||
*/
|
||||
int walk_system_ram_res(u64 start, u64 end, void *arg,
|
||||
int (*func)(struct resource *, void *))
|
||||
int (*func)(struct resource *, void *))
|
||||
{
|
||||
struct resource res;
|
||||
unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
|
||||
|
||||
res.start = start;
|
||||
res.end = end;
|
||||
res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
|
||||
|
||||
return __walk_iomem_res_desc(&res, IORES_DESC_NONE, true,
|
||||
return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, true,
|
||||
arg, func);
|
||||
}
|
||||
|
||||
|
@ -444,13 +433,9 @@ int walk_system_ram_res(u64 start, u64 end, void *arg,
|
|||
int walk_mem_res(u64 start, u64 end, void *arg,
|
||||
int (*func)(struct resource *, void *))
|
||||
{
|
||||
struct resource res;
|
||||
unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY;
|
||||
|
||||
res.start = start;
|
||||
res.end = end;
|
||||
res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
|
||||
|
||||
return __walk_iomem_res_desc(&res, IORES_DESC_NONE, true,
|
||||
return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, true,
|
||||
arg, func);
|
||||
}
|
||||
|
||||
|
@ -462,27 +447,27 @@ int walk_mem_res(u64 start, u64 end, void *arg,
|
|||
* It is to be used only for System RAM.
|
||||
*/
|
||||
int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
|
||||
void *arg, int (*func)(unsigned long, unsigned long, void *))
|
||||
void *arg, int (*func)(unsigned long, unsigned long, void *))
|
||||
{
|
||||
resource_size_t start, end;
|
||||
unsigned long flags;
|
||||
struct resource res;
|
||||
unsigned long pfn, end_pfn;
|
||||
u64 orig_end;
|
||||
int ret = -1;
|
||||
|
||||
res.start = (u64) start_pfn << PAGE_SHIFT;
|
||||
res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1;
|
||||
res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
|
||||
orig_end = res.end;
|
||||
while ((res.start < res.end) &&
|
||||
(find_next_iomem_res(&res, IORES_DESC_NONE, true) >= 0)) {
|
||||
start = (u64) start_pfn << PAGE_SHIFT;
|
||||
end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1;
|
||||
flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
|
||||
while (start < end &&
|
||||
!find_next_iomem_res(start, end, flags, IORES_DESC_NONE,
|
||||
true, &res)) {
|
||||
pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
|
||||
end_pfn = (res.end + 1) >> PAGE_SHIFT;
|
||||
if (end_pfn > pfn)
|
||||
ret = (*func)(pfn, end_pfn - pfn, arg);
|
||||
if (ret)
|
||||
break;
|
||||
res.start = res.end + 1;
|
||||
res.end = orig_end;
|
||||
start = res.end + 1;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
@ -658,8 +643,8 @@ static int find_resource(struct resource *root, struct resource *new,
|
|||
* @constraint: the size and alignment constraints to be met.
|
||||
*/
|
||||
static int reallocate_resource(struct resource *root, struct resource *old,
|
||||
resource_size_t newsize,
|
||||
struct resource_constraint *constraint)
|
||||
resource_size_t newsize,
|
||||
struct resource_constraint *constraint)
|
||||
{
|
||||
int err=0;
|
||||
struct resource new = *old;
|
||||
|
@ -972,7 +957,7 @@ skip:
|
|||
* Existing children of the resource are assumed to be immutable.
|
||||
*/
|
||||
int adjust_resource(struct resource *res, resource_size_t start,
|
||||
resource_size_t size)
|
||||
resource_size_t size)
|
||||
{
|
||||
int result;
|
||||
|
||||
|
@ -983,9 +968,9 @@ int adjust_resource(struct resource *res, resource_size_t start,
|
|||
}
|
||||
EXPORT_SYMBOL(adjust_resource);
|
||||
|
||||
static void __init __reserve_region_with_split(struct resource *root,
|
||||
resource_size_t start, resource_size_t end,
|
||||
const char *name)
|
||||
static void __init
|
||||
__reserve_region_with_split(struct resource *root, resource_size_t start,
|
||||
resource_size_t end, const char *name)
|
||||
{
|
||||
struct resource *parent = root;
|
||||
struct resource *conflict;
|
||||
|
@ -1044,9 +1029,9 @@ static void __init __reserve_region_with_split(struct resource *root,
|
|||
|
||||
}
|
||||
|
||||
void __init reserve_region_with_split(struct resource *root,
|
||||
resource_size_t start, resource_size_t end,
|
||||
const char *name)
|
||||
void __init
|
||||
reserve_region_with_split(struct resource *root, resource_size_t start,
|
||||
resource_size_t end, const char *name)
|
||||
{
|
||||
int abort = 0;
|
||||
|
||||
|
@ -1172,7 +1157,7 @@ EXPORT_SYMBOL(__request_region);
|
|||
* The described resource region must match a currently busy region.
|
||||
*/
|
||||
void __release_region(struct resource *parent, resource_size_t start,
|
||||
resource_size_t n)
|
||||
resource_size_t n)
|
||||
{
|
||||
struct resource **p;
|
||||
resource_size_t end;
|
||||
|
@ -1234,7 +1219,7 @@ EXPORT_SYMBOL(__release_region);
|
|||
* simplicity. Enhance this logic when necessary.
|
||||
*/
|
||||
int release_mem_region_adjustable(struct resource *parent,
|
||||
resource_size_t start, resource_size_t size)
|
||||
resource_size_t start, resource_size_t size)
|
||||
{
|
||||
struct resource **p;
|
||||
struct resource *res;
|
||||
|
@ -1410,9 +1395,9 @@ static int devm_region_match(struct device *dev, void *res, void *match_data)
|
|||
this->start == match->start && this->n == match->n;
|
||||
}
|
||||
|
||||
struct resource * __devm_request_region(struct device *dev,
|
||||
struct resource *parent, resource_size_t start,
|
||||
resource_size_t n, const char *name)
|
||||
struct resource *
|
||||
__devm_request_region(struct device *dev, struct resource *parent,
|
||||
resource_size_t start, resource_size_t n, const char *name)
|
||||
{
|
||||
struct region_devres *dr = NULL;
|
||||
struct resource *res;
|
||||
|
|
|
@ -347,21 +347,6 @@ EXPORT_SYMBOL_GPL(play_idle);
|
|||
|
||||
void cpu_startup_entry(enum cpuhp_state state)
|
||||
{
|
||||
/*
|
||||
* This #ifdef needs to die, but it's too late in the cycle to
|
||||
* make this generic (ARM and SH have never invoked the canary
|
||||
* init for the non boot CPUs!). Will be fixed in 3.11
|
||||
*/
|
||||
#ifdef CONFIG_X86
|
||||
/*
|
||||
* If we're the non-boot CPU, nothing set the stack canary up
|
||||
* for us. The boot CPU already has it initialized but no harm
|
||||
* in doing it again. This is a good place for updating it, as
|
||||
* we wont ever return from this function (so the invalid
|
||||
* canaries already on the stack wont ever trigger).
|
||||
*/
|
||||
boot_init_stack_canary();
|
||||
#endif
|
||||
arch_cpu_idle_prepare();
|
||||
cpuhp_online_idle(state);
|
||||
while (1)
|
||||
|
|
|
@ -56,7 +56,6 @@
|
|||
#include <linux/profile.h>
|
||||
#include <linux/rcupdate_wait.h>
|
||||
#include <linux/security.h>
|
||||
#include <linux/stackprotector.h>
|
||||
#include <linux/stop_machine.h>
|
||||
#include <linux/suspend.h>
|
||||
#include <linux/swait.h>
|
||||
|
|
kernel/smp.c
|
@ -669,9 +669,9 @@ EXPORT_SYMBOL(on_each_cpu_mask);
|
|||
* You must not call this function with disabled interrupts or
|
||||
* from a hardware interrupt handler or from a bottom half handler.
|
||||
*/
|
||||
void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
|
||||
void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info),
|
||||
smp_call_func_t func, void *info, bool wait,
|
||||
gfp_t gfp_flags)
|
||||
gfp_t gfp_flags, const struct cpumask *mask)
|
||||
{
|
||||
cpumask_var_t cpus;
|
||||
int cpu, ret;
|
||||
|
@ -680,9 +680,9 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
|
|||
|
||||
if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) {
|
||||
preempt_disable();
|
||||
for_each_online_cpu(cpu)
|
||||
for_each_cpu(cpu, mask)
|
||||
if (cond_func(cpu, info))
|
||||
cpumask_set_cpu(cpu, cpus);
|
||||
__cpumask_set_cpu(cpu, cpus);
|
||||
on_each_cpu_mask(cpus, func, info, wait);
|
||||
preempt_enable();
|
||||
free_cpumask_var(cpus);
|
||||
|
@ -692,7 +692,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
|
|||
* just have to IPI them one by one.
|
||||
*/
|
||||
preempt_disable();
|
||||
for_each_online_cpu(cpu)
|
||||
for_each_cpu(cpu, mask)
|
||||
if (cond_func(cpu, info)) {
|
||||
ret = smp_call_function_single(cpu, func,
|
||||
info, wait);
|
||||
|
@ -701,6 +701,15 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
|
|||
preempt_enable();
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL(on_each_cpu_cond_mask);

void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
			smp_call_func_t func, void *info, bool wait,
			gfp_t gfp_flags)
{
	on_each_cpu_cond_mask(cond_func, func, info, wait, gfp_flags,
			      cpu_online_mask);
}
EXPORT_SYMBOL(on_each_cpu_cond);
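
A rough userspace analogue of what the new mask variant buys its callers: the condition function filters the supplied mask down to the CPUs that actually need the work, which is exactly how the TLB code skips lazy-mode CPUs. The signatures here are simplified (no wait or gfp_flags) and plain bool arrays stand in for cpumasks:

#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 8

static bool is_lazy[NR_CPUS] = { [2] = true, [5] = true };

static bool tlb_is_not_lazy(int cpu, void *info)
{
	return !is_lazy[cpu];
}

static void flush_tlb_func_remote(int cpu, void *info)
{
	printf("IPI to cpu %d\n", cpu);
}

/* Call func() on every CPU in 'mask' for which cond_func() returns true. */
static void each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info),
			       void (*func)(int cpu, void *info),
			       void *info, const bool *mask)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		if (mask[cpu] && cond_func(cpu, info))
			func(cpu, info);
}

int main(void)
{
	bool mask[NR_CPUS] = { [1] = true, [2] = true, [5] = true, [6] = true };

	/* CPUs 2 and 5 are lazy and are skipped; 1 and 6 get the call. */
	each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func_remote, NULL, mask);
	return 0;
}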
|
||||
|
||||
static void do_nothing(void *unused)
|
||||
|
|
kernel/up.c
|
@ -68,9 +68,9 @@ EXPORT_SYMBOL(on_each_cpu_mask);
|
|||
* Preemption is disabled here to make sure the cond_func is called under the
|
||||
* same conditions in UP and SMP.
|
||||
*/
|
||||
void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
|
||||
smp_call_func_t func, void *info, bool wait,
|
||||
gfp_t gfp_flags)
|
||||
void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info),
|
||||
smp_call_func_t func, void *info, bool wait,
|
||||
gfp_t gfp_flags, const struct cpumask *mask)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
|
@ -82,6 +82,14 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
|
|||
}
|
||||
preempt_enable();
|
||||
}
|
||||
EXPORT_SYMBOL(on_each_cpu_cond_mask);
|
||||
|
||||
void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
|
||||
smp_call_func_t func, void *info, bool wait,
|
||||
gfp_t gfp_flags)
|
||||
{
|
||||
on_each_cpu_cond_mask(cond_func, func, info, wait, gfp_flags, NULL);
|
||||
}
|
||||
EXPORT_SYMBOL(on_each_cpu_cond);
|
||||
|
||||
int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys)
|
||||
|
|
|
@ -8,6 +8,7 @@
|
|||
*/
|
||||
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/hugetlb.h>
|
||||
#include <asm/tlb.h>
|
||||
#include <asm-generic/pgtable.h>
|
||||
|
||||
|
|