2016-04-29 21:25:58 +08:00
|
|
|
/*
|
|
|
|
* Page table handling routines for radix page table.
|
|
|
|
*
|
|
|
|
* Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
|
|
|
|
*
|
|
|
|
* This program is free software; you can redistribute it and/or
|
|
|
|
* modify it under the terms of the GNU General Public License
|
|
|
|
* as published by the Free Software Foundation; either version
|
|
|
|
* 2 of the License, or (at your option) any later version.
|
|
|
|
*/
|
2017-08-30 15:41:29 +08:00
|
|
|
|
|
|
|
#define pr_fmt(fmt) "radix-mmu: " fmt
|
|
|
|
|
|
|
|
#include <linux/kernel.h>
|
2017-02-04 07:16:44 +08:00
|
|
|
#include <linux/sched/mm.h>
|
2016-04-29 21:25:58 +08:00
|
|
|
#include <linux/memblock.h>
|
|
|
|
#include <linux/of_fdt.h>
|
2017-06-29 01:04:09 +08:00
|
|
|
#include <linux/mm.h>
|
2017-08-30 15:41:17 +08:00
|
|
|
#include <linux/string_helpers.h>
|
2016-04-29 21:25:58 +08:00
|
|
|
|
|
|
|
#include <asm/pgtable.h>
|
|
|
|
#include <asm/pgalloc.h>
|
|
|
|
#include <asm/dma.h>
|
|
|
|
#include <asm/machdep.h>
|
|
|
|
#include <asm/mmu.h>
|
|
|
|
#include <asm/firmware.h>
|
2016-12-14 10:36:51 +08:00
|
|
|
#include <asm/powernv.h>
|
2017-06-06 13:48:57 +08:00
|
|
|
#include <asm/sections.h>
|
2017-04-11 13:23:25 +08:00
|
|
|
#include <asm/trace.h>
|
2016-04-29 21:25:58 +08:00
|
|
|
|
2016-04-29 21:26:30 +08:00
|
|
|
#include <trace/events/thp.h>
|
|
|
|
|
powerpc/mm/radix: Workaround prefetch issue with KVM
There's a somewhat architectural issue with Radix MMU and KVM.
When coming out of a guest with AIL (Alternate Interrupt Location, ie,
MMU enabled), we start executing hypervisor code with the PID register
still containing whatever the guest has been using.
The problem is that the CPU can (and will) then start prefetching or
speculatively load from whatever host context has that same PID (if
any), thus bringing translations for that context into the TLB, which
Linux doesn't know about.
This can cause stale translations and subsequent crashes.
Fixing this in a way that is neither racy nor a huge performance
impact is difficult. We could just make the host invalidations always
use broadcast forms but that would hurt single threaded programs for
example.
We chose to fix it instead by partitioning the PID space between guest
and host. This is possible because today Linux only uses 19 out of the
20 bits of PID space, so existing guests will work if we make the host
use the top half of the 20 bits space.
We additionally add support for a property to indicate to Linux the
size of the PID register which will be useful if we eventually have
processors with a larger PID space available.
There is still an issue with malicious guests purposefully setting the
PID register to a value in the hosts PID range. Hopefully future HW
can prevent that, but in the meantime, we handle it with a pair of
kludges:
- On the way out of a guest, before we clear the current VCPU in the
PACA, we check the PID and if it's outside of the permitted range
we flush the TLB for that PID.
- When context switching, if the mm is "new" on that CPU (the
corresponding bit was set for the first time in the mm cpumask), we
check if any sibling thread is in KVM (has a non-NULL VCPU pointer
in the PACA). If that is the case, we also flush the PID for that
CPU (core).
This second part is needed to handle the case where a process is
migrated (or starts a new pthread) on a sibling thread of the CPU
coming out of KVM, as there's a window where stale translations can
exist before we detect it and flush them out.
A future optimization could be added by keeping track of whether the
PID has ever been used and avoid doing that for completely fresh PIDs.
We could similarly mark PIDs that have been the subject of a global
invalidation as "fresh". But for now this will do.
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
[mpe: Rework the asm to build with CONFIG_PPC_RADIX_MMU=n, drop
unneeded include of kvm_book3s_asm.h]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2017-07-24 12:26:06 +08:00
|
|
|
/*
 * Size of the MMU PID register in bits.  Taken from the "ibm,mmu-pid-bits"
 * device-tree property when a cpu node provides it; radix_init_pgtable()
 * fills in a default (20 in HV mode, 19 otherwise) if it is still zero.
 */
unsigned int mmu_pid_bits;
|
|
|
|
/*
 * Base of the PID range, set by radix_init_pgtable(): 1, or
 * 1 << (mmu_pid_bits - 1) in HV mode when KVM HV is possible so the host
 * stays in the top half of the PID space.
 */
unsigned int mmu_base_pid;
|
|
|
|
|
2016-07-13 17:35:28 +08:00
|
|
|
static int native_register_process_table(unsigned long base, unsigned long pg_sz,
|
|
|
|
unsigned long table_size)
|
2016-04-29 21:25:58 +08:00
|
|
|
{
|
2017-08-03 12:15:51 +08:00
|
|
|
unsigned long patb0, patb1;
|
|
|
|
|
|
|
|
patb0 = be64_to_cpu(partition_tb[0].patb0);
|
|
|
|
patb1 = base | table_size | PATB_GR;
|
|
|
|
|
|
|
|
mmu_partition_table_set_entry(0, patb0, patb1);
|
2016-07-13 17:35:28 +08:00
|
|
|
|
2016-04-29 21:25:58 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2018-02-13 23:08:24 +08:00
|
|
|
static __ref void *early_alloc_pgtable(unsigned long size, int nid,
|
|
|
|
unsigned long region_start, unsigned long region_end)
|
2016-04-29 21:25:58 +08:00
|
|
|
{
|
2018-02-13 23:08:24 +08:00
|
|
|
unsigned long pa = 0;
|
2016-04-29 21:25:58 +08:00
|
|
|
void *pt;
|
|
|
|
|
2018-02-13 23:08:24 +08:00
|
|
|
if (region_start || region_end) /* has region hint */
|
|
|
|
pa = memblock_alloc_range(size, size, region_start, region_end,
|
|
|
|
MEMBLOCK_NONE);
|
|
|
|
else if (nid != -1) /* has node hint */
|
|
|
|
pa = memblock_alloc_base_nid(size, size,
|
|
|
|
MEMBLOCK_ALLOC_ANYWHERE,
|
|
|
|
nid, MEMBLOCK_NONE);
|
|
|
|
|
|
|
|
if (!pa)
|
|
|
|
pa = memblock_alloc_base(size, size, MEMBLOCK_ALLOC_ANYWHERE);
|
|
|
|
|
|
|
|
BUG_ON(!pa);
|
|
|
|
|
|
|
|
pt = __va(pa);
|
2016-04-29 21:25:58 +08:00
|
|
|
memset(pt, 0, size);
|
|
|
|
|
|
|
|
return pt;
|
|
|
|
}
|
|
|
|
|
2018-02-13 23:08:23 +08:00
|
|
|
static int early_map_kernel_page(unsigned long ea, unsigned long pa,
|
|
|
|
pgprot_t flags,
|
2018-02-13 23:08:24 +08:00
|
|
|
unsigned int map_page_size,
|
|
|
|
int nid,
|
|
|
|
unsigned long region_start, unsigned long region_end)
|
2018-02-13 23:08:23 +08:00
|
|
|
{
|
2018-02-13 23:08:24 +08:00
|
|
|
unsigned long pfn = pa >> PAGE_SHIFT;
|
2018-02-13 23:08:23 +08:00
|
|
|
pgd_t *pgdp;
|
|
|
|
pud_t *pudp;
|
|
|
|
pmd_t *pmdp;
|
|
|
|
pte_t *ptep;
|
|
|
|
|
|
|
|
pgdp = pgd_offset_k(ea);
|
|
|
|
if (pgd_none(*pgdp)) {
|
2018-02-13 23:08:24 +08:00
|
|
|
pudp = early_alloc_pgtable(PUD_TABLE_SIZE, nid,
|
|
|
|
region_start, region_end);
|
2018-02-13 23:08:23 +08:00
|
|
|
pgd_populate(&init_mm, pgdp, pudp);
|
|
|
|
}
|
|
|
|
pudp = pud_offset(pgdp, ea);
|
|
|
|
if (map_page_size == PUD_SIZE) {
|
|
|
|
ptep = (pte_t *)pudp;
|
|
|
|
goto set_the_pte;
|
|
|
|
}
|
|
|
|
if (pud_none(*pudp)) {
|
2018-02-13 23:08:24 +08:00
|
|
|
pmdp = early_alloc_pgtable(PMD_TABLE_SIZE, nid,
|
|
|
|
region_start, region_end);
|
2018-02-13 23:08:23 +08:00
|
|
|
pud_populate(&init_mm, pudp, pmdp);
|
|
|
|
}
|
|
|
|
pmdp = pmd_offset(pudp, ea);
|
|
|
|
if (map_page_size == PMD_SIZE) {
|
|
|
|
ptep = pmdp_ptep(pmdp);
|
|
|
|
goto set_the_pte;
|
|
|
|
}
|
|
|
|
if (!pmd_present(*pmdp)) {
|
2018-02-13 23:08:24 +08:00
|
|
|
ptep = early_alloc_pgtable(PAGE_SIZE, nid,
|
|
|
|
region_start, region_end);
|
2018-02-13 23:08:23 +08:00
|
|
|
pmd_populate_kernel(&init_mm, pmdp, ptep);
|
|
|
|
}
|
|
|
|
ptep = pte_offset_kernel(pmdp, ea);
|
|
|
|
|
|
|
|
set_the_pte:
|
2018-02-13 23:08:24 +08:00
|
|
|
set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
|
2018-02-13 23:08:23 +08:00
|
|
|
smp_wmb();
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2018-02-13 23:08:24 +08:00
|
|
|
/*
|
|
|
|
* nid, region_start, and region_end are hints to try to place the page
|
|
|
|
* table memory in the same node or region.
|
|
|
|
*/
|
|
|
|
static int __map_kernel_page(unsigned long ea, unsigned long pa,
|
2016-04-29 21:25:58 +08:00
|
|
|
pgprot_t flags,
|
2018-02-13 23:08:24 +08:00
|
|
|
unsigned int map_page_size,
|
|
|
|
int nid,
|
|
|
|
unsigned long region_start, unsigned long region_end)
|
2016-04-29 21:25:58 +08:00
|
|
|
{
|
2018-02-13 23:08:24 +08:00
|
|
|
unsigned long pfn = pa >> PAGE_SHIFT;
|
2016-04-29 21:25:58 +08:00
|
|
|
pgd_t *pgdp;
|
|
|
|
pud_t *pudp;
|
|
|
|
pmd_t *pmdp;
|
|
|
|
pte_t *ptep;
|
|
|
|
/*
|
|
|
|
* Make sure task size is correct as per the max adddr
|
|
|
|
*/
|
|
|
|
BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE);
|
2018-02-13 23:08:23 +08:00
|
|
|
|
2018-02-13 23:08:24 +08:00
|
|
|
if (unlikely(!slab_is_available()))
|
|
|
|
return early_map_kernel_page(ea, pa, flags, map_page_size,
|
|
|
|
nid, region_start, region_end);
|
2018-02-13 23:08:23 +08:00
|
|
|
|
2018-02-13 23:08:24 +08:00
|
|
|
/*
|
|
|
|
* Should make page table allocation functions be able to take a
|
|
|
|
* node, so we can place kernel page tables on the right nodes after
|
|
|
|
* boot.
|
|
|
|
*/
|
2018-02-13 23:08:23 +08:00
|
|
|
pgdp = pgd_offset_k(ea);
|
|
|
|
pudp = pud_alloc(&init_mm, pgdp, ea);
|
|
|
|
if (!pudp)
|
|
|
|
return -ENOMEM;
|
|
|
|
if (map_page_size == PUD_SIZE) {
|
|
|
|
ptep = (pte_t *)pudp;
|
|
|
|
goto set_the_pte;
|
|
|
|
}
|
|
|
|
pmdp = pmd_alloc(&init_mm, pudp, ea);
|
|
|
|
if (!pmdp)
|
|
|
|
return -ENOMEM;
|
|
|
|
if (map_page_size == PMD_SIZE) {
|
|
|
|
ptep = pmdp_ptep(pmdp);
|
|
|
|
goto set_the_pte;
|
2016-04-29 21:25:58 +08:00
|
|
|
}
|
2018-02-13 23:08:23 +08:00
|
|
|
ptep = pte_alloc_kernel(pmdp, ea);
|
|
|
|
if (!ptep)
|
|
|
|
return -ENOMEM;
|
2016-04-29 21:25:58 +08:00
|
|
|
|
|
|
|
set_the_pte:
|
2018-02-13 23:08:24 +08:00
|
|
|
set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
|
2016-04-29 21:25:58 +08:00
|
|
|
smp_wmb();
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2018-02-13 23:08:24 +08:00
|
|
|
static int __map_kernel_page_nid(unsigned long ea, unsigned long pa,
|
|
|
|
pgprot_t flags,
|
|
|
|
unsigned int map_page_size, int nid)
|
|
|
|
{
|
|
|
|
return __map_kernel_page(ea, pa, flags, map_page_size, nid, 0, 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
int radix__map_kernel_page(unsigned long ea, unsigned long pa,
|
|
|
|
pgprot_t flags,
|
|
|
|
unsigned int map_page_size)
|
|
|
|
{
|
|
|
|
return __map_kernel_page(ea, pa, flags, map_page_size, -1, 0, 0);
|
|
|
|
}
|
|
|
|
|
2017-06-29 01:04:09 +08:00
|
|
|
#ifdef CONFIG_STRICT_KERNEL_RWX
|
2017-07-14 14:51:21 +08:00
|
|
|
void radix__change_memory_range(unsigned long start, unsigned long end,
|
|
|
|
unsigned long clear)
|
2017-06-29 01:04:09 +08:00
|
|
|
{
|
|
|
|
unsigned long idx;
|
|
|
|
pgd_t *pgdp;
|
|
|
|
pud_t *pudp;
|
|
|
|
pmd_t *pmdp;
|
|
|
|
pte_t *ptep;
|
|
|
|
|
|
|
|
start = ALIGN_DOWN(start, PAGE_SIZE);
|
|
|
|
end = PAGE_ALIGN(end); // aligns up
|
|
|
|
|
2017-07-14 14:51:21 +08:00
|
|
|
pr_debug("Changing flags on range %lx-%lx removing 0x%lx\n",
|
|
|
|
start, end, clear);
|
2017-06-29 01:04:09 +08:00
|
|
|
|
|
|
|
for (idx = start; idx < end; idx += PAGE_SIZE) {
|
|
|
|
pgdp = pgd_offset_k(idx);
|
|
|
|
pudp = pud_alloc(&init_mm, pgdp, idx);
|
|
|
|
if (!pudp)
|
|
|
|
continue;
|
|
|
|
if (pud_huge(*pudp)) {
|
|
|
|
ptep = (pte_t *)pudp;
|
|
|
|
goto update_the_pte;
|
|
|
|
}
|
|
|
|
pmdp = pmd_alloc(&init_mm, pudp, idx);
|
|
|
|
if (!pmdp)
|
|
|
|
continue;
|
|
|
|
if (pmd_huge(*pmdp)) {
|
|
|
|
ptep = pmdp_ptep(pmdp);
|
|
|
|
goto update_the_pte;
|
|
|
|
}
|
|
|
|
ptep = pte_alloc_kernel(pmdp, idx);
|
|
|
|
if (!ptep)
|
|
|
|
continue;
|
|
|
|
update_the_pte:
|
2017-07-14 14:51:21 +08:00
|
|
|
radix__pte_update(&init_mm, idx, ptep, clear, 0, 0);
|
2017-06-29 01:04:09 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
radix__flush_tlb_kernel_range(start, end);
|
|
|
|
}
|
2017-07-14 14:51:21 +08:00
|
|
|
|
|
|
|
void radix__mark_rodata_ro(void)
|
|
|
|
{
|
|
|
|
unsigned long start, end;
|
|
|
|
|
powerpc/mm/radix: Fix crashes on Power9 DD1 with radix MMU and STRICT_RWX
When using the radix MMU on Power9 DD1, to work around a hardware
problem, radix__pte_update() is required to do a two stage update of
the PTE. First we write a zero value into the PTE, then we flush the
TLB, and then we write the new PTE value.
In the normal case that works OK, but it does not work if we're
updating the PTE that maps the code we're executing, because the
mapping is removed by the TLB flush and we can no longer execute from
it. Unfortunately the STRICT_RWX code needs to do exactly that.
The exact symptoms when we hit this case vary, sometimes we print an
oops and then get stuck after that, but I've also seen a machine just
get stuck continually page faulting with no oops printed. The variance
is presumably due to the exact layout of the text and the page size
used for the mappings. In all cases we are unable to boot to a shell.
There are possible solutions such as creating a second mapping of the
TLB flush code, executing from that, and then jumping back to the
original. However we don't want to add that level of complexity for a
DD1 work around.
So just detect that we're running on Power9 DD1 and refrain from
changing the permissions, effectively disabling STRICT_RWX on Power9
DD1.
Fixes: 7614ff3272a1 ("powerpc/mm/radix: Implement STRICT_RWX/mark_rodata_ro() for Radix")
Cc: stable@vger.kernel.org # v4.13+
Reported-by: Andrew Jeffery <andrew@aj.id.au>
[Changelog as suggested by Michael Ellerman <mpe@ellerman.id.au>]
Signed-off-by: Balbir Singh <bsingharora@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2017-10-16 13:21:35 +08:00
|
|
|
/*
|
|
|
|
* mark_rodata_ro() will mark itself as !writable at some point.
|
|
|
|
* Due to DD1 workaround in radix__pte_update(), we'll end up with
|
|
|
|
* an invalid pte and the system will crash quite severly.
|
|
|
|
*/
|
|
|
|
if (cpu_has_feature(CPU_FTR_POWER9_DD1)) {
|
|
|
|
pr_warn("Warning: Unable to mark rodata read only on P9 DD1\n");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2017-07-14 14:51:21 +08:00
|
|
|
start = (unsigned long)_stext;
|
|
|
|
end = (unsigned long)__init_begin;
|
|
|
|
|
|
|
|
radix__change_memory_range(start, end, _PAGE_WRITE);
|
|
|
|
}
|
2017-07-14 14:51:23 +08:00
|
|
|
|
|
|
|
void radix__mark_initmem_nx(void)
|
|
|
|
{
|
|
|
|
unsigned long start = (unsigned long)__init_begin;
|
|
|
|
unsigned long end = (unsigned long)__init_end;
|
|
|
|
|
|
|
|
radix__change_memory_range(start, end, _PAGE_EXEC);
|
|
|
|
}
|
2017-06-29 01:04:09 +08:00
|
|
|
#endif /* CONFIG_STRICT_KERNEL_RWX */
|
|
|
|
|
2017-01-17 03:07:43 +08:00
|
|
|
/*
 * Log that [@start, @end) was mapped with pages of @size bytes; the size
 * is printed in binary units (STRING_UNITS_2).  Nothing is printed for an
 * empty or inverted range.
 */
static inline void __meminit print_mapping(unsigned long start,
					   unsigned long end,
					   unsigned long size)
{
	char size_str[10];

	if (start < end) {
		string_get_size(size, 1, STRING_UNITS_2,
				size_str, sizeof(size_str));

		pr_info("Mapped 0x%016lx-0x%016lx with %s pages\n",
			start, end, size_str);
	}
}
|
|
|
|
|
|
|
|
/*
 * Create the linear mapping for physical range [@start, @end), with @nid
 * passed on as a placement hint for page-table memory.
 *
 * Each step picks the largest page size (PUD, then PMD, then base page)
 * that the current address alignment, the remaining length and the
 * advertised MMU page sizes allow.  With CONFIG_STRICT_KERNEL_RWX, a
 * candidate PUD or PMD mapping that satisfies
 *	addr <= __pa(__init_begin) && addr + size >= __pa(_stext)
 * is demoted, ending up as base pages.
 *
 * @start is reused as the start of the current run of equally sized
 * mappings: a line is printed whenever the size changes, and the same
 * value is passed to __map_kernel_page() as the region_start hint.
 *
 * Returns 0 on success or the first error from __map_kernel_page().
 */
static int __meminit create_physical_mapping(unsigned long start,
					     unsigned long end,
					     int nid)
{
	unsigned long vaddr, addr, mapping_size = 0;
	unsigned long max_mapping_size;
	pgprot_t prot;
#ifdef CONFIG_STRICT_KERNEL_RWX
	int split_text_mapping = 1;
#else
	int split_text_mapping = 0;
#endif

	start = _ALIGN_UP(start, PAGE_SIZE);
	for (addr = start; addr < end; addr += mapping_size) {
		unsigned long remaining = end - addr;
		unsigned long prev_size = mapping_size;
		int rc;

		max_mapping_size = PUD_SIZE;

		/*
		 * Choose a size; if a PUD-sized choice has to be split,
		 * cap the maximum at PMD_SIZE and choose again.
		 */
		for (;;) {
			if (IS_ALIGNED(addr, PUD_SIZE) && remaining >= PUD_SIZE &&
			    mmu_psize_defs[MMU_PAGE_1G].shift &&
			    PUD_SIZE <= max_mapping_size)
				mapping_size = PUD_SIZE;
			else if (IS_ALIGNED(addr, PMD_SIZE) && remaining >= PMD_SIZE &&
				 mmu_psize_defs[MMU_PAGE_2M].shift)
				mapping_size = PMD_SIZE;
			else
				mapping_size = PAGE_SIZE;

			if (!(split_text_mapping && (mapping_size == PUD_SIZE) &&
			      (addr <= __pa_symbol(__init_begin)) &&
			      (addr + mapping_size) >= __pa_symbol(_stext)))
				break;

			max_mapping_size = PMD_SIZE;
		}

		if (split_text_mapping && (mapping_size == PMD_SIZE) &&
		    (addr <= __pa_symbol(__init_begin)) &&
		    (addr + mapping_size) >= __pa_symbol(_stext))
			mapping_size = PAGE_SIZE;

		/* Page size changed: report the run that just ended. */
		if (mapping_size != prev_size) {
			print_mapping(start, addr, prev_size);
			start = addr;
		}

		vaddr = (unsigned long)__va(addr);

		/* Only text needs to be executable. */
		if (overlaps_kernel_text(vaddr, vaddr + mapping_size) ||
		    overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size))
			prot = PAGE_KERNEL_X;
		else
			prot = PAGE_KERNEL;

		rc = __map_kernel_page(vaddr, addr, prot, mapping_size,
				       nid, start, end);
		if (rc)
			return rc;
	}

	print_mapping(start, addr, mapping_size);
	return 0;
}
|
|
|
|
|
2018-02-13 23:08:24 +08:00
|
|
|
/*
 * Boot-time setup of the radix page tables:
 *   - create the linear mapping for every memblock memory region;
 *   - settle mmu_pid_bits (if not already set) and mmu_base_pid;
 *   - allocate the process table, point its first entry at the kernel
 *     page directory and register it;
 *   - issue a TLB invalidation.
 */
void __init radix_init_pgtable(void)
{
	unsigned long rts;
	struct memblock_region *reg;
	int hv_mode;

	/* We don't support slb for radix */
	mmu_slb_size = 0;

	/*
	 * Create the linear mapping, using standard page size for now
	 */
	for_each_memblock(memory, reg) {
		/*
		 * The memblock allocator is up at this point, so the
		 * page tables will be allocated within the range. No
		 * need for a node (which we don't have yet).
		 */
		WARN_ON(create_physical_mapping(reg->base,
						reg->base + reg->size,
						-1));
	}

	/*
	 * Find out how many PID bits are supported.  A guest (non-HV)
	 * uses the bottom half of the PID space, hence the smaller
	 * default.
	 */
	hv_mode = cpu_has_feature(CPU_FTR_HVMODE);
	if (!mmu_pid_bits)
		mmu_pid_bits = hv_mode ? 20 : 19;

#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
	/*
	 * When KVM is possible, we only use the top half of the
	 * PID space to avoid collisions between host and guest PIDs
	 * which can cause problems due to prefetch when exiting the
	 * guest with AIL=3
	 */
	mmu_base_pid = hv_mode ? (1 << (mmu_pid_bits - 1)) : 1;
#else
	mmu_base_pid = 1;
#endif

	/*
	 * Allocate Partition table and process table for the
	 * host.
	 */
	BUG_ON(PRTB_SIZE_SHIFT > 36);
	process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT, -1, 0, 0);

	/*
	 * Fill in the process table.
	 */
	rts = radix__get_tree_size();
	process_tb->prtb0 = cpu_to_be64(rts | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE);

	/*
	 * Fill in the partition table. We are supposed to use the effective
	 * address of the process table here. But our linear mapping also
	 * enables us to use the physical address here.
	 */
	register_process_table(__pa(process_tb), 0, PRTB_SIZE_SHIFT - 12);
	pr_info("Process table %p and radix root for kernel: %p\n", process_tb, init_mm.pgd);

	asm volatile("ptesync" : : : "memory");
	asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : :
		     "r" (TLBIEL_INVAL_SET_LPID), "r" (0));
	asm volatile("eieio; tlbsync; ptesync" : : : "memory");
	trace_tlbie(0, 0, TLBIEL_INVAL_SET_LPID, 0, 2, 1, 1);
}
|
|
|
|
|
|
|
|
/*
 * Initialise the partition table and install entry 0: its first
 * doubleword combines the radix tree size field, the physical address of
 * the kernel page directory, RADIX_PGD_INDEX_SIZE and PATB_HR; its second
 * doubleword is 0.
 */
static void __init radix_init_partition_table(void)
{
	unsigned long dw0;

	mmu_partition_table_init();

	dw0 = radix__get_tree_size();
	dw0 |= __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR;
	mmu_partition_table_set_entry(0, dw0, 0);

	pr_info("Initializing Radix MMU\n");
	pr_info("Partition table %p\n", partition_tb);
}
|
|
|
|
|
|
|
|
/*
 * Select the native implementation for registering the process table by
 * pointing the register_process_table hook at
 * native_register_process_table().
 */
void __init radix_init_native(void)
{
	register_process_table = native_register_process_table;
}
|
|
|
|
|
|
|
|
/*
 * Translate a page-size shift into its MMU_PAGE_* index.
 *
 * Recognised shifts are 12 (4K), 16 (64K), 21 (2M) and 30 (1G).
 * Returns -1 for any other value.
 */
static int __init get_idx_from_shift(unsigned int shift)
{
	switch (shift) {
	case 0xc:
		return MMU_PAGE_4K;
	case 0x10:
		return MMU_PAGE_64K;
	case 0x15:
		return MMU_PAGE_2M;
	case 0x1e:
		return MMU_PAGE_1G;
	default:
		return -1;
	}
}
|
|
|
|
|
|
|
|
/*
 * Flat device-tree scan callback: pick up the MMU PID width and the radix
 * page-size encodings from a "cpu" node.
 *
 * @node:  flat device-tree node being visited
 * @uname, @depth, @data: unused
 *
 * For a cpu node:
 *   - "ibm,mmu-pid-bits" (exactly one 4-byte cell) sets mmu_pid_bits;
 *   - every 4-byte cell of "ibm,processor-radix-AP-encodings" is split
 *     into an AP value (top 3 bits) and a page shift (remaining 29 bits);
 *     shifts known to get_idx_from_shift() are stored in mmu_psize_defs[],
 *     others are only logged.
 *
 * Returns 1 once a cpu node carrying the encodings property has been
 * processed, and 0 for non-cpu nodes or cpu nodes without that property.
 */
static int __init radix_dt_scan_page_sizes(unsigned long node,
					   const char *uname, int depth,
					   void *data)
{
	int size = 0;
	int shift, idx;
	unsigned int ap;
	const __be32 *prop;
	const char *type = of_get_flat_dt_prop(node, "device_type", NULL);

	/* We are scanning "cpu" nodes only */
	if (type == NULL || strcmp(type, "cpu") != 0)
		return 0;

	/* Find MMU PID size */
	prop = of_get_flat_dt_prop(node, "ibm,mmu-pid-bits", &size);
	if (prop && size == 4)
		mmu_pid_bits = be32_to_cpup(prop);

	/* Grab page size encodings */
	prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size);
	if (!prop)
		return 0;

	pr_info("Page sizes from device-tree:\n");
	for (; size >= 4; size -= 4, ++prop) {
		struct mmu_psize_def *def;
		const unsigned int cell = be32_to_cpu(prop[0]);

		/*
		 * top 3 bit is AP encoding
		 *
		 * The mask is built from an unsigned constant: 0xe << 28
		 * on a signed int does not fit in the type.
		 */
		shift = cell & ~(0xeU << 28);
		ap = cell >> 29;
		pr_info("Page size shift = %d AP=0x%x\n", shift, ap);

		idx = get_idx_from_shift(shift);
		if (idx < 0)
			continue;

		def = &mmu_psize_defs[idx];
		def->shift = shift;
		def->ap = ap;
	}

	/* needed ? */
	cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B;
	return 1;
}
|
|
|
|
|
2016-07-26 19:55:27 +08:00
|
|
|
/*
 * Establish the supported radix page sizes early in boot.
 *
 * The flat device tree is scanned with radix_dt_scan_page_sizes(); if the
 * scan returns 0 (nothing found), 4K and 64K support is assumed.  With
 * CONFIG_SPARSEMEM_VMEMMAP, vmemmap uses 2M pages when a 2M size is known.
 */
void __init radix__early_init_devtree(void)
{
	/*
	 * Try to find the available page sizes in the device-tree
	 */
	if (of_scan_flat_dt(radix_dt_scan_page_sizes, NULL) == 0) {
		/*
		 * let's assume we have page 4k and 64k support
		 */
		mmu_psize_defs[MMU_PAGE_4K].shift = 12;
		mmu_psize_defs[MMU_PAGE_4K].ap = 0x0;

		mmu_psize_defs[MMU_PAGE_64K].shift = 16;
		mmu_psize_defs[MMU_PAGE_64K].ap = 0x5;
	}

#ifdef CONFIG_SPARSEMEM_VMEMMAP
	/*
	 * map vmemmap using 2M if available
	 */
	if (mmu_psize_defs[MMU_PAGE_2M].shift)
		mmu_vmemmap_psize = MMU_PAGE_2M;
#endif /* CONFIG_SPARSEMEM_VMEMMAP */
}
|
|
|
|
|
2016-08-24 17:33:39 +08:00
|
|
|
static void update_hid_for_radix(void)
|
|
|
|
{
|
|
|
|
unsigned long hid0;
|
|
|
|
unsigned long rb = 3UL << PPC_BITLSHIFT(53); /* IS = 3 */
|
|
|
|
|
|
|
|
asm volatile("ptesync": : :"memory");
|
|
|
|
/* prs = 0, ric = 2, rs = 0, r = 1 is = 3 */
|
|
|
|
asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
|
|
|
|
: : "r"(rb), "i"(1), "i"(0), "i"(2), "r"(0) : "memory");
|
|
|
|
/* prs = 1, ric = 2, rs = 0, r = 1 is = 3 */
|
|
|
|
asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
|
|
|
|
: : "r"(rb), "i"(1), "i"(1), "i"(2), "r"(0) : "memory");
|
|
|
|
asm volatile("eieio; tlbsync; ptesync; isync; slbia": : :"memory");
|
2017-04-11 13:23:25 +08:00
|
|
|
trace_tlbie(0, 0, rb, 0, 2, 0, 1);
|
|
|
|
trace_tlbie(0, 0, rb, 0, 2, 1, 1);
|
|
|
|
|
2016-08-24 17:33:39 +08:00
|
|
|
/*
|
|
|
|
* now switch the HID
|
|
|
|
*/
|
|
|
|
hid0 = mfspr(SPRN_HID0);
|
|
|
|
hid0 |= HID0_POWER9_RADIX;
|
|
|
|
mtspr(SPRN_HID0, hid0);
|
|
|
|
asm volatile("isync": : :"memory");
|
|
|
|
|
|
|
|
/* Wait for it to happen */
|
|
|
|
while (!(mfspr(SPRN_HID0) & HID0_POWER9_RADIX))
|
|
|
|
cpu_relax();
|
|
|
|
}
|
|
|
|
|
2016-11-15 14:56:14 +08:00
|
|
|
static void radix_init_amor(void)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* In HV mode, we init AMOR (Authority Mask Override Register) so that
|
|
|
|
* the hypervisor and guest can setup IAMR (Instruction Authority Mask
|
|
|
|
* Register), enable key 0 and set it to 1.
|
|
|
|
*
|
|
|
|
* AMOR = 0b1100 .... 0000 (Mask for key 0 is 11)
|
|
|
|
*/
|
|
|
|
mtspr(SPRN_AMOR, (3ul << 62));
|
|
|
|
}
|
|
|
|
|
2016-11-15 14:56:16 +08:00
|
|
|
static void radix_init_iamr(void)
|
|
|
|
{
|
|
|
|
unsigned long iamr;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The IAMR should set to 0 on DD1.
|
|
|
|
*/
|
|
|
|
if (cpu_has_feature(CPU_FTR_POWER9_DD1))
|
|
|
|
iamr = 0;
|
|
|
|
else
|
|
|
|
iamr = (1ul << 62);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Radix always uses key0 of the IAMR to determine if an access is
|
|
|
|
* allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction
|
|
|
|
* fetch.
|
|
|
|
*/
|
|
|
|
mtspr(SPRN_IAMR, iamr);
|
|
|
|
}
|
|
|
|
|
2016-04-29 21:25:58 +08:00
|
|
|
/*
 * Boot-time radix MMU setup.
 *
 * Selects the base and vmemmap page sizes, publishes the radix page-table
 * geometry and kernel address-space layout through the shared variables,
 * performs the native or pseries specific initialisation, and finally sets
 * up the IAMR and the kernel page tables.
 */
void __init radix__early_init_mmu(void)
{
#ifdef CONFIG_PPC_64K_PAGES
	/* PAGE_SIZE mappings */
	mmu_virtual_psize = MMU_PAGE_64K;
#else
	mmu_virtual_psize = MMU_PAGE_4K;
#endif

#ifdef CONFIG_SPARSEMEM_VMEMMAP
	/* vmemmap starts out using the base page size */
	mmu_vmemmap_psize = mmu_virtual_psize;
#endif

	/* Page table index sizes */
	__pte_index_size = RADIX_PTE_INDEX_SIZE;
	__pmd_index_size = RADIX_PMD_INDEX_SIZE;
	__pud_index_size = RADIX_PUD_INDEX_SIZE;
	__pgd_index_size = RADIX_PGD_INDEX_SIZE;
	__pmd_cache_index = RADIX_PMD_INDEX_SIZE;

	/* Page table sizes */
	__pte_table_size = RADIX_PTE_TABLE_SIZE;
	__pmd_table_size = RADIX_PMD_TABLE_SIZE;
	__pud_table_size = RADIX_PUD_TABLE_SIZE;
	__pgd_table_size = RADIX_PGD_TABLE_SIZE;

	__pmd_val_bits = RADIX_PMD_VAL_BITS;
	__pud_val_bits = RADIX_PUD_VAL_BITS;
	__pgd_val_bits = RADIX_PGD_VAL_BITS;

	/* Kernel virtual address space layout */
	__kernel_virt_start = RADIX_KERN_VIRT_START;
	__kernel_virt_size = RADIX_KERN_VIRT_SIZE;
	__vmalloc_start = RADIX_VMALLOC_START;
	__vmalloc_end = RADIX_VMALLOC_END;
	__kernel_io_start = RADIX_KERN_IO_START;
	vmemmap = (struct page *)RADIX_VMEMMAP_BASE;
	ioremap_bot = IOREMAP_BASE;

#ifdef CONFIG_PCI
	pci_io_base = ISA_IO_BASE;
#endif

	/* For now radix also uses the same PTE fragment size */
	__pte_frag_nr = H_PTE_FRAG_NR;
	__pte_frag_size_shift = H_PTE_FRAG_SIZE_SHIFT;

	if (firmware_has_feature(FW_FEATURE_LPAR)) {
		radix_init_pseries();
	} else {
		unsigned long lpcr;

		radix_init_native();
		if (cpu_has_feature(CPU_FTR_POWER9_DD1))
			update_hid_for_radix();

		/* Set UPRT and HR in the LPCR */
		lpcr = mfspr(SPRN_LPCR);
		mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);

		radix_init_partition_table();
		radix_init_amor();
	}

	memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);

	radix_init_iamr();
	radix_init_pgtable();

	/*
	 * Flush this CPU's entire local TLB once the MMU registers are set
	 * up, in case something that ran before us (e.g. kexec) has left
	 * stale entries behind.
	 */
	if (cpu_has_feature(CPU_FTR_HVMODE))
		tlbiel_all();
}
|
|
|
|
|
|
|
|
void radix__early_init_mmu_secondary(void)
|
|
|
|
{
|
|
|
|
unsigned long lpcr;
|
|
|
|
/*
|
2016-05-31 14:26:29 +08:00
|
|
|
* update partition table control register and UPRT
|
2016-04-29 21:25:58 +08:00
|
|
|
*/
|
2016-05-31 14:26:29 +08:00
|
|
|
if (!firmware_has_feature(FW_FEATURE_LPAR)) {
|
2016-11-17 18:16:23 +08:00
|
|
|
|
|
|
|
if (cpu_has_feature(CPU_FTR_POWER9_DD1))
|
|
|
|
update_hid_for_radix();
|
|
|
|
|
2016-05-31 14:26:29 +08:00
|
|
|
lpcr = mfspr(SPRN_LPCR);
|
2016-07-13 17:35:21 +08:00
|
|
|
mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
|
2016-05-31 14:26:29 +08:00
|
|
|
|
2016-04-29 21:25:58 +08:00
|
|
|
mtspr(SPRN_PTCR,
|
|
|
|
__pa(partition_tb) | (PATB_SIZE_SHIFT - 12));
|
2016-11-15 14:56:14 +08:00
|
|
|
radix_init_amor();
|
2016-05-31 14:26:29 +08:00
|
|
|
}
|
2016-11-15 14:56:16 +08:00
|
|
|
radix_init_iamr();
|
powerpc/64s: Improve local TLB flush for boot and MCE on POWER9
There are several cases outside the normal address space management
where a CPU's entire local TLB is to be flushed:
1. Booting the kernel, in case something has left stale entries in
the TLB (e.g., kexec).
2. Machine check, to clean corrupted TLB entries.
One other place where the TLB is flushed, is waking from deep idle
states. The flush is a side-effect of calling ->cpu_restore with the
intention of re-setting various SPRs. The flush itself is unnecessary
because in the first case, the TLB should not acquire new corrupted
TLB entries as part of sleep/wake (though they may be lost).
This type of TLB flush is coded inflexibly, several times for each CPU
type, and they have a number of problems with ISA v3.0B:
- The current radix mode of the MMU is not taken into account, it is
always done as a hash flushn For IS=2 (LPID-matching flush from host)
and IS=3 with HV=0 (guest kernel flush), tlbie(l) is undefined if
the R field does not match the current radix mode.
- ISA v3.0B hash must flush the partition and process table caches as
well.
- ISA v3.0B radix must flush partition and process scoped translations,
partition and process table caches, and also the page walk cache.
So consolidate the flushing code and implement it in C and inline asm
under the mm/ directory with the rest of the flush code. Add ISA v3.0B
cases for radix and hash, and use the radix flush in radix environment.
Provide a way for IS=2 (LPID flush) to specify the radix mode of the
partition. Have KVM pass in the radix mode of the guest.
Take out the flushes from early cputable/dt_cpu_ftrs detection hooks,
and move it later in the boot process after, the MMU registers are set
up and before relocation is first turned on.
The TLB flush is no longer called when restoring from deep idle states.
This was not be done as a separate step because booting secondaries
uses the same cpu_restore as idle restore, which needs the TLB flush.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2017-12-23 23:15:50 +08:00
|
|
|
|
|
|
|
if (cpu_has_feature(CPU_FTR_HVMODE))
|
|
|
|
tlbiel_all();
|
2016-04-29 21:25:58 +08:00
|
|
|
}
|
|
|
|
|
2016-08-19 16:52:37 +08:00
|
|
|
void radix__mmu_cleanup_all(void)
|
|
|
|
{
|
|
|
|
unsigned long lpcr;
|
|
|
|
|
|
|
|
if (!firmware_has_feature(FW_FEATURE_LPAR)) {
|
|
|
|
lpcr = mfspr(SPRN_LPCR);
|
|
|
|
mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT);
|
|
|
|
mtspr(SPRN_PTCR, 0);
|
2016-12-14 10:36:51 +08:00
|
|
|
powernv_set_nmmu_ptcr(0);
|
2016-08-19 16:52:37 +08:00
|
|
|
radix__flush_tlb_all();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-04-29 21:25:58 +08:00
|
|
|
/*
 * Set the initial memory limit for radix.
 *
 * @first_memblock_base: base of the first memblock; must be 0.
 * @first_memblock_size: size of the first memblock; not used here.
 */
void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base,
				       phys_addr_t first_memblock_size)
{
	/*
	 * A first MEMBLOCK that does not map physical address 0 is not
	 * currently supported on those processors.
	 */
	BUG_ON(first_memblock_base != 0);

	/* Radix mode is not limited by RMA / VRMA addressing. */
	ppc64_rma_size = ULONG_MAX;
}
|
2016-04-29 21:26:00 +08:00
|
|
|
|
2017-01-17 03:07:44 +08:00
|
|
|
#ifdef CONFIG_MEMORY_HOTPLUG
|
2017-01-17 03:07:45 +08:00
|
|
|
/*
 * Free a PTE table and clear the PMD entry that points to it, but only if
 * every one of its PTRS_PER_PTE entries is pte_none().
 */
static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
{
	pte_t *entry = pte_start;
	pte_t *limit = pte_start + PTRS_PER_PTE;

	/* A single populated entry means the table is still in use */
	while (entry < limit) {
		if (!pte_none(*entry))
			return;
		entry++;
	}

	pte_free_kernel(&init_mm, pte_start);
	pmd_clear(pmd);
}
|
|
|
|
|
|
|
|
/*
 * Free a PMD table and clear the PUD entry that points to it, but only if
 * every one of its PTRS_PER_PMD entries is pmd_none().
 */
static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
{
	pmd_t *entry = pmd_start;
	pmd_t *limit = pmd_start + PTRS_PER_PMD;

	/* A single populated entry means the table is still in use */
	while (entry < limit) {
		if (!pmd_none(*entry))
			return;
		entry++;
	}

	pmd_free(&init_mm, pmd_start);
	pud_clear(pud);
}
|
|
|
|
|
|
|
|
/*
 * Clear the present PTEs covering [addr, end) in the table at @pte_start.
 *
 * Entries are only cleared for page-aligned sub-ranges; an unaligned one
 * triggers a one-time warning and is left untouched.
 */
static void remove_pte_table(pte_t *pte_start, unsigned long addr,
			     unsigned long end)
{
	pte_t *ptep = pte_start + pte_index(addr);
	unsigned long stop;

	while (addr < end) {
		/* End of this page, clamped to the end of the range */
		stop = (addr + PAGE_SIZE) & PAGE_MASK;
		if (stop > end)
			stop = end;

		if (pte_present(*ptep)) {
			/*
			 * The vmemmap_free() and remove_section_mapping()
			 * codepaths call us with aligned addresses.
			 */
			if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(stop))
				pte_clear(&init_mm, addr, ptep);
			else
				WARN_ONCE(1, "%s: unaligned range\n", __func__);
		}

		addr = stop;
		ptep++;
	}
}
|
|
|
|
|
|
|
|
/*
 * Tear down the PMD-level mappings covering [addr, end).
 *
 * Huge PMD entries are cleared directly when the sub-range is PMD_SIZE
 * aligned (otherwise a one-time warning is emitted and the entry is kept).
 * For regular entries the PTE table below is processed and then freed if
 * it has become empty.
 */
static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
			     unsigned long end)
{
	pmd_t *pmdp = pmd_start + pmd_index(addr);
	unsigned long stop;
	pte_t *pte_table;

	for (; addr < end; addr = stop, pmdp++) {
		stop = pmd_addr_end(addr, end);

		if (!pmd_present(*pmdp))
			continue;

		if (!pmd_huge(*pmdp)) {
			/* Descend into the PTE table, then try to free it */
			pte_table = (pte_t *)pmd_page_vaddr(*pmdp);
			remove_pte_table(pte_table, addr, stop);
			free_pte_table(pte_table, pmdp);
			continue;
		}

		if (IS_ALIGNED(addr, PMD_SIZE) && IS_ALIGNED(stop, PMD_SIZE))
			pte_clear(&init_mm, addr, (pte_t *)pmdp);
		else
			WARN_ONCE(1, "%s: unaligned range\n", __func__);
	}
}
|
|
|
|
|
|
|
|
/*
 * Tear down the PUD-level mappings covering [addr, end).
 *
 * Huge PUD entries are cleared directly when the sub-range is PUD_SIZE
 * aligned (otherwise a one-time warning is emitted and the entry is kept).
 * For regular entries the PMD table below is processed and then freed if
 * it has become empty.
 */
static void remove_pud_table(pud_t *pud_start, unsigned long addr,
			     unsigned long end)
{
	pud_t *pudp = pud_start + pud_index(addr);
	unsigned long stop;
	pmd_t *pmd_table;

	for (; addr < end; addr = stop, pudp++) {
		stop = pud_addr_end(addr, end);

		if (!pud_present(*pudp))
			continue;

		if (!pud_huge(*pudp)) {
			/* Descend into the PMD table, then try to free it */
			pmd_table = (pmd_t *)pud_page_vaddr(*pudp);
			remove_pmd_table(pmd_table, addr, stop);
			free_pmd_table(pmd_table, pudp);
			continue;
		}

		if (IS_ALIGNED(addr, PUD_SIZE) && IS_ALIGNED(stop, PUD_SIZE))
			pte_clear(&init_mm, addr, (pte_t *)pudp);
		else
			WARN_ONCE(1, "%s: unaligned range\n", __func__);
	}
}
|
|
|
|
|
|
|
|
static void remove_pagetable(unsigned long start, unsigned long end)
|
|
|
|
{
|
|
|
|
unsigned long addr, next;
|
|
|
|
pud_t *pud_base;
|
|
|
|
pgd_t *pgd;
|
|
|
|
|
|
|
|
spin_lock(&init_mm.page_table_lock);
|
|
|
|
|
|
|
|
for (addr = start; addr < end; addr = next) {
|
|
|
|
next = pgd_addr_end(addr, end);
|
|
|
|
|
|
|
|
pgd = pgd_offset_k(addr);
|
|
|
|
if (!pgd_present(*pgd))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (pgd_huge(*pgd)) {
|
2017-01-17 03:07:46 +08:00
|
|
|
if (!IS_ALIGNED(addr, PGDIR_SIZE) ||
|
|
|
|
!IS_ALIGNED(next, PGDIR_SIZE)) {
|
|
|
|
WARN_ONCE(1, "%s: unaligned range\n", __func__);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2017-01-17 03:07:45 +08:00
|
|
|
pte_clear(&init_mm, addr, (pte_t *)pgd);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
pud_base = (pud_t *)pgd_page_vaddr(*pgd);
|
|
|
|
remove_pud_table(pud_base, addr, next);
|
|
|
|
}
|
|
|
|
|
|
|
|
spin_unlock(&init_mm.page_table_lock);
|
|
|
|
radix__flush_tlb_kernel_range(start, end);
|
|
|
|
}
|
|
|
|
|
2017-01-17 03:07:44 +08:00
|
|
|
/*
 * Create the mapping for a hot-added memory section covering [start, end).
 *
 * Returns whatever create_physical_mapping() returns.
 */
int __ref radix__create_section_mapping(unsigned long start, unsigned long end)
{
	/*
	 * NOTE(review): the third argument (-1) is presumably a "no specific
	 * node" id -- confirm against create_physical_mapping().
	 */
	int rc = create_physical_mapping(start, end, -1);

	return rc;
}
|
2017-01-17 03:07:45 +08:00
|
|
|
|
|
|
|
/*
 * Remove the mapping for the memory section covering [start, end).
 *
 * Always returns 0.
 */
int radix__remove_section_mapping(unsigned long start, unsigned long end)
{
	int rc = 0;

	remove_pagetable(start, end);

	return rc;
}
|
2017-01-17 03:07:44 +08:00
|
|
|
#endif /* CONFIG_MEMORY_HOTPLUG */
|
|
|
|
|
2016-04-29 21:26:00 +08:00
|
|
|
#ifdef CONFIG_SPARSEMEM_VMEMMAP
|
|
|
|
/*
 * Map one vmemmap block.
 *
 * @start:     virtual address to map.
 * @page_size: size of the mapping to create.
 * @phys:      physical address backing the mapping.
 *
 * The page is mapped kernel read/write on the node that
 * early_pfn_to_nid() reports for @phys.  A mapping failure is fatal
 * (BUG_ON); otherwise 0 is returned.
 */
int __meminit radix__vmemmap_create_mapping(unsigned long start,
					    unsigned long page_size,
					    unsigned long phys)
{
	int node = early_pfn_to_nid(phys >> PAGE_SHIFT);
	/* PTE encoding: present, accessed, kernel read/write */
	unsigned long pte_flags = _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_KERNEL_RW;
	int err;

	err = __map_kernel_page_nid(start, phys, __pgprot(pte_flags), page_size, node);
	BUG_ON(err);

	return 0;
}
|
|
|
|
|
|
|
|
#ifdef CONFIG_MEMORY_HOTPLUG
|
|
|
|
/*
 * Remove the vmemmap mapping of @page_size bytes that begins at @start.
 */
void radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
{
	unsigned long end = start + page_size;

	remove_pagetable(start, end);
}
|
|
|
|
#endif
|
|
|
|
#endif
|
2016-04-29 21:26:30 +08:00
|
|
|
|
|
|
|
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
|
|
|
|
|
|
|
unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
|
|
|
|
pmd_t *pmdp, unsigned long clr,
|
|
|
|
unsigned long set)
|
|
|
|
{
|
|
|
|
unsigned long old;
|
|
|
|
|
|
|
|
#ifdef CONFIG_DEBUG_VM
|
2017-06-28 09:32:34 +08:00
|
|
|
WARN_ON(!radix__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
|
2016-04-29 21:26:30 +08:00
|
|
|
assert_spin_locked(&mm->page_table_lock);
|
|
|
|
#endif
|
|
|
|
|
|
|
|
old = radix__pte_update(mm, addr, (pte_t *)pmdp, clr, set, 1);
|
|
|
|
trace_hugepage_update(addr, old, clr, set);
|
|
|
|
|
|
|
|
return old;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Clear a regular (non-huge, non-devmap) PMD and flush the TLB for it.
 *
 * khugepaged calls this for a normal pmd.  Returns the value the PMD held
 * before it was cleared.
 */
pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
				 pmd_t *pmdp)

{
	pmd_t old_pmd;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	VM_BUG_ON(radix__pmd_trans_huge(*pmdp));
	VM_BUG_ON(pmd_devmap(*pmdp));

	/* Snapshot the entry before wiping it */
	old_pmd = *pmdp;
	pmd_clear(pmdp);

	/*FIXME!! Verify whether we need this kick below */
	serialize_against_pte_lookup(vma->vm_mm);

	radix__flush_tlb_collapsed_pmd(vma->vm_mm, address);

	return old_pmd;
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
 * For us pgtable_t is pte_t *. In order to save the deposited
|
|
|
|
* page table, we consider the allocated page table as a list
|
|
|
|
* head. On withdraw we need to make sure we zero out the used
|
|
|
|
* list_head memory area.
|
|
|
|
*/
|
|
|
|
void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
|
|
|
|
pgtable_t pgtable)
|
|
|
|
{
|
|
|
|
struct list_head *lh = (struct list_head *) pgtable;
|
|
|
|
|
|
|
|
assert_spin_locked(pmd_lockptr(mm, pmdp));
|
|
|
|
|
|
|
|
/* FIFO */
|
|
|
|
if (!pmd_huge_pte(mm, pmdp))
|
|
|
|
INIT_LIST_HEAD(lh);
|
|
|
|
else
|
|
|
|
list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
|
|
|
|
pmd_huge_pte(mm, pmdp) = pgtable;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Take back the page table currently recorded in pmd_huge_pte(mm, pmdp).
 *
 * The recorded pointer is advanced to the next deposited table, or set to
 * NULL when this was the only one.  The first two PTE slots of the returned
 * table are zeroed before it is handed back.
 */
pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	struct list_head *node;
	pgtable_t pgtable;
	pte_t *slots;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	pgtable = pmd_huge_pte(mm, pmdp);
	node = (struct list_head *) pgtable;

	if (!list_empty(node)) {
		pmd_huge_pte(mm, pmdp) = (pgtable_t) node->next;
		list_del(node);
	} else {
		pmd_huge_pte(mm, pmdp) = NULL;
	}

	/* Zero out the memory area that was used as the list_head */
	slots = (pte_t *) pgtable;
	slots[0] = __pte(0);
	slots[1] = __pte(0);

	return pgtable;
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
 * Clear a huge PMD and return its previous value.
 *
 * All bits are cleared through radix__pmd_hugepage_update(), after which
 * lock-less page table walkers are waited for.
 */
pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
				     unsigned long addr, pmd_t *pmdp)
{
	unsigned long prev = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
	pmd_t prev_pmd = __pmd(prev);

	/*
	 * find_current_mm_pte does a lock-less lookup in the page tables
	 * with local interrupts disabled, and for huge pages it casts pmd_t
	 * to pte_t.  Because the two formats differ, a pmd must not change
	 * from pointing to a page table to pointing to a huge page (or back)
	 * while interrupts are disabled.  The pmd cleared here may be
	 * replaced with a page table pointer in other code paths, so wait
	 * for any parallel find_current_mm_pte to finish.
	 */
	serialize_against_pte_lookup(mm);

	return prev_pmd;
}
|
|
|
|
|
|
|
|
int radix__has_transparent_hugepage(void)
|
|
|
|
{
|
|
|
|
/* For radix 2M at PMD level means thp */
|
|
|
|
if (mmu_psize_defs[MMU_PAGE_2M].shift == PMD_SHIFT)
|
|
|
|
return 1;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
|