powerpc fixes for 4.19 #2

- An implementation for the newly added hv_ops->flush() for the OPAL hvc
    console driver backends, I forgot to apply this after merging the hvc driver
    changes before the merge window.
 
  - Enable all PCI bridges at boot on powernv, to avoid races when multiple
    children of a bridge try to enable it simultaneously. This is a workaround
    until the PCI core can be enhanced to fix the races.
 
  - A fix to query PowerVM for the correct system topology at boot before
    initialising sched domains, seen in some configurations to cause broken
    scheduling etc.
 
  - A fix for pte_access_permitted() on "nohash" platforms.
 
  - Two commits to fix SIGBUS when using remap_pfn_range() seen on Power9 due to
    a workaround when using the nest MMU (GPUs, accelerators).
 
  - Another fix to the VFIO code used by KVM, the previous fix had some bugs
    which caused guests to not start in some configurations.
 
  - A handful of other minor fixes.
 
 Thanks to:
   Aneesh Kumar K.V, Benjamin Herrenschmidt, Christophe Leroy, Hari Bathini, Luke
   Dashjr, Mahesh Salgaonkar, Nicholas Piggin, Paul Mackerras, Srikar Dronamraju.
 -----BEGIN PGP SIGNATURE-----
 
 iQJHBAABCgAxFiEEJFGtCPCthwEv2Y/bUevqPMjhpYAFAlt//10THG1wZUBlbGxl
 cm1hbi5pZC5hdQAKCRBR6+o8yOGlgLUGD/9y/MPrs+V2lNFhxP+l1jO4Ro0v8DPI
 vARjqq06WfppgQgSS1dvWfzLaMFbGe8wPRfL0T1xMOXCZg8Ts/HrgxVBVFYcv6/Q
 xBaU5bKztg6HKQbwwO+8B/gdTA3hO7yFVux6JGwsGO5Ebl8Q3UDOdbvYX6XTj3H8
 rdiicle9LaI7qodC8bxlBo1Be0YKEW0O/ag179sxXzozzvoPIyFpeX6FL1sAft++
 XlQS1MHu4hErSM2rbmyoFCm+SmyRt3CD0NTVmNd2cgw5XexPOBFlnsdgpaK1jJFc
 CYu1chP83E91ol1/8NAPkcmPWvP6MGyoqOl75RghooY2D1IJ2GtLKoz2Dvc53ay2
 ZlIpMyc2CYIa55Mj18tOV/NbGbh0Lf0Ta++BxqxbcCDt5fq0VxHkoDPqBgWh/tdp
 Po7oQc7U2VTKKC3wiLr//nSHpgtSTAWtucDt7oT7GdP8+EMxUZ8teBFIfTkwfuD2
 plroEmcYuRD3beI4FAG/iCp5POOCsnHLkKVDl7tyQPl3Yvu8hvyLY9gBS9RN4Unt
 /z94YFJtz7UD+VP7jDaPiQS26y0WEJfW8ml0tNxMZVBdksZLPIPN0/EU/04V0jWf
 GxynzKMhElxC67tK/Eb43EGiLEZAlbEnJxoOtiCnL1MK+OIJPjg6e7o6qlsmaa+Y
 zkXVpxLtkq0OoQ==
 =1AUa
 -----END PGP SIGNATURE-----

Merge tag 'powerpc-4.19-2' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux

Pull powerpc fixes from Michael Ellerman:

 - An implementation for the newly added hv_ops->flush() for the OPAL
   hvc console driver backends, I forgot to apply this after merging the
   hvc driver changes before the merge window.

 - Enable all PCI bridges at boot on powernv, to avoid races when
   multiple children of a bridge try to enable it simultaneously. This
   is a workaround until the PCI core can be enhanced to fix the races.

 - A fix to query PowerVM for the correct system topology at boot before
   initialising sched domains, seen in some configurations to cause
   broken scheduling etc.

 - A fix for pte_access_permitted() on "nohash" platforms.

 - Two commits to fix SIGBUS when using remap_pfn_range() seen on Power9
   due to a workaround when using the nest MMU (GPUs, accelerators).

 - Another fix to the VFIO code used by KVM, the previous fix had some
   bugs which caused guests to not start in some configurations.

 - A handful of other minor fixes.

Thanks to: Aneesh Kumar K.V, Benjamin Herrenschmidt, Christophe Leroy,
Hari Bathini, Luke Dashjr, Mahesh Salgaonkar, Nicholas Piggin, Paul
Mackerras, Srikar Dronamraju.

* tag 'powerpc-4.19-2' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux:
  powerpc/mce: Fix SLB rebolting during MCE recovery path.
  KVM: PPC: Book3S: Fix guest DMA when guest partially backed by THP pages
  powerpc/mm/radix: Only need the Nest MMU workaround for R -> RW transition
  powerpc/mm/books3s: Add new pte bit to mark pte temporarily invalid.
  powerpc/nohash: fix pte_access_permitted()
  powerpc/topology: Get topology for shared processors at boot
  powerpc64/ftrace: Include ftrace.h needed for enable/disable calls
  powerpc/powernv/pci: Work around races in PCI bridge enabling
  powerpc/fadump: cleanup crash memory ranges support
  powerpc/powernv: provide a console flush operation for opal hvc driver
  powerpc/traps: Avoid rate limit messages from show unhandled signals
  powerpc/64s: Fix PACA_IRQ_HARD_DIS accounting in idle_power4()
This commit is contained in:
Linus Torvalds 2018-08-24 09:34:23 -07:00
commit aa5b1054ba
16 changed files with 173 additions and 78 deletions

View File

@ -44,6 +44,16 @@
#define _PAGE_PTE 0x4000000000000000UL /* distinguishes PTEs from pointers */
#define _PAGE_PRESENT 0x8000000000000000UL /* pte contains a translation */
/*
* We need to mark a pmd pte invalid while splitting. We can do that by clearing
* the _PAGE_PRESENT bit. But then that will be taken as a swap pte. In order to
* differentiate between two use a SW field when invalidating.
*
* We do that temporary invalidate for regular pte entry in ptep_set_access_flags
*
* This is used only when _PAGE_PRESENT is cleared.
*/
#define _PAGE_INVALID _RPAGE_SW0
/*
* Top and bottom bits of RPN which can be used by hash
@ -568,7 +578,13 @@ static inline pte_t pte_clear_savedwrite(pte_t pte)
static inline int pte_present(pte_t pte)
{
return !!(pte_raw(pte) & cpu_to_be64(_PAGE_PRESENT));
/*
* A pte is considerent present if _PAGE_PRESENT is set.
* We also need to consider the pte present which is marked
* invalid during ptep_set_access_flags. Hence we look for _PAGE_INVALID
* if we find _PAGE_PRESENT cleared.
*/
return !!(pte_raw(pte) & cpu_to_be64(_PAGE_PRESENT | _PAGE_INVALID));
}
#ifdef CONFIG_PPC_MEM_KEYS

View File

@ -51,17 +51,14 @@ static inline int pte_present(pte_t pte)
#define pte_access_permitted pte_access_permitted
static inline bool pte_access_permitted(pte_t pte, bool write)
{
unsigned long pteval = pte_val(pte);
/*
* A read-only access is controlled by _PAGE_USER bit.
* We have _PAGE_READ set for WRITE and EXECUTE
*/
unsigned long need_pte_bits = _PAGE_PRESENT | _PAGE_USER;
if (!pte_present(pte) || !pte_user(pte) || !pte_read(pte))
return false;
if (write)
need_pte_bits |= _PAGE_WRITE;
if ((pteval & need_pte_bits) != need_pte_bits)
if (write && !pte_write(pte))
return false;
return true;

View File

@ -308,6 +308,7 @@ extern void opal_configure_cores(void);
extern int opal_get_chars(uint32_t vtermno, char *buf, int count);
extern int opal_put_chars(uint32_t vtermno, const char *buf, int total_len);
extern int opal_put_chars_atomic(uint32_t vtermno, const char *buf, int total_len);
extern int opal_flush_chars(uint32_t vtermno, bool wait);
extern int opal_flush_console(uint32_t vtermno);
extern void hvc_opal_init_early(void);

View File

@ -92,6 +92,7 @@ extern int stop_topology_update(void);
extern int prrn_is_enabled(void);
extern int find_and_online_cpu_nid(int cpu);
extern int timed_topology_update(int nsecs);
extern void __init shared_proc_topology_init(void);
#else
static inline int start_topology_update(void)
{
@ -113,6 +114,10 @@ static inline int timed_topology_update(int nsecs)
{
return 0;
}
#ifdef CONFIG_SMP
static inline void shared_proc_topology_init(void) {}
#endif
#endif /* CONFIG_NUMA && CONFIG_PPC_SPLPAR */
#include <asm-generic/topology.h>

View File

@ -34,6 +34,7 @@
#include <linux/crash_dump.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>
#include <linux/slab.h>
#include <asm/debugfs.h>
#include <asm/page.h>
@ -1019,13 +1020,6 @@ static int fadump_setup_crash_memory_ranges(void)
pr_debug("Setup crash memory ranges.\n");
crash_mem_ranges = 0;
/* allocate memory for crash memory ranges for the first time */
if (!max_crash_mem_ranges) {
ret = allocate_crash_memory_ranges();
if (ret)
return ret;
}
/*
* add the first memory chunk (RMA_START through boot_memory_size) as
* a separate memory chunk. The reason is, at the time crash firmware

View File

@ -32,6 +32,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_CAN_NAP)
cmpwi 0,r4,0
beqlr
/* This sequence is similar to prep_irq_for_idle() */
/* Hard disable interrupts */
mfmsr r7
rldicl r0,r7,48,1
@ -41,10 +43,15 @@ END_FTR_SECTION_IFCLR(CPU_FTR_CAN_NAP)
/* Check if something happened while soft-disabled */
lbz r0,PACAIRQHAPPENED(r13)
cmpwi cr0,r0,0
bnelr
bne- 2f
/* Soft-enable interrupts */
/*
* Soft-enable interrupts. This will make power4_fixup_nap return
* to our caller with interrupts enabled (soft and hard). The caller
* can cope with either interrupts disabled or enabled upon return.
*/
#ifdef CONFIG_TRACE_IRQFLAGS
/* Tell the tracer interrupts are on, because idle responds to them. */
mflr r0
std r0,16(r1)
stdu r1,-128(r1)
@ -73,3 +80,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
isync
b 1b
2: /* Return if an interrupt had happened while soft disabled */
/* Set the HARD_DIS flag because interrupts are now hard disabled */
ori r0,r0,PACA_IRQ_HARD_DIS
stb r0,PACAIRQHAPPENED(r13)
blr

View File

@ -1160,6 +1160,11 @@ void __init smp_cpus_done(unsigned int max_cpus)
if (smp_ops && smp_ops->bringup_done)
smp_ops->bringup_done();
/*
* On a shared LPAR, associativity needs to be requested.
* Hence, get numa topology before dumping cpu topology
*/
shared_proc_topology_init();
dump_numa_cpu_topology();
/*

View File

@ -315,22 +315,21 @@ void user_single_step_siginfo(struct task_struct *tsk,
info->si_addr = (void __user *)regs->nip;
}
static bool show_unhandled_signals_ratelimited(void)
{
static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
return show_unhandled_signals && __ratelimit(&rs);
}
static void show_signal_msg(int signr, struct pt_regs *regs, int code,
unsigned long addr)
{
if (!show_unhandled_signals_ratelimited())
static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
if (!show_unhandled_signals)
return;
if (!unhandled_signal(current, signr))
return;
if (!__ratelimit(&rs))
return;
pr_info("%s[%d]: %s (%d) at %lx nip %lx lr %lx code %x",
current->comm, current->pid, signame(signr), signr,
addr, regs->nip, regs->link, code);

View File

@ -46,6 +46,7 @@
#include <linux/compiler.h>
#include <linux/of.h>
#include <asm/ftrace.h>
#include <asm/reg.h>
#include <asm/ppc-opcode.h>
#include <asm/asm-prototypes.h>

View File

@ -129,6 +129,7 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
long i, j, ret = 0, locked_entries = 0;
unsigned int pageshift;
unsigned long flags;
unsigned long cur_ua;
struct page *page = NULL;
mutex_lock(&mem_list_mutex);
@ -177,7 +178,8 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
}
for (i = 0; i < entries; ++i) {
if (1 != get_user_pages_fast(ua + (i << PAGE_SHIFT),
cur_ua = ua + (i << PAGE_SHIFT);
if (1 != get_user_pages_fast(cur_ua,
1/* pages */, 1/* iswrite */, &page)) {
ret = -EFAULT;
for (j = 0; j < i; ++j)
@ -196,7 +198,7 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
if (is_migrate_cma_page(page)) {
if (mm_iommu_move_page_from_cma(page))
goto populate;
if (1 != get_user_pages_fast(ua + (i << PAGE_SHIFT),
if (1 != get_user_pages_fast(cur_ua,
1/* pages */, 1/* iswrite */,
&page)) {
ret = -EFAULT;
@ -210,20 +212,21 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
}
populate:
pageshift = PAGE_SHIFT;
if (PageCompound(page)) {
if (mem->pageshift > PAGE_SHIFT && PageCompound(page)) {
pte_t *pte;
struct page *head = compound_head(page);
unsigned int compshift = compound_order(head);
unsigned int pteshift;
local_irq_save(flags); /* disables as well */
pte = find_linux_pte(mm->pgd, ua, NULL, &pageshift);
local_irq_restore(flags);
pte = find_linux_pte(mm->pgd, cur_ua, NULL, &pteshift);
/* Double check it is still the same pinned page */
if (pte && pte_page(*pte) == head &&
pageshift == compshift)
pageshift = max_t(unsigned int, pageshift,
pteshift == compshift + PAGE_SHIFT)
pageshift = max_t(unsigned int, pteshift,
PAGE_SHIFT);
local_irq_restore(flags);
}
mem->pageshift = min(mem->pageshift, pageshift);
mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT;

View File

@ -1078,7 +1078,6 @@ static int prrn_enabled;
static void reset_topology_timer(void);
static int topology_timer_secs = 1;
static int topology_inited;
static int topology_update_needed;
/*
* Change polling interval for associativity changes.
@ -1306,11 +1305,8 @@ int numa_update_cpu_topology(bool cpus_locked)
struct device *dev;
int weight, new_nid, i = 0;
if (!prrn_enabled && !vphn_enabled) {
if (!topology_inited)
topology_update_needed = 1;
if (!prrn_enabled && !vphn_enabled && topology_inited)
return 0;
}
weight = cpumask_weight(&cpu_associativity_changes_mask);
if (!weight)
@ -1423,7 +1419,6 @@ int numa_update_cpu_topology(bool cpus_locked)
out:
kfree(updates);
topology_update_needed = 0;
return changed;
}
@ -1551,6 +1546,15 @@ int prrn_is_enabled(void)
return prrn_enabled;
}
void __init shared_proc_topology_init(void)
{
if (lppaca_shared_proc(get_lppaca())) {
bitmap_fill(cpumask_bits(&cpu_associativity_changes_mask),
nr_cpumask_bits);
numa_update_cpu_topology(false);
}
}
static int topology_read(struct seq_file *file, void *v)
{
if (vphn_enabled || prrn_enabled)
@ -1608,10 +1612,6 @@ static int topology_update_init(void)
return -ENOMEM;
topology_inited = 1;
if (topology_update_needed)
bitmap_fill(cpumask_bits(&cpu_associativity_changes_mask),
nr_cpumask_bits);
return 0;
}
device_initcall(topology_update_init);

View File

@ -1045,20 +1045,22 @@ void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
struct mm_struct *mm = vma->vm_mm;
unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED |
_PAGE_RW | _PAGE_EXEC);
unsigned long change = pte_val(entry) ^ pte_val(*ptep);
/*
* To avoid NMMU hang while relaxing access, we need mark
* the pte invalid in between.
*/
if (atomic_read(&mm->context.copros) > 0) {
if ((change & _PAGE_RW) && atomic_read(&mm->context.copros) > 0) {
unsigned long old_pte, new_pte;
old_pte = __radix_pte_update(ptep, ~0, 0);
old_pte = __radix_pte_update(ptep, _PAGE_PRESENT, _PAGE_INVALID);
/*
* new value of pte
*/
new_pte = old_pte | set;
radix__flush_tlb_page_psize(mm, address, psize);
__radix_pte_update(ptep, 0, new_pte);
__radix_pte_update(ptep, _PAGE_INVALID, new_pte);
} else {
__radix_pte_update(ptep, 0, set);
/*

View File

@ -70,7 +70,7 @@ static inline void slb_shadow_update(unsigned long ea, int ssize,
static inline void slb_shadow_clear(enum slb_index index)
{
WRITE_ONCE(get_slb_shadow()->save_area[index].esid, 0);
WRITE_ONCE(get_slb_shadow()->save_area[index].esid, cpu_to_be64(index));
}
static inline void create_shadowed_slbe(unsigned long ea, int ssize,

View File

@ -370,12 +370,8 @@ static int __opal_put_chars(uint32_t vtermno, const char *data, int total_len, b
olen = cpu_to_be64(total_len);
rc = opal_console_write(vtermno, &olen, data);
if (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
if (rc == OPAL_BUSY_EVENT) {
mdelay(OPAL_BUSY_DELAY_MS);
if (rc == OPAL_BUSY_EVENT)
opal_poll_events(NULL);
} else if (rc == OPAL_BUSY_EVENT) {
mdelay(OPAL_BUSY_DELAY_MS);
}
written = -EAGAIN;
goto out;
}
@ -401,15 +397,6 @@ out:
if (atomic)
spin_unlock_irqrestore(&opal_write_lock, flags);
/* In the -EAGAIN case, callers loop, so we have to flush the console
* here in case they have interrupts off (and we don't want to wait
* for async flushing if we can make immediate progress here). If
* necessary the API could be made entirely non-flushing if the
* callers had a ->flush API to use.
*/
if (written == -EAGAIN)
opal_flush_console(vtermno);
return written;
}
@ -429,40 +416,74 @@ int opal_put_chars_atomic(uint32_t vtermno, const char *data, int total_len)
return __opal_put_chars(vtermno, data, total_len, true);
}
int opal_flush_console(uint32_t vtermno)
static s64 __opal_flush_console(uint32_t vtermno)
{
s64 rc;
if (!opal_check_token(OPAL_CONSOLE_FLUSH)) {
__be64 evt;
WARN_ONCE(1, "opal: OPAL_CONSOLE_FLUSH missing.\n");
/*
* If OPAL_CONSOLE_FLUSH is not implemented in the firmware,
* the console can still be flushed by calling the polling
* function while it has OPAL_EVENT_CONSOLE_OUTPUT events.
*/
do {
opal_poll_events(&evt);
} while (be64_to_cpu(evt) & OPAL_EVENT_CONSOLE_OUTPUT);
WARN_ONCE(1, "opal: OPAL_CONSOLE_FLUSH missing.\n");
return OPAL_SUCCESS;
opal_poll_events(&evt);
if (!(be64_to_cpu(evt) & OPAL_EVENT_CONSOLE_OUTPUT))
return OPAL_SUCCESS;
return OPAL_BUSY;
} else {
rc = opal_console_flush(vtermno);
if (rc == OPAL_BUSY_EVENT) {
opal_poll_events(NULL);
rc = OPAL_BUSY;
}
return rc;
}
do {
rc = OPAL_BUSY;
while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
rc = opal_console_flush(vtermno);
if (rc == OPAL_BUSY_EVENT) {
mdelay(OPAL_BUSY_DELAY_MS);
opal_poll_events(NULL);
} else if (rc == OPAL_BUSY) {
mdelay(OPAL_BUSY_DELAY_MS);
}
}
} while (rc == OPAL_PARTIAL); /* More to flush */
}
return opal_error_code(rc);
/*
* opal_flush_console spins until the console is flushed
*/
int opal_flush_console(uint32_t vtermno)
{
for (;;) {
s64 rc = __opal_flush_console(vtermno);
if (rc == OPAL_BUSY || rc == OPAL_PARTIAL) {
mdelay(1);
continue;
}
return opal_error_code(rc);
}
}
/*
* opal_flush_chars is an hvc interface that sleeps until the console is
* flushed if wait, otherwise it will return -EBUSY if the console has data,
* -EAGAIN if it has data and some of it was flushed.
*/
int opal_flush_chars(uint32_t vtermno, bool wait)
{
for (;;) {
s64 rc = __opal_flush_console(vtermno);
if (rc == OPAL_BUSY || rc == OPAL_PARTIAL) {
if (wait) {
msleep(OPAL_BUSY_DELAY_MS);
continue;
}
if (rc == OPAL_PARTIAL)
return -EAGAIN;
}
return opal_error_code(rc);
}
}
static int opal_recover_mce(struct pt_regs *regs,

View File

@ -3228,12 +3228,49 @@ static void pnv_pci_ioda_create_dbgfs(void)
#endif /* CONFIG_DEBUG_FS */
}
static void pnv_pci_enable_bridge(struct pci_bus *bus)
{
struct pci_dev *dev = bus->self;
struct pci_bus *child;
/* Empty bus ? bail */
if (list_empty(&bus->devices))
return;
/*
* If there's a bridge associated with that bus enable it. This works
* around races in the generic code if the enabling is done during
* parallel probing. This can be removed once those races have been
* fixed.
*/
if (dev) {
int rc = pci_enable_device(dev);
if (rc)
pci_err(dev, "Error enabling bridge (%d)\n", rc);
pci_set_master(dev);
}
/* Perform the same to child busses */
list_for_each_entry(child, &bus->children, node)
pnv_pci_enable_bridge(child);
}
static void pnv_pci_enable_bridges(void)
{
struct pci_controller *hose;
list_for_each_entry(hose, &hose_list, list_node)
pnv_pci_enable_bridge(hose->bus);
}
static void pnv_pci_ioda_fixup(void)
{
pnv_pci_ioda_setup_PEs();
pnv_pci_ioda_setup_iommu_api();
pnv_pci_ioda_create_dbgfs();
pnv_pci_enable_bridges();
#ifdef CONFIG_EEH
pnv_eeh_post_init();
#endif

View File

@ -52,6 +52,7 @@ static u32 hvc_opal_boot_termno;
static const struct hv_ops hvc_opal_raw_ops = {
.get_chars = opal_get_chars,
.put_chars = opal_put_chars,
.flush = opal_flush_chars,
.notifier_add = notifier_add_irq,
.notifier_del = notifier_del_irq,
.notifier_hangup = notifier_hangup_irq,
@ -141,6 +142,7 @@ static int hvc_opal_hvsi_tiocmset(struct hvc_struct *hp, unsigned int set,
static const struct hv_ops hvc_opal_hvsi_ops = {
.get_chars = hvc_opal_hvsi_get_chars,
.put_chars = hvc_opal_hvsi_put_chars,
.flush = opal_flush_chars,
.notifier_add = hvc_opal_hvsi_open,
.notifier_del = hvc_opal_hvsi_close,
.notifier_hangup = hvc_opal_hvsi_hangup,