From 1b5df59e50874b9034c0fa389cd52b65f1f93292 Mon Sep 17 00:00:00 2001
From: Vaibhav Jain <vaibhav@linux.vnet.ibm.com>
Date: Mon, 16 Nov 2015 09:33:45 +0530
Subject: [PATCH 001/149] cxl: Fix possible idr warning when contexts are
 released

An idr warning is reported when a context is released after the capi card
is unbound from the cxl driver via sysfs. Below are the steps to
reproduce:

1. Create multiple afu contexts in a user-space application using libcxl.
2. Unbind capi card from cxl using command of form
   echo <capi-card-pci-addr> > /sys/bus/pci/drivers/cxl-pci/unbind
3. Exit/kill the application owning afu contexts.

After above steps a warning message is usually seen in the kernel logs
of the form "idr_remove called for id=<context-id> which is not
allocated."

This is caused by the function cxl_release_afu, which destroys the
contexts_idr table. So when a context is released, no entry for the
context pe is found in the contexts_idr table and the idr code prints
this warning.

This patch fixes this issue by increasing & decreasing the ref-count on
the afu device when a context is initialized or when it is freed,
respectively. This prevents the afu from being released until all the
afu contexts have been released. The patch introduces two new functions
namely cxl_afu_get/put that manage the ref-count on the afu device.

Also, the patch removes the code inside cxl_dev_context_init that takes a
ref on the afu device, as it is guaranteed to be alive during this function.

Reported-by: Ian Munsie <imunsie@au1.ibm.com>
Signed-off-by: Vaibhav Jain <vaibhav@linux.vnet.ibm.com>
Acked-by: Ian Munsie <imunsie@au1.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 drivers/misc/cxl/api.c     |  4 ----
 drivers/misc/cxl/context.c |  9 +++++++++
 drivers/misc/cxl/cxl.h     | 12 ++++++++++++
 drivers/misc/cxl/file.c    | 19 +++++++++++--------
 4 files changed, 32 insertions(+), 12 deletions(-)
diff --git a/drivers/misc/cxl/api.c b/drivers/misc/cxl/api.c
index 103baf0e0c5b..a6543aefa299 100644
--- a/drivers/misc/cxl/api.c
+++ b/drivers/misc/cxl/api.c
@@ -25,7 +25,6 @@ struct cxl_context *cxl_dev_context_init(struct pci_dev *dev)
 
 	afu = cxl_pci_to_afu(dev);
 
-	get_device(&afu->dev);
 	ctx = cxl_context_alloc();
 	if (IS_ERR(ctx)) {
 		rc = PTR_ERR(ctx);
@@ -61,7 +60,6 @@ err_mapping:
 err_ctx:
 	kfree(ctx);
 err_dev:
-	put_device(&afu->dev);
 	return ERR_PTR(rc);
 }
 EXPORT_SYMBOL_GPL(cxl_dev_context_init);
@@ -87,8 +85,6 @@ int cxl_release_context(struct cxl_context *ctx)
 	if (ctx->status >= STARTED)
 		return -EBUSY;
 
-	put_device(&ctx->afu->dev);
-
 	cxl_context_free(ctx);
 
 	return 0;
diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c
index 2faa1270d085..6dde7a9d6a7e 100644
--- a/drivers/misc/cxl/context.c
+++ b/drivers/misc/cxl/context.c
@@ -97,6 +97,12 @@ int cxl_context_init(struct cxl_context *ctx, struct cxl_afu *afu, bool master,
 	ctx->pe = i;
 	ctx->elem = &ctx->afu->spa[i];
 	ctx->pe_inserted = false;
+
+	/*
+	 * take a ref on the afu so that it stays alive at-least till
+	 * this context is reclaimed inside reclaim_ctx.
+	 */
+	cxl_afu_get(afu);
 	return 0;
 }
 
@@ -278,6 +284,9 @@ static void reclaim_ctx(struct rcu_head *rcu)
 	if (ctx->irq_bitmap)
 		kfree(ctx->irq_bitmap);
 
+	/* Drop ref to the afu device taken during cxl_context_init */
+	cxl_afu_put(ctx->afu);
+
 	kfree(ctx);
 }
 
diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h
index 0cfb9c129f27..25ae57fa79b0 100644
--- a/drivers/misc/cxl/cxl.h
+++ b/drivers/misc/cxl/cxl.h
@@ -403,6 +403,18 @@ struct cxl_afu {
 	bool enabled;
 };
 
+/* AFU refcount management */
+static inline struct cxl_afu *cxl_afu_get(struct cxl_afu *afu)
+{
+
+	return (get_device(&afu->dev) == NULL) ? NULL : afu;
+}
+
+static inline void  cxl_afu_put(struct cxl_afu *afu)
+{
+	put_device(&afu->dev);
+}
+
 
 struct cxl_irq_name {
 	struct list_head list;
diff --git a/drivers/misc/cxl/file.c b/drivers/misc/cxl/file.c
index 7ccd2998be92..5cc14599837d 100644
--- a/drivers/misc/cxl/file.c
+++ b/drivers/misc/cxl/file.c
@@ -67,7 +67,13 @@ static int __afu_open(struct inode *inode, struct file *file, bool master)
 		spin_unlock(&adapter->afu_list_lock);
 		goto err_put_adapter;
 	}
-	get_device(&afu->dev);
+
+	/*
+	 * taking a ref to the afu so that it doesn't go away
+	 * for rest of the function. This ref is released before
+	 * we return.
+	 */
+	cxl_afu_get(afu);
 	spin_unlock(&adapter->afu_list_lock);
 
 	if (!afu->current_mode)
@@ -90,13 +96,12 @@ static int __afu_open(struct inode *inode, struct file *file, bool master)
 	file->private_data = ctx;
 	cxl_ctx_get();
 
-	/* Our ref on the AFU will now hold the adapter */
-	put_device(&adapter->dev);
-
-	return 0;
+	/* indicate success */
+	rc = 0;
 
 err_put_afu:
-	put_device(&afu->dev);
+	/* release the ref taken earlier */
+	cxl_afu_put(afu);
 err_put_adapter:
 	put_device(&adapter->dev);
 	return rc;
@@ -131,8 +136,6 @@ int afu_release(struct inode *inode, struct file *file)
 		mutex_unlock(&ctx->mapping_lock);
 	}
 
-	put_device(&ctx->afu->dev);
-
 	/*
 	 * At this this point all bottom halfs have finished and we should be
 	 * getting no more IRQs from the hardware for this context.  Once it's

From 48f0f6b717e314a30be121b67e1d044f6d311d66 Mon Sep 17 00:00:00 2001
From: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
Date: Wed, 4 Nov 2015 13:24:09 +1100
Subject: [PATCH 002/149] cxl: use correct operator when writing pcie config
 space values

When writing a value to config space, cxl_pcie_write_config() calls
cxl_pcie_config_info() to obtain a mask and shift value, shifts the new
value accordingly, then uses the mask to combine the shifted value with the
existing value at the address as part of a read-modify-write pattern.

Currently, we use a logical OR operator rather than a bitwise OR operator,
which means any use of this function results in an incorrect value being
written. Replace the logical OR operator with a bitwise OR operator so the
value is written correctly.

Reported-by: Michael Ellerman <mpe@ellerman.id.au>
Cc: stable@vger.kernel.org
Fixes: 6f7f0b3df6d4 ("cxl: Add AFU virtual PHB and kernel API")
Signed-off-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
Acked-by: Ian Munsie <imunsie@au1.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 drivers/misc/cxl/vphb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/misc/cxl/vphb.c b/drivers/misc/cxl/vphb.c
index c241e15cacb1..cbd4331fb45c 100644
--- a/drivers/misc/cxl/vphb.c
+++ b/drivers/misc/cxl/vphb.c
@@ -203,7 +203,7 @@ static int cxl_pcie_write_config(struct pci_bus *bus, unsigned int devfn,
 	mask <<= shift;
 	val <<= shift;
 
-	v = (in_le32(ioaddr) & ~mask) || (val & mask);
+	v = (in_le32(ioaddr) & ~mask) | (val & mask);
 
 	out_le32(ioaddr, v);
 	return PCIBIOS_SUCCESSFUL;

From 87630eb1d5ab76fc39e785294d1930bebcecabcf Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <k.kozlowski@samsung.com>
Date: Thu, 19 Nov 2015 13:00:39 +0900
Subject: [PATCH 003/149] powerpc/powernv: Drop owner assignment from
 platform_driver

platform_driver does not need to set an owner because
platform_driver_register() will set it.

Signed-off-by: Krzysztof Kozlowski <k.kozlowski@samsung.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/opal-prd.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/opal-prd.c b/arch/powerpc/platforms/powernv/opal-prd.c
index 4ece8e40dd54..e315e704cca7 100644
--- a/arch/powerpc/platforms/powernv/opal-prd.c
+++ b/arch/powerpc/platforms/powernv/opal-prd.c
@@ -434,7 +434,6 @@ static const struct of_device_id opal_prd_match[] = {
 static struct platform_driver opal_prd_driver = {
 	.driver = {
 		.name		= "opal-prd",
-		.owner		= THIS_MODULE,
 		.of_match_table	= opal_prd_match,
 	},
 	.probe	= opal_prd_probe,

From 57f889471c0fb55cbb0db98b30483040e2065bd9 Mon Sep 17 00:00:00 2001
From: John Ogness <john.ogness@linutronix.de>
Date: Wed, 11 Nov 2015 14:48:50 +0100
Subject: [PATCH 004/149] powerpc/powermac: set IRQF_NO_THREAD for xmon/cascade
 handlers

The xmon and cascade irq handlers must not run as threads.
pmac_pic_lock is already a raw_spinlock, but the irq flag
IRQF_NO_THREAD needs to be set as well.

Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powermac/pic.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powermac/pic.c b/arch/powerpc/platforms/powermac/pic.c
index 6f4f8b060def..981546345033 100644
--- a/arch/powerpc/platforms/powermac/pic.c
+++ b/arch/powerpc/platforms/powermac/pic.c
@@ -258,13 +258,14 @@ static unsigned int pmac_pic_get_irq(void)
 #ifdef CONFIG_XMON
 static struct irqaction xmon_action = {
 	.handler	= xmon_irq,
-	.flags		= 0,
+	.flags		= IRQF_NO_THREAD,
 	.name		= "NMI - XMON"
 };
 #endif
 
 static struct irqaction gatwick_cascade_action = {
 	.handler	= gatwick_action,
+	.flags		= IRQF_NO_THREAD,
 	.name		= "cascade",
 };
 

From 6c3082151e13846fd872cc216e8cbb5a59cd0b12 Mon Sep 17 00:00:00 2001
From: John Ogness <john.ogness@linutronix.de>
Date: Wed, 11 Nov 2015 15:15:03 +0100
Subject: [PATCH 005/149] powerpc/powermac: IRQF_NO_SUSPEND not IRQF_TIMER for
 non-timer

Since gpio1 is not a timer, it also should not use IRQF_TIMER.

Similar to commit ba461f094bab ("powerpc: Use IRQF_NO_SUSPEND not
IRQF_TIMER for non-timer interrupts").

Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 drivers/macintosh/via-pmu.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c
index f9512bfa6c3c..01ee736fe0ef 100644
--- a/drivers/macintosh/via-pmu.c
+++ b/drivers/macintosh/via-pmu.c
@@ -425,8 +425,9 @@ static int __init via_pmu_start(void)
 			gpio_irq = irq_of_parse_and_map(gpio_node, 0);
 
 		if (gpio_irq != NO_IRQ) {
-			if (request_irq(gpio_irq, gpio1_interrupt, IRQF_TIMER,
-					"GPIO1 ADB", (void *)0))
+			if (request_irq(gpio_irq, gpio1_interrupt,
+					IRQF_NO_SUSPEND, "GPIO1 ADB",
+					(void *)0))
 				printk(KERN_ERR "pmu: can't get irq %d"
 				       " (GPIO1)\n", gpio_irq);
 			else

From 58531b0c800fd514f04ae42b6cf5ab15abdf0651 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <laurent@vivier.eu>
Date: Thu, 5 Nov 2015 12:31:34 +0100
Subject: [PATCH 006/149] powerpc/boot: allow wrapper to work on non-english
 system

If the language is not English, the objdump output is not parsed correctly
and format is "". Later, "ld -m $format" fails.

This patch adds "LANG=C" to force English output for objdump.

Signed-off-by: Laurent Vivier <laurent@vivier.eu>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/boot/wrapper | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/boot/wrapper b/arch/powerpc/boot/wrapper
index ceaa75d5a684..6a19fcef5596 100755
--- a/arch/powerpc/boot/wrapper
+++ b/arch/powerpc/boot/wrapper
@@ -154,7 +154,7 @@ if [ -z "$kernel" ]; then
     kernel=vmlinux
 fi
 
-elfformat="`${CROSS}objdump -p "$kernel" | grep 'file format' | awk '{print $4}'`"
+LANG=C elfformat="`${CROSS}objdump -p "$kernel" | grep 'file format' | awk '{print $4}'`"
 case "$elfformat" in
     elf64-powerpcle)	format=elf64lppc	;;
     elf64-powerpc)	format=elf32ppc	;;

From cdfc8ed6904d7b0c0ca23d619b387b32070703b1 Mon Sep 17 00:00:00 2001
From: Rashmica Gupta <rashmicy@gmail.com>
Date: Thu, 19 Nov 2015 14:26:28 +1100
Subject: [PATCH 007/149] powerpc: Remove unused function trace_syscall()

This function has been unused since commit 14cf11af6cf6 ("powerpc: Merge enough
to start building in arch/powerpc."), so remove it.

Signed-off-by: Rashmica Gupta <rashmicy@gmail.com>
Reviewed-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
Reviewed-by: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/traps.c | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 37de90f8a845..b6becc795bb5 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -1313,13 +1313,6 @@ void nonrecoverable_exception(struct pt_regs *regs)
 	die("nonrecoverable exception", regs, SIGKILL);
 }
 
-void trace_syscall(struct pt_regs *regs)
-{
-	printk("Task: %p(%d), PC: %08lX/%08lX, Syscall: %3ld, Result: %s%ld    %s\n",
-	       current, task_pid_nr(current), regs->nip, regs->link, regs->gpr[0],
-	       regs->ccr&0x10000000?"Error=":"", regs->gpr[3], print_tainted());
-}
-
 void kernel_fp_unavailable_exception(struct pt_regs *regs)
 {
 	enum ctx_state prev_state = exception_enter();

From f43194e45852b0455d2a3e3730f70daa76958423 Mon Sep 17 00:00:00 2001
From: Rashmica Gupta <rashmicy@gmail.com>
Date: Thu, 19 Nov 2015 17:04:53 +1100
Subject: [PATCH 008/149] powerpc: Standardise on NR_syscalls rather than
 __NR_syscalls.

Most architectures use NR_syscalls as the #define for the number of syscalls.

We use __NR_syscalls, and then define NR_syscalls as __NR_syscalls.

__NR_syscalls is not used outside arch code, whereas NR_syscalls is. So, as
NR_syscalls must be defined and __NR_syscalls need not be, replace
__NR_syscalls with NR_syscalls.

Signed-off-by: Rashmica Gupta <rashmicy@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/unistd.h        | 3 +--
 arch/powerpc/include/asm/vdso_datapage.h | 2 +-
 arch/powerpc/kernel/systbl_chk.c         | 2 +-
 arch/powerpc/kernel/systbl_chk.sh        | 2 +-
 arch/powerpc/kernel/vdso.c               | 2 +-
 arch/powerpc/kernel/vdso32/datapage.S    | 2 +-
 arch/powerpc/kernel/vdso64/datapage.S    | 2 +-
 arch/powerpc/platforms/cell/spufs/run.c  | 2 +-
 8 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h
index 4b6b8ace18e0..6a5ace5fa0c8 100644
--- a/arch/powerpc/include/asm/unistd.h
+++ b/arch/powerpc/include/asm/unistd.h
@@ -12,10 +12,9 @@
 #include <uapi/asm/unistd.h>
 
 
-#define __NR_syscalls		379
+#define NR_syscalls		379
 
 #define __NR__exit __NR_exit
-#define NR_syscalls	__NR_syscalls
 
 #ifndef __ASSEMBLY__
 
diff --git a/arch/powerpc/include/asm/vdso_datapage.h b/arch/powerpc/include/asm/vdso_datapage.h
index b73a8199f161..1afe90ade595 100644
--- a/arch/powerpc/include/asm/vdso_datapage.h
+++ b/arch/powerpc/include/asm/vdso_datapage.h
@@ -41,7 +41,7 @@
 #include <linux/unistd.h>
 #include <linux/time.h>
 
-#define SYSCALL_MAP_SIZE      ((__NR_syscalls + 31) / 32)
+#define SYSCALL_MAP_SIZE      ((NR_syscalls + 31) / 32)
 
 /*
  * So here is the ppc64 backward compatible version
diff --git a/arch/powerpc/kernel/systbl_chk.c b/arch/powerpc/kernel/systbl_chk.c
index 2384129f5893..55323a620cfe 100644
--- a/arch/powerpc/kernel/systbl_chk.c
+++ b/arch/powerpc/kernel/systbl_chk.c
@@ -57,4 +57,4 @@
 
 START_TABLE
 #include <asm/systbl.h>
-END_TABLE __NR_syscalls
+END_TABLE NR_syscalls
diff --git a/arch/powerpc/kernel/systbl_chk.sh b/arch/powerpc/kernel/systbl_chk.sh
index 19415e7674a5..31b6e7c358ca 100644
--- a/arch/powerpc/kernel/systbl_chk.sh
+++ b/arch/powerpc/kernel/systbl_chk.sh
@@ -16,7 +16,7 @@ awk	'BEGIN { num = -1; }	# Ignore the beginning of the file
 	/^START_TABLE/ { num = 0; next; }
 	/^END_TABLE/ {
 		if (num != $2) {
-			printf "__NR_syscalls (%s) is not one more than the last syscall (%s)\n",
+			printf "NR_syscalls (%s) is not one more than the last syscall (%s)\n",
 				$2, num - 1;
 			exit(1);
 		}
diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index b457bfa28436..def1b8b5e6c1 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -671,7 +671,7 @@ static void __init vdso_setup_syscall_map(void)
 	extern unsigned long sys_ni_syscall;
 
 
-	for (i = 0; i < __NR_syscalls; i++) {
+	for (i = 0; i < NR_syscalls; i++) {
 #ifdef CONFIG_PPC64
 		if (sys_call_table[i*2] != sys_ni_syscall)
 			vdso_data->syscall_map_64[i >> 5] |=
diff --git a/arch/powerpc/kernel/vdso32/datapage.S b/arch/powerpc/kernel/vdso32/datapage.S
index 59cf5f452879..3745113fcc65 100644
--- a/arch/powerpc/kernel/vdso32/datapage.S
+++ b/arch/powerpc/kernel/vdso32/datapage.S
@@ -61,7 +61,7 @@ V_FUNCTION_BEGIN(__kernel_get_syscall_map)
 	addi	r3,r3,CFG_SYSCALL_MAP32
 	cmpli	cr0,r4,0
 	beqlr
-	li	r0,__NR_syscalls
+	li	r0,NR_syscalls
 	stw	r0,0(r4)
 	crclr	cr0*4+so
 	blr
diff --git a/arch/powerpc/kernel/vdso64/datapage.S b/arch/powerpc/kernel/vdso64/datapage.S
index 2f01c4a0d8a0..184a6ba7f283 100644
--- a/arch/powerpc/kernel/vdso64/datapage.S
+++ b/arch/powerpc/kernel/vdso64/datapage.S
@@ -62,7 +62,7 @@ V_FUNCTION_BEGIN(__kernel_get_syscall_map)
 	cmpli	cr0,r4,0
 	crclr	cr0*4+so
 	beqlr
-	li	r0,__NR_syscalls
+	li	r0,NR_syscalls
 	stw	r0,0(r4)
 	blr
   .cfi_endproc
diff --git a/arch/powerpc/platforms/cell/spufs/run.c b/arch/powerpc/platforms/cell/spufs/run.c
index 4ddf769a64e5..9f79004e6d6f 100644
--- a/arch/powerpc/platforms/cell/spufs/run.c
+++ b/arch/powerpc/platforms/cell/spufs/run.c
@@ -326,7 +326,7 @@ static int spu_process_callback(struct spu_context *ctx)
 	spu_ret = -ENOSYS;
 	npc += 4;
 
-	if (s.nr_ret < __NR_syscalls) {
+	if (s.nr_ret < NR_syscalls) {
 		spu_release(ctx);
 		/* do actual system call from here */
 		spu_ret = spu_sys_callback(&s);

From 343c3327c12b13655192983007de94bd44fd7941 Mon Sep 17 00:00:00 2001
From: Rashmica Gupta <rashmicy@gmail.com>
Date: Sat, 21 Nov 2015 17:08:16 +1100
Subject: [PATCH 009/149] powerpc: Add rN aliases to the pt_regs_offset table.

It is common practice with powerpc to use 'rN' to refer to register 'N'. However
when using the pt_regs_offset table we have to use 'gprN'.

So add aliases such that both 'rN' and 'gprN' can be used.

For example, we can currently do:
  $ su -
  $ cd /sys/kernel/debug/tracing
  $ echo "p:probe/sys_fchownat sys_fchownat %gpr3:s32 +0(%gpr4):string %gpr5:s32 %gpr6:s32 %gpr7:s32" > kprobe_events
  $ echo 1 > events/probe/sys_fchownat/enable
  $ touch /tmp/foo
  $ chown root /tmp/foo
  $ echo 0 > events/enable
  $ cat trace
    chown-2925  [014] d...    76.160657: sys_fchownat: (SyS_fchownat+0x8/0x1a0) arg1=-100 arg2="/tmp/foo" arg3=0 arg4=-1 arg5=0

Instead we'd like to be able to use:
 $ echo "p:probe/sys_fchownat sys_fchownat %r3:s32 +0(%r4):string %r5:s32 %r6:s32 %r7:s32" > kprobe_events

Signed-off-by: Rashmica Gupta <rashmicy@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/ptrace.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c
index 737c0d0b53ac..30a03c03fe73 100644
--- a/arch/powerpc/kernel/ptrace.c
+++ b/arch/powerpc/kernel/ptrace.c
@@ -60,6 +60,7 @@ struct pt_regs_offset {
 #define STR(s)	#s			/* convert to string */
 #define REG_OFFSET_NAME(r) {.name = #r, .offset = offsetof(struct pt_regs, r)}
 #define GPR_OFFSET_NAME(num)	\
+	{.name = STR(r##num), .offset = offsetof(struct pt_regs, gpr[num])}, \
 	{.name = STR(gpr##num), .offset = offsetof(struct pt_regs, gpr[num])}
 #define REG_OFFSET_END {.name = NULL, .offset = 0}
 

From 6735b2e985b78e0ecb05e8818c22f46b904f3599 Mon Sep 17 00:00:00 2001
From: Luis de Bethencourt <luis@debethencourt.com>
Date: Tue, 20 Oct 2015 16:13:53 +0100
Subject: [PATCH 010/149] powerpc/rackmeter: Fix module autoload for OF
 platform driver

This platform driver has an OF device ID table but the OF module
alias information is not created, so module autoloading won't work.

Signed-off-by: Luis de Bethencourt <luisbg@osg.samsung.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 drivers/macintosh/rack-meter.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/macintosh/rack-meter.c b/drivers/macintosh/rack-meter.c
index 048901a1111a..caaec654d7ea 100644
--- a/drivers/macintosh/rack-meter.c
+++ b/drivers/macintosh/rack-meter.c
@@ -582,6 +582,7 @@ static struct of_device_id rackmeter_match[] = {
 	{ .name = "i2s" },
 	{ }
 };
+MODULE_DEVICE_TABLE(of, rackmeter_match);
 
 static struct macio_driver rackmeter_driver = {
 	.driver = {

From b4f8144559e172f792f7fa926e4f342fbdbaf6ee Mon Sep 17 00:00:00 2001
From: Luis de Bethencourt <luis@debethencourt.com>
Date: Tue, 20 Oct 2015 16:04:13 +0100
Subject: [PATCH 011/149] powerpc/axonram: Fix module autoload for OF platform
 driver

This platform driver has an OF device ID table but the OF module
alias information is not created, so module autoloading won't work.

Signed-off-by: Luis de Bethencourt <luisbg@osg.samsung.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/sysdev/axonram.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c
index 7a399b4d60a0..c713b349d967 100644
--- a/arch/powerpc/sysdev/axonram.c
+++ b/arch/powerpc/sysdev/axonram.c
@@ -313,6 +313,7 @@ static const struct of_device_id axon_ram_device_id[] = {
 	},
 	{}
 };
+MODULE_DEVICE_TABLE(of, axon_ram_device_id);
 
 static struct platform_driver axon_ram_driver = {
 	.probe		= axon_ram_probe,

From 31a40e2b052c0f2b80df7b56928f9d5ff9c96933 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Thu, 12 Nov 2015 16:44:42 +1100
Subject: [PATCH 012/149] powerpc/64: Include KVM guest test in all interrupt
 vectors

Currently, if HV KVM is configured but PR KVM isn't, we don't include
a test to see whether we were interrupted in KVM guest context for the
set of interrupts which get delivered directly to the guest by hardware
if they occur in the guest.  This includes things like program
interrupts.

However, the recent bug where userspace could set the MSR for a VCPU
to have an illegal value in the TS field, and thus cause a TM Bad Thing
type of program interrupt on the hrfid that enters the guest, showed that
we can never be completely sure that these interrupts can never occur
in the guest entry/exit code.  If one of these interrupts does happen
and we have HV KVM configured but not PR KVM, then we end up trying to
run the handler in the host with the MMU set to the guest MMU context,
which generally ends badly.

Thus, for robustness it is better to have the test in every interrupt
vector, so that if some way is found to trigger some interrupt in the
guest entry/exit path, we can handle it without immediately crashing
the host.

This means that the distinction between KVMTEST and KVMTEST_PR goes
away.  Thus we delete KVMTEST_PR and associated macros and use KVMTEST
everywhere that we previously used either KVMTEST_PR or KVMTEST.  It
also means that SOFTEN_TEST_HV_201 becomes the same as SOFTEN_TEST_PR,
so we delete SOFTEN_TEST_HV_201 and use SOFTEN_TEST_PR instead.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/exception-64s.h | 21 +++------------
 arch/powerpc/kernel/exceptions-64s.S     | 34 ++++++++++++------------
 2 files changed, 20 insertions(+), 35 deletions(-)

diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
index 77f52b26dad6..9ee10781121f 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -263,17 +263,6 @@ do_kvm_##n:								\
 #define KVM_HANDLER_SKIP(area, h, n)
 #endif
 
-#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
-#define KVMTEST_PR(n)			__KVMTEST(n)
-#define KVM_HANDLER_PR(area, h, n)	__KVM_HANDLER(area, h, n)
-#define KVM_HANDLER_PR_SKIP(area, h, n)	__KVM_HANDLER_SKIP(area, h, n)
-
-#else
-#define KVMTEST_PR(n)
-#define KVM_HANDLER_PR(area, h, n)
-#define KVM_HANDLER_PR_SKIP(area, h, n)
-#endif
-
 #define NOTEST(n)
 
 /*
@@ -360,13 +349,13 @@ label##_pSeries:					\
 	HMT_MEDIUM_PPR_DISCARD;				\
 	SET_SCRATCH0(r13);		/* save r13 */		\
 	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label##_common,	\
-				 EXC_STD, KVMTEST_PR, vec)
+				 EXC_STD, KVMTEST, vec)
 
 /* Version of above for when we have to branch out-of-line */
 #define STD_EXCEPTION_PSERIES_OOL(vec, label)			\
 	.globl label##_pSeries;					\
 label##_pSeries:						\
-	EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST_PR, vec);	\
+	EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST, vec);	\
 	EXCEPTION_PROLOG_PSERIES_1(label##_common, EXC_STD)
 
 #define STD_EXCEPTION_HV(loc, vec, label)		\
@@ -436,17 +425,13 @@ label##_relon_hv:						\
 #define _SOFTEN_TEST(h, vec)	__SOFTEN_TEST(h, vec)
 
 #define SOFTEN_TEST_PR(vec)						\
-	KVMTEST_PR(vec);						\
+	KVMTEST(vec);							\
 	_SOFTEN_TEST(EXC_STD, vec)
 
 #define SOFTEN_TEST_HV(vec)						\
 	KVMTEST(vec);							\
 	_SOFTEN_TEST(EXC_HV, vec)
 
-#define SOFTEN_TEST_HV_201(vec)						\
-	KVMTEST(vec);							\
-	_SOFTEN_TEST(EXC_STD, vec)
-
 #define SOFTEN_NOTEST_PR(vec)		_SOFTEN_TEST(EXC_STD, vec)
 #define SOFTEN_NOTEST_HV(vec)		_SOFTEN_TEST(EXC_HV, vec)
 
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 0a0399c2af11..1a03142a69fd 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -242,7 +242,7 @@ instruction_access_slb_pSeries:
 	HMT_MEDIUM_PPR_DISCARD
 	SET_SCRATCH0(r13)
 	EXCEPTION_PROLOG_0(PACA_EXSLB)
-	EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x480)
+	EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST, 0x480)
 	std	r3,PACA_EXSLB+EX_R3(r13)
 	mfspr	r3,SPRN_SRR0		/* SRR0 is faulting address */
 #ifdef __DISABLED__
@@ -276,18 +276,18 @@ hardware_interrupt_hv:
 		KVM_HANDLER(PACA_EXGEN, EXC_HV, 0x502)
 	FTR_SECTION_ELSE
 		_MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt,
-					    EXC_STD, SOFTEN_TEST_HV_201)
+					    EXC_STD, SOFTEN_TEST_PR)
 		KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x500)
 	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
 
 	STD_EXCEPTION_PSERIES(0x600, 0x600, alignment)
-	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x600)
+	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x600)
 
 	STD_EXCEPTION_PSERIES(0x700, 0x700, program_check)
-	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x700)
+	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x700)
 
 	STD_EXCEPTION_PSERIES(0x800, 0x800, fp_unavailable)
-	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x800)
+	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x800)
 
 	. = 0x900
 	.globl decrementer_pSeries
@@ -297,10 +297,10 @@ decrementer_pSeries:
 	STD_EXCEPTION_HV(0x980, 0x982, hdecrementer)
 
 	MASKABLE_EXCEPTION_PSERIES(0xa00, 0xa00, doorbell_super)
-	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xa00)
+	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xa00)
 
 	STD_EXCEPTION_PSERIES(0xb00, 0xb00, trap_0b)
-	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xb00)
+	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xb00)
 
 	. = 0xc00
 	.globl	system_call_pSeries
@@ -332,7 +332,7 @@ system_call_pSeries:
 	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xc00)
 
 	STD_EXCEPTION_PSERIES(0xd00, 0xd00, single_step)
-	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xd00)
+	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xd00)
 
 	/* At 0xe??? we have a bunch of hypervisor exceptions, we branch
 	 * out of line to handle them
@@ -408,7 +408,7 @@ hv_facility_unavailable_trampoline:
 #endif /* CONFIG_CBE_RAS */
 
 	STD_EXCEPTION_PSERIES(0x1300, 0x1300, instruction_breakpoint)
-	KVM_HANDLER_PR_SKIP(PACA_EXGEN, EXC_STD, 0x1300)
+	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_STD, 0x1300)
 
 	. = 0x1500
 	.global denorm_exception_hv
@@ -436,7 +436,7 @@ denorm_exception_hv:
 #endif /* CONFIG_CBE_RAS */
 
 	STD_EXCEPTION_PSERIES(0x1700, 0x1700, altivec_assist)
-	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x1700)
+	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x1700)
 
 #ifdef CONFIG_CBE_RAS
 	STD_EXCEPTION_HV(0x1800, 0x1802, cbe_thermal)
@@ -536,9 +536,9 @@ machine_check_pSeries_0:
 	KVM_HANDLER_SKIP(PACA_EXMC, EXC_STD, 0x200)
 	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_STD, 0x300)
 	KVM_HANDLER_SKIP(PACA_EXSLB, EXC_STD, 0x380)
-	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x400)
-	KVM_HANDLER_PR(PACA_EXSLB, EXC_STD, 0x480)
-	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x900)
+	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x400)
+	KVM_HANDLER(PACA_EXSLB, EXC_STD, 0x480)
+	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x900)
 	KVM_HANDLER(PACA_EXGEN, EXC_HV, 0x982)
 
 #ifdef CONFIG_PPC_DENORMALISATION
@@ -621,13 +621,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
 
 	/* moved from 0xf00 */
 	STD_EXCEPTION_PSERIES_OOL(0xf00, performance_monitor)
-	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf00)
+	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xf00)
 	STD_EXCEPTION_PSERIES_OOL(0xf20, altivec_unavailable)
-	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf20)
+	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xf20)
 	STD_EXCEPTION_PSERIES_OOL(0xf40, vsx_unavailable)
-	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf40)
+	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xf40)
 	STD_EXCEPTION_PSERIES_OOL(0xf60, facility_unavailable)
-	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf60)
+	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xf60)
 	STD_EXCEPTION_HV_OOL(0xf82, facility_unavailable)
 	KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xf82)
 

From 07e45c120c9c61b792be62182b0a8f706ee2ab24 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Thu, 29 Oct 2015 11:43:53 +1100
Subject: [PATCH 013/149] powerpc: Don't disable kernel FP/VMX/VSX MSR bits on
 context switch

Writing the MSR is slow, so we want to avoid it whenever possible.

A subsequent patch will add a debug option that strictly manages the
FP/VMX/VSX unavailable bits. For now just remove it, matching what
we do in other areas of the kernel (eg enable_kernel_altivec()).

A context switch microbenchmark using yield():

http://ozlabs.org/~anton/junkcode/context_switch2.c

./context_switch2 --test=yield --fp 0 0

shows an improvement of almost 3% on POWER8.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/entry_64.S | 15 +--------------
 1 file changed, 1 insertion(+), 14 deletions(-)

diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index a94f155db78e..93bb284fddf9 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -453,26 +453,13 @@ _GLOBAL(_switch)
 	SAVE_8GPRS(14, r1)
 	SAVE_10GPRS(22, r1)
 	mflr	r20		/* Return to switch caller */
-	mfmsr	r22
-	li	r0, MSR_FP
-#ifdef CONFIG_VSX
-BEGIN_FTR_SECTION
-	oris	r0,r0,MSR_VSX@h	/* Disable VSX */
-END_FTR_SECTION_IFSET(CPU_FTR_VSX)
-#endif /* CONFIG_VSX */
 #ifdef CONFIG_ALTIVEC
 BEGIN_FTR_SECTION
-	oris	r0,r0,MSR_VEC@h	/* Disable altivec */
 	mfspr	r24,SPRN_VRSAVE	/* save vrsave register value */
 	std	r24,THREAD_VRSAVE(r3)
 END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
 #endif /* CONFIG_ALTIVEC */
-	and.	r0,r0,r22
-	beq+	1f
-	andc	r22,r22,r0
-	MTMSRD(r22)
-	isync
-1:	std	r20,_NIP(r1)
+	std	r20,_NIP(r1)
 	mfcr	r23
 	std	r23,_CCR(r1)
 	std	r1,KSP(r3)	/* Set old stack pointer */

From af72ab646a6bee724f190820e8f56497a5b635f0 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Thu, 29 Oct 2015 11:43:54 +1100
Subject: [PATCH 014/149] powerpc: Don't disable MSR bits in
 do_load_up_transact_*() functions

Similar to the non TM load_up_*() functions, don't disable the MSR
bits on the way out.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/fpu.S    | 4 ----
 arch/powerpc/kernel/vector.S | 4 ----
 2 files changed, 8 deletions(-)

diff --git a/arch/powerpc/kernel/fpu.S b/arch/powerpc/kernel/fpu.S
index 9ad236e5d2c9..38eb79b8a034 100644
--- a/arch/powerpc/kernel/fpu.S
+++ b/arch/powerpc/kernel/fpu.S
@@ -73,10 +73,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
 	MTFSF_L(fr0)
 	REST_32FPVSRS(0, R4, R7)
 
-	/* FP/VSX off again */
-	MTMSRD(r6)
-	SYNC
-
 	blr
 #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
 
diff --git a/arch/powerpc/kernel/vector.S b/arch/powerpc/kernel/vector.S
index f5c80d567d8d..1c5425966204 100644
--- a/arch/powerpc/kernel/vector.S
+++ b/arch/powerpc/kernel/vector.S
@@ -29,10 +29,6 @@ _GLOBAL(do_load_up_transact_altivec)
 	addi	r10,r3,THREAD_TRANSACT_VRSTATE
 	REST_32VRS(0,r4,r10)
 
-	/* Disable VEC again. */
-	MTMSRD(r6)
-	isync
-
 	blr
 #endif
 

From 152d523e6307c7152f9986a542f873b5c5863937 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Thu, 29 Oct 2015 11:43:55 +1100
Subject: [PATCH 015/149] powerpc: Create context switch helpers save_sprs()
 and restore_sprs()

Move all our context switch SPR save and restore code into two
helpers. We do a few optimisations:

- Group all mfsprs and all mtsprs. In many cases an mtspr sets a
scoreboarding bit that an mfspr waits on, so the current practice of
mfspr A; mtspr A; mfspr B; mtspr B is the worst scheduling we can
do.

- SPR writes are slow, so check that the value is changing before
writing it.

A context switch microbenchmark using yield():

http://ozlabs.org/~anton/junkcode/context_switch2.c

./context_switch2 --test=yield 0 0

shows an improvement of almost 10% on POWER8.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/processor.h |  1 +
 arch/powerpc/include/asm/switch_to.h | 11 ----
 arch/powerpc/kernel/entry_64.S       | 60 +-----------------
 arch/powerpc/kernel/process.c        | 92 ++++++++++++++++++++++++----
 4 files changed, 82 insertions(+), 82 deletions(-)

diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
index 5afea361beaa..c273f3e0ba84 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -294,6 +294,7 @@ struct thread_struct {
 #endif
 #ifdef CONFIG_PPC64
 	unsigned long	dscr;
+	unsigned long	fscr;
 	/*
 	 * This member element dscr_inherit indicates that the process
 	 * has explicitly attempted and changed the DSCR register value
diff --git a/arch/powerpc/include/asm/switch_to.h b/arch/powerpc/include/asm/switch_to.h
index 15cca17cba4b..33a071d24ba8 100644
--- a/arch/powerpc/include/asm/switch_to.h
+++ b/arch/powerpc/include/asm/switch_to.h
@@ -15,17 +15,6 @@ extern struct task_struct *__switch_to(struct task_struct *,
 struct thread_struct;
 extern struct task_struct *_switch(struct thread_struct *prev,
 				   struct thread_struct *next);
-#ifdef CONFIG_PPC_BOOK3S_64
-static inline void save_early_sprs(struct thread_struct *prev)
-{
-	if (cpu_has_feature(CPU_FTR_ARCH_207S))
-		prev->tar = mfspr(SPRN_TAR);
-	if (cpu_has_feature(CPU_FTR_DSCR))
-		prev->dscr = mfspr(SPRN_DSCR);
-}
-#else
-static inline void save_early_sprs(struct thread_struct *prev) {}
-#endif
 
 extern void enable_kernel_fp(void);
 extern void enable_kernel_altivec(void);
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 93bb284fddf9..e84e5bc7fe34 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -453,29 +453,12 @@ _GLOBAL(_switch)
 	SAVE_8GPRS(14, r1)
 	SAVE_10GPRS(22, r1)
 	mflr	r20		/* Return to switch caller */
-#ifdef CONFIG_ALTIVEC
-BEGIN_FTR_SECTION
-	mfspr	r24,SPRN_VRSAVE	/* save vrsave register value */
-	std	r24,THREAD_VRSAVE(r3)
-END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
-#endif /* CONFIG_ALTIVEC */
+
 	std	r20,_NIP(r1)
 	mfcr	r23
 	std	r23,_CCR(r1)
 	std	r1,KSP(r3)	/* Set old stack pointer */
 
-#ifdef CONFIG_PPC_BOOK3S_64
-BEGIN_FTR_SECTION
-	/* Event based branch registers */
-	mfspr	r0, SPRN_BESCR
-	std	r0, THREAD_BESCR(r3)
-	mfspr	r0, SPRN_EBBHR
-	std	r0, THREAD_EBBHR(r3)
-	mfspr	r0, SPRN_EBBRR
-	std	r0, THREAD_EBBRR(r3)
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
-#endif
-
 #ifdef CONFIG_SMP
 	/* We need a sync somewhere here to make sure that if the
 	 * previous task gets rescheduled on another CPU, it sees all
@@ -563,47 +546,6 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
 	mr	r1,r8		/* start using new stack pointer */
 	std	r7,PACAKSAVE(r13)
 
-#ifdef CONFIG_PPC_BOOK3S_64
-BEGIN_FTR_SECTION
-	/* Event based branch registers */
-	ld	r0, THREAD_BESCR(r4)
-	mtspr	SPRN_BESCR, r0
-	ld	r0, THREAD_EBBHR(r4)
-	mtspr	SPRN_EBBHR, r0
-	ld	r0, THREAD_EBBRR(r4)
-	mtspr	SPRN_EBBRR, r0
-
-	ld	r0,THREAD_TAR(r4)
-	mtspr	SPRN_TAR,r0
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
-#endif
-
-#ifdef CONFIG_ALTIVEC
-BEGIN_FTR_SECTION
-	ld	r0,THREAD_VRSAVE(r4)
-	mtspr	SPRN_VRSAVE,r0		/* if G4, restore VRSAVE reg */
-END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
-#endif /* CONFIG_ALTIVEC */
-#ifdef CONFIG_PPC64
-BEGIN_FTR_SECTION
-	lwz	r6,THREAD_DSCR_INHERIT(r4)
-	ld	r0,THREAD_DSCR(r4)
-	cmpwi	r6,0
-	bne	1f
-	ld	r0,PACA_DSCR_DEFAULT(r13)
-1:
-BEGIN_FTR_SECTION_NESTED(70)
-	mfspr	r8, SPRN_FSCR
-	rldimi	r8, r6, FSCR_DSCR_LG, (63 - FSCR_DSCR_LG)
-	mtspr	SPRN_FSCR, r8
-END_FTR_SECTION_NESTED(CPU_FTR_ARCH_207S, CPU_FTR_ARCH_207S, 70)
-	cmpd	r0,r25
-	beq	2f
-	mtspr	SPRN_DSCR,r0
-2:
-END_FTR_SECTION_IFSET(CPU_FTR_DSCR)
-#endif
-
 	ld	r6,_CCR(r1)
 	mtcrf	0xFF,r6
 
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 75b6676c1a0b..3aabed4a60a9 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -742,6 +742,73 @@ void restore_tm_state(struct pt_regs *regs)
 #define __switch_to_tm(prev)
 #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
 
+/* Read the per-thread SPRs this CPU implements and stash them in @t. */
+static inline void save_sprs(struct thread_struct *t)
+{
+#ifdef CONFIG_ALTIVEC
+	if (cpu_has_feature(CPU_FTR_ALTIVEC))
+		t->vrsave = mfspr(SPRN_VRSAVE);
+#endif
+#ifdef CONFIG_PPC_BOOK3S_64
+	if (cpu_has_feature(CPU_FTR_DSCR))
+		t->dscr = mfspr(SPRN_DSCR);
+
+	if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
+		t->bescr = mfspr(SPRN_BESCR);
+		t->ebbhr = mfspr(SPRN_EBBHR);
+		t->ebbrr = mfspr(SPRN_EBBRR);
+		t->fscr = mfspr(SPRN_FSCR);
+
+		/*
+		 * Note that the TAR is not available for use in the kernel.
+		 * (To provide this, the TAR should be backed up/restored on
+		 * exception entry/exit instead, and be in pt_regs.  FIXME,
+		 * this should be in pt_regs anyway (for debug).)
+		 */
+		t->tar = mfspr(SPRN_TAR);
+	}
+#endif
+}
+
+static inline void restore_sprs(struct thread_struct *old_thread,
+				struct thread_struct *new_thread)
+{
+#ifdef CONFIG_ALTIVEC
+	if (cpu_has_feature(CPU_FTR_ALTIVEC) &&
+	    old_thread->vrsave != new_thread->vrsave)
+		mtspr(SPRN_VRSAVE, new_thread->vrsave);
+#endif
+#ifdef CONFIG_PPC_BOOK3S_64
+	if (cpu_has_feature(CPU_FTR_DSCR)) {
+		u64 dscr = get_paca()->dscr_default;
+		u64 fscr = old_thread->fscr & ~FSCR_DSCR;
+
+		if (new_thread->dscr_inherit) {
+			dscr = new_thread->dscr;
+			fscr |= FSCR_DSCR;
+		}
+
+		if (old_thread->dscr != dscr)
+			mtspr(SPRN_DSCR, dscr);
+
+		if (old_thread->fscr != fscr)
+			mtspr(SPRN_FSCR, fscr);
+	}
+
+	if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
+		if (old_thread->bescr != new_thread->bescr)
+			mtspr(SPRN_BESCR, new_thread->bescr);
+		if (old_thread->ebbhr != new_thread->ebbhr)
+			mtspr(SPRN_EBBHR, new_thread->ebbhr);
+		if (old_thread->ebbrr != new_thread->ebbrr)
+			mtspr(SPRN_EBBRR, new_thread->ebbrr);
+
+		if (old_thread->tar != new_thread->tar)
+			mtspr(SPRN_TAR, new_thread->tar);
+	}
+#endif
+}
+
 struct task_struct *__switch_to(struct task_struct *prev,
 	struct task_struct *new)
 {
@@ -751,17 +818,16 @@ struct task_struct *__switch_to(struct task_struct *prev,
 	struct ppc64_tlb_batch *batch;
 #endif
 
+	new_thread = &new->thread;
+	old_thread = &current->thread;
+
 	WARN_ON(!irqs_disabled());
 
-	/* Back up the TAR and DSCR across context switches.
-	 * Note that the TAR is not available for use in the kernel.  (To
-	 * provide this, the TAR should be backed up/restored on exception
-	 * entry/exit instead, and be in pt_regs.  FIXME, this should be in
-	 * pt_regs anyway (for debug).)
-	 * Save the TAR and DSCR here before we do treclaim/trecheckpoint as
-	 * these will change them.
+	/*
+	 * We need to save SPRs before treclaim/trecheckpoint as these will
+	 * change a number of them.
 	 */
-	save_early_sprs(&prev->thread);
+	save_sprs(&prev->thread);
 
 	__switch_to_tm(prev);
 
@@ -844,10 +910,6 @@ struct task_struct *__switch_to(struct task_struct *prev,
 #endif /* CONFIG_HAVE_HW_BREAKPOINT */
 #endif
 
-
-	new_thread = &new->thread;
-	old_thread = &current->thread;
-
 #ifdef CONFIG_PPC64
 	/*
 	 * Collect processor utilization data per process
@@ -883,6 +945,10 @@ struct task_struct *__switch_to(struct task_struct *prev,
 
 	last = _switch(old_thread, new_thread);
 
+	/* Need to recalculate these after calling _switch() */
+	old_thread = &last->thread;
+	new_thread = &current->thread;
+
 #ifdef CONFIG_PPC_BOOK3S_64
 	if (current_thread_info()->local_flags & _TLF_LAZY_MMU) {
 		current_thread_info()->local_flags &= ~_TLF_LAZY_MMU;
@@ -891,6 +957,8 @@ struct task_struct *__switch_to(struct task_struct *prev,
 	}
 #endif /* CONFIG_PPC_BOOK3S_64 */
 
+	restore_sprs(old_thread, new_thread);
+
 	return last;
 }
 

From 68bfa962bff5783ad65de9dc7f3b9e16ea466766 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Thu, 29 Oct 2015 11:43:56 +1100
Subject: [PATCH 016/149] powerpc: Remove redundant mflr in _switch

No need to execute mflr twice.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/entry_64.S | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index e84e5bc7fe34..c8b4225a0095 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -452,9 +452,7 @@ _GLOBAL(_switch)
 	/* r3-r13 are caller saved -- Cort */
 	SAVE_8GPRS(14, r1)
 	SAVE_10GPRS(22, r1)
-	mflr	r20		/* Return to switch caller */
-
-	std	r20,_NIP(r1)
+	std	r0,_NIP(r1)	/* Return to switch caller */
 	mfcr	r23
 	std	r23,_CCR(r1)
 	std	r1,KSP(r3)	/* Set old stack pointer */

From af1bbc3dd3d501d27da72e1764afe5f5b0d3882d Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Thu, 29 Oct 2015 11:43:57 +1100
Subject: [PATCH 017/149] powerpc: Remove UP only lazy floating point and
 vector optimisations

The UP only lazy floating point and vector optimisations were written
back when SMP was not common, and neither glibc nor gcc used vector
instructions. Now SMP is very common, glibc aggressively uses vector
instructions and gcc autovectorises.

We want to add new optimisations that apply to both UP and SMP, but
in preparation for that remove these UP only optimisations.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/processor.h |   6 --
 arch/powerpc/include/asm/switch_to.h |   8 --
 arch/powerpc/kernel/fpu.S            |  35 ---------
 arch/powerpc/kernel/head_fsl_booke.S |  32 --------
 arch/powerpc/kernel/idle_power7.S    |   7 --
 arch/powerpc/kernel/process.c        | 113 +--------------------------
 arch/powerpc/kernel/signal_32.c      |  18 -----
 arch/powerpc/kernel/signal_64.c      |  18 -----
 arch/powerpc/kernel/vector.S         |  68 ----------------
 9 files changed, 1 insertion(+), 304 deletions(-)

diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
index c273f3e0ba84..a2e891840806 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -88,12 +88,6 @@ struct task_struct;
 void start_thread(struct pt_regs *regs, unsigned long fdptr, unsigned long sp);
 void release_thread(struct task_struct *);
 
-/* Lazy FPU handling on uni-processor */
-extern struct task_struct *last_task_used_math;
-extern struct task_struct *last_task_used_altivec;
-extern struct task_struct *last_task_used_vsx;
-extern struct task_struct *last_task_used_spe;
-
 #ifdef CONFIG_PPC32
 
 #if CONFIG_TASK_SIZE > CONFIG_KERNEL_START
diff --git a/arch/powerpc/include/asm/switch_to.h b/arch/powerpc/include/asm/switch_to.h
index 33a071d24ba8..bd1d93318350 100644
--- a/arch/powerpc/include/asm/switch_to.h
+++ b/arch/powerpc/include/asm/switch_to.h
@@ -27,14 +27,6 @@ extern void giveup_spe(struct task_struct *);
 extern void load_up_spe(struct task_struct *);
 extern void switch_booke_debug_regs(struct debug_reg *new_debug);
 
-#ifndef CONFIG_SMP
-extern void discard_lazy_cpu_state(void);
-#else
-static inline void discard_lazy_cpu_state(void)
-{
-}
-#endif
-
 #ifdef CONFIG_PPC_FPU
 extern void flush_fp_to_thread(struct task_struct *);
 extern void giveup_fpu(struct task_struct *);
diff --git a/arch/powerpc/kernel/fpu.S b/arch/powerpc/kernel/fpu.S
index 38eb79b8a034..50d2352f2cf4 100644
--- a/arch/powerpc/kernel/fpu.S
+++ b/arch/powerpc/kernel/fpu.S
@@ -132,31 +132,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
 	SYNC
 	MTMSRD(r5)			/* enable use of fpu now */
 	isync
-/*
- * For SMP, we don't do lazy FPU switching because it just gets too
- * horrendously complex, especially when a task switches from one CPU
- * to another.  Instead we call giveup_fpu in switch_to.
- */
-#ifndef CONFIG_SMP
-	LOAD_REG_ADDRBASE(r3, last_task_used_math)
-	toreal(r3)
-	PPC_LL	r4,ADDROFF(last_task_used_math)(r3)
-	PPC_LCMPI	0,r4,0
-	beq	1f
-	toreal(r4)
-	addi	r4,r4,THREAD		/* want last_task_used_math->thread */
-	addi	r10,r4,THREAD_FPSTATE
-	SAVE_32FPVSRS(0, R5, R10)
-	mffs	fr0
-	stfd	fr0,FPSTATE_FPSCR(r10)
-	PPC_LL	r5,PT_REGS(r4)
-	toreal(r5)
-	PPC_LL	r4,_MSR-STACK_FRAME_OVERHEAD(r5)
-	li	r10,MSR_FP|MSR_FE0|MSR_FE1
-	andc	r4,r4,r10		/* disable FP for previous task */
-	PPC_STL	r4,_MSR-STACK_FRAME_OVERHEAD(r5)
-1:
-#endif /* CONFIG_SMP */
 	/* enable use of FP after return */
 #ifdef CONFIG_PPC32
 	mfspr	r5,SPRN_SPRG_THREAD	/* current task's THREAD (phys) */
@@ -175,11 +150,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
 	lfd	fr0,FPSTATE_FPSCR(r10)
 	MTFSF_L(fr0)
 	REST_32FPVSRS(0, R4, R10)
-#ifndef CONFIG_SMP
-	subi	r4,r5,THREAD
-	fromreal(r4)
-	PPC_STL	r4,ADDROFF(last_task_used_math)(r3)
-#endif /* CONFIG_SMP */
 	/* restore registers and return */
 	/* we haven't used ctr or xer or lr */
 	blr
@@ -226,11 +196,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
 	andc	r4,r4,r3		/* disable FP for previous task */
 	PPC_STL	r4,_MSR-STACK_FRAME_OVERHEAD(r5)
 1:
-#ifndef CONFIG_SMP
-	li	r5,0
-	LOAD_REG_ADDRBASE(r4,last_task_used_math)
-	PPC_STL	r5,ADDROFF(last_task_used_math)(r4)
-#endif /* CONFIG_SMP */
 	blr
 
 /*
diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S
index fffd1f96bb1d..ec936abbcadc 100644
--- a/arch/powerpc/kernel/head_fsl_booke.S
+++ b/arch/powerpc/kernel/head_fsl_booke.S
@@ -857,29 +857,6 @@ _GLOBAL(load_up_spe)
 	oris	r5,r5,MSR_SPE@h
 	mtmsr	r5			/* enable use of SPE now */
 	isync
-/*
- * For SMP, we don't do lazy SPE switching because it just gets too
- * horrendously complex, especially when a task switches from one CPU
- * to another.  Instead we call giveup_spe in switch_to.
- */
-#ifndef CONFIG_SMP
-	lis	r3,last_task_used_spe@ha
-	lwz	r4,last_task_used_spe@l(r3)
-	cmpi	0,r4,0
-	beq	1f
-	addi	r4,r4,THREAD	/* want THREAD of last_task_used_spe */
-	SAVE_32EVRS(0,r10,r4,THREAD_EVR0)
-	evxor	evr10, evr10, evr10	/* clear out evr10 */
-	evmwumiaa evr10, evr10, evr10	/* evr10 <- ACC = 0 * 0 + ACC */
-	li	r5,THREAD_ACC
-	evstddx	evr10, r4, r5		/* save off accumulator */
-	lwz	r5,PT_REGS(r4)
-	lwz	r4,_MSR-STACK_FRAME_OVERHEAD(r5)
-	lis	r10,MSR_SPE@h
-	andc	r4,r4,r10	/* disable SPE for previous task */
-	stw	r4,_MSR-STACK_FRAME_OVERHEAD(r5)
-1:
-#endif /* !CONFIG_SMP */
 	/* enable use of SPE after return */
 	oris	r9,r9,MSR_SPE@h
 	mfspr	r5,SPRN_SPRG_THREAD	/* current task's THREAD (phys) */
@@ -889,10 +866,6 @@ _GLOBAL(load_up_spe)
 	evlddx	evr4,r10,r5
 	evmra	evr4,evr4
 	REST_32EVRS(0,r10,r5,THREAD_EVR0)
-#ifndef CONFIG_SMP
-	subi	r4,r5,THREAD
-	stw	r4,last_task_used_spe@l(r3)
-#endif /* !CONFIG_SMP */
 	blr
 
 /*
@@ -1035,11 +1008,6 @@ _GLOBAL(giveup_spe)
 	andc	r4,r4,r3		/* disable SPE for previous task */
 	stw	r4,_MSR-STACK_FRAME_OVERHEAD(r5)
 1:
-#ifndef CONFIG_SMP
-	li	r5,0
-	lis	r4,last_task_used_spe@ha
-	stw	r5,last_task_used_spe@l(r4)
-#endif /* !CONFIG_SMP */
 	blr
 #endif /* CONFIG_SPE */
 
diff --git a/arch/powerpc/kernel/idle_power7.S b/arch/powerpc/kernel/idle_power7.S
index 112ccf497562..cf4fb5429cf1 100644
--- a/arch/powerpc/kernel/idle_power7.S
+++ b/arch/powerpc/kernel/idle_power7.S
@@ -89,13 +89,6 @@ _GLOBAL(power7_powersave_common)
 	std	r0,_LINK(r1)
 	std	r0,_NIP(r1)
 
-#ifndef CONFIG_SMP
-	/* Make sure FPU, VSX etc... are flushed as we may lose
-	 * state when going to nap mode
-	 */
-	bl	discard_lazy_cpu_state
-#endif /* CONFIG_SMP */
-
 	/* Hard disable interrupts */
 	mfmsr	r9
 	rldicl	r9,r9,48,1
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 3aabed4a60a9..e098f4315643 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -67,13 +67,6 @@
 
 extern unsigned long _get_SP(void);
 
-#ifndef CONFIG_SMP
-struct task_struct *last_task_used_math = NULL;
-struct task_struct *last_task_used_altivec = NULL;
-struct task_struct *last_task_used_vsx = NULL;
-struct task_struct *last_task_used_spe = NULL;
-#endif
-
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 void giveup_fpu_maybe_transactional(struct task_struct *tsk)
 {
@@ -134,16 +127,14 @@ void flush_fp_to_thread(struct task_struct *tsk)
 		 */
 		preempt_disable();
 		if (tsk->thread.regs->msr & MSR_FP) {
-#ifdef CONFIG_SMP
 			/*
 			 * This should only ever be called for current or
 			 * for a stopped child process.  Since we save away
-			 * the FP register state on context switch on SMP,
+			 * the FP register state on context switch,
 			 * there is something wrong if a stopped child appears
 			 * to still have its FP state in the CPU registers.
 			 */
 			BUG_ON(tsk != current);
-#endif
 			giveup_fpu_maybe_transactional(tsk);
 		}
 		preempt_enable();
@@ -156,14 +147,10 @@ void enable_kernel_fp(void)
 {
 	WARN_ON(preemptible());
 
-#ifdef CONFIG_SMP
 	if (current->thread.regs && (current->thread.regs->msr & MSR_FP))
 		giveup_fpu_maybe_transactional(current);
 	else
 		giveup_fpu(NULL);	/* just enables FP for kernel */
-#else
-	giveup_fpu_maybe_transactional(last_task_used_math);
-#endif /* CONFIG_SMP */
 }
 EXPORT_SYMBOL(enable_kernel_fp);
 
@@ -172,14 +159,10 @@ void enable_kernel_altivec(void)
 {
 	WARN_ON(preemptible());
 
-#ifdef CONFIG_SMP
 	if (current->thread.regs && (current->thread.regs->msr & MSR_VEC))
 		giveup_altivec_maybe_transactional(current);
 	else
 		giveup_altivec_notask();
-#else
-	giveup_altivec_maybe_transactional(last_task_used_altivec);
-#endif /* CONFIG_SMP */
 }
 EXPORT_SYMBOL(enable_kernel_altivec);
 
@@ -192,9 +175,7 @@ void flush_altivec_to_thread(struct task_struct *tsk)
 	if (tsk->thread.regs) {
 		preempt_disable();
 		if (tsk->thread.regs->msr & MSR_VEC) {
-#ifdef CONFIG_SMP
 			BUG_ON(tsk != current);
-#endif
 			giveup_altivec_maybe_transactional(tsk);
 		}
 		preempt_enable();
@@ -208,14 +189,10 @@ void enable_kernel_vsx(void)
 {
 	WARN_ON(preemptible());
 
-#ifdef CONFIG_SMP
 	if (current->thread.regs && (current->thread.regs->msr & MSR_VSX))
 		giveup_vsx(current);
 	else
 		giveup_vsx(NULL);	/* just enable vsx for kernel - force */
-#else
-	giveup_vsx(last_task_used_vsx);
-#endif /* CONFIG_SMP */
 }
 EXPORT_SYMBOL(enable_kernel_vsx);
 
@@ -232,9 +209,7 @@ void flush_vsx_to_thread(struct task_struct *tsk)
 	if (tsk->thread.regs) {
 		preempt_disable();
 		if (tsk->thread.regs->msr & MSR_VSX) {
-#ifdef CONFIG_SMP
 			BUG_ON(tsk != current);
-#endif
 			giveup_vsx(tsk);
 		}
 		preempt_enable();
@@ -249,14 +224,10 @@ void enable_kernel_spe(void)
 {
 	WARN_ON(preemptible());
 
-#ifdef CONFIG_SMP
 	if (current->thread.regs && (current->thread.regs->msr & MSR_SPE))
 		giveup_spe(current);
 	else
 		giveup_spe(NULL);	/* just enable SPE for kernel - force */
-#else
-	giveup_spe(last_task_used_spe);
-#endif /* __SMP __ */
 }
 EXPORT_SYMBOL(enable_kernel_spe);
 
@@ -265,9 +236,7 @@ void flush_spe_to_thread(struct task_struct *tsk)
 	if (tsk->thread.regs) {
 		preempt_disable();
 		if (tsk->thread.regs->msr & MSR_SPE) {
-#ifdef CONFIG_SMP
 			BUG_ON(tsk != current);
-#endif
 			tsk->thread.spefscr = mfspr(SPRN_SPEFSCR);
 			giveup_spe(tsk);
 		}
@@ -276,32 +245,6 @@ void flush_spe_to_thread(struct task_struct *tsk)
 }
 #endif /* CONFIG_SPE */
 
-#ifndef CONFIG_SMP
-/*
- * If we are doing lazy switching of CPU state (FP, altivec or SPE),
- * and the current task has some state, discard it.
- */
-void discard_lazy_cpu_state(void)
-{
-	preempt_disable();
-	if (last_task_used_math == current)
-		last_task_used_math = NULL;
-#ifdef CONFIG_ALTIVEC
-	if (last_task_used_altivec == current)
-		last_task_used_altivec = NULL;
-#endif /* CONFIG_ALTIVEC */
-#ifdef CONFIG_VSX
-	if (last_task_used_vsx == current)
-		last_task_used_vsx = NULL;
-#endif /* CONFIG_VSX */
-#ifdef CONFIG_SPE
-	if (last_task_used_spe == current)
-		last_task_used_spe = NULL;
-#endif
-	preempt_enable();
-}
-#endif /* CONFIG_SMP */
-
 #ifdef CONFIG_PPC_ADV_DEBUG_REGS
 void do_send_trap(struct pt_regs *regs, unsigned long address,
 		  unsigned long error_code, int signal_code, int breakpt)
@@ -831,30 +774,9 @@ struct task_struct *__switch_to(struct task_struct *prev,
 
 	__switch_to_tm(prev);
 
-#ifdef CONFIG_SMP
-	/* avoid complexity of lazy save/restore of fpu
-	 * by just saving it every time we switch out if
-	 * this task used the fpu during the last quantum.
-	 *
-	 * If it tries to use the fpu again, it'll trap and
-	 * reload its fp regs.  So we don't have to do a restore
-	 * every switch, just a save.
-	 *  -- Cort
-	 */
 	if (prev->thread.regs && (prev->thread.regs->msr & MSR_FP))
 		giveup_fpu(prev);
 #ifdef CONFIG_ALTIVEC
-	/*
-	 * If the previous thread used altivec in the last quantum
-	 * (thus changing altivec regs) then save them.
-	 * We used to check the VRSAVE register but not all apps
-	 * set it, so we don't rely on it now (and in fact we need
-	 * to save & restore VSCR even if VRSAVE == 0).  -- paulus
-	 *
-	 * On SMP we always save/restore altivec regs just to avoid the
-	 * complexity of changing processors.
-	 *  -- Cort
-	 */
 	if (prev->thread.regs && (prev->thread.regs->msr & MSR_VEC))
 		giveup_altivec(prev);
 #endif /* CONFIG_ALTIVEC */
@@ -864,39 +786,10 @@ struct task_struct *__switch_to(struct task_struct *prev,
 		__giveup_vsx(prev);
 #endif /* CONFIG_VSX */
 #ifdef CONFIG_SPE
-	/*
-	 * If the previous thread used spe in the last quantum
-	 * (thus changing spe regs) then save them.
-	 *
-	 * On SMP we always save/restore spe regs just to avoid the
-	 * complexity of changing processors.
-	 */
 	if ((prev->thread.regs && (prev->thread.regs->msr & MSR_SPE)))
 		giveup_spe(prev);
 #endif /* CONFIG_SPE */
 
-#else  /* CONFIG_SMP */
-#ifdef CONFIG_ALTIVEC
-	/* Avoid the trap.  On smp this this never happens since
-	 * we don't set last_task_used_altivec -- Cort
-	 */
-	if (new->thread.regs && last_task_used_altivec == new)
-		new->thread.regs->msr |= MSR_VEC;
-#endif /* CONFIG_ALTIVEC */
-#ifdef CONFIG_VSX
-	if (new->thread.regs && last_task_used_vsx == new)
-		new->thread.regs->msr |= MSR_VSX;
-#endif /* CONFIG_VSX */
-#ifdef CONFIG_SPE
-	/* Avoid the trap.  On smp this this never happens since
-	 * we don't set last_task_used_spe
-	 */
-	if (new->thread.regs && last_task_used_spe == new)
-		new->thread.regs->msr |= MSR_SPE;
-#endif /* CONFIG_SPE */
-
-#endif /* CONFIG_SMP */
-
 #ifdef CONFIG_PPC_ADV_DEBUG_REGS
 	switch_booke_debug_regs(&new->thread.debug);
 #else
@@ -1111,13 +1004,10 @@ void show_regs(struct pt_regs * regs)
 
 void exit_thread(void)
 {
-	discard_lazy_cpu_state();
 }
 
 void flush_thread(void)
 {
-	discard_lazy_cpu_state();
-
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 	flush_ptrace_hw_breakpoint(current);
 #else /* CONFIG_HAVE_HW_BREAKPOINT */
@@ -1355,7 +1245,6 @@ void start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp)
 		regs->msr = MSR_USER32;
 	}
 #endif
-	discard_lazy_cpu_state();
 #ifdef CONFIG_VSX
 	current->thread.used_vsr = 0;
 #endif
diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 0dbee465af7a..3cd7a32c8ff4 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -687,15 +687,6 @@ static long restore_user_regs(struct pt_regs *regs,
 	if (sig)
 		regs->msr = (regs->msr & ~MSR_LE) | (msr & MSR_LE);
 
-	/*
-	 * Do this before updating the thread state in
-	 * current->thread.fpr/vr/evr.  That way, if we get preempted
-	 * and another task grabs the FPU/Altivec/SPE, it won't be
-	 * tempted to save the current CPU state into the thread_struct
-	 * and corrupt what we are writing there.
-	 */
-	discard_lazy_cpu_state();
-
 #ifdef CONFIG_ALTIVEC
 	/*
 	 * Force the process to reload the altivec registers from
@@ -798,15 +789,6 @@ static long restore_tm_user_regs(struct pt_regs *regs,
 	/* Restore the previous little-endian mode */
 	regs->msr = (regs->msr & ~MSR_LE) | (msr & MSR_LE);
 
-	/*
-	 * Do this before updating the thread state in
-	 * current->thread.fpr/vr/evr.  That way, if we get preempted
-	 * and another task grabs the FPU/Altivec/SPE, it won't be
-	 * tempted to save the current CPU state into the thread_struct
-	 * and corrupt what we are writing there.
-	 */
-	discard_lazy_cpu_state();
-
 #ifdef CONFIG_ALTIVEC
 	regs->msr &= ~MSR_VEC;
 	if (msr & MSR_VEC) {
diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index 20756dfb9f34..6f2b555516e6 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -349,15 +349,6 @@ static long restore_sigcontext(struct pt_regs *regs, sigset_t *set, int sig,
 	if (set != NULL)
 		err |=  __get_user(set->sig[0], &sc->oldmask);
 
-	/*
-	 * Do this before updating the thread state in
-	 * current->thread.fpr/vr.  That way, if we get preempted
-	 * and another task grabs the FPU/Altivec, it won't be
-	 * tempted to save the current CPU state into the thread_struct
-	 * and corrupt what we are writing there.
-	 */
-	discard_lazy_cpu_state();
-
 	/*
 	 * Force reload of FP/VEC.
 	 * This has to be done before copying stuff into current->thread.fpr/vr
@@ -464,15 +455,6 @@ static long restore_tm_sigcontexts(struct pt_regs *regs,
 	err |= __get_user(regs->dsisr, &sc->gp_regs[PT_DSISR]);
 	err |= __get_user(regs->result, &sc->gp_regs[PT_RESULT]);
 
-	/*
-	 * Do this before updating the thread state in
-	 * current->thread.fpr/vr.  That way, if we get preempted
-	 * and another task grabs the FPU/Altivec, it won't be
-	 * tempted to save the current CPU state into the thread_struct
-	 * and corrupt what we are writing there.
-	 */
-	discard_lazy_cpu_state();
-
 	/*
 	 * Force reload of FP/VEC.
 	 * This has to be done before copying stuff into current->thread.fpr/vr
diff --git a/arch/powerpc/kernel/vector.S b/arch/powerpc/kernel/vector.S
index 1c5425966204..1757c0c936c1 100644
--- a/arch/powerpc/kernel/vector.S
+++ b/arch/powerpc/kernel/vector.S
@@ -80,39 +80,6 @@ _GLOBAL(load_up_altivec)
 	MTMSRD(r5)			/* enable use of AltiVec now */
 	isync
 
-/*
- * For SMP, we don't do lazy VMX switching because it just gets too
- * horrendously complex, especially when a task switches from one CPU
- * to another.  Instead we call giveup_altvec in switch_to.
- * VRSAVE isn't dealt with here, that is done in the normal context
- * switch code. Note that we could rely on vrsave value to eventually
- * avoid saving all of the VREGs here...
- */
-#ifndef CONFIG_SMP
-	LOAD_REG_ADDRBASE(r3, last_task_used_altivec)
-	toreal(r3)
-	PPC_LL	r4,ADDROFF(last_task_used_altivec)(r3)
-	PPC_LCMPI	0,r4,0
-	beq	1f
-
-	/* Save VMX state to last_task_used_altivec's THREAD struct */
-	toreal(r4)
-	addi	r4,r4,THREAD
-	addi	r6,r4,THREAD_VRSTATE
-	SAVE_32VRS(0,r5,r6)
-	mfvscr	v0
-	li	r10,VRSTATE_VSCR
-	stvx	v0,r10,r6
-	/* Disable VMX for last_task_used_altivec */
-	PPC_LL	r5,PT_REGS(r4)
-	toreal(r5)
-	PPC_LL	r4,_MSR-STACK_FRAME_OVERHEAD(r5)
-	lis	r10,MSR_VEC@h
-	andc	r4,r4,r10
-	PPC_STL	r4,_MSR-STACK_FRAME_OVERHEAD(r5)
-1:
-#endif /* CONFIG_SMP */
-
 	/* Hack: if we get an altivec unavailable trap with VRSAVE
 	 * set to all zeros, we assume this is a broken application
 	 * that fails to set it properly, and thus we switch it to
@@ -141,12 +108,6 @@ _GLOBAL(load_up_altivec)
 	lvx	v0,r10,r6
 	mtvscr	v0
 	REST_32VRS(0,r4,r6)
-#ifndef CONFIG_SMP
-	/* Update last_task_used_altivec to 'current' */
-	subi	r4,r5,THREAD		/* Back to 'current' */
-	fromreal(r4)
-	PPC_STL	r4,ADDROFF(last_task_used_altivec)(r3)
-#endif /* CONFIG_SMP */
 	/* restore registers and return */
 	blr
 
@@ -199,11 +160,6 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_VSX)
 	andc	r4,r4,r3		/* disable FP for previous task */
 	PPC_STL	r4,_MSR-STACK_FRAME_OVERHEAD(r5)
 1:
-#ifndef CONFIG_SMP
-	li	r5,0
-	LOAD_REG_ADDRBASE(r4,last_task_used_altivec)
-	PPC_STL	r5,ADDROFF(last_task_used_altivec)(r4)
-#endif /* CONFIG_SMP */
 	blr
 
 #ifdef CONFIG_VSX
@@ -226,20 +182,6 @@ _GLOBAL(load_up_vsx)
 	andis.	r5,r12,MSR_VEC@h
 	beql+	load_up_altivec		/* skip if already loaded */
 
-#ifndef CONFIG_SMP
-	ld	r3,last_task_used_vsx@got(r2)
-	ld	r4,0(r3)
-	cmpdi	0,r4,0
-	beq	1f
-	/* Disable VSX for last_task_used_vsx */
-	addi	r4,r4,THREAD
-	ld	r5,PT_REGS(r4)
-	ld	r4,_MSR-STACK_FRAME_OVERHEAD(r5)
-	lis	r6,MSR_VSX@h
-	andc	r6,r4,r6
-	std	r6,_MSR-STACK_FRAME_OVERHEAD(r5)
-1:
-#endif /* CONFIG_SMP */
 	ld	r4,PACACURRENT(r13)
 	addi	r4,r4,THREAD		/* Get THREAD */
 	li	r6,1
@@ -247,11 +189,6 @@ _GLOBAL(load_up_vsx)
 	/* enable use of VSX after return */
 	oris	r12,r12,MSR_VSX@h
 	std	r12,_MSR(r1)
-#ifndef CONFIG_SMP
-	/* Update last_task_used_vsx to 'current' */
-	ld	r4,PACACURRENT(r13)
-	std	r4,0(r3)
-#endif /* CONFIG_SMP */
 	b	fast_exception_return
 
 /*
@@ -277,11 +214,6 @@ _GLOBAL(__giveup_vsx)
 	andc	r4,r4,r3		/* disable VSX for previous task */
 	std	r4,_MSR-STACK_FRAME_OVERHEAD(r5)
 1:
-#ifndef CONFIG_SMP
-	li	r5,0
-	ld	r4,last_task_used_vsx@got(r2)
-	std	r5,0(r4)
-#endif /* CONFIG_SMP */
 	blr
 
 #endif /* CONFIG_VSX */

From b86fd2bd03021ce906bfa0c1456ec38329e31b30 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Thu, 29 Oct 2015 11:43:58 +1100
Subject: [PATCH 018/149] powerpc: Simplify TM restore checks

Instead of having multiple giveup_*_maybe_transactional() functions,
separate out the TM check into a new function called
check_if_tm_restore_required().

This will make it easier to optimise the giveup_*() functions in a
subsequent patch.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/process.c | 53 +++++++++++++----------------------
 1 file changed, 19 insertions(+), 34 deletions(-)

diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index e098f4315643..ef64219548d5 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -68,7 +68,7 @@
 extern unsigned long _get_SP(void);
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-void giveup_fpu_maybe_transactional(struct task_struct *tsk)
+static void check_if_tm_restore_required(struct task_struct *tsk)
 {
 	/*
 	 * If we are saving the current thread's registers, and the
@@ -82,31 +82,9 @@ void giveup_fpu_maybe_transactional(struct task_struct *tsk)
 		tsk->thread.ckpt_regs.msr = tsk->thread.regs->msr;
 		set_thread_flag(TIF_RESTORE_TM);
 	}
-
-	giveup_fpu(tsk);
 }
-
-void giveup_altivec_maybe_transactional(struct task_struct *tsk)
-{
-	/*
-	 * If we are saving the current thread's registers, and the
-	 * thread is in a transactional state, set the TIF_RESTORE_TM
-	 * bit so that we know to restore the registers before
-	 * returning to userspace.
-	 */
-	if (tsk == current && tsk->thread.regs &&
-	    MSR_TM_ACTIVE(tsk->thread.regs->msr) &&
-	    !test_thread_flag(TIF_RESTORE_TM)) {
-		tsk->thread.ckpt_regs.msr = tsk->thread.regs->msr;
-		set_thread_flag(TIF_RESTORE_TM);
-	}
-
-	giveup_altivec(tsk);
-}
-
 #else
-#define giveup_fpu_maybe_transactional(tsk)	giveup_fpu(tsk)
-#define giveup_altivec_maybe_transactional(tsk)	giveup_altivec(tsk)
+static inline void check_if_tm_restore_required(struct task_struct *tsk) { }
 #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
 
 #ifdef CONFIG_PPC_FPU
@@ -135,7 +113,8 @@ void flush_fp_to_thread(struct task_struct *tsk)
 			 * to still have its FP state in the CPU registers.
 			 */
 			BUG_ON(tsk != current);
-			giveup_fpu_maybe_transactional(tsk);
+			check_if_tm_restore_required(tsk);
+			giveup_fpu(tsk);
 		}
 		preempt_enable();
 	}
@@ -147,10 +126,12 @@ void enable_kernel_fp(void)
 {
 	WARN_ON(preemptible());
 
-	if (current->thread.regs && (current->thread.regs->msr & MSR_FP))
-		giveup_fpu_maybe_transactional(current);
-	else
+	if (current->thread.regs && (current->thread.regs->msr & MSR_FP)) {
+		check_if_tm_restore_required(current);
+		giveup_fpu(current);
+	} else {
 		giveup_fpu(NULL);	/* just enables FP for kernel */
+	}
 }
 EXPORT_SYMBOL(enable_kernel_fp);
 
@@ -159,10 +140,12 @@ void enable_kernel_altivec(void)
 {
 	WARN_ON(preemptible());
 
-	if (current->thread.regs && (current->thread.regs->msr & MSR_VEC))
-		giveup_altivec_maybe_transactional(current);
-	else
+	if (current->thread.regs && (current->thread.regs->msr & MSR_VEC)) {
+		check_if_tm_restore_required(current);
+		giveup_altivec(current);
+	} else {
 		giveup_altivec_notask();
+	}
 }
 EXPORT_SYMBOL(enable_kernel_altivec);
 
@@ -176,7 +159,8 @@ void flush_altivec_to_thread(struct task_struct *tsk)
 		preempt_disable();
 		if (tsk->thread.regs->msr & MSR_VEC) {
 			BUG_ON(tsk != current);
-			giveup_altivec_maybe_transactional(tsk);
+			check_if_tm_restore_required(tsk);
+			giveup_altivec(tsk);
 		}
 		preempt_enable();
 	}
@@ -198,8 +182,9 @@ EXPORT_SYMBOL(enable_kernel_vsx);
 
 void giveup_vsx(struct task_struct *tsk)
 {
-	giveup_fpu_maybe_transactional(tsk);
-	giveup_altivec_maybe_transactional(tsk);
+	check_if_tm_restore_required(tsk);
+	giveup_fpu(tsk);
+	giveup_altivec(tsk);
 	__giveup_vsx(tsk);
 }
 EXPORT_SYMBOL(giveup_vsx);

From 611b0e5c19963374175b39f42117b03ee7573228 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Thu, 29 Oct 2015 11:43:59 +1100
Subject: [PATCH 019/149] powerpc: Create mtmsr_isync()

mtmsr_isync() will do an mtmsr (or mtmsrd) followed by an isync on older
processors. On newer processors we avoid the isync via a feature fixup.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/reg.h |  8 ++++++++
 arch/powerpc/kernel/process.c  | 30 ++++++++++++++++++++++--------
 2 files changed, 30 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index a908ada8e0a5..987dac090244 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -1193,12 +1193,20 @@
 #define __mtmsrd(v, l)	asm volatile("mtmsrd %0," __stringify(l) \
 				     : : "r" (v) : "memory")
 #define mtmsr(v)	__mtmsrd((v), 0)
+#define __MTMSR		"mtmsrd"
 #else
 #define mtmsr(v)	asm volatile("mtmsr %0" : \
 				     : "r" ((unsigned long)(v)) \
 				     : "memory")
+#define __MTMSR		"mtmsr"
 #endif
 
+static inline void mtmsr_isync(unsigned long val)
+{
+	asm volatile(__MTMSR " %0; " ASM_FTR_IFCLR("isync", "nop", %1) : :
+			"r" (val), "i" (CPU_FTR_ARCH_206) : "memory");
+}
+
 #define mfspr(rn)	({unsigned long rval; \
 			asm volatile("mfspr %0," __stringify(rn) \
 				: "=r" (rval)); rval;})
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index ef64219548d5..5bf8ec2597d4 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -130,7 +130,10 @@ void enable_kernel_fp(void)
 		check_if_tm_restore_required(current);
 		giveup_fpu(current);
 	} else {
-		giveup_fpu(NULL);	/* just enables FP for kernel */
+		u64 oldmsr = mfmsr();
+
+		if (!(oldmsr & MSR_FP))
+			mtmsr_isync(oldmsr | MSR_FP);
 	}
 }
 EXPORT_SYMBOL(enable_kernel_fp);
@@ -144,7 +147,10 @@ void enable_kernel_altivec(void)
 		check_if_tm_restore_required(current);
 		giveup_altivec(current);
 	} else {
-		giveup_altivec_notask();
+		u64 oldmsr = mfmsr();
+
+		if (!(oldmsr & MSR_VEC))
+			mtmsr_isync(oldmsr | MSR_VEC);
 	}
 }
 EXPORT_SYMBOL(enable_kernel_altivec);
@@ -173,10 +179,14 @@ void enable_kernel_vsx(void)
 {
 	WARN_ON(preemptible());
 
-	if (current->thread.regs && (current->thread.regs->msr & MSR_VSX))
+	if (current->thread.regs && (current->thread.regs->msr & MSR_VSX)) {
 		giveup_vsx(current);
-	else
-		giveup_vsx(NULL);	/* just enable vsx for kernel - force */
+	} else {
+		u64 oldmsr = mfmsr();
+
+		if (!(oldmsr & MSR_VSX))
+			mtmsr_isync(oldmsr | MSR_VSX);
+	}
 }
 EXPORT_SYMBOL(enable_kernel_vsx);
 
@@ -209,10 +219,14 @@ void enable_kernel_spe(void)
 {
 	WARN_ON(preemptible());
 
-	if (current->thread.regs && (current->thread.regs->msr & MSR_SPE))
+	if (current->thread.regs && (current->thread.regs->msr & MSR_SPE)) {
 		giveup_spe(current);
-	else
-		giveup_spe(NULL);	/* just enable SPE for kernel - force */
+	} else {
+		u64 oldmsr = mfmsr();
+
+		if (!(oldmsr & MSR_SPE))
+			mtmsr_isync(oldmsr | MSR_SPE);
+	}
 }
 EXPORT_SYMBOL(enable_kernel_spe);
 

From b51b1153d0e78a70767441273331d2de066bb929 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Thu, 29 Oct 2015 11:44:00 +1100
Subject: [PATCH 020/149] powerpc: Remove NULL task struct pointer checks in FP
 and vector code

We used to allow giveup_*() to be called with a NULL task struct
pointer. Now that those cases are handled in the callers, we can remove
the checks. We can also remove giveup_altivec_notask(), which is now
unused.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/switch_to.h |  1 -
 arch/powerpc/kernel/fpu.S            |  2 --
 arch/powerpc/kernel/head_fsl_booke.S |  2 --
 arch/powerpc/kernel/vector.S         | 14 --------------
 4 files changed, 19 deletions(-)

diff --git a/arch/powerpc/include/asm/switch_to.h b/arch/powerpc/include/asm/switch_to.h
index bd1d93318350..042aaf05a787 100644
--- a/arch/powerpc/include/asm/switch_to.h
+++ b/arch/powerpc/include/asm/switch_to.h
@@ -38,7 +38,6 @@ static inline void giveup_fpu(struct task_struct *t) { }
 #ifdef CONFIG_ALTIVEC
 extern void flush_altivec_to_thread(struct task_struct *);
 extern void giveup_altivec(struct task_struct *);
-extern void giveup_altivec_notask(void);
 #else
 static inline void flush_altivec_to_thread(struct task_struct *t)
 {
diff --git a/arch/powerpc/kernel/fpu.S b/arch/powerpc/kernel/fpu.S
index 50d2352f2cf4..71bdce284ad9 100644
--- a/arch/powerpc/kernel/fpu.S
+++ b/arch/powerpc/kernel/fpu.S
@@ -173,8 +173,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
 	MTMSRD(r5)			/* enable use of fpu now */
 	SYNC_601
 	isync
-	PPC_LCMPI	0,r3,0
-	beqlr-				/* if no previous owner, done */
 	addi	r3,r3,THREAD	        /* want THREAD of task */
 	PPC_LL	r6,THREAD_FPSAVEAREA(r3)
 	PPC_LL	r5,PT_REGS(r3)
diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S
index ec936abbcadc..d6980bbae954 100644
--- a/arch/powerpc/kernel/head_fsl_booke.S
+++ b/arch/powerpc/kernel/head_fsl_booke.S
@@ -992,8 +992,6 @@ _GLOBAL(giveup_spe)
 	oris	r5,r5,MSR_SPE@h
 	mtmsr	r5			/* enable use of SPE now */
 	isync
-	cmpi	0,r3,0
-	beqlr-				/* if no previous owner, done */
 	addi	r3,r3,THREAD		/* want THREAD of task */
 	lwz	r5,PT_REGS(r3)
 	cmpi	0,r5,0
diff --git a/arch/powerpc/kernel/vector.S b/arch/powerpc/kernel/vector.S
index 1757c0c936c1..b31528c30253 100644
--- a/arch/powerpc/kernel/vector.S
+++ b/arch/powerpc/kernel/vector.S
@@ -111,16 +111,6 @@ _GLOBAL(load_up_altivec)
 	/* restore registers and return */
 	blr
 
-_GLOBAL(giveup_altivec_notask)
-	mfmsr	r3
-	andis.	r4,r3,MSR_VEC@h
-	bnelr				/* Already enabled? */
-	oris	r3,r3,MSR_VEC@h
-	SYNC
-	MTMSRD(r3)			/* enable use of VMX now */
-	isync
-	blr
-
 /*
  * giveup_altivec(tsk)
  * Disable VMX for the task given as the argument,
@@ -133,8 +123,6 @@ _GLOBAL(giveup_altivec)
 	SYNC
 	MTMSRD(r5)			/* enable use of VMX now */
 	isync
-	PPC_LCMPI	0,r3,0
-	beqlr				/* if no previous owner, done */
 	addi	r3,r3,THREAD		/* want THREAD of task */
 	PPC_LL	r7,THREAD_VRSAVEAREA(r3)
 	PPC_LL	r5,PT_REGS(r3)
@@ -203,8 +191,6 @@ _GLOBAL(__giveup_vsx)
 	mtmsrd	r5			/* enable use of VSX now */
 	isync
 
-	cmpdi	0,r3,0
-	beqlr-				/* if no previous owner, done */
 	addi	r3,r3,THREAD		/* want THREAD of task */
 	ld	r5,PT_REGS(r3)
 	cmpdi	0,r5,0

From 98da581e0846f6d932a4bc46a55458140e20478a Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Thu, 29 Oct 2015 11:44:01 +1100
Subject: [PATCH 021/149] powerpc: Move part of giveup_fpu,altivec,spe into c

Move the MSR modification into new C functions. Removing it from
the low level functions will allow us to avoid costly MSR writes
by batching them up.

Move the check_if_tm_restore_required() check into these new functions.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/switch_to.h | 21 +++++------
 arch/powerpc/kernel/fpu.S            | 16 ++-------
 arch/powerpc/kernel/head_fsl_booke.S |  8 ++---
 arch/powerpc/kernel/ppc_ksyms.c      |  6 ----
 arch/powerpc/kernel/process.c        | 52 +++++++++++++++++++++++++---
 arch/powerpc/kernel/vector.S         | 10 ++----
 6 files changed, 65 insertions(+), 48 deletions(-)

diff --git a/arch/powerpc/include/asm/switch_to.h b/arch/powerpc/include/asm/switch_to.h
index 042aaf05a787..c2678b93bcba 100644
--- a/arch/powerpc/include/asm/switch_to.h
+++ b/arch/powerpc/include/asm/switch_to.h
@@ -23,28 +23,27 @@ extern int emulate_altivec(struct pt_regs *);
 extern void __giveup_vsx(struct task_struct *);
 extern void giveup_vsx(struct task_struct *);
 extern void enable_kernel_spe(void);
-extern void giveup_spe(struct task_struct *);
 extern void load_up_spe(struct task_struct *);
 extern void switch_booke_debug_regs(struct debug_reg *new_debug);
 
 #ifdef CONFIG_PPC_FPU
 extern void flush_fp_to_thread(struct task_struct *);
 extern void giveup_fpu(struct task_struct *);
+extern void __giveup_fpu(struct task_struct *);
 #else
 static inline void flush_fp_to_thread(struct task_struct *t) { }
 static inline void giveup_fpu(struct task_struct *t) { }
+static inline void __giveup_fpu(struct task_struct *t) { }
 #endif
 
 #ifdef CONFIG_ALTIVEC
 extern void flush_altivec_to_thread(struct task_struct *);
 extern void giveup_altivec(struct task_struct *);
+extern void __giveup_altivec(struct task_struct *);
 #else
-static inline void flush_altivec_to_thread(struct task_struct *t)
-{
-}
-static inline void giveup_altivec(struct task_struct *t)
-{
-}
+static inline void flush_altivec_to_thread(struct task_struct *t) { }
+static inline void giveup_altivec(struct task_struct *t) { }
+static inline void __giveup_altivec(struct task_struct *t) { }
 #endif
 
 #ifdef CONFIG_VSX
@@ -57,10 +56,12 @@ static inline void flush_vsx_to_thread(struct task_struct *t)
 
 #ifdef CONFIG_SPE
 extern void flush_spe_to_thread(struct task_struct *);
+extern void giveup_spe(struct task_struct *);
+extern void __giveup_spe(struct task_struct *);
 #else
-static inline void flush_spe_to_thread(struct task_struct *t)
-{
-}
+static inline void flush_spe_to_thread(struct task_struct *t) { }
+static inline void giveup_spe(struct task_struct *t) { }
+static inline void __giveup_spe(struct task_struct *t) { }
 #endif
 
 static inline void clear_task_ebb(struct task_struct *t)
diff --git a/arch/powerpc/kernel/fpu.S b/arch/powerpc/kernel/fpu.S
index 71bdce284ad9..431ab571ed1b 100644
--- a/arch/powerpc/kernel/fpu.S
+++ b/arch/powerpc/kernel/fpu.S
@@ -155,24 +155,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
 	blr
 
 /*
- * giveup_fpu(tsk)
+ * __giveup_fpu(tsk)
  * Disable FP for the task given as the argument,
  * and save the floating-point registers in its thread_struct.
  * Enables the FPU for use in the kernel on return.
  */
-_GLOBAL(giveup_fpu)
-	mfmsr	r5
-	ori	r5,r5,MSR_FP
-#ifdef CONFIG_VSX
-BEGIN_FTR_SECTION
-	oris	r5,r5,MSR_VSX@h
-END_FTR_SECTION_IFSET(CPU_FTR_VSX)
-#endif
-	SYNC_601
-	ISYNC_601
-	MTMSRD(r5)			/* enable use of fpu now */
-	SYNC_601
-	isync
+_GLOBAL(__giveup_fpu)
 	addi	r3,r3,THREAD	        /* want THREAD of task */
 	PPC_LL	r6,THREAD_FPSAVEAREA(r3)
 	PPC_LL	r5,PT_REGS(r3)
diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S
index d6980bbae954..f705171b924b 100644
--- a/arch/powerpc/kernel/head_fsl_booke.S
+++ b/arch/powerpc/kernel/head_fsl_booke.S
@@ -984,14 +984,10 @@ _GLOBAL(__setup_ehv_ivors)
 
 #ifdef CONFIG_SPE
 /*
- * extern void giveup_spe(struct task_struct *prev)
+ * extern void __giveup_spe(struct task_struct *prev)
  *
  */
-_GLOBAL(giveup_spe)
-	mfmsr	r5
-	oris	r5,r5,MSR_SPE@h
-	mtmsr	r5			/* enable use of SPE now */
-	isync
+_GLOBAL(__giveup_spe)
 	addi	r3,r3,THREAD		/* want THREAD of task */
 	lwz	r5,PT_REGS(r3)
 	cmpi	0,r5,0
diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c
index 202963ee013a..41e1607e800c 100644
--- a/arch/powerpc/kernel/ppc_ksyms.c
+++ b/arch/powerpc/kernel/ppc_ksyms.c
@@ -19,13 +19,11 @@ EXPORT_SYMBOL(_mcount);
 #endif
 
 #ifdef CONFIG_PPC_FPU
-EXPORT_SYMBOL(giveup_fpu);
 EXPORT_SYMBOL(load_fp_state);
 EXPORT_SYMBOL(store_fp_state);
 #endif
 
 #ifdef CONFIG_ALTIVEC
-EXPORT_SYMBOL(giveup_altivec);
 EXPORT_SYMBOL(load_vr_state);
 EXPORT_SYMBOL(store_vr_state);
 #endif
@@ -34,10 +32,6 @@ EXPORT_SYMBOL(store_vr_state);
 EXPORT_SYMBOL_GPL(__giveup_vsx);
 #endif
 
-#ifdef CONFIG_SPE
-EXPORT_SYMBOL(giveup_spe);
-#endif
-
 #ifdef CONFIG_EPAPR_PARAVIRT
 EXPORT_SYMBOL(epapr_hypercall_start);
 #endif
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 5bf8ec2597d4..6bcf82bed610 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -88,6 +88,25 @@ static inline void check_if_tm_restore_required(struct task_struct *tsk) { }
 #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
 
 #ifdef CONFIG_PPC_FPU
+void giveup_fpu(struct task_struct *tsk)
+{
+	u64 oldmsr = mfmsr();
+	u64 newmsr;
+
+	check_if_tm_restore_required(tsk);
+
+	newmsr = oldmsr | MSR_FP;
+#ifdef CONFIG_VSX
+	if (cpu_has_feature(CPU_FTR_VSX))
+		newmsr |= MSR_VSX;
+#endif
+	if (oldmsr != newmsr)
+		mtmsr_isync(newmsr);
+
+	__giveup_fpu(tsk);
+}
+EXPORT_SYMBOL(giveup_fpu);
+
 /*
  * Make sure the floating-point register state in the
  * the thread_struct is up to date for task tsk.
@@ -113,7 +132,6 @@ void flush_fp_to_thread(struct task_struct *tsk)
 			 * to still have its FP state in the CPU registers.
 			 */
 			BUG_ON(tsk != current);
-			check_if_tm_restore_required(tsk);
 			giveup_fpu(tsk);
 		}
 		preempt_enable();
@@ -127,7 +145,6 @@ void enable_kernel_fp(void)
 	WARN_ON(preemptible());
 
 	if (current->thread.regs && (current->thread.regs->msr & MSR_FP)) {
-		check_if_tm_restore_required(current);
 		giveup_fpu(current);
 	} else {
 		u64 oldmsr = mfmsr();
@@ -139,12 +156,26 @@ void enable_kernel_fp(void)
 EXPORT_SYMBOL(enable_kernel_fp);
 
 #ifdef CONFIG_ALTIVEC
+void giveup_altivec(struct task_struct *tsk)
+{
+	u64 oldmsr = mfmsr();
+	u64 newmsr;
+
+	check_if_tm_restore_required(tsk);
+
+	newmsr = oldmsr | MSR_VEC;
+	if (oldmsr != newmsr)
+		mtmsr_isync(newmsr);
+
+	__giveup_altivec(tsk);
+}
+EXPORT_SYMBOL(giveup_altivec);
+
 void enable_kernel_altivec(void)
 {
 	WARN_ON(preemptible());
 
 	if (current->thread.regs && (current->thread.regs->msr & MSR_VEC)) {
-		check_if_tm_restore_required(current);
 		giveup_altivec(current);
 	} else {
 		u64 oldmsr = mfmsr();
@@ -165,7 +196,6 @@ void flush_altivec_to_thread(struct task_struct *tsk)
 		preempt_disable();
 		if (tsk->thread.regs->msr & MSR_VEC) {
 			BUG_ON(tsk != current);
-			check_if_tm_restore_required(tsk);
 			giveup_altivec(tsk);
 		}
 		preempt_enable();
@@ -214,6 +244,20 @@ EXPORT_SYMBOL_GPL(flush_vsx_to_thread);
 #endif /* CONFIG_VSX */
 
 #ifdef CONFIG_SPE
+void giveup_spe(struct task_struct *tsk)
+{
+	u64 oldmsr = mfmsr();
+	u64 newmsr;
+
+	check_if_tm_restore_required(tsk);
+
+	newmsr = oldmsr | MSR_SPE;
+	if (oldmsr != newmsr)
+		mtmsr_isync(newmsr);
+
+	__giveup_spe(tsk);
+}
+EXPORT_SYMBOL(giveup_spe);
 
 void enable_kernel_spe(void)
 {
diff --git a/arch/powerpc/kernel/vector.S b/arch/powerpc/kernel/vector.S
index b31528c30253..6e925b40a484 100644
--- a/arch/powerpc/kernel/vector.S
+++ b/arch/powerpc/kernel/vector.S
@@ -112,17 +112,11 @@ _GLOBAL(load_up_altivec)
 	blr
 
 /*
- * giveup_altivec(tsk)
+ * __giveup_altivec(tsk)
  * Disable VMX for the task given as the argument,
  * and save the vector registers in its thread_struct.
- * Enables the VMX for use in the kernel on return.
  */
-_GLOBAL(giveup_altivec)
-	mfmsr	r5
-	oris	r5,r5,MSR_VEC@h
-	SYNC
-	MTMSRD(r5)			/* enable use of VMX now */
-	isync
+_GLOBAL(__giveup_altivec)
 	addi	r3,r3,THREAD		/* want THREAD of task */
 	PPC_LL	r7,THREAD_VRSAVEAREA(r3)
 	PPC_LL	r5,PT_REGS(r3)

From a7d623d4d053ccb0cdfad210bced2ec25ddf69a2 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Thu, 29 Oct 2015 11:44:02 +1100
Subject: [PATCH 022/149] powerpc: Move part of giveup_vsx into c

Move the MSR modification into C. Removing it from the assembly
function will allow us to avoid costly MSR writes by batching them
up.

Check the FP and VMX bits before calling the relevant giveup_*()
function. This makes giveup_vsx() and flush_vsx_to_thread() perform
more like their sister functions, and allows us to use
flush_vsx_to_thread() in the signal code.

Move the check_if_tm_restore_required() check in.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/process.c   | 28 +++++++++++++++++++---------
 arch/powerpc/kernel/signal_32.c |  4 ++--
 arch/powerpc/kernel/signal_64.c |  4 ++--
 arch/powerpc/kernel/vector.S    |  6 ------
 4 files changed, 23 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 6bcf82bed610..0cb627662ded 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -205,6 +205,25 @@ EXPORT_SYMBOL_GPL(flush_altivec_to_thread);
 #endif /* CONFIG_ALTIVEC */
 
 #ifdef CONFIG_VSX
+void giveup_vsx(struct task_struct *tsk)
+{
+	u64 oldmsr = mfmsr();
+	u64 newmsr;
+
+	check_if_tm_restore_required(tsk);
+
+	newmsr = oldmsr | (MSR_FP|MSR_VEC|MSR_VSX);
+	if (oldmsr != newmsr)
+		mtmsr_isync(newmsr);
+
+	if (tsk->thread.regs->msr & MSR_FP)
+		__giveup_fpu(tsk);
+	if (tsk->thread.regs->msr & MSR_VEC)
+		__giveup_altivec(tsk);
+	__giveup_vsx(tsk);
+}
+EXPORT_SYMBOL(giveup_vsx);
+
 void enable_kernel_vsx(void)
 {
 	WARN_ON(preemptible());
@@ -220,15 +239,6 @@ void enable_kernel_vsx(void)
 }
 EXPORT_SYMBOL(enable_kernel_vsx);
 
-void giveup_vsx(struct task_struct *tsk)
-{
-	check_if_tm_restore_required(tsk);
-	giveup_fpu(tsk);
-	giveup_altivec(tsk);
-	__giveup_vsx(tsk);
-}
-EXPORT_SYMBOL(giveup_vsx);
-
 void flush_vsx_to_thread(struct task_struct *tsk)
 {
 	if (tsk->thread.regs) {
diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 3cd7a32c8ff4..4022cbb7e2d6 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -458,7 +458,7 @@ static int save_user_regs(struct pt_regs *regs, struct mcontext __user *frame,
 	 * contains valid data
 	 */
 	if (current->thread.used_vsr && ctx_has_vsx_region) {
-		__giveup_vsx(current);
+		flush_vsx_to_thread(current);
 		if (copy_vsx_to_user(&frame->mc_vsregs, current))
 			return 1;
 		msr |= MSR_VSX;
@@ -606,7 +606,7 @@ static int save_tm_user_regs(struct pt_regs *regs,
 	 * contains valid data
 	 */
 	if (current->thread.used_vsr) {
-		__giveup_vsx(current);
+		flush_vsx_to_thread(current);
 		if (copy_vsx_to_user(&frame->mc_vsregs, current))
 			return 1;
 		if (msr & MSR_VSX) {
diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index 6f2b555516e6..3b2339912911 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -147,7 +147,7 @@ static long setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs,
 	 * VMX data.
 	 */
 	if (current->thread.used_vsr && ctx_has_vsx_region) {
-		__giveup_vsx(current);
+		flush_vsx_to_thread(current);
 		v_regs += ELF_NVRREG;
 		err |= copy_vsx_to_user(v_regs, current);
 		/* set MSR_VSX in the MSR value in the frame to
@@ -270,7 +270,7 @@ static long setup_tm_sigcontexts(struct sigcontext __user *sc,
 	 * VMX data.
 	 */
 	if (current->thread.used_vsr) {
-		__giveup_vsx(current);
+		flush_vsx_to_thread(current);
 		v_regs += ELF_NVRREG;
 		tm_v_regs += ELF_NVRREG;
 
diff --git a/arch/powerpc/kernel/vector.S b/arch/powerpc/kernel/vector.S
index 6e925b40a484..98675b08efe2 100644
--- a/arch/powerpc/kernel/vector.S
+++ b/arch/powerpc/kernel/vector.S
@@ -177,14 +177,8 @@ _GLOBAL(load_up_vsx)
  * __giveup_vsx(tsk)
  * Disable VSX for the task given as the argument.
  * Does NOT save vsx registers.
- * Enables the VSX for use in the kernel on return.
  */
 _GLOBAL(__giveup_vsx)
-	mfmsr	r5
-	oris	r5,r5,MSR_VSX@h
-	mtmsrd	r5			/* enable use of VSX now */
-	isync
-
 	addi	r3,r3,THREAD		/* want THREAD of task */
 	ld	r5,PT_REGS(r3)
 	cmpdi	0,r5,0

From 1552cd703cf5a07caeb17ccd82f80e20a23b1707 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Thu, 29 Oct 2015 11:44:03 +1100
Subject: [PATCH 023/149] crypto: vmx: Only call enable_kernel_vsx()

With the recent change to enable_kernel_vsx(), we no longer need
to call enable_kernel_fp() and enable_kernel_altivec().

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 drivers/crypto/vmx/aes.c     | 3 ---
 drivers/crypto/vmx/aes_cbc.c | 3 ---
 drivers/crypto/vmx/aes_ctr.c | 3 ---
 drivers/crypto/vmx/ghash.c   | 8 --------
 4 files changed, 17 deletions(-)

diff --git a/drivers/crypto/vmx/aes.c b/drivers/crypto/vmx/aes.c
index 263af709e536..20539fb7e975 100644
--- a/drivers/crypto/vmx/aes.c
+++ b/drivers/crypto/vmx/aes.c
@@ -83,7 +83,6 @@ static int p8_aes_setkey(struct crypto_tfm *tfm, const u8 *key,
 
 	preempt_disable();
 	pagefault_disable();
-	enable_kernel_altivec();
 	enable_kernel_vsx();
 	ret = aes_p8_set_encrypt_key(key, keylen * 8, &ctx->enc_key);
 	ret += aes_p8_set_decrypt_key(key, keylen * 8, &ctx->dec_key);
@@ -103,7 +102,6 @@ static void p8_aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
 	} else {
 		preempt_disable();
 		pagefault_disable();
-		enable_kernel_altivec();
 		enable_kernel_vsx();
 		aes_p8_encrypt(src, dst, &ctx->enc_key);
 		pagefault_enable();
@@ -120,7 +118,6 @@ static void p8_aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
 	} else {
 		preempt_disable();
 		pagefault_disable();
-		enable_kernel_altivec();
 		enable_kernel_vsx();
 		aes_p8_decrypt(src, dst, &ctx->dec_key);
 		pagefault_enable();
diff --git a/drivers/crypto/vmx/aes_cbc.c b/drivers/crypto/vmx/aes_cbc.c
index 0b8fe2ec5315..8847b92e9ff0 100644
--- a/drivers/crypto/vmx/aes_cbc.c
+++ b/drivers/crypto/vmx/aes_cbc.c
@@ -84,7 +84,6 @@ static int p8_aes_cbc_setkey(struct crypto_tfm *tfm, const u8 *key,
 
 	preempt_disable();
 	pagefault_disable();
-	enable_kernel_altivec();
 	enable_kernel_vsx();
 	ret = aes_p8_set_encrypt_key(key, keylen * 8, &ctx->enc_key);
 	ret += aes_p8_set_decrypt_key(key, keylen * 8, &ctx->dec_key);
@@ -115,7 +114,6 @@ static int p8_aes_cbc_encrypt(struct blkcipher_desc *desc,
 	} else {
 		preempt_disable();
 		pagefault_disable();
-		enable_kernel_altivec();
 		enable_kernel_vsx();
 
 		blkcipher_walk_init(&walk, dst, src, nbytes);
@@ -156,7 +154,6 @@ static int p8_aes_cbc_decrypt(struct blkcipher_desc *desc,
 	} else {
 		preempt_disable();
 		pagefault_disable();
-		enable_kernel_altivec();
 		enable_kernel_vsx();
 
 		blkcipher_walk_init(&walk, dst, src, nbytes);
diff --git a/drivers/crypto/vmx/aes_ctr.c b/drivers/crypto/vmx/aes_ctr.c
index ee1306cd8f59..80958660c31a 100644
--- a/drivers/crypto/vmx/aes_ctr.c
+++ b/drivers/crypto/vmx/aes_ctr.c
@@ -81,7 +81,6 @@ static int p8_aes_ctr_setkey(struct crypto_tfm *tfm, const u8 *key,
 	struct p8_aes_ctr_ctx *ctx = crypto_tfm_ctx(tfm);
 
 	pagefault_disable();
-	enable_kernel_altivec();
 	enable_kernel_vsx();
 	ret = aes_p8_set_encrypt_key(key, keylen * 8, &ctx->enc_key);
 	pagefault_enable();
@@ -100,7 +99,6 @@ static void p8_aes_ctr_final(struct p8_aes_ctr_ctx *ctx,
 	unsigned int nbytes = walk->nbytes;
 
 	pagefault_disable();
-	enable_kernel_altivec();
 	enable_kernel_vsx();
 	aes_p8_encrypt(ctrblk, keystream, &ctx->enc_key);
 	pagefault_enable();
@@ -133,7 +131,6 @@ static int p8_aes_ctr_crypt(struct blkcipher_desc *desc,
 		ret = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE);
 		while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) {
 			pagefault_disable();
-			enable_kernel_altivec();
 			enable_kernel_vsx();
 			aes_p8_ctr32_encrypt_blocks(walk.src.virt.addr,
 						    walk.dst.virt.addr,
diff --git a/drivers/crypto/vmx/ghash.c b/drivers/crypto/vmx/ghash.c
index 2183a2e77641..1f4586c2fd25 100644
--- a/drivers/crypto/vmx/ghash.c
+++ b/drivers/crypto/vmx/ghash.c
@@ -118,9 +118,7 @@ static int p8_ghash_setkey(struct crypto_shash *tfm, const u8 *key,
 
 	preempt_disable();
 	pagefault_disable();
-	enable_kernel_altivec();
 	enable_kernel_vsx();
-	enable_kernel_fp();
 	gcm_init_p8(ctx->htable, (const u64 *) key);
 	pagefault_enable();
 	preempt_enable();
@@ -149,9 +147,7 @@ static int p8_ghash_update(struct shash_desc *desc,
 			       GHASH_DIGEST_SIZE - dctx->bytes);
 			preempt_disable();
 			pagefault_disable();
-			enable_kernel_altivec();
 			enable_kernel_vsx();
-			enable_kernel_fp();
 			gcm_ghash_p8(dctx->shash, ctx->htable,
 				     dctx->buffer, GHASH_DIGEST_SIZE);
 			pagefault_enable();
@@ -164,9 +160,7 @@ static int p8_ghash_update(struct shash_desc *desc,
 		if (len) {
 			preempt_disable();
 			pagefault_disable();
-			enable_kernel_altivec();
 			enable_kernel_vsx();
-			enable_kernel_fp();
 			gcm_ghash_p8(dctx->shash, ctx->htable, src, len);
 			pagefault_enable();
 			preempt_enable();
@@ -195,9 +189,7 @@ static int p8_ghash_final(struct shash_desc *desc, u8 *out)
 				dctx->buffer[i] = 0;
 			preempt_disable();
 			pagefault_disable();
-			enable_kernel_altivec();
 			enable_kernel_vsx();
-			enable_kernel_fp();
 			gcm_ghash_p8(dctx->shash, ctx->htable,
 				     dctx->buffer, GHASH_DIGEST_SIZE);
 			pagefault_enable();

From a0e72cf12b1a1f159b6822ed2e1e41893d996fc7 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Thu, 29 Oct 2015 11:44:04 +1100
Subject: [PATCH 024/149] powerpc: Create msr_check_and_{set,clear}()

Create helper functions to set and clear MSR bits after first
checking if they are already set. Grouping them will make it
easy to avoid the MSR writes in a subsequent optimisation.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/process.c | 115 +++++++++++++++++-----------------
 1 file changed, 56 insertions(+), 59 deletions(-)

diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 0cb627662ded..5cdd35c0b026 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -87,23 +87,46 @@ static void check_if_tm_restore_required(struct task_struct *tsk)
 static inline void check_if_tm_restore_required(struct task_struct *tsk) { }
 #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
 
+static void msr_check_and_set(unsigned long bits)
+{
+	unsigned long oldmsr = mfmsr();
+	unsigned long newmsr;
+
+	newmsr = oldmsr | bits;
+
+#ifdef CONFIG_VSX
+	if (cpu_has_feature(CPU_FTR_VSX) && (bits & MSR_FP))
+		newmsr |= MSR_VSX;
+#endif
+
+	if (oldmsr != newmsr)
+		mtmsr_isync(newmsr);
+}
+
+static void msr_check_and_clear(unsigned long bits)
+{
+	unsigned long oldmsr = mfmsr();
+	unsigned long newmsr;
+
+	newmsr = oldmsr & ~bits;
+
+#ifdef CONFIG_VSX
+	if (cpu_has_feature(CPU_FTR_VSX) && (bits & MSR_FP))
+		newmsr &= ~MSR_VSX;
+#endif
+
+	if (oldmsr != newmsr)
+		mtmsr_isync(newmsr);
+}
+
 #ifdef CONFIG_PPC_FPU
 void giveup_fpu(struct task_struct *tsk)
 {
-	u64 oldmsr = mfmsr();
-	u64 newmsr;
-
 	check_if_tm_restore_required(tsk);
 
-	newmsr = oldmsr | MSR_FP;
-#ifdef CONFIG_VSX
-	if (cpu_has_feature(CPU_FTR_VSX))
-		newmsr |= MSR_VSX;
-#endif
-	if (oldmsr != newmsr)
-		mtmsr_isync(newmsr);
-
+	msr_check_and_set(MSR_FP);
 	__giveup_fpu(tsk);
+	msr_check_and_clear(MSR_FP);
 }
 EXPORT_SYMBOL(giveup_fpu);
 
@@ -144,30 +167,21 @@ void enable_kernel_fp(void)
 {
 	WARN_ON(preemptible());
 
-	if (current->thread.regs && (current->thread.regs->msr & MSR_FP)) {
-		giveup_fpu(current);
-	} else {
-		u64 oldmsr = mfmsr();
+	msr_check_and_set(MSR_FP);
 
-		if (!(oldmsr & MSR_FP))
-			mtmsr_isync(oldmsr | MSR_FP);
-	}
+	if (current->thread.regs && (current->thread.regs->msr & MSR_FP))
+		__giveup_fpu(current);
 }
 EXPORT_SYMBOL(enable_kernel_fp);
 
 #ifdef CONFIG_ALTIVEC
 void giveup_altivec(struct task_struct *tsk)
 {
-	u64 oldmsr = mfmsr();
-	u64 newmsr;
-
 	check_if_tm_restore_required(tsk);
 
-	newmsr = oldmsr | MSR_VEC;
-	if (oldmsr != newmsr)
-		mtmsr_isync(newmsr);
-
+	msr_check_and_set(MSR_VEC);
 	__giveup_altivec(tsk);
+	msr_check_and_clear(MSR_VEC);
 }
 EXPORT_SYMBOL(giveup_altivec);
 
@@ -175,14 +189,10 @@ void enable_kernel_altivec(void)
 {
 	WARN_ON(preemptible());
 
-	if (current->thread.regs && (current->thread.regs->msr & MSR_VEC)) {
-		giveup_altivec(current);
-	} else {
-		u64 oldmsr = mfmsr();
+	msr_check_and_set(MSR_VEC);
 
-		if (!(oldmsr & MSR_VEC))
-			mtmsr_isync(oldmsr | MSR_VEC);
-	}
+	if (current->thread.regs && (current->thread.regs->msr & MSR_VEC))
+		__giveup_altivec(current);
 }
 EXPORT_SYMBOL(enable_kernel_altivec);
 
@@ -207,20 +217,15 @@ EXPORT_SYMBOL_GPL(flush_altivec_to_thread);
 #ifdef CONFIG_VSX
 void giveup_vsx(struct task_struct *tsk)
 {
-	u64 oldmsr = mfmsr();
-	u64 newmsr;
-
 	check_if_tm_restore_required(tsk);
 
-	newmsr = oldmsr | (MSR_FP|MSR_VEC|MSR_VSX);
-	if (oldmsr != newmsr)
-		mtmsr_isync(newmsr);
-
+	msr_check_and_set(MSR_FP|MSR_VEC|MSR_VSX);
 	if (tsk->thread.regs->msr & MSR_FP)
 		__giveup_fpu(tsk);
 	if (tsk->thread.regs->msr & MSR_VEC)
 		__giveup_altivec(tsk);
 	__giveup_vsx(tsk);
+	msr_check_and_clear(MSR_FP|MSR_VEC|MSR_VSX);
 }
 EXPORT_SYMBOL(giveup_vsx);
 
@@ -228,13 +233,14 @@ void enable_kernel_vsx(void)
 {
 	WARN_ON(preemptible());
 
-	if (current->thread.regs && (current->thread.regs->msr & MSR_VSX)) {
-		giveup_vsx(current);
-	} else {
-		u64 oldmsr = mfmsr();
+	msr_check_and_set(MSR_FP|MSR_VEC|MSR_VSX);
 
-		if (!(oldmsr & MSR_VSX))
-			mtmsr_isync(oldmsr | MSR_VSX);
+	if (current->thread.regs && (current->thread.regs->msr & MSR_VSX)) {
+		if (current->thread.regs->msr & MSR_FP)
+			__giveup_fpu(current);
+		if (current->thread.regs->msr & MSR_VEC)
+			__giveup_altivec(current);
+		__giveup_vsx(current);
 	}
 }
 EXPORT_SYMBOL(enable_kernel_vsx);
@@ -256,16 +262,11 @@ EXPORT_SYMBOL_GPL(flush_vsx_to_thread);
 #ifdef CONFIG_SPE
 void giveup_spe(struct task_struct *tsk)
 {
-	u64 oldmsr = mfmsr();
-	u64 newmsr;
-
 	check_if_tm_restore_required(tsk);
 
-	newmsr = oldmsr | MSR_SPE;
-	if (oldmsr != newmsr)
-		mtmsr_isync(newmsr);
-
+	msr_check_and_set(MSR_SPE);
 	__giveup_spe(tsk);
+	msr_check_and_clear(MSR_SPE);
 }
 EXPORT_SYMBOL(giveup_spe);
 
@@ -273,14 +274,10 @@ void enable_kernel_spe(void)
 {
 	WARN_ON(preemptible());
 
-	if (current->thread.regs && (current->thread.regs->msr & MSR_SPE)) {
-		giveup_spe(current);
-	} else {
-		u64 oldmsr = mfmsr();
+	msr_check_and_set(MSR_SPE);
 
-		if (!(oldmsr & MSR_SPE))
-			mtmsr_isync(oldmsr | MSR_SPE);
-	}
+	if (current->thread.regs && (current->thread.regs->msr & MSR_SPE))
+		__giveup_spe(current);
 }
 EXPORT_SYMBOL(enable_kernel_spe);
 

From dc4fbba11e4661a6a77a1f89ba32f9082e6395ff Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Thu, 29 Oct 2015 11:44:05 +1100
Subject: [PATCH 025/149] powerpc: Create disable_kernel_{fp,altivec,vsx,spe}()

The enable_kernel_*() functions leave the relevant MSR bits enabled
until we exit the kernel sometime later. Create disable versions
that wrap the kernel use of FP, Altivec, VSX or SPE.

While we don't normally want to clear these MSR bits again, for
performance reasons (MSR writes are slow), the disable functions will
be used by a debug boot option that does clear them and so catches bad
uses in other areas of the kernel.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/crypto/aes-spe-glue.c       | 1 +
 arch/powerpc/crypto/sha1-spe-glue.c      | 1 +
 arch/powerpc/crypto/sha256-spe-glue.c    | 1 +
 arch/powerpc/include/asm/switch_to.h     | 5 +++++
 arch/powerpc/kernel/align.c              | 2 ++
 arch/powerpc/kvm/book3s_paired_singles.c | 1 +
 arch/powerpc/kvm/book3s_pr.c             | 4 ++++
 arch/powerpc/kvm/booke.c                 | 4 ++++
 arch/powerpc/lib/vmx-helper.c            | 2 ++
 arch/powerpc/lib/xor_vmx.c               | 4 ++++
 drivers/crypto/vmx/aes.c                 | 3 +++
 drivers/crypto/vmx/aes_cbc.c             | 3 +++
 drivers/crypto/vmx/aes_ctr.c             | 3 +++
 drivers/crypto/vmx/ghash.c               | 4 ++++
 lib/raid6/altivec.uc                     | 1 +
 15 files changed, 39 insertions(+)

diff --git a/arch/powerpc/crypto/aes-spe-glue.c b/arch/powerpc/crypto/aes-spe-glue.c
index bd5e63f72ad4..93ee046d12cd 100644
--- a/arch/powerpc/crypto/aes-spe-glue.c
+++ b/arch/powerpc/crypto/aes-spe-glue.c
@@ -85,6 +85,7 @@ static void spe_begin(void)
 
 static void spe_end(void)
 {
+	disable_kernel_spe();
 	/* reenable preemption */
 	preempt_enable();
 }
diff --git a/arch/powerpc/crypto/sha1-spe-glue.c b/arch/powerpc/crypto/sha1-spe-glue.c
index 3e1d22212521..f9ebc38d3fe7 100644
--- a/arch/powerpc/crypto/sha1-spe-glue.c
+++ b/arch/powerpc/crypto/sha1-spe-glue.c
@@ -46,6 +46,7 @@ static void spe_begin(void)
 
 static void spe_end(void)
 {
+	disable_kernel_spe();
 	/* reenable preemption */
 	preempt_enable();
 }
diff --git a/arch/powerpc/crypto/sha256-spe-glue.c b/arch/powerpc/crypto/sha256-spe-glue.c
index f4a616fe1a82..718a079dcdbf 100644
--- a/arch/powerpc/crypto/sha256-spe-glue.c
+++ b/arch/powerpc/crypto/sha256-spe-glue.c
@@ -47,6 +47,7 @@ static void spe_begin(void)
 
 static void spe_end(void)
 {
+	disable_kernel_spe();
 	/* reenable preemption */
 	preempt_enable();
 }
diff --git a/arch/powerpc/include/asm/switch_to.h b/arch/powerpc/include/asm/switch_to.h
index c2678b93bcba..438502f59550 100644
--- a/arch/powerpc/include/asm/switch_to.h
+++ b/arch/powerpc/include/asm/switch_to.h
@@ -26,6 +26,11 @@ extern void enable_kernel_spe(void);
 extern void load_up_spe(struct task_struct *);
 extern void switch_booke_debug_regs(struct debug_reg *new_debug);
 
+static inline void disable_kernel_fp(void) { }
+static inline void disable_kernel_altivec(void) { }
+static inline void disable_kernel_spe(void) { }
+static inline void disable_kernel_vsx(void) { }
+
 #ifdef CONFIG_PPC_FPU
 extern void flush_fp_to_thread(struct task_struct *);
 extern void giveup_fpu(struct task_struct *);
diff --git a/arch/powerpc/kernel/align.c b/arch/powerpc/kernel/align.c
index 86150fbb42c3..8e7cb8e2b21a 100644
--- a/arch/powerpc/kernel/align.c
+++ b/arch/powerpc/kernel/align.c
@@ -960,6 +960,7 @@ int fix_alignment(struct pt_regs *regs)
 			preempt_disable();
 			enable_kernel_fp();
 			cvt_df(&data.dd, (float *)&data.x32.low32);
+			disable_kernel_fp();
 			preempt_enable();
 #else
 			return 0;
@@ -1000,6 +1001,7 @@ int fix_alignment(struct pt_regs *regs)
 		preempt_disable();
 		enable_kernel_fp();
 		cvt_fd((float *)&data.x32.low32, &data.dd);
+		disable_kernel_fp();
 		preempt_enable();
 #else
 		return 0;
diff --git a/arch/powerpc/kvm/book3s_paired_singles.c b/arch/powerpc/kvm/book3s_paired_singles.c
index a759d9adb0b6..eab96cfe82fa 100644
--- a/arch/powerpc/kvm/book3s_paired_singles.c
+++ b/arch/powerpc/kvm/book3s_paired_singles.c
@@ -1265,6 +1265,7 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)
 	if (rcomp)
 		kvmppc_set_cr(vcpu, cr);
 
+	disable_kernel_fp();
 	preempt_enable();
 
 	return emulated;
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 64891b081ad5..49f5dad1bd45 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -751,6 +751,7 @@ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
 		preempt_disable();
 		enable_kernel_fp();
 		load_fp_state(&vcpu->arch.fp);
+		disable_kernel_fp();
 		t->fp_save_area = &vcpu->arch.fp;
 		preempt_enable();
 	}
@@ -760,6 +761,7 @@ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
 		preempt_disable();
 		enable_kernel_altivec();
 		load_vr_state(&vcpu->arch.vr);
+		disable_kernel_altivec();
 		t->vr_save_area = &vcpu->arch.vr;
 		preempt_enable();
 #endif
@@ -788,6 +790,7 @@ static void kvmppc_handle_lost_ext(struct kvm_vcpu *vcpu)
 		preempt_disable();
 		enable_kernel_fp();
 		load_fp_state(&vcpu->arch.fp);
+		disable_kernel_fp();
 		preempt_enable();
 	}
 #ifdef CONFIG_ALTIVEC
@@ -795,6 +798,7 @@ static void kvmppc_handle_lost_ext(struct kvm_vcpu *vcpu)
 		preempt_disable();
 		enable_kernel_altivec();
 		load_vr_state(&vcpu->arch.vr);
+		disable_kernel_altivec();
 		preempt_enable();
 	}
 #endif
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index fd5875179e5c..778ef86e187e 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -98,6 +98,7 @@ void kvmppc_vcpu_disable_spe(struct kvm_vcpu *vcpu)
 	preempt_disable();
 	enable_kernel_spe();
 	kvmppc_save_guest_spe(vcpu);
+	disable_kernel_spe();
 	vcpu->arch.shadow_msr &= ~MSR_SPE;
 	preempt_enable();
 }
@@ -107,6 +108,7 @@ static void kvmppc_vcpu_enable_spe(struct kvm_vcpu *vcpu)
 	preempt_disable();
 	enable_kernel_spe();
 	kvmppc_load_guest_spe(vcpu);
+	disable_kernel_spe();
 	vcpu->arch.shadow_msr |= MSR_SPE;
 	preempt_enable();
 }
@@ -141,6 +143,7 @@ static inline void kvmppc_load_guest_fp(struct kvm_vcpu *vcpu)
 	if (!(current->thread.regs->msr & MSR_FP)) {
 		enable_kernel_fp();
 		load_fp_state(&vcpu->arch.fp);
+		disable_kernel_fp();
 		current->thread.fp_save_area = &vcpu->arch.fp;
 		current->thread.regs->msr |= MSR_FP;
 	}
@@ -182,6 +185,7 @@ static inline void kvmppc_load_guest_altivec(struct kvm_vcpu *vcpu)
 		if (!(current->thread.regs->msr & MSR_VEC)) {
 			enable_kernel_altivec();
 			load_vr_state(&vcpu->arch.vr);
+			disable_kernel_altivec();
 			current->thread.vr_save_area = &vcpu->arch.vr;
 			current->thread.regs->msr |= MSR_VEC;
 		}
diff --git a/arch/powerpc/lib/vmx-helper.c b/arch/powerpc/lib/vmx-helper.c
index ac93a3bd2730..b27e030fc9f8 100644
--- a/arch/powerpc/lib/vmx-helper.c
+++ b/arch/powerpc/lib/vmx-helper.c
@@ -46,6 +46,7 @@ int enter_vmx_usercopy(void)
  */
 int exit_vmx_usercopy(void)
 {
+	disable_kernel_altivec();
 	pagefault_enable();
 	preempt_enable();
 	return 0;
@@ -70,6 +71,7 @@ int enter_vmx_copy(void)
  */
 void *exit_vmx_copy(void *dest)
 {
+	disable_kernel_altivec();
 	preempt_enable();
 	return dest;
 }
diff --git a/arch/powerpc/lib/xor_vmx.c b/arch/powerpc/lib/xor_vmx.c
index e905f7c2ea7b..07f49f1568e5 100644
--- a/arch/powerpc/lib/xor_vmx.c
+++ b/arch/powerpc/lib/xor_vmx.c
@@ -74,6 +74,7 @@ void xor_altivec_2(unsigned long bytes, unsigned long *v1_in,
 		v2 += 4;
 	} while (--lines > 0);
 
+	disable_kernel_altivec();
 	preempt_enable();
 }
 EXPORT_SYMBOL(xor_altivec_2);
@@ -102,6 +103,7 @@ void xor_altivec_3(unsigned long bytes, unsigned long *v1_in,
 		v3 += 4;
 	} while (--lines > 0);
 
+	disable_kernel_altivec();
 	preempt_enable();
 }
 EXPORT_SYMBOL(xor_altivec_3);
@@ -135,6 +137,7 @@ void xor_altivec_4(unsigned long bytes, unsigned long *v1_in,
 		v4 += 4;
 	} while (--lines > 0);
 
+	disable_kernel_altivec();
 	preempt_enable();
 }
 EXPORT_SYMBOL(xor_altivec_4);
@@ -172,6 +175,7 @@ void xor_altivec_5(unsigned long bytes, unsigned long *v1_in,
 		v5 += 4;
 	} while (--lines > 0);
 
+	disable_kernel_altivec();
 	preempt_enable();
 }
 EXPORT_SYMBOL(xor_altivec_5);
diff --git a/drivers/crypto/vmx/aes.c b/drivers/crypto/vmx/aes.c
index 20539fb7e975..022c7ab7351a 100644
--- a/drivers/crypto/vmx/aes.c
+++ b/drivers/crypto/vmx/aes.c
@@ -86,6 +86,7 @@ static int p8_aes_setkey(struct crypto_tfm *tfm, const u8 *key,
 	enable_kernel_vsx();
 	ret = aes_p8_set_encrypt_key(key, keylen * 8, &ctx->enc_key);
 	ret += aes_p8_set_decrypt_key(key, keylen * 8, &ctx->dec_key);
+	disable_kernel_vsx();
 	pagefault_enable();
 	preempt_enable();
 
@@ -104,6 +105,7 @@ static void p8_aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
 		pagefault_disable();
 		enable_kernel_vsx();
 		aes_p8_encrypt(src, dst, &ctx->enc_key);
+		disable_kernel_vsx();
 		pagefault_enable();
 		preempt_enable();
 	}
@@ -120,6 +122,7 @@ static void p8_aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
 		pagefault_disable();
 		enable_kernel_vsx();
 		aes_p8_decrypt(src, dst, &ctx->dec_key);
+		disable_kernel_vsx();
 		pagefault_enable();
 		preempt_enable();
 	}
diff --git a/drivers/crypto/vmx/aes_cbc.c b/drivers/crypto/vmx/aes_cbc.c
index 8847b92e9ff0..1881b3f413fa 100644
--- a/drivers/crypto/vmx/aes_cbc.c
+++ b/drivers/crypto/vmx/aes_cbc.c
@@ -87,6 +87,7 @@ static int p8_aes_cbc_setkey(struct crypto_tfm *tfm, const u8 *key,
 	enable_kernel_vsx();
 	ret = aes_p8_set_encrypt_key(key, keylen * 8, &ctx->enc_key);
 	ret += aes_p8_set_decrypt_key(key, keylen * 8, &ctx->dec_key);
+	disable_kernel_vsx();
 	pagefault_enable();
 	preempt_enable();
 
@@ -127,6 +128,7 @@ static int p8_aes_cbc_encrypt(struct blkcipher_desc *desc,
 			ret = blkcipher_walk_done(desc, &walk, nbytes);
 		}
 
+		disable_kernel_vsx();
 		pagefault_enable();
 		preempt_enable();
 	}
@@ -167,6 +169,7 @@ static int p8_aes_cbc_decrypt(struct blkcipher_desc *desc,
 			ret = blkcipher_walk_done(desc, &walk, nbytes);
 		}
 
+		disable_kernel_vsx();
 		pagefault_enable();
 		preempt_enable();
 	}
diff --git a/drivers/crypto/vmx/aes_ctr.c b/drivers/crypto/vmx/aes_ctr.c
index 80958660c31a..2d58b18acc10 100644
--- a/drivers/crypto/vmx/aes_ctr.c
+++ b/drivers/crypto/vmx/aes_ctr.c
@@ -83,6 +83,7 @@ static int p8_aes_ctr_setkey(struct crypto_tfm *tfm, const u8 *key,
 	pagefault_disable();
 	enable_kernel_vsx();
 	ret = aes_p8_set_encrypt_key(key, keylen * 8, &ctx->enc_key);
+	disable_kernel_vsx();
 	pagefault_enable();
 
 	ret += crypto_blkcipher_setkey(ctx->fallback, key, keylen);
@@ -101,6 +102,7 @@ static void p8_aes_ctr_final(struct p8_aes_ctr_ctx *ctx,
 	pagefault_disable();
 	enable_kernel_vsx();
 	aes_p8_encrypt(ctrblk, keystream, &ctx->enc_key);
+	disable_kernel_vsx();
 	pagefault_enable();
 
 	crypto_xor(keystream, src, nbytes);
@@ -139,6 +141,7 @@ static int p8_aes_ctr_crypt(struct blkcipher_desc *desc,
 						    AES_BLOCK_SIZE,
 						    &ctx->enc_key,
 						    walk.iv);
+			disable_kernel_vsx();
 			pagefault_enable();
 
 			/* We need to update IV mostly for last bytes/round */
diff --git a/drivers/crypto/vmx/ghash.c b/drivers/crypto/vmx/ghash.c
index 1f4586c2fd25..6c999cb01b80 100644
--- a/drivers/crypto/vmx/ghash.c
+++ b/drivers/crypto/vmx/ghash.c
@@ -120,6 +120,7 @@ static int p8_ghash_setkey(struct crypto_shash *tfm, const u8 *key,
 	pagefault_disable();
 	enable_kernel_vsx();
 	gcm_init_p8(ctx->htable, (const u64 *) key);
+	disable_kernel_vsx();
 	pagefault_enable();
 	preempt_enable();
 	return crypto_shash_setkey(ctx->fallback, key, keylen);
@@ -150,6 +151,7 @@ static int p8_ghash_update(struct shash_desc *desc,
 			enable_kernel_vsx();
 			gcm_ghash_p8(dctx->shash, ctx->htable,
 				     dctx->buffer, GHASH_DIGEST_SIZE);
+			disable_kernel_vsx();
 			pagefault_enable();
 			preempt_enable();
 			src += GHASH_DIGEST_SIZE - dctx->bytes;
@@ -162,6 +164,7 @@ static int p8_ghash_update(struct shash_desc *desc,
 			pagefault_disable();
 			enable_kernel_vsx();
 			gcm_ghash_p8(dctx->shash, ctx->htable, src, len);
+			disable_kernel_vsx();
 			pagefault_enable();
 			preempt_enable();
 			src += len;
@@ -192,6 +195,7 @@ static int p8_ghash_final(struct shash_desc *desc, u8 *out)
 			enable_kernel_vsx();
 			gcm_ghash_p8(dctx->shash, ctx->htable,
 				     dctx->buffer, GHASH_DIGEST_SIZE);
+			disable_kernel_vsx();
 			pagefault_enable();
 			preempt_enable();
 			dctx->bytes = 0;
diff --git a/lib/raid6/altivec.uc b/lib/raid6/altivec.uc
index bec27fce7501..682aae8a1fef 100644
--- a/lib/raid6/altivec.uc
+++ b/lib/raid6/altivec.uc
@@ -101,6 +101,7 @@ static void raid6_altivec$#_gen_syndrome(int disks, size_t bytes, void **ptrs)
 
 	raid6_altivec$#_gen_syndrome_real(disks, bytes, ptrs);
 
+	disable_kernel_altivec();
 	preempt_enable();
 }
 

From 3eb5d5888dc68c9b187998ca4249b8b9fa481eeb Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Thu, 29 Oct 2015 11:44:06 +1100
Subject: [PATCH 026/149] powerpc: Add ppc_strict_facility_enable boot option

Add a boot option that strictly manages the MSR unavailable bits.
This catches kernel uses of FP/Altivec/SPE that would otherwise
corrupt user state.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 Documentation/kernel-parameters.txt  |  6 ++++++
 arch/powerpc/include/asm/reg.h       |  9 +++++++++
 arch/powerpc/include/asm/switch_to.h | 24 +++++++++++++++++++-----
 arch/powerpc/kernel/process.c        | 17 +++++++++++++++--
 4 files changed, 49 insertions(+), 7 deletions(-)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 742f69d18fc8..8978c26cacdd 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2978,6 +2978,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			may be specified.
 			Format: <port>,<port>....
 
+	ppc_strict_facility_enable
+			[PPC] This option catches any kernel floating point,
+			Altivec, VSX and SPE outside of regions specifically
+			allowed (eg enable_kernel_fp()/disable_kernel_fp()).
+			There is some performance impact when enabling this.
+
 	print-fatal-signals=
 			[KNL] debug: print fatal signals
 
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 987dac090244..eb2986e60c50 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -1214,6 +1214,15 @@ static inline void mtmsr_isync(unsigned long val)
 				     : "r" ((unsigned long)(v)) \
 				     : "memory")
 
+extern void msr_check_and_set(unsigned long bits);
+extern bool strict_msr_control;
+extern void __msr_check_and_clear(unsigned long bits);
+static inline void msr_check_and_clear(unsigned long bits)
+{
+	if (strict_msr_control)
+		__msr_check_and_clear(bits);
+}
+
 static inline unsigned long mfvtb (void)
 {
 #ifdef CONFIG_PPC_BOOK3S_64
diff --git a/arch/powerpc/include/asm/switch_to.h b/arch/powerpc/include/asm/switch_to.h
index 438502f59550..9414dcb180d6 100644
--- a/arch/powerpc/include/asm/switch_to.h
+++ b/arch/powerpc/include/asm/switch_to.h
@@ -4,6 +4,8 @@
 #ifndef _ASM_POWERPC_SWITCH_TO_H
 #define _ASM_POWERPC_SWITCH_TO_H
 
+#include <asm/reg.h>
+
 struct thread_struct;
 struct task_struct;
 struct pt_regs;
@@ -26,15 +28,15 @@ extern void enable_kernel_spe(void);
 extern void load_up_spe(struct task_struct *);
 extern void switch_booke_debug_regs(struct debug_reg *new_debug);
 
-static inline void disable_kernel_fp(void) { }
-static inline void disable_kernel_altivec(void) { }
-static inline void disable_kernel_spe(void) { }
-static inline void disable_kernel_vsx(void) { }
-
 #ifdef CONFIG_PPC_FPU
 extern void flush_fp_to_thread(struct task_struct *);
 extern void giveup_fpu(struct task_struct *);
 extern void __giveup_fpu(struct task_struct *);
+static inline void disable_kernel_fp(void)
+{
+	msr_check_and_clear(MSR_FP);
+}
+
 #else
 static inline void flush_fp_to_thread(struct task_struct *t) { }
 static inline void giveup_fpu(struct task_struct *t) { }
@@ -45,6 +47,10 @@ static inline void __giveup_fpu(struct task_struct *t) { }
 extern void flush_altivec_to_thread(struct task_struct *);
 extern void giveup_altivec(struct task_struct *);
 extern void __giveup_altivec(struct task_struct *);
+static inline void disable_kernel_altivec(void)
+{
+	msr_check_and_clear(MSR_VEC);
+}
 #else
 static inline void flush_altivec_to_thread(struct task_struct *t) { }
 static inline void giveup_altivec(struct task_struct *t) { }
@@ -53,6 +59,10 @@ static inline void __giveup_altivec(struct task_struct *t) { }
 
 #ifdef CONFIG_VSX
 extern void flush_vsx_to_thread(struct task_struct *);
+static inline void disable_kernel_vsx(void)
+{
+	msr_check_and_clear(MSR_FP|MSR_VEC|MSR_VSX);
+}
 #else
 static inline void flush_vsx_to_thread(struct task_struct *t)
 {
@@ -63,6 +73,10 @@ static inline void flush_vsx_to_thread(struct task_struct *t)
 extern void flush_spe_to_thread(struct task_struct *);
 extern void giveup_spe(struct task_struct *);
 extern void __giveup_spe(struct task_struct *);
+static inline void disable_kernel_spe(void)
+{
+	msr_check_and_clear(MSR_SPE);
+}
 #else
 static inline void flush_spe_to_thread(struct task_struct *t) { }
 static inline void giveup_spe(struct task_struct *t) { }
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 5cdd35c0b026..1eafceefeac9 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -87,7 +87,19 @@ static void check_if_tm_restore_required(struct task_struct *tsk)
 static inline void check_if_tm_restore_required(struct task_struct *tsk) { }
 #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
 
-static void msr_check_and_set(unsigned long bits)
+bool strict_msr_control;
+EXPORT_SYMBOL(strict_msr_control);
+
+static int __init enable_strict_msr_control(char *str)
+{
+	strict_msr_control = true;
+	pr_info("Enabling strict facility control\n");
+
+	return 0;
+}
+early_param("ppc_strict_facility_enable", enable_strict_msr_control);
+
+void msr_check_and_set(unsigned long bits)
 {
 	unsigned long oldmsr = mfmsr();
 	unsigned long newmsr;
@@ -103,7 +115,7 @@ static void msr_check_and_set(unsigned long bits)
 		mtmsr_isync(newmsr);
 }
 
-static void msr_check_and_clear(unsigned long bits)
+void __msr_check_and_clear(unsigned long bits)
 {
 	unsigned long oldmsr = mfmsr();
 	unsigned long newmsr;
@@ -118,6 +130,7 @@ static void msr_check_and_clear(unsigned long bits)
 	if (oldmsr != newmsr)
 		mtmsr_isync(newmsr);
 }
+EXPORT_SYMBOL(__msr_check_and_clear);
 
 #ifdef CONFIG_PPC_FPU
 void giveup_fpu(struct task_struct *tsk)

From 1f2e25b2d552cade43eacb2edc4e7f01c1cfecb3 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Thu, 29 Oct 2015 11:44:07 +1100
Subject: [PATCH 027/149] powerpc: Remove fp_enable() and vec_enable(), use
 msr_check_and_{set, clear}()

More consolidation of our MSR available bit handling.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/processor.h |  2 --
 arch/powerpc/kernel/fpu.S            | 16 ----------------
 arch/powerpc/kernel/process.c        |  6 ++++--
 arch/powerpc/kernel/vector.S         | 10 ----------
 4 files changed, 4 insertions(+), 30 deletions(-)

diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
index a2e891840806..ac2330820b9a 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -380,8 +380,6 @@ extern int set_endian(struct task_struct *tsk, unsigned int val);
 extern int get_unalign_ctl(struct task_struct *tsk, unsigned long adr);
 extern int set_unalign_ctl(struct task_struct *tsk, unsigned int val);
 
-extern void fp_enable(void);
-extern void vec_enable(void);
 extern void load_fp_state(struct thread_fp_state *fp);
 extern void store_fp_state(struct thread_fp_state *fp);
 extern void load_vr_state(struct thread_vr_state *vr);
diff --git a/arch/powerpc/kernel/fpu.S b/arch/powerpc/kernel/fpu.S
index 431ab571ed1b..2117eaca3d28 100644
--- a/arch/powerpc/kernel/fpu.S
+++ b/arch/powerpc/kernel/fpu.S
@@ -76,22 +76,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
 	blr
 #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
 
-/*
- * Enable use of the FPU, and VSX if possible, for the caller.
- */
-_GLOBAL(fp_enable)
-	mfmsr	r3
-	ori	r3,r3,MSR_FP
-#ifdef CONFIG_VSX
-BEGIN_FTR_SECTION
-	oris	r3,r3,MSR_VSX@h
-END_FTR_SECTION_IFSET(CPU_FTR_VSX)
-#endif
-	SYNC
-	MTMSRD(r3)
-	isync			/* (not necessary for arch 2.02 and later) */
-	blr
-
 /*
  * Load state from memory into FP registers including FPSCR.
  * Assumes the caller has enabled FP in the MSR.
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 1eafceefeac9..9f8444b84dde 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -732,13 +732,15 @@ void restore_tm_state(struct pt_regs *regs)
 	msr_diff = current->thread.ckpt_regs.msr & ~regs->msr;
 	msr_diff &= MSR_FP | MSR_VEC | MSR_VSX;
 	if (msr_diff & MSR_FP) {
-		fp_enable();
+		msr_check_and_set(MSR_FP);
 		load_fp_state(&current->thread.fp_state);
+		msr_check_and_clear(MSR_FP);
 		regs->msr |= current->thread.fpexc_mode;
 	}
 	if (msr_diff & MSR_VEC) {
-		vec_enable();
+		msr_check_and_set(MSR_VEC);
 		load_vr_state(&current->thread.vr_state);
+		msr_check_and_clear(MSR_VEC);
 	}
 	regs->msr |= msr_diff;
 }
diff --git a/arch/powerpc/kernel/vector.S b/arch/powerpc/kernel/vector.S
index 98675b08efe2..162d0f714941 100644
--- a/arch/powerpc/kernel/vector.S
+++ b/arch/powerpc/kernel/vector.S
@@ -32,16 +32,6 @@ _GLOBAL(do_load_up_transact_altivec)
 	blr
 #endif
 
-/*
- * Enable use of VMX/Altivec for the caller.
- */
-_GLOBAL(vec_enable)
-	mfmsr	r3
-	oris	r3,r3,MSR_VEC@h
-	MTMSRD(r3)
-	isync
-	blr
-
 /*
  * Load state from memory into VMX registers including VSCR.
  * Assumes the caller has enabled VMX in the MSR.

From c208505900b232ecdc81dee54cb3a032e75d88d6 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Thu, 29 Oct 2015 11:44:08 +1100
Subject: [PATCH 028/149] powerpc: create giveup_all()

Create a single function that gives everything up (FP, VMX, VSX, SPE).
Doing this all at once means we only do one MSR write.

A context switch microbenchmark using yield():

http://ozlabs.org/~anton/junkcode/context_switch2.c

./context_switch2 --test=yield --fp --altivec --vector 0 0

shows an improvement of 3% on POWER8.

Signed-off-by: Anton Blanchard <anton@samba.org>
[mpe: giveup_all() needs to be EXPORT_SYMBOL'ed]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/switch_to.h |  1 +
 arch/powerpc/kernel/process.c        | 76 ++++++++++++++++++++++------
 arch/powerpc/kvm/book3s_pr.c         | 17 +------
 3 files changed, 64 insertions(+), 30 deletions(-)

diff --git a/arch/powerpc/include/asm/switch_to.h b/arch/powerpc/include/asm/switch_to.h
index 9414dcb180d6..8f856788a7cf 100644
--- a/arch/powerpc/include/asm/switch_to.h
+++ b/arch/powerpc/include/asm/switch_to.h
@@ -26,6 +26,7 @@ extern void __giveup_vsx(struct task_struct *);
 extern void giveup_vsx(struct task_struct *);
 extern void enable_kernel_spe(void);
 extern void load_up_spe(struct task_struct *);
+extern void giveup_all(struct task_struct *);
 extern void switch_booke_debug_regs(struct debug_reg *new_debug);
 
 #ifdef CONFIG_PPC_FPU
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 9f8444b84dde..4c087b9ed2d6 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -308,6 +308,65 @@ void flush_spe_to_thread(struct task_struct *tsk)
 }
 #endif /* CONFIG_SPE */
 
+static unsigned long msr_all_available;
+
+static int __init init_msr_all_available(void)
+{
+#ifdef CONFIG_PPC_FPU
+	msr_all_available |= MSR_FP;
+#endif
+#ifdef CONFIG_ALTIVEC
+	if (cpu_has_feature(CPU_FTR_ALTIVEC))
+		msr_all_available |= MSR_VEC;
+#endif
+#ifdef CONFIG_VSX
+	if (cpu_has_feature(CPU_FTR_VSX))
+		msr_all_available |= MSR_VSX;
+#endif
+#ifdef CONFIG_SPE
+	if (cpu_has_feature(CPU_FTR_SPE))
+		msr_all_available |= MSR_SPE;
+#endif
+
+	return 0;
+}
+early_initcall(init_msr_all_available);
+
+void giveup_all(struct task_struct *tsk)
+{
+	unsigned long usermsr;
+
+	if (!tsk->thread.regs)
+		return;
+
+	usermsr = tsk->thread.regs->msr;
+
+	if ((usermsr & msr_all_available) == 0)
+		return;
+
+	msr_check_and_set(msr_all_available);
+
+#ifdef CONFIG_PPC_FPU
+	if (usermsr & MSR_FP)
+		__giveup_fpu(tsk);
+#endif
+#ifdef CONFIG_ALTIVEC
+	if (usermsr & MSR_VEC)
+		__giveup_altivec(tsk);
+#endif
+#ifdef CONFIG_VSX
+	if (usermsr & MSR_VSX)
+		__giveup_vsx(tsk);
+#endif
+#ifdef CONFIG_SPE
+	if (usermsr & MSR_SPE)
+		__giveup_spe(tsk);
+#endif
+
+	msr_check_and_clear(msr_all_available);
+}
+EXPORT_SYMBOL(giveup_all);
+
 #ifdef CONFIG_PPC_ADV_DEBUG_REGS
 void do_send_trap(struct pt_regs *regs, unsigned long address,
 		  unsigned long error_code, int signal_code, int breakpt)
@@ -839,21 +898,8 @@ struct task_struct *__switch_to(struct task_struct *prev,
 
 	__switch_to_tm(prev);
 
-	if (prev->thread.regs && (prev->thread.regs->msr & MSR_FP))
-		giveup_fpu(prev);
-#ifdef CONFIG_ALTIVEC
-	if (prev->thread.regs && (prev->thread.regs->msr & MSR_VEC))
-		giveup_altivec(prev);
-#endif /* CONFIG_ALTIVEC */
-#ifdef CONFIG_VSX
-	if (prev->thread.regs && (prev->thread.regs->msr & MSR_VSX))
-		/* VMX and FPU registers are already save here */
-		__giveup_vsx(prev);
-#endif /* CONFIG_VSX */
-#ifdef CONFIG_SPE
-	if ((prev->thread.regs && (prev->thread.regs->msr & MSR_SPE)))
-		giveup_spe(prev);
-#endif /* CONFIG_SPE */
+	/* Save FPU, Altivec, VSX and SPE state */
+	giveup_all(prev);
 
 #ifdef CONFIG_PPC_ADV_DEBUG_REGS
 	switch_booke_debug_regs(&new->thread.debug);
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 49f5dad1bd45..a78e0e6bd932 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -1490,21 +1490,8 @@ static int kvmppc_vcpu_run_pr(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 		goto out;
 	/* interrupts now hard-disabled */
 
-	/* Save FPU state in thread_struct */
-	if (current->thread.regs->msr & MSR_FP)
-		giveup_fpu(current);
-
-#ifdef CONFIG_ALTIVEC
-	/* Save Altivec state in thread_struct */
-	if (current->thread.regs->msr & MSR_VEC)
-		giveup_altivec(current);
-#endif
-
-#ifdef CONFIG_VSX
-	/* Save VSX state in thread_struct */
-	if (current->thread.regs->msr & MSR_VSX)
-		__giveup_vsx(current);
-#endif
+	/* Save FPU, Altivec and VSX state */
+	giveup_all(current);
 
 	/* Preload FPU if it's enabled */
 	if (kvmppc_get_msr(vcpu) & MSR_FP)

From 579e633e764e6e5f7784b74e7df3e81fe11f40de Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Thu, 29 Oct 2015 11:44:09 +1100
Subject: [PATCH 029/149] powerpc: create flush_all_to_thread()

Create a single function that flushes everything (FP, VMX, VSX, SPE).
Doing this all at once means we only do one MSR write.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/switch_to.h |  1 +
 arch/powerpc/kernel/process.c        | 22 ++++++++++++++++++----
 arch/powerpc/kernel/swsusp.c         |  4 +---
 arch/powerpc/kvm/book3s_hv.c         |  5 ++---
 4 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/switch_to.h b/arch/powerpc/include/asm/switch_to.h
index 8f856788a7cf..81d46a433c03 100644
--- a/arch/powerpc/include/asm/switch_to.h
+++ b/arch/powerpc/include/asm/switch_to.h
@@ -27,6 +27,7 @@ extern void giveup_vsx(struct task_struct *);
 extern void enable_kernel_spe(void);
 extern void load_up_spe(struct task_struct *);
 extern void giveup_all(struct task_struct *);
+extern void flush_all_to_thread(struct task_struct *);
 extern void switch_booke_debug_regs(struct debug_reg *new_debug);
 
 #ifdef CONFIG_PPC_FPU
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 4c087b9ed2d6..7f437e7b273e 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -367,6 +367,23 @@ void giveup_all(struct task_struct *tsk)
 }
 EXPORT_SYMBOL(giveup_all);
 
+void flush_all_to_thread(struct task_struct *tsk)
+{
+	if (tsk->thread.regs) {
+		preempt_disable();
+		BUG_ON(tsk != current);
+		giveup_all(tsk);
+
+#ifdef CONFIG_SPE
+		if (tsk->thread.regs->msr & MSR_SPE)
+			tsk->thread.spefscr = mfspr(SPRN_SPEFSCR);
+#endif
+
+		preempt_enable();
+	}
+}
+EXPORT_SYMBOL(flush_all_to_thread);
+
 #ifdef CONFIG_PPC_ADV_DEBUG_REGS
 void do_send_trap(struct pt_regs *regs, unsigned long address,
 		  unsigned long error_code, int signal_code, int breakpt)
@@ -1137,10 +1154,7 @@ release_thread(struct task_struct *t)
  */
 int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 {
-	flush_fp_to_thread(src);
-	flush_altivec_to_thread(src);
-	flush_vsx_to_thread(src);
-	flush_spe_to_thread(src);
+	flush_all_to_thread(src);
 	/*
 	 * Flush TM state out so we can copy it.  __switch_to_tm() does this
 	 * flush but it removes the checkpointed state from the current CPU and
diff --git a/arch/powerpc/kernel/swsusp.c b/arch/powerpc/kernel/swsusp.c
index eae33e10b65f..6669b1752512 100644
--- a/arch/powerpc/kernel/swsusp.c
+++ b/arch/powerpc/kernel/swsusp.c
@@ -20,9 +20,7 @@ void save_processor_state(void)
 	 * flush out all the special registers so we don't need
 	 * to save them in the snapshot
 	 */
-	flush_fp_to_thread(current);
-	flush_altivec_to_thread(current);
-	flush_spe_to_thread(current);
+	flush_all_to_thread(current);
 
 #ifdef CONFIG_PPC64
 	hard_irq_disable();
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 54b45b73195f..8e694707bc56 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -2700,9 +2700,8 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
 			goto out;
 	}
 
-	flush_fp_to_thread(current);
-	flush_altivec_to_thread(current);
-	flush_vsx_to_thread(current);
+	flush_all_to_thread(current);
+
 	vcpu->arch.wqp = &vcpu->arch.vcore->wq;
 	vcpu->arch.pgdir = current->mm->pgd;
 	vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;

From f3d885ccba8539f62e8be3ba29ecf91687120252 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Thu, 29 Oct 2015 11:44:10 +1100
Subject: [PATCH 030/149] powerpc: Rearrange __switch_to()

Most of __switch_to() is housekeeping, TLB batching, timekeeping etc.
Move these away from the more complex and critical context switching
code.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/process.c | 52 +++++++++++++++++------------------
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 7f437e7b273e..49424dc1168d 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -907,30 +907,6 @@ struct task_struct *__switch_to(struct task_struct *prev,
 
 	WARN_ON(!irqs_disabled());
 
-	/*
-	 * We need to save SPRs before treclaim/trecheckpoint as these will
-	 * change a number of them.
-	 */
-	save_sprs(&prev->thread);
-
-	__switch_to_tm(prev);
-
-	/* Save FPU, Altivec, VSX and SPE state */
-	giveup_all(prev);
-
-#ifdef CONFIG_PPC_ADV_DEBUG_REGS
-	switch_booke_debug_regs(&new->thread.debug);
-#else
-/*
- * For PPC_BOOK3S_64, we use the hw-breakpoint interfaces that would
- * schedule DABR
- */
-#ifndef CONFIG_HAVE_HW_BREAKPOINT
-	if (unlikely(!hw_brk_match(this_cpu_ptr(&current_brk), &new->thread.hw_brk)))
-		__set_breakpoint(&new->thread.hw_brk);
-#endif /* CONFIG_HAVE_HW_BREAKPOINT */
-#endif
-
 #ifdef CONFIG_PPC64
 	/*
 	 * Collect processor utilization data per process
@@ -955,6 +931,30 @@ struct task_struct *__switch_to(struct task_struct *prev,
 	}
 #endif /* CONFIG_PPC_BOOK3S_64 */
 
+#ifdef CONFIG_PPC_ADV_DEBUG_REGS
+	switch_booke_debug_regs(&new->thread.debug);
+#else
+/*
+ * For PPC_BOOK3S_64, we use the hw-breakpoint interfaces that would
+ * schedule DABR
+ */
+#ifndef CONFIG_HAVE_HW_BREAKPOINT
+	if (unlikely(!hw_brk_match(this_cpu_ptr(&current_brk), &new->thread.hw_brk)))
+		__set_breakpoint(&new->thread.hw_brk);
+#endif /* CONFIG_HAVE_HW_BREAKPOINT */
+#endif
+
+	/*
+	 * We need to save SPRs before treclaim/trecheckpoint as these will
+	 * change a number of them.
+	 */
+	save_sprs(&prev->thread);
+
+	__switch_to_tm(prev);
+
+	/* Save FPU, Altivec, VSX and SPE state */
+	giveup_all(prev);
+
 	/*
 	 * We can't take a PMU exception inside _switch() since there is a
 	 * window where the kernel stack SLB and the kernel stack are out
@@ -970,6 +970,8 @@ struct task_struct *__switch_to(struct task_struct *prev,
 	old_thread = &last->thread;
 	new_thread = &current->thread;
 
+	restore_sprs(old_thread, new_thread);
+
 #ifdef CONFIG_PPC_BOOK3S_64
 	if (current_thread_info()->local_flags & _TLF_LAZY_MMU) {
 		current_thread_info()->local_flags &= ~_TLF_LAZY_MMU;
@@ -978,8 +980,6 @@ struct task_struct *__switch_to(struct task_struct *prev,
 	}
 #endif /* CONFIG_PPC_BOOK3S_64 */
 
-	restore_sprs(old_thread, new_thread);
-
 	return last;
 }
 

From d1e1cf2e38def301fde42c1a33f896f974941d7b Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Thu, 29 Oct 2015 11:44:11 +1100
Subject: [PATCH 031/149] powerpc: clean up asm/switch_to.h

Remove a bunch of unnecessary fallback functions and group
things in a more logical way.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/switch_to.h | 37 +++++++++-------------------
 arch/powerpc/kernel/process.c        |  2 +-
 2 files changed, 12 insertions(+), 27 deletions(-)

diff --git a/arch/powerpc/include/asm/switch_to.h b/arch/powerpc/include/asm/switch_to.h
index 81d46a433c03..5b268b6be74c 100644
--- a/arch/powerpc/include/asm/switch_to.h
+++ b/arch/powerpc/include/asm/switch_to.h
@@ -14,23 +14,18 @@ extern struct task_struct *__switch_to(struct task_struct *,
 	struct task_struct *);
 #define switch_to(prev, next, last)	((last) = __switch_to((prev), (next)))
 
-struct thread_struct;
 extern struct task_struct *_switch(struct thread_struct *prev,
 				   struct thread_struct *next);
 
-extern void enable_kernel_fp(void);
-extern void enable_kernel_altivec(void);
-extern void enable_kernel_vsx(void);
-extern int emulate_altivec(struct pt_regs *);
-extern void __giveup_vsx(struct task_struct *);
-extern void giveup_vsx(struct task_struct *);
-extern void enable_kernel_spe(void);
-extern void load_up_spe(struct task_struct *);
-extern void giveup_all(struct task_struct *);
-extern void flush_all_to_thread(struct task_struct *);
 extern void switch_booke_debug_regs(struct debug_reg *new_debug);
 
+extern int emulate_altivec(struct pt_regs *);
+
+extern void flush_all_to_thread(struct task_struct *);
+extern void giveup_all(struct task_struct *);
+
 #ifdef CONFIG_PPC_FPU
+extern void enable_kernel_fp(void);
 extern void flush_fp_to_thread(struct task_struct *);
 extern void giveup_fpu(struct task_struct *);
 extern void __giveup_fpu(struct task_struct *);
@@ -38,14 +33,12 @@ static inline void disable_kernel_fp(void)
 {
 	msr_check_and_clear(MSR_FP);
 }
-
 #else
 static inline void flush_fp_to_thread(struct task_struct *t) { }
-static inline void giveup_fpu(struct task_struct *t) { }
-static inline void __giveup_fpu(struct task_struct *t) { }
 #endif
 
 #ifdef CONFIG_ALTIVEC
+extern void enable_kernel_altivec(void);
 extern void flush_altivec_to_thread(struct task_struct *);
 extern void giveup_altivec(struct task_struct *);
 extern void __giveup_altivec(struct task_struct *);
@@ -53,25 +46,21 @@ static inline void disable_kernel_altivec(void)
 {
 	msr_check_and_clear(MSR_VEC);
 }
-#else
-static inline void flush_altivec_to_thread(struct task_struct *t) { }
-static inline void giveup_altivec(struct task_struct *t) { }
-static inline void __giveup_altivec(struct task_struct *t) { }
 #endif
 
 #ifdef CONFIG_VSX
+extern void enable_kernel_vsx(void);
 extern void flush_vsx_to_thread(struct task_struct *);
+extern void giveup_vsx(struct task_struct *);
+extern void __giveup_vsx(struct task_struct *);
 static inline void disable_kernel_vsx(void)
 {
 	msr_check_and_clear(MSR_FP|MSR_VEC|MSR_VSX);
 }
-#else
-static inline void flush_vsx_to_thread(struct task_struct *t)
-{
-}
 #endif
 
 #ifdef CONFIG_SPE
+extern void enable_kernel_spe(void);
 extern void flush_spe_to_thread(struct task_struct *);
 extern void giveup_spe(struct task_struct *);
 extern void __giveup_spe(struct task_struct *);
@@ -79,10 +68,6 @@ static inline void disable_kernel_spe(void)
 {
 	msr_check_and_clear(MSR_SPE);
 }
-#else
-static inline void flush_spe_to_thread(struct task_struct *t) { }
-static inline void giveup_spe(struct task_struct *t) { }
-static inline void __giveup_spe(struct task_struct *t) { }
 #endif
 
 static inline void clear_task_ebb(struct task_struct *t)
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 49424dc1168d..58194c3f421e 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -174,7 +174,6 @@ void flush_fp_to_thread(struct task_struct *tsk)
 	}
 }
 EXPORT_SYMBOL_GPL(flush_fp_to_thread);
-#endif /* CONFIG_PPC_FPU */
 
 void enable_kernel_fp(void)
 {
@@ -186,6 +185,7 @@ void enable_kernel_fp(void)
 		__giveup_fpu(current);
 }
 EXPORT_SYMBOL(enable_kernel_fp);
+#endif /* CONFIG_PPC_FPU */
 
 #ifdef CONFIG_ALTIVEC
 void giveup_altivec(struct task_struct *tsk)

From d64d02ce4ebaa79bf1c026e81a956f133938af65 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Thu, 10 Dec 2015 20:04:05 +1100
Subject: [PATCH 032/149] powerpc: Call check_if_tm_restore_required() in
 enable_kernel_*()

Commit a0e72cf12b1a ("powerpc: Create msr_check_and_{set,clear}()")
removed a call to check_if_tm_restore_required() in the
enable_kernel_*() functions. Add them back in.

Fixes: a0e72cf12b1a ("powerpc: Create msr_check_and_{set,clear}()")
Reported-by: Rashmica Gupta <rashmicy@gmail.com>
Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/process.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 58194c3f421e..1eeda3b80b65 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -181,8 +181,10 @@ void enable_kernel_fp(void)
 
 	msr_check_and_set(MSR_FP);
 
-	if (current->thread.regs && (current->thread.regs->msr & MSR_FP))
+	if (current->thread.regs && (current->thread.regs->msr & MSR_FP)) {
+		check_if_tm_restore_required(current);
 		__giveup_fpu(current);
+	}
 }
 EXPORT_SYMBOL(enable_kernel_fp);
 #endif /* CONFIG_PPC_FPU */
@@ -204,8 +206,10 @@ void enable_kernel_altivec(void)
 
 	msr_check_and_set(MSR_VEC);
 
-	if (current->thread.regs && (current->thread.regs->msr & MSR_VEC))
+	if (current->thread.regs && (current->thread.regs->msr & MSR_VEC)) {
+		check_if_tm_restore_required(current);
 		__giveup_altivec(current);
+	}
 }
 EXPORT_SYMBOL(enable_kernel_altivec);
 
@@ -249,6 +253,7 @@ void enable_kernel_vsx(void)
 	msr_check_and_set(MSR_FP|MSR_VEC|MSR_VSX);
 
 	if (current->thread.regs && (current->thread.regs->msr & MSR_VSX)) {
+		check_if_tm_restore_required(current);
 		if (current->thread.regs->msr & MSR_FP)
 			__giveup_fpu(current);
 		if (current->thread.regs->msr & MSR_VEC)
@@ -289,8 +294,10 @@ void enable_kernel_spe(void)
 
 	msr_check_and_set(MSR_SPE);
 
-	if (current->thread.regs && (current->thread.regs->msr & MSR_SPE))
+	if (current->thread.regs && (current->thread.regs->msr & MSR_SPE)) {
+		check_if_tm_restore_required(current);
 		__giveup_spe(current);
+	}
 }
 EXPORT_SYMBOL(enable_kernel_spe);
 

From 20dbe67062062c2a790832f0d30e73dba45df7c4 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Thu, 10 Dec 2015 20:44:39 +1100
Subject: [PATCH 033/149] powerpc: Call restore_sprs() before _switch()

commit 152d523e6307 ("powerpc: Create context switch helpers save_sprs()
and restore_sprs()") moved the restore of SPRs after the call to _switch().

There is an issue with this approach - new tasks do not return through
_switch(), they are set up by copy_thread() to directly return through
ret_from_fork() or ret_from_kernel_thread(). This means restore_sprs() is
not getting called for new tasks.

Fix this by moving restore_sprs() before _switch().

Fixes: 152d523e6307 ("powerpc: Create context switch helpers save_sprs() and restore_sprs()")
Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/process.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 1eeda3b80b65..9da7b5f0c3a5 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -971,14 +971,17 @@ struct task_struct *__switch_to(struct task_struct *prev,
 
 	tm_recheckpoint_new_task(new);
 
-	last = _switch(old_thread, new_thread);
-
-	/* Need to recalculate these after calling _switch() */
-	old_thread = &last->thread;
-	new_thread = &current->thread;
-
+	/*
+	 * Call restore_sprs() before calling _switch(). If we move it after
+	 * _switch() then we miss out on calling it for new tasks. The reason
+	 * for this is we manually create a stack frame for new tasks that
+	 * directly returns through ret_from_fork() or
+	 * ret_from_kernel_thread(). See copy_thread() for details.
+	 */
 	restore_sprs(old_thread, new_thread);
 
+	last = _switch(old_thread, new_thread);
+
 #ifdef CONFIG_PPC_BOOK3S_64
 	if (current_thread_info()->local_flags & _TLF_LAZY_MMU) {
 		current_thread_info()->local_flags &= ~_TLF_LAZY_MMU;

From db1231dcdb4dc6cdcbdef0babe641a9162c0dc98 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Wed, 9 Dec 2015 20:11:47 +1100
Subject: [PATCH 034/149] powerpc: Fix DSCR inheritance over fork()

Two DSCR tests have a hack in them:

	/*
	 * XXX: Force a context switch out so that DSCR
	 * current value is copied into the thread struct
	 * which is required for the child to inherit the
	 * changed value.
	 */
	sleep(1);

We should not be working around this in the testcase, it is a kernel bug.
Fix it by copying the current DSCR to the child, instead of what we
had in the thread struct at last context switch.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/process.c                             | 2 +-
 .../selftests/powerpc/dscr/dscr_inherit_exec_test.c       | 8 --------
 tools/testing/selftests/powerpc/dscr/dscr_inherit_test.c  | 8 --------
 3 files changed, 1 insertion(+), 17 deletions(-)

diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 9da7b5f0c3a5..6f76f25c3ee8 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1287,7 +1287,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp,
 #ifdef CONFIG_PPC64 
 	if (cpu_has_feature(CPU_FTR_DSCR)) {
 		p->thread.dscr_inherit = current->thread.dscr_inherit;
-		p->thread.dscr = current->thread.dscr;
+		p->thread.dscr = mfspr(SPRN_DSCR);
 	}
 	if (cpu_has_feature(CPU_FTR_HAS_PPR))
 		p->thread.ppr = INIT_PPR;
diff --git a/tools/testing/selftests/powerpc/dscr/dscr_inherit_exec_test.c b/tools/testing/selftests/powerpc/dscr/dscr_inherit_exec_test.c
index 8265504de571..08a8b95e3bc1 100644
--- a/tools/testing/selftests/powerpc/dscr/dscr_inherit_exec_test.c
+++ b/tools/testing/selftests/powerpc/dscr/dscr_inherit_exec_test.c
@@ -60,14 +60,6 @@ int dscr_inherit_exec(void)
 		else
 			set_dscr(dscr);
 
-		/*
-		 * XXX: Force a context switch out so that DSCR
-		 * current value is copied into the thread struct
-		 * which is required for the child to inherit the
-		 * changed value.
-		 */
-		sleep(1);
-
 		pid = fork();
 		if (pid == -1) {
 			perror("fork() failed");
diff --git a/tools/testing/selftests/powerpc/dscr/dscr_inherit_test.c b/tools/testing/selftests/powerpc/dscr/dscr_inherit_test.c
index 4e414caf7f40..3e5a6d195e9a 100644
--- a/tools/testing/selftests/powerpc/dscr/dscr_inherit_test.c
+++ b/tools/testing/selftests/powerpc/dscr/dscr_inherit_test.c
@@ -40,14 +40,6 @@ int dscr_inherit(void)
 		else
 			set_dscr(dscr);
 
-		/*
-		 * XXX: Force a context switch out so that DSCR
-		 * current value is copied into the thread struct
-		 * which is required for the child to inherit the
-		 * changed value.
-		 */
-		sleep(1);
-
 		pid = fork();
 		if (pid == -1) {
 			perror("fork() failed");

From 0863d7f2136550a281f40f4d8556bffd09fd4c2d Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sat, 28 Nov 2015 22:39:33 +0530
Subject: [PATCH 035/149] powerpc/mm: Fix infinite loop in hash fault with 4K
 page size

This is the same bug we fixed in commit 09567e7fd44291bfc08accfdd67ad8f467842332
("powerpc/mm: Check paca psize is up to date for huge mappings"); see that
commit for details. The difference here is that the faults were
happening on a 4K page at an address previously mapped by hugetlb.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Reviewed-by: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/hash_utils_64.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 7f9616f7c479..7d4f254a2671 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -1148,9 +1148,10 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea,
 		}
 	}
 
+#endif /* CONFIG_PPC_64K_PAGES */
+
 	if (current->mm == mm)
 		check_paca_psize(ea, mm, psize, user_region);
-#endif /* CONFIG_PPC_64K_PAGES */
 
 #ifdef CONFIG_PPC_64K_PAGES
 	if (psize == MMU_PAGE_64K)

From 26b6a3d9bb48f8b4624a62281bc2a295df3a8109 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 1 Dec 2015 09:06:26 +0530
Subject: [PATCH 036/149] powerpc/mm: move pte headers to book3s directory

Acked-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 .../include/asm/{pte-hash32.h => book3s/32/hash.h}     |  6 +++---
 .../asm/{pte-hash64-4k.h => book3s/64/hash-4k.h}       |  0
 .../asm/{pte-hash64-64k.h => book3s/64/hash-64k.h}     |  0
 .../include/asm/{pte-hash64.h => book3s/64/hash.h}     | 10 +++++-----
 arch/powerpc/include/asm/pgtable-ppc32.h               |  2 +-
 arch/powerpc/include/asm/pgtable-ppc64.h               |  2 +-
 6 files changed, 10 insertions(+), 10 deletions(-)
 rename arch/powerpc/include/asm/{pte-hash32.h => book3s/32/hash.h} (93%)
 rename arch/powerpc/include/asm/{pte-hash64-4k.h => book3s/64/hash-4k.h} (100%)
 rename arch/powerpc/include/asm/{pte-hash64-64k.h => book3s/64/hash-64k.h} (100%)
 rename arch/powerpc/include/asm/{pte-hash64.h => book3s/64/hash.h} (90%)

diff --git a/arch/powerpc/include/asm/pte-hash32.h b/arch/powerpc/include/asm/book3s/32/hash.h
similarity index 93%
rename from arch/powerpc/include/asm/pte-hash32.h
rename to arch/powerpc/include/asm/book3s/32/hash.h
index 62cfb0c663bb..264b754d65b0 100644
--- a/arch/powerpc/include/asm/pte-hash32.h
+++ b/arch/powerpc/include/asm/book3s/32/hash.h
@@ -1,5 +1,5 @@
-#ifndef _ASM_POWERPC_PTE_HASH32_H
-#define _ASM_POWERPC_PTE_HASH32_H
+#ifndef _ASM_POWERPC_BOOK3S_32_HASH_H
+#define _ASM_POWERPC_BOOK3S_32_HASH_H
 #ifdef __KERNEL__
 
 /*
@@ -43,4 +43,4 @@
 #define PTE_ATOMIC_UPDATES	1
 
 #endif /* __KERNEL__ */
-#endif /*  _ASM_POWERPC_PTE_HASH32_H */
+#endif /* _ASM_POWERPC_BOOK3S_32_HASH_H */
diff --git a/arch/powerpc/include/asm/pte-hash64-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h
similarity index 100%
rename from arch/powerpc/include/asm/pte-hash64-4k.h
rename to arch/powerpc/include/asm/book3s/64/hash-4k.h
diff --git a/arch/powerpc/include/asm/pte-hash64-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h
similarity index 100%
rename from arch/powerpc/include/asm/pte-hash64-64k.h
rename to arch/powerpc/include/asm/book3s/64/hash-64k.h
diff --git a/arch/powerpc/include/asm/pte-hash64.h b/arch/powerpc/include/asm/book3s/64/hash.h
similarity index 90%
rename from arch/powerpc/include/asm/pte-hash64.h
rename to arch/powerpc/include/asm/book3s/64/hash.h
index ef612c160da7..8e60d4fa434d 100644
--- a/arch/powerpc/include/asm/pte-hash64.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -1,5 +1,5 @@
-#ifndef _ASM_POWERPC_PTE_HASH64_H
-#define _ASM_POWERPC_PTE_HASH64_H
+#ifndef _ASM_POWERPC_BOOK3S_64_HASH_H
+#define _ASM_POWERPC_BOOK3S_64_HASH_H
 #ifdef __KERNEL__
 
 /*
@@ -45,10 +45,10 @@
 #define PTE_ATOMIC_UPDATES	1
 
 #ifdef CONFIG_PPC_64K_PAGES
-#include <asm/pte-hash64-64k.h>
+#include <asm/book3s/64/hash-64k.h>
 #else
-#include <asm/pte-hash64-4k.h>
+#include <asm/book3s/64/hash-4k.h>
 #endif
 
 #endif /* __KERNEL__ */
-#endif /*  _ASM_POWERPC_PTE_HASH64_H */
+#endif /* _ASM_POWERPC_BOOK3S_64_HASH_H */
diff --git a/arch/powerpc/include/asm/pgtable-ppc32.h b/arch/powerpc/include/asm/pgtable-ppc32.h
index 9c326565d498..1a58a05be99c 100644
--- a/arch/powerpc/include/asm/pgtable-ppc32.h
+++ b/arch/powerpc/include/asm/pgtable-ppc32.h
@@ -116,7 +116,7 @@ extern int icache_44x_need_flush;
 #elif defined(CONFIG_8xx)
 #include <asm/pte-8xx.h>
 #else /* CONFIG_6xx */
-#include <asm/pte-hash32.h>
+#include <asm/book3s/32/hash.h>
 #endif
 
 /* And here we include common definitions */
diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h
index 3245f2d96d4f..b36a932abdfb 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -98,7 +98,7 @@
  * Include the PTE bits definitions
  */
 #ifdef CONFIG_PPC_BOOK3S
-#include <asm/pte-hash64.h>
+#include <asm/book3s/64/hash.h>
 #else
 #include <asm/pte-book3e.h>
 #endif

From 3dfcb315d81e663bf70401de61940c1b4de2deea Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 1 Dec 2015 09:06:28 +0530
Subject: [PATCH 037/149] powerpc/mm: make a separate copy for book3s

In this patch we do:
cp pgtable-ppc32.h book3s/32/pgtable.h
cp pgtable-ppc64.h book3s/64/pgtable.h

This enables us to make further changes to the hash-specific config.
We will change the page table format for 64bit hash in later patches.

Acked-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/32/pgtable.h | 340 ++++++++++
 arch/powerpc/include/asm/book3s/64/pgtable.h | 626 +++++++++++++++++++
 arch/powerpc/include/asm/book3s/pgtable.h    |  10 +
 arch/powerpc/include/asm/mmu-hash64.h        |   2 +-
 arch/powerpc/include/asm/pgtable-ppc32.h     |   2 -
 arch/powerpc/include/asm/pgtable-ppc64.h     |   4 -
 arch/powerpc/include/asm/pgtable.h           |   4 +
 7 files changed, 981 insertions(+), 7 deletions(-)
 create mode 100644 arch/powerpc/include/asm/book3s/32/pgtable.h
 create mode 100644 arch/powerpc/include/asm/book3s/64/pgtable.h
 create mode 100644 arch/powerpc/include/asm/book3s/pgtable.h

diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h
new file mode 100644
index 000000000000..418d2fa3ac7d
--- /dev/null
+++ b/arch/powerpc/include/asm/book3s/32/pgtable.h
@@ -0,0 +1,340 @@
+#ifndef _ASM_POWERPC_BOOK3S_32_PGTABLE_H
+#define _ASM_POWERPC_BOOK3S_32_PGTABLE_H
+
+#include <asm-generic/pgtable-nopmd.h>
+
+#ifndef __ASSEMBLY__
+#include <linux/sched.h>
+#include <linux/threads.h>
+#include <asm/io.h>			/* For sub-arch specific PPC_PIN_SIZE */
+
+extern unsigned long ioremap_bot;
+
+#ifdef CONFIG_44x
+extern int icache_44x_need_flush;
+#endif
+
+#endif /* __ASSEMBLY__ */
+
+/*
+ * The normal case is that PTEs are 32-bits and we have a 1-page
+ * 1024-entry pgdir pointing to 1-page 1024-entry PTE pages.  -- paulus
+ *
+ * For any >32-bit physical address platform, we can use the following
+ * two level page table layout where the pgdir is 8KB and the MS 13 bits
+ * are an index to the second level table.  The combined pgdir/pmd first
+ * level has 2048 entries and the second level has 512 64-bit PTE entries.
+ * -Matt
+ */
+/* PGDIR_SHIFT determines what a top-level page table entry can map */
+#define PGDIR_SHIFT	(PAGE_SHIFT + PTE_SHIFT)
+#define PGDIR_SIZE	(1UL << PGDIR_SHIFT)
+#define PGDIR_MASK	(~(PGDIR_SIZE-1))
+
+/*
+ * entries per page directory level: our page-table tree is two-level, so
+ * we don't really have any PMD directory.
+ */
+#ifndef __ASSEMBLY__
+#define PTE_TABLE_SIZE	(sizeof(pte_t) << PTE_SHIFT)
+#define PGD_TABLE_SIZE	(sizeof(pgd_t) << (32 - PGDIR_SHIFT))
+#endif	/* __ASSEMBLY__ */
+
+#define PTRS_PER_PTE	(1 << PTE_SHIFT)
+#define PTRS_PER_PMD	1
+#define PTRS_PER_PGD	(1 << (32 - PGDIR_SHIFT))
+
+#define USER_PTRS_PER_PGD	(TASK_SIZE / PGDIR_SIZE)
+#define FIRST_USER_ADDRESS	0UL
+
+#define pte_ERROR(e) \
+	pr_err("%s:%d: bad pte %llx.\n", __FILE__, __LINE__, \
+		(unsigned long long)pte_val(e))
+#define pgd_ERROR(e) \
+	pr_err("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e))
+
+/*
+ * This is the bottom of the PKMAP area with HIGHMEM or an arbitrary
+ * value (for now) on others, from where we can start layout kernel
+ * virtual space that goes below PKMAP and FIXMAP
+ */
+#ifdef CONFIG_HIGHMEM
+#define KVIRT_TOP	PKMAP_BASE
+#else
+#define KVIRT_TOP	(0xfe000000UL)	/* for now, could be FIXMAP_BASE ? */
+#endif
+
+/*
+ * ioremap_bot starts at that address. Early ioremaps move down from there,
+ * until mem_init() at which point this becomes the top of the vmalloc
+ * and ioremap space
+ */
+#ifdef CONFIG_NOT_COHERENT_CACHE
+#define IOREMAP_TOP	((KVIRT_TOP - CONFIG_CONSISTENT_SIZE) & PAGE_MASK)
+#else
+#define IOREMAP_TOP	KVIRT_TOP
+#endif
+
+/*
+ * Just any arbitrary offset to the start of the vmalloc VM area: the
+ * current 16MB value just means that there will be a 16MB "hole" after the
+ * physical memory until the kernel virtual memory starts.  That means that
+ * any out-of-bounds memory accesses will hopefully be caught.
+ * The vmalloc() routines leaves a hole of 4kB between each vmalloced
+ * area for the same reason. ;)
+ *
+ * We no longer map larger than phys RAM with the BATs so we don't have
+ * to worry about the VMALLOC_OFFSET causing problems.  We do have to worry
+ * about clashes between our early calls to ioremap() that start growing down
+ * from ioremap_base being run into the VM area allocations (growing upwards
+ * from VMALLOC_START).  For this reason we have ioremap_bot to check when
+ * we actually run into our mappings setup in the early boot with the VM
+ * system.  This really does become a problem for machines with good amounts
+ * of RAM.  -- Cort
+ */
+#define VMALLOC_OFFSET (0x1000000) /* 16M */
+#ifdef PPC_PIN_SIZE
+#define VMALLOC_START (((_ALIGN((long)high_memory, PPC_PIN_SIZE) + VMALLOC_OFFSET) & ~(VMALLOC_OFFSET-1)))
+#else
+#define VMALLOC_START ((((long)high_memory + VMALLOC_OFFSET) & ~(VMALLOC_OFFSET-1)))
+#endif
+#define VMALLOC_END	ioremap_bot
+
+/*
+ * Bits in a linux-style PTE.  These match the bits in the
+ * (hardware-defined) PowerPC PTE as closely as possible.
+ */
+
+#if defined(CONFIG_40x)
+#include <asm/pte-40x.h>
+#elif defined(CONFIG_44x)
+#include <asm/pte-44x.h>
+#elif defined(CONFIG_FSL_BOOKE) && defined(CONFIG_PTE_64BIT)
+#include <asm/pte-book3e.h>
+#elif defined(CONFIG_FSL_BOOKE)
+#include <asm/pte-fsl-booke.h>
+#elif defined(CONFIG_8xx)
+#include <asm/pte-8xx.h>
+#else /* CONFIG_6xx */
+#include <asm/book3s/32/hash.h>
+#endif
+
+/* And here we include common definitions */
+#include <asm/pte-common.h>
+
+#ifndef __ASSEMBLY__
+
+#define pte_clear(mm, addr, ptep) \
+	do { pte_update(ptep, ~_PAGE_HASHPTE, 0); } while (0)
+
+#define pmd_none(pmd)		(!pmd_val(pmd))
+#define	pmd_bad(pmd)		(pmd_val(pmd) & _PMD_BAD)
+#define	pmd_present(pmd)	(pmd_val(pmd) & _PMD_PRESENT_MASK)
+#define	pmd_clear(pmdp)		do { pmd_val(*(pmdp)) = 0; } while (0)
+
+/*
+ * When flushing the tlb entry for a page, we also need to flush the hash
+ * table entry.  flush_hash_pages is assembler (for speed) in hashtable.S.
+ */
+extern int flush_hash_pages(unsigned context, unsigned long va,
+			    unsigned long pmdval, int count);
+
+/* Add an HPTE to the hash table */
+extern void add_hash_page(unsigned context, unsigned long va,
+			  unsigned long pmdval);
+
+/* Flush an entry from the TLB/hash table */
+extern void flush_hash_entry(struct mm_struct *mm, pte_t *ptep,
+			     unsigned long address);
+
+/*
+ * PTE updates. This function is called whenever an existing
+ * valid PTE is updated. This does -not- include set_pte_at()
+ * which nowadays only sets a new PTE.
+ *
+ * Depending on the type of MMU, we may need to use atomic updates
+ * and the PTE may be either 32 or 64 bit wide. In the latter case,
+ * when using atomic updates, only the low part of the PTE is
+ * accessed atomically.
+ *
+ * In addition, on 44x, we also maintain a global flag indicating
+ * that an executable user mapping was modified, which is needed
+ * to properly flush the virtually tagged instruction cache of
+ * those implementations.
+ */
+#ifndef CONFIG_PTE_64BIT
+static inline unsigned long pte_update(pte_t *p,
+				       unsigned long clr,
+				       unsigned long set)
+{
+#ifdef PTE_ATOMIC_UPDATES
+	unsigned long old, tmp;
+
+	__asm__ __volatile__("\
+1:	lwarx	%0,0,%3\n\
+	andc	%1,%0,%4\n\
+	or	%1,%1,%5\n"
+	PPC405_ERR77(0,%3)
+"	stwcx.	%1,0,%3\n\
+	bne-	1b"
+	: "=&r" (old), "=&r" (tmp), "=m" (*p)
+	: "r" (p), "r" (clr), "r" (set), "m" (*p)
+	: "cc" );
+#else /* PTE_ATOMIC_UPDATES */
+	unsigned long old = pte_val(*p);
+	*p = __pte((old & ~clr) | set);
+#endif /* !PTE_ATOMIC_UPDATES */
+
+#ifdef CONFIG_44x
+	if ((old & _PAGE_USER) && (old & _PAGE_EXEC))
+		icache_44x_need_flush = 1;
+#endif
+	return old;
+}
+#else /* CONFIG_PTE_64BIT */
+static inline unsigned long long pte_update(pte_t *p,
+					    unsigned long clr,
+					    unsigned long set)
+{
+#ifdef PTE_ATOMIC_UPDATES
+	unsigned long long old;
+	unsigned long tmp;
+
+	__asm__ __volatile__("\
+1:	lwarx	%L0,0,%4\n\
+	lwzx	%0,0,%3\n\
+	andc	%1,%L0,%5\n\
+	or	%1,%1,%6\n"
+	PPC405_ERR77(0,%3)
+"	stwcx.	%1,0,%4\n\
+	bne-	1b"
+	: "=&r" (old), "=&r" (tmp), "=m" (*p)
+	: "r" (p), "r" ((unsigned long)(p) + 4), "r" (clr), "r" (set), "m" (*p)
+	: "cc" );
+#else /* PTE_ATOMIC_UPDATES */
+	unsigned long long old = pte_val(*p);
+	*p = __pte((old & ~(unsigned long long)clr) | set);
+#endif /* !PTE_ATOMIC_UPDATES */
+
+#ifdef CONFIG_44x
+	if ((old & _PAGE_USER) && (old & _PAGE_EXEC))
+		icache_44x_need_flush = 1;
+#endif
+	return old;
+}
+#endif /* CONFIG_PTE_64BIT */
+
+/*
+ * 2.6 calls this without flushing the TLB entry; this is wrong
+ * for our hash-based implementation, we fix that up here.
+ */
+#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
+static inline int __ptep_test_and_clear_young(unsigned int context, unsigned long addr, pte_t *ptep)
+{
+	unsigned long old;
+	old = pte_update(ptep, _PAGE_ACCESSED, 0);
+#if _PAGE_HASHPTE != 0
+	if (old & _PAGE_HASHPTE) {
+		unsigned long ptephys = __pa(ptep) & PAGE_MASK;
+		flush_hash_pages(context, addr, ptephys, 1);
+	}
+#endif
+	return (old & _PAGE_ACCESSED) != 0;
+}
+#define ptep_test_and_clear_young(__vma, __addr, __ptep) \
+	__ptep_test_and_clear_young((__vma)->vm_mm->context.id, __addr, __ptep)
+
+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
+static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
+				       pte_t *ptep)
+{
+	return __pte(pte_update(ptep, ~_PAGE_HASHPTE, 0));
+}
+
+#define __HAVE_ARCH_PTEP_SET_WRPROTECT
+static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr,
+				      pte_t *ptep)
+{
+	pte_update(ptep, (_PAGE_RW | _PAGE_HWWRITE), _PAGE_RO);
+}
+static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
+					   unsigned long addr, pte_t *ptep)
+{
+	ptep_set_wrprotect(mm, addr, ptep);
+}
+
+
+static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry)
+{
+	unsigned long set = pte_val(entry) &
+		(_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC);
+	unsigned long clr = ~pte_val(entry) & _PAGE_RO;
+
+	pte_update(ptep, clr, set);
+}
+
+#define __HAVE_ARCH_PTE_SAME
+#define pte_same(A,B)	(((pte_val(A) ^ pte_val(B)) & ~_PAGE_HASHPTE) == 0)
+
+/*
+ * Note that on Book E processors, the pmd contains the kernel virtual
+ * (lowmem) address of the pte page.  The physical address is less useful
+ * because everything runs with translation enabled (even the TLB miss
+ * handler).  On everything else the pmd contains the physical address
+ * of the pte page.  -- paulus
+ */
+#ifndef CONFIG_BOOKE
+#define pmd_page_vaddr(pmd)	\
+	((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))
+#define pmd_page(pmd)		\
+	pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)
+#else
+#define pmd_page_vaddr(pmd)	\
+	((unsigned long) (pmd_val(pmd) & PAGE_MASK))
+#define pmd_page(pmd)		\
+	pfn_to_page((__pa(pmd_val(pmd)) >> PAGE_SHIFT))
+#endif
+
+/* to find an entry in a kernel page-table-directory */
+#define pgd_offset_k(address) pgd_offset(&init_mm, address)
+
+/* to find an entry in a page-table-directory */
+#define pgd_index(address)	 ((address) >> PGDIR_SHIFT)
+#define pgd_offset(mm, address)	 ((mm)->pgd + pgd_index(address))
+
+/* Find an entry in the third-level page table.. */
+#define pte_index(address)		\
+	(((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
+#define pte_offset_kernel(dir, addr)	\
+	((pte_t *) pmd_page_vaddr(*(dir)) + pte_index(addr))
+#define pte_offset_map(dir, addr)		\
+	((pte_t *) kmap_atomic(pmd_page(*(dir))) + pte_index(addr))
+#define pte_unmap(pte)		kunmap_atomic(pte)
+
+/*
+ * Encode and decode a swap entry.
+ * Note that the bits we use in a PTE for representing a swap entry
+ * must not include the _PAGE_PRESENT bit or the _PAGE_HASHPTE bit (if used).
+ *   -- paulus
+ */
+#define __swp_type(entry)		((entry).val & 0x1f)
+#define __swp_offset(entry)		((entry).val >> 5)
+#define __swp_entry(type, offset)	((swp_entry_t) { (type) | ((offset) << 5) })
+#define __pte_to_swp_entry(pte)		((swp_entry_t) { pte_val(pte) >> 3 })
+#define __swp_entry_to_pte(x)		((pte_t) { (x).val << 3 })
+
+#ifndef CONFIG_PPC_4K_PAGES
+void pgtable_cache_init(void);
+#else
+/*
+ * No page table caches to initialise
+ */
+#define pgtable_cache_init()	do { } while (0)
+#endif
+
+extern int get_pteptr(struct mm_struct *mm, unsigned long addr, pte_t **ptep,
+		      pmd_t **pmdp);
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /*  _ASM_POWERPC_BOOK3S_32_PGTABLE_H */
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
new file mode 100644
index 000000000000..cdd5284d9eaa
--- /dev/null
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -0,0 +1,626 @@
+#ifndef _ASM_POWERPC_BOOK3S_64_PGTABLE_H_
+#define _ASM_POWERPC_BOOK3S_64_PGTABLE_H_
+/*
+ * This file contains the functions and defines necessary to modify and use
+ * the ppc64 hashed page table.
+ */
+
+#ifdef CONFIG_PPC_64K_PAGES
+#include <asm/pgtable-ppc64-64k.h>
+#else
+#include <asm/pgtable-ppc64-4k.h>
+#endif
+#include <asm/barrier.h>
+
+#define FIRST_USER_ADDRESS	0UL
+
+/*
+ * Size of EA range mapped by our pagetables.
+ */
+#define PGTABLE_EADDR_SIZE (PTE_INDEX_SIZE + PMD_INDEX_SIZE + \
+			    PUD_INDEX_SIZE + PGD_INDEX_SIZE + PAGE_SHIFT)
+#define PGTABLE_RANGE (ASM_CONST(1) << PGTABLE_EADDR_SIZE)
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define PMD_CACHE_INDEX	(PMD_INDEX_SIZE + 1)
+#else
+#define PMD_CACHE_INDEX	PMD_INDEX_SIZE
+#endif
+/*
+ * Define the address range of the kernel non-linear virtual area
+ */
+
+#ifdef CONFIG_PPC_BOOK3E
+#define KERN_VIRT_START ASM_CONST(0x8000000000000000)
+#else
+#define KERN_VIRT_START ASM_CONST(0xD000000000000000)
+#endif
+#define KERN_VIRT_SIZE	ASM_CONST(0x0000100000000000)
+
+/*
+ * The vmalloc space starts at the beginning of that region, and
+ * occupies half of it on hash CPUs and a quarter of it on Book3E
+ * (we keep a quarter for the virtual memmap)
+ */
+#define VMALLOC_START	KERN_VIRT_START
+#ifdef CONFIG_PPC_BOOK3E
+#define VMALLOC_SIZE	(KERN_VIRT_SIZE >> 2)
+#else
+#define VMALLOC_SIZE	(KERN_VIRT_SIZE >> 1)
+#endif
+#define VMALLOC_END	(VMALLOC_START + VMALLOC_SIZE)
+
+/*
+ * The second half of the kernel virtual space is used for IO mappings,
+ * it's itself carved into the PIO region (ISA and PHB IO space) and
+ * the ioremap space
+ *
+ *  ISA_IO_BASE = KERN_IO_START, 64K reserved area
+ *  PHB_IO_BASE = ISA_IO_BASE + 64K to ISA_IO_BASE + 2G, PHB IO spaces
+ * IOREMAP_BASE = ISA_IO_BASE + 2G to VMALLOC_START + PGTABLE_RANGE
+ */
+#define KERN_IO_START	(KERN_VIRT_START + (KERN_VIRT_SIZE >> 1))
+#define FULL_IO_SIZE	0x80000000ul
+#define  ISA_IO_BASE	(KERN_IO_START)
+#define  ISA_IO_END	(KERN_IO_START + 0x10000ul)
+#define  PHB_IO_BASE	(ISA_IO_END)
+#define  PHB_IO_END	(KERN_IO_START + FULL_IO_SIZE)
+#define IOREMAP_BASE	(PHB_IO_END)
+#define IOREMAP_END	(KERN_VIRT_START + KERN_VIRT_SIZE)
+
+
+/*
+ * Region IDs
+ */
+#define REGION_SHIFT		60UL
+#define REGION_MASK		(0xfUL << REGION_SHIFT)
+#define REGION_ID(ea)		(((unsigned long)(ea)) >> REGION_SHIFT)
+
+#define VMALLOC_REGION_ID	(REGION_ID(VMALLOC_START))
+#define KERNEL_REGION_ID	(REGION_ID(PAGE_OFFSET))
+#define VMEMMAP_REGION_ID	(0xfUL)	/* Server only */
+#define USER_REGION_ID		(0UL)
+
+/*
+ * Defines the address of the vmemap area, in its own region on
+ * hash table CPUs and after the vmalloc space on Book3E
+ */
+#ifdef CONFIG_PPC_BOOK3E
+#define VMEMMAP_BASE		VMALLOC_END
+#define VMEMMAP_END		KERN_IO_START
+#else
+#define VMEMMAP_BASE		(VMEMMAP_REGION_ID << REGION_SHIFT)
+#endif
+#define vmemmap			((struct page *)VMEMMAP_BASE)
+
+
+/*
+ * Include the PTE bits definitions
+ */
+#ifdef CONFIG_PPC_BOOK3S
+#include <asm/book3s/64/hash.h>
+#else
+#include <asm/pte-book3e.h>
+#endif
+#include <asm/pte-common.h>
+
+#ifdef CONFIG_PPC_MM_SLICES
+#define HAVE_ARCH_UNMAPPED_AREA
+#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
+#endif /* CONFIG_PPC_MM_SLICES */
+
+#ifndef __ASSEMBLY__
+
+/*
+ * This is the default implementation of various PTE accessors, it's
+ * used in all cases except Book3S with 64K pages where we have a
+ * concept of sub-pages
+ */
+#ifndef __real_pte
+
+#ifdef CONFIG_STRICT_MM_TYPECHECKS
+#define __real_pte(e,p)		((real_pte_t){(e)})
+#define __rpte_to_pte(r)	((r).pte)
+#else
+#define __real_pte(e,p)		(e)
+#define __rpte_to_pte(r)	(__pte(r))
+#endif
+#define __rpte_to_hidx(r,index)	(pte_val(__rpte_to_pte(r)) >> 12)
+
+#define pte_iterate_hashed_subpages(rpte, psize, va, index, shift)       \
+	do {							         \
+		index = 0;					         \
+		shift = mmu_psize_defs[psize].shift;		         \
+
+#define pte_iterate_hashed_end() } while(0)
+
+/*
+ * We expect this to be called only for user addresses or kernel virtual
+ * addresses other than the linear mapping.
+ */
+#define pte_pagesize_index(mm, addr, pte)	MMU_PAGE_4K
+
+#endif /* __real_pte */
+
+
+/* pte_clear moved to later in this file */
+
+#define PMD_BAD_BITS		(PTE_TABLE_SIZE-1)
+#define PUD_BAD_BITS		(PMD_TABLE_SIZE-1)
+
+#define pmd_set(pmdp, pmdval) 	(pmd_val(*(pmdp)) = (pmdval))
+#define pmd_none(pmd)		(!pmd_val(pmd))
+#define	pmd_bad(pmd)		(!is_kernel_addr(pmd_val(pmd)) \
+				 || (pmd_val(pmd) & PMD_BAD_BITS))
+#define	pmd_present(pmd)	(!pmd_none(pmd))
+#define	pmd_clear(pmdp)		(pmd_val(*(pmdp)) = 0)
+#define pmd_page_vaddr(pmd)	(pmd_val(pmd) & ~PMD_MASKED_BITS)
+extern struct page *pmd_page(pmd_t pmd);
+
+#define pud_set(pudp, pudval)	(pud_val(*(pudp)) = (pudval))
+#define pud_none(pud)		(!pud_val(pud))
+#define	pud_bad(pud)		(!is_kernel_addr(pud_val(pud)) \
+				 || (pud_val(pud) & PUD_BAD_BITS))
+#define pud_present(pud)	(pud_val(pud) != 0)
+#define pud_clear(pudp)		(pud_val(*(pudp)) = 0)
+#define pud_page_vaddr(pud)	(pud_val(pud) & ~PUD_MASKED_BITS)
+
+extern struct page *pud_page(pud_t pud);
+
+static inline pte_t pud_pte(pud_t pud)
+{
+	return __pte(pud_val(pud));
+}
+
+static inline pud_t pte_pud(pte_t pte)
+{
+	return __pud(pte_val(pte));
+}
+#define pud_write(pud)		pte_write(pud_pte(pud))
+#define pgd_set(pgdp, pudp)	({pgd_val(*(pgdp)) = (unsigned long)(pudp);})
+#define pgd_write(pgd)		pte_write(pgd_pte(pgd))
+
+/*
+ * Find an entry in a page-table-directory.  We combine the address region
+ * (the high order N bits) and the pgd portion of the address.
+ */
+#define pgd_index(address) (((address) >> (PGDIR_SHIFT)) & (PTRS_PER_PGD - 1))
+
+#define pgd_offset(mm, address)	 ((mm)->pgd + pgd_index(address))
+
+#define pmd_offset(pudp,addr) \
+  (((pmd_t *) pud_page_vaddr(*(pudp))) + (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1)))
+
+#define pte_offset_kernel(dir,addr) \
+  (((pte_t *) pmd_page_vaddr(*(dir))) + (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)))
+
+#define pte_offset_map(dir,addr)	pte_offset_kernel((dir), (addr))
+#define pte_unmap(pte)			do { } while(0)
+
+/* to find an entry in a kernel page-table-directory */
+/* This now only contains the vmalloc pages */
+#define pgd_offset_k(address) pgd_offset(&init_mm, address)
+extern void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
+			    pte_t *ptep, unsigned long pte, int huge);
+
+/* Atomic PTE updates */
+static inline unsigned long pte_update(struct mm_struct *mm,
+				       unsigned long addr,
+				       pte_t *ptep, unsigned long clr,
+				       unsigned long set,
+				       int huge)
+{
+#ifdef PTE_ATOMIC_UPDATES
+	unsigned long old, tmp;
+
+	__asm__ __volatile__(
+	"1:	ldarx	%0,0,%3		# pte_update\n\
+	andi.	%1,%0,%6\n\
+	bne-	1b \n\
+	andc	%1,%0,%4 \n\
+	or	%1,%1,%7\n\
+	stdcx.	%1,0,%3 \n\
+	bne-	1b"
+	: "=&r" (old), "=&r" (tmp), "=m" (*ptep)
+	: "r" (ptep), "r" (clr), "m" (*ptep), "i" (_PAGE_BUSY), "r" (set)
+	: "cc" );
+#else
+	unsigned long old = pte_val(*ptep);
+	*ptep = __pte((old & ~clr) | set);
+#endif
+	/* huge pages use the old page table lock */
+	if (!huge)
+		assert_pte_locked(mm, addr);
+
+#ifdef CONFIG_PPC_STD_MMU_64
+	if (old & _PAGE_HASHPTE)
+		hpte_need_flush(mm, addr, ptep, old, huge);
+#endif
+
+	return old;
+}
+
+static inline int __ptep_test_and_clear_young(struct mm_struct *mm,
+					      unsigned long addr, pte_t *ptep)
+{
+	unsigned long old;
+
+	if ((pte_val(*ptep) & (_PAGE_ACCESSED | _PAGE_HASHPTE)) == 0)
+		return 0;
+	old = pte_update(mm, addr, ptep, _PAGE_ACCESSED, 0, 0);
+	return (old & _PAGE_ACCESSED) != 0;
+}
+#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
+#define ptep_test_and_clear_young(__vma, __addr, __ptep)		   \
+({									   \
+	int __r;							   \
+	__r = __ptep_test_and_clear_young((__vma)->vm_mm, __addr, __ptep); \
+	__r;								   \
+})
+
+#define __HAVE_ARCH_PTEP_SET_WRPROTECT
+static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr,
+				      pte_t *ptep)
+{
+
+	if ((pte_val(*ptep) & _PAGE_RW) == 0)
+		return;
+
+	pte_update(mm, addr, ptep, _PAGE_RW, 0, 0);
+}
+
+static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
+					   unsigned long addr, pte_t *ptep)
+{
+	if ((pte_val(*ptep) & _PAGE_RW) == 0)
+		return;
+
+	pte_update(mm, addr, ptep, _PAGE_RW, 0, 1);
+}
+
+/*
+ * We currently remove entries from the hashtable regardless of whether
+ * the entry was young or dirty. The generic routines only flush if the
+ * entry was young or dirty which is not good enough.
+ *
+ * We should be more intelligent about this but for the moment we override
+ * these functions and force a tlb flush unconditionally
+ */
+#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
+#define ptep_clear_flush_young(__vma, __address, __ptep)		\
+({									\
+	int __young = __ptep_test_and_clear_young((__vma)->vm_mm, __address, \
+						  __ptep);		\
+	__young;							\
+})
+
+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
+static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
+				       unsigned long addr, pte_t *ptep)
+{
+	unsigned long old = pte_update(mm, addr, ptep, ~0UL, 0, 0);
+	return __pte(old);
+}
+
+static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
+			     pte_t * ptep)
+{
+	pte_update(mm, addr, ptep, ~0UL, 0, 0);
+}
+
+
+/* Set the dirty and/or accessed bits atomically in a linux PTE, this
+ * function doesn't need to flush the hash entry
+ */
+static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry)
+{
+	unsigned long bits = pte_val(entry) &
+		(_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC);
+
+#ifdef PTE_ATOMIC_UPDATES
+	unsigned long old, tmp;
+
+	__asm__ __volatile__(
+	"1:	ldarx	%0,0,%4\n\
+		andi.	%1,%0,%6\n\
+		bne-	1b \n\
+		or	%0,%3,%0\n\
+		stdcx.	%0,0,%4\n\
+		bne-	1b"
+	:"=&r" (old), "=&r" (tmp), "=m" (*ptep)
+	:"r" (bits), "r" (ptep), "m" (*ptep), "i" (_PAGE_BUSY)
+	:"cc");
+#else
+	unsigned long old = pte_val(*ptep);
+	*ptep = __pte(old | bits);
+#endif
+}
+
+#define __HAVE_ARCH_PTE_SAME
+#define pte_same(A,B)	(((pte_val(A) ^ pte_val(B)) & ~_PAGE_HPTEFLAGS) == 0)
+
+#define pte_ERROR(e) \
+	pr_err("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, pte_val(e))
+#define pmd_ERROR(e) \
+	pr_err("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__, pmd_val(e))
+#define pgd_ERROR(e) \
+	pr_err("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e))
+
+/* Encode and de-code a swap entry */
+#define MAX_SWAPFILES_CHECK() do { \
+	BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS); \
+	/*							\
+	 * Don't have overlapping bits with _PAGE_HPTEFLAGS	\
+	 * We filter HPTEFLAGS on set_pte.			\
+	 */							\
+	BUILD_BUG_ON(_PAGE_HPTEFLAGS & (0x1f << _PAGE_BIT_SWAP_TYPE)); \
+	} while (0)
+/*
+ * on pte we don't need handle RADIX_TREE_EXCEPTIONAL_SHIFT;
+ */
+#define SWP_TYPE_BITS 5
+#define __swp_type(x)		(((x).val >> _PAGE_BIT_SWAP_TYPE) \
+				& ((1UL << SWP_TYPE_BITS) - 1))
+#define __swp_offset(x)		((x).val >> PTE_RPN_SHIFT)
+#define __swp_entry(type, offset)	((swp_entry_t) { \
+					((type) << _PAGE_BIT_SWAP_TYPE) \
+					| ((offset) << PTE_RPN_SHIFT) })
+
+#define __pte_to_swp_entry(pte)		((swp_entry_t) { pte_val((pte)) })
+#define __swp_entry_to_pte(x)		__pte((x).val)
+
+void pgtable_cache_add(unsigned shift, void (*ctor)(void *));
+void pgtable_cache_init(void);
+#endif /* __ASSEMBLY__ */
+
+/*
+ * THP pages can't be special. So use the _PAGE_SPECIAL
+ */
+#define _PAGE_SPLITTING _PAGE_SPECIAL
+
+/*
+ * We need to differentiate between explicit huge page and THP huge
+ * page, since THP huge page also need to track real subpage details
+ */
+#define _PAGE_THP_HUGE  _PAGE_4K_PFN
+
+/*
+ * set of bits not changed in pmd_modify.
+ */
+#define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS |		\
+			 _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_SPLITTING | \
+			 _PAGE_THP_HUGE)
+
+#ifndef __ASSEMBLY__
+/*
+ * The linux hugepage PMD now include the pmd entries followed by the address
+ * to the stashed pgtable_t. The stashed pgtable_t contains the hpte bits.
+ * [ 1 bit secondary | 3 bit hidx | 1 bit valid | 000]. We use one byte per
+ * each HPTE entry. With 16MB hugepage and 64K HPTE we need 256 entries and
+ * with 4K HPTE we need 4096 entries. Both will fit in a 4K pgtable_t.
+ *
+ * The last three bits are intentionally left to zero. This memory location
+ * are also used as normal page PTE pointers. So if we have any pointers
+ * left around while we collapse a hugepage, we need to make sure
+ * _PAGE_PRESENT bit of that is zero when we look at them
+ */
+static inline unsigned int hpte_valid(unsigned char *hpte_slot_array, int index)
+{
+	return (hpte_slot_array[index] >> 3) & 0x1;
+}
+
+static inline unsigned int hpte_hash_index(unsigned char *hpte_slot_array,
+					   int index)
+{
+	return hpte_slot_array[index] >> 4;
+}
+
+static inline void mark_hpte_slot_valid(unsigned char *hpte_slot_array,
+					unsigned int index, unsigned int hidx)
+{
+	hpte_slot_array[index] = hidx << 4 | 0x1 << 3;
+}
+
+struct page *realmode_pfn_to_page(unsigned long pfn);
+
+static inline char *get_hpte_slot_array(pmd_t *pmdp)
+{
+	/*
+	 * The hpte hindex is stored in the pgtable whose address is in the
+	 * second half of the PMD
+	 *
+	 * Order this load with the test for pmd_trans_huge in the caller
+	 */
+	smp_rmb();
+	return *(char **)(pmdp + PTRS_PER_PMD);
+
+
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+extern void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
+				   pmd_t *pmdp, unsigned long old_pmd);
+extern pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot);
+extern pmd_t mk_pmd(struct page *page, pgprot_t pgprot);
+extern pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot);
+extern void set_pmd_at(struct mm_struct *mm, unsigned long addr,
+		       pmd_t *pmdp, pmd_t pmd);
+extern void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
+				 pmd_t *pmd);
+/*
+ *
+ * For core kernel code by design pmd_trans_huge is never run on any hugetlbfs
+ * page. The hugetlbfs page table walking and mangling paths are totally
+ * separated form the core VM paths and they're differentiated by
+ *  VM_HUGETLB being set on vm_flags well before any pmd_trans_huge could run.
+ *
+ * pmd_trans_huge() is defined as false at build time if
+ * CONFIG_TRANSPARENT_HUGEPAGE=n to optimize away code blocks at build
+ * time in such case.
+ *
+ * For ppc64 we need to differntiate from explicit hugepages from THP, because
+ * for THP we also track the subpage details at the pmd level. We don't do
+ * that for explicit huge pages.
+ *
+ */
+static inline int pmd_trans_huge(pmd_t pmd)
+{
+	/*
+	 * leaf pte for huge page, bottom two bits != 00
+	 */
+	return (pmd_val(pmd) & 0x3) && (pmd_val(pmd) & _PAGE_THP_HUGE);
+}
+
+static inline int pmd_trans_splitting(pmd_t pmd)
+{
+	if (pmd_trans_huge(pmd))
+		return pmd_val(pmd) & _PAGE_SPLITTING;
+	return 0;
+}
+
+extern int has_transparent_hugepage(void);
+#else
+static inline void hpte_do_hugepage_flush(struct mm_struct *mm,
+					  unsigned long addr, pmd_t *pmdp,
+					  unsigned long old_pmd)
+{
+
+	WARN(1, "%s called with THP disabled\n", __func__);
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+static inline int pmd_large(pmd_t pmd)
+{
+	/*
+	 * leaf pte for huge page, bottom two bits != 00
+	 */
+	return ((pmd_val(pmd) & 0x3) != 0x0);
+}
+
+static inline pte_t pmd_pte(pmd_t pmd)
+{
+	return __pte(pmd_val(pmd));
+}
+
+static inline pmd_t pte_pmd(pte_t pte)
+{
+	return __pmd(pte_val(pte));
+}
+
+static inline pte_t *pmdp_ptep(pmd_t *pmd)
+{
+	return (pte_t *)pmd;
+}
+
+#define pmd_pfn(pmd)		pte_pfn(pmd_pte(pmd))
+#define pmd_dirty(pmd)		pte_dirty(pmd_pte(pmd))
+#define pmd_young(pmd)		pte_young(pmd_pte(pmd))
+#define pmd_mkold(pmd)		pte_pmd(pte_mkold(pmd_pte(pmd)))
+#define pmd_wrprotect(pmd)	pte_pmd(pte_wrprotect(pmd_pte(pmd)))
+#define pmd_mkdirty(pmd)	pte_pmd(pte_mkdirty(pmd_pte(pmd)))
+#define pmd_mkyoung(pmd)	pte_pmd(pte_mkyoung(pmd_pte(pmd)))
+#define pmd_mkwrite(pmd)	pte_pmd(pte_mkwrite(pmd_pte(pmd)))
+
+#define __HAVE_ARCH_PMD_WRITE
+#define pmd_write(pmd)		pte_write(pmd_pte(pmd))
+
+static inline pmd_t pmd_mkhuge(pmd_t pmd)
+{
+	/* Do nothing, mk_pmd() does this part.  */
+	return pmd;
+}
+
+static inline pmd_t pmd_mknotpresent(pmd_t pmd)
+{
+	pmd_val(pmd) &= ~_PAGE_PRESENT;
+	return pmd;
+}
+
+static inline pmd_t pmd_mksplitting(pmd_t pmd)
+{
+	pmd_val(pmd) |= _PAGE_SPLITTING;
+	return pmd;
+}
+
+#define __HAVE_ARCH_PMD_SAME
+static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
+{
+	return (((pmd_val(pmd_a) ^ pmd_val(pmd_b)) & ~_PAGE_HPTEFLAGS) == 0);
+}
+
+#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
+extern int pmdp_set_access_flags(struct vm_area_struct *vma,
+				 unsigned long address, pmd_t *pmdp,
+				 pmd_t entry, int dirty);
+
+extern unsigned long pmd_hugepage_update(struct mm_struct *mm,
+					 unsigned long addr,
+					 pmd_t *pmdp,
+					 unsigned long clr,
+					 unsigned long set);
+
+static inline int __pmdp_test_and_clear_young(struct mm_struct *mm,
+					      unsigned long addr, pmd_t *pmdp)
+{
+	unsigned long old;
+
+	if ((pmd_val(*pmdp) & (_PAGE_ACCESSED | _PAGE_HASHPTE)) == 0)
+		return 0;
+	old = pmd_hugepage_update(mm, addr, pmdp, _PAGE_ACCESSED, 0);
+	return ((old & _PAGE_ACCESSED) != 0);
+}
+
+#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
+extern int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+				     unsigned long address, pmd_t *pmdp);
+#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
+extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
+				  unsigned long address, pmd_t *pmdp);
+
+#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
+extern pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
+				     unsigned long addr, pmd_t *pmdp);
+
+#define __HAVE_ARCH_PMDP_SET_WRPROTECT
+static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
+				      pmd_t *pmdp)
+{
+
+	if ((pmd_val(*pmdp) & _PAGE_RW) == 0)
+		return;
+
+	pmd_hugepage_update(mm, addr, pmdp, _PAGE_RW, 0);
+}
+
+#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
+extern void pmdp_splitting_flush(struct vm_area_struct *vma,
+				 unsigned long address, pmd_t *pmdp);
+
+extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
+				 unsigned long address, pmd_t *pmdp);
+#define pmdp_collapse_flush pmdp_collapse_flush
+
+#define __HAVE_ARCH_PGTABLE_DEPOSIT
+extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+				       pgtable_t pgtable);
+#define __HAVE_ARCH_PGTABLE_WITHDRAW
+extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
+
+#define __HAVE_ARCH_PMDP_INVALIDATE
+extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
+			    pmd_t *pmdp);
+
+#define pmd_move_must_withdraw pmd_move_must_withdraw
+struct spinlock;
+static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
+					 struct spinlock *old_pmd_ptl)
+{
+	/*
+	 * Archs like ppc64 use pgtable to store per pmd
+	 * specific information. So when we switch the pmd,
+	 * we should also withdraw and deposit the pgtable
+	 */
+	return true;
+}
+#endif /* __ASSEMBLY__ */
+#endif /* _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ */
diff --git a/arch/powerpc/include/asm/book3s/pgtable.h b/arch/powerpc/include/asm/book3s/pgtable.h
new file mode 100644
index 000000000000..a8d8e5152bd4
--- /dev/null
+++ b/arch/powerpc/include/asm/book3s/pgtable.h
@@ -0,0 +1,10 @@
+#ifndef _ASM_POWERPC_BOOK3S_PGTABLE_H
+#define _ASM_POWERPC_BOOK3S_PGTABLE_H
+
+#ifdef CONFIG_PPC64
+#include <asm/book3s/64/pgtable.h>
+#else
+#include <asm/book3s/32/pgtable.h>
+#endif
+
+#endif
diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h
index ba3342bbdbda..7352d3f212df 100644
--- a/arch/powerpc/include/asm/mmu-hash64.h
+++ b/arch/powerpc/include/asm/mmu-hash64.h
@@ -21,7 +21,7 @@
  * need for various slices related matters. Note that this isn't the
  * complete pgtable.h but only a portion of it.
  */
-#include <asm/pgtable-ppc64.h>
+#include <asm/book3s/64/pgtable.h>
 #include <asm/bug.h>
 #include <asm/processor.h>
 
diff --git a/arch/powerpc/include/asm/pgtable-ppc32.h b/arch/powerpc/include/asm/pgtable-ppc32.h
index 1a58a05be99c..aac6547b0823 100644
--- a/arch/powerpc/include/asm/pgtable-ppc32.h
+++ b/arch/powerpc/include/asm/pgtable-ppc32.h
@@ -115,8 +115,6 @@ extern int icache_44x_need_flush;
 #include <asm/pte-fsl-booke.h>
 #elif defined(CONFIG_8xx)
 #include <asm/pte-8xx.h>
-#else /* CONFIG_6xx */
-#include <asm/book3s/32/hash.h>
 #endif
 
 /* And here we include common definitions */
diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h
index b36a932abdfb..1ef0fea32e1e 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -97,11 +97,7 @@
 /*
  * Include the PTE bits definitions
  */
-#ifdef CONFIG_PPC_BOOK3S
-#include <asm/book3s/64/hash.h>
-#else
 #include <asm/pte-book3e.h>
-#endif
 #include <asm/pte-common.h>
 
 #ifdef CONFIG_PPC_MM_SLICES
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index b64b4212b71f..c304d0767919 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -13,11 +13,15 @@ struct mm_struct;
 
 #endif /* !__ASSEMBLY__ */
 
+#ifdef CONFIG_PPC_BOOK3S
+#include <asm/book3s/pgtable.h>
+#else
 #if defined(CONFIG_PPC64)
 #  include <asm/pgtable-ppc64.h>
 #else
 #  include <asm/pgtable-ppc32.h>
 #endif
+#endif /* !CONFIG_PPC_BOOK3S */
 
 /*
  * We save the slot number & secondary bit in the second half of the

From ab537dca2f3303a6ef646c33cccf56eaa8a76f9c Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 1 Dec 2015 09:06:30 +0530
Subject: [PATCH 038/149] powerpc/mm: Move hash specific pte width and other
 defines to book3s

This further makes a copy of the pte defines in book3s/64/hash*.h. This
removes the dependency on pgtable-ppc64-4k.h and pgtable-ppc64-64k.h.

Acked-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/64/hash-4k.h  | 86 ++++++++++++++++++-
 arch/powerpc/include/asm/book3s/64/hash-64k.h | 46 +++++++++-
 arch/powerpc/include/asm/book3s/64/pgtable.h  |  6 +-
 3 files changed, 129 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h
index c134e809aac3..f2c51cd61f69 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-4k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h
@@ -1,4 +1,51 @@
-/* To be include by pgtable-hash64.h only */
+#ifndef _ASM_POWERPC_BOOK3S_64_HASH_4K_H
+#define _ASM_POWERPC_BOOK3S_64_HASH_4K_H
+/*
+ * Entries per page directory level.  The PTE level must use a 64b record
+ * for each page table entry.  The PMD and PGD level use a 32b record for
+ * each entry by assuming that each entry is page aligned.
+ */
+#define PTE_INDEX_SIZE  9
+#define PMD_INDEX_SIZE  7
+#define PUD_INDEX_SIZE  9
+#define PGD_INDEX_SIZE  9
+
+#ifndef __ASSEMBLY__
+#define PTE_TABLE_SIZE	(sizeof(pte_t) << PTE_INDEX_SIZE)
+#define PMD_TABLE_SIZE	(sizeof(pmd_t) << PMD_INDEX_SIZE)
+#define PUD_TABLE_SIZE	(sizeof(pud_t) << PUD_INDEX_SIZE)
+#define PGD_TABLE_SIZE	(sizeof(pgd_t) << PGD_INDEX_SIZE)
+#endif	/* __ASSEMBLY__ */
+
+#define PTRS_PER_PTE	(1 << PTE_INDEX_SIZE)
+#define PTRS_PER_PMD	(1 << PMD_INDEX_SIZE)
+#define PTRS_PER_PUD	(1 << PUD_INDEX_SIZE)
+#define PTRS_PER_PGD	(1 << PGD_INDEX_SIZE)
+
+/* PMD_SHIFT determines what a second-level page table entry can map */
+#define PMD_SHIFT	(PAGE_SHIFT + PTE_INDEX_SIZE)
+#define PMD_SIZE	(1UL << PMD_SHIFT)
+#define PMD_MASK	(~(PMD_SIZE-1))
+
+/* With 4k base page size, hugepage PTEs go at the PMD level */
+#define MIN_HUGEPTE_SHIFT	PMD_SHIFT
+
+/* PUD_SHIFT determines what a third-level page table entry can map */
+#define PUD_SHIFT	(PMD_SHIFT + PMD_INDEX_SIZE)
+#define PUD_SIZE	(1UL << PUD_SHIFT)
+#define PUD_MASK	(~(PUD_SIZE-1))
+
+/* PGDIR_SHIFT determines what a fourth-level page table entry can map */
+#define PGDIR_SHIFT	(PUD_SHIFT + PUD_INDEX_SIZE)
+#define PGDIR_SIZE	(1UL << PGDIR_SHIFT)
+#define PGDIR_MASK	(~(PGDIR_SIZE-1))
+
+/* Bits to mask out from a PMD to get to the PTE page */
+#define PMD_MASKED_BITS		0
+/* Bits to mask out from a PUD to get to the PMD page */
+#define PUD_MASKED_BITS		0
+/* Bits to mask out from a PGD to get to the PUD page */
+#define PGD_MASKED_BITS		0
 
 /* PTE bits */
 #define _PAGE_HASHPTE	0x0400 /* software: pte has an associated HPTE */
@@ -15,3 +62,40 @@
 /* shift to put page number into pte */
 #define PTE_RPN_SHIFT	(17)
 
+#ifndef __ASSEMBLY__
+/*
+ * 4-level page tables related bits
+ */
+
+#define pgd_none(pgd)		(!pgd_val(pgd))
+#define pgd_bad(pgd)		(pgd_val(pgd) == 0)
+#define pgd_present(pgd)	(pgd_val(pgd) != 0)
+#define pgd_clear(pgdp)		(pgd_val(*(pgdp)) = 0)
+#define pgd_page_vaddr(pgd)	(pgd_val(pgd) & ~PGD_MASKED_BITS)
+
+static inline pte_t pgd_pte(pgd_t pgd)
+{
+	return __pte(pgd_val(pgd));
+}
+
+static inline pgd_t pte_pgd(pte_t pte)
+{
+	return __pgd(pte_val(pte));
+}
+extern struct page *pgd_page(pgd_t pgd);
+
+#define pud_offset(pgdp, addr)	\
+  (((pud_t *) pgd_page_vaddr(*(pgdp))) + \
+    (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)))
+
+#define pud_ERROR(e) \
+	pr_err("%s:%d: bad pud %08lx.\n", __FILE__, __LINE__, pud_val(e))
+
+/*
+ * On all 4K setups, remap_4k_pfn() equates to remap_pfn_range() */
+#define remap_4k_pfn(vma, addr, pfn, prot)	\
+	remap_pfn_range((vma), (addr), (pfn), PAGE_SIZE, (prot))
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* _ASM_POWERPC_BOOK3S_64_HASH_4K_H */
diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h
index 4f4ec2ab45c9..ee073822145d 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
@@ -1,4 +1,35 @@
-/* To be include by pgtable-hash64.h only */
+#ifndef _ASM_POWERPC_BOOK3S_64_HASH_64K_H
+#define _ASM_POWERPC_BOOK3S_64_HASH_64K_H
+
+#include <asm-generic/pgtable-nopud.h>
+
+#define PTE_INDEX_SIZE  8
+#define PMD_INDEX_SIZE  10
+#define PUD_INDEX_SIZE	0
+#define PGD_INDEX_SIZE  12
+
+#define PTRS_PER_PTE	(1 << PTE_INDEX_SIZE)
+#define PTRS_PER_PMD	(1 << PMD_INDEX_SIZE)
+#define PTRS_PER_PGD	(1 << PGD_INDEX_SIZE)
+
+/* With 4k base page size, hugepage PTEs go at the PMD level */
+#define MIN_HUGEPTE_SHIFT	PAGE_SHIFT
+
+/* PMD_SHIFT determines what a second-level page table entry can map */
+#define PMD_SHIFT	(PAGE_SHIFT + PTE_INDEX_SIZE)
+#define PMD_SIZE	(1UL << PMD_SHIFT)
+#define PMD_MASK	(~(PMD_SIZE-1))
+
+/* PGDIR_SHIFT determines what a third-level page table entry can map */
+#define PGDIR_SHIFT	(PMD_SHIFT + PMD_INDEX_SIZE)
+#define PGDIR_SIZE	(1UL << PGDIR_SHIFT)
+#define PGDIR_MASK	(~(PGDIR_SIZE-1))
+
+/* Bits to mask out from a PMD to get to the PTE page */
+/* PMDs point to PTE table fragments which are 4K aligned.  */
+#define PMD_MASKED_BITS		0xfff
+/* Bits to mask out from a PGD/PUD to get to the PMD page */
+#define PUD_MASKED_BITS		0x1ff
 
 /* Additional PTE bits (don't change without checking asm in hash_low.S) */
 #define _PAGE_SPECIAL	0x00000400 /* software: special page */
@@ -74,8 +105,8 @@ static inline unsigned long __rpte_to_hidx(real_pte_t rpte, unsigned long index)
 #define __rpte_to_pte(r)	((r).pte)
 #define __rpte_sub_valid(rpte, index) \
 	(pte_val(rpte.pte) & (_PAGE_HPTE_SUB0 >> (index)))
-
-/* Trick: we set __end to va + 64k, which happens works for
+/*
+ * Trick: we set __end to va + 64k, which happens works for
  * a 16M page as well as we want only one iteration
  */
 #define pte_iterate_hashed_subpages(rpte, psize, vpn, index, shift)	\
@@ -99,4 +130,13 @@ static inline unsigned long __rpte_to_hidx(real_pte_t rpte, unsigned long index)
 		remap_pfn_range((vma), (addr), (pfn), PAGE_SIZE,	\
 			__pgprot(pgprot_val((prot)) | _PAGE_4K_PFN)))
 
+#define PTE_TABLE_SIZE	(sizeof(real_pte_t) << PTE_INDEX_SIZE)
+#define PMD_TABLE_SIZE	(sizeof(pmd_t) << PMD_INDEX_SIZE)
+#define PGD_TABLE_SIZE	(sizeof(pgd_t) << PGD_INDEX_SIZE)
+
+#define pgd_pte(pgd)	(pud_pte(((pud_t){ pgd })))
+#define pte_pgd(pte)	((pgd_t)pte_pud(pte))
+
 #endif	/* __ASSEMBLY__ */
+
+#endif /* _ASM_POWERPC_BOOK3S_64_HASH_64K_H */
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index cdd5284d9eaa..2741ac6fbd3d 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -5,11 +5,7 @@
  * the ppc64 hashed page table.
  */
 
-#ifdef CONFIG_PPC_64K_PAGES
-#include <asm/pgtable-ppc64-64k.h>
-#else
-#include <asm/pgtable-ppc64-4k.h>
-#endif
+#include <asm/book3s/64/hash.h>
 #include <asm/barrier.h>
 
 #define FIRST_USER_ADDRESS	0UL

From cbbb8683fb632ecadafcf8a5f81d38156d4274ab Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 1 Dec 2015 09:06:31 +0530
Subject: [PATCH 039/149] powerpc/mm: Delete booke bits from book3s

We also move the __ASSEMBLY__ check towards the end of the header. This
avoids having #ifndef __ASSEMBLY__ all over the header.

Acked-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/32/pgtable.h | 89 +++++---------------
 arch/powerpc/include/asm/book3s/64/pgtable.h | 86 ++++++-------------
 arch/powerpc/include/asm/book3s/pgtable.h    |  1 +
 3 files changed, 47 insertions(+), 129 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h
index 418d2fa3ac7d..5438c0b6aeec 100644
--- a/arch/powerpc/include/asm/book3s/32/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/32/pgtable.h
@@ -3,18 +3,10 @@
 
 #include <asm-generic/pgtable-nopmd.h>
 
-#ifndef __ASSEMBLY__
-#include <linux/sched.h>
-#include <linux/threads.h>
-#include <asm/io.h>			/* For sub-arch specific PPC_PIN_SIZE */
+#include <asm/book3s/32/hash.h>
 
-extern unsigned long ioremap_bot;
-
-#ifdef CONFIG_44x
-extern int icache_44x_need_flush;
-#endif
-
-#endif /* __ASSEMBLY__ */
+/* And here we include common definitions */
+#include <asm/pte-common.h>
 
 /*
  * The normal case is that PTEs are 32-bits and we have a 1-page
@@ -31,28 +23,11 @@ extern int icache_44x_need_flush;
 #define PGDIR_SIZE	(1UL << PGDIR_SHIFT)
 #define PGDIR_MASK	(~(PGDIR_SIZE-1))
 
-/*
- * entries per page directory level: our page-table tree is two-level, so
- * we don't really have any PMD directory.
- */
-#ifndef __ASSEMBLY__
-#define PTE_TABLE_SIZE	(sizeof(pte_t) << PTE_SHIFT)
-#define PGD_TABLE_SIZE	(sizeof(pgd_t) << (32 - PGDIR_SHIFT))
-#endif	/* __ASSEMBLY__ */
-
 #define PTRS_PER_PTE	(1 << PTE_SHIFT)
 #define PTRS_PER_PMD	1
 #define PTRS_PER_PGD	(1 << (32 - PGDIR_SHIFT))
 
 #define USER_PTRS_PER_PGD	(TASK_SIZE / PGDIR_SIZE)
-#define FIRST_USER_ADDRESS	0UL
-
-#define pte_ERROR(e) \
-	pr_err("%s:%d: bad pte %llx.\n", __FILE__, __LINE__, \
-		(unsigned long long)pte_val(e))
-#define pgd_ERROR(e) \
-	pr_err("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e))
-
 /*
  * This is the bottom of the PKMAP area with HIGHMEM or an arbitrary
  * value (for now) on others, from where we can start layout kernel
@@ -100,30 +75,30 @@ extern int icache_44x_need_flush;
 #endif
 #define VMALLOC_END	ioremap_bot
 
+#ifndef __ASSEMBLY__
+#include <linux/sched.h>
+#include <linux/threads.h>
+#include <asm/io.h>			/* For sub-arch specific PPC_PIN_SIZE */
+
+extern unsigned long ioremap_bot;
+
+/*
+ * entries per page directory level: our page-table tree is two-level, so
+ * we don't really have any PMD directory.
+ */
+#define PTE_TABLE_SIZE	(sizeof(pte_t) << PTE_SHIFT)
+#define PGD_TABLE_SIZE	(sizeof(pgd_t) << (32 - PGDIR_SHIFT))
+
+#define pte_ERROR(e) \
+	pr_err("%s:%d: bad pte %llx.\n", __FILE__, __LINE__, \
+		(unsigned long long)pte_val(e))
+#define pgd_ERROR(e) \
+	pr_err("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e))
 /*
  * Bits in a linux-style PTE.  These match the bits in the
  * (hardware-defined) PowerPC PTE as closely as possible.
  */
 
-#if defined(CONFIG_40x)
-#include <asm/pte-40x.h>
-#elif defined(CONFIG_44x)
-#include <asm/pte-44x.h>
-#elif defined(CONFIG_FSL_BOOKE) && defined(CONFIG_PTE_64BIT)
-#include <asm/pte-book3e.h>
-#elif defined(CONFIG_FSL_BOOKE)
-#include <asm/pte-fsl-booke.h>
-#elif defined(CONFIG_8xx)
-#include <asm/pte-8xx.h>
-#else /* CONFIG_6xx */
-#include <asm/book3s/32/hash.h>
-#endif
-
-/* And here we include common definitions */
-#include <asm/pte-common.h>
-
-#ifndef __ASSEMBLY__
-
 #define pte_clear(mm, addr, ptep) \
 	do { pte_update(ptep, ~_PAGE_HASHPTE, 0); } while (0)
 
@@ -167,7 +142,6 @@ static inline unsigned long pte_update(pte_t *p,
 				       unsigned long clr,
 				       unsigned long set)
 {
-#ifdef PTE_ATOMIC_UPDATES
 	unsigned long old, tmp;
 
 	__asm__ __volatile__("\
@@ -180,15 +154,7 @@ static inline unsigned long pte_update(pte_t *p,
 	: "=&r" (old), "=&r" (tmp), "=m" (*p)
 	: "r" (p), "r" (clr), "r" (set), "m" (*p)
 	: "cc" );
-#else /* PTE_ATOMIC_UPDATES */
-	unsigned long old = pte_val(*p);
-	*p = __pte((old & ~clr) | set);
-#endif /* !PTE_ATOMIC_UPDATES */
 
-#ifdef CONFIG_44x
-	if ((old & _PAGE_USER) && (old & _PAGE_EXEC))
-		icache_44x_need_flush = 1;
-#endif
 	return old;
 }
 #else /* CONFIG_PTE_64BIT */
@@ -196,7 +162,6 @@ static inline unsigned long long pte_update(pte_t *p,
 					    unsigned long clr,
 					    unsigned long set)
 {
-#ifdef PTE_ATOMIC_UPDATES
 	unsigned long long old;
 	unsigned long tmp;
 
@@ -211,15 +176,7 @@ static inline unsigned long long pte_update(pte_t *p,
 	: "=&r" (old), "=&r" (tmp), "=m" (*p)
 	: "r" (p), "r" ((unsigned long)(p) + 4), "r" (clr), "r" (set), "m" (*p)
 	: "cc" );
-#else /* PTE_ATOMIC_UPDATES */
-	unsigned long long old = pte_val(*p);
-	*p = __pte((old & ~(unsigned long long)clr) | set);
-#endif /* !PTE_ATOMIC_UPDATES */
 
-#ifdef CONFIG_44x
-	if ((old & _PAGE_USER) && (old & _PAGE_EXEC))
-		icache_44x_need_flush = 1;
-#endif
 	return old;
 }
 #endif /* CONFIG_PTE_64BIT */
@@ -233,12 +190,10 @@ static inline int __ptep_test_and_clear_young(unsigned int context, unsigned lon
 {
 	unsigned long old;
 	old = pte_update(ptep, _PAGE_ACCESSED, 0);
-#if _PAGE_HASHPTE != 0
 	if (old & _PAGE_HASHPTE) {
 		unsigned long ptephys = __pa(ptep) & PAGE_MASK;
 		flush_hash_pages(context, addr, ptephys, 1);
 	}
-#endif
 	return (old & _PAGE_ACCESSED) != 0;
 }
 #define ptep_test_and_clear_young(__vma, __addr, __ptep) \
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 2741ac6fbd3d..ddc08bf22709 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -8,7 +8,6 @@
 #include <asm/book3s/64/hash.h>
 #include <asm/barrier.h>
 
-#define FIRST_USER_ADDRESS	0UL
 
 /*
  * Size of EA range mapped by our pagetables.
@@ -25,27 +24,16 @@
 /*
  * Define the address range of the kernel non-linear virtual area
  */
-
-#ifdef CONFIG_PPC_BOOK3E
-#define KERN_VIRT_START ASM_CONST(0x8000000000000000)
-#else
 #define KERN_VIRT_START ASM_CONST(0xD000000000000000)
-#endif
 #define KERN_VIRT_SIZE	ASM_CONST(0x0000100000000000)
-
 /*
  * The vmalloc space starts at the beginning of that region, and
  * occupies half of it on hash CPUs and a quarter of it on Book3E
  * (we keep a quarter for the virtual memmap)
  */
 #define VMALLOC_START	KERN_VIRT_START
-#ifdef CONFIG_PPC_BOOK3E
-#define VMALLOC_SIZE	(KERN_VIRT_SIZE >> 2)
-#else
 #define VMALLOC_SIZE	(KERN_VIRT_SIZE >> 1)
-#endif
 #define VMALLOC_END	(VMALLOC_START + VMALLOC_SIZE)
-
 /*
  * The second half of the kernel virtual space is used for IO mappings,
  * it's itself carved into the PIO region (ISA and PHB IO space) and
@@ -64,7 +52,6 @@
 #define IOREMAP_BASE	(PHB_IO_END)
 #define IOREMAP_END	(KERN_VIRT_START + KERN_VIRT_SIZE)
 
-
 /*
  * Region IDs
  */
@@ -79,32 +66,39 @@
 
 /*
  * Defines the address of the vmemap area, in its own region on
- * hash table CPUs and after the vmalloc space on Book3E
+ * hash table CPUs.
  */
-#ifdef CONFIG_PPC_BOOK3E
-#define VMEMMAP_BASE		VMALLOC_END
-#define VMEMMAP_END		KERN_IO_START
-#else
 #define VMEMMAP_BASE		(VMEMMAP_REGION_ID << REGION_SHIFT)
-#endif
 #define vmemmap			((struct page *)VMEMMAP_BASE)
 
 
-/*
- * Include the PTE bits definitions
- */
-#ifdef CONFIG_PPC_BOOK3S
-#include <asm/book3s/64/hash.h>
-#else
-#include <asm/pte-book3e.h>
-#endif
-#include <asm/pte-common.h>
-
 #ifdef CONFIG_PPC_MM_SLICES
 #define HAVE_ARCH_UNMAPPED_AREA
 #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
 #endif /* CONFIG_PPC_MM_SLICES */
 
+/*
+ * THP pages can't be special. So use the _PAGE_SPECIAL
+ */
+#define _PAGE_SPLITTING _PAGE_SPECIAL
+
+/*
+ * We need to differentiate between explicit huge page and THP huge
+ * page, since THP huge page also need to track real subpage details
+ */
+#define _PAGE_THP_HUGE  _PAGE_4K_PFN
+
+/*
+ * set of bits not changed in pmd_modify.
+ */
+#define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS |		\
+			 _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_SPLITTING | \
+			 _PAGE_THP_HUGE)
+/*
+ * Default defines for things which we don't use.
+ * We should get this removed.
+ */
+#include <asm/pte-common.h>
 #ifndef __ASSEMBLY__
 
 /*
@@ -144,7 +138,7 @@
 #define PMD_BAD_BITS		(PTE_TABLE_SIZE-1)
 #define PUD_BAD_BITS		(PMD_TABLE_SIZE-1)
 
-#define pmd_set(pmdp, pmdval) 	(pmd_val(*(pmdp)) = (pmdval))
+#define pmd_set(pmdp, pmdval)	(pmd_val(*(pmdp)) = (pmdval))
 #define pmd_none(pmd)		(!pmd_val(pmd))
 #define	pmd_bad(pmd)		(!is_kernel_addr(pmd_val(pmd)) \
 				 || (pmd_val(pmd) & PMD_BAD_BITS))
@@ -206,7 +200,6 @@ static inline unsigned long pte_update(struct mm_struct *mm,
 				       unsigned long set,
 				       int huge)
 {
-#ifdef PTE_ATOMIC_UPDATES
 	unsigned long old, tmp;
 
 	__asm__ __volatile__(
@@ -220,18 +213,12 @@ static inline unsigned long pte_update(struct mm_struct *mm,
 	: "=&r" (old), "=&r" (tmp), "=m" (*ptep)
 	: "r" (ptep), "r" (clr), "m" (*ptep), "i" (_PAGE_BUSY), "r" (set)
 	: "cc" );
-#else
-	unsigned long old = pte_val(*ptep);
-	*ptep = __pte((old & ~clr) | set);
-#endif
 	/* huge pages use the old page table lock */
 	if (!huge)
 		assert_pte_locked(mm, addr);
 
-#ifdef CONFIG_PPC_STD_MMU_64
 	if (old & _PAGE_HASHPTE)
 		hpte_need_flush(mm, addr, ptep, old, huge);
-#endif
 
 	return old;
 }
@@ -313,7 +300,6 @@ static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry)
 	unsigned long bits = pte_val(entry) &
 		(_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC);
 
-#ifdef PTE_ATOMIC_UPDATES
 	unsigned long old, tmp;
 
 	__asm__ __volatile__(
@@ -326,10 +312,6 @@ static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry)
 	:"=&r" (old), "=&r" (tmp), "=m" (*ptep)
 	:"r" (bits), "r" (ptep), "m" (*ptep), "i" (_PAGE_BUSY)
 	:"cc");
-#else
-	unsigned long old = pte_val(*ptep);
-	*ptep = __pte(old | bits);
-#endif
 }
 
 #define __HAVE_ARCH_PTE_SAME
@@ -367,27 +349,7 @@ static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry)
 
 void pgtable_cache_add(unsigned shift, void (*ctor)(void *));
 void pgtable_cache_init(void);
-#endif /* __ASSEMBLY__ */
 
-/*
- * THP pages can't be special. So use the _PAGE_SPECIAL
- */
-#define _PAGE_SPLITTING _PAGE_SPECIAL
-
-/*
- * We need to differentiate between explicit huge page and THP huge
- * page, since THP huge page also need to track real subpage details
- */
-#define _PAGE_THP_HUGE  _PAGE_4K_PFN
-
-/*
- * set of bits not changed in pmd_modify.
- */
-#define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS |		\
-			 _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_SPLITTING | \
-			 _PAGE_THP_HUGE)
-
-#ifndef __ASSEMBLY__
 /*
  * The linux hugepage PMD now include the pmd entries followed by the address
  * to the stashed pgtable_t. The stashed pgtable_t contains the hpte bits.
diff --git a/arch/powerpc/include/asm/book3s/pgtable.h b/arch/powerpc/include/asm/book3s/pgtable.h
index a8d8e5152bd4..3818cc7bc9b7 100644
--- a/arch/powerpc/include/asm/book3s/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/pgtable.h
@@ -7,4 +7,5 @@
 #include <asm/book3s/32/pgtable.h>
 #endif
 
+#define FIRST_USER_ADDRESS	0UL
 #endif

From ee4889c7bc2a416d76730f318c741723cd64d432 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 1 Dec 2015 09:06:32 +0530
Subject: [PATCH 040/149] powerpc/mm: Don't have generic headers introduce
 functions touching pte bits

We are going to drop pte-common.h in a later patch. The idea is to let
the hash code avoid having to define all PTE bits. Having the PTE bits
defined in pte-common.h made the code unnecessarily complex.

Acked-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/pgtable.h | 176 +++++++++++++++++++
 arch/powerpc/include/asm/pgtable-book3e.h | 199 ++++++++++++++++++++++
 arch/powerpc/include/asm/pgtable.h        | 192 +--------------------
 3 files changed, 376 insertions(+), 191 deletions(-)
 create mode 100644 arch/powerpc/include/asm/pgtable-book3e.h

diff --git a/arch/powerpc/include/asm/book3s/pgtable.h b/arch/powerpc/include/asm/book3s/pgtable.h
index 3818cc7bc9b7..fa270cfcf30a 100644
--- a/arch/powerpc/include/asm/book3s/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/pgtable.h
@@ -8,4 +8,180 @@
 #endif
 
 #define FIRST_USER_ADDRESS	0UL
+#ifndef __ASSEMBLY__
+
+/* Generic accessors to PTE bits */
+static inline int pte_write(pte_t pte)
+{
+	return (pte_val(pte) & (_PAGE_RW | _PAGE_RO)) != _PAGE_RO;
+}
+static inline int pte_dirty(pte_t pte)		{ return pte_val(pte) & _PAGE_DIRTY; }
+static inline int pte_young(pte_t pte)		{ return pte_val(pte) & _PAGE_ACCESSED; }
+static inline int pte_special(pte_t pte)	{ return pte_val(pte) & _PAGE_SPECIAL; }
+static inline int pte_none(pte_t pte)		{ return (pte_val(pte) & ~_PTE_NONE_MASK) == 0; }
+static inline pgprot_t pte_pgprot(pte_t pte)	{ return __pgprot(pte_val(pte) & PAGE_PROT_BITS); }
+
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * These work without NUMA balancing but the kernel does not care. See the
+ * comment in include/asm-generic/pgtable.h . On powerpc, this will only
+ * work for user pages and always return true for kernel pages.
+ */
+static inline int pte_protnone(pte_t pte)
+{
+	return (pte_val(pte) &
+		(_PAGE_PRESENT | _PAGE_USER)) == _PAGE_PRESENT;
+}
+
+static inline int pmd_protnone(pmd_t pmd)
+{
+	return pte_protnone(pmd_pte(pmd));
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
+static inline int pte_present(pte_t pte)
+{
+	return pte_val(pte) & _PAGE_PRESENT;
+}
+
+/* Conversion functions: convert a page and protection to a page entry,
+ * and a page entry and page directory to the page they refer to.
+ *
+ * Even if PTEs can be unsigned long long, a PFN is always an unsigned
+ * long for now.
+ */
+static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot) {
+	return __pte(((pte_basic_t)(pfn) << PTE_RPN_SHIFT) |
+		     pgprot_val(pgprot)); }
+static inline unsigned long pte_pfn(pte_t pte)	{
+	return pte_val(pte) >> PTE_RPN_SHIFT; }
+
+/* Generic modifiers for PTE bits */
+static inline pte_t pte_wrprotect(pte_t pte) {
+	pte_val(pte) &= ~(_PAGE_RW | _PAGE_HWWRITE);
+	pte_val(pte) |= _PAGE_RO; return pte; }
+static inline pte_t pte_mkclean(pte_t pte) {
+	pte_val(pte) &= ~(_PAGE_DIRTY | _PAGE_HWWRITE); return pte; }
+static inline pte_t pte_mkold(pte_t pte) {
+	pte_val(pte) &= ~_PAGE_ACCESSED; return pte; }
+static inline pte_t pte_mkwrite(pte_t pte) {
+	pte_val(pte) &= ~_PAGE_RO;
+	pte_val(pte) |= _PAGE_RW; return pte; }
+static inline pte_t pte_mkdirty(pte_t pte) {
+	pte_val(pte) |= _PAGE_DIRTY; return pte; }
+static inline pte_t pte_mkyoung(pte_t pte) {
+	pte_val(pte) |= _PAGE_ACCESSED; return pte; }
+static inline pte_t pte_mkspecial(pte_t pte) {
+	pte_val(pte) |= _PAGE_SPECIAL; return pte; }
+static inline pte_t pte_mkhuge(pte_t pte) {
+	return pte; }
+static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
+{
+	pte_val(pte) = (pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot);
+	return pte;
+}
+
+
+/* Insert a PTE, top-level function is out of line. It uses an inline
+ * low level function in the respective pgtable-* files
+ */
+extern void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
+		       pte_t pte);
+
+/* This low level function performs the actual PTE insertion
+ * Setting the PTE depends on the MMU type and other factors. It's
+ * an horrible mess that I'm not going to try to clean up now but
+ * I'm keeping it in one place rather than spread around
+ */
+static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
+				pte_t *ptep, pte_t pte, int percpu)
+{
+#if defined(CONFIG_PPC_STD_MMU_32) && defined(CONFIG_SMP) && !defined(CONFIG_PTE_64BIT)
+	/* First case is 32-bit Hash MMU in SMP mode with 32-bit PTEs. We use the
+	 * helper pte_update() which does an atomic update. We need to do that
+	 * because a concurrent invalidation can clear _PAGE_HASHPTE. If it's a
+	 * per-CPU PTE such as a kmap_atomic, we do a simple update preserving
+	 * the hash bits instead (ie, same as the non-SMP case)
+	 */
+	if (percpu)
+		*ptep = __pte((pte_val(*ptep) & _PAGE_HASHPTE)
+			      | (pte_val(pte) & ~_PAGE_HASHPTE));
+	else
+		pte_update(ptep, ~_PAGE_HASHPTE, pte_val(pte));
+
+#elif defined(CONFIG_PPC32) && defined(CONFIG_PTE_64BIT)
+	/* Second case is 32-bit with 64-bit PTE.  In this case, we
+	 * can just store as long as we do the two halves in the right order
+	 * with a barrier in between. This is possible because we take care,
+	 * in the hash code, to pre-invalidate if the PTE was already hashed,
+	 * which synchronizes us with any concurrent invalidation.
+	 * In the percpu case, we also fallback to the simple update preserving
+	 * the hash bits
+	 */
+	if (percpu) {
+		*ptep = __pte((pte_val(*ptep) & _PAGE_HASHPTE)
+			      | (pte_val(pte) & ~_PAGE_HASHPTE));
+		return;
+	}
+	if (pte_val(*ptep) & _PAGE_HASHPTE)
+		flush_hash_entry(mm, ptep, addr);
+	__asm__ __volatile__("\
+		stw%U0%X0 %2,%0\n\
+		eieio\n\
+		stw%U0%X0 %L2,%1"
+	: "=m" (*ptep), "=m" (*((unsigned char *)ptep+4))
+	: "r" (pte) : "memory");
+
+#elif defined(CONFIG_PPC_STD_MMU_32)
+	/* Third case is 32-bit hash table in UP mode, we need to preserve
+	 * the _PAGE_HASHPTE bit since we may not have invalidated the previous
+	 * translation in the hash yet (done in a subsequent flush_tlb_xxx())
+	 * and see we need to keep track that this PTE needs invalidating
+	 */
+	*ptep = __pte((pte_val(*ptep) & _PAGE_HASHPTE)
+		      | (pte_val(pte) & ~_PAGE_HASHPTE));
+
+#else
+	/* Anything else just stores the PTE normally. That covers all 64-bit
+	 * cases, and 32-bit non-hash with 32-bit PTEs.
+	 */
+	*ptep = pte;
+#endif
+}
+
+
+#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
+extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address,
+				 pte_t *ptep, pte_t entry, int dirty);
+
+/*
+ * Macro to mark a page protection value as "uncacheable".
+ */
+
+#define _PAGE_CACHE_CTL	(_PAGE_COHERENT | _PAGE_GUARDED | _PAGE_NO_CACHE | \
+			 _PAGE_WRITETHRU)
+
+#define pgprot_noncached(prot)	  (__pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) | \
+				            _PAGE_NO_CACHE | _PAGE_GUARDED))
+
+#define pgprot_noncached_wc(prot) (__pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) | \
+				            _PAGE_NO_CACHE))
+
+#define pgprot_cached(prot)       (__pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) | \
+				            _PAGE_COHERENT))
+
+#define pgprot_cached_wthru(prot) (__pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) | \
+				            _PAGE_COHERENT | _PAGE_WRITETHRU))
+
+#define pgprot_cached_noncoherent(prot) \
+		(__pgprot(pgprot_val(prot) & ~_PAGE_CACHE_CTL))
+
+#define pgprot_writecombine pgprot_noncached_wc
+
+struct file;
+extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
+				     unsigned long size, pgprot_t vma_prot);
+#define __HAVE_PHYS_MEM_ACCESS_PROT
+
+#endif /* __ASSEMBLY__ */
 #endif
diff --git a/arch/powerpc/include/asm/pgtable-book3e.h b/arch/powerpc/include/asm/pgtable-book3e.h
new file mode 100644
index 000000000000..a3221cff2e31
--- /dev/null
+++ b/arch/powerpc/include/asm/pgtable-book3e.h
@@ -0,0 +1,199 @@
+#ifndef _ASM_POWERPC_PGTABLE_BOOK3E_H
+#define _ASM_POWERPC_PGTABLE_BOOK3E_H
+
+#if defined(CONFIG_PPC64)
+#include <asm/pgtable-ppc64.h>
+#else
+#include <asm/pgtable-ppc32.h>
+#endif
+
+#ifndef __ASSEMBLY__
+
+/* Generic accessors to PTE bits */
+static inline int pte_write(pte_t pte)
+{
+	return (pte_val(pte) & (_PAGE_RW | _PAGE_RO)) != _PAGE_RO;
+}
+static inline int pte_dirty(pte_t pte)		{ return pte_val(pte) & _PAGE_DIRTY; }
+static inline int pte_young(pte_t pte)		{ return pte_val(pte) & _PAGE_ACCESSED; }
+static inline int pte_special(pte_t pte)	{ return pte_val(pte) & _PAGE_SPECIAL; }
+static inline int pte_none(pte_t pte)		{ return (pte_val(pte) & ~_PTE_NONE_MASK) == 0; }
+static inline pgprot_t pte_pgprot(pte_t pte)	{ return __pgprot(pte_val(pte) & PAGE_PROT_BITS); }
+
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * These work without NUMA balancing but the kernel does not care. See the
+ * comment in include/asm-generic/pgtable.h . On powerpc, this will only
+ * work for user pages and always return true for kernel pages.
+ */
+static inline int pte_protnone(pte_t pte)
+{
+	return (pte_val(pte) &
+		(_PAGE_PRESENT | _PAGE_USER)) == _PAGE_PRESENT;
+}
+
+static inline int pmd_protnone(pmd_t pmd)
+{
+	return pte_protnone(pmd_pte(pmd));
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
+static inline int pte_present(pte_t pte)
+{
+	return pte_val(pte) & _PAGE_PRESENT;
+}
+
+/* Conversion functions: convert a page and protection to a page entry,
+ * and a page entry and page directory to the page they refer to.
+ *
+ * Even if PTEs can be unsigned long long, a PFN is always an unsigned
+ * long for now.
+ */
+static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot) {
+	return __pte(((pte_basic_t)(pfn) << PTE_RPN_SHIFT) |
+		     pgprot_val(pgprot)); }
+static inline unsigned long pte_pfn(pte_t pte)	{
+	return pte_val(pte) >> PTE_RPN_SHIFT; }
+
+/* Generic modifiers for PTE bits */
+static inline pte_t pte_wrprotect(pte_t pte) {
+	pte_val(pte) &= ~(_PAGE_RW | _PAGE_HWWRITE);
+	pte_val(pte) |= _PAGE_RO; return pte; }
+static inline pte_t pte_mkclean(pte_t pte) {
+	pte_val(pte) &= ~(_PAGE_DIRTY | _PAGE_HWWRITE); return pte; }
+static inline pte_t pte_mkold(pte_t pte) {
+	pte_val(pte) &= ~_PAGE_ACCESSED; return pte; }
+static inline pte_t pte_mkwrite(pte_t pte) {
+	pte_val(pte) &= ~_PAGE_RO;
+	pte_val(pte) |= _PAGE_RW; return pte; }
+static inline pte_t pte_mkdirty(pte_t pte) {
+	pte_val(pte) |= _PAGE_DIRTY; return pte; }
+static inline pte_t pte_mkyoung(pte_t pte) {
+	pte_val(pte) |= _PAGE_ACCESSED; return pte; }
+static inline pte_t pte_mkspecial(pte_t pte) {
+	pte_val(pte) |= _PAGE_SPECIAL; return pte; }
+static inline pte_t pte_mkhuge(pte_t pte) {
+	return pte; }
+static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
+{
+	pte_val(pte) = (pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot);
+	return pte;
+}
+
+
+/* Insert a PTE, top-level function is out of line. It uses an inline
+ * low level function in the respective pgtable-* files
+ */
+extern void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
+		       pte_t pte);
+
+/* This low level function performs the actual PTE insertion
+ * Setting the PTE depends on the MMU type and other factors. It's
+ * an horrible mess that I'm not going to try to clean up now but
+ * I'm keeping it in one place rather than spread around
+ */
+static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
+				pte_t *ptep, pte_t pte, int percpu)
+{
+#if defined(CONFIG_PPC_STD_MMU_32) && defined(CONFIG_SMP) && !defined(CONFIG_PTE_64BIT)
+	/* First case is 32-bit Hash MMU in SMP mode with 32-bit PTEs. We use the
+	 * helper pte_update() which does an atomic update. We need to do that
+	 * because a concurrent invalidation can clear _PAGE_HASHPTE. If it's a
+	 * per-CPU PTE such as a kmap_atomic, we do a simple update preserving
+	 * the hash bits instead (ie, same as the non-SMP case)
+	 */
+	if (percpu)
+		*ptep = __pte((pte_val(*ptep) & _PAGE_HASHPTE)
+			      | (pte_val(pte) & ~_PAGE_HASHPTE));
+	else
+		pte_update(ptep, ~_PAGE_HASHPTE, pte_val(pte));
+
+#elif defined(CONFIG_PPC32) && defined(CONFIG_PTE_64BIT)
+	/* Second case is 32-bit with 64-bit PTE.  In this case, we
+	 * can just store as long as we do the two halves in the right order
+	 * with a barrier in between. This is possible because we take care,
+	 * in the hash code, to pre-invalidate if the PTE was already hashed,
+	 * which synchronizes us with any concurrent invalidation.
+	 * In the percpu case, we also fallback to the simple update preserving
+	 * the hash bits
+	 */
+	if (percpu) {
+		*ptep = __pte((pte_val(*ptep) & _PAGE_HASHPTE)
+			      | (pte_val(pte) & ~_PAGE_HASHPTE));
+		return;
+	}
+#if _PAGE_HASHPTE != 0
+	if (pte_val(*ptep) & _PAGE_HASHPTE)
+		flush_hash_entry(mm, ptep, addr);
+#endif
+	__asm__ __volatile__("\
+		stw%U0%X0 %2,%0\n\
+		eieio\n\
+		stw%U0%X0 %L2,%1"
+	: "=m" (*ptep), "=m" (*((unsigned char *)ptep+4))
+	: "r" (pte) : "memory");
+
+#elif defined(CONFIG_PPC_STD_MMU_32)
+	/* Third case is 32-bit hash table in UP mode, we need to preserve
+	 * the _PAGE_HASHPTE bit since we may not have invalidated the previous
+	 * translation in the hash yet (done in a subsequent flush_tlb_xxx())
+	 * and see we need to keep track that this PTE needs invalidating
+	 */
+	*ptep = __pte((pte_val(*ptep) & _PAGE_HASHPTE)
+		      | (pte_val(pte) & ~_PAGE_HASHPTE));
+
+#else
+	/* Anything else just stores the PTE normally. That covers all 64-bit
+	 * cases, and 32-bit non-hash with 32-bit PTEs.
+	 */
+	*ptep = pte;
+
+#ifdef CONFIG_PPC_BOOK3E_64
+	/*
+	 * With hardware tablewalk, a sync is needed to ensure that
+	 * subsequent accesses see the PTE we just wrote.  Unlike userspace
+	 * mappings, we can't tolerate spurious faults, so make sure
+	 * the new PTE will be seen the first time.
+	 */
+	if (is_kernel_addr(addr))
+		mb();
+#endif
+#endif
+}
+
+
+#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
+extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address,
+				 pte_t *ptep, pte_t entry, int dirty);
+
+/*
+ * Macro to mark a page protection value as "uncacheable".
+ */
+
+#define _PAGE_CACHE_CTL	(_PAGE_COHERENT | _PAGE_GUARDED | _PAGE_NO_CACHE | \
+			 _PAGE_WRITETHRU)
+
+#define pgprot_noncached(prot)	  (__pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) | \
+				            _PAGE_NO_CACHE | _PAGE_GUARDED))
+
+#define pgprot_noncached_wc(prot) (__pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) | \
+				            _PAGE_NO_CACHE))
+
+#define pgprot_cached(prot)       (__pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) | \
+				            _PAGE_COHERENT))
+
+#define pgprot_cached_wthru(prot) (__pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) | \
+				            _PAGE_COHERENT | _PAGE_WRITETHRU))
+
+#define pgprot_cached_noncoherent(prot) \
+		(__pgprot(pgprot_val(prot) & ~_PAGE_CACHE_CTL))
+
+#define pgprot_writecombine pgprot_noncached_wc
+
+struct file;
+extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
+				     unsigned long size, pgprot_t vma_prot);
+#define __HAVE_PHYS_MEM_ACCESS_PROT
+
+#endif /* __ASSEMBLY__ */
+#endif
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index c304d0767919..a27b8cef51d7 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -1,6 +1,5 @@
 #ifndef _ASM_POWERPC_PGTABLE_H
 #define _ASM_POWERPC_PGTABLE_H
-#ifdef __KERNEL__
 
 #ifndef __ASSEMBLY__
 #include <linux/mmdebug.h>
@@ -16,11 +15,7 @@ struct mm_struct;
 #ifdef CONFIG_PPC_BOOK3S
 #include <asm/book3s/pgtable.h>
 #else
-#if defined(CONFIG_PPC64)
-#  include <asm/pgtable-ppc64.h>
-#else
-#  include <asm/pgtable-ppc32.h>
-#endif
+#include <asm/pgtable-book3e.h>
 #endif /* !CONFIG_PPC_BOOK3S */
 
 /*
@@ -33,194 +28,10 @@ struct mm_struct;
 
 #include <asm/tlbflush.h>
 
-/* Generic accessors to PTE bits */
-static inline int pte_write(pte_t pte)
-{	return (pte_val(pte) & (_PAGE_RW | _PAGE_RO)) != _PAGE_RO; }
-static inline int pte_dirty(pte_t pte)		{ return pte_val(pte) & _PAGE_DIRTY; }
-static inline int pte_young(pte_t pte)		{ return pte_val(pte) & _PAGE_ACCESSED; }
-static inline int pte_special(pte_t pte)	{ return pte_val(pte) & _PAGE_SPECIAL; }
-static inline int pte_none(pte_t pte)		{ return (pte_val(pte) & ~_PTE_NONE_MASK) == 0; }
-static inline pgprot_t pte_pgprot(pte_t pte)	{ return __pgprot(pte_val(pte) & PAGE_PROT_BITS); }
-
-#ifdef CONFIG_NUMA_BALANCING
-/*
- * These work without NUMA balancing but the kernel does not care. See the
- * comment in include/asm-generic/pgtable.h . On powerpc, this will only
- * work for user pages and always return true for kernel pages.
- */
-static inline int pte_protnone(pte_t pte)
-{
-	return (pte_val(pte) &
-		(_PAGE_PRESENT | _PAGE_USER)) == _PAGE_PRESENT;
-}
-
-static inline int pmd_protnone(pmd_t pmd)
-{
-	return pte_protnone(pmd_pte(pmd));
-}
-#endif /* CONFIG_NUMA_BALANCING */
-
-static inline int pte_present(pte_t pte)
-{
-	return pte_val(pte) & _PAGE_PRESENT;
-}
-
-/* Conversion functions: convert a page and protection to a page entry,
- * and a page entry and page directory to the page they refer to.
- *
- * Even if PTEs can be unsigned long long, a PFN is always an unsigned
- * long for now.
- */
-static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot) {
-	return __pte(((pte_basic_t)(pfn) << PTE_RPN_SHIFT) |
-		     pgprot_val(pgprot)); }
-static inline unsigned long pte_pfn(pte_t pte)	{
-	return pte_val(pte) >> PTE_RPN_SHIFT; }
-
 /* Keep these as a macros to avoid include dependency mess */
 #define pte_page(x)		pfn_to_page(pte_pfn(x))
 #define mk_pte(page, pgprot)	pfn_pte(page_to_pfn(page), (pgprot))
 
-/* Generic modifiers for PTE bits */
-static inline pte_t pte_wrprotect(pte_t pte) {
-	pte_val(pte) &= ~(_PAGE_RW | _PAGE_HWWRITE);
-	pte_val(pte) |= _PAGE_RO; return pte; }
-static inline pte_t pte_mkclean(pte_t pte) {
-	pte_val(pte) &= ~(_PAGE_DIRTY | _PAGE_HWWRITE); return pte; }
-static inline pte_t pte_mkold(pte_t pte) {
-	pte_val(pte) &= ~_PAGE_ACCESSED; return pte; }
-static inline pte_t pte_mkwrite(pte_t pte) {
-	pte_val(pte) &= ~_PAGE_RO;
-	pte_val(pte) |= _PAGE_RW; return pte; }
-static inline pte_t pte_mkdirty(pte_t pte) {
-	pte_val(pte) |= _PAGE_DIRTY; return pte; }
-static inline pte_t pte_mkyoung(pte_t pte) {
-	pte_val(pte) |= _PAGE_ACCESSED; return pte; }
-static inline pte_t pte_mkspecial(pte_t pte) {
-	pte_val(pte) |= _PAGE_SPECIAL; return pte; }
-static inline pte_t pte_mkhuge(pte_t pte) {
-	return pte; }
-static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
-{
-	pte_val(pte) = (pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot);
-	return pte;
-}
-
-
-/* Insert a PTE, top-level function is out of line. It uses an inline
- * low level function in the respective pgtable-* files
- */
-extern void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
-		       pte_t pte);
-
-/* This low level function performs the actual PTE insertion
- * Setting the PTE depends on the MMU type and other factors. It's
- * an horrible mess that I'm not going to try to clean up now but
- * I'm keeping it in one place rather than spread around
- */
-static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
-				pte_t *ptep, pte_t pte, int percpu)
-{
-#if defined(CONFIG_PPC_STD_MMU_32) && defined(CONFIG_SMP) && !defined(CONFIG_PTE_64BIT)
-	/* First case is 32-bit Hash MMU in SMP mode with 32-bit PTEs. We use the
-	 * helper pte_update() which does an atomic update. We need to do that
-	 * because a concurrent invalidation can clear _PAGE_HASHPTE. If it's a
-	 * per-CPU PTE such as a kmap_atomic, we do a simple update preserving
-	 * the hash bits instead (ie, same as the non-SMP case)
-	 */
-	if (percpu)
-		*ptep = __pte((pte_val(*ptep) & _PAGE_HASHPTE)
-			      | (pte_val(pte) & ~_PAGE_HASHPTE));
-	else
-		pte_update(ptep, ~_PAGE_HASHPTE, pte_val(pte));
-
-#elif defined(CONFIG_PPC32) && defined(CONFIG_PTE_64BIT)
-	/* Second case is 32-bit with 64-bit PTE.  In this case, we
-	 * can just store as long as we do the two halves in the right order
-	 * with a barrier in between. This is possible because we take care,
-	 * in the hash code, to pre-invalidate if the PTE was already hashed,
-	 * which synchronizes us with any concurrent invalidation.
-	 * In the percpu case, we also fallback to the simple update preserving
-	 * the hash bits
-	 */
-	if (percpu) {
-		*ptep = __pte((pte_val(*ptep) & _PAGE_HASHPTE)
-			      | (pte_val(pte) & ~_PAGE_HASHPTE));
-		return;
-	}
-#if _PAGE_HASHPTE != 0
-	if (pte_val(*ptep) & _PAGE_HASHPTE)
-		flush_hash_entry(mm, ptep, addr);
-#endif
-	__asm__ __volatile__("\
-		stw%U0%X0 %2,%0\n\
-		eieio\n\
-		stw%U0%X0 %L2,%1"
-	: "=m" (*ptep), "=m" (*((unsigned char *)ptep+4))
-	: "r" (pte) : "memory");
-
-#elif defined(CONFIG_PPC_STD_MMU_32)
-	/* Third case is 32-bit hash table in UP mode, we need to preserve
-	 * the _PAGE_HASHPTE bit since we may not have invalidated the previous
-	 * translation in the hash yet (done in a subsequent flush_tlb_xxx())
-	 * and see we need to keep track that this PTE needs invalidating
-	 */
-	*ptep = __pte((pte_val(*ptep) & _PAGE_HASHPTE)
-		      | (pte_val(pte) & ~_PAGE_HASHPTE));
-
-#else
-	/* Anything else just stores the PTE normally. That covers all 64-bit
-	 * cases, and 32-bit non-hash with 32-bit PTEs.
-	 */
-	*ptep = pte;
-
-#ifdef CONFIG_PPC_BOOK3E_64
-	/*
-	 * With hardware tablewalk, a sync is needed to ensure that
-	 * subsequent accesses see the PTE we just wrote.  Unlike userspace
-	 * mappings, we can't tolerate spurious faults, so make sure
-	 * the new PTE will be seen the first time.
-	 */
-	if (is_kernel_addr(addr))
-		mb();
-#endif
-#endif
-}
-
-
-#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
-extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address,
-				 pte_t *ptep, pte_t entry, int dirty);
-
-/*
- * Macro to mark a page protection value as "uncacheable".
- */
-
-#define _PAGE_CACHE_CTL	(_PAGE_COHERENT | _PAGE_GUARDED | _PAGE_NO_CACHE | \
-			 _PAGE_WRITETHRU)
-
-#define pgprot_noncached(prot)	  (__pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) | \
-				            _PAGE_NO_CACHE | _PAGE_GUARDED))
-
-#define pgprot_noncached_wc(prot) (__pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) | \
-				            _PAGE_NO_CACHE))
-
-#define pgprot_cached(prot)       (__pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) | \
-				            _PAGE_COHERENT))
-
-#define pgprot_cached_wthru(prot) (__pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) | \
-				            _PAGE_COHERENT | _PAGE_WRITETHRU))
-
-#define pgprot_cached_noncoherent(prot) \
-		(__pgprot(pgprot_val(prot) & ~_PAGE_CACHE_CTL))
-
-#define pgprot_writecombine pgprot_noncached_wc
-
-struct file;
-extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
-				     unsigned long size, pgprot_t vma_prot);
-#define __HAVE_PHYS_MEM_ACCESS_PROT
-
 /*
  * ZERO_PAGE is a global shared page that is always zero: used
  * for zero-mapped memory areas etc..
@@ -275,5 +86,4 @@ static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
 }
 #endif /* __ASSEMBLY__ */
 
-#endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_PGTABLE_H */

From b0412ea94bcbd08dc1e61043dfdd9c33272cec48 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 1 Dec 2015 09:06:33 +0530
Subject: [PATCH 041/149] powerpc/mm: Drop pte-common.h from BOOK3S 64

We copy only the needed PTE bit definitions from pte-common.h to the
respective hash related header. This should greatly simplify later patches
in which we are going to change the pte format for hash config.

Acked-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/64/hash-4k.h |   1 +
 arch/powerpc/include/asm/book3s/64/hash.h    |   2 +
 arch/powerpc/include/asm/book3s/64/pgtable.h | 104 ++++++++++++++++++-
 arch/powerpc/include/asm/book3s/pgtable.h    |  16 ++-
 4 files changed, 111 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h
index f2c51cd61f69..15518b620f5a 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-4k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h
@@ -62,6 +62,7 @@
 /* shift to put page number into pte */
 #define PTE_RPN_SHIFT	(17)
 
+#define _PAGE_4K_PFN		0
 #ifndef __ASSEMBLY__
 /*
  * 4-level page tables related bits
diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h
index 8e60d4fa434d..7deb5063ff8c 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -20,6 +20,7 @@
 #define _PAGE_EXEC		0x0004 /* No execute on POWER4 and newer (we invert) */
 #define _PAGE_GUARDED		0x0008
 /* We can derive Memory coherence from _PAGE_NO_CACHE */
+#define _PAGE_COHERENT		0x0
 #define _PAGE_NO_CACHE		0x0020 /* I: cache inhibit */
 #define _PAGE_WRITETHRU		0x0040 /* W: cache write-through */
 #define _PAGE_DIRTY		0x0080 /* C: page changed */
@@ -30,6 +31,7 @@
 /* No separate kernel read-only */
 #define _PAGE_KERNEL_RW		(_PAGE_RW | _PAGE_DIRTY) /* user access blocked by key */
 #define _PAGE_KERNEL_RO		 _PAGE_KERNEL_RW
+#define _PAGE_KERNEL_RWX	(_PAGE_DIRTY | _PAGE_RW | _PAGE_EXEC)
 
 /* Strong Access Ordering */
 #define _PAGE_SAO		(_PAGE_WRITETHRU | _PAGE_NO_CACHE | _PAGE_COHERENT)
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index ddc08bf22709..f942b27e9c5f 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -94,11 +94,109 @@
 #define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS |		\
 			 _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_SPLITTING | \
 			 _PAGE_THP_HUGE)
+#define _PTE_NONE_MASK	_PAGE_HPTEFLAGS
 /*
- * Default defines for things which we don't use.
- * We should get this removed.
+ * The mask convered by the RPN must be a ULL on 32-bit platforms with
+ * 64-bit PTEs
  */
-#include <asm/pte-common.h>
+#define PTE_RPN_MASK	(~((1UL << PTE_RPN_SHIFT) - 1))
+/*
+ * _PAGE_CHG_MASK masks of bits that are to be preserved across
+ * pgprot changes
+ */
+#define _PAGE_CHG_MASK	(PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \
+			 _PAGE_ACCESSED | _PAGE_SPECIAL)
+/*
+ * Mask of bits returned by pte_pgprot()
+ */
+#define PAGE_PROT_BITS	(_PAGE_GUARDED | _PAGE_COHERENT | _PAGE_NO_CACHE | \
+			 _PAGE_WRITETHRU | _PAGE_4K_PFN | \
+			 _PAGE_USER | _PAGE_ACCESSED |  \
+			 _PAGE_RW |  _PAGE_DIRTY | _PAGE_EXEC)
+/*
+ * We define 2 sets of base prot bits, one for basic pages (ie,
+ * cacheable kernel and user pages) and one for non cacheable
+ * pages. We always set _PAGE_COHERENT when SMP is enabled or
+ * the processor might need it for DMA coherency.
+ */
+#define _PAGE_BASE_NC	(_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_PSIZE)
+#define _PAGE_BASE	(_PAGE_BASE_NC | _PAGE_COHERENT)
+
+/* Permission masks used to generate the __P and __S table,
+ *
+ * Note:__pgprot is defined in arch/powerpc/include/asm/page.h
+ *
+ * Write permissions imply read permissions for now (we could make write-only
+ * pages on BookE but we don't bother for now). Execute permission control is
+ * possible on platforms that define _PAGE_EXEC
+ *
+ * Note due to the way vm flags are laid out, the bits are XWR
+ */
+#define PAGE_NONE	__pgprot(_PAGE_BASE)
+#define PAGE_SHARED	__pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW)
+#define PAGE_SHARED_X	__pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW | \
+				 _PAGE_EXEC)
+#define PAGE_COPY	__pgprot(_PAGE_BASE | _PAGE_USER )
+#define PAGE_COPY_X	__pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC)
+#define PAGE_READONLY	__pgprot(_PAGE_BASE | _PAGE_USER )
+#define PAGE_READONLY_X	__pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC)
+
+#define __P000	PAGE_NONE
+#define __P001	PAGE_READONLY
+#define __P010	PAGE_COPY
+#define __P011	PAGE_COPY
+#define __P100	PAGE_READONLY_X
+#define __P101	PAGE_READONLY_X
+#define __P110	PAGE_COPY_X
+#define __P111	PAGE_COPY_X
+
+#define __S000	PAGE_NONE
+#define __S001	PAGE_READONLY
+#define __S010	PAGE_SHARED
+#define __S011	PAGE_SHARED
+#define __S100	PAGE_READONLY_X
+#define __S101	PAGE_READONLY_X
+#define __S110	PAGE_SHARED_X
+#define __S111	PAGE_SHARED_X
+
+/* Permission masks used for kernel mappings */
+#define PAGE_KERNEL	__pgprot(_PAGE_BASE | _PAGE_KERNEL_RW)
+#define PAGE_KERNEL_NC	__pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \
+				 _PAGE_NO_CACHE)
+#define PAGE_KERNEL_NCG	__pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \
+				 _PAGE_NO_CACHE | _PAGE_GUARDED)
+#define PAGE_KERNEL_X	__pgprot(_PAGE_BASE | _PAGE_KERNEL_RWX)
+#define PAGE_KERNEL_RO	__pgprot(_PAGE_BASE | _PAGE_KERNEL_RO)
+#define PAGE_KERNEL_ROX	__pgprot(_PAGE_BASE | _PAGE_KERNEL_ROX)
+
+/* Protection used for kernel text. We want the debuggers to be able to
+ * set breakpoints anywhere, so don't write protect the kernel text
+ * on platforms where such control is possible.
+ */
+#if defined(CONFIG_KGDB) || defined(CONFIG_XMON) || defined(CONFIG_BDI_SWITCH) ||\
+	defined(CONFIG_KPROBES) || defined(CONFIG_DYNAMIC_FTRACE)
+#define PAGE_KERNEL_TEXT	PAGE_KERNEL_X
+#else
+#define PAGE_KERNEL_TEXT	PAGE_KERNEL_ROX
+#endif
+
+/* Make modules code happy. We don't set RO yet */
+#define PAGE_KERNEL_EXEC	PAGE_KERNEL_X
+
+/*
+ * Don't just check for any non zero bits in __PAGE_USER, since for book3e
+ * and PTE_64BIT, PAGE_KERNEL_X contains _PAGE_BAP_SR which is also in
+ * _PAGE_USER.  Need to explicitly match _PAGE_BAP_UR bit in that case too.
+ */
+#define pte_user(val)		((val & _PAGE_USER) == _PAGE_USER)
+
+/* Advertise special mapping type for AGP */
+#define PAGE_AGP		(PAGE_KERNEL_NC)
+#define HAVE_PAGE_AGP
+
+/* Advertise support for _PAGE_SPECIAL */
+#define __HAVE_ARCH_PTE_SPECIAL
+
 #ifndef __ASSEMBLY__
 
 /*
diff --git a/arch/powerpc/include/asm/book3s/pgtable.h b/arch/powerpc/include/asm/book3s/pgtable.h
index fa270cfcf30a..87333618af3b 100644
--- a/arch/powerpc/include/asm/book3s/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/pgtable.h
@@ -11,10 +11,7 @@
 #ifndef __ASSEMBLY__
 
 /* Generic accessors to PTE bits */
-static inline int pte_write(pte_t pte)
-{
-	return (pte_val(pte) & (_PAGE_RW | _PAGE_RO)) != _PAGE_RO;
-}
+static inline int pte_write(pte_t pte)		{ return !!(pte_val(pte) & _PAGE_RW);}
 static inline int pte_dirty(pte_t pte)		{ return pte_val(pte) & _PAGE_DIRTY; }
 static inline int pte_young(pte_t pte)		{ return pte_val(pte) & _PAGE_ACCESSED; }
 static inline int pte_special(pte_t pte)	{ return pte_val(pte) & _PAGE_SPECIAL; }
@@ -57,15 +54,16 @@ static inline unsigned long pte_pfn(pte_t pte)	{
 	return pte_val(pte) >> PTE_RPN_SHIFT; }
 
 /* Generic modifiers for PTE bits */
-static inline pte_t pte_wrprotect(pte_t pte) {
-	pte_val(pte) &= ~(_PAGE_RW | _PAGE_HWWRITE);
-	pte_val(pte) |= _PAGE_RO; return pte; }
+static inline pte_t pte_wrprotect(pte_t pte)
+{
+	pte_val(pte) &= ~_PAGE_RW;
+	return pte;
+}
 static inline pte_t pte_mkclean(pte_t pte) {
-	pte_val(pte) &= ~(_PAGE_DIRTY | _PAGE_HWWRITE); return pte; }
+	pte_val(pte) &= ~_PAGE_DIRTY; return pte; }
 static inline pte_t pte_mkold(pte_t pte) {
 	pte_val(pte) &= ~_PAGE_ACCESSED; return pte; }
 static inline pte_t pte_mkwrite(pte_t pte) {
-	pte_val(pte) &= ~_PAGE_RO;
 	pte_val(pte) |= _PAGE_RW; return pte; }
 static inline pte_t pte_mkdirty(pte_t pte) {
 	pte_val(pte) |= _PAGE_DIRTY; return pte; }

From 10bd3808dfd067d6d6c941cc6e1b13be165f6a70 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 1 Dec 2015 09:06:34 +0530
Subject: [PATCH 042/149] powerpc/mm: Don't use pte_val as lvalue

We also convert a few #defines to static inline functions in this patch
for better type checking.

Acked-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/pgtable.h | 118 +++++++++++++++-------
 arch/powerpc/include/asm/page.h           |  10 +-
 arch/powerpc/include/asm/pgtable-book3e.h |  68 +++++++++----
 3 files changed, 139 insertions(+), 57 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/pgtable.h b/arch/powerpc/include/asm/book3s/pgtable.h
index 87333618af3b..ebd6677ea017 100644
--- a/arch/powerpc/include/asm/book3s/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/pgtable.h
@@ -12,9 +12,9 @@
 
 /* Generic accessors to PTE bits */
 static inline int pte_write(pte_t pte)		{ return !!(pte_val(pte) & _PAGE_RW);}
-static inline int pte_dirty(pte_t pte)		{ return pte_val(pte) & _PAGE_DIRTY; }
-static inline int pte_young(pte_t pte)		{ return pte_val(pte) & _PAGE_ACCESSED; }
-static inline int pte_special(pte_t pte)	{ return pte_val(pte) & _PAGE_SPECIAL; }
+static inline int pte_dirty(pte_t pte)		{ return !!(pte_val(pte) & _PAGE_DIRTY); }
+static inline int pte_young(pte_t pte)		{ return !!(pte_val(pte) & _PAGE_ACCESSED); }
+static inline int pte_special(pte_t pte)	{ return !!(pte_val(pte) & _PAGE_SPECIAL); }
 static inline int pte_none(pte_t pte)		{ return (pte_val(pte) & ~_PTE_NONE_MASK) == 0; }
 static inline pgprot_t pte_pgprot(pte_t pte)	{ return __pgprot(pte_val(pte) & PAGE_PROT_BITS); }
 
@@ -47,36 +47,61 @@ static inline int pte_present(pte_t pte)
  * Even if PTEs can be unsigned long long, a PFN is always an unsigned
  * long for now.
  */
-static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot) {
+static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot)
+{
 	return __pte(((pte_basic_t)(pfn) << PTE_RPN_SHIFT) |
-		     pgprot_val(pgprot)); }
-static inline unsigned long pte_pfn(pte_t pte)	{
-	return pte_val(pte) >> PTE_RPN_SHIFT; }
+		     pgprot_val(pgprot));
+}
+
+static inline unsigned long pte_pfn(pte_t pte)
+{
+	return pte_val(pte) >> PTE_RPN_SHIFT;
+}
 
 /* Generic modifiers for PTE bits */
 static inline pte_t pte_wrprotect(pte_t pte)
 {
-	pte_val(pte) &= ~_PAGE_RW;
+	return __pte(pte_val(pte) & ~_PAGE_RW);
+}
+
+static inline pte_t pte_mkclean(pte_t pte)
+{
+	return __pte(pte_val(pte) & ~_PAGE_DIRTY);
+}
+
+static inline pte_t pte_mkold(pte_t pte)
+{
+	return __pte(pte_val(pte) & ~_PAGE_ACCESSED);
+}
+
+static inline pte_t pte_mkwrite(pte_t pte)
+{
+	return __pte(pte_val(pte) | _PAGE_RW);
+}
+
+static inline pte_t pte_mkdirty(pte_t pte)
+{
+	return __pte(pte_val(pte) | _PAGE_DIRTY);
+}
+
+static inline pte_t pte_mkyoung(pte_t pte)
+{
+	return __pte(pte_val(pte) | _PAGE_ACCESSED);
+}
+
+static inline pte_t pte_mkspecial(pte_t pte)
+{
+	return __pte(pte_val(pte) | _PAGE_SPECIAL);
+}
+
+static inline pte_t pte_mkhuge(pte_t pte)
+{
 	return pte;
 }
-static inline pte_t pte_mkclean(pte_t pte) {
-	pte_val(pte) &= ~_PAGE_DIRTY; return pte; }
-static inline pte_t pte_mkold(pte_t pte) {
-	pte_val(pte) &= ~_PAGE_ACCESSED; return pte; }
-static inline pte_t pte_mkwrite(pte_t pte) {
-	pte_val(pte) |= _PAGE_RW; return pte; }
-static inline pte_t pte_mkdirty(pte_t pte) {
-	pte_val(pte) |= _PAGE_DIRTY; return pte; }
-static inline pte_t pte_mkyoung(pte_t pte) {
-	pte_val(pte) |= _PAGE_ACCESSED; return pte; }
-static inline pte_t pte_mkspecial(pte_t pte) {
-	pte_val(pte) |= _PAGE_SPECIAL; return pte; }
-static inline pte_t pte_mkhuge(pte_t pte) {
-	return pte; }
+
 static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
 {
-	pte_val(pte) = (pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot);
-	return pte;
+	return __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot));
 }
 
 
@@ -159,22 +184,45 @@ extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addre
 #define _PAGE_CACHE_CTL	(_PAGE_COHERENT | _PAGE_GUARDED | _PAGE_NO_CACHE | \
 			 _PAGE_WRITETHRU)
 
-#define pgprot_noncached(prot)	  (__pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) | \
-				            _PAGE_NO_CACHE | _PAGE_GUARDED))
+#define pgprot_noncached pgprot_noncached
+static inline pgprot_t pgprot_noncached(pgprot_t prot)
+{
+	return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) |
+			_PAGE_NO_CACHE | _PAGE_GUARDED);
+}
 
-#define pgprot_noncached_wc(prot) (__pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) | \
-				            _PAGE_NO_CACHE))
+#define pgprot_noncached_wc pgprot_noncached_wc
+static inline pgprot_t pgprot_noncached_wc(pgprot_t prot)
+{
+	return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) |
+			_PAGE_NO_CACHE);
+}
 
-#define pgprot_cached(prot)       (__pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) | \
-				            _PAGE_COHERENT))
+#define pgprot_cached pgprot_cached
+static inline pgprot_t pgprot_cached(pgprot_t prot)
+{
+	return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) |
+			_PAGE_COHERENT);
+}
 
-#define pgprot_cached_wthru(prot) (__pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) | \
-				            _PAGE_COHERENT | _PAGE_WRITETHRU))
+#define pgprot_cached_wthru pgprot_cached_wthru
+static inline pgprot_t pgprot_cached_wthru(pgprot_t prot)
+{
+	return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) |
+			_PAGE_COHERENT | _PAGE_WRITETHRU);
+}
 
-#define pgprot_cached_noncoherent(prot) \
-		(__pgprot(pgprot_val(prot) & ~_PAGE_CACHE_CTL))
+#define pgprot_cached_noncoherent pgprot_cached_noncoherent
+static inline pgprot_t pgprot_cached_noncoherent(pgprot_t prot)
+{
+	return __pgprot(pgprot_val(prot) & ~_PAGE_CACHE_CTL);
+}
 
-#define pgprot_writecombine pgprot_noncached_wc
+#define pgprot_writecombine pgprot_writecombine
+static inline pgprot_t pgprot_writecombine(pgprot_t prot)
+{
+	return pgprot_noncached_wc(prot);
+}
 
 struct file;
 extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
index 3140c19c448c..3ce534140390 100644
--- a/arch/powerpc/include/asm/page.h
+++ b/arch/powerpc/include/asm/page.h
@@ -286,8 +286,11 @@ extern long long virt_phys_offset;
 
 /* PTE level */
 typedef struct { pte_basic_t pte; } pte_t;
-#define pte_val(x)	((x).pte)
 #define __pte(x)	((pte_t) { (x) })
+static inline pte_basic_t pte_val(pte_t x)
+{
+	return x.pte;
+}
 
 /* 64k pages additionally define a bigger "real PTE" type that gathers
  * the "second half" part of the PTE for pseudo 64k pages
@@ -329,8 +332,11 @@ typedef struct { unsigned long pgprot; } pgprot_t;
  */
 
 typedef pte_basic_t pte_t;
-#define pte_val(x)	(x)
 #define __pte(x)	(x)
+static inline pte_basic_t pte_val(pte_t pte)
+{
+	return pte;
+}
 
 #if defined(CONFIG_PPC_64K_PAGES) && defined(CONFIG_PPC_STD_MMU_64)
 typedef struct { pte_t pte; unsigned long hidx; } real_pte_t;
diff --git a/arch/powerpc/include/asm/pgtable-book3e.h b/arch/powerpc/include/asm/pgtable-book3e.h
index a3221cff2e31..91325997ba25 100644
--- a/arch/powerpc/include/asm/pgtable-book3e.h
+++ b/arch/powerpc/include/asm/pgtable-book3e.h
@@ -56,30 +56,58 @@ static inline unsigned long pte_pfn(pte_t pte)	{
 	return pte_val(pte) >> PTE_RPN_SHIFT; }
 
 /* Generic modifiers for PTE bits */
-static inline pte_t pte_wrprotect(pte_t pte) {
-	pte_val(pte) &= ~(_PAGE_RW | _PAGE_HWWRITE);
-	pte_val(pte) |= _PAGE_RO; return pte; }
-static inline pte_t pte_mkclean(pte_t pte) {
-	pte_val(pte) &= ~(_PAGE_DIRTY | _PAGE_HWWRITE); return pte; }
-static inline pte_t pte_mkold(pte_t pte) {
-	pte_val(pte) &= ~_PAGE_ACCESSED; return pte; }
-static inline pte_t pte_mkwrite(pte_t pte) {
-	pte_val(pte) &= ~_PAGE_RO;
-	pte_val(pte) |= _PAGE_RW; return pte; }
-static inline pte_t pte_mkdirty(pte_t pte) {
-	pte_val(pte) |= _PAGE_DIRTY; return pte; }
-static inline pte_t pte_mkyoung(pte_t pte) {
-	pte_val(pte) |= _PAGE_ACCESSED; return pte; }
-static inline pte_t pte_mkspecial(pte_t pte) {
-	pte_val(pte) |= _PAGE_SPECIAL; return pte; }
-static inline pte_t pte_mkhuge(pte_t pte) {
-	return pte; }
-static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
+static inline pte_t pte_wrprotect(pte_t pte)
+{
+	pte_basic_t ptev;
+
+	ptev = pte_val(pte) & ~(_PAGE_RW | _PAGE_HWWRITE);
+	ptev |= _PAGE_RO;
+	return __pte(ptev);
+}
+
+static inline pte_t pte_mkclean(pte_t pte)
+{
+	return __pte(pte_val(pte) & ~(_PAGE_DIRTY | _PAGE_HWWRITE));
+}
+
+static inline pte_t pte_mkold(pte_t pte)
+{
+	return __pte(pte_val(pte) & ~_PAGE_ACCESSED);
+}
+
+static inline pte_t pte_mkwrite(pte_t pte)
+{
+	pte_basic_t ptev;
+
+	ptev = pte_val(pte) & ~_PAGE_RO;
+	ptev |= _PAGE_RW;
+	return __pte(ptev);
+}
+
+static inline pte_t pte_mkdirty(pte_t pte)
+{
+	return __pte(pte_val(pte) | _PAGE_DIRTY);
+}
+
+static inline pte_t pte_mkyoung(pte_t pte)
+{
+	return __pte(pte_val(pte) | _PAGE_ACCESSED);
+}
+
+static inline pte_t pte_mkspecial(pte_t pte)
+{
+	return __pte(pte_val(pte) | _PAGE_SPECIAL);
+}
+
+static inline pte_t pte_mkhuge(pte_t pte)
 {
-	pte_val(pte) = (pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot);
 	return pte;
 }
 
+static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
+{
+	return __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot));
+}
 
 /* Insert a PTE, top-level function is out of line. It uses an inline
  * low level function in the respective pgtable-* files

From f281b5d50c87ecca108dcbf8f791bd8923fde3de Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 1 Dec 2015 09:06:35 +0530
Subject: [PATCH 043/149] powerpc/mm: Don't use pmd_val, pud_val and pgd_val as
 lvalue

We convert them to static inline functions here, as we did with pte_val in
the previous patch.

Acked-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/32/pgtable.h |  6 +++-
 arch/powerpc/include/asm/book3s/64/hash-4k.h |  6 +++-
 arch/powerpc/include/asm/book3s/64/pgtable.h | 36 +++++++++++++++-----
 arch/powerpc/include/asm/page.h              | 34 +++++++++++++-----
 arch/powerpc/include/asm/pgalloc-32.h        | 34 +++++++++++++-----
 arch/powerpc/include/asm/pgalloc-64.h        | 17 ++++++---
 arch/powerpc/include/asm/pgtable-ppc32.h     |  7 +++-
 arch/powerpc/include/asm/pgtable-ppc64-4k.h  |  6 +++-
 arch/powerpc/include/asm/pgtable-ppc64.h     | 36 +++++++++++++++-----
 arch/powerpc/mm/40x_mmu.c                    | 10 +++---
 arch/powerpc/mm/pgtable_64.c                 | 19 +++++------
 11 files changed, 154 insertions(+), 57 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h
index 5438c0b6aeec..226f29d39332 100644
--- a/arch/powerpc/include/asm/book3s/32/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/32/pgtable.h
@@ -105,7 +105,11 @@ extern unsigned long ioremap_bot;
 #define pmd_none(pmd)		(!pmd_val(pmd))
 #define	pmd_bad(pmd)		(pmd_val(pmd) & _PMD_BAD)
 #define	pmd_present(pmd)	(pmd_val(pmd) & _PMD_PRESENT_MASK)
-#define	pmd_clear(pmdp)		do { pmd_val(*(pmdp)) = 0; } while (0)
+static inline void pmd_clear(pmd_t *pmdp)
+{
+	*pmdp = __pmd(0);
+}
+
 
 /*
  * When flushing the tlb entry for a page, we also need to flush the hash
diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h
index 15518b620f5a..537eacecf6e9 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-4k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h
@@ -71,9 +71,13 @@
 #define pgd_none(pgd)		(!pgd_val(pgd))
 #define pgd_bad(pgd)		(pgd_val(pgd) == 0)
 #define pgd_present(pgd)	(pgd_val(pgd) != 0)
-#define pgd_clear(pgdp)		(pgd_val(*(pgdp)) = 0)
 #define pgd_page_vaddr(pgd)	(pgd_val(pgd) & ~PGD_MASKED_BITS)
 
+static inline void pgd_clear(pgd_t *pgdp)
+{
+	*pgdp = __pgd(0);
+}
+
 static inline pte_t pgd_pte(pgd_t pgd)
 {
 	return __pte(pgd_val(pgd));
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index f942b27e9c5f..09c6474f89c7 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -236,21 +236,38 @@
 #define PMD_BAD_BITS		(PTE_TABLE_SIZE-1)
 #define PUD_BAD_BITS		(PMD_TABLE_SIZE-1)
 
-#define pmd_set(pmdp, pmdval)	(pmd_val(*(pmdp)) = (pmdval))
+static inline void pmd_set(pmd_t *pmdp, unsigned long val)
+{
+	*pmdp = __pmd(val);
+}
+
+static inline void pmd_clear(pmd_t *pmdp)
+{
+	*pmdp = __pmd(0);
+}
+
+
 #define pmd_none(pmd)		(!pmd_val(pmd))
 #define	pmd_bad(pmd)		(!is_kernel_addr(pmd_val(pmd)) \
 				 || (pmd_val(pmd) & PMD_BAD_BITS))
 #define	pmd_present(pmd)	(!pmd_none(pmd))
-#define	pmd_clear(pmdp)		(pmd_val(*(pmdp)) = 0)
 #define pmd_page_vaddr(pmd)	(pmd_val(pmd) & ~PMD_MASKED_BITS)
 extern struct page *pmd_page(pmd_t pmd);
 
-#define pud_set(pudp, pudval)	(pud_val(*(pudp)) = (pudval))
+static inline void pud_set(pud_t *pudp, unsigned long val)
+{
+	*pudp = __pud(val);
+}
+
+static inline void pud_clear(pud_t *pudp)
+{
+	*pudp = __pud(0);
+}
+
 #define pud_none(pud)		(!pud_val(pud))
 #define	pud_bad(pud)		(!is_kernel_addr(pud_val(pud)) \
 				 || (pud_val(pud) & PUD_BAD_BITS))
 #define pud_present(pud)	(pud_val(pud) != 0)
-#define pud_clear(pudp)		(pud_val(*(pudp)) = 0)
 #define pud_page_vaddr(pud)	(pud_val(pud) & ~PUD_MASKED_BITS)
 
 extern struct page *pud_page(pud_t pud);
@@ -265,8 +282,11 @@ static inline pud_t pte_pud(pte_t pte)
 	return __pud(pte_val(pte));
 }
 #define pud_write(pud)		pte_write(pud_pte(pud))
-#define pgd_set(pgdp, pudp)	({pgd_val(*(pgdp)) = (unsigned long)(pudp);})
 #define pgd_write(pgd)		pte_write(pgd_pte(pgd))
+static inline void pgd_set(pgd_t *pgdp, unsigned long val)
+{
+	*pgdp = __pgd(val);
+}
 
 /*
  * Find an entry in a page-table-directory.  We combine the address region
@@ -588,14 +608,12 @@ static inline pmd_t pmd_mkhuge(pmd_t pmd)
 
 static inline pmd_t pmd_mknotpresent(pmd_t pmd)
 {
-	pmd_val(pmd) &= ~_PAGE_PRESENT;
-	return pmd;
+	return __pmd(pmd_val(pmd) & ~_PAGE_PRESENT);
 }
 
 static inline pmd_t pmd_mksplitting(pmd_t pmd)
 {
-	pmd_val(pmd) |= _PAGE_SPLITTING;
-	return pmd;
+	return __pmd(pmd_val(pmd) | _PAGE_SPLITTING);
 }
 
 #define __HAVE_ARCH_PMD_SAME
diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
index 3ce534140390..5a3e7c643d73 100644
--- a/arch/powerpc/include/asm/page.h
+++ b/arch/powerpc/include/asm/page.h
@@ -304,21 +304,30 @@ typedef struct { pte_t pte; } real_pte_t;
 /* PMD level */
 #ifdef CONFIG_PPC64
 typedef struct { unsigned long pmd; } pmd_t;
-#define pmd_val(x)	((x).pmd)
 #define __pmd(x)	((pmd_t) { (x) })
+static inline unsigned long pmd_val(pmd_t x)
+{
+	return x.pmd;
+}
 
 /* PUD level exusts only on 4k pages */
 #ifndef CONFIG_PPC_64K_PAGES
 typedef struct { unsigned long pud; } pud_t;
-#define pud_val(x)	((x).pud)
 #define __pud(x)	((pud_t) { (x) })
+static inline unsigned long pud_val(pud_t x)
+{
+	return x.pud;
+}
 #endif /* !CONFIG_PPC_64K_PAGES */
 #endif /* CONFIG_PPC64 */
 
 /* PGD level */
 typedef struct { unsigned long pgd; } pgd_t;
-#define pgd_val(x)	((x).pgd)
 #define __pgd(x)	((pgd_t) { (x) })
+static inline unsigned long pgd_val(pgd_t x)
+{
+	return x.pgd;
+}
 
 /* Page protection bits */
 typedef struct { unsigned long pgprot; } pgprot_t;
@@ -347,22 +356,31 @@ typedef pte_t real_pte_t;
 
 #ifdef CONFIG_PPC64
 typedef unsigned long pmd_t;
-#define pmd_val(x)	(x)
 #define __pmd(x)	(x)
+static inline unsigned long pmd_val(pmd_t pmd)
+{
+	return pmd;
+}
 
 #ifndef CONFIG_PPC_64K_PAGES
 typedef unsigned long pud_t;
-#define pud_val(x)	(x)
 #define __pud(x)	(x)
+static inline unsigned long pud_val(pud_t pud)
+{
+	return pud;
+}
 #endif /* !CONFIG_PPC_64K_PAGES */
 #endif /* CONFIG_PPC64 */
 
 typedef unsigned long pgd_t;
-#define pgd_val(x)	(x)
-#define pgprot_val(x)	(x)
+#define __pgd(x)	(x)
+static inline unsigned long pgd_val(pgd_t pgd)
+{
+	return pgd;
+}
 
 typedef unsigned long pgprot_t;
-#define __pgd(x)	(x)
+#define pgprot_val(x)	(x)
 #define __pgprot(x)	(x)
 
 #endif
diff --git a/arch/powerpc/include/asm/pgalloc-32.h b/arch/powerpc/include/asm/pgalloc-32.h
index 842846c1b711..76d6b9e0c8a9 100644
--- a/arch/powerpc/include/asm/pgalloc-32.h
+++ b/arch/powerpc/include/asm/pgalloc-32.h
@@ -21,16 +21,34 @@ extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
 /* #define pgd_populate(mm, pmd, pte)      BUG() */
 
 #ifndef CONFIG_BOOKE
-#define pmd_populate_kernel(mm, pmd, pte)	\
-		(pmd_val(*(pmd)) = __pa(pte) | _PMD_PRESENT)
-#define pmd_populate(mm, pmd, pte)	\
-		(pmd_val(*(pmd)) = (page_to_pfn(pte) << PAGE_SHIFT) | _PMD_PRESENT)
+
+static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp,
+				       pte_t *pte)
+{
+	*pmdp = __pmd(__pa(pte) | _PMD_PRESENT);
+}
+
+static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp,
+				pgtable_t pte_page)
+{
+	*pmdp = __pmd((page_to_pfn(pte_page) << PAGE_SHIFT) | _PMD_PRESENT);
+}
+
 #define pmd_pgtable(pmd) pmd_page(pmd)
 #else
-#define pmd_populate_kernel(mm, pmd, pte)	\
-		(pmd_val(*(pmd)) = (unsigned long)pte | _PMD_PRESENT)
-#define pmd_populate(mm, pmd, pte)	\
-		(pmd_val(*(pmd)) = (unsigned long)lowmem_page_address(pte) | _PMD_PRESENT)
+
+static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp,
+				       pte_t *pte)
+{
+	*pmdp = __pmd((unsigned long)pte | _PMD_PRESENT);
+}
+
+static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp,
+				pgtable_t pte_page)
+{
+	*pmdp = __pmd((unsigned long)lowmem_page_address(pte_page) | _PMD_PRESENT);
+}
+
 #define pmd_pgtable(pmd) pmd_page(pmd)
 #endif
 
diff --git a/arch/powerpc/include/asm/pgalloc-64.h b/arch/powerpc/include/asm/pgalloc-64.h
index 4b0be20fcbfd..d8cde71f6734 100644
--- a/arch/powerpc/include/asm/pgalloc-64.h
+++ b/arch/powerpc/include/asm/pgalloc-64.h
@@ -53,7 +53,7 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 
 #ifndef CONFIG_PPC_64K_PAGES
 
-#define pgd_populate(MM, PGD, PUD)	pgd_set(PGD, PUD)
+#define pgd_populate(MM, PGD, PUD)	pgd_set(PGD, (unsigned long)PUD)
 
 static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
 {
@@ -71,9 +71,18 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
 	pud_set(pud, (unsigned long)pmd);
 }
 
-#define pmd_populate(mm, pmd, pte_page) \
-	pmd_populate_kernel(mm, pmd, page_address(pte_page))
-#define pmd_populate_kernel(mm, pmd, pte) pmd_set(pmd, (unsigned long)(pte))
+static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd,
+				       pte_t *pte)
+{
+	pmd_set(pmd, (unsigned long)pte);
+}
+
+static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
+				pgtable_t pte_page)
+{
+	pmd_set(pmd, (unsigned long)page_address(pte_page));
+}
+
 #define pmd_pgtable(pmd) pmd_page(pmd)
 
 static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm,
diff --git a/arch/powerpc/include/asm/pgtable-ppc32.h b/arch/powerpc/include/asm/pgtable-ppc32.h
index aac6547b0823..fbb23c54b998 100644
--- a/arch/powerpc/include/asm/pgtable-ppc32.h
+++ b/arch/powerpc/include/asm/pgtable-ppc32.h
@@ -128,7 +128,12 @@ extern int icache_44x_need_flush;
 #define pmd_none(pmd)		(!pmd_val(pmd))
 #define	pmd_bad(pmd)		(pmd_val(pmd) & _PMD_BAD)
 #define	pmd_present(pmd)	(pmd_val(pmd) & _PMD_PRESENT_MASK)
-#define	pmd_clear(pmdp)		do { pmd_val(*(pmdp)) = 0; } while (0)
+static inline void pmd_clear(pmd_t *pmdp)
+{
+	*pmdp = __pmd(0);
+}
+
+
 
 /*
  * When flushing the tlb entry for a page, we also need to flush the hash
diff --git a/arch/powerpc/include/asm/pgtable-ppc64-4k.h b/arch/powerpc/include/asm/pgtable-ppc64-4k.h
index 132ee1d482c2..7bace25d6b62 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64-4k.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64-4k.h
@@ -55,11 +55,15 @@
 #define pgd_none(pgd)		(!pgd_val(pgd))
 #define pgd_bad(pgd)		(pgd_val(pgd) == 0)
 #define pgd_present(pgd)	(pgd_val(pgd) != 0)
-#define pgd_clear(pgdp)		(pgd_val(*(pgdp)) = 0)
 #define pgd_page_vaddr(pgd)	(pgd_val(pgd) & ~PGD_MASKED_BITS)
 
 #ifndef __ASSEMBLY__
 
+static inline void pgd_clear(pgd_t *pgdp)
+{
+	*pgdp = __pgd(0);
+}
+
 static inline pte_t pgd_pte(pgd_t pgd)
 {
 	return __pte(pgd_val(pgd));
diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h
index 1ef0fea32e1e..6be203d43fd1 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -144,21 +144,37 @@
 #define PMD_BAD_BITS		(PTE_TABLE_SIZE-1)
 #define PUD_BAD_BITS		(PMD_TABLE_SIZE-1)
 
-#define pmd_set(pmdp, pmdval) 	(pmd_val(*(pmdp)) = (pmdval))
+static inline void pmd_set(pmd_t *pmdp, unsigned long val)
+{
+	*pmdp = __pmd(val);
+}
+
+static inline void pmd_clear(pmd_t *pmdp)
+{
+	*pmdp = __pmd(0);
+}
+
 #define pmd_none(pmd)		(!pmd_val(pmd))
 #define	pmd_bad(pmd)		(!is_kernel_addr(pmd_val(pmd)) \
 				 || (pmd_val(pmd) & PMD_BAD_BITS))
 #define	pmd_present(pmd)	(!pmd_none(pmd))
-#define	pmd_clear(pmdp)		(pmd_val(*(pmdp)) = 0)
 #define pmd_page_vaddr(pmd)	(pmd_val(pmd) & ~PMD_MASKED_BITS)
 extern struct page *pmd_page(pmd_t pmd);
 
-#define pud_set(pudp, pudval)	(pud_val(*(pudp)) = (pudval))
+static inline void pud_set(pud_t *pudp, unsigned long val)
+{
+	*pudp = __pud(val);
+}
+
+static inline void pud_clear(pud_t *pudp)
+{
+	*pudp = __pud(0);
+}
+
 #define pud_none(pud)		(!pud_val(pud))
 #define	pud_bad(pud)		(!is_kernel_addr(pud_val(pud)) \
 				 || (pud_val(pud) & PUD_BAD_BITS))
 #define pud_present(pud)	(pud_val(pud) != 0)
-#define pud_clear(pudp)		(pud_val(*(pudp)) = 0)
 #define pud_page_vaddr(pud)	(pud_val(pud) & ~PUD_MASKED_BITS)
 
 extern struct page *pud_page(pud_t pud);
@@ -173,9 +189,13 @@ static inline pud_t pte_pud(pte_t pte)
 	return __pud(pte_val(pte));
 }
 #define pud_write(pud)		pte_write(pud_pte(pud))
-#define pgd_set(pgdp, pudp)	({pgd_val(*(pgdp)) = (unsigned long)(pudp);})
 #define pgd_write(pgd)		pte_write(pgd_pte(pgd))
 
+static inline void pgd_set(pgd_t *pgdp, unsigned long val)
+{
+	*pgdp = __pgd(val);
+}
+
 /*
  * Find an entry in a page-table-directory.  We combine the address region
  * (the high order N bits) and the pgd portion of the address.
@@ -528,14 +548,12 @@ static inline pmd_t pmd_mkhuge(pmd_t pmd)
 
 static inline pmd_t pmd_mknotpresent(pmd_t pmd)
 {
-	pmd_val(pmd) &= ~_PAGE_PRESENT;
-	return pmd;
+	return __pmd(pmd_val(pmd) & ~_PAGE_PRESENT);
 }
 
 static inline pmd_t pmd_mksplitting(pmd_t pmd)
 {
-	pmd_val(pmd) |= _PAGE_SPLITTING;
-	return pmd;
+	return __pmd(pmd_val(pmd) | _PAGE_SPLITTING);
 }
 
 #define __HAVE_ARCH_PMD_SAME
diff --git a/arch/powerpc/mm/40x_mmu.c b/arch/powerpc/mm/40x_mmu.c
index 5810967511d4..31a5d42df8c9 100644
--- a/arch/powerpc/mm/40x_mmu.c
+++ b/arch/powerpc/mm/40x_mmu.c
@@ -110,10 +110,10 @@ unsigned long __init mmu_mapin_ram(unsigned long top)
 		unsigned long val = p | _PMD_SIZE_16M | _PAGE_EXEC | _PAGE_HWWRITE;
 
 		pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v);
-		pmd_val(*pmdp++) = val;
-		pmd_val(*pmdp++) = val;
-		pmd_val(*pmdp++) = val;
-		pmd_val(*pmdp++) = val;
+		*pmdp++ = __pmd(val);
+		*pmdp++ = __pmd(val);
+		*pmdp++ = __pmd(val);
+		*pmdp++ = __pmd(val);
 
 		v += LARGE_PAGE_SIZE_16M;
 		p += LARGE_PAGE_SIZE_16M;
@@ -125,7 +125,7 @@ unsigned long __init mmu_mapin_ram(unsigned long top)
 		unsigned long val = p | _PMD_SIZE_4M | _PAGE_EXEC | _PAGE_HWWRITE;
 
 		pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v);
-		pmd_val(*pmdp) = val;
+		*pmdp = __pmd(val);
 
 		v += LARGE_PAGE_SIZE_4M;
 		p += LARGE_PAGE_SIZE_4M;
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index e92cb2146b18..d692ae31cfc7 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -759,22 +759,20 @@ void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
 
 static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
 {
-	pmd_val(pmd) |= pgprot_val(pgprot);
-	return pmd;
+	return __pmd(pmd_val(pmd) | pgprot_val(pgprot));
 }
 
 pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
 {
-	pmd_t pmd;
+	unsigned long pmdv;
 	/*
 	 * For a valid pte, we would have _PAGE_PRESENT always
 	 * set. We use this to check THP page at pmd level.
 	 * leaf pte for huge page, bottom two bits != 00
 	 */
-	pmd_val(pmd) = pfn << PTE_RPN_SHIFT;
-	pmd_val(pmd) |= _PAGE_THP_HUGE;
-	pmd = pmd_set_protbits(pmd, pgprot);
-	return pmd;
+	pmdv = pfn << PTE_RPN_SHIFT;
+	pmdv |= _PAGE_THP_HUGE;
+	return pmd_set_protbits(__pmd(pmdv), pgprot);
 }
 
 pmd_t mk_pmd(struct page *page, pgprot_t pgprot)
@@ -784,10 +782,11 @@ pmd_t mk_pmd(struct page *page, pgprot_t pgprot)
 
 pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
 {
+	unsigned long pmdv;
 
-	pmd_val(pmd) &= _HPAGE_CHG_MASK;
-	pmd = pmd_set_protbits(pmd, newprot);
-	return pmd;
+	pmdv = pmd_val(pmd);
+	pmdv &= _HPAGE_CHG_MASK;
+	return pmd_set_protbits(__pmd(pmdv), newprot);
 }
 
 /*

From 371352ca0e7f3fad8406933e37c965d5a44365d9 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 1 Dec 2015 09:06:36 +0530
Subject: [PATCH 044/149] powerpc/mm: Move hash64 PTE bits from
 book3s/64/pgtable.h to hash.h

This keeps the hash64-related PTE bits together in one header, which
makes the code easier to follow.

Acked-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/64/hash.h    | 448 ++++++++++++++++++-
 arch/powerpc/include/asm/book3s/64/pgtable.h | 445 +-----------------
 arch/powerpc/include/asm/pgtable.h           |   6 -
 3 files changed, 448 insertions(+), 451 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h
index 7deb5063ff8c..447b212649c8 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -2,6 +2,61 @@
 #define _ASM_POWERPC_BOOK3S_64_HASH_H
 #ifdef __KERNEL__
 
+#ifdef CONFIG_PPC_64K_PAGES
+#include <asm/book3s/64/hash-64k.h>
+#else
+#include <asm/book3s/64/hash-4k.h>
+#endif
+
+/*
+ * Size of EA range mapped by our pagetables.
+ */
+#define PGTABLE_EADDR_SIZE	(PTE_INDEX_SIZE + PMD_INDEX_SIZE + \
+				 PUD_INDEX_SIZE + PGD_INDEX_SIZE + PAGE_SHIFT)
+#define PGTABLE_RANGE		(ASM_CONST(1) << PGTABLE_EADDR_SIZE)
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define PMD_CACHE_INDEX	(PMD_INDEX_SIZE + 1)
+#else
+#define PMD_CACHE_INDEX	PMD_INDEX_SIZE
+#endif
+/*
+ * Define the address range of the kernel non-linear virtual area
+ */
+#define KERN_VIRT_START ASM_CONST(0xD000000000000000)
+#define KERN_VIRT_SIZE	ASM_CONST(0x0000100000000000)
+
+/*
+ * The vmalloc space starts at the beginning of that region, and
+ * occupies half of it on hash CPUs and a quarter of it on Book3E
+ * (we keep a quarter for the virtual memmap)
+ */
+#define VMALLOC_START	KERN_VIRT_START
+#define VMALLOC_SIZE	(KERN_VIRT_SIZE >> 1)
+#define VMALLOC_END	(VMALLOC_START + VMALLOC_SIZE)
+
+/*
+ * Region IDs
+ */
+#define REGION_SHIFT		60UL
+#define REGION_MASK		(0xfUL << REGION_SHIFT)
+#define REGION_ID(ea)		(((unsigned long)(ea)) >> REGION_SHIFT)
+
+#define VMALLOC_REGION_ID	(REGION_ID(VMALLOC_START))
+#define KERNEL_REGION_ID	(REGION_ID(PAGE_OFFSET))
+#define VMEMMAP_REGION_ID	(0xfUL)	/* Server only */
+#define USER_REGION_ID		(0UL)
+
+/*
+ * Defines the address of the vmemmap area, in its own region on
+ * hash table CPUs.
+ */
+#define VMEMMAP_BASE		(VMEMMAP_REGION_ID << REGION_SHIFT)
+
+#ifdef CONFIG_PPC_MM_SLICES
+#define HAVE_ARCH_UNMAPPED_AREA
+#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
+#endif /* CONFIG_PPC_MM_SLICES */
 /*
  * Common bits between 4K and 64K pages in a linux-style PTE.
  * These match the bits in the (hardware-defined) PowerPC PTE as closely
@@ -46,11 +101,398 @@
 /* Hash table based platforms need atomic updates of the linux PTE */
 #define PTE_ATOMIC_UPDATES	1
 
-#ifdef CONFIG_PPC_64K_PAGES
-#include <asm/book3s/64/hash-64k.h>
+/*
+ * THP pages can't be special, so reuse the _PAGE_SPECIAL bit for splitting.
+ */
+#define _PAGE_SPLITTING _PAGE_SPECIAL
+
+/*
+ * We need to differentiate between explicit huge page and THP huge
+ * page, since THP huge page also need to track real subpage details
+ */
+#define _PAGE_THP_HUGE  _PAGE_4K_PFN
+
+/*
+ * set of bits not changed in pmd_modify.
+ */
+#define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS |		\
+			 _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_SPLITTING | \
+			 _PAGE_THP_HUGE)
+#define _PTE_NONE_MASK	_PAGE_HPTEFLAGS
+/*
+ * The mask covered by the RPN must be a ULL on 32-bit platforms with
+ * 64-bit PTEs
+ */
+#define PTE_RPN_MASK	(~((1UL << PTE_RPN_SHIFT) - 1))
+/*
+ * _PAGE_CHG_MASK masks of bits that are to be preserved across
+ * pgprot changes
+ */
+#define _PAGE_CHG_MASK	(PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \
+			 _PAGE_ACCESSED | _PAGE_SPECIAL)
+/*
+ * Mask of bits returned by pte_pgprot()
+ */
+#define PAGE_PROT_BITS	(_PAGE_GUARDED | _PAGE_COHERENT | _PAGE_NO_CACHE | \
+			 _PAGE_WRITETHRU | _PAGE_4K_PFN | \
+			 _PAGE_USER | _PAGE_ACCESSED |  \
+			 _PAGE_RW |  _PAGE_DIRTY | _PAGE_EXEC)
+/*
+ * We define 2 sets of base prot bits, one for basic pages (ie,
+ * cacheable kernel and user pages) and one for non cacheable
+ * pages. We always set _PAGE_COHERENT when SMP is enabled or
+ * the processor might need it for DMA coherency.
+ */
+#define _PAGE_BASE_NC	(_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_PSIZE)
+#define _PAGE_BASE	(_PAGE_BASE_NC | _PAGE_COHERENT)
+
+/* Permission masks used to generate the __P and __S table,
+ *
+ * Note:__pgprot is defined in arch/powerpc/include/asm/page.h
+ *
+ * Write permissions imply read permissions for now (we could make write-only
+ * pages on BookE but we don't bother for now). Execute permission control is
+ * possible on platforms that define _PAGE_EXEC
+ *
+ * Note due to the way vm flags are laid out, the bits are XWR
+ */
+#define PAGE_NONE	__pgprot(_PAGE_BASE)
+#define PAGE_SHARED	__pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW)
+#define PAGE_SHARED_X	__pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW | \
+				 _PAGE_EXEC)
+#define PAGE_COPY	__pgprot(_PAGE_BASE | _PAGE_USER )
+#define PAGE_COPY_X	__pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC)
+#define PAGE_READONLY	__pgprot(_PAGE_BASE | _PAGE_USER )
+#define PAGE_READONLY_X	__pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC)
+
+#define __P000	PAGE_NONE
+#define __P001	PAGE_READONLY
+#define __P010	PAGE_COPY
+#define __P011	PAGE_COPY
+#define __P100	PAGE_READONLY_X
+#define __P101	PAGE_READONLY_X
+#define __P110	PAGE_COPY_X
+#define __P111	PAGE_COPY_X
+
+#define __S000	PAGE_NONE
+#define __S001	PAGE_READONLY
+#define __S010	PAGE_SHARED
+#define __S011	PAGE_SHARED
+#define __S100	PAGE_READONLY_X
+#define __S101	PAGE_READONLY_X
+#define __S110	PAGE_SHARED_X
+#define __S111	PAGE_SHARED_X
+
+/* Permission masks used for kernel mappings */
+#define PAGE_KERNEL	__pgprot(_PAGE_BASE | _PAGE_KERNEL_RW)
+#define PAGE_KERNEL_NC	__pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \
+				 _PAGE_NO_CACHE)
+#define PAGE_KERNEL_NCG	__pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \
+				 _PAGE_NO_CACHE | _PAGE_GUARDED)
+#define PAGE_KERNEL_X	__pgprot(_PAGE_BASE | _PAGE_KERNEL_RWX)
+#define PAGE_KERNEL_RO	__pgprot(_PAGE_BASE | _PAGE_KERNEL_RO)
+#define PAGE_KERNEL_ROX	__pgprot(_PAGE_BASE | _PAGE_KERNEL_ROX)
+
+/* Protection used for kernel text. We want the debuggers to be able to
+ * set breakpoints anywhere, so don't write protect the kernel text
+ * on platforms where such control is possible.
+ */
+#if defined(CONFIG_KGDB) || defined(CONFIG_XMON) || defined(CONFIG_BDI_SWITCH) ||\
+	defined(CONFIG_KPROBES) || defined(CONFIG_DYNAMIC_FTRACE)
+#define PAGE_KERNEL_TEXT	PAGE_KERNEL_X
 #else
-#include <asm/book3s/64/hash-4k.h>
+#define PAGE_KERNEL_TEXT	PAGE_KERNEL_ROX
 #endif
 
+/* Make modules code happy. We don't set RO yet */
+#define PAGE_KERNEL_EXEC	PAGE_KERNEL_X
+#define PAGE_AGP		(PAGE_KERNEL_NC)
+
+#define PMD_BAD_BITS		(PTE_TABLE_SIZE-1)
+#define PUD_BAD_BITS		(PMD_TABLE_SIZE-1)
+/*
+ * We save the slot number & secondary bit in the second half of the
+ * PTE page. We use the 8 bytes per each pte entry.
+ */
+#define PTE_PAGE_HIDX_OFFSET (PTRS_PER_PTE * 8)
+
+#ifndef __ASSEMBLY__
+#define	pmd_bad(pmd)		(!is_kernel_addr(pmd_val(pmd)) \
+				 || (pmd_val(pmd) & PMD_BAD_BITS))
+#define pmd_page_vaddr(pmd)	(pmd_val(pmd) & ~PMD_MASKED_BITS)
+
+#define	pud_bad(pud)		(!is_kernel_addr(pud_val(pud)) \
+				 || (pud_val(pud) & PUD_BAD_BITS))
+#define pud_page_vaddr(pud)	(pud_val(pud) & ~PUD_MASKED_BITS)
+
+#define pgd_index(address) (((address) >> (PGDIR_SHIFT)) & (PTRS_PER_PGD - 1))
+#define pmd_index(address) (((address) >> (PMD_SHIFT)) & (PTRS_PER_PMD - 1))
+#define pte_index(address) (((address) >> (PAGE_SHIFT)) & (PTRS_PER_PTE - 1))
+
+extern void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
+			    pte_t *ptep, unsigned long pte, int huge);
+extern unsigned long pmd_hugepage_update(struct mm_struct *mm,
+					 unsigned long addr,
+					 pmd_t *pmdp,
+					 unsigned long clr,
+					 unsigned long set);
+/* Atomic PTE updates */
+static inline unsigned long pte_update(struct mm_struct *mm,
+				       unsigned long addr,
+				       pte_t *ptep, unsigned long clr,
+				       unsigned long set,
+				       int huge)
+{
+	unsigned long old, tmp;
+
+	__asm__ __volatile__(
+	"1:	ldarx	%0,0,%3		# pte_update\n\
+	andi.	%1,%0,%6\n\
+	bne-	1b \n\
+	andc	%1,%0,%4 \n\
+	or	%1,%1,%7\n\
+	stdcx.	%1,0,%3 \n\
+	bne-	1b"
+	: "=&r" (old), "=&r" (tmp), "=m" (*ptep)
+	: "r" (ptep), "r" (clr), "m" (*ptep), "i" (_PAGE_BUSY), "r" (set)
+	: "cc" );
+	/* huge pages use the old page table lock */
+	if (!huge)
+		assert_pte_locked(mm, addr);
+
+	if (old & _PAGE_HASHPTE)
+		hpte_need_flush(mm, addr, ptep, old, huge);
+
+	return old;
+}
+
+static inline int __ptep_test_and_clear_young(struct mm_struct *mm,
+					      unsigned long addr, pte_t *ptep)
+{
+	unsigned long old;
+
+	if ((pte_val(*ptep) & (_PAGE_ACCESSED | _PAGE_HASHPTE)) == 0)
+		return 0;
+	old = pte_update(mm, addr, ptep, _PAGE_ACCESSED, 0, 0);
+	return (old & _PAGE_ACCESSED) != 0;
+}
+#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
+#define ptep_test_and_clear_young(__vma, __addr, __ptep)		   \
+({									   \
+	int __r;							   \
+	__r = __ptep_test_and_clear_young((__vma)->vm_mm, __addr, __ptep); \
+	__r;								   \
+})
+
+#define __HAVE_ARCH_PTEP_SET_WRPROTECT
+static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr,
+				      pte_t *ptep)
+{
+
+	if ((pte_val(*ptep) & _PAGE_RW) == 0)
+		return;
+
+	pte_update(mm, addr, ptep, _PAGE_RW, 0, 0);
+}
+
+static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
+					   unsigned long addr, pte_t *ptep)
+{
+	if ((pte_val(*ptep) & _PAGE_RW) == 0)
+		return;
+
+	pte_update(mm, addr, ptep, _PAGE_RW, 0, 1);
+}
+
+/*
+ * We currently remove entries from the hashtable regardless of whether
+ * the entry was young or dirty. The generic routines only flush if the
+ * entry was young or dirty which is not good enough.
+ *
+ * We should be more intelligent about this but for the moment we override
+ * these functions and force a tlb flush unconditionally
+ */
+#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
+#define ptep_clear_flush_young(__vma, __address, __ptep)		\
+({									\
+	int __young = __ptep_test_and_clear_young((__vma)->vm_mm, __address, \
+						  __ptep);		\
+	__young;							\
+})
+
+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
+static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
+				       unsigned long addr, pte_t *ptep)
+{
+	unsigned long old = pte_update(mm, addr, ptep, ~0UL, 0, 0);
+	return __pte(old);
+}
+
+static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
+			     pte_t * ptep)
+{
+	pte_update(mm, addr, ptep, ~0UL, 0, 0);
+}
+
+
+/* Set the dirty and/or accessed bits atomically in a linux PTE, this
+ * function doesn't need to flush the hash entry
+ */
+static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry)
+{
+	unsigned long bits = pte_val(entry) &
+		(_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC);
+
+	unsigned long old, tmp;
+
+	__asm__ __volatile__(
+	"1:	ldarx	%0,0,%4\n\
+		andi.	%1,%0,%6\n\
+		bne-	1b \n\
+		or	%0,%3,%0\n\
+		stdcx.	%0,0,%4\n\
+		bne-	1b"
+	:"=&r" (old), "=&r" (tmp), "=m" (*ptep)
+	:"r" (bits), "r" (ptep), "m" (*ptep), "i" (_PAGE_BUSY)
+	:"cc");
+}
+
+#define __HAVE_ARCH_PTE_SAME
+#define pte_same(A,B)	(((pte_val(A) ^ pte_val(B)) & ~_PAGE_HPTEFLAGS) == 0)
+
+static inline char *get_hpte_slot_array(pmd_t *pmdp)
+{
+	/*
+	 * The hpte hindex is stored in the pgtable whose address is in the
+	 * second half of the PMD
+	 *
+	 * Order this load with the test for pmd_trans_huge in the caller
+	 */
+	smp_rmb();
+	return *(char **)(pmdp + PTRS_PER_PMD);
+
+
+}
+/*
+ * The linux hugepage PMD now include the pmd entries followed by the address
+ * to the stashed pgtable_t. The stashed pgtable_t contains the hpte bits.
+ * [ 1 bit secondary | 3 bit hidx | 1 bit valid | 000]. We use one byte per
+ * each HPTE entry. With 16MB hugepage and 64K HPTE we need 256 entries and
+ * with 4K HPTE we need 4096 entries. Both will fit in a 4K pgtable_t.
+ *
+ * The last three bits are intentionally left to zero. This memory location
+ * are also used as normal page PTE pointers. So if we have any pointers
+ * left around while we collapse a hugepage, we need to make sure
+ * _PAGE_PRESENT bit of that is zero when we look at them
+ */
+static inline unsigned int hpte_valid(unsigned char *hpte_slot_array, int index)
+{
+	return (hpte_slot_array[index] >> 3) & 0x1;
+}
+
+static inline unsigned int hpte_hash_index(unsigned char *hpte_slot_array,
+					   int index)
+{
+	return hpte_slot_array[index] >> 4;
+}
+
+static inline void mark_hpte_slot_valid(unsigned char *hpte_slot_array,
+					unsigned int index, unsigned int hidx)
+{
+	hpte_slot_array[index] = hidx << 4 | 0x1 << 3;
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/*
+ *
+ * For core kernel code by design pmd_trans_huge is never run on any hugetlbfs
+ * page. The hugetlbfs page table walking and mangling paths are totally
+ * separated from the core VM paths and they're differentiated by
+ *  VM_HUGETLB being set on vm_flags well before any pmd_trans_huge could run.
+ *
+ * pmd_trans_huge() is defined as false at build time if
+ * CONFIG_TRANSPARENT_HUGEPAGE=n to optimize away code blocks at build
+ * time in such case.
+ *
+ * For ppc64 we need to differentiate explicit hugepages from THP, because
+ * for THP we also track the subpage details at the pmd level. We don't do
+ * that for explicit huge pages.
+ *
+ */
+static inline int pmd_trans_huge(pmd_t pmd)
+{
+	/*
+	 * leaf pte for huge page, bottom two bits != 00
+	 */
+	return (pmd_val(pmd) & 0x3) && (pmd_val(pmd) & _PAGE_THP_HUGE);
+}
+
+static inline int pmd_trans_splitting(pmd_t pmd)
+{
+	if (pmd_trans_huge(pmd))
+		return pmd_val(pmd) & _PAGE_SPLITTING;
+	return 0;
+}
+
+#endif
+static inline int pmd_large(pmd_t pmd)
+{
+	/*
+	 * leaf pte for huge page, bottom two bits != 00
+	 */
+	return ((pmd_val(pmd) & 0x3) != 0x0);
+}
+
+static inline pmd_t pmd_mknotpresent(pmd_t pmd)
+{
+	return __pmd(pmd_val(pmd) & ~_PAGE_PRESENT);
+}
+
+static inline pmd_t pmd_mksplitting(pmd_t pmd)
+{
+	return __pmd(pmd_val(pmd) | _PAGE_SPLITTING);
+}
+
+#define __HAVE_ARCH_PMD_SAME
+static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
+{
+	return (((pmd_val(pmd_a) ^ pmd_val(pmd_b)) & ~_PAGE_HPTEFLAGS) == 0);
+}
+
+static inline int __pmdp_test_and_clear_young(struct mm_struct *mm,
+					      unsigned long addr, pmd_t *pmdp)
+{
+	unsigned long old;
+
+	if ((pmd_val(*pmdp) & (_PAGE_ACCESSED | _PAGE_HASHPTE)) == 0)
+		return 0;
+	old = pmd_hugepage_update(mm, addr, pmdp, _PAGE_ACCESSED, 0);
+	return ((old & _PAGE_ACCESSED) != 0);
+}
+
+#define __HAVE_ARCH_PMDP_SET_WRPROTECT
+static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
+				      pmd_t *pmdp)
+{
+
+	if ((pmd_val(*pmdp) & _PAGE_RW) == 0)
+		return;
+
+	pmd_hugepage_update(mm, addr, pmdp, _PAGE_RW, 0);
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+extern void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
+				   pmd_t *pmdp, unsigned long old_pmd);
+#else
+static inline void hpte_do_hugepage_flush(struct mm_struct *mm,
+					  unsigned long addr, pmd_t *pmdp,
+					  unsigned long old_pmd)
+{
+	WARN(1, "%s called with THP disabled\n", __func__);
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+#endif /* !__ASSEMBLY__ */
 #endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_BOOK3S_64_HASH_H */
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 09c6474f89c7..aac630b4a15e 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -8,32 +8,6 @@
 #include <asm/book3s/64/hash.h>
 #include <asm/barrier.h>
 
-
-/*
- * Size of EA range mapped by our pagetables.
- */
-#define PGTABLE_EADDR_SIZE (PTE_INDEX_SIZE + PMD_INDEX_SIZE + \
-			    PUD_INDEX_SIZE + PGD_INDEX_SIZE + PAGE_SHIFT)
-#define PGTABLE_RANGE (ASM_CONST(1) << PGTABLE_EADDR_SIZE)
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define PMD_CACHE_INDEX	(PMD_INDEX_SIZE + 1)
-#else
-#define PMD_CACHE_INDEX	PMD_INDEX_SIZE
-#endif
-/*
- * Define the address range of the kernel non-linear virtual area
- */
-#define KERN_VIRT_START ASM_CONST(0xD000000000000000)
-#define KERN_VIRT_SIZE	ASM_CONST(0x0000100000000000)
-/*
- * The vmalloc space starts at the beginning of that region, and
- * occupies half of it on hash CPUs and a quarter of it on Book3E
- * (we keep a quarter for the virtual memmap)
- */
-#define VMALLOC_START	KERN_VIRT_START
-#define VMALLOC_SIZE	(KERN_VIRT_SIZE >> 1)
-#define VMALLOC_END	(VMALLOC_START + VMALLOC_SIZE)
 /*
  * The second half of the kernel virtual space is used for IO mappings,
  * it's itself carved into the PIO region (ISA and PHB IO space) and
@@ -52,146 +26,9 @@
 #define IOREMAP_BASE	(PHB_IO_END)
 #define IOREMAP_END	(KERN_VIRT_START + KERN_VIRT_SIZE)
 
-/*
- * Region IDs
- */
-#define REGION_SHIFT		60UL
-#define REGION_MASK		(0xfUL << REGION_SHIFT)
-#define REGION_ID(ea)		(((unsigned long)(ea)) >> REGION_SHIFT)
-
-#define VMALLOC_REGION_ID	(REGION_ID(VMALLOC_START))
-#define KERNEL_REGION_ID	(REGION_ID(PAGE_OFFSET))
-#define VMEMMAP_REGION_ID	(0xfUL)	/* Server only */
-#define USER_REGION_ID		(0UL)
-
-/*
- * Defines the address of the vmemap area, in its own region on
- * hash table CPUs.
- */
-#define VMEMMAP_BASE		(VMEMMAP_REGION_ID << REGION_SHIFT)
 #define vmemmap			((struct page *)VMEMMAP_BASE)
 
-
-#ifdef CONFIG_PPC_MM_SLICES
-#define HAVE_ARCH_UNMAPPED_AREA
-#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
-#endif /* CONFIG_PPC_MM_SLICES */
-
-/*
- * THP pages can't be special. So use the _PAGE_SPECIAL
- */
-#define _PAGE_SPLITTING _PAGE_SPECIAL
-
-/*
- * We need to differentiate between explicit huge page and THP huge
- * page, since THP huge page also need to track real subpage details
- */
-#define _PAGE_THP_HUGE  _PAGE_4K_PFN
-
-/*
- * set of bits not changed in pmd_modify.
- */
-#define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS |		\
-			 _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_SPLITTING | \
-			 _PAGE_THP_HUGE)
-#define _PTE_NONE_MASK	_PAGE_HPTEFLAGS
-/*
- * The mask convered by the RPN must be a ULL on 32-bit platforms with
- * 64-bit PTEs
- */
-#define PTE_RPN_MASK	(~((1UL << PTE_RPN_SHIFT) - 1))
-/*
- * _PAGE_CHG_MASK masks of bits that are to be preserved across
- * pgprot changes
- */
-#define _PAGE_CHG_MASK	(PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \
-			 _PAGE_ACCESSED | _PAGE_SPECIAL)
-/*
- * Mask of bits returned by pte_pgprot()
- */
-#define PAGE_PROT_BITS	(_PAGE_GUARDED | _PAGE_COHERENT | _PAGE_NO_CACHE | \
-			 _PAGE_WRITETHRU | _PAGE_4K_PFN | \
-			 _PAGE_USER | _PAGE_ACCESSED |  \
-			 _PAGE_RW |  _PAGE_DIRTY | _PAGE_EXEC)
-/*
- * We define 2 sets of base prot bits, one for basic pages (ie,
- * cacheable kernel and user pages) and one for non cacheable
- * pages. We always set _PAGE_COHERENT when SMP is enabled or
- * the processor might need it for DMA coherency.
- */
-#define _PAGE_BASE_NC	(_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_PSIZE)
-#define _PAGE_BASE	(_PAGE_BASE_NC | _PAGE_COHERENT)
-
-/* Permission masks used to generate the __P and __S table,
- *
- * Note:__pgprot is defined in arch/powerpc/include/asm/page.h
- *
- * Write permissions imply read permissions for now (we could make write-only
- * pages on BookE but we don't bother for now). Execute permission control is
- * possible on platforms that define _PAGE_EXEC
- *
- * Note due to the way vm flags are laid out, the bits are XWR
- */
-#define PAGE_NONE	__pgprot(_PAGE_BASE)
-#define PAGE_SHARED	__pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW)
-#define PAGE_SHARED_X	__pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW | \
-				 _PAGE_EXEC)
-#define PAGE_COPY	__pgprot(_PAGE_BASE | _PAGE_USER )
-#define PAGE_COPY_X	__pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC)
-#define PAGE_READONLY	__pgprot(_PAGE_BASE | _PAGE_USER )
-#define PAGE_READONLY_X	__pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC)
-
-#define __P000	PAGE_NONE
-#define __P001	PAGE_READONLY
-#define __P010	PAGE_COPY
-#define __P011	PAGE_COPY
-#define __P100	PAGE_READONLY_X
-#define __P101	PAGE_READONLY_X
-#define __P110	PAGE_COPY_X
-#define __P111	PAGE_COPY_X
-
-#define __S000	PAGE_NONE
-#define __S001	PAGE_READONLY
-#define __S010	PAGE_SHARED
-#define __S011	PAGE_SHARED
-#define __S100	PAGE_READONLY_X
-#define __S101	PAGE_READONLY_X
-#define __S110	PAGE_SHARED_X
-#define __S111	PAGE_SHARED_X
-
-/* Permission masks used for kernel mappings */
-#define PAGE_KERNEL	__pgprot(_PAGE_BASE | _PAGE_KERNEL_RW)
-#define PAGE_KERNEL_NC	__pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \
-				 _PAGE_NO_CACHE)
-#define PAGE_KERNEL_NCG	__pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \
-				 _PAGE_NO_CACHE | _PAGE_GUARDED)
-#define PAGE_KERNEL_X	__pgprot(_PAGE_BASE | _PAGE_KERNEL_RWX)
-#define PAGE_KERNEL_RO	__pgprot(_PAGE_BASE | _PAGE_KERNEL_RO)
-#define PAGE_KERNEL_ROX	__pgprot(_PAGE_BASE | _PAGE_KERNEL_ROX)
-
-/* Protection used for kernel text. We want the debuggers to be able to
- * set breakpoints anywhere, so don't write protect the kernel text
- * on platforms where such control is possible.
- */
-#if defined(CONFIG_KGDB) || defined(CONFIG_XMON) || defined(CONFIG_BDI_SWITCH) ||\
-	defined(CONFIG_KPROBES) || defined(CONFIG_DYNAMIC_FTRACE)
-#define PAGE_KERNEL_TEXT	PAGE_KERNEL_X
-#else
-#define PAGE_KERNEL_TEXT	PAGE_KERNEL_ROX
-#endif
-
-/* Make modules code happy. We don't set RO yet */
-#define PAGE_KERNEL_EXEC	PAGE_KERNEL_X
-
-/*
- * Don't just check for any non zero bits in __PAGE_USER, since for book3e
- * and PTE_64BIT, PAGE_KERNEL_X contains _PAGE_BAP_SR which is also in
- * _PAGE_USER.  Need to explicitly match _PAGE_BAP_UR bit in that case too.
- */
-#define pte_user(val)		((val & _PAGE_USER) == _PAGE_USER)
-
 /* Advertise special mapping type for AGP */
-#define PAGE_AGP		(PAGE_KERNEL_NC)
 #define HAVE_PAGE_AGP
 
 /* Advertise support for _PAGE_SPECIAL */
@@ -230,12 +67,6 @@
 
 #endif /* __real_pte */
 
-
-/* pte_clear moved to later in this file */
-
-#define PMD_BAD_BITS		(PTE_TABLE_SIZE-1)
-#define PUD_BAD_BITS		(PMD_TABLE_SIZE-1)
-
 static inline void pmd_set(pmd_t *pmdp, unsigned long val)
 {
 	*pmdp = __pmd(val);
@@ -246,13 +77,8 @@ static inline void pmd_clear(pmd_t *pmdp)
 	*pmdp = __pmd(0);
 }
 
-
 #define pmd_none(pmd)		(!pmd_val(pmd))
-#define	pmd_bad(pmd)		(!is_kernel_addr(pmd_val(pmd)) \
-				 || (pmd_val(pmd) & PMD_BAD_BITS))
 #define	pmd_present(pmd)	(!pmd_none(pmd))
-#define pmd_page_vaddr(pmd)	(pmd_val(pmd) & ~PMD_MASKED_BITS)
-extern struct page *pmd_page(pmd_t pmd);
 
 static inline void pud_set(pud_t *pudp, unsigned long val)
 {
@@ -265,13 +91,10 @@ static inline void pud_clear(pud_t *pudp)
 }
 
 #define pud_none(pud)		(!pud_val(pud))
-#define	pud_bad(pud)		(!is_kernel_addr(pud_val(pud)) \
-				 || (pud_val(pud) & PUD_BAD_BITS))
 #define pud_present(pud)	(pud_val(pud) != 0)
-#define pud_page_vaddr(pud)	(pud_val(pud) & ~PUD_MASKED_BITS)
 
 extern struct page *pud_page(pud_t pud);
-
+extern struct page *pmd_page(pmd_t pmd);
 static inline pte_t pud_pte(pud_t pud)
 {
 	return __pte(pud_val(pud));
@@ -292,15 +115,14 @@ static inline void pgd_set(pgd_t *pgdp, unsigned long val)
  * Find an entry in a page-table-directory.  We combine the address region
  * (the high order N bits) and the pgd portion of the address.
  */
-#define pgd_index(address) (((address) >> (PGDIR_SHIFT)) & (PTRS_PER_PGD - 1))
 
 #define pgd_offset(mm, address)	 ((mm)->pgd + pgd_index(address))
 
 #define pmd_offset(pudp,addr) \
-  (((pmd_t *) pud_page_vaddr(*(pudp))) + (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1)))
+	(((pmd_t *) pud_page_vaddr(*(pudp))) + pmd_index(addr))
 
 #define pte_offset_kernel(dir,addr) \
-  (((pte_t *) pmd_page_vaddr(*(dir))) + (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)))
+	(((pte_t *) pmd_page_vaddr(*(dir))) + pte_index(addr))
 
 #define pte_offset_map(dir,addr)	pte_offset_kernel((dir), (addr))
 #define pte_unmap(pte)			do { } while(0)
@@ -308,132 +130,6 @@ static inline void pgd_set(pgd_t *pgdp, unsigned long val)
 /* to find an entry in a kernel page-table-directory */
 /* This now only contains the vmalloc pages */
 #define pgd_offset_k(address) pgd_offset(&init_mm, address)
-extern void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
-			    pte_t *ptep, unsigned long pte, int huge);
-
-/* Atomic PTE updates */
-static inline unsigned long pte_update(struct mm_struct *mm,
-				       unsigned long addr,
-				       pte_t *ptep, unsigned long clr,
-				       unsigned long set,
-				       int huge)
-{
-	unsigned long old, tmp;
-
-	__asm__ __volatile__(
-	"1:	ldarx	%0,0,%3		# pte_update\n\
-	andi.	%1,%0,%6\n\
-	bne-	1b \n\
-	andc	%1,%0,%4 \n\
-	or	%1,%1,%7\n\
-	stdcx.	%1,0,%3 \n\
-	bne-	1b"
-	: "=&r" (old), "=&r" (tmp), "=m" (*ptep)
-	: "r" (ptep), "r" (clr), "m" (*ptep), "i" (_PAGE_BUSY), "r" (set)
-	: "cc" );
-	/* huge pages use the old page table lock */
-	if (!huge)
-		assert_pte_locked(mm, addr);
-
-	if (old & _PAGE_HASHPTE)
-		hpte_need_flush(mm, addr, ptep, old, huge);
-
-	return old;
-}
-
-static inline int __ptep_test_and_clear_young(struct mm_struct *mm,
-					      unsigned long addr, pte_t *ptep)
-{
-	unsigned long old;
-
-	if ((pte_val(*ptep) & (_PAGE_ACCESSED | _PAGE_HASHPTE)) == 0)
-		return 0;
-	old = pte_update(mm, addr, ptep, _PAGE_ACCESSED, 0, 0);
-	return (old & _PAGE_ACCESSED) != 0;
-}
-#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
-#define ptep_test_and_clear_young(__vma, __addr, __ptep)		   \
-({									   \
-	int __r;							   \
-	__r = __ptep_test_and_clear_young((__vma)->vm_mm, __addr, __ptep); \
-	__r;								   \
-})
-
-#define __HAVE_ARCH_PTEP_SET_WRPROTECT
-static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr,
-				      pte_t *ptep)
-{
-
-	if ((pte_val(*ptep) & _PAGE_RW) == 0)
-		return;
-
-	pte_update(mm, addr, ptep, _PAGE_RW, 0, 0);
-}
-
-static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
-					   unsigned long addr, pte_t *ptep)
-{
-	if ((pte_val(*ptep) & _PAGE_RW) == 0)
-		return;
-
-	pte_update(mm, addr, ptep, _PAGE_RW, 0, 1);
-}
-
-/*
- * We currently remove entries from the hashtable regardless of whether
- * the entry was young or dirty. The generic routines only flush if the
- * entry was young or dirty which is not good enough.
- *
- * We should be more intelligent about this but for the moment we override
- * these functions and force a tlb flush unconditionally
- */
-#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
-#define ptep_clear_flush_young(__vma, __address, __ptep)		\
-({									\
-	int __young = __ptep_test_and_clear_young((__vma)->vm_mm, __address, \
-						  __ptep);		\
-	__young;							\
-})
-
-#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
-static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
-				       unsigned long addr, pte_t *ptep)
-{
-	unsigned long old = pte_update(mm, addr, ptep, ~0UL, 0, 0);
-	return __pte(old);
-}
-
-static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
-			     pte_t * ptep)
-{
-	pte_update(mm, addr, ptep, ~0UL, 0, 0);
-}
-
-
-/* Set the dirty and/or accessed bits atomically in a linux PTE, this
- * function doesn't need to flush the hash entry
- */
-static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry)
-{
-	unsigned long bits = pte_val(entry) &
-		(_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC);
-
-	unsigned long old, tmp;
-
-	__asm__ __volatile__(
-	"1:	ldarx	%0,0,%4\n\
-		andi.	%1,%0,%6\n\
-		bne-	1b \n\
-		or	%0,%3,%0\n\
-		stdcx.	%0,0,%4\n\
-		bne-	1b"
-	:"=&r" (old), "=&r" (tmp), "=m" (*ptep)
-	:"r" (bits), "r" (ptep), "m" (*ptep), "i" (_PAGE_BUSY)
-	:"cc");
-}
-
-#define __HAVE_ARCH_PTE_SAME
-#define pte_same(A,B)	(((pte_val(A) ^ pte_val(B)) & ~_PAGE_HPTEFLAGS) == 0)
 
 #define pte_ERROR(e) \
 	pr_err("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, pte_val(e))
@@ -468,54 +164,9 @@ static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry)
 void pgtable_cache_add(unsigned shift, void (*ctor)(void *));
 void pgtable_cache_init(void);
 
-/*
- * The linux hugepage PMD now include the pmd entries followed by the address
- * to the stashed pgtable_t. The stashed pgtable_t contains the hpte bits.
- * [ 1 bit secondary | 3 bit hidx | 1 bit valid | 000]. We use one byte per
- * each HPTE entry. With 16MB hugepage and 64K HPTE we need 256 entries and
- * with 4K HPTE we need 4096 entries. Both will fit in a 4K pgtable_t.
- *
- * The last three bits are intentionally left to zero. This memory location
- * are also used as normal page PTE pointers. So if we have any pointers
- * left around while we collapse a hugepage, we need to make sure
- * _PAGE_PRESENT bit of that is zero when we look at them
- */
-static inline unsigned int hpte_valid(unsigned char *hpte_slot_array, int index)
-{
-	return (hpte_slot_array[index] >> 3) & 0x1;
-}
-
-static inline unsigned int hpte_hash_index(unsigned char *hpte_slot_array,
-					   int index)
-{
-	return hpte_slot_array[index] >> 4;
-}
-
-static inline void mark_hpte_slot_valid(unsigned char *hpte_slot_array,
-					unsigned int index, unsigned int hidx)
-{
-	hpte_slot_array[index] = hidx << 4 | 0x1 << 3;
-}
-
 struct page *realmode_pfn_to_page(unsigned long pfn);
 
-static inline char *get_hpte_slot_array(pmd_t *pmdp)
-{
-	/*
-	 * The hpte hindex is stored in the pgtable whose address is in the
-	 * second half of the PMD
-	 *
-	 * Order this load with the test for pmd_trans_huge in the caller
-	 */
-	smp_rmb();
-	return *(char **)(pmdp + PTRS_PER_PMD);
-
-
-}
-
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-extern void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
-				   pmd_t *pmdp, unsigned long old_pmd);
 extern pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot);
 extern pmd_t mk_pmd(struct page *page, pgprot_t pgprot);
 extern pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot);
@@ -523,55 +174,9 @@ extern void set_pmd_at(struct mm_struct *mm, unsigned long addr,
 		       pmd_t *pmdp, pmd_t pmd);
 extern void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
 				 pmd_t *pmd);
-/*
- *
- * For core kernel code by design pmd_trans_huge is never run on any hugetlbfs
- * page. The hugetlbfs page table walking and mangling paths are totally
- * separated form the core VM paths and they're differentiated by
- *  VM_HUGETLB being set on vm_flags well before any pmd_trans_huge could run.
- *
- * pmd_trans_huge() is defined as false at build time if
- * CONFIG_TRANSPARENT_HUGEPAGE=n to optimize away code blocks at build
- * time in such case.
- *
- * For ppc64 we need to differntiate from explicit hugepages from THP, because
- * for THP we also track the subpage details at the pmd level. We don't do
- * that for explicit huge pages.
- *
- */
-static inline int pmd_trans_huge(pmd_t pmd)
-{
-	/*
-	 * leaf pte for huge page, bottom two bits != 00
-	 */
-	return (pmd_val(pmd) & 0x3) && (pmd_val(pmd) & _PAGE_THP_HUGE);
-}
-
-static inline int pmd_trans_splitting(pmd_t pmd)
-{
-	if (pmd_trans_huge(pmd))
-		return pmd_val(pmd) & _PAGE_SPLITTING;
-	return 0;
-}
-
 extern int has_transparent_hugepage(void);
-#else
-static inline void hpte_do_hugepage_flush(struct mm_struct *mm,
-					  unsigned long addr, pmd_t *pmdp,
-					  unsigned long old_pmd)
-{
-
-	WARN(1, "%s called with THP disabled\n", __func__);
-}
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
-static inline int pmd_large(pmd_t pmd)
-{
-	/*
-	 * leaf pte for huge page, bottom two bits != 00
-	 */
-	return ((pmd_val(pmd) & 0x3) != 0x0);
-}
 
 static inline pte_t pmd_pte(pmd_t pmd)
 {
@@ -606,44 +211,11 @@ static inline pmd_t pmd_mkhuge(pmd_t pmd)
 	return pmd;
 }
 
-static inline pmd_t pmd_mknotpresent(pmd_t pmd)
-{
-	return __pmd(pmd_val(pmd) & ~_PAGE_PRESENT);
-}
-
-static inline pmd_t pmd_mksplitting(pmd_t pmd)
-{
-	return __pmd(pmd_val(pmd) | _PAGE_SPLITTING);
-}
-
-#define __HAVE_ARCH_PMD_SAME
-static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
-{
-	return (((pmd_val(pmd_a) ^ pmd_val(pmd_b)) & ~_PAGE_HPTEFLAGS) == 0);
-}
-
 #define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
 extern int pmdp_set_access_flags(struct vm_area_struct *vma,
 				 unsigned long address, pmd_t *pmdp,
 				 pmd_t entry, int dirty);
 
-extern unsigned long pmd_hugepage_update(struct mm_struct *mm,
-					 unsigned long addr,
-					 pmd_t *pmdp,
-					 unsigned long clr,
-					 unsigned long set);
-
-static inline int __pmdp_test_and_clear_young(struct mm_struct *mm,
-					      unsigned long addr, pmd_t *pmdp)
-{
-	unsigned long old;
-
-	if ((pmd_val(*pmdp) & (_PAGE_ACCESSED | _PAGE_HASHPTE)) == 0)
-		return 0;
-	old = pmd_hugepage_update(mm, addr, pmdp, _PAGE_ACCESSED, 0);
-	return ((old & _PAGE_ACCESSED) != 0);
-}
-
 #define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
 extern int pmdp_test_and_clear_young(struct vm_area_struct *vma,
 				     unsigned long address, pmd_t *pmdp);
@@ -655,17 +227,6 @@ extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
 extern pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
 				     unsigned long addr, pmd_t *pmdp);
 
-#define __HAVE_ARCH_PMDP_SET_WRPROTECT
-static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
-				      pmd_t *pmdp)
-{
-
-	if ((pmd_val(*pmdp) & _PAGE_RW) == 0)
-		return;
-
-	pmd_hugepage_update(mm, addr, pmdp, _PAGE_RW, 0);
-}
-
 #define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
 extern void pmdp_splitting_flush(struct vm_area_struct *vma,
 				 unsigned long address, pmd_t *pmdp);
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index a27b8cef51d7..8f7338678fdc 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -18,12 +18,6 @@ struct mm_struct;
 #include <asm/pgtable-book3e.h>
 #endif /* !CONFIG_PPC_BOOK3S */
 
-/*
- * We save the slot number & secondary bit in the second half of the
- * PTE page. We use the 8 bytes per each pte entry.
- */
-#define PTE_PAGE_HIDX_OFFSET (PTRS_PER_PTE * 8)
-
 #ifndef __ASSEMBLY__
 
 #include <asm/tlbflush.h>

From 1ca7212932862e348f2f9307f35bd309a7da82d8 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 1 Dec 2015 09:06:37 +0530
Subject: [PATCH 045/149] powerpc/mm: Move PTE bits from generic functions to
 hash64 functions.

Functions which operate on PTE bits are moved to hash*.h, and other
generic functions are moved to pgtable.h.

Acked-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/32/pgtable.h | 183 +++++++++++++++++
 arch/powerpc/include/asm/book3s/64/hash.h    | 151 ++++++++++++++
 arch/powerpc/include/asm/book3s/64/pgtable.h |   6 +
 arch/powerpc/include/asm/book3s/pgtable.h    | 204 -------------------
 4 files changed, 340 insertions(+), 204 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h
index 226f29d39332..38b33dcfcc9d 100644
--- a/arch/powerpc/include/asm/book3s/32/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/32/pgtable.h
@@ -294,6 +294,189 @@ void pgtable_cache_init(void);
 extern int get_pteptr(struct mm_struct *mm, unsigned long addr, pte_t **ptep,
 		      pmd_t **pmdp);
 
+/* Generic accessors to PTE bits */
+static inline int pte_write(pte_t pte)		{ return !!(pte_val(pte) & _PAGE_RW);}
+static inline int pte_dirty(pte_t pte)		{ return !!(pte_val(pte) & _PAGE_DIRTY); }
+static inline int pte_young(pte_t pte)		{ return !!(pte_val(pte) & _PAGE_ACCESSED); }
+static inline int pte_special(pte_t pte)	{ return !!(pte_val(pte) & _PAGE_SPECIAL); }
+static inline int pte_none(pte_t pte)		{ return (pte_val(pte) & ~_PTE_NONE_MASK) == 0; }
+static inline pgprot_t pte_pgprot(pte_t pte)	{ return __pgprot(pte_val(pte) & PAGE_PROT_BITS); }
+
+static inline int pte_present(pte_t pte)
+{
+	return pte_val(pte) & _PAGE_PRESENT;
+}
+
+/* Conversion functions: convert a page and protection to a page entry,
+ * and a page entry and page directory to the page they refer to.
+ *
+ * Even if PTEs can be unsigned long long, a PFN is always an unsigned
+ * long for now.
+ */
+static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot)
+{
+	return __pte(((pte_basic_t)(pfn) << PTE_RPN_SHIFT) |
+		     pgprot_val(pgprot));
+}
+
+static inline unsigned long pte_pfn(pte_t pte)
+{
+	return pte_val(pte) >> PTE_RPN_SHIFT;
+}
+
+/* Generic modifiers for PTE bits */
+static inline pte_t pte_wrprotect(pte_t pte)
+{
+	return __pte(pte_val(pte) & ~_PAGE_RW);
+}
+
+static inline pte_t pte_mkclean(pte_t pte)
+{
+	return __pte(pte_val(pte) & ~_PAGE_DIRTY);
+}
+
+static inline pte_t pte_mkold(pte_t pte)
+{
+	return __pte(pte_val(pte) & ~_PAGE_ACCESSED);
+}
+
+static inline pte_t pte_mkwrite(pte_t pte)
+{
+	return __pte(pte_val(pte) | _PAGE_RW);
+}
+
+static inline pte_t pte_mkdirty(pte_t pte)
+{
+	return __pte(pte_val(pte) | _PAGE_DIRTY);
+}
+
+static inline pte_t pte_mkyoung(pte_t pte)
+{
+	return __pte(pte_val(pte) | _PAGE_ACCESSED);
+}
+
+static inline pte_t pte_mkspecial(pte_t pte)
+{
+	return __pte(pte_val(pte) | _PAGE_SPECIAL);
+}
+
+static inline pte_t pte_mkhuge(pte_t pte)
+{
+	return pte;
+}
+
+static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
+{
+	return __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot));
+}
+
+
+
+/* This low level function performs the actual PTE insertion
+ * Setting the PTE depends on the MMU type and other factors. It's
+ * a horrible mess that I'm not going to try to clean up now but
+ * I'm keeping it in one place rather than spread around
+ */
+static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
+				pte_t *ptep, pte_t pte, int percpu)
+{
+#if defined(CONFIG_PPC_STD_MMU_32) && defined(CONFIG_SMP) && !defined(CONFIG_PTE_64BIT)
+	/* First case is 32-bit Hash MMU in SMP mode with 32-bit PTEs. We use the
+	 * helper pte_update() which does an atomic update. We need to do that
+	 * because a concurrent invalidation can clear _PAGE_HASHPTE. If it's a
+	 * per-CPU PTE such as a kmap_atomic, we do a simple update preserving
+	 * the hash bits instead (ie, same as the non-SMP case)
+	 */
+	if (percpu)
+		*ptep = __pte((pte_val(*ptep) & _PAGE_HASHPTE)
+			      | (pte_val(pte) & ~_PAGE_HASHPTE));
+	else
+		pte_update(ptep, ~_PAGE_HASHPTE, pte_val(pte));
+
+#elif defined(CONFIG_PPC32) && defined(CONFIG_PTE_64BIT)
+	/* Second case is 32-bit with 64-bit PTE.  In this case, we
+	 * can just store as long as we do the two halves in the right order
+	 * with a barrier in between. This is possible because we take care,
+	 * in the hash code, to pre-invalidate if the PTE was already hashed,
+	 * which synchronizes us with any concurrent invalidation.
+	 * In the percpu case, we also fallback to the simple update preserving
+	 * the hash bits
+	 */
+	if (percpu) {
+		*ptep = __pte((pte_val(*ptep) & _PAGE_HASHPTE)
+			      | (pte_val(pte) & ~_PAGE_HASHPTE));
+		return;
+	}
+	if (pte_val(*ptep) & _PAGE_HASHPTE)
+		flush_hash_entry(mm, ptep, addr);
+	__asm__ __volatile__("\
+		stw%U0%X0 %2,%0\n\
+		eieio\n\
+		stw%U0%X0 %L2,%1"
+	: "=m" (*ptep), "=m" (*((unsigned char *)ptep+4))
+	: "r" (pte) : "memory");
+
+#elif defined(CONFIG_PPC_STD_MMU_32)
+	/* Third case is 32-bit hash table in UP mode, we need to preserve
+	 * the _PAGE_HASHPTE bit since we may not have invalidated the previous
+	 * translation in the hash yet (done in a subsequent flush_tlb_xxx())
+	 * and so we need to keep track that this PTE needs invalidating
+	 */
+	*ptep = __pte((pte_val(*ptep) & _PAGE_HASHPTE)
+		      | (pte_val(pte) & ~_PAGE_HASHPTE));
+
+#else
+#error "Not supported "
+#endif
+}
+
+/*
+ * Macro to mark a page protection value as "uncacheable".
+ */
+
+#define _PAGE_CACHE_CTL	(_PAGE_COHERENT | _PAGE_GUARDED | _PAGE_NO_CACHE | \
+			 _PAGE_WRITETHRU)
+
+#define pgprot_noncached pgprot_noncached
+static inline pgprot_t pgprot_noncached(pgprot_t prot)
+{
+	return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) |
+			_PAGE_NO_CACHE | _PAGE_GUARDED);
+}
+
+#define pgprot_noncached_wc pgprot_noncached_wc
+static inline pgprot_t pgprot_noncached_wc(pgprot_t prot)
+{
+	return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) |
+			_PAGE_NO_CACHE);
+}
+
+#define pgprot_cached pgprot_cached
+static inline pgprot_t pgprot_cached(pgprot_t prot)
+{
+	return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) |
+			_PAGE_COHERENT);
+}
+
+#define pgprot_cached_wthru pgprot_cached_wthru
+static inline pgprot_t pgprot_cached_wthru(pgprot_t prot)
+{
+	return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) |
+			_PAGE_COHERENT | _PAGE_WRITETHRU);
+}
+
+#define pgprot_cached_noncoherent pgprot_cached_noncoherent
+static inline pgprot_t pgprot_cached_noncoherent(pgprot_t prot)
+{
+	return __pgprot(pgprot_val(prot) & ~_PAGE_CACHE_CTL);
+}
+
+#define pgprot_writecombine pgprot_writecombine
+static inline pgprot_t pgprot_writecombine(pgprot_t prot)
+{
+	return pgprot_noncached_wc(prot);
+}
+
 #endif /* !__ASSEMBLY__ */
 
 #endif /*  _ASM_POWERPC_BOOK3S_32_PGTABLE_H */
diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h
index 447b212649c8..48237e66e823 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -481,6 +481,157 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
 	pmd_hugepage_update(mm, addr, pmdp, _PAGE_RW, 0);
 }
 
+/* Generic accessors to PTE bits */
+static inline int pte_write(pte_t pte)		{ return !!(pte_val(pte) & _PAGE_RW);}
+static inline int pte_dirty(pte_t pte)		{ return !!(pte_val(pte) & _PAGE_DIRTY); }
+static inline int pte_young(pte_t pte)		{ return !!(pte_val(pte) & _PAGE_ACCESSED); }
+static inline int pte_special(pte_t pte)	{ return !!(pte_val(pte) & _PAGE_SPECIAL); }
+static inline int pte_none(pte_t pte)		{ return (pte_val(pte) & ~_PTE_NONE_MASK) == 0; }
+static inline pgprot_t pte_pgprot(pte_t pte)	{ return __pgprot(pte_val(pte) & PAGE_PROT_BITS); }
+
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * These work without NUMA balancing but the kernel does not care. See the
+ * comment in include/asm-generic/pgtable.h . On powerpc, this will only
+ * work for user pages and always return true for kernel pages.
+ */
+static inline int pte_protnone(pte_t pte)
+{
+	return (pte_val(pte) &
+		(_PAGE_PRESENT | _PAGE_USER)) == _PAGE_PRESENT;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
+static inline int pte_present(pte_t pte)
+{
+	return pte_val(pte) & _PAGE_PRESENT;
+}
+
+/* Conversion functions: convert a page and protection to a page entry,
+ * and a page entry and page directory to the page they refer to.
+ *
+ * Even if PTEs can be unsigned long long, a PFN is always an unsigned
+ * long for now.
+ */
+static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot)
+{
+	return __pte(((pte_basic_t)(pfn) << PTE_RPN_SHIFT) |
+		     pgprot_val(pgprot));
+}
+
+static inline unsigned long pte_pfn(pte_t pte)
+{
+	return pte_val(pte) >> PTE_RPN_SHIFT;
+}
+
+/* Generic modifiers for PTE bits */
+static inline pte_t pte_wrprotect(pte_t pte)
+{
+	return __pte(pte_val(pte) & ~_PAGE_RW);
+}
+
+static inline pte_t pte_mkclean(pte_t pte)
+{
+	return __pte(pte_val(pte) & ~_PAGE_DIRTY);
+}
+
+static inline pte_t pte_mkold(pte_t pte)
+{
+	return __pte(pte_val(pte) & ~_PAGE_ACCESSED);
+}
+
+static inline pte_t pte_mkwrite(pte_t pte)
+{
+	return __pte(pte_val(pte) | _PAGE_RW);
+}
+
+static inline pte_t pte_mkdirty(pte_t pte)
+{
+	return __pte(pte_val(pte) | _PAGE_DIRTY);
+}
+
+static inline pte_t pte_mkyoung(pte_t pte)
+{
+	return __pte(pte_val(pte) | _PAGE_ACCESSED);
+}
+
+static inline pte_t pte_mkspecial(pte_t pte)
+{
+	return __pte(pte_val(pte) | _PAGE_SPECIAL);
+}
+
+static inline pte_t pte_mkhuge(pte_t pte)
+{
+	return pte;
+}
+
+static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
+{
+	return __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot));
+}
+
+/* This low level function performs the actual PTE insertion
+ * Setting the PTE depends on the MMU type and other factors. It's
+ * a horrible mess that I'm not going to try to clean up now but
+ * I'm keeping it in one place rather than spread around
+ */
+static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
+				pte_t *ptep, pte_t pte, int percpu)
+{
+	/*
+	 * Anything else just stores the PTE normally. That covers all 64-bit
+	 * cases, and 32-bit non-hash with 32-bit PTEs.
+	 */
+	*ptep = pte;
+}
+
+/*
+ * Macro to mark a page protection value as "uncacheable".
+ */
+
+#define _PAGE_CACHE_CTL	(_PAGE_COHERENT | _PAGE_GUARDED | _PAGE_NO_CACHE | \
+			 _PAGE_WRITETHRU)
+
+#define pgprot_noncached pgprot_noncached
+static inline pgprot_t pgprot_noncached(pgprot_t prot)
+{
+	return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) |
+			_PAGE_NO_CACHE | _PAGE_GUARDED);
+}
+
+#define pgprot_noncached_wc pgprot_noncached_wc
+static inline pgprot_t pgprot_noncached_wc(pgprot_t prot)
+{
+	return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) |
+			_PAGE_NO_CACHE);
+}
+
+#define pgprot_cached pgprot_cached
+static inline pgprot_t pgprot_cached(pgprot_t prot)
+{
+	return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) |
+			_PAGE_COHERENT);
+}
+
+#define pgprot_cached_wthru pgprot_cached_wthru
+static inline pgprot_t pgprot_cached_wthru(pgprot_t prot)
+{
+	return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) |
+			_PAGE_COHERENT | _PAGE_WRITETHRU);
+}
+
+#define pgprot_cached_noncoherent pgprot_cached_noncoherent
+static inline pgprot_t pgprot_cached_noncoherent(pgprot_t prot)
+{
+	return __pgprot(pgprot_val(prot) & ~_PAGE_CACHE_CTL);
+}
+
+#define pgprot_writecombine pgprot_writecombine
+static inline pgprot_t pgprot_writecombine(pgprot_t prot)
+{
+	return pgprot_noncached_wc(prot);
+}
+
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 extern void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
 				   pmd_t *pmdp, unsigned long old_pmd);
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index aac630b4a15e..f2ace2cac7bb 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -201,6 +201,12 @@ static inline pte_t *pmdp_ptep(pmd_t *pmd)
 #define pmd_mkdirty(pmd)	pte_pmd(pte_mkdirty(pmd_pte(pmd)))
 #define pmd_mkyoung(pmd)	pte_pmd(pte_mkyoung(pmd_pte(pmd)))
 #define pmd_mkwrite(pmd)	pte_pmd(pte_mkwrite(pmd_pte(pmd)))
+#ifdef CONFIG_NUMA_BALANCING
+static inline int pmd_protnone(pmd_t pmd)
+{
+	return pte_protnone(pmd_pte(pmd));
+}
+#endif /* CONFIG_NUMA_BALANCING */
 
 #define __HAVE_ARCH_PMD_WRITE
 #define pmd_write(pmd)		pte_write(pmd_pte(pmd))
diff --git a/arch/powerpc/include/asm/book3s/pgtable.h b/arch/powerpc/include/asm/book3s/pgtable.h
index ebd6677ea017..8b0f4a29259a 100644
--- a/arch/powerpc/include/asm/book3s/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/pgtable.h
@@ -9,221 +9,17 @@
 
 #define FIRST_USER_ADDRESS	0UL
 #ifndef __ASSEMBLY__
-
-/* Generic accessors to PTE bits */
-static inline int pte_write(pte_t pte)		{ return !!(pte_val(pte) & _PAGE_RW);}
-static inline int pte_dirty(pte_t pte)		{ return !!(pte_val(pte) & _PAGE_DIRTY); }
-static inline int pte_young(pte_t pte)		{ return !!(pte_val(pte) & _PAGE_ACCESSED); }
-static inline int pte_special(pte_t pte)	{ return !!(pte_val(pte) & _PAGE_SPECIAL); }
-static inline int pte_none(pte_t pte)		{ return (pte_val(pte) & ~_PTE_NONE_MASK) == 0; }
-static inline pgprot_t pte_pgprot(pte_t pte)	{ return __pgprot(pte_val(pte) & PAGE_PROT_BITS); }
-
-#ifdef CONFIG_NUMA_BALANCING
-/*
- * These work without NUMA balancing but the kernel does not care. See the
- * comment in include/asm-generic/pgtable.h . On powerpc, this will only
- * work for user pages and always return true for kernel pages.
- */
-static inline int pte_protnone(pte_t pte)
-{
-	return (pte_val(pte) &
-		(_PAGE_PRESENT | _PAGE_USER)) == _PAGE_PRESENT;
-}
-
-static inline int pmd_protnone(pmd_t pmd)
-{
-	return pte_protnone(pmd_pte(pmd));
-}
-#endif /* CONFIG_NUMA_BALANCING */
-
-static inline int pte_present(pte_t pte)
-{
-	return pte_val(pte) & _PAGE_PRESENT;
-}
-
-/* Conversion functions: convert a page and protection to a page entry,
- * and a page entry and page directory to the page they refer to.
- *
- * Even if PTEs can be unsigned long long, a PFN is always an unsigned
- * long for now.
- */
-static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot)
-{
-	return __pte(((pte_basic_t)(pfn) << PTE_RPN_SHIFT) |
-		     pgprot_val(pgprot));
-}
-
-static inline unsigned long pte_pfn(pte_t pte)
-{
-	return pte_val(pte) >> PTE_RPN_SHIFT;
-}
-
-/* Generic modifiers for PTE bits */
-static inline pte_t pte_wrprotect(pte_t pte)
-{
-	return __pte(pte_val(pte) & ~_PAGE_RW);
-}
-
-static inline pte_t pte_mkclean(pte_t pte)
-{
-	return __pte(pte_val(pte) & ~_PAGE_DIRTY);
-}
-
-static inline pte_t pte_mkold(pte_t pte)
-{
-	return __pte(pte_val(pte) & ~_PAGE_ACCESSED);
-}
-
-static inline pte_t pte_mkwrite(pte_t pte)
-{
-	return __pte(pte_val(pte) | _PAGE_RW);
-}
-
-static inline pte_t pte_mkdirty(pte_t pte)
-{
-	return __pte(pte_val(pte) | _PAGE_DIRTY);
-}
-
-static inline pte_t pte_mkyoung(pte_t pte)
-{
-	return __pte(pte_val(pte) | _PAGE_ACCESSED);
-}
-
-static inline pte_t pte_mkspecial(pte_t pte)
-{
-	return __pte(pte_val(pte) | _PAGE_SPECIAL);
-}
-
-static inline pte_t pte_mkhuge(pte_t pte)
-{
-	return pte;
-}
-
-static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
-{
-	return __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot));
-}
-
-
 /* Insert a PTE, top-level function is out of line. It uses an inline
  * low level function in the respective pgtable-* files
  */
 extern void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
 		       pte_t pte);
 
-/* This low level function performs the actual PTE insertion
- * Setting the PTE depends on the MMU type and other factors. It's
- * an horrible mess that I'm not going to try to clean up now but
- * I'm keeping it in one place rather than spread around
- */
-static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
-				pte_t *ptep, pte_t pte, int percpu)
-{
-#if defined(CONFIG_PPC_STD_MMU_32) && defined(CONFIG_SMP) && !defined(CONFIG_PTE_64BIT)
-	/* First case is 32-bit Hash MMU in SMP mode with 32-bit PTEs. We use the
-	 * helper pte_update() which does an atomic update. We need to do that
-	 * because a concurrent invalidation can clear _PAGE_HASHPTE. If it's a
-	 * per-CPU PTE such as a kmap_atomic, we do a simple update preserving
-	 * the hash bits instead (ie, same as the non-SMP case)
-	 */
-	if (percpu)
-		*ptep = __pte((pte_val(*ptep) & _PAGE_HASHPTE)
-			      | (pte_val(pte) & ~_PAGE_HASHPTE));
-	else
-		pte_update(ptep, ~_PAGE_HASHPTE, pte_val(pte));
-
-#elif defined(CONFIG_PPC32) && defined(CONFIG_PTE_64BIT)
-	/* Second case is 32-bit with 64-bit PTE.  In this case, we
-	 * can just store as long as we do the two halves in the right order
-	 * with a barrier in between. This is possible because we take care,
-	 * in the hash code, to pre-invalidate if the PTE was already hashed,
-	 * which synchronizes us with any concurrent invalidation.
-	 * In the percpu case, we also fallback to the simple update preserving
-	 * the hash bits
-	 */
-	if (percpu) {
-		*ptep = __pte((pte_val(*ptep) & _PAGE_HASHPTE)
-			      | (pte_val(pte) & ~_PAGE_HASHPTE));
-		return;
-	}
-	if (pte_val(*ptep) & _PAGE_HASHPTE)
-		flush_hash_entry(mm, ptep, addr);
-	__asm__ __volatile__("\
-		stw%U0%X0 %2,%0\n\
-		eieio\n\
-		stw%U0%X0 %L2,%1"
-	: "=m" (*ptep), "=m" (*((unsigned char *)ptep+4))
-	: "r" (pte) : "memory");
-
-#elif defined(CONFIG_PPC_STD_MMU_32)
-	/* Third case is 32-bit hash table in UP mode, we need to preserve
-	 * the _PAGE_HASHPTE bit since we may not have invalidated the previous
-	 * translation in the hash yet (done in a subsequent flush_tlb_xxx())
-	 * and see we need to keep track that this PTE needs invalidating
-	 */
-	*ptep = __pte((pte_val(*ptep) & _PAGE_HASHPTE)
-		      | (pte_val(pte) & ~_PAGE_HASHPTE));
-
-#else
-	/* Anything else just stores the PTE normally. That covers all 64-bit
-	 * cases, and 32-bit non-hash with 32-bit PTEs.
-	 */
-	*ptep = pte;
-#endif
-}
-
 
 #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
 extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address,
 				 pte_t *ptep, pte_t entry, int dirty);
 
-/*
- * Macro to mark a page protection value as "uncacheable".
- */
-
-#define _PAGE_CACHE_CTL	(_PAGE_COHERENT | _PAGE_GUARDED | _PAGE_NO_CACHE | \
-			 _PAGE_WRITETHRU)
-
-#define pgprot_noncached pgprot_noncached
-static inline pgprot_t pgprot_noncached(pgprot_t prot)
-{
-	return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) |
-			_PAGE_NO_CACHE | _PAGE_GUARDED);
-}
-
-#define pgprot_noncached_wc pgprot_noncached_wc
-static inline pgprot_t pgprot_noncached_wc(pgprot_t prot)
-{
-	return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) |
-			_PAGE_NO_CACHE);
-}
-
-#define pgprot_cached pgprot_cached
-static inline pgprot_t pgprot_cached(pgprot_t prot)
-{
-	return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) |
-			_PAGE_COHERENT);
-}
-
-#define pgprot_cached_wthru pgprot_cached_wthru
-static inline pgprot_t pgprot_cached_wthru(pgprot_t prot)
-{
-	return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) |
-			_PAGE_COHERENT | _PAGE_WRITETHRU);
-}
-
-#define pgprot_cached_noncoherent pgprot_cached_noncoherent
-static inline pgprot_t pgprot_cached_noncoherent(pgprot_t prot)
-{
-	return __pgprot(pgprot_val(prot) & ~_PAGE_CACHE_CTL);
-}
-
-#define pgprot_writecombine pgprot_writecombine
-static inline pgprot_t pgprot_writecombine(pgprot_t prot)
-{
-	return pgprot_noncached_wc(prot);
-}
-
 struct file;
 extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
 				     unsigned long size, pgprot_t vma_prot);

From 17ed9e3192b2b29ad24ffe711fa4b71716ef3ff3 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 1 Dec 2015 09:06:38 +0530
Subject: [PATCH 046/149] powerpc/booke: Move nohash headers

Move the booke related headers below nohash/, nohash/32 or nohash/64

Acked-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 .../asm/{pgtable-ppc32.h => nohash/32/pgtable.h} | 16 ++++++++--------
 .../include/asm/{ => nohash/32}/pte-40x.h        |  6 +++---
 .../include/asm/{ => nohash/32}/pte-44x.h        |  6 +++---
 .../include/asm/{ => nohash/32}/pte-8xx.h        |  6 +++---
 .../include/asm/{ => nohash/32}/pte-fsl-booke.h  |  6 +++---
 .../64/pgtable-4k.h}                             |  6 +++---
 .../64/pgtable-64k.h}                            |  6 +++---
 .../asm/{pgtable-ppc64.h => nohash/64/pgtable.h} | 14 +++++++-------
 .../asm/{pgtable-book3e.h => nohash/pgtable.h}   |  8 ++++----
 .../include/asm/{ => nohash}/pte-book3e.h        |  6 +++---
 arch/powerpc/include/asm/pgtable.h               |  2 +-
 11 files changed, 41 insertions(+), 41 deletions(-)
 rename arch/powerpc/include/asm/{pgtable-ppc32.h => nohash/32/pgtable.h} (97%)
 rename arch/powerpc/include/asm/{ => nohash/32}/pte-40x.h (95%)
 rename arch/powerpc/include/asm/{ => nohash/32}/pte-44x.h (96%)
 rename arch/powerpc/include/asm/{ => nohash/32}/pte-8xx.h (95%)
 rename arch/powerpc/include/asm/{ => nohash/32}/pte-fsl-booke.h (88%)
 rename arch/powerpc/include/asm/{pgtable-ppc64-4k.h => nohash/64/pgtable-4k.h} (94%)
 rename arch/powerpc/include/asm/{pgtable-ppc64-64k.h => nohash/64/pgtable-64k.h} (90%)
 rename arch/powerpc/include/asm/{pgtable-ppc64.h => nohash/64/pgtable.h} (98%)
 rename arch/powerpc/include/asm/{pgtable-book3e.h => nohash/pgtable.h} (97%)
 rename arch/powerpc/include/asm/{ => nohash}/pte-book3e.h (95%)

diff --git a/arch/powerpc/include/asm/pgtable-ppc32.h b/arch/powerpc/include/asm/nohash/32/pgtable.h
similarity index 97%
rename from arch/powerpc/include/asm/pgtable-ppc32.h
rename to arch/powerpc/include/asm/nohash/32/pgtable.h
index fbb23c54b998..c82cbf52d19e 100644
--- a/arch/powerpc/include/asm/pgtable-ppc32.h
+++ b/arch/powerpc/include/asm/nohash/32/pgtable.h
@@ -1,5 +1,5 @@
-#ifndef _ASM_POWERPC_PGTABLE_PPC32_H
-#define _ASM_POWERPC_PGTABLE_PPC32_H
+#ifndef _ASM_POWERPC_NOHASH_32_PGTABLE_H
+#define _ASM_POWERPC_NOHASH_32_PGTABLE_H
 
 #include <asm-generic/pgtable-nopmd.h>
 
@@ -106,15 +106,15 @@ extern int icache_44x_need_flush;
  */
 
 #if defined(CONFIG_40x)
-#include <asm/pte-40x.h>
+#include <asm/nohash/32/pte-40x.h>
 #elif defined(CONFIG_44x)
-#include <asm/pte-44x.h>
+#include <asm/nohash/32/pte-44x.h>
 #elif defined(CONFIG_FSL_BOOKE) && defined(CONFIG_PTE_64BIT)
-#include <asm/pte-book3e.h>
+#include <asm/nohash/pte-book3e.h>
 #elif defined(CONFIG_FSL_BOOKE)
-#include <asm/pte-fsl-booke.h>
+#include <asm/nohash/32/pte-fsl-booke.h>
 #elif defined(CONFIG_8xx)
-#include <asm/pte-8xx.h>
+#include <asm/nohash/32/pte-8xx.h>
 #endif
 
 /* And here we include common definitions */
@@ -340,4 +340,4 @@ extern int get_pteptr(struct mm_struct *mm, unsigned long addr, pte_t **ptep,
 
 #endif /* !__ASSEMBLY__ */
 
-#endif /* _ASM_POWERPC_PGTABLE_PPC32_H */
+#endif /* _ASM_POWERPC_NOHASH_32_PGTABLE_H */
diff --git a/arch/powerpc/include/asm/pte-40x.h b/arch/powerpc/include/asm/nohash/32/pte-40x.h
similarity index 95%
rename from arch/powerpc/include/asm/pte-40x.h
rename to arch/powerpc/include/asm/nohash/32/pte-40x.h
index 486b1ef81338..9624ebdacc47 100644
--- a/arch/powerpc/include/asm/pte-40x.h
+++ b/arch/powerpc/include/asm/nohash/32/pte-40x.h
@@ -1,5 +1,5 @@
-#ifndef _ASM_POWERPC_PTE_40x_H
-#define _ASM_POWERPC_PTE_40x_H
+#ifndef _ASM_POWERPC_NOHASH_32_PTE_40x_H
+#define _ASM_POWERPC_NOHASH_32_PTE_40x_H
 #ifdef __KERNEL__
 
 /*
@@ -61,4 +61,4 @@
 #define PTE_ATOMIC_UPDATES	1
 
 #endif /* __KERNEL__ */
-#endif /*  _ASM_POWERPC_PTE_40x_H */
+#endif /*  _ASM_POWERPC_NOHASH_32_PTE_40x_H */
diff --git a/arch/powerpc/include/asm/pte-44x.h b/arch/powerpc/include/asm/nohash/32/pte-44x.h
similarity index 96%
rename from arch/powerpc/include/asm/pte-44x.h
rename to arch/powerpc/include/asm/nohash/32/pte-44x.h
index 36f75fab23f5..fdab41c654ef 100644
--- a/arch/powerpc/include/asm/pte-44x.h
+++ b/arch/powerpc/include/asm/nohash/32/pte-44x.h
@@ -1,5 +1,5 @@
-#ifndef _ASM_POWERPC_PTE_44x_H
-#define _ASM_POWERPC_PTE_44x_H
+#ifndef _ASM_POWERPC_NOHASH_32_PTE_44x_H
+#define _ASM_POWERPC_NOHASH_32_PTE_44x_H
 #ifdef __KERNEL__
 
 /*
@@ -94,4 +94,4 @@
 
 
 #endif /* __KERNEL__ */
-#endif /*  _ASM_POWERPC_PTE_44x_H */
+#endif /*  _ASM_POWERPC_NOHASH_32_PTE_44x_H */
diff --git a/arch/powerpc/include/asm/pte-8xx.h b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
similarity index 95%
rename from arch/powerpc/include/asm/pte-8xx.h
rename to arch/powerpc/include/asm/nohash/32/pte-8xx.h
index a0e2ba960976..3742b1919661 100644
--- a/arch/powerpc/include/asm/pte-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
@@ -1,5 +1,5 @@
-#ifndef _ASM_POWERPC_PTE_8xx_H
-#define _ASM_POWERPC_PTE_8xx_H
+#ifndef _ASM_POWERPC_NOHASH_32_PTE_8xx_H
+#define _ASM_POWERPC_NOHASH_32_PTE_8xx_H
 #ifdef __KERNEL__
 
 /*
@@ -62,4 +62,4 @@
 				 _PAGE_HWWRITE | _PAGE_EXEC)
 
 #endif /* __KERNEL__ */
-#endif /*  _ASM_POWERPC_PTE_8xx_H */
+#endif /*  _ASM_POWERPC_NOHASH_32_PTE_8xx_H */
diff --git a/arch/powerpc/include/asm/pte-fsl-booke.h b/arch/powerpc/include/asm/nohash/32/pte-fsl-booke.h
similarity index 88%
rename from arch/powerpc/include/asm/pte-fsl-booke.h
rename to arch/powerpc/include/asm/nohash/32/pte-fsl-booke.h
index 9f5c3d04a1a3..5422d00c6145 100644
--- a/arch/powerpc/include/asm/pte-fsl-booke.h
+++ b/arch/powerpc/include/asm/nohash/32/pte-fsl-booke.h
@@ -1,5 +1,5 @@
-#ifndef _ASM_POWERPC_PTE_FSL_BOOKE_H
-#define _ASM_POWERPC_PTE_FSL_BOOKE_H
+#ifndef _ASM_POWERPC_NOHASH_32_PTE_FSL_BOOKE_H
+#define _ASM_POWERPC_NOHASH_32_PTE_FSL_BOOKE_H
 #ifdef __KERNEL__
 
 /* PTE bit definitions for Freescale BookE SW loaded TLB MMU based
@@ -37,4 +37,4 @@
 #define PTE_WIMGE_SHIFT (6)
 
 #endif /* __KERNEL__ */
-#endif /*  _ASM_POWERPC_PTE_FSL_BOOKE_H */
+#endif /*  _ASM_POWERPC_NOHASH_32_PTE_FSL_BOOKE_H */
diff --git a/arch/powerpc/include/asm/pgtable-ppc64-4k.h b/arch/powerpc/include/asm/nohash/64/pgtable-4k.h
similarity index 94%
rename from arch/powerpc/include/asm/pgtable-ppc64-4k.h
rename to arch/powerpc/include/asm/nohash/64/pgtable-4k.h
index 7bace25d6b62..fc7d51753f81 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64-4k.h
+++ b/arch/powerpc/include/asm/nohash/64/pgtable-4k.h
@@ -1,5 +1,5 @@
-#ifndef _ASM_POWERPC_PGTABLE_PPC64_4K_H
-#define _ASM_POWERPC_PGTABLE_PPC64_4K_H
+#ifndef _ASM_POWERPC_NOHASH_64_PGTABLE_4K_H
+#define _ASM_POWERPC_NOHASH_64_PGTABLE_4K_H
 /*
  * Entries per page directory level.  The PTE level must use a 64b record
  * for each page table entry.  The PMD and PGD level use a 32b record for
@@ -89,4 +89,4 @@ extern struct page *pgd_page(pgd_t pgd);
 #define remap_4k_pfn(vma, addr, pfn, prot)	\
 	remap_pfn_range((vma), (addr), (pfn), PAGE_SIZE, (prot))
 
-#endif /* _ASM_POWERPC_PGTABLE_PPC64_4K_H */
+#endif /* _ASM_POWERPC_NOHASH_64_PGTABLE_4K_H */
diff --git a/arch/powerpc/include/asm/pgtable-ppc64-64k.h b/arch/powerpc/include/asm/nohash/64/pgtable-64k.h
similarity index 90%
rename from arch/powerpc/include/asm/pgtable-ppc64-64k.h
rename to arch/powerpc/include/asm/nohash/64/pgtable-64k.h
index 1de35bbd02a6..a44660d76096 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64-64k.h
+++ b/arch/powerpc/include/asm/nohash/64/pgtable-64k.h
@@ -1,5 +1,5 @@
-#ifndef _ASM_POWERPC_PGTABLE_PPC64_64K_H
-#define _ASM_POWERPC_PGTABLE_PPC64_64K_H
+#ifndef _ASM_POWERPC_NOHASH_64_PGTABLE_64K_H
+#define _ASM_POWERPC_NOHASH_64_PGTABLE_64K_H
 
 #include <asm-generic/pgtable-nopud.h>
 
@@ -41,4 +41,4 @@
 #define pgd_pte(pgd)	(pud_pte(((pud_t){ pgd })))
 #define pte_pgd(pte)	((pgd_t)pte_pud(pte))
 
-#endif /* _ASM_POWERPC_PGTABLE_PPC64_64K_H */
+#endif /* _ASM_POWERPC_NOHASH_64_PGTABLE_64K_H */
diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/nohash/64/pgtable.h
similarity index 98%
rename from arch/powerpc/include/asm/pgtable-ppc64.h
rename to arch/powerpc/include/asm/nohash/64/pgtable.h
index 6be203d43fd1..c24e03f22655 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/nohash/64/pgtable.h
@@ -1,14 +1,14 @@
-#ifndef _ASM_POWERPC_PGTABLE_PPC64_H_
-#define _ASM_POWERPC_PGTABLE_PPC64_H_
+#ifndef _ASM_POWERPC_NOHASH_64_PGTABLE_H
+#define _ASM_POWERPC_NOHASH_64_PGTABLE_H
 /*
  * This file contains the functions and defines necessary to modify and use
  * the ppc64 hashed page table.
  */
 
 #ifdef CONFIG_PPC_64K_PAGES
-#include <asm/pgtable-ppc64-64k.h>
+#include <asm/nohash/64/pgtable-64k.h>
 #else
-#include <asm/pgtable-ppc64-4k.h>
+#include <asm/nohash/64/pgtable-4k.h>
 #endif
 #include <asm/barrier.h>
 
@@ -18,7 +18,7 @@
  * Size of EA range mapped by our pagetables.
  */
 #define PGTABLE_EADDR_SIZE (PTE_INDEX_SIZE + PMD_INDEX_SIZE + \
-                	    PUD_INDEX_SIZE + PGD_INDEX_SIZE + PAGE_SHIFT)
+			    PUD_INDEX_SIZE + PGD_INDEX_SIZE + PAGE_SHIFT)
 #define PGTABLE_RANGE (ASM_CONST(1) << PGTABLE_EADDR_SIZE)
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -97,7 +97,7 @@
 /*
  * Include the PTE bits definitions
  */
-#include <asm/pte-book3e.h>
+#include <asm/nohash/pte-book3e.h>
 #include <asm/pte-common.h>
 
 #ifdef CONFIG_PPC_MM_SLICES
@@ -637,4 +637,4 @@ static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
 	return true;
 }
 #endif /* __ASSEMBLY__ */
-#endif /* _ASM_POWERPC_PGTABLE_PPC64_H_ */
+#endif /* _ASM_POWERPC_NOHASH_64_PGTABLE_H */
diff --git a/arch/powerpc/include/asm/pgtable-book3e.h b/arch/powerpc/include/asm/nohash/pgtable.h
similarity index 97%
rename from arch/powerpc/include/asm/pgtable-book3e.h
rename to arch/powerpc/include/asm/nohash/pgtable.h
index 91325997ba25..c0c41a2409d2 100644
--- a/arch/powerpc/include/asm/pgtable-book3e.h
+++ b/arch/powerpc/include/asm/nohash/pgtable.h
@@ -1,10 +1,10 @@
-#ifndef _ASM_POWERPC_PGTABLE_BOOK3E_H
-#define _ASM_POWERPC_PGTABLE_BOOK3E_H
+#ifndef _ASM_POWERPC_NOHASH_PGTABLE_H
+#define _ASM_POWERPC_NOHASH_PGTABLE_H
 
 #if defined(CONFIG_PPC64)
-#include <asm/pgtable-ppc64.h>
+#include <asm/nohash/64/pgtable.h>
 #else
-#include <asm/pgtable-ppc32.h>
+#include <asm/nohash/32/pgtable.h>
 #endif
 
 #ifndef __ASSEMBLY__
diff --git a/arch/powerpc/include/asm/pte-book3e.h b/arch/powerpc/include/asm/nohash/pte-book3e.h
similarity index 95%
rename from arch/powerpc/include/asm/pte-book3e.h
rename to arch/powerpc/include/asm/nohash/pte-book3e.h
index 8d8473278d91..e16807b78edf 100644
--- a/arch/powerpc/include/asm/pte-book3e.h
+++ b/arch/powerpc/include/asm/nohash/pte-book3e.h
@@ -1,5 +1,5 @@
-#ifndef _ASM_POWERPC_PTE_BOOK3E_H
-#define _ASM_POWERPC_PTE_BOOK3E_H
+#ifndef _ASM_POWERPC_NOHASH_PTE_BOOK3E_H
+#define _ASM_POWERPC_NOHASH_PTE_BOOK3E_H
 #ifdef __KERNEL__
 
 /* PTE bit definitions for processors compliant to the Book3E
@@ -84,4 +84,4 @@
 #endif
 
 #endif /* __KERNEL__ */
-#endif /*  _ASM_POWERPC_PTE_FSL_BOOKE_H */
+#endif /*  _ASM_POWERPC_NOHASH_PTE_BOOK3E_H */
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index 8f7338678fdc..ac9fb114e25d 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -15,7 +15,7 @@ struct mm_struct;
 #ifdef CONFIG_PPC_BOOK3S
 #include <asm/book3s/pgtable.h>
 #else
-#include <asm/pgtable-book3e.h>
+#include <asm/nohash/pgtable.h>
 #endif /* !CONFIG_PPC_BOOK3S */
 
 #ifndef __ASSEMBLY__

From 91f1da99792a1d133df94c4753510305353064a1 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 1 Dec 2015 09:06:43 +0530
Subject: [PATCH 047/149] powerpc/mm: Convert 4k hash insert to C

Acked-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/Makefile        |   3 +
 arch/powerpc/mm/hash64_64k.c    | 202 +++++++++++++++++
 arch/powerpc/mm/hash_low_64.S   | 380 --------------------------------
 arch/powerpc/mm/hash_utils_64.c |   4 +-
 4 files changed, 208 insertions(+), 381 deletions(-)
 create mode 100644 arch/powerpc/mm/hash64_64k.c

diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 3eb73a38220d..f80ad1a76cc8 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -18,6 +18,9 @@ obj-$(CONFIG_PPC_STD_MMU_32)	+= ppc_mmu_32.o
 obj-$(CONFIG_PPC_STD_MMU)	+= hash_low_$(CONFIG_WORD_SIZE).o \
 				   tlb_hash$(CONFIG_WORD_SIZE).o \
 				   mmu_context_hash$(CONFIG_WORD_SIZE).o
+ifeq ($(CONFIG_PPC_STD_MMU_64),y)
+obj-$(CONFIG_PPC_64K_PAGES)	+= hash64_64k.o
+endif
 obj-$(CONFIG_PPC_ICSWX)		+= icswx.o
 obj-$(CONFIG_PPC_ICSWX_PID)	+= icswx_pid.o
 obj-$(CONFIG_40x)		+= 40x_mmu.o
diff --git a/arch/powerpc/mm/hash64_64k.c b/arch/powerpc/mm/hash64_64k.c
new file mode 100644
index 000000000000..9ffeae2cbb57
--- /dev/null
+++ b/arch/powerpc/mm/hash64_64k.c
@@ -0,0 +1,202 @@
+/*
+ * Copyright IBM Corporation, 2015
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+#include <linux/mm.h>
+#include <asm/machdep.h>
+#include <asm/mmu.h>
+
+int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
+		   pte_t *ptep, unsigned long trap, unsigned long flags,
+		   int ssize, int subpg_prot)
+{
+	real_pte_t rpte;
+	unsigned long *hidxp;
+	unsigned long hpte_group;
+	unsigned int subpg_index;
+	unsigned long rflags, pa, hidx;
+	unsigned long old_pte, new_pte, subpg_pte;
+	unsigned long vpn, hash, slot;
+	unsigned long shift = mmu_psize_defs[MMU_PAGE_4K].shift;
+
+	/*
+	 * atomically mark the linux large page PTE busy and dirty
+	 */
+	do {
+		pte_t pte = READ_ONCE(*ptep);
+
+		old_pte = pte_val(pte);
+		/* If PTE busy, retry the access */
+		if (unlikely(old_pte & _PAGE_BUSY))
+			return 0;
+		/* If PTE permissions don't match, take page fault */
+		if (unlikely(access & ~old_pte))
+			return 1;
+		/*
+		 * Try to lock the PTE, add ACCESSED and DIRTY if it was
+		 * a write access. Since this is 4K insert of 64K page size
+		 * also add _PAGE_COMBO
+		 */
+		new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED | _PAGE_COMBO;
+		if (access & _PAGE_RW)
+			new_pte |= _PAGE_DIRTY;
+	} while (old_pte != __cmpxchg_u64((unsigned long *)ptep,
+					  old_pte, new_pte));
+	/*
+	 * Handle the subpage protection bits
+	 */
+	subpg_pte = new_pte & ~subpg_prot;
+	/*
+	 * PP bits. _PAGE_USER is already PP bit 0x2, so we only
+	 * need to add in 0x1 if it's a read-only user page
+	 */
+	rflags = subpg_pte & _PAGE_USER;
+	if ((subpg_pte & _PAGE_USER) && !((subpg_pte & _PAGE_RW) &&
+					(subpg_pte & _PAGE_DIRTY)))
+		rflags |= 0x1;
+	/*
+	 * _PAGE_EXEC -> HW_NO_EXEC since it's inverted
+	 */
+	rflags |= ((subpg_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
+	/*
+	 * Always add C and Memory coherence bit
+	 */
+	rflags |= HPTE_R_C | HPTE_R_M;
+	/*
+	 * Add in WIMG bits
+	 */
+	rflags |= (subpg_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
+				_PAGE_COHERENT | _PAGE_GUARDED));
+
+	if (!cpu_has_feature(CPU_FTR_NOEXECUTE) &&
+	    !cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) {
+
+		/*
+		 * No CPU has hugepages but lacks no execute, so we
+		 * don't need to worry about that case
+		 */
+		rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
+	}
+
+	subpg_index = (ea & (PAGE_SIZE - 1)) >> shift;
+	vpn  = hpt_vpn(ea, vsid, ssize);
+	rpte = __real_pte(__pte(old_pte), ptep);
+	/*
+	 * None of the sub 4k pages is hashed
+	 */
+	if (!(old_pte & _PAGE_HASHPTE))
+		goto htab_insert_hpte;
+	/*
+	 * Check if the pte was already inserted into the hash table
+	 * as a 64k HW page, and invalidate the 64k HPTE if so.
+	 */
+	if (!(old_pte & _PAGE_COMBO)) {
+		flush_hash_page(vpn, rpte, MMU_PAGE_64K, ssize, flags);
+		old_pte &= ~_PAGE_HPTE_SUB;
+		goto htab_insert_hpte;
+	}
+	/*
+	 * Check for sub page valid and update
+	 */
+	if (__rpte_sub_valid(rpte, subpg_index)) {
+		int ret;
+
+		hash = hpt_hash(vpn, shift, ssize);
+		hidx = __rpte_to_hidx(rpte, subpg_index);
+		if (hidx & _PTEIDX_SECONDARY)
+			hash = ~hash;
+		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+		slot += hidx & _PTEIDX_GROUP_IX;
+
+		ret = ppc_md.hpte_updatepp(slot, rflags, vpn,
+					   MMU_PAGE_4K, MMU_PAGE_4K,
+					   ssize, flags);
+		/*
+		 * If we failed, typically because the HPTE wasn't really here,
+		 * we try an insertion.
+		 */
+		if (ret == -1)
+			goto htab_insert_hpte;
+
+		*ptep = __pte(new_pte & ~_PAGE_BUSY);
+		return 0;
+	}
+
+htab_insert_hpte:
+	/*
+	 * handle _PAGE_4K_PFN case
+	 */
+	if (old_pte & _PAGE_4K_PFN) {
+		/*
+		 * All the sub 4k page have the same
+		 * physical address.
+		 */
+		pa = pte_pfn(__pte(old_pte)) << HW_PAGE_SHIFT;
+	} else {
+		pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
+		pa += (subpg_index << shift);
+	}
+	hash = hpt_hash(vpn, shift, ssize);
+repeat:
+	hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
+
+	/* Insert into the hash table, primary slot */
+	slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, 0,
+				  MMU_PAGE_4K, MMU_PAGE_4K, ssize);
+	/*
+	 * Primary is full, try the secondary
+	 */
+	if (unlikely(slot == -1)) {
+		hpte_group = ((~hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
+		slot = ppc_md.hpte_insert(hpte_group, vpn, pa,
+					  rflags, HPTE_V_SECONDARY,
+					  MMU_PAGE_4K, MMU_PAGE_4K, ssize);
+		if (slot == -1) {
+			if (mftb() & 0x1)
+				hpte_group = ((hash & htab_hash_mask) *
+					      HPTES_PER_GROUP) & ~0x7UL;
+			ppc_md.hpte_remove(hpte_group);
+			/*
+			 * FIXME!! Should we try the group from which we removed ?
+			 */
+			goto repeat;
+		}
+	}
+	/*
+	 * Hypervisor failure. Restore old pte and return -1
+	 * similar to __hash_page_*
+	 */
+	if (unlikely(slot == -2)) {
+		*ptep = __pte(old_pte);
+		hash_failure_debug(ea, access, vsid, trap, ssize,
+				   MMU_PAGE_4K, MMU_PAGE_4K, old_pte);
+		return -1;
+	}
+	/*
+	 * Insert slot number & secondary bit in PTE second half,
+	 * clear _PAGE_BUSY and set appropriate HPTE slot bit
+	 * Since we have _PAGE_BUSY set on ptep, we can be sure
+	 * nobody is updating hidx.
+	 */
+	hidxp = (unsigned long *)(ptep + PTRS_PER_PTE);
+	/* __real_pte use pte_val() any idea why ? FIXME!! */
+	rpte.hidx &= ~(0xfUL << (subpg_index << 2));
+	*hidxp = rpte.hidx  | (slot << (subpg_index << 2));
+	new_pte |= (_PAGE_HPTE_SUB0 >> subpg_index);
+	/*
+	 * check __real_pte for details on matching smp_rmb()
+	 */
+	smp_wmb();
+	*ptep = __pte(new_pte & ~_PAGE_BUSY);
+	return 0;
+}
diff --git a/arch/powerpc/mm/hash_low_64.S b/arch/powerpc/mm/hash_low_64.S
index 3b49e3295901..6b4d4c1d0628 100644
--- a/arch/powerpc/mm/hash_low_64.S
+++ b/arch/powerpc/mm/hash_low_64.S
@@ -328,381 +328,8 @@ htab_pte_insert_failure:
 	li	r3,-1
 	b	htab_bail
 
-
 #else /* CONFIG_PPC_64K_PAGES */
 
-
-/*****************************************************************************
- *                                                                           *
- *           64K SW & 4K or 64K HW in a 4K segment pages implementation      *
- *                                                                           *
- *****************************************************************************/
-
-/* _hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
- *		 pte_t *ptep, unsigned long trap, unsigned local flags,
- *		 int ssize, int subpg_prot)
- */
-
-/*
- * For now, we do NOT implement Admixed pages
- */
-_GLOBAL(__hash_page_4K)
-	mflr	r0
-	std	r0,16(r1)
-	stdu	r1,-STACKFRAMESIZE(r1)
-	/* Save all params that we need after a function call */
-	std	r6,STK_PARAM(R6)(r1)
-	std	r8,STK_PARAM(R8)(r1)
-	std	r9,STK_PARAM(R9)(r1)
-
-	/* Save non-volatile registers.
-	 * r31 will hold "old PTE"
-	 * r30 is "new PTE"
-	 * r29 is vpn
-	 * r28 is a hash value
-	 * r27 is hashtab mask (maybe dynamic patched instead ?)
-	 * r26 is the hidx mask
-	 * r25 is the index in combo page
-	 */
-	std	r25,STK_REG(R25)(r1)
-	std	r26,STK_REG(R26)(r1)
-	std	r27,STK_REG(R27)(r1)
-	std	r28,STK_REG(R28)(r1)
-	std	r29,STK_REG(R29)(r1)
-	std	r30,STK_REG(R30)(r1)
-	std	r31,STK_REG(R31)(r1)
-
-	/* Step 1:
-	 *
-	 * Check permissions, atomically mark the linux PTE busy
-	 * and hashed.
-	 */
-1:
-	ldarx	r31,0,r6
-	/* Check access rights (access & ~(pte_val(*ptep))) */
-	andc.	r0,r4,r31
-	bne-	htab_wrong_access
-	/* Check if PTE is busy */
-	andi.	r0,r31,_PAGE_BUSY
-	/* If so, just bail out and refault if needed. Someone else
-	 * is changing this PTE anyway and might hash it.
-	 */
-	bne-	htab_bail_ok
-	/* Prepare new PTE value (turn access RW into DIRTY, then
-	 * add BUSY and ACCESSED)
-	 */
-	rlwinm	r30,r4,32-9+7,31-7,31-7	/* _PAGE_RW -> _PAGE_DIRTY */
-	or	r30,r30,r31
-	ori	r30,r30,_PAGE_BUSY | _PAGE_ACCESSED
-	oris	r30,r30,_PAGE_COMBO@h
-	/* Write the linux PTE atomically (setting busy) */
-	stdcx.	r30,0,r6
-	bne-	1b
-	isync
-
-	/* Step 2:
-	 *
-	 * Insert/Update the HPTE in the hash table. At this point,
-	 * r4 (access) is re-useable, we use it for the new HPTE flags
-	 */
-
-	/* Load the hidx index */
-	rldicl	r25,r3,64-12,60
-
-BEGIN_FTR_SECTION
-	cmpdi	r9,0			/* check segment size */
-	bne	3f
-END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
-	/* Calc vpn and put it in r29 */
-	sldi	r29,r5,SID_SHIFT - VPN_SHIFT
-	/*
-	 * clrldi r3,r3,64 - SID_SHIFT -->  ea & 0xfffffff
-	 * srdi	 r28,r3,VPN_SHIFT
-	 */
-	rldicl  r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT - VPN_SHIFT)
-	or	r29,r28,r29
-	/*
-	 * Calculate hash value for primary slot and store it in r28
-	 * r3 = va, r5 = vsid
-	 * r0 = (va >> 12) & ((1ul << (28 - 12)) -1)
-	 */
-	rldicl	r0,r3,64-12,48
-	xor	r28,r5,r0		/* hash */
-	b	4f
-
-3:	/* Calc vpn and put it in r29 */
-	sldi	r29,r5,SID_SHIFT_1T - VPN_SHIFT
-	/*
-	 * clrldi r3,r3,64 - SID_SHIFT_1T -->  ea & 0xffffffffff
-	 * srdi	r28,r3,VPN_SHIFT
-	 */
-	rldicl  r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT_1T - VPN_SHIFT)
-	or	r29,r28,r29
-
-	/*
-	 * Calculate hash value for primary slot and
-	 * store it in r28  for 1T segment
-	 * r3 = va, r5 = vsid
-	 */
-	sldi	r28,r5,25		/* vsid << 25 */
-	/* r0 = (va >> 12) & ((1ul << (40 - 12)) -1) */
-	rldicl	r0,r3,64-12,36
-	xor	r28,r28,r5		/* vsid ^ ( vsid << 25) */
-	xor	r28,r28,r0		/* hash */
-
-	/* Convert linux PTE bits into HW equivalents */
-4:
-#ifdef CONFIG_PPC_SUBPAGE_PROT
-	andc	r10,r30,r10
-	andi.	r3,r10,0x1fe		/* Get basic set of flags */
-	rlwinm	r0,r10,32-9+1,30,30	/* _PAGE_RW -> _PAGE_USER (r0) */
-#else
-	andi.	r3,r30,0x1fe		/* Get basic set of flags */
-	rlwinm	r0,r30,32-9+1,30,30	/* _PAGE_RW -> _PAGE_USER (r0) */
-#endif
-	xori	r3,r3,HPTE_R_N		/* _PAGE_EXEC -> NOEXEC */
-	rlwinm	r4,r30,32-7+1,30,30	/* _PAGE_DIRTY -> _PAGE_USER (r4) */
-	and	r0,r0,r4		/* _PAGE_RW & _PAGE_DIRTY ->r0 bit 30*/
-	andc	r0,r3,r0		/* r0 = pte & ~r0 */
-	rlwimi	r3,r0,32-1,31,31	/* Insert result into PP lsb */
-	/*
-	 * Always add "C" bit for perf. Memory coherence is always enabled
-	 */
-	ori	r3,r3,HPTE_R_C | HPTE_R_M
-
-	/* We eventually do the icache sync here (maybe inline that
-	 * code rather than call a C function...)
-	 */
-BEGIN_FTR_SECTION
-	mr	r4,r30
-	mr	r5,r7
-	bl	hash_page_do_lazy_icache
-END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE)
-
-	/* At this point, r3 contains new PP bits, save them in
-	 * place of "access" in the param area (sic)
-	 */
-	std	r3,STK_PARAM(R4)(r1)
-
-	/* Get htab_hash_mask */
-	ld	r4,htab_hash_mask@got(2)
-	ld	r27,0(r4)	/* htab_hash_mask -> r27 */
-
-	/* Check if we may already be in the hashtable, in this case, we
-	 * go to out-of-line code to try to modify the HPTE. We look for
-	 * the bit at (1 >> (index + 32))
-	 */
-	rldicl.	r0,r31,64-12,48
-	li	r26,0			/* Default hidx */
-	beq	htab_insert_pte
-
-	/*
-	 * Check if the pte was already inserted into the hash table
-	 * as a 64k HW page, and invalidate the 64k HPTE if so.
-	 */
-	andis.	r0,r31,_PAGE_COMBO@h
-	beq	htab_inval_old_hpte
-
-	ld	r6,STK_PARAM(R6)(r1)
-	ori	r26,r6,PTE_PAGE_HIDX_OFFSET /* Load the hidx mask. */
-	ld	r26,0(r26)
-	addi	r5,r25,36		/* Check actual HPTE_SUB bit, this */
-	rldcr.	r0,r31,r5,0		/* must match pgtable.h definition */
-	bne	htab_modify_pte
-
-htab_insert_pte:
-	/* real page number in r5, PTE RPN value + index */
-	andis.	r0,r31,_PAGE_4K_PFN@h
-	srdi	r5,r31,PTE_RPN_SHIFT
-	bne-	htab_special_pfn
-	sldi	r5,r5,PAGE_FACTOR
-	add	r5,r5,r25
-htab_special_pfn:
-	sldi	r5,r5,HW_PAGE_SHIFT
-
-	/* Calculate primary group hash */
-	and	r0,r28,r27
-	rldicr	r3,r0,3,63-3		/* r0 = (hash & mask) << 3 */
-
-	/* Call ppc_md.hpte_insert */
-	ld	r6,STK_PARAM(R4)(r1)	/* Retrieve new pp bits */
-	mr	r4,r29			/* Retrieve vpn */
-	li	r7,0			/* !bolted, !secondary */
-	li	r8,MMU_PAGE_4K		/* page size */
-	li	r9,MMU_PAGE_4K		/* actual page size */
-	ld	r10,STK_PARAM(R9)(r1)	/* segment size */
-.globl htab_call_hpte_insert1
-htab_call_hpte_insert1:
-	bl	.			/* patched by htab_finish_init() */
-	cmpdi	0,r3,0
-	bge	htab_pte_insert_ok	/* Insertion successful */
-	cmpdi	0,r3,-2			/* Critical failure */
-	beq-	htab_pte_insert_failure
-
-	/* Now try secondary slot */
-
-	/* real page number in r5, PTE RPN value + index */
-	andis.	r0,r31,_PAGE_4K_PFN@h
-	srdi	r5,r31,PTE_RPN_SHIFT
-	bne-	3f
-	sldi	r5,r5,PAGE_FACTOR
-	add	r5,r5,r25
-3:	sldi	r5,r5,HW_PAGE_SHIFT
-
-	/* Calculate secondary group hash */
-	andc	r0,r27,r28
-	rldicr	r3,r0,3,63-3		/* r0 = (~hash & mask) << 3 */
-
-	/* Call ppc_md.hpte_insert */
-	ld	r6,STK_PARAM(R4)(r1)	/* Retrieve new pp bits */
-	mr	r4,r29			/* Retrieve vpn */
-	li	r7,HPTE_V_SECONDARY	/* !bolted, secondary */
-	li	r8,MMU_PAGE_4K		/* page size */
-	li	r9,MMU_PAGE_4K		/* actual page size */
-	ld	r10,STK_PARAM(R9)(r1)	/* segment size */
-.globl htab_call_hpte_insert2
-htab_call_hpte_insert2:
-	bl	.			/* patched by htab_finish_init() */
-	cmpdi	0,r3,0
-	bge+	htab_pte_insert_ok	/* Insertion successful */
-	cmpdi	0,r3,-2			/* Critical failure */
-	beq-	htab_pte_insert_failure
-
-	/* Both are full, we need to evict something */
-	mftb	r0
-	/* Pick a random group based on TB */
-	andi.	r0,r0,1
-	mr	r5,r28
-	bne	2f
-	not	r5,r5
-2:	and	r0,r5,r27
-	rldicr	r3,r0,3,63-3		/* r0 = (hash & mask) << 3 */
-	/* Call ppc_md.hpte_remove */
-.globl htab_call_hpte_remove
-htab_call_hpte_remove:
-	bl	.			/* patched by htab_finish_init() */
-
-	/* Try all again */
-	b	htab_insert_pte
-
-	/*
-	 * Call out to C code to invalidate an 64k HW HPTE that is
-	 * useless now that the segment has been switched to 4k pages.
-	 */
-htab_inval_old_hpte:
-	mr	r3,r29			/* vpn */
-	mr	r4,r31			/* PTE.pte */
-	li	r5,0			/* PTE.hidx */
-	li	r6,MMU_PAGE_64K		/* psize */
-	ld	r7,STK_PARAM(R9)(r1)	/* ssize */
-	ld	r8,STK_PARAM(R8)(r1)	/* flags */
-	bl	flush_hash_page
-	/* Clear out _PAGE_HPTE_SUB bits in the new linux PTE */
-	lis	r0,_PAGE_HPTE_SUB@h
-	ori	r0,r0,_PAGE_HPTE_SUB@l
-	andc	r30,r30,r0
-	b	htab_insert_pte
-	
-htab_bail_ok:
-	li	r3,0
-	b	htab_bail
-
-htab_pte_insert_ok:
-	/* Insert slot number & secondary bit in PTE second half,
-	 * clear _PAGE_BUSY and set approriate HPTE slot bit
-	 */
-	ld	r6,STK_PARAM(R6)(r1)
-	li	r0,_PAGE_BUSY
-	andc	r30,r30,r0
-	/* HPTE SUB bit */
-	li	r0,1
-	subfic	r5,r25,27		/* Must match bit position in */
-	sld	r0,r0,r5		/* pgtable.h */
-	or	r30,r30,r0
-	/* hindx */
-	sldi	r5,r25,2
-	sld	r3,r3,r5
-	li	r4,0xf
-	sld	r4,r4,r5
-	andc	r26,r26,r4
-	or	r26,r26,r3
-	ori	r5,r6,PTE_PAGE_HIDX_OFFSET
-	std	r26,0(r5)
-	lwsync
-	std	r30,0(r6)
-	li	r3, 0
-htab_bail:
-	ld	r25,STK_REG(R25)(r1)
-	ld	r26,STK_REG(R26)(r1)
-	ld	r27,STK_REG(R27)(r1)
-	ld	r28,STK_REG(R28)(r1)
-	ld	r29,STK_REG(R29)(r1)
-	ld      r30,STK_REG(R30)(r1)
-	ld      r31,STK_REG(R31)(r1)
-	addi    r1,r1,STACKFRAMESIZE
-	ld      r0,16(r1)
-	mtlr    r0
-	blr
-
-htab_modify_pte:
-	/* Keep PP bits in r4 and slot idx from the PTE around in r3 */
-	mr	r4,r3
-	sldi	r5,r25,2
-	srd	r3,r26,r5
-
-	/* Secondary group ? if yes, get a inverted hash value */
-	mr	r5,r28
-	andi.	r0,r3,0x8 /* page secondary ? */
-	beq	1f
-	not	r5,r5
-1:	andi.	r3,r3,0x7 /* extract idx alone */
-
-	/* Calculate proper slot value for ppc_md.hpte_updatepp */
-	and	r0,r5,r27
-	rldicr	r0,r0,3,63-3	/* r0 = (hash & mask) << 3 */
-	add	r3,r0,r3	/* add slot idx */
-
-	/* Call ppc_md.hpte_updatepp */
-	mr	r5,r29			/* vpn */
-	li	r6,MMU_PAGE_4K		/* base page size */
-	li	r7,MMU_PAGE_4K		/* actual page size */
-	ld	r8,STK_PARAM(R9)(r1)	/* segment size */
-	ld	r9,STK_PARAM(R8)(r1)	/* get "flags" param */
-.globl htab_call_hpte_updatepp
-htab_call_hpte_updatepp:
-	bl	.			/* patched by htab_finish_init() */
-
-	/* if we failed because typically the HPTE wasn't really here
-	 * we try an insertion.
-	 */
-	cmpdi	0,r3,-1
-	beq-	htab_insert_pte
-
-	/* Clear the BUSY bit and Write out the PTE */
-	li	r0,_PAGE_BUSY
-	andc	r30,r30,r0
-	ld	r6,STK_PARAM(R6)(r1)
-	std	r30,0(r6)
-	li	r3,0
-	b	htab_bail
-
-htab_wrong_access:
-	/* Bail out clearing reservation */
-	stdcx.	r31,0,r6
-	li	r3,1
-	b	htab_bail
-
-htab_pte_insert_failure:
-	/* Bail out restoring old PTE */
-	ld	r6,STK_PARAM(R6)(r1)
-	std	r31,0(r6)
-	li	r3,-1
-	b	htab_bail
-
-#endif /* CONFIG_PPC_64K_PAGES */
-
-#ifdef CONFIG_PPC_64K_PAGES
-
 /*****************************************************************************
  *                                                                           *
  *           64K SW & 64K HW in a 64K segment pages implementation           *
@@ -994,10 +621,3 @@ ht64_pte_insert_failure:
 
 
 #endif /* CONFIG_PPC_64K_PAGES */
-
-
-/*****************************************************************************
- *                                                                           *
- *           Huge pages implementation is in hugetlbpage.c                   *
- *                                                                           *
- *****************************************************************************/
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 7d4f254a2671..995809911f17 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -653,7 +653,7 @@ static void __init htab_finish_init(void)
 	patch_branch(ht64_call_hpte_updatepp,
 		ppc_function_entry(ppc_md.hpte_updatepp),
 		BRANCH_SET_LINK);
-#endif /* CONFIG_PPC_64K_PAGES */
+#else /* !CONFIG_PPC_64K_PAGES */
 
 	patch_branch(htab_call_hpte_insert1,
 		ppc_function_entry(ppc_md.hpte_insert),
@@ -667,6 +667,8 @@ static void __init htab_finish_init(void)
 	patch_branch(htab_call_hpte_updatepp,
 		ppc_function_entry(ppc_md.hpte_updatepp),
 		BRANCH_SET_LINK);
+#endif
+
 }
 
 static void __init htab_initialize(void)

From 106713a14590cd7b223db000f4f47f7d1d898153 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 1 Dec 2015 09:06:44 +0530
Subject: [PATCH 048/149] powerpc/mm: Remove the dependency on pte bit position
 in asm code

We should not depend on pte bit positions in asm code. Simplify
this by moving part of that to C.

Acked-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/exceptions-64s.S | 18 ++++-------------
 arch/powerpc/mm/hash_utils_64.c      | 29 ++++++++++++++++++++++++++++
 2 files changed, 33 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 1a03142a69fd..3419cbf2ad59 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1556,29 +1556,19 @@ do_hash_page:
 	lwz	r0,TI_PREEMPT(r11)	/* If we're in an "NMI" */
 	andis.	r0,r0,NMI_MASK@h	/* (i.e. an irq when soft-disabled) */
 	bne	77f			/* then don't call hash_page now */
-	/*
-	 * We need to set the _PAGE_USER bit if MSR_PR is set or if we are
-	 * accessing a userspace segment (even from the kernel). We assume
-	 * kernel addresses always have the high bit set.
-	 */
-	rlwinm	r4,r4,32-25+9,31-9,31-9	/* DSISR_STORE -> _PAGE_RW */
-	rotldi	r0,r3,15		/* Move high bit into MSR_PR posn */
-	orc	r0,r12,r0		/* MSR_PR | ~high_bit */
-	rlwimi	r4,r0,32-13,30,30	/* becomes _PAGE_USER access bit */
-	ori	r4,r4,1			/* add _PAGE_PRESENT */
-	rlwimi	r4,r5,22+2,31-2,31-2	/* Set _PAGE_EXEC if trap is 0x400 */
 
 	/*
 	 * r3 contains the faulting address
-	 * r4 contains the required access permissions
+	 * r4 contains the msr
 	 * r5 contains the trap number
 	 * r6 contains dsisr
 	 *
 	 * at return r3 = 0 for success, 1 for page fault, negative for error
 	 */
+        mr 	r4,r12
 	ld      r6,_DSISR(r1)
-	bl	hash_page		/* build HPTE if possible */
-	cmpdi	r3,0			/* see if hash_page succeeded */
+	bl	__hash_page		/* build HPTE if possible */
+        cmpdi	r3,0			/* see if __hash_page succeeded */
 
 	/* Success */
 	beq	fast_exc_return_irq	/* Return from exception on success */
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 995809911f17..30b7648e687a 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -1206,6 +1206,35 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap,
 }
 EXPORT_SYMBOL_GPL(hash_page);
 
+int __hash_page(unsigned long ea, unsigned long msr, unsigned long trap,
+		unsigned long dsisr)
+{
+	unsigned long access = _PAGE_PRESENT;
+	unsigned long flags = 0;
+	struct mm_struct *mm = current->mm;
+
+	if (REGION_ID(ea) == VMALLOC_REGION_ID)
+		mm = &init_mm;
+
+	if (dsisr & DSISR_NOHPTE)
+		flags |= HPTE_NOHPTE_UPDATE;
+
+	if (dsisr & DSISR_ISSTORE)
+		access |= _PAGE_RW;
+	/*
+	 * We need to set the _PAGE_USER bit if MSR_PR is set or if we are
+	 * accessing a userspace segment (even from the kernel). We assume
+	 * kernel addresses always have the high bit set.
+	 */
+	if ((msr & MSR_PR) || (REGION_ID(ea) == USER_REGION_ID))
+		access |= _PAGE_USER;
+
+	if (trap == 0x400)
+		access |= _PAGE_EXEC;
+
+	return hash_page_mm(mm, ea, access, trap, flags);
+}
+
 void hash_preload(struct mm_struct *mm, unsigned long ea,
 		  unsigned long access, unsigned long trap)
 {

From bf680d51605662aae5482d87e0e0a54ba6db056b Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 1 Dec 2015 09:06:45 +0530
Subject: [PATCH 049/149] powerpc/mm: Don't track subpage valid bit in pte_t

This frees up 11 bits in pte_t. In a later patch we also change
the pte_t format so that we can start supporting migration pte
at pmd level. We now track the 4k subpage valid bit as below

If we have _PAGE_COMBO set, we override the _PAGE_F_GIX_SHIFT
and _PAGE_F_SECOND. Together we have 4 bits, each of them
used to indicate whether any of the 4 4k subpage in that group
is valid. ie,

[ group 1 bit ]   [ group 2 bit ]  ..... [ group 4 ]
[ subpage 1 - 4]  [ subpage 5- 8]  ..... [ subpage 13 - 16]

We still track each 4k subpage slot number and secondary hash
information in the second half of pgtable_t. Removing the subpage
tracking has some significant overhead on aim9 and ebizzy benchmarks, and
to support THP with 4K subpage, we do need a pgtable_t of 4096 bytes.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/64/hash-4k.h  | 10 +-----
 arch/powerpc/include/asm/book3s/64/hash-64k.h | 35 ++++---------------
 arch/powerpc/include/asm/book3s/64/hash.h     | 10 +++---
 arch/powerpc/mm/hash64_64k.c                  | 34 ++++++++++++++++--
 arch/powerpc/mm/hash_low_64.S                 |  6 +---
 arch/powerpc/mm/hugetlbpage-hash64.c          |  5 +--
 arch/powerpc/mm/pgtable_64.c                  |  2 +-
 7 files changed, 48 insertions(+), 54 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h
index 537eacecf6e9..75e8b9326e4b 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-4k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h
@@ -47,17 +47,9 @@
 /* Bits to mask out from a PGD to get to the PUD page */
 #define PGD_MASKED_BITS		0
 
-/* PTE bits */
-#define _PAGE_HASHPTE	0x0400 /* software: pte has an associated HPTE */
-#define _PAGE_SECONDARY 0x8000 /* software: HPTE is in secondary group */
-#define _PAGE_GROUP_IX  0x7000 /* software: HPTE index within group */
-#define _PAGE_F_SECOND  _PAGE_SECONDARY
-#define _PAGE_F_GIX     _PAGE_GROUP_IX
-#define _PAGE_SPECIAL	0x10000 /* software: special page */
-
 /* PTE flags to conserve for HPTE identification */
 #define _PAGE_HPTEFLAGS (_PAGE_BUSY | _PAGE_HASHPTE | \
-			 _PAGE_SECONDARY | _PAGE_GROUP_IX)
+			 _PAGE_F_SECOND | _PAGE_F_GIX)
 
 /* shift to put page number into pte */
 #define PTE_RPN_SHIFT	(17)
diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h
index ee073822145d..a268416ca4a4 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
@@ -31,33 +31,13 @@
 /* Bits to mask out from a PGD/PUD to get to the PMD page */
 #define PUD_MASKED_BITS		0x1ff
 
-/* Additional PTE bits (don't change without checking asm in hash_low.S) */
-#define _PAGE_SPECIAL	0x00000400 /* software: special page */
-#define _PAGE_HPTE_SUB	0x0ffff000 /* combo only: sub pages HPTE bits */
-#define _PAGE_HPTE_SUB0	0x08000000 /* combo only: first sub page */
-#define _PAGE_COMBO	0x10000000 /* this is a combo 4k page */
-#define _PAGE_4K_PFN	0x20000000 /* PFN is for a single 4k page */
-
-/* For 64K page, we don't have a separate _PAGE_HASHPTE bit. Instead,
- * we set that to be the whole sub-bits mask. The C code will only
- * test this, so a multi-bit mask will work. For combo pages, this
- * is equivalent as effectively, the old _PAGE_HASHPTE was an OR of
- * all the sub bits. For real 64k pages, we now have the assembly set
- * _PAGE_HPTE_SUB0 in addition to setting the HIDX bits which overlap
- * that mask. This is fine as long as the HIDX bits are never set on
- * a PTE that isn't hashed, which is the case today.
- *
- * A little nit is for the huge page C code, which does the hashing
- * in C, we need to provide which bit to use.
+#define _PAGE_COMBO	0x00020000 /* this is a combo 4k page */
+#define _PAGE_4K_PFN	0x00040000 /* PFN is for a single 4k page */
+/*
+ * Used to track subpage group valid if _PAGE_COMBO is set
+ * This overloads _PAGE_F_GIX and _PAGE_F_SECOND
  */
-#define _PAGE_HASHPTE	_PAGE_HPTE_SUB
-
-/* Note the full page bits must be in the same location as for normal
- * 4k pages as the same assembly will be used to insert 64K pages
- * whether the kernel has CONFIG_PPC_64K_PAGES or not
- */
-#define _PAGE_F_SECOND  0x00008000 /* full page: hidx bits */
-#define _PAGE_F_GIX     0x00007000 /* full page: hidx bits */
+#define _PAGE_COMBO_VALID	(_PAGE_F_GIX | _PAGE_F_SECOND)
 
 /* PTE flags to conserve for HPTE identification */
 #define _PAGE_HPTEFLAGS (_PAGE_BUSY | _PAGE_HASHPTE | _PAGE_COMBO)
@@ -103,8 +83,7 @@ static inline unsigned long __rpte_to_hidx(real_pte_t rpte, unsigned long index)
 }
 
 #define __rpte_to_pte(r)	((r).pte)
-#define __rpte_sub_valid(rpte, index) \
-	(pte_val(rpte.pte) & (_PAGE_HPTE_SUB0 >> (index)))
+extern bool __rpte_sub_valid(real_pte_t rpte, unsigned long index);
 /*
  * Trick: we set __end to va + 64k, which happens works for
  * a 16M page as well as we want only one iteration
diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h
index 48237e66e823..2f2034621a69 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -81,7 +81,12 @@
 #define _PAGE_DIRTY		0x0080 /* C: page changed */
 #define _PAGE_ACCESSED		0x0100 /* R: page referenced */
 #define _PAGE_RW		0x0200 /* software: user write access allowed */
+#define _PAGE_HASHPTE		0x0400 /* software: pte has an associated HPTE */
 #define _PAGE_BUSY		0x0800 /* software: PTE & hash are busy */
+#define _PAGE_F_GIX		0x7000 /* full page: hidx bits */
+#define _PAGE_F_GIX_SHIFT	12
+#define _PAGE_F_SECOND		0x8000 /* Whether to use secondary hash or not */
+#define _PAGE_SPECIAL		0x10000 /* software: special page */
 
 /* No separate kernel read-only */
 #define _PAGE_KERNEL_RW		(_PAGE_RW | _PAGE_DIRTY) /* user access blocked by key */
@@ -210,11 +215,6 @@
 
 #define PMD_BAD_BITS		(PTE_TABLE_SIZE-1)
 #define PUD_BAD_BITS		(PMD_TABLE_SIZE-1)
-/*
- * We save the slot number & secondary bit in the second half of the
- * PTE page. We use the 8 bytes per each pte entry.
- */
-#define PTE_PAGE_HIDX_OFFSET (PTRS_PER_PTE * 8)
 
 #ifndef __ASSEMBLY__
 #define	pmd_bad(pmd)		(!is_kernel_addr(pmd_val(pmd)) \
diff --git a/arch/powerpc/mm/hash64_64k.c b/arch/powerpc/mm/hash64_64k.c
index 9ffeae2cbb57..f1b86ba63430 100644
--- a/arch/powerpc/mm/hash64_64k.c
+++ b/arch/powerpc/mm/hash64_64k.c
@@ -15,6 +15,35 @@
 #include <linux/mm.h>
 #include <asm/machdep.h>
 #include <asm/mmu.h>
+/*
+ * index (0 - 15) selects a 4K sub-page; one valid bit covers 4 sub-pages
+ */
+bool __rpte_sub_valid(real_pte_t rpte, unsigned long index)
+{
+	unsigned long g_idx;
+	unsigned long ptev = pte_val(rpte.pte);
+
+	g_idx = (ptev & _PAGE_COMBO_VALID) >> _PAGE_F_GIX_SHIFT;
+	index = index >> 2;
+	if (g_idx & (0x1 << index))
+		return true;
+	else
+		return false;
+}
+/*
+ * index from 0 - 15; sets the group valid bit, no-op without _PAGE_COMBO
+ */
+static unsigned long mark_subptegroup_valid(unsigned long ptev, unsigned long index)
+{
+	unsigned long g_idx;
+
+	if (!(ptev & _PAGE_COMBO))
+		return ptev;
+	index = index >> 2;
+	g_idx = 0x1 << index;
+
+	return ptev | (g_idx << _PAGE_F_GIX_SHIFT);
+}
 
 int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
 		   pte_t *ptep, unsigned long trap, unsigned long flags,
@@ -102,7 +131,7 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
 	 */
 	if (!(old_pte & _PAGE_COMBO)) {
 		flush_hash_page(vpn, rpte, MMU_PAGE_64K, ssize, flags);
-		old_pte &= ~_PAGE_HPTE_SUB;
+		old_pte &= ~(_PAGE_HASHPTE | _PAGE_F_GIX | _PAGE_F_SECOND);
 		goto htab_insert_hpte;
 	}
 	/*
@@ -192,7 +221,8 @@ repeat:
 	/* __real_pte use pte_val() any idea why ? FIXME!! */
 	rpte.hidx &= ~(0xfUL << (subpg_index << 2));
 	*hidxp = rpte.hidx  | (slot << (subpg_index << 2));
-	new_pte |= (_PAGE_HPTE_SUB0 >> subpg_index);
+	new_pte = mark_subptegroup_valid(new_pte, subpg_index);
+	new_pte |=  _PAGE_HASHPTE;
 	/*
 	 * check __real_pte for details on matching smp_rmb()
 	 */
diff --git a/arch/powerpc/mm/hash_low_64.S b/arch/powerpc/mm/hash_low_64.S
index 6b4d4c1d0628..359839a57f26 100644
--- a/arch/powerpc/mm/hash_low_64.S
+++ b/arch/powerpc/mm/hash_low_64.S
@@ -285,7 +285,7 @@ htab_modify_pte:
 
 	/* Secondary group ? if yes, get a inverted hash value */
 	mr	r5,r28
-	andi.	r0,r31,_PAGE_SECONDARY
+	andi.	r0,r31,_PAGE_F_SECOND
 	beq	1f
 	not	r5,r5
 1:
@@ -473,11 +473,7 @@ ht64_insert_pte:
 	lis	r0,_PAGE_HPTEFLAGS@h
 	ori	r0,r0,_PAGE_HPTEFLAGS@l
 	andc	r30,r30,r0
-#ifdef CONFIG_PPC_64K_PAGES
-	oris	r30,r30,_PAGE_HPTE_SUB0@h
-#else
 	ori	r30,r30,_PAGE_HASHPTE
-#endif
 	/* Phyical address in r5 */
 	rldicl	r5,r31,64-PTE_RPN_SHIFT,PTE_RPN_SHIFT
 	sldi	r5,r5,PAGE_SHIFT
diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c
index d94b1af53a93..7584e8445512 100644
--- a/arch/powerpc/mm/hugetlbpage-hash64.c
+++ b/arch/powerpc/mm/hugetlbpage-hash64.c
@@ -91,11 +91,8 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
 		pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
 
 		/* clear HPTE slot informations in new PTE */
-#ifdef CONFIG_PPC_64K_PAGES
-		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HPTE_SUB0;
-#else
 		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
-#endif
+
 		/* Add in WIMG bits */
 		rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
 				      _PAGE_COHERENT | _PAGE_GUARDED));
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index d692ae31cfc7..3967e3cce03e 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -625,7 +625,7 @@ void pmdp_splitting_flush(struct vm_area_struct *vma,
 	"1:	ldarx	%0,0,%3\n\
 		andi.	%1,%0,%6\n\
 		bne-	1b \n\
-		ori	%1,%0,%4 \n\
+		oris	%1,%0,%4@h \n\
 		stdcx.	%1,0,%3 \n\
 		bne-	1b"
 	: "=&r" (old), "=&r" (tmp), "=m" (*pmdp)

From 506b863c68cd6e720037f1548e101932af3bb006 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 1 Dec 2015 09:06:46 +0530
Subject: [PATCH 050/149] powerpc/mm: Remove pte_val usage for the second half
 of pgtable_t

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/64/hash-64k.h | 4 +++-
 arch/powerpc/mm/hash64_64k.c                  | 1 -
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h
index a268416ca4a4..9fca7fae434b 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
@@ -61,6 +61,7 @@
 static inline real_pte_t __real_pte(pte_t pte, pte_t *ptep)
 {
 	real_pte_t rpte;
+	unsigned long *hidxp;
 
 	rpte.pte = pte;
 	rpte.hidx = 0;
@@ -70,7 +71,8 @@ static inline real_pte_t __real_pte(pte_t pte, pte_t *ptep)
 		 * check. The store side ordering is done in __hash_page_4K
 		 */
 		smp_rmb();
-		rpte.hidx = pte_val(*((ptep) + PTRS_PER_PTE));
+		hidxp = (unsigned long *)(ptep + PTRS_PER_PTE);
+		rpte.hidx = *hidxp;
 	}
 	return rpte;
 }
diff --git a/arch/powerpc/mm/hash64_64k.c b/arch/powerpc/mm/hash64_64k.c
index f1b86ba63430..8f7328075f04 100644
--- a/arch/powerpc/mm/hash64_64k.c
+++ b/arch/powerpc/mm/hash64_64k.c
@@ -218,7 +218,6 @@ repeat:
 	 * nobody is undating hidx.
 	 */
 	hidxp = (unsigned long *)(ptep + PTRS_PER_PTE);
-	/* __real_pte use pte_val() any idea why ? FIXME!! */
 	rpte.hidx &= ~(0xfUL << (subpg_index << 2));
 	*hidxp = rpte.hidx  | (slot << (subpg_index << 2));
 	new_pte = mark_subptegroup_valid(new_pte, subpg_index);

From 227fdbee5a963f4358bb1edd78a6f654574a4991 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 1 Dec 2015 09:06:47 +0530
Subject: [PATCH 051/149] powerpc/mm: Increase the width of #define

No real change, only style changes

Acked-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/64/hash.h | 26 +++++++++++------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h
index 2f2034621a69..e4ea9d73a541 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -69,23 +69,23 @@
  * We could create separate kernel read-only if we used the 3 PP bits
  * combinations that newer processors provide but we currently don't.
  */
-#define _PAGE_PRESENT		0x0001 /* software: pte contains a translation */
-#define _PAGE_USER		0x0002 /* matches one of the PP bits */
+#define _PAGE_PRESENT		0x00001 /* software: pte contains a translation */
+#define _PAGE_USER		0x00002 /* matches one of the PP bits */
 #define _PAGE_BIT_SWAP_TYPE	2
-#define _PAGE_EXEC		0x0004 /* No execute on POWER4 and newer (we invert) */
-#define _PAGE_GUARDED		0x0008
+#define _PAGE_EXEC		0x00004 /* No execute on POWER4 and newer (we invert) */
+#define _PAGE_GUARDED		0x00008
 /* We can derive Memory coherence from _PAGE_NO_CACHE */
 #define _PAGE_COHERENT		0x0
-#define _PAGE_NO_CACHE		0x0020 /* I: cache inhibit */
-#define _PAGE_WRITETHRU		0x0040 /* W: cache write-through */
-#define _PAGE_DIRTY		0x0080 /* C: page changed */
-#define _PAGE_ACCESSED		0x0100 /* R: page referenced */
-#define _PAGE_RW		0x0200 /* software: user write access allowed */
-#define _PAGE_HASHPTE		0x0400 /* software: pte has an associated HPTE */
-#define _PAGE_BUSY		0x0800 /* software: PTE & hash are busy */
-#define _PAGE_F_GIX		0x7000 /* full page: hidx bits */
+#define _PAGE_NO_CACHE		0x00020 /* I: cache inhibit */
+#define _PAGE_WRITETHRU		0x00040 /* W: cache write-through */
+#define _PAGE_DIRTY		0x00080 /* C: page changed */
+#define _PAGE_ACCESSED		0x00100 /* R: page referenced */
+#define _PAGE_RW		0x00200 /* software: user write access allowed */
+#define _PAGE_HASHPTE		0x00400 /* software: pte has an associated HPTE */
+#define _PAGE_BUSY		0x00800 /* software: PTE & hash are busy */
+#define _PAGE_F_GIX		0x07000 /* full page: hidx bits */
 #define _PAGE_F_GIX_SHIFT	12
-#define _PAGE_F_SECOND		0x8000 /* Whether to use secondary hash or not */
+#define _PAGE_F_SECOND		0x08000 /* Whether to use secondary hash or not */
 #define _PAGE_SPECIAL		0x10000 /* software: special page */
 
 /* No separate kernel read-only */

From 89ff725051d177556b23d80f2a30f880a657a6c1 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 1 Dec 2015 09:06:48 +0530
Subject: [PATCH 052/149] powerpc/mm: Convert __hash_page_64K to C

Convert from asm to C

Acked-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/64/hash-64k.h |   3 +-
 arch/powerpc/mm/hash64_64k.c                  | 130 ++++++++
 arch/powerpc/mm/hash_low_64.S                 | 290 +-----------------
 arch/powerpc/mm/hash_utils_64.c               |  19 +-
 4 files changed, 134 insertions(+), 308 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h
index 9fca7fae434b..46b5d0ab11de 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
@@ -40,7 +40,8 @@
 #define _PAGE_COMBO_VALID	(_PAGE_F_GIX | _PAGE_F_SECOND)
 
 /* PTE flags to conserve for HPTE identification */
-#define _PAGE_HPTEFLAGS (_PAGE_BUSY | _PAGE_HASHPTE | _PAGE_COMBO)
+#define _PAGE_HPTEFLAGS (_PAGE_BUSY | _PAGE_F_SECOND | \
+			 _PAGE_F_GIX | _PAGE_HASHPTE | _PAGE_COMBO)
 
 /* Shift to put page number into pte.
  *
diff --git a/arch/powerpc/mm/hash64_64k.c b/arch/powerpc/mm/hash64_64k.c
index 8f7328075f04..fc0898eb309d 100644
--- a/arch/powerpc/mm/hash64_64k.c
+++ b/arch/powerpc/mm/hash64_64k.c
@@ -229,3 +229,133 @@ repeat:
 	*ptep = __pte(new_pte & ~_PAGE_BUSY);
 	return 0;
 }
+
+int __hash_page_64K(unsigned long ea, unsigned long access,
+		    unsigned long vsid, pte_t *ptep, unsigned long trap,
+		    unsigned long flags, int ssize)
+{
+
+	unsigned long hpte_group;
+	unsigned long rflags, pa;
+	unsigned long old_pte, new_pte;
+	unsigned long vpn, hash, slot;
+	unsigned long shift = mmu_psize_defs[MMU_PAGE_64K].shift;
+
+	/*
+	 * atomically mark the linux PTE busy and accessed, dirty on write
+	 */
+	do {
+		pte_t pte = READ_ONCE(*ptep);
+
+		old_pte = pte_val(pte);
+		/* If PTE busy, retry the access */
+		if (unlikely(old_pte & _PAGE_BUSY))
+			return 0;
+		/* If PTE permissions don't match, take page fault */
+		if (unlikely(access & ~old_pte))
+			return 1;
+		/*
+		 * Check if PTE has the cache-inhibit bit set
+		 * If so, bail out and refault as a 4k page
+		 */
+		if (!mmu_has_feature(MMU_FTR_CI_LARGE_PAGE) &&
+		    unlikely(old_pte & _PAGE_NO_CACHE))
+			return 0;
+		/*
+		 * Try to lock the PTE: set BUSY and ACCESSED, plus DIRTY
+		 * if it was a write access. This is a 64K insert, so
+		 * _PAGE_COMBO is not added.
+		 */
+		new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED;
+		if (access & _PAGE_RW)
+			new_pte |= _PAGE_DIRTY;
+	} while (old_pte != __cmpxchg_u64((unsigned long *)ptep,
+					  old_pte, new_pte));
+	/*
+	 * PP bits. _PAGE_USER is already PP bit 0x2, so we only
+	 * need to add in 0x1 if it's a read-only user page
+	 */
+	rflags = new_pte & _PAGE_USER;
+	if ((new_pte & _PAGE_USER) && !((new_pte & _PAGE_RW) &&
+					(new_pte & _PAGE_DIRTY)))
+		rflags |= 0x1;
+	/*
+	 * _PAGE_EXEC -> HW_NO_EXEC since it's inverted
+	 */
+	rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
+	/*
+	 * Always add C and Memory coherence bit
+	 */
+	rflags |= HPTE_R_C | HPTE_R_M;
+	/*
+	 * Add in WIMG bits
+	 */
+	rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
+				_PAGE_COHERENT | _PAGE_GUARDED));
+
+	if (!cpu_has_feature(CPU_FTR_NOEXECUTE) &&
+	    !cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
+		rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
+
+	vpn  = hpt_vpn(ea, vsid, ssize);
+	if (unlikely(old_pte & _PAGE_HASHPTE)) {
+		/*
+		 * There MIGHT be an HPTE for this pte
+		 */
+		hash = hpt_hash(vpn, shift, ssize);
+		if (old_pte & _PAGE_F_SECOND)
+			hash = ~hash;
+		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+		slot += (old_pte & _PAGE_F_GIX) >> _PAGE_F_GIX_SHIFT;
+
+		if (ppc_md.hpte_updatepp(slot, rflags, vpn, MMU_PAGE_64K,
+					 MMU_PAGE_64K, ssize, flags) == -1)
+			old_pte &= ~_PAGE_HPTEFLAGS;
+	}
+
+	if (likely(!(old_pte & _PAGE_HASHPTE))) {
+
+		pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
+		hash = hpt_hash(vpn, shift, ssize);
+
+repeat:
+		hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
+
+		/* Insert into the hash table, primary slot */
+		slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, 0,
+				  MMU_PAGE_64K, MMU_PAGE_64K, ssize);
+		/*
+		 * Primary is full, try the secondary
+		 */
+		if (unlikely(slot == -1)) {
+			hpte_group = ((~hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
+			slot = ppc_md.hpte_insert(hpte_group, vpn, pa,
+						  rflags, HPTE_V_SECONDARY,
+						  MMU_PAGE_64K, MMU_PAGE_64K, ssize);
+			if (slot == -1) {
+				if (mftb() & 0x1)
+					hpte_group = ((hash & htab_hash_mask) *
+						      HPTES_PER_GROUP) & ~0x7UL;
+				ppc_md.hpte_remove(hpte_group);
+				/*
+				 * FIXME!! Should we try the group from which we removed ?
+				 */
+				goto repeat;
+			}
+		}
+		/*
+		 * Hypervisor failure. Restore old pte and return -1
+		 * similar to __hash_page_*
+		 */
+		if (unlikely(slot == -2)) {
+			*ptep = __pte(old_pte);
+			hash_failure_debug(ea, access, vsid, trap, ssize,
+					   MMU_PAGE_64K, MMU_PAGE_64K, old_pte);
+			return -1;
+		}
+		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
+		new_pte |= (slot << _PAGE_F_GIX_SHIFT) & (_PAGE_F_SECOND | _PAGE_F_GIX);
+	}
+	*ptep = __pte(new_pte & ~_PAGE_BUSY);
+	return 0;
+}
diff --git a/arch/powerpc/mm/hash_low_64.S b/arch/powerpc/mm/hash_low_64.S
index 359839a57f26..f7d49cf0ccb7 100644
--- a/arch/powerpc/mm/hash_low_64.S
+++ b/arch/powerpc/mm/hash_low_64.S
@@ -328,292 +328,4 @@ htab_pte_insert_failure:
 	li	r3,-1
 	b	htab_bail
 
-#else /* CONFIG_PPC_64K_PAGES */
-
-/*****************************************************************************
- *                                                                           *
- *           64K SW & 64K HW in a 64K segment pages implementation           *
- *                                                                           *
- *****************************************************************************/
-
-_GLOBAL(__hash_page_64K)
-	mflr	r0
-	std	r0,16(r1)
-	stdu	r1,-STACKFRAMESIZE(r1)
-	/* Save all params that we need after a function call */
-	std	r6,STK_PARAM(R6)(r1)
-	std	r8,STK_PARAM(R8)(r1)
-	std	r9,STK_PARAM(R9)(r1)
-
-	/* Save non-volatile registers.
-	 * r31 will hold "old PTE"
-	 * r30 is "new PTE"
-	 * r29 is vpn
-	 * r28 is a hash value
-	 * r27 is hashtab mask (maybe dynamic patched instead ?)
-	 */
-	std	r27,STK_REG(R27)(r1)
-	std	r28,STK_REG(R28)(r1)
-	std	r29,STK_REG(R29)(r1)
-	std	r30,STK_REG(R30)(r1)
-	std	r31,STK_REG(R31)(r1)
-
-	/* Step 1:
-	 *
-	 * Check permissions, atomically mark the linux PTE busy
-	 * and hashed.
-	 */
-1:
-	ldarx	r31,0,r6
-	/* Check access rights (access & ~(pte_val(*ptep))) */
-	andc.	r0,r4,r31
-	bne-	ht64_wrong_access
-	/* Check if PTE is busy */
-	andi.	r0,r31,_PAGE_BUSY
-	/* If so, just bail out and refault if needed. Someone else
-	 * is changing this PTE anyway and might hash it.
-	 */
-	bne-	ht64_bail_ok
-BEGIN_FTR_SECTION
-	/* Check if PTE has the cache-inhibit bit set */
-	andi.	r0,r31,_PAGE_NO_CACHE
-	/* If so, bail out and refault as a 4k page */
-	bne-	ht64_bail_ok
-END_MMU_FTR_SECTION_IFCLR(MMU_FTR_CI_LARGE_PAGE)
-	/* Prepare new PTE value (turn access RW into DIRTY, then
-	 * add BUSY and ACCESSED)
-	 */
-	rlwinm	r30,r4,32-9+7,31-7,31-7	/* _PAGE_RW -> _PAGE_DIRTY */
-	or	r30,r30,r31
-	ori	r30,r30,_PAGE_BUSY | _PAGE_ACCESSED
-	/* Write the linux PTE atomically (setting busy) */
-	stdcx.	r30,0,r6
-	bne-	1b
-	isync
-
-	/* Step 2:
-	 *
-	 * Insert/Update the HPTE in the hash table. At this point,
-	 * r4 (access) is re-useable, we use it for the new HPTE flags
-	 */
-
-BEGIN_FTR_SECTION
-	cmpdi	r9,0			/* check segment size */
-	bne	3f
-END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
-	/* Calc vpn and put it in r29 */
-	sldi	r29,r5,SID_SHIFT - VPN_SHIFT
-	rldicl  r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT - VPN_SHIFT)
-	or	r29,r28,r29
-
-	/* Calculate hash value for primary slot and store it in r28
-	 * r3 = va, r5 = vsid
-	 * r0 = (va >> 16) & ((1ul << (28 - 16)) -1)
-	 */
-	rldicl	r0,r3,64-16,52
-	xor	r28,r5,r0		/* hash */
-	b	4f
-
-3:	/* Calc vpn and put it in r29 */
-	sldi	r29,r5,SID_SHIFT_1T - VPN_SHIFT
-	rldicl  r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT_1T - VPN_SHIFT)
-	or	r29,r28,r29
-	/*
-	 * calculate hash value for primary slot and
-	 * store it in r28 for 1T segment
-	 * r3 = va, r5 = vsid
-	 */
-	sldi	r28,r5,25		/* vsid << 25 */
-	/* r0 = (va >> 16) & ((1ul << (40 - 16)) -1) */
-	rldicl	r0,r3,64-16,40
-	xor	r28,r28,r5		/* vsid ^ ( vsid << 25) */
-	xor	r28,r28,r0		/* hash */
-
-	/* Convert linux PTE bits into HW equivalents */
-4:	andi.	r3,r30,0x1fe		/* Get basic set of flags */
-	xori	r3,r3,HPTE_R_N		/* _PAGE_EXEC -> NOEXEC */
-	rlwinm	r0,r30,32-9+1,30,30	/* _PAGE_RW -> _PAGE_USER (r0) */
-	rlwinm	r4,r30,32-7+1,30,30	/* _PAGE_DIRTY -> _PAGE_USER (r4) */
-	and	r0,r0,r4		/* _PAGE_RW & _PAGE_DIRTY ->r0 bit 30*/
-	andc	r0,r30,r0		/* r0 = pte & ~r0 */
-	rlwimi	r3,r0,32-1,31,31	/* Insert result into PP lsb */
-	/*
-	 * Always add "C" bit for perf. Memory coherence is always enabled
-	 */
-	ori	r3,r3,HPTE_R_C | HPTE_R_M
-
-	/* We eventually do the icache sync here (maybe inline that
-	 * code rather than call a C function...)
-	 */
-BEGIN_FTR_SECTION
-	mr	r4,r30
-	mr	r5,r7
-	bl	hash_page_do_lazy_icache
-END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE)
-
-	/* At this point, r3 contains new PP bits, save them in
-	 * place of "access" in the param area (sic)
-	 */
-	std	r3,STK_PARAM(R4)(r1)
-
-	/* Get htab_hash_mask */
-	ld	r4,htab_hash_mask@got(2)
-	ld	r27,0(r4)	/* htab_hash_mask -> r27 */
-
-	/* Check if we may already be in the hashtable, in this case, we
-	 * go to out-of-line code to try to modify the HPTE
-	 */
-	rldicl.	r0,r31,64-12,48
-	bne	ht64_modify_pte
-
-ht64_insert_pte:
-	/* Clear hpte bits in new pte (we also clear BUSY btw) and
-	 * add _PAGE_HPTE_SUB0
-	 */
-	lis	r0,_PAGE_HPTEFLAGS@h
-	ori	r0,r0,_PAGE_HPTEFLAGS@l
-	andc	r30,r30,r0
-	ori	r30,r30,_PAGE_HASHPTE
-	/* Phyical address in r5 */
-	rldicl	r5,r31,64-PTE_RPN_SHIFT,PTE_RPN_SHIFT
-	sldi	r5,r5,PAGE_SHIFT
-
-	/* Calculate primary group hash */
-	and	r0,r28,r27
-	rldicr	r3,r0,3,63-3	/* r0 = (hash & mask) << 3 */
-
-	/* Call ppc_md.hpte_insert */
-	ld	r6,STK_PARAM(R4)(r1)	/* Retrieve new pp bits */
-	mr	r4,r29			/* Retrieve vpn */
-	li	r7,0			/* !bolted, !secondary */
-	li	r8,MMU_PAGE_64K
-	li	r9,MMU_PAGE_64K		/* actual page size */
-	ld	r10,STK_PARAM(R9)(r1)	/* segment size */
-.globl ht64_call_hpte_insert1
-ht64_call_hpte_insert1:
-	bl	.			/* patched by htab_finish_init() */
-	cmpdi	0,r3,0
-	bge	ht64_pte_insert_ok	/* Insertion successful */
-	cmpdi	0,r3,-2			/* Critical failure */
-	beq-	ht64_pte_insert_failure
-
-	/* Now try secondary slot */
-
-	/* Phyical address in r5 */
-	rldicl	r5,r31,64-PTE_RPN_SHIFT,PTE_RPN_SHIFT
-	sldi	r5,r5,PAGE_SHIFT
-
-	/* Calculate secondary group hash */
-	andc	r0,r27,r28
-	rldicr	r3,r0,3,63-3	/* r0 = (~hash & mask) << 3 */
-
-	/* Call ppc_md.hpte_insert */
-	ld	r6,STK_PARAM(R4)(r1)	/* Retrieve new pp bits */
-	mr	r4,r29			/* Retrieve vpn */
-	li	r7,HPTE_V_SECONDARY	/* !bolted, secondary */
-	li	r8,MMU_PAGE_64K
-	li	r9,MMU_PAGE_64K		/* actual page size */
-	ld	r10,STK_PARAM(R9)(r1)	/* segment size */
-.globl ht64_call_hpte_insert2
-ht64_call_hpte_insert2:
-	bl	.			/* patched by htab_finish_init() */
-	cmpdi	0,r3,0
-	bge+	ht64_pte_insert_ok	/* Insertion successful */
-	cmpdi	0,r3,-2			/* Critical failure */
-	beq-	ht64_pte_insert_failure
-
-	/* Both are full, we need to evict something */
-	mftb	r0
-	/* Pick a random group based on TB */
-	andi.	r0,r0,1
-	mr	r5,r28
-	bne	2f
-	not	r5,r5
-2:	and	r0,r5,r27
-	rldicr	r3,r0,3,63-3	/* r0 = (hash & mask) << 3 */
-	/* Call ppc_md.hpte_remove */
-.globl ht64_call_hpte_remove
-ht64_call_hpte_remove:
-	bl	.			/* patched by htab_finish_init() */
-
-	/* Try all again */
-	b	ht64_insert_pte
-
-ht64_bail_ok:
-	li	r3,0
-	b	ht64_bail
-
-ht64_pte_insert_ok:
-	/* Insert slot number & secondary bit in PTE */
-	rldimi	r30,r3,12,63-15
-
-	/* Write out the PTE with a normal write
-	 * (maybe add eieio may be good still ?)
-	 */
-ht64_write_out_pte:
-	ld	r6,STK_PARAM(R6)(r1)
-	std	r30,0(r6)
-	li	r3, 0
-ht64_bail:
-	ld	r27,STK_REG(R27)(r1)
-	ld	r28,STK_REG(R28)(r1)
-	ld	r29,STK_REG(R29)(r1)
-	ld      r30,STK_REG(R30)(r1)
-	ld      r31,STK_REG(R31)(r1)
-	addi    r1,r1,STACKFRAMESIZE
-	ld      r0,16(r1)
-	mtlr    r0
-	blr
-
-ht64_modify_pte:
-	/* Keep PP bits in r4 and slot idx from the PTE around in r3 */
-	mr	r4,r3
-	rlwinm	r3,r31,32-12,29,31
-
-	/* Secondary group ? if yes, get a inverted hash value */
-	mr	r5,r28
-	andi.	r0,r31,_PAGE_F_SECOND
-	beq	1f
-	not	r5,r5
-1:
-	/* Calculate proper slot value for ppc_md.hpte_updatepp */
-	and	r0,r5,r27
-	rldicr	r0,r0,3,63-3	/* r0 = (hash & mask) << 3 */
-	add	r3,r0,r3	/* add slot idx */
-
-	/* Call ppc_md.hpte_updatepp */
-	mr	r5,r29			/* vpn */
-	li	r6,MMU_PAGE_64K		/* base page size */
-	li	r7,MMU_PAGE_64K		/* actual page size */
-	ld	r8,STK_PARAM(R9)(r1)	/* segment size */
-	ld	r9,STK_PARAM(R8)(r1)	/* get "flags" param */
-.globl ht64_call_hpte_updatepp
-ht64_call_hpte_updatepp:
-	bl	.			/* patched by htab_finish_init() */
-
-	/* if we failed because typically the HPTE wasn't really here
-	 * we try an insertion.
-	 */
-	cmpdi	0,r3,-1
-	beq-	ht64_insert_pte
-
-	/* Clear the BUSY bit and Write out the PTE */
-	li	r0,_PAGE_BUSY
-	andc	r30,r30,r0
-	b	ht64_write_out_pte
-
-ht64_wrong_access:
-	/* Bail out clearing reservation */
-	stdcx.	r31,0,r6
-	li	r3,1
-	b	ht64_bail
-
-ht64_pte_insert_failure:
-	/* Bail out restoring old PTE */
-	ld	r6,STK_PARAM(R6)(r1)
-	std	r31,0(r6)
-	li	r3,-1
-	b	ht64_bail
-
-
-#endif /* CONFIG_PPC_64K_PAGES */
+#endif
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 30b7648e687a..fb6e15c607a6 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -633,28 +633,11 @@ extern u32 htab_call_hpte_insert1[];
 extern u32 htab_call_hpte_insert2[];
 extern u32 htab_call_hpte_remove[];
 extern u32 htab_call_hpte_updatepp[];
-extern u32 ht64_call_hpte_insert1[];
-extern u32 ht64_call_hpte_insert2[];
-extern u32 ht64_call_hpte_remove[];
-extern u32 ht64_call_hpte_updatepp[];
 
 static void __init htab_finish_init(void)
 {
-#ifdef CONFIG_PPC_64K_PAGES
-	patch_branch(ht64_call_hpte_insert1,
-		ppc_function_entry(ppc_md.hpte_insert),
-		BRANCH_SET_LINK);
-	patch_branch(ht64_call_hpte_insert2,
-		ppc_function_entry(ppc_md.hpte_insert),
-		BRANCH_SET_LINK);
-	patch_branch(ht64_call_hpte_remove,
-		ppc_function_entry(ppc_md.hpte_remove),
-		BRANCH_SET_LINK);
-	patch_branch(ht64_call_hpte_updatepp,
-		ppc_function_entry(ppc_md.hpte_updatepp),
-		BRANCH_SET_LINK);
-#else /* !CONFIG_PPC_64K_PAGES */
 
+#ifdef CONFIG_PPC_4K_PAGES
 	patch_branch(htab_call_hpte_insert1,
 		ppc_function_entry(ppc_md.hpte_insert),
 		BRANCH_SET_LINK);

From a43c0eb8364c022725df586e91dd753633374d66 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 1 Dec 2015 09:06:49 +0530
Subject: [PATCH 053/149] powerpc/mm: Convert 4k insert from asm to C

This is similar to the 64K insert. Maybe we want to consolidate the two.

Acked-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/Makefile        |   6 +-
 arch/powerpc/mm/hash64_4k.c     | 139 ++++++++++++++
 arch/powerpc/mm/hash_low_64.S   | 331 --------------------------------
 arch/powerpc/mm/hash_utils_64.c |  26 ---
 4 files changed, 142 insertions(+), 360 deletions(-)
 create mode 100644 arch/powerpc/mm/hash64_4k.c
 delete mode 100644 arch/powerpc/mm/hash_low_64.S

diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index f80ad1a76cc8..1ffeda85c086 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -14,11 +14,11 @@ obj-$(CONFIG_PPC_MMU_NOHASH)	+= mmu_context_nohash.o tlb_nohash.o \
 obj-$(CONFIG_PPC_BOOK3E)	+= tlb_low_$(CONFIG_WORD_SIZE)e.o
 hash64-$(CONFIG_PPC_NATIVE)	:= hash_native_64.o
 obj-$(CONFIG_PPC_STD_MMU_64)	+= hash_utils_64.o slb_low.o slb.o $(hash64-y)
-obj-$(CONFIG_PPC_STD_MMU_32)	+= ppc_mmu_32.o
-obj-$(CONFIG_PPC_STD_MMU)	+= hash_low_$(CONFIG_WORD_SIZE).o \
-				   tlb_hash$(CONFIG_WORD_SIZE).o \
+obj-$(CONFIG_PPC_STD_MMU_32)	+= ppc_mmu_32.o hash_low_32.o
+obj-$(CONFIG_PPC_STD_MMU)	+= tlb_hash$(CONFIG_WORD_SIZE).o \
 				   mmu_context_hash$(CONFIG_WORD_SIZE).o
 ifeq ($(CONFIG_PPC_STD_MMU_64),y)
+obj-$(CONFIG_PPC_4K_PAGES)	+= hash64_4k.o
 obj-$(CONFIG_PPC_64K_PAGES)	+= hash64_64k.o
 endif
 obj-$(CONFIG_PPC_ICSWX)		+= icswx.o
diff --git a/arch/powerpc/mm/hash64_4k.c b/arch/powerpc/mm/hash64_4k.c
new file mode 100644
index 000000000000..3b49c6f18741
--- /dev/null
+++ b/arch/powerpc/mm/hash64_4k.c
@@ -0,0 +1,139 @@
+/*
+ * Copyright IBM Corporation, 2015
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+#include <linux/mm.h>
+#include <asm/machdep.h>
+#include <asm/mmu.h>
+
+int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
+		   pte_t *ptep, unsigned long trap, unsigned long flags,
+		   int ssize, int subpg_prot)
+{
+	unsigned long hpte_group;
+	unsigned long rflags, pa;
+	unsigned long old_pte, new_pte;
+	unsigned long vpn, hash, slot;
+	unsigned long shift = mmu_psize_defs[MMU_PAGE_4K].shift;
+
+	/*
+	 * atomically mark the linux PTE busy/accessed/hashed, dirty on write
+	 */
+	do {
+		pte_t pte = READ_ONCE(*ptep);
+
+		old_pte = pte_val(pte);
+		/* If PTE busy, retry the access */
+		if (unlikely(old_pte & _PAGE_BUSY))
+			return 0;
+		/* If PTE permissions don't match, take page fault */
+		if (unlikely(access & ~old_pte))
+			return 1;
+		/*
+		 * Try to lock the PTE: set BUSY, ACCESSED and HASHPTE, plus
+		 * DIRTY if it was a write access. This is a 4K insert of a
+		 * 4K page, so _PAGE_COMBO is not added.
+		 */
+		new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED | _PAGE_HASHPTE;
+		if (access & _PAGE_RW)
+			new_pte |= _PAGE_DIRTY;
+	} while (old_pte != __cmpxchg_u64((unsigned long *)ptep,
+					  old_pte, new_pte));
+	/*
+	 * PP bits. _PAGE_USER is already PP bit 0x2, so we only
+	 * need to add in 0x1 if it's a read-only user page
+	 */
+	rflags = new_pte & _PAGE_USER;
+	if ((new_pte & _PAGE_USER) && !((new_pte & _PAGE_RW) &&
+					(new_pte & _PAGE_DIRTY)))
+		rflags |= 0x1;
+	/*
+	 * _PAGE_EXEC -> HW_NO_EXEC since it's inverted
+	 */
+	rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
+	/*
+	 * Always add C and Memory coherence bit
+	 */
+	rflags |= HPTE_R_C | HPTE_R_M;
+	/*
+	 * Add in WIMG bits
+	 */
+	rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
+				_PAGE_COHERENT | _PAGE_GUARDED));
+
+	if (!cpu_has_feature(CPU_FTR_NOEXECUTE) &&
+	    !cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
+		rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
+
+	vpn  = hpt_vpn(ea, vsid, ssize);
+	if (unlikely(old_pte & _PAGE_HASHPTE)) {
+		/*
+		 * There MIGHT be an HPTE for this pte
+		 */
+		hash = hpt_hash(vpn, shift, ssize);
+		if (old_pte & _PAGE_F_SECOND)
+			hash = ~hash;
+		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+		slot += (old_pte & _PAGE_F_GIX) >> _PAGE_F_GIX_SHIFT;
+
+		if (ppc_md.hpte_updatepp(slot, rflags, vpn, MMU_PAGE_4K,
+					 MMU_PAGE_4K, ssize, flags) == -1)
+			old_pte &= ~_PAGE_HPTEFLAGS;
+	}
+
+	if (likely(!(old_pte & _PAGE_HASHPTE))) {
+
+		pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
+		hash = hpt_hash(vpn, shift, ssize);
+
+repeat:
+		hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
+
+		/* Insert into the hash table, primary slot */
+		slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, 0,
+				  MMU_PAGE_4K, MMU_PAGE_4K, ssize);
+		/*
+		 * Primary is full, try the secondary
+		 */
+		if (unlikely(slot == -1)) {
+			hpte_group = ((~hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
+			slot = ppc_md.hpte_insert(hpte_group, vpn, pa,
+						  rflags, HPTE_V_SECONDARY,
+						  MMU_PAGE_4K, MMU_PAGE_4K, ssize);
+			if (slot == -1) {
+				if (mftb() & 0x1)
+					hpte_group = ((hash & htab_hash_mask) *
+						      HPTES_PER_GROUP) & ~0x7UL;
+				ppc_md.hpte_remove(hpte_group);
+				/*
+				 * FIXME!! Should we try the group from which we removed ?
+				 */
+				goto repeat;
+			}
+		}
+		/*
+		 * Hypervisor failure. Restore old pte and return -1
+		 * similar to __hash_page_*
+		 */
+		if (unlikely(slot == -2)) {
+			*ptep = __pte(old_pte);
+			hash_failure_debug(ea, access, vsid, trap, ssize,
+					   MMU_PAGE_4K, MMU_PAGE_4K, old_pte);
+			return -1;
+		}
+		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
+		new_pte |= (slot << _PAGE_F_GIX_SHIFT) & (_PAGE_F_SECOND | _PAGE_F_GIX);
+	}
+	*ptep = __pte(new_pte & ~_PAGE_BUSY);
+	return 0;
+}
diff --git a/arch/powerpc/mm/hash_low_64.S b/arch/powerpc/mm/hash_low_64.S
deleted file mode 100644
index f7d49cf0ccb7..000000000000
--- a/arch/powerpc/mm/hash_low_64.S
+++ /dev/null
@@ -1,331 +0,0 @@
-/*
- * ppc64 MMU hashtable management routines
- *
- * (c) Copyright IBM Corp. 2003, 2005
- *
- * Maintained by: Benjamin Herrenschmidt
- *                <benh@kernel.crashing.org>
- *
- * This file is covered by the GNU Public Licence v2 as
- * described in the kernel's COPYING file.
- */
-
-#include <asm/reg.h>
-#include <asm/pgtable.h>
-#include <asm/mmu.h>
-#include <asm/page.h>
-#include <asm/types.h>
-#include <asm/ppc_asm.h>
-#include <asm/asm-offsets.h>
-#include <asm/cputable.h>
-
-	.text
-
-/*
- * Stackframe:
- *		
- *         +-> Back chain			(SP + 256)
- *         |   General register save area	(SP + 112)
- *         |   Parameter save area		(SP + 48)
- *         |   TOC save area			(SP + 40)
- *         |   link editor doubleword		(SP + 32)
- *         |   compiler doubleword		(SP + 24)
- *         |   LR save area			(SP + 16)
- *         |   CR save area			(SP + 8)
- * SP ---> +-- Back chain			(SP + 0)
- */
-
-#ifndef CONFIG_PPC_64K_PAGES
-
-/*****************************************************************************
- *                                                                           *
- *           4K SW & 4K HW pages implementation                              *
- *                                                                           *
- *****************************************************************************/
-
-
-/*
- * _hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
- *		 pte_t *ptep, unsigned long trap, unsigned long flags,
- *		 int ssize)
- *
- * Adds a 4K page to the hash table in a segment of 4K pages only
- */
-
-_GLOBAL(__hash_page_4K)
-	mflr	r0
-	std	r0,16(r1)
-	stdu	r1,-STACKFRAMESIZE(r1)
-	/* Save all params that we need after a function call */
-	std	r6,STK_PARAM(R6)(r1)
-	std	r8,STK_PARAM(R8)(r1)
-	std	r9,STK_PARAM(R9)(r1)
-	
-	/* Save non-volatile registers.
-	 * r31 will hold "old PTE"
-	 * r30 is "new PTE"
-	 * r29 is vpn
-	 * r28 is a hash value
-	 * r27 is hashtab mask (maybe dynamic patched instead ?)
-	 */
-	std	r27,STK_REG(R27)(r1)
-	std	r28,STK_REG(R28)(r1)
-	std	r29,STK_REG(R29)(r1)
-	std	r30,STK_REG(R30)(r1)
-	std	r31,STK_REG(R31)(r1)
-	
-	/* Step 1:
-	 *
-	 * Check permissions, atomically mark the linux PTE busy
-	 * and hashed.
-	 */ 
-1:
-	ldarx	r31,0,r6
-	/* Check access rights (access & ~(pte_val(*ptep))) */
-	andc.	r0,r4,r31
-	bne-	htab_wrong_access
-	/* Check if PTE is busy */
-	andi.	r0,r31,_PAGE_BUSY
-	/* If so, just bail out and refault if needed. Someone else
-	 * is changing this PTE anyway and might hash it.
-	 */
-	bne-	htab_bail_ok
-
-	/* Prepare new PTE value (turn access RW into DIRTY, then
-	 * add BUSY,HASHPTE and ACCESSED)
-	 */
-	rlwinm	r30,r4,32-9+7,31-7,31-7	/* _PAGE_RW -> _PAGE_DIRTY */
-	or	r30,r30,r31
-	ori	r30,r30,_PAGE_BUSY | _PAGE_ACCESSED | _PAGE_HASHPTE
-	/* Write the linux PTE atomically (setting busy) */
-	stdcx.	r30,0,r6
-	bne-	1b
-	isync
-
-	/* Step 2:
-	 *
-	 * Insert/Update the HPTE in the hash table. At this point,
-	 * r4 (access) is re-useable, we use it for the new HPTE flags
-	 */
-
-BEGIN_FTR_SECTION
-	cmpdi	r9,0			/* check segment size */
-	bne	3f
-END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
-	/* Calc vpn and put it in r29 */
-	sldi	r29,r5,SID_SHIFT - VPN_SHIFT
-	rldicl  r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT - VPN_SHIFT)
-	or	r29,r28,r29
-	/*
-	 * Calculate hash value for primary slot and store it in r28
-	 * r3 = va, r5 = vsid
-	 * r0 = (va >> 12) & ((1ul << (28 - 12)) -1)
-	 */
-	rldicl	r0,r3,64-12,48
-	xor	r28,r5,r0		/* hash */
-	b	4f
-
-3:	/* Calc vpn and put it in r29 */
-	sldi	r29,r5,SID_SHIFT_1T - VPN_SHIFT
-	rldicl  r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT_1T - VPN_SHIFT)
-	or	r29,r28,r29
-
-	/*
-	 * calculate hash value for primary slot and
-	 * store it in r28 for 1T segment
-	 * r3 = va, r5 = vsid
-	 */
-	sldi	r28,r5,25		/* vsid << 25 */
-	/* r0 =  (va >> 12) & ((1ul << (40 - 12)) -1) */
-	rldicl	r0,r3,64-12,36
-	xor	r28,r28,r5		/* vsid ^ ( vsid << 25) */
-	xor	r28,r28,r0		/* hash */
-
-	/* Convert linux PTE bits into HW equivalents */
-4:	andi.	r3,r30,0x1fe		/* Get basic set of flags */
-	xori	r3,r3,HPTE_R_N		/* _PAGE_EXEC -> NOEXEC */
-	rlwinm	r0,r30,32-9+1,30,30	/* _PAGE_RW -> _PAGE_USER (r0) */
-	rlwinm	r4,r30,32-7+1,30,30	/* _PAGE_DIRTY -> _PAGE_USER (r4) */
-	and	r0,r0,r4		/* _PAGE_RW & _PAGE_DIRTY ->r0 bit 30*/
-	andc	r0,r30,r0		/* r0 = pte & ~r0 */
-	rlwimi	r3,r0,32-1,31,31	/* Insert result into PP lsb */
-	/*
-	 * Always add "C" bit for perf. Memory coherence is always enabled
-	 */
-	ori	r3,r3,HPTE_R_C | HPTE_R_M
-
-	/* We eventually do the icache sync here (maybe inline that
-	 * code rather than call a C function...) 
-	 */
-BEGIN_FTR_SECTION
-	mr	r4,r30
-	mr	r5,r7
-	bl	hash_page_do_lazy_icache
-END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE)
-
-	/* At this point, r3 contains new PP bits, save them in
-	 * place of "access" in the param area (sic)
-	 */
-	std	r3,STK_PARAM(R4)(r1)
-
-	/* Get htab_hash_mask */
-	ld	r4,htab_hash_mask@got(2)
-	ld	r27,0(r4)	/* htab_hash_mask -> r27 */
-
-	/* Check if we may already be in the hashtable, in this case, we
-	 * go to out-of-line code to try to modify the HPTE
-	 */
-	andi.	r0,r31,_PAGE_HASHPTE
-	bne	htab_modify_pte
-
-htab_insert_pte:
-	/* Clear hpte bits in new pte (we also clear BUSY btw) and
-	 * add _PAGE_HASHPTE
-	 */
-	lis	r0,_PAGE_HPTEFLAGS@h
-	ori	r0,r0,_PAGE_HPTEFLAGS@l
-	andc	r30,r30,r0
-	ori	r30,r30,_PAGE_HASHPTE
-
-	/* physical address r5 */
-	rldicl	r5,r31,64-PTE_RPN_SHIFT,PTE_RPN_SHIFT
-	sldi	r5,r5,PAGE_SHIFT
-
-	/* Calculate primary group hash */
-	and	r0,r28,r27
-	rldicr	r3,r0,3,63-3		/* r3 = (hash & mask) << 3 */
-
-	/* Call ppc_md.hpte_insert */
-	ld	r6,STK_PARAM(R4)(r1)	/* Retrieve new pp bits */
-	mr	r4,r29			/* Retrieve vpn */
-	li	r7,0			/* !bolted, !secondary */
-	li	r8,MMU_PAGE_4K		/* page size */
-	li	r9,MMU_PAGE_4K		/* actual page size */
-	ld	r10,STK_PARAM(R9)(r1)	/* segment size */
-.globl htab_call_hpte_insert1
-htab_call_hpte_insert1:
-	bl	.			/* Patched by htab_finish_init() */
-	cmpdi	0,r3,0
-	bge	htab_pte_insert_ok	/* Insertion successful */
-	cmpdi	0,r3,-2			/* Critical failure */
-	beq-	htab_pte_insert_failure
-
-	/* Now try secondary slot */
-	
-	/* physical address r5 */
-	rldicl	r5,r31,64-PTE_RPN_SHIFT,PTE_RPN_SHIFT
-	sldi	r5,r5,PAGE_SHIFT
-
-	/* Calculate secondary group hash */
-	andc	r0,r27,r28
-	rldicr	r3,r0,3,63-3	/* r0 = (~hash & mask) << 3 */
-	
-	/* Call ppc_md.hpte_insert */
-	ld	r6,STK_PARAM(R4)(r1)	/* Retrieve new pp bits */
-	mr	r4,r29			/* Retrieve vpn */
-	li	r7,HPTE_V_SECONDARY	/* !bolted, secondary */
-	li	r8,MMU_PAGE_4K		/* page size */
-	li	r9,MMU_PAGE_4K		/* actual page size */
-	ld	r10,STK_PARAM(R9)(r1)	/* segment size */
-.globl htab_call_hpte_insert2
-htab_call_hpte_insert2:
-	bl	.			/* Patched by htab_finish_init() */
-	cmpdi	0,r3,0
-	bge+	htab_pte_insert_ok	/* Insertion successful */
-	cmpdi	0,r3,-2			/* Critical failure */
-	beq-	htab_pte_insert_failure
-
-	/* Both are full, we need to evict something */
-	mftb	r0
-	/* Pick a random group based on TB */
-	andi.	r0,r0,1
-	mr	r5,r28
-	bne	2f
-	not	r5,r5
-2:	and	r0,r5,r27
-	rldicr	r3,r0,3,63-3	/* r0 = (hash & mask) << 3 */	
-	/* Call ppc_md.hpte_remove */
-.globl htab_call_hpte_remove
-htab_call_hpte_remove:
-	bl	.			/* Patched by htab_finish_init() */
-
-	/* Try all again */
-	b	htab_insert_pte	
-
-htab_bail_ok:
-	li	r3,0
-	b	htab_bail
-
-htab_pte_insert_ok:
-	/* Insert slot number & secondary bit in PTE */
-	rldimi	r30,r3,12,63-15
-		
-	/* Write out the PTE with a normal write
-	 * (maybe add eieio may be good still ?)
-	 */
-htab_write_out_pte:
-	ld	r6,STK_PARAM(R6)(r1)
-	std	r30,0(r6)
-	li	r3, 0
-htab_bail:
-	ld	r27,STK_REG(R27)(r1)
-	ld	r28,STK_REG(R28)(r1)
-	ld	r29,STK_REG(R29)(r1)
-	ld      r30,STK_REG(R30)(r1)
-	ld      r31,STK_REG(R31)(r1)
-	addi    r1,r1,STACKFRAMESIZE
-	ld      r0,16(r1)
-	mtlr    r0
-	blr
-
-htab_modify_pte:
-	/* Keep PP bits in r4 and slot idx from the PTE around in r3 */
-	mr	r4,r3
-	rlwinm	r3,r31,32-12,29,31
-
-	/* Secondary group ? if yes, get a inverted hash value */
-	mr	r5,r28
-	andi.	r0,r31,_PAGE_F_SECOND
-	beq	1f
-	not	r5,r5
-1:
-	/* Calculate proper slot value for ppc_md.hpte_updatepp */
-	and	r0,r5,r27
-	rldicr	r0,r0,3,63-3	/* r0 = (hash & mask) << 3 */
-	add	r3,r0,r3	/* add slot idx */
-
-	/* Call ppc_md.hpte_updatepp */
-	mr	r5,r29			/* vpn */
-	li	r6,MMU_PAGE_4K		/* base page size */
-	li	r7,MMU_PAGE_4K		/* actual page size */
-	ld	r8,STK_PARAM(R9)(r1)	/* segment size */
-	ld	r9,STK_PARAM(R8)(r1)	/* get "flags" param */
-.globl htab_call_hpte_updatepp
-htab_call_hpte_updatepp:
-	bl	.			/* Patched by htab_finish_init() */
-
-	/* if we failed because typically the HPTE wasn't really here
-	 * we try an insertion. 
-	 */
-	cmpdi	0,r3,-1
-	beq-	htab_insert_pte
-
-	/* Clear the BUSY bit and Write out the PTE */
-	li	r0,_PAGE_BUSY
-	andc	r30,r30,r0
-	b	htab_write_out_pte
-
-htab_wrong_access:
-	/* Bail out clearing reservation */
-	stdcx.	r31,0,r6
-	li	r3,1
-	b	htab_bail
-
-htab_pte_insert_failure:
-	/* Bail out restoring old PTE */
-	ld	r6,STK_PARAM(R6)(r1)
-	std	r31,0(r6)
-	li	r3,-1
-	b	htab_bail
-
-#endif
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index fb6e15c607a6..cab2deb8d20b 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -629,31 +629,6 @@ int remove_section_mapping(unsigned long start, unsigned long end)
 }
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
-extern u32 htab_call_hpte_insert1[];
-extern u32 htab_call_hpte_insert2[];
-extern u32 htab_call_hpte_remove[];
-extern u32 htab_call_hpte_updatepp[];
-
-static void __init htab_finish_init(void)
-{
-
-#ifdef CONFIG_PPC_4K_PAGES
-	patch_branch(htab_call_hpte_insert1,
-		ppc_function_entry(ppc_md.hpte_insert),
-		BRANCH_SET_LINK);
-	patch_branch(htab_call_hpte_insert2,
-		ppc_function_entry(ppc_md.hpte_insert),
-		BRANCH_SET_LINK);
-	patch_branch(htab_call_hpte_remove,
-		ppc_function_entry(ppc_md.hpte_remove),
-		BRANCH_SET_LINK);
-	patch_branch(htab_call_hpte_updatepp,
-		ppc_function_entry(ppc_md.hpte_updatepp),
-		BRANCH_SET_LINK);
-#endif
-
-}
-
 static void __init htab_initialize(void)
 {
 	unsigned long table;
@@ -800,7 +775,6 @@ static void __init htab_initialize(void)
 					 mmu_linear_psize, mmu_kernel_ssize));
 	}
 
-	htab_finish_init();
 
 	DBG(" <- htab_initialize()\n");
 }

From c6a3c495f05a070d4c4016d4a51c384cba723971 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 1 Dec 2015 09:06:50 +0530
Subject: [PATCH 054/149] powerpc/mm: Add helper for converting pte bit to hpte
 bits

Instead of open coding it in multiple code paths, export the helper
and add more documentation. Also make sure we don't make assumptions
regarding pte bit positions.

Acked-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/64/hash.h |  1 +
 arch/powerpc/mm/hash64_4k.c               | 13 +--------
 arch/powerpc/mm/hash64_64k.c              | 35 ++---------------------
 arch/powerpc/mm/hash_utils_64.c           | 22 ++++++++------
 arch/powerpc/mm/hugepage-hash64.c         | 13 +--------
 arch/powerpc/mm/hugetlbpage-hash64.c      |  4 +--
 6 files changed, 21 insertions(+), 67 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h
index e4ea9d73a541..9c212449b2e8 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -236,6 +236,7 @@ extern unsigned long pmd_hugepage_update(struct mm_struct *mm,
 					 pmd_t *pmdp,
 					 unsigned long clr,
 					 unsigned long set);
+extern unsigned long htab_convert_pte_flags(unsigned long pteflags);
 /* Atomic PTE updates */
 static inline unsigned long pte_update(struct mm_struct *mm,
 				       unsigned long addr,
diff --git a/arch/powerpc/mm/hash64_4k.c b/arch/powerpc/mm/hash64_4k.c
index 3b49c6f18741..ee863137035a 100644
--- a/arch/powerpc/mm/hash64_4k.c
+++ b/arch/powerpc/mm/hash64_4k.c
@@ -53,18 +53,7 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
 	 * PP bits. _PAGE_USER is already PP bit 0x2, so we only
 	 * need to add in 0x1 if it's a read-only user page
 	 */
-	rflags = new_pte & _PAGE_USER;
-	if ((new_pte & _PAGE_USER) && !((new_pte & _PAGE_RW) &&
-					(new_pte & _PAGE_DIRTY)))
-		rflags |= 0x1;
-	/*
-	 * _PAGE_EXEC -> HW_NO_EXEC since it's inverted
-	 */
-	rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
-	/*
-	 * Always add C and Memory coherence bit
-	 */
-	rflags |= HPTE_R_C | HPTE_R_M;
+	rflags = htab_convert_pte_flags(new_pte);
 	/*
 	 * Add in WIMG bits
 	 */
diff --git a/arch/powerpc/mm/hash64_64k.c b/arch/powerpc/mm/hash64_64k.c
index fc0898eb309d..b14280e9d850 100644
--- a/arch/powerpc/mm/hash64_64k.c
+++ b/arch/powerpc/mm/hash64_64k.c
@@ -85,22 +85,7 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
 	 * Handle the subpage protection bits
 	 */
 	subpg_pte = new_pte & ~subpg_prot;
-	/*
-	 * PP bits. _PAGE_USER is already PP bit 0x2, so we only
-	 * need to add in 0x1 if it's a read-only user page
-	 */
-	rflags = subpg_pte & _PAGE_USER;
-	if ((subpg_pte & _PAGE_USER) && !((subpg_pte & _PAGE_RW) &&
-					(subpg_pte & _PAGE_DIRTY)))
-		rflags |= 0x1;
-	/*
-	 * _PAGE_EXEC -> HW_NO_EXEC since it's inverted
-	 */
-	rflags |= ((subpg_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
-	/*
-	 * Always add C and Memory coherence bit
-	 */
-	rflags |= HPTE_R_C | HPTE_R_M;
+	rflags = htab_convert_pte_flags(subpg_pte);
 	/*
 	 * Add in WIMG bits
 	 */
@@ -271,22 +256,8 @@ int __hash_page_64K(unsigned long ea, unsigned long access,
 			new_pte |= _PAGE_DIRTY;
 	} while (old_pte != __cmpxchg_u64((unsigned long *)ptep,
 					  old_pte, new_pte));
-	/*
-	 * PP bits. _PAGE_USER is already PP bit 0x2, so we only
-	 * need to add in 0x1 if it's a read-only user page
-	 */
-	rflags = new_pte & _PAGE_USER;
-	if ((new_pte & _PAGE_USER) && !((new_pte & _PAGE_RW) &&
-					(new_pte & _PAGE_DIRTY)))
-		rflags |= 0x1;
-	/*
-	 * _PAGE_EXEC -> HW_NO_EXEC since it's inverted
-	 */
-	rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
-	/*
-	 * Always add C and Memory coherence bit
-	 */
-	rflags |= HPTE_R_C | HPTE_R_M;
+
+	rflags = htab_convert_pte_flags(new_pte);
 	/*
 	 * Add in WIMG bits
 	 */
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index cab2deb8d20b..6c67bd0bec55 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -159,20 +159,26 @@ static struct mmu_psize_def mmu_psize_defaults_gp[] = {
 	},
 };
 
-static unsigned long htab_convert_pte_flags(unsigned long pteflags)
+unsigned long htab_convert_pte_flags(unsigned long pteflags)
 {
-	unsigned long rflags = pteflags & 0x1fa;
+	unsigned long rflags = 0;
 
 	/* _PAGE_EXEC -> NOEXEC */
 	if ((pteflags & _PAGE_EXEC) == 0)
 		rflags |= HPTE_R_N;
-
-	/* PP bits. PAGE_USER is already PP bit 0x2, so we only
-	 * need to add in 0x1 if it's a read-only user page
+	/*
+	 * PP bits:
+	 * Linux uses SLB key 0 for kernel and 1 for user.
+	 * Kernel areas are mapped with PP bits 00,
+	 * and there is no kernel RO (_PAGE_KERNEL_RO).
+	 * User areas are mapped with PP bits 0x2, and
+	 * read-only user areas with 0x3.
 	 */
-	if ((pteflags & _PAGE_USER) && !((pteflags & _PAGE_RW) &&
-					 (pteflags & _PAGE_DIRTY)))
-		rflags |= 1;
+	if (pteflags & _PAGE_USER) {
+		rflags |= 0x2;
+		if (!((pteflags & _PAGE_RW) && (pteflags & _PAGE_DIRTY)))
+			rflags |= 0x1;
+	}
 	/*
 	 * Always add "C" bit for perf. Memory coherence is always enabled
 	 */
diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c
index 4d87122cf6a7..91fcac6f989d 100644
--- a/arch/powerpc/mm/hugepage-hash64.c
+++ b/arch/powerpc/mm/hugepage-hash64.c
@@ -54,18 +54,7 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
 			new_pmd |= _PAGE_DIRTY;
 	} while (old_pmd != __cmpxchg_u64((unsigned long *)pmdp,
 					  old_pmd, new_pmd));
-	/*
-	 * PP bits. _PAGE_USER is already PP bit 0x2, so we only
-	 * need to add in 0x1 if it's a read-only user page
-	 */
-	rflags = new_pmd & _PAGE_USER;
-	if ((new_pmd & _PAGE_USER) && !((new_pmd & _PAGE_RW) &&
-					   (new_pmd & _PAGE_DIRTY)))
-		rflags |= 0x1;
-	/*
-	 * _PAGE_EXEC -> HW_NO_EXEC since it's inverted
-	 */
-	rflags |= ((new_pmd & _PAGE_EXEC) ? 0 : HPTE_R_N);
+	rflags = htab_convert_pte_flags(new_pmd);
 
 #if 0
 	if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) {
diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c
index 7584e8445512..304c8520506e 100644
--- a/arch/powerpc/mm/hugetlbpage-hash64.c
+++ b/arch/powerpc/mm/hugetlbpage-hash64.c
@@ -59,10 +59,8 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
 			new_pte |= _PAGE_DIRTY;
 	} while(old_pte != __cmpxchg_u64((unsigned long *)ptep,
 					 old_pte, new_pte));
+	rflags = htab_convert_pte_flags(new_pte);
 
-	rflags = 0x2 | (!(new_pte & _PAGE_RW));
-	/* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
-	rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
 	sz = ((1UL) << shift);
 	if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
 		/* No CPU has hugepages but lacks no execute, so we

From 40e8550afc19dfc588171c089cb3f31f7b9e16f7 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 1 Dec 2015 09:06:51 +0530
Subject: [PATCH 055/149] powerpc/mm: Move WIMG update to helper.

Only difference here is, we apply the WIMG mapping early, so rflags
passed to updatepp will also be changed.

Acked-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/hash64_4k.c          |  5 -----
 arch/powerpc/mm/hash64_64k.c         | 10 ----------
 arch/powerpc/mm/hash_utils_64.c      | 13 ++++++++++++-
 arch/powerpc/mm/hugepage-hash64.c    |  7 -------
 arch/powerpc/mm/hugetlbpage-hash64.c |  8 --------
 5 files changed, 12 insertions(+), 31 deletions(-)

diff --git a/arch/powerpc/mm/hash64_4k.c b/arch/powerpc/mm/hash64_4k.c
index ee863137035a..e7c04542ba62 100644
--- a/arch/powerpc/mm/hash64_4k.c
+++ b/arch/powerpc/mm/hash64_4k.c
@@ -54,11 +54,6 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
 	 * need to add in 0x1 if it's a read-only user page
 	 */
 	rflags = htab_convert_pte_flags(new_pte);
-	/*
-	 * Add in WIMG bits
-	 */
-	rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
-				_PAGE_COHERENT | _PAGE_GUARDED));
 
 	if (!cpu_has_feature(CPU_FTR_NOEXECUTE) &&
 	    !cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
diff --git a/arch/powerpc/mm/hash64_64k.c b/arch/powerpc/mm/hash64_64k.c
index b14280e9d850..0762c1e08c88 100644
--- a/arch/powerpc/mm/hash64_64k.c
+++ b/arch/powerpc/mm/hash64_64k.c
@@ -86,11 +86,6 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
 	 */
 	subpg_pte = new_pte & ~subpg_prot;
 	rflags = htab_convert_pte_flags(subpg_pte);
-	/*
-	 * Add in WIMG bits
-	 */
-	rflags |= (subpg_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
-				_PAGE_COHERENT | _PAGE_GUARDED));
 
 	if (!cpu_has_feature(CPU_FTR_NOEXECUTE) &&
 	    !cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) {
@@ -258,11 +253,6 @@ int __hash_page_64K(unsigned long ea, unsigned long access,
 					  old_pte, new_pte));
 
 	rflags = htab_convert_pte_flags(new_pte);
-	/*
-	 * Add in WIMG bits
-	 */
-	rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
-				_PAGE_COHERENT | _PAGE_GUARDED));
 
 	if (!cpu_has_feature(CPU_FTR_NOEXECUTE) &&
 	    !cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 6c67bd0bec55..4233dcccbaf7 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -182,7 +182,18 @@ unsigned long htab_convert_pte_flags(unsigned long pteflags)
 	/*
 	 * Always add "C" bit for perf. Memory coherence is always enabled
 	 */
-	return rflags | HPTE_R_C | HPTE_R_M;
+	rflags |=  HPTE_R_C | HPTE_R_M;
+	/*
+	 * Add in WIG bits
+	 */
+	if (pteflags & _PAGE_WRITETHRU)
+		rflags |= HPTE_R_W;
+	if (pteflags & _PAGE_NO_CACHE)
+		rflags |= HPTE_R_I;
+	if (pteflags & _PAGE_GUARDED)
+		rflags |= HPTE_R_G;
+
+	return rflags;
 }
 
 int htab_bolt_mapping(unsigned long vstart, unsigned long vend,
diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c
index 91fcac6f989d..1f666de0110a 100644
--- a/arch/powerpc/mm/hugepage-hash64.c
+++ b/arch/powerpc/mm/hugepage-hash64.c
@@ -120,13 +120,6 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
 		pa = pmd_pfn(__pmd(old_pmd)) << PAGE_SHIFT;
 		new_pmd |= _PAGE_HASHPTE;
 
-		/* Add in WIMG bits */
-		rflags |= (new_pmd & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
-				      _PAGE_GUARDED));
-		/*
-		 * enable the memory coherence always
-		 */
-		rflags |= HPTE_R_M;
 repeat:
 		hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
 
diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c
index 304c8520506e..0734e4daffef 100644
--- a/arch/powerpc/mm/hugetlbpage-hash64.c
+++ b/arch/powerpc/mm/hugetlbpage-hash64.c
@@ -91,14 +91,6 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
 		/* clear HPTE slot informations in new PTE */
 		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
 
-		/* Add in WIMG bits */
-		rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
-				      _PAGE_COHERENT | _PAGE_GUARDED));
-		/*
-		 * enable the memory coherence always
-		 */
-		rflags |= HPTE_R_M;
-
 		slot = hpte_insert_repeating(hash, vpn, pa, rflags, 0,
 					     mmu_psize, ssize);
 

From 26a344aea48c99cfd80d292a470a480e1c2bd5d9 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 1 Dec 2015 09:06:52 +0530
Subject: [PATCH 056/149] powerpc/mm: Move hugetlb related headers

W.r.t. hugetlb, we support two formats for the pmd. With book3s_64 and
64K linux page size, we can have a pte at the pmd level. Hence we
don't need to support hugepd there. For everything else hugepd
is supported and pmd_huge is (0).

Acked-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/64/hash-4k.h  | 31 ++++++++
 arch/powerpc/include/asm/book3s/64/hash-64k.h | 51 +++++++++++++
 arch/powerpc/include/asm/nohash/pgtable.h     | 25 +++++++
 arch/powerpc/include/asm/page.h               | 42 ++---------
 arch/powerpc/mm/hugetlbpage-hash64.c          | 18 +++++
 arch/powerpc/mm/hugetlbpage.c                 | 72 -------------------
 6 files changed, 129 insertions(+), 110 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h
index 75e8b9326e4b..b4d25529d179 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-4k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h
@@ -93,6 +93,37 @@ extern struct page *pgd_page(pgd_t pgd);
 #define remap_4k_pfn(vma, addr, pfn, prot)	\
 	remap_pfn_range((vma), (addr), (pfn), PAGE_SIZE, (prot))
 
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * For 4k page size, we support explicit hugepage via hugepd
+ */
+static inline int pmd_huge(pmd_t pmd)
+{
+	return 0;
+}
+
+static inline int pud_huge(pud_t pud)
+{
+	return 0;
+}
+
+static inline int pgd_huge(pgd_t pgd)
+{
+	return 0;
+}
+#define pgd_huge pgd_huge
+
+static inline int hugepd_ok(hugepd_t hpd)
+{
+	/*
+	 * hugepd pointer, bottom two bits == 00 and next 4 bits
+	 * indicate size of table
+	 */
+	return (((hpd.pd & 0x3) == 0x0) && ((hpd.pd & HUGEPD_SHIFT_MASK) != 0));
+}
+#define is_hugepd(hpd)		(hugepd_ok(hpd))
+#endif
+
 #endif /* !__ASSEMBLY__ */
 
 #endif /* _ASM_POWERPC_BOOK3S_64_HASH_4K_H */
diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h
index 46b5d0ab11de..1857d19de18e 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
@@ -119,6 +119,57 @@ extern bool __rpte_sub_valid(real_pte_t rpte, unsigned long index);
 #define pgd_pte(pgd)	(pud_pte(((pud_t){ pgd })))
 #define pte_pgd(pte)	((pgd_t)pte_pud(pte))
 
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * We have PGD_INDEX_SIZE = 12 and PTE_INDEX_SIZE = 8, so that we can have
+ * 16GB hugepage pte in PGD and 16MB hugepage pte at PMD;
+ *
+ * Defined in such a way that we can optimize away code block at build time
+ * if CONFIG_HUGETLB_PAGE=n.
+ */
+static inline int pmd_huge(pmd_t pmd)
+{
+	/*
+	 * leaf pte for huge page, bottom two bits != 00
+	 */
+	return ((pmd_val(pmd) & 0x3) != 0x0);
+}
+
+static inline int pud_huge(pud_t pud)
+{
+	/*
+	 * leaf pte for huge page, bottom two bits != 00
+	 */
+	return ((pud_val(pud) & 0x3) != 0x0);
+}
+
+static inline int pgd_huge(pgd_t pgd)
+{
+	/*
+	 * leaf pte for huge page, bottom two bits != 00
+	 */
+	return ((pgd_val(pgd) & 0x3) != 0x0);
+}
+#define pgd_huge pgd_huge
+
+#ifdef CONFIG_DEBUG_VM
+extern int hugepd_ok(hugepd_t hpd);
+#define is_hugepd(hpd)               (hugepd_ok(hpd))
+#else
+/*
+ * With 64k page size, we have hugepage ptes in the pgd and pmd entries. We don't
+ * need to setup hugepage directory for them. Our pte and page directory format
+ * enable us to have this enabled.
+ */
+static inline int hugepd_ok(hugepd_t hpd)
+{
+	return 0;
+}
+#define is_hugepd(pdep)			0
+#endif /* CONFIG_DEBUG_VM */
+
+#endif /* CONFIG_HUGETLB_PAGE */
+
 #endif	/* __ASSEMBLY__ */
 
 #endif /* _ASM_POWERPC_BOOK3S_64_HASH_64K_H */
diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h
index c0c41a2409d2..1263c22d60d8 100644
--- a/arch/powerpc/include/asm/nohash/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/pgtable.h
@@ -223,5 +223,30 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
 				     unsigned long size, pgprot_t vma_prot);
 #define __HAVE_PHYS_MEM_ACCESS_PROT
 
+#ifdef CONFIG_HUGETLB_PAGE
+static inline int hugepd_ok(hugepd_t hpd)
+{
+	return (hpd.pd > 0);
+}
+
+static inline int pmd_huge(pmd_t pmd)
+{
+	return 0;
+}
+
+static inline int pud_huge(pud_t pud)
+{
+	return 0;
+}
+
+static inline int pgd_huge(pgd_t pgd)
+{
+	return 0;
+}
+#define pgd_huge		pgd_huge
+
+#define is_hugepd(hpd)		(hugepd_ok(hpd))
+#endif
+
 #endif /* __ASSEMBLY__ */
 #endif
diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
index 5a3e7c643d73..e34124f6fbf2 100644
--- a/arch/powerpc/include/asm/page.h
+++ b/arch/powerpc/include/asm/page.h
@@ -387,45 +387,11 @@ typedef unsigned long pgprot_t;
 
 typedef struct { signed long pd; } hugepd_t;
 
-#ifdef CONFIG_HUGETLB_PAGE
-#ifdef CONFIG_PPC_BOOK3S_64
-#ifdef CONFIG_PPC_64K_PAGES
-/*
- * With 64k page size, we have hugepage ptes in the pgd and pmd entries. We don't
- * need to setup hugepage directory for them. Our pte and page directory format
- * enable us to have this enabled. But to avoid errors when implementing new
- * features disable hugepd for 64K. We enable a debug version here, So we catch
- * wrong usage.
- */
-#ifdef CONFIG_DEBUG_VM
-extern int hugepd_ok(hugepd_t hpd);
-#else
-#define hugepd_ok(x)	(0)
-#endif
-#else
-static inline int hugepd_ok(hugepd_t hpd)
-{
-	/*
-	 * hugepd pointer, bottom two bits == 00 and next 4 bits
-	 * indicate size of table
-	 */
-	return (((hpd.pd & 0x3) == 0x0) && ((hpd.pd & HUGEPD_SHIFT_MASK) != 0));
-}
-#endif
-#else
-static inline int hugepd_ok(hugepd_t hpd)
-{
-	return (hpd.pd > 0);
-}
-#endif
-
-#define is_hugepd(hpd)               (hugepd_ok(hpd))
-#define pgd_huge pgd_huge
-int pgd_huge(pgd_t pgd);
-#else /* CONFIG_HUGETLB_PAGE */
-#define is_hugepd(pdep)			0
-#define pgd_huge(pgd)			0
+#ifndef CONFIG_HUGETLB_PAGE
+#define is_hugepd(pdep)		(0)
+#define pgd_huge(pgd)		(0)
 #endif /* CONFIG_HUGETLB_PAGE */
+
 #define __hugepd(x) ((hugepd_t) { (x) })
 
 struct page;
diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c
index 0734e4daffef..e2138c7ae70f 100644
--- a/arch/powerpc/mm/hugetlbpage-hash64.c
+++ b/arch/powerpc/mm/hugetlbpage-hash64.c
@@ -114,3 +114,21 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
 	*ptep = __pte(new_pte & ~_PAGE_BUSY);
 	return 0;
 }
+
+#if defined(CONFIG_PPC_64K_PAGES) && defined(CONFIG_DEBUG_VM)
+/*
+ * This enables us to catch the wrong page directory format
+ * Moved here so that we can use WARN() in the call.
+ */
+int hugepd_ok(hugepd_t hpd)
+{
+	bool is_hugepd;
+
+	/*
+	 * We should not find this format in page directory, warn otherwise.
+	 */
+	is_hugepd = (((hpd.pd & 0x3) == 0x0) && ((hpd.pd & HUGEPD_SHIFT_MASK) != 0));
+	WARN(is_hugepd, "Found wrong page directory format\n");
+	return 0;
+}
+#endif
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 9833fee493ec..bc72e542a83e 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -53,78 +53,6 @@ static unsigned nr_gpages;
 
 #define hugepd_none(hpd)	((hpd).pd == 0)
 
-#ifdef CONFIG_PPC_BOOK3S_64
-/*
- * At this point we do the placement change only for BOOK3S 64. This would
- * possibly work on other subarchs.
- */
-
-/*
- * We have PGD_INDEX_SIZ = 12 and PTE_INDEX_SIZE = 8, so that we can have
- * 16GB hugepage pte in PGD and 16MB hugepage pte at PMD;
- *
- * Defined in such a way that we can optimize away code block at build time
- * if CONFIG_HUGETLB_PAGE=n.
- */
-int pmd_huge(pmd_t pmd)
-{
-	/*
-	 * leaf pte for huge page, bottom two bits != 00
-	 */
-	return ((pmd_val(pmd) & 0x3) != 0x0);
-}
-
-int pud_huge(pud_t pud)
-{
-	/*
-	 * leaf pte for huge page, bottom two bits != 00
-	 */
-	return ((pud_val(pud) & 0x3) != 0x0);
-}
-
-int pgd_huge(pgd_t pgd)
-{
-	/*
-	 * leaf pte for huge page, bottom two bits != 00
-	 */
-	return ((pgd_val(pgd) & 0x3) != 0x0);
-}
-
-#if defined(CONFIG_PPC_64K_PAGES) && defined(CONFIG_DEBUG_VM)
-/*
- * This enables us to catch the wrong page directory format
- * Moved here so that we can use WARN() in the call.
- */
-int hugepd_ok(hugepd_t hpd)
-{
-	bool is_hugepd;
-
-	/*
-	 * We should not find this format in page directory, warn otherwise.
-	 */
-	is_hugepd = (((hpd.pd & 0x3) == 0x0) && ((hpd.pd & HUGEPD_SHIFT_MASK) != 0));
-	WARN(is_hugepd, "Found wrong page directory format\n");
-	return 0;
-}
-#endif
-
-#else
-int pmd_huge(pmd_t pmd)
-{
-	return 0;
-}
-
-int pud_huge(pud_t pud)
-{
-	return 0;
-}
-
-int pgd_huge(pgd_t pgd)
-{
-	return 0;
-}
-#endif
-
 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 {
 	/* Only called for hugetlbfs pages, hence can ignore THP */

From e34aa03ca48d0c7982530436ce996f374b65913c Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 1 Dec 2015 09:06:53 +0530
Subject: [PATCH 057/149] powerpc/mm: Move THP headers around

We support THP only with book3s_64 and 64K page size. Move
THP details to hash-64k.h to clarify the same.

Acked-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/64/hash-64k.h | 126 +++++++++
 arch/powerpc/include/asm/book3s/64/hash.h     | 223 ++++-----------
 arch/powerpc/include/asm/nohash/64/pgtable.h  | 253 +-----------------
 arch/powerpc/mm/hash_native_64.c              |  10 +
 arch/powerpc/mm/pgtable_64.c                  |   2 +-
 arch/powerpc/platforms/pseries/lpar.c         |  10 +
 6 files changed, 201 insertions(+), 423 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h
index 1857d19de18e..7570677c11c3 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
@@ -170,6 +170,132 @@ static inline int hugepd_ok(hugepd_t hpd)
 
 #endif /* CONFIG_HUGETLB_PAGE */
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+extern unsigned long pmd_hugepage_update(struct mm_struct *mm,
+					 unsigned long addr,
+					 pmd_t *pmdp,
+					 unsigned long clr,
+					 unsigned long set);
+static inline char *get_hpte_slot_array(pmd_t *pmdp)
+{
+	/*
+	 * The hpte hindex is stored in the pgtable whose address is in the
+	 * second half of the PMD
+	 *
+	 * Order this load with the test for pmd_trans_huge in the caller
+	 */
+	smp_rmb();
+	return *(char **)(pmdp + PTRS_PER_PMD);
+
+
+}
+/*
+ * The linux hugepage PMD now include the pmd entries followed by the address
+ * to the stashed pgtable_t. The stashed pgtable_t contains the hpte bits.
+ * [ 1 bit secondary | 3 bit hidx | 1 bit valid | 000]. We use one byte per
+ * each HPTE entry. With 16MB hugepage and 64K HPTE we need 256 entries and
+ * with 4K HPTE we need 4096 entries. Both will fit in a 4K pgtable_t.
+ *
+ * The last three bits are intentionally left to zero. This memory location
+ * are also used as normal page PTE pointers. So if we have any pointers
+ * left around while we collapse a hugepage, we need to make sure
+ * _PAGE_PRESENT bit of that is zero when we look at them
+ */
+static inline unsigned int hpte_valid(unsigned char *hpte_slot_array, int index)
+{
+	return (hpte_slot_array[index] >> 3) & 0x1;
+}
+
+static inline unsigned int hpte_hash_index(unsigned char *hpte_slot_array,
+					   int index)
+{
+	return hpte_slot_array[index] >> 4;
+}
+
+static inline void mark_hpte_slot_valid(unsigned char *hpte_slot_array,
+					unsigned int index, unsigned int hidx)
+{
+	hpte_slot_array[index] = hidx << 4 | 0x1 << 3;
+}
+
+/*
+ *
+ * For core kernel code by design pmd_trans_huge is never run on any hugetlbfs
+ * page. The hugetlbfs page table walking and mangling paths are totally
+ * separated form the core VM paths and they're differentiated by
+ *  VM_HUGETLB being set on vm_flags well before any pmd_trans_huge could run.
+ *
+ * pmd_trans_huge() is defined as false at build time if
+ * CONFIG_TRANSPARENT_HUGEPAGE=n to optimize away code blocks at build
+ * time in such case.
+ *
+ * For ppc64 we need to differntiate from explicit hugepages from THP, because
+ * for THP we also track the subpage details at the pmd level. We don't do
+ * that for explicit huge pages.
+ *
+ */
+static inline int pmd_trans_huge(pmd_t pmd)
+{
+	/*
+	 * leaf pte for huge page, bottom two bits != 00
+	 */
+	return (pmd_val(pmd) & 0x3) && (pmd_val(pmd) & _PAGE_THP_HUGE);
+}
+
+static inline int pmd_trans_splitting(pmd_t pmd)
+{
+	if (pmd_trans_huge(pmd))
+		return pmd_val(pmd) & _PAGE_SPLITTING;
+	return 0;
+}
+
+static inline int pmd_large(pmd_t pmd)
+{
+	/*
+	 * leaf pte for huge page, bottom two bits != 00
+	 */
+	return ((pmd_val(pmd) & 0x3) != 0x0);
+}
+
+static inline pmd_t pmd_mknotpresent(pmd_t pmd)
+{
+	return __pmd(pmd_val(pmd) & ~_PAGE_PRESENT);
+}
+
+static inline pmd_t pmd_mksplitting(pmd_t pmd)
+{
+	return __pmd(pmd_val(pmd) | _PAGE_SPLITTING);
+}
+
+#define __HAVE_ARCH_PMD_SAME
+static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
+{
+	return (((pmd_val(pmd_a) ^ pmd_val(pmd_b)) & ~_PAGE_HPTEFLAGS) == 0);
+}
+
+static inline int __pmdp_test_and_clear_young(struct mm_struct *mm,
+					      unsigned long addr, pmd_t *pmdp)
+{
+	unsigned long old;
+
+	if ((pmd_val(*pmdp) & (_PAGE_ACCESSED | _PAGE_HASHPTE)) == 0)
+		return 0;
+	old = pmd_hugepage_update(mm, addr, pmdp, _PAGE_ACCESSED, 0);
+	return ((old & _PAGE_ACCESSED) != 0);
+}
+
+#define __HAVE_ARCH_PMDP_SET_WRPROTECT
+static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
+				      pmd_t *pmdp)
+{
+
+	if ((pmd_val(*pmdp) & _PAGE_RW) == 0)
+		return;
+
+	pmd_hugepage_update(mm, addr, pmdp, _PAGE_RW, 0);
+}
+
+#endif /*  CONFIG_TRANSPARENT_HUGEPAGE */
 #endif	/* __ASSEMBLY__ */
 
 #endif /* _ASM_POWERPC_BOOK3S_64_HASH_64K_H */
diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h
index 9c212449b2e8..42e1273adad1 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -2,6 +2,55 @@
 #define _ASM_POWERPC_BOOK3S_64_HASH_H
 #ifdef __KERNEL__
 
+/*
+ * Common bits between 4K and 64K pages in a linux-style PTE.
+ * These match the bits in the (hardware-defined) PowerPC PTE as closely
+ * as possible. Additional bits may be defined in pgtable-hash64-*.h
+ *
+ * Note: We only support user read/write permissions. Supervisor always
+ * have full read/write to pages above PAGE_OFFSET (pages below that
+ * always use the user access permissions).
+ *
+ * We could create separate kernel read-only if we used the 3 PP bits
+ * combinations that newer processors provide but we currently don't.
+ */
+#define _PAGE_PRESENT		0x00001 /* software: pte contains a translation */
+#define _PAGE_USER		0x00002 /* matches one of the PP bits */
+#define _PAGE_BIT_SWAP_TYPE	2
+#define _PAGE_EXEC		0x00004 /* No execute on POWER4 and newer (we invert) */
+#define _PAGE_GUARDED		0x00008
+/* We can derive Memory coherence from _PAGE_NO_CACHE */
+#define _PAGE_COHERENT		0x0
+#define _PAGE_NO_CACHE		0x00020 /* I: cache inhibit */
+#define _PAGE_WRITETHRU		0x00040 /* W: cache write-through */
+#define _PAGE_DIRTY		0x00080 /* C: page changed */
+#define _PAGE_ACCESSED		0x00100 /* R: page referenced */
+#define _PAGE_RW		0x00200 /* software: user write access allowed */
+#define _PAGE_HASHPTE		0x00400 /* software: pte has an associated HPTE */
+#define _PAGE_BUSY		0x00800 /* software: PTE & hash are busy */
+#define _PAGE_F_GIX		0x07000 /* full page: hidx bits */
+#define _PAGE_F_GIX_SHIFT	12
+#define _PAGE_F_SECOND		0x08000 /* Whether to use secondary hash or not */
+#define _PAGE_SPECIAL		0x10000 /* software: special page */
+
+/*
+ * THP pages can't be special. So use the _PAGE_SPECIAL
+ */
+#define _PAGE_SPLITTING _PAGE_SPECIAL
+
+/*
+ * We need to differentiate between explicit huge page and THP huge
+ * page, since THP huge page also need to track real subpage details
+ */
+#define _PAGE_THP_HUGE  _PAGE_4K_PFN
+
+/*
+ * set of bits not changed in pmd_modify.
+ */
+#define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS |		\
+			 _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_SPLITTING | \
+			 _PAGE_THP_HUGE)
+
 #ifdef CONFIG_PPC_64K_PAGES
 #include <asm/book3s/64/hash-64k.h>
 #else
@@ -57,36 +106,6 @@
 #define HAVE_ARCH_UNMAPPED_AREA
 #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
 #endif /* CONFIG_PPC_MM_SLICES */
-/*
- * Common bits between 4K and 64K pages in a linux-style PTE.
- * These match the bits in the (hardware-defined) PowerPC PTE as closely
- * as possible. Additional bits may be defined in pgtable-hash64-*.h
- *
- * Note: We only support user read/write permissions. Supervisor always
- * have full read/write to pages above PAGE_OFFSET (pages below that
- * always use the user access permissions).
- *
- * We could create separate kernel read-only if we used the 3 PP bits
- * combinations that newer processors provide but we currently don't.
- */
-#define _PAGE_PRESENT		0x00001 /* software: pte contains a translation */
-#define _PAGE_USER		0x00002 /* matches one of the PP bits */
-#define _PAGE_BIT_SWAP_TYPE	2
-#define _PAGE_EXEC		0x00004 /* No execute on POWER4 and newer (we invert) */
-#define _PAGE_GUARDED		0x00008
-/* We can derive Memory coherence from _PAGE_NO_CACHE */
-#define _PAGE_COHERENT		0x0
-#define _PAGE_NO_CACHE		0x00020 /* I: cache inhibit */
-#define _PAGE_WRITETHRU		0x00040 /* W: cache write-through */
-#define _PAGE_DIRTY		0x00080 /* C: page changed */
-#define _PAGE_ACCESSED		0x00100 /* R: page referenced */
-#define _PAGE_RW		0x00200 /* software: user write access allowed */
-#define _PAGE_HASHPTE		0x00400 /* software: pte has an associated HPTE */
-#define _PAGE_BUSY		0x00800 /* software: PTE & hash are busy */
-#define _PAGE_F_GIX		0x07000 /* full page: hidx bits */
-#define _PAGE_F_GIX_SHIFT	12
-#define _PAGE_F_SECOND		0x08000 /* Whether to use secondary hash or not */
-#define _PAGE_SPECIAL		0x10000 /* software: special page */
 
 /* No separate kernel read-only */
 #define _PAGE_KERNEL_RW		(_PAGE_RW | _PAGE_DIRTY) /* user access blocked by key */
@@ -105,24 +124,6 @@
 
 /* Hash table based platforms need atomic updates of the linux PTE */
 #define PTE_ATOMIC_UPDATES	1
-
-/*
- * THP pages can't be special. So use the _PAGE_SPECIAL
- */
-#define _PAGE_SPLITTING _PAGE_SPECIAL
-
-/*
- * We need to differentiate between explicit huge page and THP huge
- * page, since THP huge page also need to track real subpage details
- */
-#define _PAGE_THP_HUGE  _PAGE_4K_PFN
-
-/*
- * set of bits not changed in pmd_modify.
- */
-#define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS |		\
-			 _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_SPLITTING | \
-			 _PAGE_THP_HUGE)
 #define _PTE_NONE_MASK	_PAGE_HPTEFLAGS
 /*
  * The mask convered by the RPN must be a ULL on 32-bit platforms with
@@ -231,11 +232,6 @@
 
 extern void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
 			    pte_t *ptep, unsigned long pte, int huge);
-extern unsigned long pmd_hugepage_update(struct mm_struct *mm,
-					 unsigned long addr,
-					 pmd_t *pmdp,
-					 unsigned long clr,
-					 unsigned long set);
 extern unsigned long htab_convert_pte_flags(unsigned long pteflags);
 /* Atomic PTE updates */
 static inline unsigned long pte_update(struct mm_struct *mm,
@@ -361,127 +357,6 @@ static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry)
 #define __HAVE_ARCH_PTE_SAME
 #define pte_same(A,B)	(((pte_val(A) ^ pte_val(B)) & ~_PAGE_HPTEFLAGS) == 0)
 
-static inline char *get_hpte_slot_array(pmd_t *pmdp)
-{
-	/*
-	 * The hpte hindex is stored in the pgtable whose address is in the
-	 * second half of the PMD
-	 *
-	 * Order this load with the test for pmd_trans_huge in the caller
-	 */
-	smp_rmb();
-	return *(char **)(pmdp + PTRS_PER_PMD);
-
-
-}
-/*
- * The linux hugepage PMD now include the pmd entries followed by the address
- * to the stashed pgtable_t. The stashed pgtable_t contains the hpte bits.
- * [ 1 bit secondary | 3 bit hidx | 1 bit valid | 000]. We use one byte per
- * each HPTE entry. With 16MB hugepage and 64K HPTE we need 256 entries and
- * with 4K HPTE we need 4096 entries. Both will fit in a 4K pgtable_t.
- *
- * The last three bits are intentionally left to zero. This memory location
- * are also used as normal page PTE pointers. So if we have any pointers
- * left around while we collapse a hugepage, we need to make sure
- * _PAGE_PRESENT bit of that is zero when we look at them
- */
-static inline unsigned int hpte_valid(unsigned char *hpte_slot_array, int index)
-{
-	return (hpte_slot_array[index] >> 3) & 0x1;
-}
-
-static inline unsigned int hpte_hash_index(unsigned char *hpte_slot_array,
-					   int index)
-{
-	return hpte_slot_array[index] >> 4;
-}
-
-static inline void mark_hpte_slot_valid(unsigned char *hpte_slot_array,
-					unsigned int index, unsigned int hidx)
-{
-	hpte_slot_array[index] = hidx << 4 | 0x1 << 3;
-}
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-/*
- *
- * For core kernel code by design pmd_trans_huge is never run on any hugetlbfs
- * page. The hugetlbfs page table walking and mangling paths are totally
- * separated form the core VM paths and they're differentiated by
- *  VM_HUGETLB being set on vm_flags well before any pmd_trans_huge could run.
- *
- * pmd_trans_huge() is defined as false at build time if
- * CONFIG_TRANSPARENT_HUGEPAGE=n to optimize away code blocks at build
- * time in such case.
- *
- * For ppc64 we need to differntiate from explicit hugepages from THP, because
- * for THP we also track the subpage details at the pmd level. We don't do
- * that for explicit huge pages.
- *
- */
-static inline int pmd_trans_huge(pmd_t pmd)
-{
-	/*
-	 * leaf pte for huge page, bottom two bits != 00
-	 */
-	return (pmd_val(pmd) & 0x3) && (pmd_val(pmd) & _PAGE_THP_HUGE);
-}
-
-static inline int pmd_trans_splitting(pmd_t pmd)
-{
-	if (pmd_trans_huge(pmd))
-		return pmd_val(pmd) & _PAGE_SPLITTING;
-	return 0;
-}
-
-#endif
-static inline int pmd_large(pmd_t pmd)
-{
-	/*
-	 * leaf pte for huge page, bottom two bits != 00
-	 */
-	return ((pmd_val(pmd) & 0x3) != 0x0);
-}
-
-static inline pmd_t pmd_mknotpresent(pmd_t pmd)
-{
-	return __pmd(pmd_val(pmd) & ~_PAGE_PRESENT);
-}
-
-static inline pmd_t pmd_mksplitting(pmd_t pmd)
-{
-	return __pmd(pmd_val(pmd) | _PAGE_SPLITTING);
-}
-
-#define __HAVE_ARCH_PMD_SAME
-static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
-{
-	return (((pmd_val(pmd_a) ^ pmd_val(pmd_b)) & ~_PAGE_HPTEFLAGS) == 0);
-}
-
-static inline int __pmdp_test_and_clear_young(struct mm_struct *mm,
-					      unsigned long addr, pmd_t *pmdp)
-{
-	unsigned long old;
-
-	if ((pmd_val(*pmdp) & (_PAGE_ACCESSED | _PAGE_HASHPTE)) == 0)
-		return 0;
-	old = pmd_hugepage_update(mm, addr, pmdp, _PAGE_ACCESSED, 0);
-	return ((old & _PAGE_ACCESSED) != 0);
-}
-
-#define __HAVE_ARCH_PMDP_SET_WRPROTECT
-static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
-				      pmd_t *pmdp)
-{
-
-	if ((pmd_val(*pmdp) & _PAGE_RW) == 0)
-		return;
-
-	pmd_hugepage_update(mm, addr, pmdp, _PAGE_RW, 0);
-}
-
 /* Generic accessors to PTE bits */
 static inline int pte_write(pte_t pte)		{ return !!(pte_val(pte) & _PAGE_RW);}
 static inline int pte_dirty(pte_t pte)		{ return !!(pte_val(pte) & _PAGE_DIRTY); }
diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h
index c24e03f22655..d635a924d652 100644
--- a/arch/powerpc/include/asm/nohash/64/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/64/pgtable.h
@@ -154,6 +154,11 @@ static inline void pmd_clear(pmd_t *pmdp)
 	*pmdp = __pmd(0);
 }
 
+static inline pte_t pmd_pte(pmd_t pmd)
+{
+	return __pte(pmd_val(pmd));
+}
+
 #define pmd_none(pmd)		(!pmd_val(pmd))
 #define	pmd_bad(pmd)		(!is_kernel_addr(pmd_val(pmd)) \
 				 || (pmd_val(pmd) & PMD_BAD_BITS))
@@ -389,252 +394,4 @@ void pgtable_cache_add(unsigned shift, void (*ctor)(void *));
 void pgtable_cache_init(void);
 #endif /* __ASSEMBLY__ */
 
-/*
- * THP pages can't be special. So use the _PAGE_SPECIAL
- */
-#define _PAGE_SPLITTING _PAGE_SPECIAL
-
-/*
- * We need to differentiate between explicit huge page and THP huge
- * page, since THP huge page also need to track real subpage details
- */
-#define _PAGE_THP_HUGE  _PAGE_4K_PFN
-
-/*
- * set of bits not changed in pmd_modify.
- */
-#define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS |		\
-			 _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_SPLITTING | \
-			 _PAGE_THP_HUGE)
-
-#ifndef __ASSEMBLY__
-/*
- * The linux hugepage PMD now include the pmd entries followed by the address
- * to the stashed pgtable_t. The stashed pgtable_t contains the hpte bits.
- * [ 1 bit secondary | 3 bit hidx | 1 bit valid | 000]. We use one byte per
- * each HPTE entry. With 16MB hugepage and 64K HPTE we need 256 entries and
- * with 4K HPTE we need 4096 entries. Both will fit in a 4K pgtable_t.
- *
- * The last three bits are intentionally left to zero. This memory location
- * are also used as normal page PTE pointers. So if we have any pointers
- * left around while we collapse a hugepage, we need to make sure
- * _PAGE_PRESENT bit of that is zero when we look at them
- */
-static inline unsigned int hpte_valid(unsigned char *hpte_slot_array, int index)
-{
-	return (hpte_slot_array[index] >> 3) & 0x1;
-}
-
-static inline unsigned int hpte_hash_index(unsigned char *hpte_slot_array,
-					   int index)
-{
-	return hpte_slot_array[index] >> 4;
-}
-
-static inline void mark_hpte_slot_valid(unsigned char *hpte_slot_array,
-					unsigned int index, unsigned int hidx)
-{
-	hpte_slot_array[index] = hidx << 4 | 0x1 << 3;
-}
-
-struct page *realmode_pfn_to_page(unsigned long pfn);
-
-static inline char *get_hpte_slot_array(pmd_t *pmdp)
-{
-	/*
-	 * The hpte hindex is stored in the pgtable whose address is in the
-	 * second half of the PMD
-	 *
-	 * Order this load with the test for pmd_trans_huge in the caller
-	 */
-	smp_rmb();
-	return *(char **)(pmdp + PTRS_PER_PMD);
-
-
-}
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-extern void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
-				   pmd_t *pmdp, unsigned long old_pmd);
-extern pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot);
-extern pmd_t mk_pmd(struct page *page, pgprot_t pgprot);
-extern pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot);
-extern void set_pmd_at(struct mm_struct *mm, unsigned long addr,
-		       pmd_t *pmdp, pmd_t pmd);
-extern void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
-				 pmd_t *pmd);
-/*
- *
- * For core kernel code by design pmd_trans_huge is never run on any hugetlbfs
- * page. The hugetlbfs page table walking and mangling paths are totally
- * separated form the core VM paths and they're differentiated by
- *  VM_HUGETLB being set on vm_flags well before any pmd_trans_huge could run.
- *
- * pmd_trans_huge() is defined as false at build time if
- * CONFIG_TRANSPARENT_HUGEPAGE=n to optimize away code blocks at build
- * time in such case.
- *
- * For ppc64 we need to differntiate from explicit hugepages from THP, because
- * for THP we also track the subpage details at the pmd level. We don't do
- * that for explicit huge pages.
- *
- */
-static inline int pmd_trans_huge(pmd_t pmd)
-{
-	/*
-	 * leaf pte for huge page, bottom two bits != 00
-	 */
-	return (pmd_val(pmd) & 0x3) && (pmd_val(pmd) & _PAGE_THP_HUGE);
-}
-
-static inline int pmd_trans_splitting(pmd_t pmd)
-{
-	if (pmd_trans_huge(pmd))
-		return pmd_val(pmd) & _PAGE_SPLITTING;
-	return 0;
-}
-
-extern int has_transparent_hugepage(void);
-#else
-static inline void hpte_do_hugepage_flush(struct mm_struct *mm,
-					  unsigned long addr, pmd_t *pmdp,
-					  unsigned long old_pmd)
-{
-
-	WARN(1, "%s called with THP disabled\n", __func__);
-}
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-
-static inline int pmd_large(pmd_t pmd)
-{
-	/*
-	 * leaf pte for huge page, bottom two bits != 00
-	 */
-	return ((pmd_val(pmd) & 0x3) != 0x0);
-}
-
-static inline pte_t pmd_pte(pmd_t pmd)
-{
-	return __pte(pmd_val(pmd));
-}
-
-static inline pmd_t pte_pmd(pte_t pte)
-{
-	return __pmd(pte_val(pte));
-}
-
-static inline pte_t *pmdp_ptep(pmd_t *pmd)
-{
-	return (pte_t *)pmd;
-}
-
-#define pmd_pfn(pmd)		pte_pfn(pmd_pte(pmd))
-#define pmd_dirty(pmd)		pte_dirty(pmd_pte(pmd))
-#define pmd_young(pmd)		pte_young(pmd_pte(pmd))
-#define pmd_mkold(pmd)		pte_pmd(pte_mkold(pmd_pte(pmd)))
-#define pmd_wrprotect(pmd)	pte_pmd(pte_wrprotect(pmd_pte(pmd)))
-#define pmd_mkdirty(pmd)	pte_pmd(pte_mkdirty(pmd_pte(pmd)))
-#define pmd_mkyoung(pmd)	pte_pmd(pte_mkyoung(pmd_pte(pmd)))
-#define pmd_mkwrite(pmd)	pte_pmd(pte_mkwrite(pmd_pte(pmd)))
-
-#define __HAVE_ARCH_PMD_WRITE
-#define pmd_write(pmd)		pte_write(pmd_pte(pmd))
-
-static inline pmd_t pmd_mkhuge(pmd_t pmd)
-{
-	/* Do nothing, mk_pmd() does this part.  */
-	return pmd;
-}
-
-static inline pmd_t pmd_mknotpresent(pmd_t pmd)
-{
-	return __pmd(pmd_val(pmd) & ~_PAGE_PRESENT);
-}
-
-static inline pmd_t pmd_mksplitting(pmd_t pmd)
-{
-	return __pmd(pmd_val(pmd) | _PAGE_SPLITTING);
-}
-
-#define __HAVE_ARCH_PMD_SAME
-static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
-{
-	return (((pmd_val(pmd_a) ^ pmd_val(pmd_b)) & ~_PAGE_HPTEFLAGS) == 0);
-}
-
-#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
-extern int pmdp_set_access_flags(struct vm_area_struct *vma,
-				 unsigned long address, pmd_t *pmdp,
-				 pmd_t entry, int dirty);
-
-extern unsigned long pmd_hugepage_update(struct mm_struct *mm,
-					 unsigned long addr,
-					 pmd_t *pmdp,
-					 unsigned long clr,
-					 unsigned long set);
-
-static inline int __pmdp_test_and_clear_young(struct mm_struct *mm,
-					      unsigned long addr, pmd_t *pmdp)
-{
-	unsigned long old;
-
-	if ((pmd_val(*pmdp) & (_PAGE_ACCESSED | _PAGE_HASHPTE)) == 0)
-		return 0;
-	old = pmd_hugepage_update(mm, addr, pmdp, _PAGE_ACCESSED, 0);
-	return ((old & _PAGE_ACCESSED) != 0);
-}
-
-#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
-extern int pmdp_test_and_clear_young(struct vm_area_struct *vma,
-				     unsigned long address, pmd_t *pmdp);
-#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
-extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
-				  unsigned long address, pmd_t *pmdp);
-
-#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
-extern pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
-				     unsigned long addr, pmd_t *pmdp);
-
-#define __HAVE_ARCH_PMDP_SET_WRPROTECT
-static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
-				      pmd_t *pmdp)
-{
-
-	if ((pmd_val(*pmdp) & _PAGE_RW) == 0)
-		return;
-
-	pmd_hugepage_update(mm, addr, pmdp, _PAGE_RW, 0);
-}
-
-#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-extern void pmdp_splitting_flush(struct vm_area_struct *vma,
-				 unsigned long address, pmd_t *pmdp);
-
-extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
-				 unsigned long address, pmd_t *pmdp);
-#define pmdp_collapse_flush pmdp_collapse_flush
-
-#define __HAVE_ARCH_PGTABLE_DEPOSIT
-extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
-				       pgtable_t pgtable);
-#define __HAVE_ARCH_PGTABLE_WITHDRAW
-extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
-
-#define __HAVE_ARCH_PMDP_INVALIDATE
-extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
-			    pmd_t *pmdp);
-
-#define pmd_move_must_withdraw pmd_move_must_withdraw
-struct spinlock;
-static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
-					 struct spinlock *old_pmd_ptl)
-{
-	/*
-	 * Archs like ppc64 use pgtable to store per pmd
-	 * specific information. So when we switch the pmd,
-	 * we should also withdraw and deposit the pgtable
-	 */
-	return true;
-}
-#endif /* __ASSEMBLY__ */
 #endif /* _ASM_POWERPC_NOHASH_64_PGTABLE_H */
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index c8822af10a58..8eaac81347fd 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -429,6 +429,7 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
 	local_irq_restore(flags);
 }
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 static void native_hugepage_invalidate(unsigned long vsid,
 				       unsigned long addr,
 				       unsigned char *hpte_slot_array,
@@ -482,6 +483,15 @@ static void native_hugepage_invalidate(unsigned long vsid,
 	}
 	local_irq_restore(flags);
 }
+#else
+static void native_hugepage_invalidate(unsigned long vsid,
+				       unsigned long addr,
+				       unsigned char *hpte_slot_array,
+				       int psize, int ssize, int local)
+{
+	WARN(1, "%s called without THP support\n", __func__);
+}
+#endif
 
 static inline int __hpte_actual_psize(unsigned int lp, int psize)
 {
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 3967e3cce03e..d42dd289abfe 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -359,7 +359,7 @@ struct page *pud_page(pud_t pud)
 struct page *pmd_page(pmd_t pmd)
 {
 	if (pmd_trans_huge(pmd) || pmd_huge(pmd))
-		return pfn_to_page(pmd_pfn(pmd));
+		return pte_page(pmd_pte(pmd));
 	return virt_to_page(pmd_page_vaddr(pmd));
 }
 
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index b7a67e3d2201..6d46547871aa 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -396,6 +396,7 @@ static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long vpn,
 	BUG_ON(lpar_rc != H_SUCCESS);
 }
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 /*
  * Limit iterations holding pSeries_lpar_tlbie_lock to 3. We also need
  * to make sure that we avoid bouncing the hypervisor tlbie lock.
@@ -494,6 +495,15 @@ static void pSeries_lpar_hugepage_invalidate(unsigned long vsid,
 		__pSeries_lpar_hugepage_invalidate(slot_array, vpn_array,
 						   index, psize, ssize);
 }
+#else
+static void pSeries_lpar_hugepage_invalidate(unsigned long vsid,
+					     unsigned long addr,
+					     unsigned char *hpte_slot_array,
+					     int psize, int ssize, int local)
+{
+	WARN(1, "%s called without THP support\n", __func__);
+}
+#endif
 
 static void pSeries_lpar_hpte_removebolted(unsigned long ea,
 					   int psize, int ssize)

From 6a119eae942c51ccf1091936c534bac12cae630e Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 1 Dec 2015 09:06:54 +0530
Subject: [PATCH 058/149] powerpc/mm: Add a _PAGE_PTE bit

For a pte entry we will have _PAGE_PTE set. Our pte page
address has a minimum alignment requirement of HUGEPD_SHIFT_MASK + 1.
We use the lower 7 bits to indicate hugepd, i.e.:

For pmd and pgd we can find:
1) _PAGE_PTE set pte -> indicate PTE
2) bits [2..6] non zero -> indicate hugepd.
   They also encode the size. We skip bit 1 (_PAGE_PRESENT).
3) otherwise pointer to next table.

Acked-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/64/hash-4k.h  |  9 +++++---
 arch/powerpc/include/asm/book3s/64/hash-64k.h | 23 ++++++++-----------
 arch/powerpc/include/asm/book3s/64/hash.h     | 13 ++++++-----
 arch/powerpc/include/asm/book3s/64/pgtable.h  |  3 +--
 arch/powerpc/include/asm/pte-common.h         |  5 ++++
 arch/powerpc/mm/hugetlbpage.c                 |  4 ++--
 arch/powerpc/mm/pgtable.c                     |  4 ++++
 arch/powerpc/mm/pgtable_64.c                  |  7 +-----
 8 files changed, 35 insertions(+), 33 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h
index b4d25529d179..e59832c94609 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-4k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h
@@ -116,10 +116,13 @@ static inline int pgd_huge(pgd_t pgd)
 static inline int hugepd_ok(hugepd_t hpd)
 {
 	/*
-	 * hugepd pointer, bottom two bits == 00 and next 4 bits
-	 * indicate size of table
+	 * if it is not a pte and have hugepd shift mask
+	 * set, then it is a hugepd directory pointer
 	 */
-	return (((hpd.pd & 0x3) == 0x0) && ((hpd.pd & HUGEPD_SHIFT_MASK) != 0));
+	if (!(hpd.pd & _PAGE_PTE) &&
+	    ((hpd.pd & HUGEPD_SHIFT_MASK) != 0))
+		return true;
+	return false;
 }
 #define is_hugepd(hpd)		(hugepd_ok(hpd))
 #endif
diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h
index 7570677c11c3..52110d7af659 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
@@ -130,25 +130,25 @@ extern bool __rpte_sub_valid(real_pte_t rpte, unsigned long index);
 static inline int pmd_huge(pmd_t pmd)
 {
 	/*
-	 * leaf pte for huge page, bottom two bits != 00
+	 * leaf pte for huge page
 	 */
-	return ((pmd_val(pmd) & 0x3) != 0x0);
+	return !!(pmd_val(pmd) & _PAGE_PTE);
 }
 
 static inline int pud_huge(pud_t pud)
 {
 	/*
-	 * leaf pte for huge page, bottom two bits != 00
+	 * leaf pte for huge page
 	 */
-	return ((pud_val(pud) & 0x3) != 0x0);
+	return !!(pud_val(pud) & _PAGE_PTE);
 }
 
 static inline int pgd_huge(pgd_t pgd)
 {
 	/*
-	 * leaf pte for huge page, bottom two bits != 00
+	 * leaf pte for huge page
 	 */
-	return ((pgd_val(pgd) & 0x3) != 0x0);
+	return !!(pgd_val(pgd) & _PAGE_PTE);
 }
 #define pgd_huge pgd_huge
 
@@ -236,10 +236,8 @@ static inline void mark_hpte_slot_valid(unsigned char *hpte_slot_array,
  */
 static inline int pmd_trans_huge(pmd_t pmd)
 {
-	/*
-	 * leaf pte for huge page, bottom two bits != 00
-	 */
-	return (pmd_val(pmd) & 0x3) && (pmd_val(pmd) & _PAGE_THP_HUGE);
+	return !!((pmd_val(pmd) & (_PAGE_PTE | _PAGE_THP_HUGE)) ==
+		  (_PAGE_PTE | _PAGE_THP_HUGE));
 }
 
 static inline int pmd_trans_splitting(pmd_t pmd)
@@ -251,10 +249,7 @@ static inline int pmd_trans_splitting(pmd_t pmd)
 
 static inline int pmd_large(pmd_t pmd)
 {
-	/*
-	 * leaf pte for huge page, bottom two bits != 00
-	 */
-	return ((pmd_val(pmd) & 0x3) != 0x0);
+	return !!(pmd_val(pmd) & _PAGE_PTE);
 }
 
 static inline pmd_t pmd_mknotpresent(pmd_t pmd)
diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h
index 42e1273adad1..8b929e531758 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -14,11 +14,12 @@
  * We could create separate kernel read-only if we used the 3 PP bits
  * combinations that newer processors provide but we currently don't.
  */
-#define _PAGE_PRESENT		0x00001 /* software: pte contains a translation */
-#define _PAGE_USER		0x00002 /* matches one of the PP bits */
+#define _PAGE_PTE		0x00001
+#define _PAGE_PRESENT		0x00002 /* software: pte contains a translation */
 #define _PAGE_BIT_SWAP_TYPE	2
-#define _PAGE_EXEC		0x00004 /* No execute on POWER4 and newer (we invert) */
-#define _PAGE_GUARDED		0x00008
+#define _PAGE_USER		0x00004 /* matches one of the PP bits */
+#define _PAGE_EXEC		0x00008 /* No execute on POWER4 and newer (we invert) */
+#define _PAGE_GUARDED		0x00010
 /* We can derive Memory coherence from _PAGE_NO_CACHE */
 #define _PAGE_COHERENT		0x0
 #define _PAGE_NO_CACHE		0x00020 /* I: cache inhibit */
@@ -49,7 +50,7 @@
  */
 #define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS |		\
 			 _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_SPLITTING | \
-			 _PAGE_THP_HUGE)
+			 _PAGE_THP_HUGE | _PAGE_PTE)
 
 #ifdef CONFIG_PPC_64K_PAGES
 #include <asm/book3s/64/hash-64k.h>
@@ -135,7 +136,7 @@
  * pgprot changes
  */
 #define _PAGE_CHG_MASK	(PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \
-			 _PAGE_ACCESSED | _PAGE_SPECIAL)
+			 _PAGE_ACCESSED | _PAGE_SPECIAL | _PAGE_PTE)
 /*
  * Mask of bits returned by pte_pgprot()
  */
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index f2ace2cac7bb..bb97b6a52b84 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -213,8 +213,7 @@ static inline int pmd_protnone(pmd_t pmd)
 
 static inline pmd_t pmd_mkhuge(pmd_t pmd)
 {
-	/* Do nothing, mk_pmd() does this part.  */
-	return pmd;
+	return __pmd(pmd_val(pmd) | (_PAGE_PTE | _PAGE_THP_HUGE));
 }
 
 #define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
diff --git a/arch/powerpc/include/asm/pte-common.h b/arch/powerpc/include/asm/pte-common.h
index 71537a319fc8..1ec67b043065 100644
--- a/arch/powerpc/include/asm/pte-common.h
+++ b/arch/powerpc/include/asm/pte-common.h
@@ -40,6 +40,11 @@
 #else
 #define _PAGE_RW 0
 #endif
+
+#ifndef _PAGE_PTE
+#define _PAGE_PTE 0
+#endif
+
 #ifndef _PMD_PRESENT_MASK
 #define _PMD_PRESENT_MASK	_PMD_PRESENT
 #endif
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index bc72e542a83e..61b8b7ccea4f 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -894,8 +894,8 @@ void flush_dcache_icache_hugepage(struct page *page)
  * We have 4 cases for pgds and pmds:
  * (1) invalid (all zeroes)
  * (2) pointer to next table, as normal; bottom 6 bits == 0
- * (3) leaf pte for huge page, bottom two bits != 00
- * (4) hugepd pointer, bottom two bits == 00, next 4 bits indicate size of table
+ * (3) leaf pte for huge page _PAGE_PTE set
+ * (4) hugepd pointer, _PAGE_PTE = 0 and bits [2..6] indicate size of table
  *
  * So long as we atomically load page table pointers we are safe against teardown,
  * we can follow the address down to the the page and take a ref on it.
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index 83dfcb55ffef..83dfd7925c72 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -179,6 +179,10 @@ void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
 	 */
 	VM_WARN_ON((pte_val(*ptep) & (_PAGE_PRESENT | _PAGE_USER)) ==
 		(_PAGE_PRESENT | _PAGE_USER));
+	/*
+	 * Add the pte bit when tryint set a pte
+	 */
+	pte = __pte(pte_val(pte) | _PAGE_PTE);
 
 	/* Note: mm->context.id might not yet have been assigned as
 	 * this context might not have been activated yet when this
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index d42dd289abfe..ea6bc31debb0 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -765,13 +765,8 @@ static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
 pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
 {
 	unsigned long pmdv;
-	/*
-	 * For a valid pte, we would have _PAGE_PRESENT always
-	 * set. We use this to check THP page at pmd level.
-	 * leaf pte for huge page, bottom two bits != 00
-	 */
+
 	pmdv = pfn << PTE_RPN_SHIFT;
-	pmdv |= _PAGE_THP_HUGE;
 	return pmd_set_protbits(__pmd(pmdv), pgprot);
 }
 

From 62607bc64c5cbb8d9b330da4be34c6d5302348af Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 1 Dec 2015 09:06:55 +0530
Subject: [PATCH 059/149] powerpc/mm: Don't hardcode page table size

pte and pmd table sizes are dependent on config items. Don't
hard code them. This makes sure we use the right value
when masking pmd entries and also while checking pmd_bad.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/64/hash-64k.h | 30 ++++++++++++++-----
 .../include/asm/nohash/64/pgtable-64k.h       | 21 ++++++++++---
 arch/powerpc/include/asm/pgalloc-64.h         | 10 -------
 arch/powerpc/mm/init_64.c                     |  4 ---
 4 files changed, 40 insertions(+), 25 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h
index 52110d7af659..cca050aa1aa8 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
@@ -25,12 +25,6 @@
 #define PGDIR_SIZE	(1UL << PGDIR_SHIFT)
 #define PGDIR_MASK	(~(PGDIR_SIZE-1))
 
-/* Bits to mask out from a PMD to get to the PTE page */
-/* PMDs point to PTE table fragments which are 4K aligned.  */
-#define PMD_MASKED_BITS		0xfff
-/* Bits to mask out from a PGD/PUD to get to the PMD page */
-#define PUD_MASKED_BITS		0x1ff
-
 #define _PAGE_COMBO	0x00020000 /* this is a combo 4k page */
 #define _PAGE_4K_PFN	0x00040000 /* PFN is for a single 4k page */
 /*
@@ -49,6 +43,24 @@
  * of addressable physical space, or 46 bits for the special 4k PFNs.
  */
 #define PTE_RPN_SHIFT	(30)
+/*
+ * we support 16 fragments per PTE page of 64K size.
+ */
+#define PTE_FRAG_NR	16
+/*
+ * We use a 2K PTE page fragment and another 2K for storing
+ * real_pte_t hash index
+ */
+#define PTE_FRAG_SIZE_SHIFT  12
+#define PTE_FRAG_SIZE (1UL << PTE_FRAG_SIZE_SHIFT)
+
+/*
+ * Bits to mask out from a PMD to get to the PTE page
+ * PMDs point to PTE table fragments which are PTE_FRAG_SIZE aligned.
+ */
+#define PMD_MASKED_BITS		(PTE_FRAG_SIZE - 1)
+/* Bits to mask out from a PGD/PUD to get to the PMD page */
+#define PUD_MASKED_BITS		0x1ff
 
 #ifndef __ASSEMBLY__
 
@@ -112,8 +124,12 @@ extern bool __rpte_sub_valid(real_pte_t rpte, unsigned long index);
 		remap_pfn_range((vma), (addr), (pfn), PAGE_SIZE,	\
 			__pgprot(pgprot_val((prot)) | _PAGE_4K_PFN)))
 
-#define PTE_TABLE_SIZE	(sizeof(real_pte_t) << PTE_INDEX_SIZE)
+#define PTE_TABLE_SIZE	PTE_FRAG_SIZE
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define PMD_TABLE_SIZE	((sizeof(pmd_t) << PMD_INDEX_SIZE) + (sizeof(unsigned long) << PMD_INDEX_SIZE))
+#else
 #define PMD_TABLE_SIZE	(sizeof(pmd_t) << PMD_INDEX_SIZE)
+#endif
 #define PGD_TABLE_SIZE	(sizeof(pgd_t) << PGD_INDEX_SIZE)
 
 #define pgd_pte(pgd)	(pud_pte(((pud_t){ pgd })))
diff --git a/arch/powerpc/include/asm/nohash/64/pgtable-64k.h b/arch/powerpc/include/asm/nohash/64/pgtable-64k.h
index a44660d76096..2217de6454d6 100644
--- a/arch/powerpc/include/asm/nohash/64/pgtable-64k.h
+++ b/arch/powerpc/include/asm/nohash/64/pgtable-64k.h
@@ -9,8 +9,19 @@
 #define PUD_INDEX_SIZE	0
 #define PGD_INDEX_SIZE  12
 
+/*
+ * we support 16 fragments per PTE page of 64K size.
+ */
+#define PTE_FRAG_NR	16
+/*
+ * We use a 2K PTE page fragment and another 2K for storing
+ * real_pte_t hash index
+ */
+#define PTE_FRAG_SIZE_SHIFT  12
+#define PTE_FRAG_SIZE (1UL << PTE_FRAG_SIZE_SHIFT)
+
 #ifndef __ASSEMBLY__
-#define PTE_TABLE_SIZE	(sizeof(real_pte_t) << PTE_INDEX_SIZE)
+#define PTE_TABLE_SIZE	PTE_FRAG_SIZE
 #define PMD_TABLE_SIZE	(sizeof(pmd_t) << PMD_INDEX_SIZE)
 #define PGD_TABLE_SIZE	(sizeof(pgd_t) << PGD_INDEX_SIZE)
 #endif	/* __ASSEMBLY__ */
@@ -32,9 +43,11 @@
 #define PGDIR_SIZE	(1UL << PGDIR_SHIFT)
 #define PGDIR_MASK	(~(PGDIR_SIZE-1))
 
-/* Bits to mask out from a PMD to get to the PTE page */
-/* PMDs point to PTE table fragments which are 4K aligned.  */
-#define PMD_MASKED_BITS		0xfff
+/*
+ * Bits to mask out from a PMD to get to the PTE page
+ * PMDs point to PTE table fragments which are PTE_FRAG_SIZE aligned.
+ */
+#define PMD_MASKED_BITS		(PTE_FRAG_SIZE - 1)
 /* Bits to mask out from a PGD/PUD to get to the PMD page */
 #define PUD_MASKED_BITS		0x1ff
 
diff --git a/arch/powerpc/include/asm/pgalloc-64.h b/arch/powerpc/include/asm/pgalloc-64.h
index d8cde71f6734..69ef28a81733 100644
--- a/arch/powerpc/include/asm/pgalloc-64.h
+++ b/arch/powerpc/include/asm/pgalloc-64.h
@@ -163,16 +163,6 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table,
 }
 
 #else /* if CONFIG_PPC_64K_PAGES */
-/*
- * we support 16 fragments per PTE page.
- */
-#define PTE_FRAG_NR	16
-/*
- * We use a 2K PTE page fragment and another 2K for storing
- * real_pte_t hash index
- */
-#define PTE_FRAG_SIZE_SHIFT  12
-#define PTE_FRAG_SIZE (2 * PTRS_PER_PTE * sizeof(pte_t))
 
 extern pte_t *page_table_alloc(struct mm_struct *, unsigned long, int);
 extern void page_table_free(struct mm_struct *, unsigned long *, int);
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index d747dd7bc90b..379a6a90644b 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -87,11 +87,7 @@ static void pgd_ctor(void *addr)
 
 static void pmd_ctor(void *addr)
 {
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-	memset(addr, 0, PMD_TABLE_SIZE * 2);
-#else
 	memset(addr, 0, PMD_TABLE_SIZE);
-#endif
 }
 
 struct kmem_cache *pgtable_cache[MAX_PGTABLE_INDEX_SIZE];

From 4d9057c39aceb3a94ccb6005f4433a0105e60521 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 1 Dec 2015 09:06:56 +0530
Subject: [PATCH 060/149] powerpc/mm: Don't hardcode the hash pte slot shift

Use the #define instead of open-coding the same

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/64/hash-64k.h | 2 +-
 arch/powerpc/include/asm/book3s/64/pgtable.h  | 2 +-
 arch/powerpc/include/asm/nohash/64/pgtable.h  | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h
index cca050aa1aa8..9f9942998587 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
@@ -94,7 +94,7 @@ static inline unsigned long __rpte_to_hidx(real_pte_t rpte, unsigned long index)
 {
 	if ((pte_val(rpte.pte) & _PAGE_COMBO))
 		return (rpte.hidx >> (index<<2)) & 0xf;
-	return (pte_val(rpte.pte) >> 12) & 0xf;
+	return (pte_val(rpte.pte) >> _PAGE_F_GIX_SHIFT) & 0xf;
 }
 
 #define __rpte_to_pte(r)	((r).pte)
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index bb97b6a52b84..a2d4e0e37067 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -50,7 +50,7 @@
 #define __real_pte(e,p)		(e)
 #define __rpte_to_pte(r)	(__pte(r))
 #endif
-#define __rpte_to_hidx(r,index)	(pte_val(__rpte_to_pte(r)) >> 12)
+#define __rpte_to_hidx(r,index)	(pte_val(__rpte_to_pte(r)) >>_PAGE_F_GIX_SHIFT)
 
 #define pte_iterate_hashed_subpages(rpte, psize, va, index, shift)       \
 	do {							         \
diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h
index d635a924d652..03c226965b46 100644
--- a/arch/powerpc/include/asm/nohash/64/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/64/pgtable.h
@@ -121,7 +121,7 @@
 #define __real_pte(e,p)		(e)
 #define __rpte_to_pte(r)	(__pte(r))
 #endif
-#define __rpte_to_hidx(r,index)	(pte_val(__rpte_to_pte(r)) >> 12)
+#define __rpte_to_hidx(r,index)	(pte_val(__rpte_to_pte(r)) >> _PAGE_F_GIX_SHIFT)
 
 #define pte_iterate_hashed_subpages(rpte, psize, va, index, shift)       \
 	do {							         \

From cc50380db32771af61201cff39da1043b90f2a6d Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 1 Dec 2015 09:06:57 +0530
Subject: [PATCH 061/149] powerpc/nohash: Update 64K nohash config to have 32
 pte fragments

They don't need to track 4k subpage slot details and hence don't need
second half of pgtable_t.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/nohash/64/pgtable-64k.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/nohash/64/pgtable-64k.h b/arch/powerpc/include/asm/nohash/64/pgtable-64k.h
index 2217de6454d6..570fb30be21c 100644
--- a/arch/powerpc/include/asm/nohash/64/pgtable-64k.h
+++ b/arch/powerpc/include/asm/nohash/64/pgtable-64k.h
@@ -10,14 +10,14 @@
 #define PGD_INDEX_SIZE  12
 
 /*
- * we support 16 fragments per PTE page of 64K size.
+ * we support 32 fragments per PTE page of 64K size
  */
-#define PTE_FRAG_NR	16
+#define PTE_FRAG_NR	32
 /*
  * We use a 2K PTE page fragment and another 2K for storing
  * real_pte_t hash index
  */
-#define PTE_FRAG_SIZE_SHIFT  12
+#define PTE_FRAG_SIZE_SHIFT  11
 #define PTE_FRAG_SIZE (1UL << PTE_FRAG_SIZE_SHIFT)
 
 #ifndef __ASSEMBLY__

From 45949ebe6c748cba93c1dd6ab9d03190f862ecf7 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 1 Dec 2015 09:06:58 +0530
Subject: [PATCH 062/149] powerpc/nohash: we don't use real_pte_t for nohash

Remove the related functions and #defines

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/nohash/64/pgtable.h | 33 --------------------
 1 file changed, 33 deletions(-)

diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h
index 03c226965b46..b9f734dd5b81 100644
--- a/arch/powerpc/include/asm/nohash/64/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/64/pgtable.h
@@ -106,39 +106,6 @@
 #endif /* CONFIG_PPC_MM_SLICES */
 
 #ifndef __ASSEMBLY__
-
-/*
- * This is the default implementation of various PTE accessors, it's
- * used in all cases except Book3S with 64K pages where we have a
- * concept of sub-pages
- */
-#ifndef __real_pte
-
-#ifdef CONFIG_STRICT_MM_TYPECHECKS
-#define __real_pte(e,p)		((real_pte_t){(e)})
-#define __rpte_to_pte(r)	((r).pte)
-#else
-#define __real_pte(e,p)		(e)
-#define __rpte_to_pte(r)	(__pte(r))
-#endif
-#define __rpte_to_hidx(r,index)	(pte_val(__rpte_to_pte(r)) >> _PAGE_F_GIX_SHIFT)
-
-#define pte_iterate_hashed_subpages(rpte, psize, va, index, shift)       \
-	do {							         \
-		index = 0;					         \
-		shift = mmu_psize_defs[psize].shift;		         \
-
-#define pte_iterate_hashed_end() } while(0)
-
-/*
- * We expect this to be called only for user addresses or kernel virtual
- * addresses other than the linear mapping.
- */
-#define pte_pagesize_index(mm, addr, pte)	MMU_PAGE_4K
-
-#endif /* __real_pte */
-
-
 /* pte_clear moved to later in this file */
 
 #define PMD_BAD_BITS		(PTE_TABLE_SIZE-1)

From 4ad90c864989337e7946f456478b6417325689d0 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 1 Dec 2015 09:06:59 +0530
Subject: [PATCH 063/149] powerpc/mm: Use H_READ with H_READ_4

This will bulk read 4 hash pte slot entries at a time and should reduce the number of loop iterations.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/plpar_wrappers.h | 17 +++++++
 arch/powerpc/platforms/pseries/lpar.c     | 54 +++++++++++------------
 2 files changed, 44 insertions(+), 27 deletions(-)

diff --git a/arch/powerpc/include/asm/plpar_wrappers.h b/arch/powerpc/include/asm/plpar_wrappers.h
index 67859edbf8fd..1b394247afc2 100644
--- a/arch/powerpc/include/asm/plpar_wrappers.h
+++ b/arch/powerpc/include/asm/plpar_wrappers.h
@@ -201,6 +201,23 @@ static inline long plpar_pte_read_raw(unsigned long flags, unsigned long ptex,
 	return rc;
 }
 
+/*
+ * ptes must be 8*sizeof(unsigned long)
+ */
+static inline long plpar_pte_read_4(unsigned long flags, unsigned long ptex,
+				    unsigned long *ptes)
+
+{
+	long rc;
+	unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
+
+	rc = plpar_hcall9(H_READ, retbuf, flags | H_READ_4, ptex);
+
+	memcpy(ptes, retbuf, 8*sizeof(unsigned long));
+
+	return rc;
+}
+
 /*
  * plpar_pte_read_4_raw can be called in real mode.
  * ptes must be 8*sizeof(unsigned long)
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index 6d46547871aa..477290ad855e 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -315,48 +315,48 @@ static long pSeries_lpar_hpte_updatepp(unsigned long slot,
 	return 0;
 }
 
-static unsigned long pSeries_lpar_hpte_getword0(unsigned long slot)
+static long __pSeries_lpar_hpte_find(unsigned long want_v, unsigned long hpte_group)
 {
-	unsigned long dword0;
-	unsigned long lpar_rc;
-	unsigned long dummy_word1;
-	unsigned long flags;
+	long lpar_rc;
+	unsigned long i, j;
+	struct {
+		unsigned long pteh;
+		unsigned long ptel;
+	} ptes[4];
 
-	/* Read 1 pte at a time                        */
-	/* Do not need RPN to logical page translation */
-	/* No cross CEC PFT access                     */
-	flags = 0;
+	for (i = 0; i < HPTES_PER_GROUP; i += 4, hpte_group += 4) {
 
-	lpar_rc = plpar_pte_read(flags, slot, &dword0, &dummy_word1);
+		lpar_rc = plpar_pte_read_4(0, hpte_group, (void *)ptes);
+		if (lpar_rc != H_SUCCESS)
+			continue;
 
-	BUG_ON(lpar_rc != H_SUCCESS);
+		for (j = 0; j < 4; j++) {
+			if (HPTE_V_COMPARE(ptes[j].pteh, want_v) &&
+			    (ptes[j].pteh & HPTE_V_VALID))
+				return i + j;
+		}
+	}
 
-	return dword0;
+	return -1;
 }
 
 static long pSeries_lpar_hpte_find(unsigned long vpn, int psize, int ssize)
 {
-	unsigned long hash;
-	unsigned long i;
 	long slot;
-	unsigned long want_v, hpte_v;
+	unsigned long hash;
+	unsigned long want_v;
+	unsigned long hpte_group;
 
 	hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize);
 	want_v = hpte_encode_avpn(vpn, psize, ssize);
 
 	/* Bolted entries are always in the primary group */
-	slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-	for (i = 0; i < HPTES_PER_GROUP; i++) {
-		hpte_v = pSeries_lpar_hpte_getword0(slot);
-
-		if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID))
-			/* HPTE matches */
-			return slot;
-		++slot;
-	}
-
-	return -1;
-} 
+	hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+	slot = __pSeries_lpar_hpte_find(want_v, hpte_group);
+	if (slot < 0)
+		return -1;
+	return hpte_group + slot;
+}
 
 static void pSeries_lpar_hpte_updateboltedpp(unsigned long newpp,
 					     unsigned long ea,

From 4dcbd88eb600d52ce52a75c5075c2eff2f6849e6 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 1 Dec 2015 09:07:00 +0530
Subject: [PATCH 064/149] powerpc/mm: Don't open code pgtable_t size

The slot information of base page size hash pte is stored in the
pgtable_t w.r.t transparent hugepage. We need to make sure we don't
index beyond pgtable_t size.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/hugepage-hash64.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c
index 1f666de0110a..baf1301ded0c 100644
--- a/arch/powerpc/mm/hugepage-hash64.c
+++ b/arch/powerpc/mm/hugepage-hash64.c
@@ -71,7 +71,7 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
 	 */
 	shift = mmu_psize_defs[psize].shift;
 	index = (ea & ~HPAGE_PMD_MASK) >> shift;
-	BUG_ON(index >= 4096);
+	BUG_ON(index >= PTE_FRAG_SIZE);
 
 	vpn = hpt_vpn(ea, vsid, ssize);
 	hpte_slot_array = get_hpte_slot_array(pmdp);

From 49e9cf3f0c04bf76ffa59242254110309554861d Mon Sep 17 00:00:00 2001
From: Boqun Feng <boqun.feng@gmail.com>
Date: Mon, 2 Nov 2015 09:30:31 +0800
Subject: [PATCH 065/149] powerpc: Make value-returning atomics fully ordered

According to memory-barriers.txt:

> Any atomic operation that modifies some state in memory and returns
> information about the state (old or new) implies an SMP-conditional
> general memory barrier (smp_mb()) on each side of the actual
> operation ...

Which means these operations should be fully ordered. However on PPC,
PPC_ATOMIC_ENTRY_BARRIER is the barrier before the actual operation,
which is currently "lwsync" if SMP=y. The leading "lwsync" can not
guarantee fully ordered atomics, according to Paul McKenney:

https://lkml.org/lkml/2015/10/14/970

To fix this, we define PPC_ATOMIC_ENTRY_BARRIER as "sync" to guarantee
the fully-ordered semantics.

This also makes futex atomics fully ordered, which can avoid possible
memory ordering problems if userspace code relies on futex system call
for fully ordered semantics.

Fixes: b97021f85517 ("powerpc: Fix atomic_xxx_return barrier semantics")
Cc: stable@vger.kernel.org # 3.2+
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/synch.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/synch.h b/arch/powerpc/include/asm/synch.h
index e682a7143edb..c50868681f9e 100644
--- a/arch/powerpc/include/asm/synch.h
+++ b/arch/powerpc/include/asm/synch.h
@@ -44,7 +44,7 @@ static inline void isync(void)
 	MAKE_LWSYNC_SECTION_ENTRY(97, __lwsync_fixup);
 #define PPC_ACQUIRE_BARRIER	 "\n" stringify_in_c(__PPC_ACQUIRE_BARRIER)
 #define PPC_RELEASE_BARRIER	 stringify_in_c(LWSYNC) "\n"
-#define PPC_ATOMIC_ENTRY_BARRIER "\n" stringify_in_c(LWSYNC) "\n"
+#define PPC_ATOMIC_ENTRY_BARRIER "\n" stringify_in_c(sync) "\n"
 #define PPC_ATOMIC_EXIT_BARRIER	 "\n" stringify_in_c(sync) "\n"
 #else
 #define PPC_ACQUIRE_BARRIER

From 81d7a3294de7e9828310bbf986a67246b13fa01e Mon Sep 17 00:00:00 2001
From: Boqun Feng <boqun.feng@gmail.com>
Date: Mon, 2 Nov 2015 09:30:32 +0800
Subject: [PATCH 066/149] powerpc: Make {cmp}xchg* and their atomic_ versions
 fully ordered

According to memory-barriers.txt, xchg*, cmpxchg* and their atomic_
versions all need to be fully ordered, however they are now just
RELEASE+ACQUIRE, which are not fully ordered.

So also replace PPC_RELEASE_BARRIER and PPC_ACQUIRE_BARRIER with
PPC_ATOMIC_ENTRY_BARRIER and PPC_ATOMIC_EXIT_BARRIER in
__{cmp,}xchg_{u32,u64} respectively to guarantee fully ordered semantics
of atomic{,64}_{cmp,}xchg() and {cmp,}xchg(), as a complement of commit
b97021f85517 ("powerpc: Fix atomic_xxx_return barrier semantics")

This patch depends on patch "powerpc: Make value-returning atomics fully
ordered" for PPC_ATOMIC_ENTRY_BARRIER definition.

Cc: stable@vger.kernel.org # 3.2+
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/cmpxchg.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h
index ad6263cffb0f..d1a8d93cccfd 100644
--- a/arch/powerpc/include/asm/cmpxchg.h
+++ b/arch/powerpc/include/asm/cmpxchg.h
@@ -18,12 +18,12 @@ __xchg_u32(volatile void *p, unsigned long val)
 	unsigned long prev;
 
 	__asm__ __volatile__(
-	PPC_RELEASE_BARRIER
+	PPC_ATOMIC_ENTRY_BARRIER
 "1:	lwarx	%0,0,%2 \n"
 	PPC405_ERR77(0,%2)
 "	stwcx.	%3,0,%2 \n\
 	bne-	1b"
-	PPC_ACQUIRE_BARRIER
+	PPC_ATOMIC_EXIT_BARRIER
 	: "=&r" (prev), "+m" (*(volatile unsigned int *)p)
 	: "r" (p), "r" (val)
 	: "cc", "memory");
@@ -61,12 +61,12 @@ __xchg_u64(volatile void *p, unsigned long val)
 	unsigned long prev;
 
 	__asm__ __volatile__(
-	PPC_RELEASE_BARRIER
+	PPC_ATOMIC_ENTRY_BARRIER
 "1:	ldarx	%0,0,%2 \n"
 	PPC405_ERR77(0,%2)
 "	stdcx.	%3,0,%2 \n\
 	bne-	1b"
-	PPC_ACQUIRE_BARRIER
+	PPC_ATOMIC_EXIT_BARRIER
 	: "=&r" (prev), "+m" (*(volatile unsigned long *)p)
 	: "r" (p), "r" (val)
 	: "cc", "memory");
@@ -151,14 +151,14 @@ __cmpxchg_u32(volatile unsigned int *p, unsigned long old, unsigned long new)
 	unsigned int prev;
 
 	__asm__ __volatile__ (
-	PPC_RELEASE_BARRIER
+	PPC_ATOMIC_ENTRY_BARRIER
 "1:	lwarx	%0,0,%2		# __cmpxchg_u32\n\
 	cmpw	0,%0,%3\n\
 	bne-	2f\n"
 	PPC405_ERR77(0,%2)
 "	stwcx.	%4,0,%2\n\
 	bne-	1b"
-	PPC_ACQUIRE_BARRIER
+	PPC_ATOMIC_EXIT_BARRIER
 	"\n\
 2:"
 	: "=&r" (prev), "+m" (*p)
@@ -197,13 +197,13 @@ __cmpxchg_u64(volatile unsigned long *p, unsigned long old, unsigned long new)
 	unsigned long prev;
 
 	__asm__ __volatile__ (
-	PPC_RELEASE_BARRIER
+	PPC_ATOMIC_ENTRY_BARRIER
 "1:	ldarx	%0,0,%2		# __cmpxchg_u64\n\
 	cmpd	0,%0,%3\n\
 	bne-	2f\n\
 	stdcx.	%4,0,%2\n\
 	bne-	1b"
-	PPC_ACQUIRE_BARRIER
+	PPC_ATOMIC_EXIT_BARRIER
 	"\n\
 2:"
 	: "=&r" (prev), "+m" (*p)

From 801c0b2c4db3a33d56b3e19240df7b897e5bbfbc Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Fri, 20 Nov 2015 15:15:32 +1100
Subject: [PATCH 067/149] powerpc: Print MSR TM bits in oops messages

Print MSR TM bits in oops messages.  This appends them to the end
like this:

    MSR: 8000000502823031 <SF,VEC,VSX,FP,ME,IR,DR,LE,TM[TE]>

You get the TM[] only if at least one TM MSR bit is set.  Inside the
TM[], E means Enabled (bit 32), S means Suspended (bit 33), and T
means Transactional (bit 34)

If no bits are set, you get no TM[] output.

Include rework of printbits() to handle this case.

Signed-off-by: Michael Neuling <mikey@neuling.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/process.c | 51 +++++++++++++++++++++++++++++------
 1 file changed, 43 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 6f76f25c3ee8..ab9373bfabda 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1033,10 +1033,12 @@ static void show_instructions(struct pt_regs *regs)
 	printk("\n");
 }
 
-static struct regbit {
+struct regbit {
 	unsigned long bit;
 	const char *name;
-} msr_bits[] = {
+};
+
+static struct regbit msr_bits[] = {
 #if defined(CONFIG_PPC64) && !defined(CONFIG_BOOKE)
 	{MSR_SF,	"SF"},
 	{MSR_HV,	"HV"},
@@ -1066,16 +1068,49 @@ static struct regbit {
 	{0,		NULL}
 };
 
-static void printbits(unsigned long val, struct regbit *bits)
+static void print_bits(unsigned long val, struct regbit *bits, const char *sep)
 {
-	const char *sep = "";
+	const char *s = "";
 
-	printk("<");
 	for (; bits->bit; ++bits)
 		if (val & bits->bit) {
-			printk("%s%s", sep, bits->name);
-			sep = ",";
+			printk("%s%s", s, bits->name);
+			s = sep;
 		}
+}
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+static struct regbit msr_tm_bits[] = {
+	{MSR_TS_T,	"T"},
+	{MSR_TS_S,	"S"},
+	{MSR_TM,	"E"},
+	{0,		NULL}
+};
+
+static void print_tm_bits(unsigned long val)
+{
+/*
+ * This only prints something if at least one of the TM bits is set.
+ * Inside the TM[], the output means:
+ *   E: Enabled		(bit 32)
+ *   S: Suspended	(bit 33)
+ *   T: Transactional	(bit 34)
+ */
+	if (val & (MSR_TM | MSR_TS_S | MSR_TS_T)) {
+		printk(",TM[");
+		print_bits(val, msr_tm_bits, "");
+		printk("]");
+	}
+}
+#else
+static void print_tm_bits(unsigned long val) {}
+#endif
+
+static void print_msr_bits(unsigned long val)
+{
+	printk("<");
+	print_bits(val, msr_bits, ",");
+	print_tm_bits(val);
 	printk(">");
 }
 
@@ -1100,7 +1135,7 @@ void show_regs(struct pt_regs * regs)
 	printk("REGS: %p TRAP: %04lx   %s  (%s)\n",
 	       regs, regs->trap, print_tainted(), init_utsname()->release);
 	printk("MSR: "REG" ", regs->msr);
-	printbits(regs->msr, msr_bits);
+	print_msr_bits(regs->msr);
 	printk("  CR: %08lx  XER: %08lx\n", regs->ccr, regs->xer);
 	trap = TRAP(regs);
 	if ((regs->trap != 0xc00) && cpu_has_feature(CPU_FTR_CFAR))

From fcb45ec074725baeb3aaa1b1854b9f44c3eebacf Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Tue, 24 Nov 2015 13:05:38 +1100
Subject: [PATCH 068/149] selftests/powerpc: Move get_auxv_entry() into utils.c

This doesn't really belong in harness.c, it's a helper function. So move
it into utils.c.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 tools/testing/selftests/powerpc/harness.c     | 43 --------------
 tools/testing/selftests/powerpc/pmu/Makefile  |  2 +
 .../selftests/powerpc/pmu/ebb/Makefile        |  3 +-
 tools/testing/selftests/powerpc/tm/Makefile   |  2 +-
 tools/testing/selftests/powerpc/utils.c       | 58 +++++++++++++++++++
 5 files changed, 63 insertions(+), 45 deletions(-)
 create mode 100644 tools/testing/selftests/powerpc/utils.c

diff --git a/tools/testing/selftests/powerpc/harness.c b/tools/testing/selftests/powerpc/harness.c
index f7997affd143..52f9be7f61f0 100644
--- a/tools/testing/selftests/powerpc/harness.c
+++ b/tools/testing/selftests/powerpc/harness.c
@@ -116,46 +116,3 @@ int test_harness(int (test_function)(void), char *name)
 
 	return rc;
 }
-
-static char auxv[4096];
-
-void *get_auxv_entry(int type)
-{
-	ElfW(auxv_t) *p;
-	void *result;
-	ssize_t num;
-	int fd;
-
-	fd = open("/proc/self/auxv", O_RDONLY);
-	if (fd == -1) {
-		perror("open");
-		return NULL;
-	}
-
-	result = NULL;
-
-	num = read(fd, auxv, sizeof(auxv));
-	if (num < 0) {
-		perror("read");
-		goto out;
-	}
-
-	if (num > sizeof(auxv)) {
-		printf("Overflowed auxv buffer\n");
-		goto out;
-	}
-
-	p = (ElfW(auxv_t) *)auxv;
-
-	while (p->a_type != AT_NULL) {
-		if (p->a_type == type) {
-			result = (void *)p->a_un.a_val;
-			break;
-		}
-
-		p++;
-	}
-out:
-	close(fd);
-	return result;
-}
diff --git a/tools/testing/selftests/powerpc/pmu/Makefile b/tools/testing/selftests/powerpc/pmu/Makefile
index a9099d9f8f39..50326cbb372d 100644
--- a/tools/testing/selftests/powerpc/pmu/Makefile
+++ b/tools/testing/selftests/powerpc/pmu/Makefile
@@ -12,6 +12,8 @@ $(TEST_PROGS): $(EXTRA_SOURCES)
 count_instructions: loop.S count_instructions.c $(EXTRA_SOURCES)
 	$(CC) $(CFLAGS) -m64 -o $@ $^
 
+per_event_excludes: ../utils.c
+
 include ../../lib.mk
 
 DEFAULT_RUN_TESTS := $(RUN_TESTS)
diff --git a/tools/testing/selftests/powerpc/pmu/ebb/Makefile b/tools/testing/selftests/powerpc/pmu/ebb/Makefile
index 5cdc9dbf2b27..8d2279c4bb4b 100644
--- a/tools/testing/selftests/powerpc/pmu/ebb/Makefile
+++ b/tools/testing/selftests/powerpc/pmu/ebb/Makefile
@@ -18,7 +18,8 @@ TEST_PROGS := reg_access_test event_attributes_test cycles_test	\
 
 all: $(TEST_PROGS)
 
-$(TEST_PROGS): ../../harness.c ../event.c ../lib.c ebb.c ebb_handler.S trace.c busy_loop.S
+$(TEST_PROGS): ../../harness.c ../../utils.c ../event.c ../lib.c \
+	       ebb.c ebb_handler.S trace.c busy_loop.S
 
 instruction_count_test: ../loop.S
 
diff --git a/tools/testing/selftests/powerpc/tm/Makefile b/tools/testing/selftests/powerpc/tm/Makefile
index 4bea62a319dc..e7b9be7947c8 100644
--- a/tools/testing/selftests/powerpc/tm/Makefile
+++ b/tools/testing/selftests/powerpc/tm/Makefile
@@ -4,7 +4,7 @@ all: $(TEST_PROGS)
 
 $(TEST_PROGS): ../harness.c
 
-tm-syscall: tm-syscall-asm.S
+tm-syscall: tm-syscall-asm.S ../utils.c
 tm-syscall: CFLAGS += -mhtm -I../../../../../usr/include
 
 include ../../lib.mk
diff --git a/tools/testing/selftests/powerpc/utils.c b/tools/testing/selftests/powerpc/utils.c
new file mode 100644
index 000000000000..536113add380
--- /dev/null
+++ b/tools/testing/selftests/powerpc/utils.c
@@ -0,0 +1,58 @@
+/*
+ * Copyright 2013-2015, Michael Ellerman, IBM Corp.
+ * Licensed under GPLv2.
+ */
+
+#include <elf.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <link.h>
+#include <stdio.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "utils.h"
+
+static char auxv[4096];
+
+void *get_auxv_entry(int type)
+{
+	ElfW(auxv_t) *p;
+	void *result;
+	ssize_t num;
+	int fd;
+
+	fd = open("/proc/self/auxv", O_RDONLY);
+	if (fd == -1) {
+		perror("open");
+		return NULL;
+	}
+
+	result = NULL;
+
+	num = read(fd, auxv, sizeof(auxv));
+	if (num < 0) {
+		perror("read");
+		goto out;
+	}
+
+	if (num > sizeof(auxv)) {
+		printf("Overflowed auxv buffer\n");
+		goto out;
+	}
+
+	p = (ElfW(auxv_t) *)auxv;
+
+	while (p->a_type != AT_NULL) {
+		if (p->a_type == type) {
+			result = (void *)p->a_un.a_val;
+			break;
+		}
+
+		p++;
+	}
+out:
+	close(fd);
+	return result;
+}

From ede8ef3f824ea6e853a5e4b27467f583cdaa314e Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Tue, 24 Nov 2015 13:05:39 +1100
Subject: [PATCH 069/149] selftests/powerpc: Add have_hwcap2() helper

We already do this twice and want to add another so add a helper.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 tools/testing/selftests/powerpc/pmu/ebb/ebb.c   | 3 +--
 tools/testing/selftests/powerpc/tm/tm-syscall.c | 3 +--
 tools/testing/selftests/powerpc/utils.h         | 6 ++++++
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/tools/testing/selftests/powerpc/pmu/ebb/ebb.c b/tools/testing/selftests/powerpc/pmu/ebb/ebb.c
index 9729d9f90218..e67452f1bcff 100644
--- a/tools/testing/selftests/powerpc/pmu/ebb/ebb.c
+++ b/tools/testing/selftests/powerpc/pmu/ebb/ebb.c
@@ -13,7 +13,6 @@
 #include <stdlib.h>
 #include <string.h>
 #include <sys/ioctl.h>
-#include <linux/auxvec.h>
 
 #include "trace.h"
 #include "reg.h"
@@ -324,7 +323,7 @@ bool ebb_is_supported(void)
 {
 #ifdef PPC_FEATURE2_EBB
 	/* EBB requires at least POWER8 */
-	return ((long)get_auxv_entry(AT_HWCAP2) & PPC_FEATURE2_EBB);
+	return have_hwcap2(PPC_FEATURE2_EBB);
 #else
 	return false;
 #endif
diff --git a/tools/testing/selftests/powerpc/tm/tm-syscall.c b/tools/testing/selftests/powerpc/tm/tm-syscall.c
index e835bf7ec7ae..d7256b79ec4c 100644
--- a/tools/testing/selftests/powerpc/tm/tm-syscall.c
+++ b/tools/testing/selftests/powerpc/tm/tm-syscall.c
@@ -14,7 +14,6 @@
 #include <sys/syscall.h>
 #include <asm/tm.h>
 #include <asm/cputable.h>
-#include <linux/auxvec.h>
 #include <sys/time.h>
 #include <stdlib.h>
 
@@ -80,7 +79,7 @@ pid_t getppid_tm(bool suspend)
 static inline bool have_htm_nosc(void)
 {
 #ifdef PPC_FEATURE2_HTM_NOSC
-	return ((long)get_auxv_entry(AT_HWCAP2) & PPC_FEATURE2_HTM_NOSC);
+	return have_hwcap2(PPC_FEATURE2_HTM_NOSC);
 #else
 	printf("PPC_FEATURE2_HTM_NOSC not defined, can't check AT_HWCAP2\n");
 	return false;
diff --git a/tools/testing/selftests/powerpc/utils.h b/tools/testing/selftests/powerpc/utils.h
index b7d41086bb0a..fbf2bf530e50 100644
--- a/tools/testing/selftests/powerpc/utils.h
+++ b/tools/testing/selftests/powerpc/utils.h
@@ -8,6 +8,7 @@
 
 #include <stdint.h>
 #include <stdbool.h>
+#include <linux/auxvec.h>
 
 /* Avoid headaches with PRI?64 - just use %ll? always */
 typedef unsigned long long u64;
@@ -22,6 +23,11 @@ typedef uint8_t u8;
 int test_harness(int (test_function)(void), char *name);
 extern void *get_auxv_entry(int type);
 
+static inline bool have_hwcap2(unsigned long ftr2)
+{
+	return ((unsigned long)get_auxv_entry(AT_HWCAP2) & ftr2) == ftr2;
+}
+
 /* Yes, this is evil */
 #define FAIL_IF(x)						\
 do {								\

From 34dc8b279dc5dd3ce863298056989bdd7f4979c8 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Tue, 24 Nov 2015 13:05:40 +1100
Subject: [PATCH 070/149] selftests/powerpc: Move TM helpers into tm.h

Move have_htm_nosc() into a new tm.h, and add a new helper, have_htm()
which we'll use in the next patch.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 .../testing/selftests/powerpc/tm/tm-syscall.c | 12 +------
 tools/testing/selftests/powerpc/tm/tm.h       | 34 +++++++++++++++++++
 2 files changed, 35 insertions(+), 11 deletions(-)
 create mode 100644 tools/testing/selftests/powerpc/tm/tm.h

diff --git a/tools/testing/selftests/powerpc/tm/tm-syscall.c b/tools/testing/selftests/powerpc/tm/tm-syscall.c
index d7256b79ec4c..60560cb20e38 100644
--- a/tools/testing/selftests/powerpc/tm/tm-syscall.c
+++ b/tools/testing/selftests/powerpc/tm/tm-syscall.c
@@ -13,11 +13,11 @@
 #include <unistd.h>
 #include <sys/syscall.h>
 #include <asm/tm.h>
-#include <asm/cputable.h>
 #include <sys/time.h>
 #include <stdlib.h>
 
 #include "utils.h"
+#include "tm.h"
 
 extern int getppid_tm_active(void);
 extern int getppid_tm_suspended(void);
@@ -76,16 +76,6 @@ pid_t getppid_tm(bool suspend)
 	exit(-1);
 }
 
-static inline bool have_htm_nosc(void)
-{
-#ifdef PPC_FEATURE2_HTM_NOSC
-	return have_hwcap2(PPC_FEATURE2_HTM_NOSC);
-#else
-	printf("PPC_FEATURE2_HTM_NOSC not defined, can't check AT_HWCAP2\n");
-	return false;
-#endif
-}
-
 int tm_syscall(void)
 {
 	unsigned count = 0;
diff --git a/tools/testing/selftests/powerpc/tm/tm.h b/tools/testing/selftests/powerpc/tm/tm.h
new file mode 100644
index 000000000000..24144b25772c
--- /dev/null
+++ b/tools/testing/selftests/powerpc/tm/tm.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright 2015, Michael Ellerman, IBM Corp.
+ * Licensed under GPLv2.
+ */
+
+#ifndef _SELFTESTS_POWERPC_TM_TM_H
+#define _SELFTESTS_POWERPC_TM_TM_H
+
+#include <stdbool.h>
+#include <asm/cputable.h>
+
+#include "../utils.h"
+
+static inline bool have_htm(void)
+{
+#ifdef PPC_FEATURE2_HTM
+	return have_hwcap2(PPC_FEATURE2_HTM);
+#else
+	printf("PPC_FEATURE2_HTM not defined, can't check AT_HWCAP2\n");
+	return false;
+#endif
+}
+
+static inline bool have_htm_nosc(void)
+{
+#ifdef PPC_FEATURE2_HTM_NOSC
+	return have_hwcap2(PPC_FEATURE2_HTM_NOSC);
+#else
+	printf("PPC_FEATURE2_HTM_NOSC not defined, can't check AT_HWCAP2\n");
+	return false;
+#endif
+}
+
+#endif /* _SELFTESTS_POWERPC_TM_TM_H */

From b319ee8445961c5f7b2fd199c0ef99c418ee2d4a Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Wed, 2 Dec 2015 16:00:04 +1100
Subject: [PATCH 071/149] selftests/powerpc: Skip tm-resched-dscr if we don't
 have TM

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 tools/testing/selftests/powerpc/tm/Makefile          | 4 ++--
 tools/testing/selftests/powerpc/tm/tm-resched-dscr.c | 3 +++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/powerpc/tm/Makefile b/tools/testing/selftests/powerpc/tm/Makefile
index e7b9be7947c8..63b55d01da35 100644
--- a/tools/testing/selftests/powerpc/tm/Makefile
+++ b/tools/testing/selftests/powerpc/tm/Makefile
@@ -2,9 +2,9 @@ TEST_PROGS := tm-resched-dscr tm-syscall
 
 all: $(TEST_PROGS)
 
-$(TEST_PROGS): ../harness.c
+$(TEST_PROGS): ../harness.c ../utils.c
 
-tm-syscall: tm-syscall-asm.S ../utils.c
+tm-syscall: tm-syscall-asm.S
 tm-syscall: CFLAGS += -mhtm -I../../../../../usr/include
 
 include ../../lib.mk
diff --git a/tools/testing/selftests/powerpc/tm/tm-resched-dscr.c b/tools/testing/selftests/powerpc/tm/tm-resched-dscr.c
index 42d4c8caad81..8fde93d6021f 100644
--- a/tools/testing/selftests/powerpc/tm/tm-resched-dscr.c
+++ b/tools/testing/selftests/powerpc/tm/tm-resched-dscr.c
@@ -29,6 +29,7 @@
 #include <asm/tm.h>
 
 #include "utils.h"
+#include "tm.h"
 
 #define TBEGIN          ".long 0x7C00051D ;"
 #define TEND            ".long 0x7C00055D ;"
@@ -42,6 +43,8 @@ int test_body(void)
 {
 	uint64_t rv, dscr1 = 1, dscr2, texasr;
 
+	SKIP_IF(!have_htm());
+
 	printf("Check DSCR TM context switch: ");
 	fflush(stdout);
 	for (;;) {

From 25007a69e852389985ee98235e76d740d4821c6c Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Fri, 20 Nov 2015 15:15:33 +1100
Subject: [PATCH 072/149] selftests/powerpc: Add TM signal return test

Test the kernel's signal return code to ensure that it doesn't crash
when both the transactional and suspend MSR bits are set in the signal
context.

Signed-off-by: Michael Neuling <mikey@neuling.org>
Tested-by: Anshuman Khandual <khandual@linux.vnet.ibm.com>
[mpe: Skip if we don't have TM]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 tools/testing/selftests/powerpc/tm/.gitignore |  1 +
 tools/testing/selftests/powerpc/tm/Makefile   |  2 +-
 .../selftests/powerpc/tm/tm-signal-msr-resv.c | 74 +++++++++++++++++++
 3 files changed, 76 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/powerpc/tm/tm-signal-msr-resv.c

diff --git a/tools/testing/selftests/powerpc/tm/.gitignore b/tools/testing/selftests/powerpc/tm/.gitignore
index 2699635d2cd9..61c318fdace4 100644
--- a/tools/testing/selftests/powerpc/tm/.gitignore
+++ b/tools/testing/selftests/powerpc/tm/.gitignore
@@ -1,2 +1,3 @@
 tm-resched-dscr
 tm-syscall
+tm-signal-msr-resv
diff --git a/tools/testing/selftests/powerpc/tm/Makefile b/tools/testing/selftests/powerpc/tm/Makefile
index 63b55d01da35..c6b4ca8b2812 100644
--- a/tools/testing/selftests/powerpc/tm/Makefile
+++ b/tools/testing/selftests/powerpc/tm/Makefile
@@ -1,4 +1,4 @@
-TEST_PROGS := tm-resched-dscr tm-syscall
+TEST_PROGS := tm-resched-dscr tm-syscall tm-signal-msr-resv
 
 all: $(TEST_PROGS)
 
diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-msr-resv.c b/tools/testing/selftests/powerpc/tm/tm-signal-msr-resv.c
new file mode 100644
index 000000000000..d86653f282b1
--- /dev/null
+++ b/tools/testing/selftests/powerpc/tm/tm-signal-msr-resv.c
@@ -0,0 +1,74 @@
+/*
+ * Copyright 2015, Michael Neuling, IBM Corp.
+ * Licensed under GPLv2.
+ *
+ * Test the kernel's signal return code to ensure that it doesn't
+ * crash when both the transactional and suspend MSR bits are set in
+ * the signal context.
+ *
+ * For this test, we send ourselves a SIGUSR1.  In the SIGUSR1 handler
+ * we modify the signal context to set both MSR TM S and T bits (which
+ * is "reserved" by the PowerISA). When we return from the signal
+ * handler (implicit sigreturn), the kernel should detect the reserved
+ * MSR value and send us a SIGSEGV.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <signal.h>
+#include <unistd.h>
+
+#include "utils.h"
+#include "tm.h"
+
+int segv_expected = 0;
+
+void signal_segv(int signum)
+{
+	if (segv_expected && (signum == SIGSEGV))
+		_exit(0);
+	_exit(1);
+}
+
+void signal_usr1(int signum, siginfo_t *info, void *uc)
+{
+	ucontext_t *ucp = uc;
+
+	/* Link tm checkpointed context to normal context */
+	ucp->uc_link = ucp;
+	/* Set all TM bits so that the context is now invalid */
+#ifdef __powerpc64__
+	ucp->uc_mcontext.gp_regs[PT_MSR] |= (7ULL << 32);
+#else
+	ucp->uc_mcontext.regs->gpr[PT_MSR] |= (7ULL);
+#endif
+	/* Should segv on return because of invalid context; arm signal_segv() */
+	segv_expected = 1;
+}
+
+int tm_signal_msr_resv()
+{
+	struct sigaction act;
+
+	SKIP_IF(!have_htm());
+
+	act.sa_sigaction = signal_usr1;
+	sigemptyset(&act.sa_mask);
+	act.sa_flags = SA_SIGINFO;
+	if (sigaction(SIGUSR1, &act, NULL) < 0) {
+		perror("sigaction sigusr1");
+		exit(1);
+	}
+	if (signal(SIGSEGV, signal_segv) == SIG_ERR)
+		exit(1);
+
+	raise(SIGUSR1);
+
+	/* We shouldn't get here as we exit in the segv handler */
+	return 1;
+}
+
+int main(void)
+{
+	return test_harness(tm_signal_msr_resv, "tm_signal_msr_resv");
+}

From a26f415bf71640f0141e5e946384444675206b6a Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Fri, 20 Nov 2015 15:15:34 +1100
Subject: [PATCH 073/149] selftests/powerpc: Add TM signal with invalid stack
 test

Test the kernel's signal generation code to ensure it can handle an
invalid stack pointer when transactional.

Signed-off-by: Michael Neuling <mikey@neuling.org>
Tested-by: Anshuman Khandual <khandual@linux.vnet.ibm.com>
[mpe: Skip if we don't have TM]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 tools/testing/selftests/powerpc/tm/.gitignore |  1 +
 tools/testing/selftests/powerpc/tm/Makefile   |  2 +-
 .../selftests/powerpc/tm/tm-signal-stack.c    | 76 +++++++++++++++++++
 3 files changed, 78 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/powerpc/tm/tm-signal-stack.c

diff --git a/tools/testing/selftests/powerpc/tm/.gitignore b/tools/testing/selftests/powerpc/tm/.gitignore
index 61c318fdace4..e6668217ccd0 100644
--- a/tools/testing/selftests/powerpc/tm/.gitignore
+++ b/tools/testing/selftests/powerpc/tm/.gitignore
@@ -1,3 +1,4 @@
 tm-resched-dscr
 tm-syscall
 tm-signal-msr-resv
+tm-signal-stack
diff --git a/tools/testing/selftests/powerpc/tm/Makefile b/tools/testing/selftests/powerpc/tm/Makefile
index c6b4ca8b2812..e7ceff809fa0 100644
--- a/tools/testing/selftests/powerpc/tm/Makefile
+++ b/tools/testing/selftests/powerpc/tm/Makefile
@@ -1,4 +1,4 @@
-TEST_PROGS := tm-resched-dscr tm-syscall tm-signal-msr-resv
+TEST_PROGS := tm-resched-dscr tm-syscall tm-signal-msr-resv tm-signal-stack
 
 all: $(TEST_PROGS)
 
diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-stack.c b/tools/testing/selftests/powerpc/tm/tm-signal-stack.c
new file mode 100644
index 000000000000..e44a238c1d77
--- /dev/null
+++ b/tools/testing/selftests/powerpc/tm/tm-signal-stack.c
@@ -0,0 +1,76 @@
+/*
+ * Copyright 2015, Michael Neuling, IBM Corp.
+ * Licensed under GPLv2.
+ *
+ * Test the kernel's signal delivery code to ensure that we don't
+ * treclaim twice in the kernel signal delivery code.  This can happen
+ * if we trigger a signal when in a transaction and the stack pointer
+ * is bogus.
+ *
+ * This test case registers a SEGV handler, sets the stack pointer
+ * (r1) to NULL, starts a transaction and then generates a SEGV.  The
+ * SEGV should be handled but we exit here as the stack pointer is
+ * invalid and hence we can't sigreturn.  We only need to check that
+ * this flow doesn't crash the kernel.
+ */
+
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <signal.h>
+
+#include "utils.h"
+#include "tm.h"
+
+void signal_segv(int signum)
+{
+	/* This should never actually run since stack is foobar */
+	exit(1);
+}
+
+int tm_signal_stack()
+{
+	int pid;
+
+	SKIP_IF(!have_htm());
+
+	pid = fork();
+	if (pid < 0)
+		exit(1);
+
+	if (pid) { /* Parent */
+		/*
+		 * It's likely the whole machine will crash here so if
+		 * the child ever exits, we are good.
+		 */
+		wait(NULL);
+		return 0;
+	}
+
+	/*
+	 * The flow here is:
+	 * 1) register a signal handler (so signal delivery occurs)
+	 * 2) make stack pointer (r1) = NULL
+	 * 3) start transaction
+	 * 4) cause segv
+	 */
+	if (signal(SIGSEGV, signal_segv) == SIG_ERR)
+		exit(1);
+	asm volatile("li 1, 0 ;"		/* stack ptr == NULL */
+		     "1:"
+		     ".long 0x7C00051D ;"	/* tbegin */
+		     "beq 1b ;"			/* retry forever */
+		     ".long 0x7C0005DD ; ;"	/* tsuspend */
+		     "ld 2, 0(1) ;"		/* trigger segv */
+		     : : : "memory");
+
+	/* This should never get here due to above segv */
+	return 1;
+}
+
+int main(void)
+{
+	return test_harness(tm_signal_stack, "tm_signal_stack");
+}

From b4af279a7cba5cc1f665485e8ecdf272f1ba0cc5 Mon Sep 17 00:00:00 2001
From: Vipin K Parashar <vipin@linux.vnet.ibm.com>
Date: Tue, 1 Dec 2015 16:43:42 +0530
Subject: [PATCH 074/149] powerpc/pseries: Limit EPOW reset event warnings

Kernel prints respective warnings about various EPOW events for
user information/action after parsing EPOW interrupts. At times
below EPOW reset event warning is seen to be flooding kernel log
over a period of time.

May 25 03:46:34 alp kernel: Non critical power or cooling issue cleared
May 25 03:46:52 alp kernel: Non critical power or cooling issue cleared
May 25 03:53:48 alp kernel: Non critical power or cooling issue cleared
May 25 03:55:46 alp kernel: Non critical power or cooling issue cleared
May 25 03:56:34 alp kernel: Non critical power or cooling issue cleared
May 25 03:59:04 alp kernel: Non critical power or cooling issue cleared
May 25 04:02:01 alp kernel: Non critical power or cooling issue cleared

These EPOW reset events are spurious in nature and are triggered by
firmware without an actual EPOW event being reset. This patch avoids these
multiple EPOW reset warnings by using a counter variable. This variable
is incremented every time an EPOW event is reported. Upon receiving an EPOW
reset event the same variable is checked to filter out spurious events and
decremented accordingly.

This patch also improves log messages to better describe EPOW event being
reported. Merged adjacent log messages into single one to reduce number of
lines printed per event.

Signed-off-by: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>
Signed-off-by: Vipin K Parashar <vipin@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/pseries/ras.c | 55 ++++++++++++++++------------
 1 file changed, 31 insertions(+), 24 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
index 3b6647e574b6..9a3e27b863ce 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -40,6 +40,9 @@ static int ras_check_exception_token;
 #define EPOW_SENSOR_TOKEN	9
 #define EPOW_SENSOR_INDEX	0
 
+/* EPOW events counter variable */
+static int num_epow_events;
+
 static irqreturn_t ras_epow_interrupt(int irq, void *dev_id);
 static irqreturn_t ras_error_interrupt(int irq, void *dev_id);
 
@@ -82,32 +85,30 @@ static void handle_system_shutdown(char event_modifier)
 {
 	switch (event_modifier) {
 	case EPOW_SHUTDOWN_NORMAL:
-		pr_emerg("Firmware initiated power off");
+		pr_emerg("Power off requested\n");
 		orderly_poweroff(true);
 		break;
 
 	case EPOW_SHUTDOWN_ON_UPS:
-		pr_emerg("Loss of power reported by firmware, system is "
-			"running on UPS/battery");
-		pr_emerg("Check RTAS error log for details");
+		pr_emerg("Loss of system power detected. System is running on"
+			 " UPS/battery. Check RTAS error log for details\n");
 		orderly_poweroff(true);
 		break;
 
 	case EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS:
-		pr_emerg("Loss of system critical functions reported by "
-			"firmware");
-		pr_emerg("Check RTAS error log for details");
+		pr_emerg("Loss of system critical functions detected. Check"
+			 " RTAS error log for details\n");
 		orderly_poweroff(true);
 		break;
 
 	case EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH:
-		pr_emerg("Ambient temperature too high reported by firmware");
-		pr_emerg("Check RTAS error log for details");
+		pr_emerg("High ambient temperature detected. Check RTAS"
+			 " error log for details\n");
 		orderly_poweroff(true);
 		break;
 
 	default:
-		pr_err("Unknown power/cooling shutdown event (modifier %d)",
+		pr_err("Unknown power/cooling shutdown event (modifier = %d)\n",
 			event_modifier);
 	}
 }
@@ -145,17 +146,20 @@ static void rtas_parse_epow_errlog(struct rtas_error_log *log)
 
 	switch (action_code) {
 	case EPOW_RESET:
-		pr_err("Non critical power or cooling issue cleared");
+		if (num_epow_events) {
+			pr_info("Non critical power/cooling issue cleared\n");
+			num_epow_events--;
+		}
 		break;
 
 	case EPOW_WARN_COOLING:
-		pr_err("Non critical cooling issue reported by firmware");
-		pr_err("Check RTAS error log for details");
+		pr_info("Non-critical cooling issue detected. Check RTAS error"
+			" log for details\n");
 		break;
 
 	case EPOW_WARN_POWER:
-		pr_err("Non critical power issue reported by firmware");
-		pr_err("Check RTAS error log for details");
+		pr_info("Non-critical power issue detected. Check RTAS error"
+			" log for details\n");
 		break;
 
 	case EPOW_SYSTEM_SHUTDOWN:
@@ -163,23 +167,27 @@ static void rtas_parse_epow_errlog(struct rtas_error_log *log)
 		break;
 
 	case EPOW_SYSTEM_HALT:
-		pr_emerg("Firmware initiated power off");
+		pr_emerg("Critical power/cooling issue detected. Check RTAS"
+			 " error log for details. Powering off.\n");
 		orderly_poweroff(true);
 		break;
 
 	case EPOW_MAIN_ENCLOSURE:
 	case EPOW_POWER_OFF:
-		pr_emerg("Critical power/cooling issue reported by firmware");
-		pr_emerg("Check RTAS error log for details");
-		pr_emerg("Immediate power off");
+		pr_emerg("System about to lose power. Check RTAS error log "
+			 " for details. Powering off immediately.\n");
 		emergency_sync();
 		kernel_power_off();
 		break;
 
 	default:
-		pr_err("Unknown power/cooling event (action code %d)",
+		pr_err("Unknown power/cooling event (action code  = %d)\n",
 			action_code);
 	}
+
+	/* Increment epow events counter variable */
+	if (action_code != EPOW_RESET)
+		num_epow_events++;
 }
 
 /* Handle environmental and power warning (EPOW) interrupts. */
@@ -249,13 +257,12 @@ static irqreturn_t ras_error_interrupt(int irq, void *dev_id)
 	log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, fatal);
 
 	if (fatal) {
-		pr_emerg("Fatal hardware error reported by firmware");
-		pr_emerg("Check RTAS error log for details");
-		pr_emerg("Immediate power off");
+		pr_emerg("Fatal hardware error detected. Check RTAS error"
+			 " log for details. Powering off immediately\n");
 		emergency_sync();
 		kernel_power_off();
 	} else {
-		pr_err("Recoverable hardware error reported by firmware");
+		pr_err("Recoverable hardware error detected\n");
 	}
 
 	spin_unlock(&ras_log_buf_lock);

From 24ad1648edcc8b1c4a68c406296e0b171753a981 Mon Sep 17 00:00:00 2001
From: Rashmica Gupta <rashmicy@gmail.com>
Date: Tue, 1 Dec 2015 14:51:38 +1100
Subject: [PATCH 075/149] powerpc/cell: Remove the Cell QPACE code

All users of QPACE have upgraded to QPACE2 so remove the Cell QPACE code.

Signed-off-by: Rashmica Gupta <rashmicy@gmail.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 .../devicetree/bindings/serial/8250.txt       |   1 -
 arch/powerpc/boot/Makefile                    |   2 -
 arch/powerpc/configs/ppc64_defconfig          |   1 -
 arch/powerpc/platforms/cell/Kconfig           |   5 -
 arch/powerpc/platforms/cell/Makefile          |   4 -
 arch/powerpc/platforms/cell/qpace_setup.c     | 148 ------------------
 6 files changed, 161 deletions(-)
 delete mode 100644 arch/powerpc/platforms/cell/qpace_setup.c

diff --git a/Documentation/devicetree/bindings/serial/8250.txt b/Documentation/devicetree/bindings/serial/8250.txt
index 91d5ab0e60fc..936ab5b87324 100644
--- a/Documentation/devicetree/bindings/serial/8250.txt
+++ b/Documentation/devicetree/bindings/serial/8250.txt
@@ -14,7 +14,6 @@ Required properties:
 	  tegra132, or tegra210.
 	- "nxp,lpc3220-uart"
 	- "ralink,rt2880-uart"
-	- "ibm,qpace-nwp-serial"
 	- "altr,16550-FIFO32"
 	- "altr,16550-FIFO64"
 	- "altr,16550-FIFO128"
diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile
index 99e4487248ff..61165101342c 100644
--- a/arch/powerpc/boot/Makefile
+++ b/arch/powerpc/boot/Makefile
@@ -113,7 +113,6 @@ src-plat-$(CONFIG_EPAPR_BOOT) += epapr.c epapr-wrapper.c
 src-plat-$(CONFIG_PPC_PSERIES) += pseries-head.S
 src-plat-$(CONFIG_PPC_POWERNV) += pseries-head.S
 src-plat-$(CONFIG_PPC_IBM_CELL_BLADE) += pseries-head.S
-src-plat-$(CONFIG_PPC_CELL_QPACE) += pseries-head.S
 
 src-wlib := $(sort $(src-wlib-y))
 src-plat := $(sort $(src-plat-y))
@@ -217,7 +216,6 @@ image-$(CONFIG_PPC_POWERNV)		+= zImage.pseries
 image-$(CONFIG_PPC_MAPLE)		+= zImage.maple
 image-$(CONFIG_PPC_IBM_CELL_BLADE)	+= zImage.pseries
 image-$(CONFIG_PPC_PS3)			+= dtbImage.ps3
-image-$(CONFIG_PPC_CELL_QPACE)		+= zImage.pseries
 image-$(CONFIG_PPC_CHRP)		+= zImage.chrp
 image-$(CONFIG_PPC_EFIKA)		+= zImage.chrp
 image-$(CONFIG_PPC_PMAC)		+= zImage.pmac
diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig
index 2c041b535a64..b041fb607376 100644
--- a/arch/powerpc/configs/ppc64_defconfig
+++ b/arch/powerpc/configs/ppc64_defconfig
@@ -36,7 +36,6 @@ CONFIG_PS3_ROM=m
 CONFIG_PS3_FLASH=m
 CONFIG_PS3_LPM=m
 CONFIG_PPC_IBM_CELL_BLADE=y
-CONFIG_PPC_CELL_QPACE=y
 CONFIG_RTAS_FLASH=m
 CONFIG_IBMEBUS=y
 CONFIG_CPU_FREQ_PMAC64=y
diff --git a/arch/powerpc/platforms/cell/Kconfig b/arch/powerpc/platforms/cell/Kconfig
index 429fc59d2a47..d9088f0b8fcc 100644
--- a/arch/powerpc/platforms/cell/Kconfig
+++ b/arch/powerpc/platforms/cell/Kconfig
@@ -33,11 +33,6 @@ config PPC_IBM_CELL_BLADE
 	select PPC_UDBG_16550
 	select UDBG_RTAS_CONSOLE
 
-config PPC_CELL_QPACE
-	bool "IBM Cell - QPACE"
-	depends on PPC64 && PPC_BOOK3S && CPU_BIG_ENDIAN
-	select PPC_CELL_COMMON
-
 config AXON_MSI
 	bool
 	depends on PPC_IBM_CELL_BLADE && PCI_MSI
diff --git a/arch/powerpc/platforms/cell/Makefile b/arch/powerpc/platforms/cell/Makefile
index 34699bddfddd..00464305763d 100644
--- a/arch/powerpc/platforms/cell/Makefile
+++ b/arch/powerpc/platforms/cell/Makefile
@@ -11,7 +11,6 @@ obj-$(CONFIG_PPC_IBM_CELL_POWERBUTTON)	+= cbe_powerbutton.o
 
 ifeq ($(CONFIG_SMP),y)
 obj-$(CONFIG_PPC_CELL_NATIVE)		+= smp.o
-obj-$(CONFIG_PPC_CELL_QPACE)		+= smp.o
 endif
 
 # needed only when building loadable spufs.ko
@@ -26,6 +25,3 @@ obj-$(CONFIG_SPU_BASE)			+= spu_callbacks.o spu_base.o \
 					   spufs/
 
 obj-$(CONFIG_AXON_MSI)			+= axon_msi.o
-
-# qpace setup
-obj-$(CONFIG_PPC_CELL_QPACE)		+= qpace_setup.o
diff --git a/arch/powerpc/platforms/cell/qpace_setup.c b/arch/powerpc/platforms/cell/qpace_setup.c
deleted file mode 100644
index d328140dc6f5..000000000000
--- a/arch/powerpc/platforms/cell/qpace_setup.c
+++ /dev/null
@@ -1,148 +0,0 @@
-/*
- *  linux/arch/powerpc/platforms/cell/qpace_setup.c
- *
- *  Copyright (C) 1995  Linus Torvalds
- *  Adapted from 'alpha' version by Gary Thomas
- *  Modified by Cort Dougan (cort@cs.nmt.edu)
- *  Modified by PPC64 Team, IBM Corp
- *  Modified by Cell Team, IBM Deutschland Entwicklung GmbH
- *  Modified by Benjamin Krill <ben@codiert.org>, IBM Corp.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/export.h>
-#include <linux/delay.h>
-#include <linux/irq.h>
-#include <linux/console.h>
-#include <linux/of_platform.h>
-
-#include <asm/mmu.h>
-#include <asm/processor.h>
-#include <asm/io.h>
-#include <asm/kexec.h>
-#include <asm/pgtable.h>
-#include <asm/prom.h>
-#include <asm/rtas.h>
-#include <asm/dma.h>
-#include <asm/machdep.h>
-#include <asm/time.h>
-#include <asm/cputable.h>
-#include <asm/irq.h>
-#include <asm/spu.h>
-#include <asm/spu_priv1.h>
-#include <asm/udbg.h>
-#include <asm/cell-regs.h>
-
-#include "interrupt.h"
-#include "pervasive.h"
-#include "ras.h"
-
-static void qpace_show_cpuinfo(struct seq_file *m)
-{
-	struct device_node *root;
-	const char *model = "";
-
-	root = of_find_node_by_path("/");
-	if (root)
-		model = of_get_property(root, "model", NULL);
-	seq_printf(m, "machine\t\t: CHRP %s\n", model);
-	of_node_put(root);
-}
-
-static void qpace_progress(char *s, unsigned short hex)
-{
-	printk("*** %04x : %s\n", hex, s ? s : "");
-}
-
-static const struct of_device_id qpace_bus_ids[] __initconst = {
-	{ .type = "soc", },
-	{ .compatible = "soc", },
-	{ .type = "spider", },
-	{ .type = "axon", },
-	{ .type = "plb5", },
-	{ .type = "plb4", },
-	{ .type = "opb", },
-	{ .type = "ebc", },
-	{},
-};
-
-static int __init qpace_publish_devices(void)
-{
-	int node;
-
-	/* Publish OF platform devices for southbridge IOs */
-	of_platform_bus_probe(NULL, qpace_bus_ids, NULL);
-
-	/* There is no device for the MIC memory controller, thus we create
-	 * a platform device for it to attach the EDAC driver to.
-	 */
-	for_each_online_node(node) {
-		if (cbe_get_cpu_mic_tm_regs(cbe_node_to_cpu(node)) == NULL)
-			continue;
-		platform_device_register_simple("cbe-mic", node, NULL, 0);
-	}
-
-	return 0;
-}
-machine_subsys_initcall(qpace, qpace_publish_devices);
-
-static void __init qpace_setup_arch(void)
-{
-#ifdef CONFIG_SPU_BASE
-	spu_priv1_ops = &spu_priv1_mmio_ops;
-	spu_management_ops = &spu_management_of_ops;
-#endif
-
-	cbe_regs_init();
-
-#ifdef CONFIG_CBE_RAS
-	cbe_ras_init();
-#endif
-
-#ifdef CONFIG_SMP
-	smp_init_cell();
-#endif
-
-	/* init to some ~sane value until calibrate_delay() runs */
-	loops_per_jiffy = 50000000;
-
-	cbe_pervasive_init();
-#ifdef CONFIG_DUMMY_CONSOLE
-	conswitchp = &dummy_con;
-#endif
-}
-
-static int __init qpace_probe(void)
-{
-	unsigned long root = of_get_flat_dt_root();
-
-	if (!of_flat_dt_is_compatible(root, "IBM,QPACE"))
-		return 0;
-
-	hpte_init_native();
-	pm_power_off = rtas_power_off;
-
-	return 1;
-}
-
-define_machine(qpace) {
-	.name			= "QPACE",
-	.probe			= qpace_probe,
-	.setup_arch		= qpace_setup_arch,
-	.show_cpuinfo		= qpace_show_cpuinfo,
-	.restart		= rtas_restart,
-	.halt			= rtas_halt,
-	.get_boot_time		= rtas_get_boot_time,
-	.get_rtc_time		= rtas_get_rtc_time,
-	.set_rtc_time		= rtas_set_rtc_time,
-	.calibrate_decr		= generic_calibrate_decr,
-	.progress		= qpace_progress,
-	.init_IRQ		= iic_init_IRQ,
-};

From eb925d64604991095b6e9476d7c437a994f3369c Mon Sep 17 00:00:00 2001
From: Rashmica Gupta <rashmicy@gmail.com>
Date: Wed, 25 Nov 2015 13:46:25 +1100
Subject: [PATCH 076/149] powerpc/xmon: Append linux_banner to exception
 information in xmon.

Currently if you are in xmon without an oops etc. to view the kernel
version you have to type "d $linux_banner" - not necessarily obvious. As
this is useful information, append to the output of "e" command.

Example output:
  $mon> e
  cpu 0x1: Vector: 0  at [c0000000f879ba80]
      pc: c000000000081718: sysrq_handle_xmon+0x68/0x80
      lr: c000000000081718: sysrq_handle_xmon+0x68/0x80
      sp: c0000000f879bbe0
     msr: 8000000000009033
    current = 0xc0000000f604d5c0
    paca    = 0xc00000000fdc0480	 softe: 0	 irq_happened: 0x01
      pid   = 2467, comm = bash
  Linux version 4.4.0-rc2-00008-gc51af91c3ab3-dirty (rashmica@circle) (gcc
  version 5.1.1 20150629 (GCC) ) #45 SMP Wed Nov 25 10:25:12 AEDT 2015

Signed-off-by: Rashmica Gupta <rashmicy@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/xmon/xmon.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 786bf01691c9..e8c7a937955e 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -1522,6 +1522,8 @@ static void excprint(struct pt_regs *fp)
 
 	if (trap == 0x700)
 		print_bug_trap(fp);
+
+	printf(linux_banner);
 }
 
 static void prregs(struct pt_regs *fp)

From 5f337e3e5b04b32793fd51adab438d46df99c933 Mon Sep 17 00:00:00 2001
From: Rashmica Gupta <rashmicy@gmail.com>
Date: Thu, 10 Dec 2015 20:49:33 +1100
Subject: [PATCH 077/149] selftests/powerpc: Add test to check if VSRs are
 corrupted

When a transaction is aborted, VSR values should rollback to the
checkpointed values before the transaction began. VSRs used elsewhere in
the kernel during a transaction, or while the transaction is suspended
should not affect the checkpointed values.

Prior to the bug fix in commit d31626f70b61 ("powerpc: Don't corrupt
transactional state when using FP/VMX in kernel") when VMX was requested
by the kernel the .vr_state (which held the checkpointed state of VSRs
before the transaction) was overwritten with the current state from
outside the transaction. Thus if the transaction did not complete, the
VSR values would be "rolled back" to potentially incorrect values.

Signed-off-by: Rashmica Gupta <rashmicy@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 tools/testing/selftests/powerpc/tm/.gitignore |   1 +
 tools/testing/selftests/powerpc/tm/Makefile   |   2 +-
 .../testing/selftests/powerpc/tm/tm-vmxcopy.c | 103 ++++++++++++++++++
 3 files changed, 105 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/powerpc/tm/tm-vmxcopy.c

diff --git a/tools/testing/selftests/powerpc/tm/.gitignore b/tools/testing/selftests/powerpc/tm/.gitignore
index e6668217ccd0..7d0f14b8cb2e 100644
--- a/tools/testing/selftests/powerpc/tm/.gitignore
+++ b/tools/testing/selftests/powerpc/tm/.gitignore
@@ -2,3 +2,4 @@ tm-resched-dscr
 tm-syscall
 tm-signal-msr-resv
 tm-signal-stack
+tm-vmxcopy
diff --git a/tools/testing/selftests/powerpc/tm/Makefile b/tools/testing/selftests/powerpc/tm/Makefile
index e7ceff809fa0..737f72c964e6 100644
--- a/tools/testing/selftests/powerpc/tm/Makefile
+++ b/tools/testing/selftests/powerpc/tm/Makefile
@@ -1,4 +1,4 @@
-TEST_PROGS := tm-resched-dscr tm-syscall tm-signal-msr-resv tm-signal-stack
+TEST_PROGS := tm-resched-dscr tm-syscall tm-signal-msr-resv tm-signal-stack tm-vmxcopy
 
 all: $(TEST_PROGS)
 
diff --git a/tools/testing/selftests/powerpc/tm/tm-vmxcopy.c b/tools/testing/selftests/powerpc/tm/tm-vmxcopy.c
new file mode 100644
index 000000000000..0274de7b11f3
--- /dev/null
+++ b/tools/testing/selftests/powerpc/tm/tm-vmxcopy.c
@@ -0,0 +1,103 @@
+/*
+ * Copyright 2015, Michael Neuling, IBM Corp.
+ * Licensed under GPLv2.
+ *
+ * Original: Michael Neuling 4/12/2013
+ * Edited: Rashmica Gupta 4/12/2015
+ *
+ * See if the altivec state is leaked out of an aborted transaction due to
+ * kernel vmx copy loops.
+ *
+ * When the transaction aborts, VSR values should rollback to the values
+ * they held before the transaction commenced. Using VSRs while transaction
+ * is suspended should not affect the checkpointed values.
+ *
+ * (1) write A to a VSR
+ * (2) start transaction
+ * (3) suspend transaction
+ * (4) change the VSR to B
+ * (5) trigger kernel vmx copy loop
+ * (6) abort transaction
+ * (7) check that the VSR value is A
+ */
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <string.h>
+#include <assert.h>
+
+#include "tm.h"
+#include "utils.h"
+
+int test_vmxcopy()
+{
+	long double vecin = 1.3;
+	long double vecout;
+	unsigned long pgsize = getpagesize();
+	int i;
+	int fd;
+	int size = pgsize*16;
+	char tmpfile[] = "/tmp/page_faultXXXXXX";
+	char buf[pgsize];
+	char *a;
+	uint64_t aborted = 0;
+
+	SKIP_IF(!have_htm());
+
+	fd = mkstemp(tmpfile);
+	assert(fd >= 0);
+
+	memset(buf, 0, pgsize);
+	for (i = 0; i < size; i += pgsize)
+		assert(write(fd, buf, pgsize) == pgsize);
+
+	unlink(tmpfile);
+
+	a = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
+	assert(a != MAP_FAILED);
+
+	asm __volatile__(
+		"lxvd2x 40,0,%[vecinptr];"	/* set 40 to initial value*/
+		"tbegin.;"
+		"beq	3f;"
+		"tsuspend.;"
+		"xxlxor 40,40,40;"		/* set 40 to 0 */
+		"std	5, 0(%[map]);"		/* cause kernel vmx copy page */
+		"tabort. 0;"
+		"tresume.;"
+		"tend.;"
+		"li	%[res], 0;"
+		"b	5f;"
+
+		/* Abort handler: reached via the beq when tbegin. fails */
+		"3:;"
+		"li	%[res], 1;"
+
+		"5:;"
+		"stxvd2x 40,0,%[vecoutptr];"
+		: [res]"=&r"(aborted)	/* '&': written before vecoutptr is read */
+		: [vecinptr]"r"(&vecin),
+		  [vecoutptr]"r"(&vecout),
+		  [map]"r"(a)
+		: "memory", "r0", "r3", "r4", "r5", "r6", "r7");
+
+	if (aborted && (vecin != vecout)){
+		printf("FAILED: vector state leaked on abort %f != %f\n",
+		       (double)vecin, (double)vecout);
+		return 1;
+	}
+
+	munmap(a, size);
+
+	close(fd);
+
+	return 0;
+}
+
+int main(void)
+{
+	return test_harness(test_vmxcopy, "tm_vmxcopy");
+}

From 00b912b0c88e690b1662067497182454357b18b0 Mon Sep 17 00:00:00 2001
From: Daniel Axtens <dja@axtens.net>
Date: Tue, 15 Dec 2015 18:09:14 +1100
Subject: [PATCH 078/149] powerpc: Remove broken GregorianDay()

GregorianDay() is supposed to calculate the day of the week
(tm->tm_wday) for a given day/month/year. In that calculation it
indexed into an array called MonthOffset using tm->tm_mon-1. However
tm_mon is zero-based, not one-based, so this is off-by-one. It also
means that every January, GregorianDay() will access element -1 of
the MonthOffset array.

It also doesn't appear to be a correct algorithm either: see in
contrast kernel/time/timeconv.c's time_to_tm function.

It's been broken forever, which suggests no-one in userland uses
this. It looks like no-one in the kernel uses tm->tm_wday either
(see e.g. drivers/rtc/rtc-ds1305.c:319).

tm->tm_wday is conventionally set to -1 when not available in
hardware so we can simply set it to -1 and drop the function.
(There are over a dozen other drivers in drivers/rtc that do
this.)

Found using UBSAN.

Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andrew Morton <akpm@linux-foundation.org> # as an example of what UBSan finds.
Cc: Alessandro Zummo <a.zummo@towertech.it>
Cc: Alexandre Belloni <alexandre.belloni@free-electrons.com>
Cc: rtc-linux@googlegroups.com
Signed-off-by: Daniel Axtens <dja@axtens.net>
Acked-by: Alexandre Belloni <alexandre.belloni@free-electrons.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/time.h           |  1 -
 arch/powerpc/kernel/time.c                | 36 ++---------------------
 arch/powerpc/platforms/maple/time.c       |  2 +-
 arch/powerpc/platforms/powernv/opal-rtc.c |  3 +-
 drivers/rtc/rtc-opal.c                    |  2 +-
 5 files changed, 5 insertions(+), 39 deletions(-)

diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 10fc784a2ad4..2d7109a8d296 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -27,7 +27,6 @@ extern struct clock_event_device decrementer_clockevent;
 
 struct rtc_time;
 extern void to_tm(int tim, struct rtc_time * tm);
-extern void GregorianDay(struct rtc_time *tm);
 extern void tick_broadcast_ipi_handler(void);
 
 extern void generic_calibrate_decr(void);
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 1be1092c7204..81b0900a39ee 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -1002,38 +1002,6 @@ static int month_days[12] = {
 	31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31
 };
 
-/*
- * This only works for the Gregorian calendar - i.e. after 1752 (in the UK)
- */
-void GregorianDay(struct rtc_time * tm)
-{
-	int leapsToDate;
-	int lastYear;
-	int day;
-	int MonthOffset[] = { 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334 };
-
-	lastYear = tm->tm_year - 1;
-
-	/*
-	 * Number of leap corrections to apply up to end of last year
-	 */
-	leapsToDate = lastYear / 4 - lastYear / 100 + lastYear / 400;
-
-	/*
-	 * This year is a leap year if it is divisible by 4 except when it is
-	 * divisible by 100 unless it is divisible by 400
-	 *
-	 * e.g. 1904 was a leap year, 1900 was not, 1996 is, and 2000 was
-	 */
-	day = tm->tm_mon > 2 && leapyear(tm->tm_year);
-
-	day += lastYear*365 + leapsToDate + MonthOffset[tm->tm_mon-1] +
-		   tm->tm_mday;
-
-	tm->tm_wday = day % 7;
-}
-EXPORT_SYMBOL_GPL(GregorianDay);
-
 void to_tm(int tim, struct rtc_time * tm)
 {
 	register int    i;
@@ -1064,9 +1032,9 @@ void to_tm(int tim, struct rtc_time * tm)
 	tm->tm_mday = day + 1;
 
 	/*
-	 * Determine the day of week
+	 * No-one uses the day of the week.
 	 */
-	GregorianDay(tm);
+	tm->tm_wday = -1;
 }
 EXPORT_SYMBOL(to_tm);
 
diff --git a/arch/powerpc/platforms/maple/time.c b/arch/powerpc/platforms/maple/time.c
index b4a369dac3a8..81799d70a1ee 100644
--- a/arch/powerpc/platforms/maple/time.c
+++ b/arch/powerpc/platforms/maple/time.c
@@ -77,7 +77,7 @@ void maple_get_rtc_time(struct rtc_time *tm)
 	if ((tm->tm_year + 1900) < 1970)
 		tm->tm_year += 100;
 
-	GregorianDay(tm);
+	tm->tm_wday = -1;
 }
 
 int maple_set_rtc_time(struct rtc_time *tm)
diff --git a/arch/powerpc/platforms/powernv/opal-rtc.c b/arch/powerpc/platforms/powernv/opal-rtc.c
index 37dbee15769f..1b149c92fca1 100644
--- a/arch/powerpc/platforms/powernv/opal-rtc.c
+++ b/arch/powerpc/platforms/powernv/opal-rtc.c
@@ -31,8 +31,7 @@ static void opal_to_tm(u32 y_m_d, u64 h_m_s_ms, struct rtc_time *tm)
 	tm->tm_hour	= bcd2bin((h_m_s_ms >> 56) & 0xff);
 	tm->tm_min	= bcd2bin((h_m_s_ms >> 48) & 0xff);
 	tm->tm_sec	= bcd2bin((h_m_s_ms >> 40) & 0xff);
-
-        GregorianDay(tm);
+	tm->tm_wday     = -1;
 }
 
 unsigned long __init opal_get_boot_time(void)
diff --git a/drivers/rtc/rtc-opal.c b/drivers/rtc/rtc-opal.c
index df39ce02a99d..9c18d6fd8107 100644
--- a/drivers/rtc/rtc-opal.c
+++ b/drivers/rtc/rtc-opal.c
@@ -40,7 +40,7 @@ static void opal_to_tm(u32 y_m_d, u64 h_m_s_ms, struct rtc_time *tm)
 	tm->tm_min  = bcd2bin((h_m_s_ms >> 48) & 0xff);
 	tm->tm_sec  = bcd2bin((h_m_s_ms >> 40) & 0xff);
 
-	GregorianDay(tm);
+	tm->tm_wday = -1;
 }
 
 static void tm_to_opal(struct rtc_time *tm, u32 *y_m_d, u64 *h_m_s_ms)

From d1301afd71bd38b1610b391e50debf766faa84be Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Wed, 16 Dec 2015 18:59:31 +1100
Subject: [PATCH 079/149] selftests/powerpc: Move pick_online_cpu() up into
 utils.c

We want to use this in another test, so make it available at the top of
the powerpc selftests tree.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 tools/testing/selftests/powerpc/pmu/Makefile |  2 +-
 tools/testing/selftests/powerpc/pmu/lib.c    | 26 ------------------
 tools/testing/selftests/powerpc/pmu/lib.h    |  1 -
 tools/testing/selftests/powerpc/utils.c      | 29 ++++++++++++++++++++
 tools/testing/selftests/powerpc/utils.h      |  1 +
 5 files changed, 31 insertions(+), 28 deletions(-)

diff --git a/tools/testing/selftests/powerpc/pmu/Makefile b/tools/testing/selftests/powerpc/pmu/Makefile
index 50326cbb372d..ac41a7177f2e 100644
--- a/tools/testing/selftests/powerpc/pmu/Makefile
+++ b/tools/testing/selftests/powerpc/pmu/Makefile
@@ -2,7 +2,7 @@ noarg:
 	$(MAKE) -C ../
 
 TEST_PROGS := count_instructions l3_bank_test per_event_excludes
-EXTRA_SOURCES := ../harness.c event.c lib.c
+EXTRA_SOURCES := ../harness.c event.c lib.c ../utils.c
 
 all: $(TEST_PROGS) ebb
 
diff --git a/tools/testing/selftests/powerpc/pmu/lib.c b/tools/testing/selftests/powerpc/pmu/lib.c
index a07104c2afe6..a361ad3334ce 100644
--- a/tools/testing/selftests/powerpc/pmu/lib.c
+++ b/tools/testing/selftests/powerpc/pmu/lib.c
@@ -15,32 +15,6 @@
 #include "lib.h"
 
 
-int pick_online_cpu(void)
-{
-	cpu_set_t mask;
-	int cpu;
-
-	CPU_ZERO(&mask);
-
-	if (sched_getaffinity(0, sizeof(mask), &mask)) {
-		perror("sched_getaffinity");
-		return -1;
-	}
-
-	/* We prefer a primary thread, but skip 0 */
-	for (cpu = 8; cpu < CPU_SETSIZE; cpu += 8)
-		if (CPU_ISSET(cpu, &mask))
-			return cpu;
-
-	/* Search for anything, but in reverse */
-	for (cpu = CPU_SETSIZE - 1; cpu >= 0; cpu--)
-		if (CPU_ISSET(cpu, &mask))
-			return cpu;
-
-	printf("No cpus in affinity mask?!\n");
-	return -1;
-}
-
 int bind_to_cpu(int cpu)
 {
 	cpu_set_t mask;
diff --git a/tools/testing/selftests/powerpc/pmu/lib.h b/tools/testing/selftests/powerpc/pmu/lib.h
index ca5d72ae3be6..0213af4ff332 100644
--- a/tools/testing/selftests/powerpc/pmu/lib.h
+++ b/tools/testing/selftests/powerpc/pmu/lib.h
@@ -19,7 +19,6 @@ union pipe {
 	int fds[2];
 };
 
-extern int pick_online_cpu(void);
 extern int bind_to_cpu(int cpu);
 extern int kill_child_and_wait(pid_t child_pid);
 extern int wait_for_child(pid_t child_pid);
diff --git a/tools/testing/selftests/powerpc/utils.c b/tools/testing/selftests/powerpc/utils.c
index 536113add380..dcf74184bfd0 100644
--- a/tools/testing/selftests/powerpc/utils.c
+++ b/tools/testing/selftests/powerpc/utils.c
@@ -3,10 +3,13 @@
  * Licensed under GPLv2.
  */
 
+#define _GNU_SOURCE	/* For CPU_ZERO etc. */
+
 #include <elf.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <link.h>
+#include <sched.h>
 #include <stdio.h>
 #include <sys/stat.h>
 #include <sys/types.h>
@@ -56,3 +59,29 @@ out:
 	close(fd);
 	return result;
 }
+
+int pick_online_cpu(void)
+{
+	cpu_set_t mask;
+	int cpu;
+
+	CPU_ZERO(&mask);
+
+	if (sched_getaffinity(0, sizeof(mask), &mask)) {
+		perror("sched_getaffinity");
+		return -1;
+	}
+
+	/* We prefer a primary thread, but skip 0 */
+	for (cpu = 8; cpu < CPU_SETSIZE; cpu += 8)
+		if (CPU_ISSET(cpu, &mask))
+			return cpu;
+
+	/* Search for anything, but in reverse */
+	for (cpu = CPU_SETSIZE - 1; cpu >= 0; cpu--)
+		if (CPU_ISSET(cpu, &mask))
+			return cpu;
+
+	printf("No cpus in affinity mask?!\n");
+	return -1;
+}
diff --git a/tools/testing/selftests/powerpc/utils.h b/tools/testing/selftests/powerpc/utils.h
index fbf2bf530e50..175ac6ad10dd 100644
--- a/tools/testing/selftests/powerpc/utils.h
+++ b/tools/testing/selftests/powerpc/utils.h
@@ -22,6 +22,7 @@ typedef uint8_t u8;
 
 int test_harness(int (test_function)(void), char *name);
 extern void *get_auxv_entry(int type);
+int pick_online_cpu(void);
 
 static inline bool have_hwcap2(unsigned long ftr2)
 {

From 00b7ec5c9cf338902faea2e40801573a384e45be Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Wed, 2 Dec 2015 20:44:09 +1100
Subject: [PATCH 080/149] selftests/powerpc: Import Anton's context_switch2
 benchmark

This gets referred to a lot in commit messages, so let's pull it into
the selftests.

Almost vanilla from: http://ozlabs.org/~anton/junkcode/context_switch2.c

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Acked-by: Anton Blanchard <anton@samba.org>
---
 .../selftests/powerpc/benchmarks/.gitignore   |   1 +
 .../selftests/powerpc/benchmarks/Makefile     |   4 +-
 .../powerpc/benchmarks/context_switch.c       | 452 ++++++++++++++++++
 3 files changed, 456 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/powerpc/benchmarks/context_switch.c

diff --git a/tools/testing/selftests/powerpc/benchmarks/.gitignore b/tools/testing/selftests/powerpc/benchmarks/.gitignore
index b4709ea588c1..6fa673316ac2 100644
--- a/tools/testing/selftests/powerpc/benchmarks/.gitignore
+++ b/tools/testing/selftests/powerpc/benchmarks/.gitignore
@@ -1 +1,2 @@
 gettimeofday
+context_switch
diff --git a/tools/testing/selftests/powerpc/benchmarks/Makefile b/tools/testing/selftests/powerpc/benchmarks/Makefile
index 5fa48702070d..8cb7415c55aa 100644
--- a/tools/testing/selftests/powerpc/benchmarks/Makefile
+++ b/tools/testing/selftests/powerpc/benchmarks/Makefile
@@ -1,4 +1,4 @@
-TEST_PROGS := gettimeofday
+TEST_PROGS := gettimeofday context_switch
 
 CFLAGS += -O2
 
@@ -6,6 +6,8 @@ all: $(TEST_PROGS)
 
 $(TEST_PROGS): ../harness.c
 
+context_switch: LDLIBS += -lpthread
+
 include ../../lib.mk
 
 clean:
diff --git a/tools/testing/selftests/powerpc/benchmarks/context_switch.c b/tools/testing/selftests/powerpc/benchmarks/context_switch.c
new file mode 100644
index 000000000000..ed21a83a0f99
--- /dev/null
+++ b/tools/testing/selftests/powerpc/benchmarks/context_switch.c
@@ -0,0 +1,452 @@
+/*
+ * Context switch microbenchmark.
+ *
+ * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define _GNU_SOURCE
+#include <sched.h>
+#include <string.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <getopt.h>
+#include <signal.h>
+#include <assert.h>
+#include <pthread.h>
+#include <limits.h>
+#include <sys/time.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/shm.h>
+#include <linux/futex.h>
+
+static unsigned int timeout = INT_MAX;
+
+static int touch_vdso;
+struct timeval tv;
+
+static int touch_fp;
+double fp;
+
+static int touch_vector;
+typedef int v4si __attribute__ ((vector_size (16)));
+v4si a, b, c;
+
+#ifdef __powerpc__
+static int touch_altivec;
+
+static void __attribute__((__target__("no-vsx"))) altivec_touch_fn(void)
+{
+	c = a + b;
+}
+#endif
+
+static void touch(void)
+{
+	if (touch_vdso)
+		gettimeofday(&tv, NULL);
+
+	if (touch_fp)
+		fp += 0.1;
+
+#ifdef __powerpc__
+	if (touch_altivec)
+		altivec_touch_fn();
+#endif
+
+	if (touch_vector)
+		c = a + b;
+
+	asm volatile("# %0 %1 %2": : "r"(&tv), "r"(&fp), "r"(&c));
+}
+
+static void start_thread_on(void *(*fn)(void *), void *arg, unsigned long cpu)
+{
+	pthread_t tid;
+	cpu_set_t cpuset;
+	pthread_attr_t attr;
+
+	CPU_ZERO(&cpuset);
+	CPU_SET(cpu, &cpuset);
+
+	pthread_attr_init(&attr);
+
+	if (pthread_attr_setaffinity_np(&attr, sizeof(cpu_set_t), &cpuset)) {
+		perror("pthread_attr_setaffinity_np");
+		exit(1);
+	}
+
+	if (pthread_create(&tid, &attr, fn, arg)) {
+		perror("pthread_create");
+		exit(1);
+	}
+}
+
+static void start_process_on(void *(*fn)(void *), void *arg, unsigned long cpu)
+{
+	int pid;
+	cpu_set_t cpuset;
+
+	pid = fork();
+	if (pid == -1) {
+		perror("fork");
+		exit(1);
+	}
+
+	if (pid)
+		return;
+
+	CPU_ZERO(&cpuset);
+	CPU_SET(cpu, &cpuset);
+
+	if (sched_setaffinity(0, sizeof(cpuset), &cpuset)) {
+		perror("sched_setaffinity");
+		exit(1);
+	}
+
+	fn(arg);
+
+	exit(0);
+}
+
+static unsigned long iterations;
+static unsigned long iterations_prev;
+
+static void sigalrm_handler(int junk)
+{
+	unsigned long i = iterations;
+
+	printf("%ld\n", i - iterations_prev);
+	iterations_prev = i;
+
+	if (--timeout == 0)
+		kill(0, SIGUSR1);
+
+	alarm(1);
+}
+
+static void sigusr1_handler(int junk)
+{
+	exit(0);
+}
+
+struct actions {
+	void (*setup)(int, int);
+	void *(*thread1)(void *);
+	void *(*thread2)(void *);
+};
+
+#define READ 0
+#define WRITE 1
+
+static int pipe_fd1[2];
+static int pipe_fd2[2];
+
+static void pipe_setup(int cpu1, int cpu2)
+{
+	if (pipe(pipe_fd1) || pipe(pipe_fd2))
+		exit(1);
+}
+
+static void *pipe_thread1(void *arg)
+{
+	signal(SIGALRM, sigalrm_handler);
+	alarm(1);
+
+	while (1) {
+		assert(read(pipe_fd1[READ], &c, 1) == 1);
+		touch();
+
+		assert(write(pipe_fd2[WRITE], &c, 1) == 1);
+		touch();
+
+		iterations += 2;
+	}
+
+	return NULL;
+}
+
+static void *pipe_thread2(void *arg)
+{
+	while (1) {
+		assert(write(pipe_fd1[WRITE], &c, 1) == 1);
+		touch();
+
+		assert(read(pipe_fd2[READ], &c, 1) == 1);
+		touch();
+	}
+
+	return NULL;
+}
+
+static struct actions pipe_actions = {
+	.setup = pipe_setup,
+	.thread1 = pipe_thread1,
+	.thread2 = pipe_thread2,
+};
+
+static void yield_setup(int cpu1, int cpu2)
+{
+	if (cpu1 != cpu2) {
+		fprintf(stderr, "Both threads must be on the same CPU for yield test\n");
+		exit(1);
+	}
+}
+
+static void *yield_thread1(void *arg)
+{
+	signal(SIGALRM, sigalrm_handler);
+	alarm(1);
+
+	while (1) {
+		sched_yield();
+		touch();
+
+		iterations += 2;
+	}
+
+	return NULL;
+}
+
+static void *yield_thread2(void *arg)
+{
+	while (1) {
+		sched_yield();
+		touch();
+	}
+
+	return NULL;
+}
+
+static struct actions yield_actions = {
+	.setup = yield_setup,
+	.thread1 = yield_thread1,
+	.thread2 = yield_thread2,
+};
+
+static long sys_futex(void *addr1, int op, int val1, struct timespec *timeout,
+		      void *addr2, int val3)
+{
+	return syscall(SYS_futex, addr1, op, val1, timeout, addr2, val3);
+}
+
+static unsigned long cmpxchg(unsigned long *p, unsigned long expected,
+			     unsigned long desired)
+{
+	unsigned long exp = expected;
+
+	__atomic_compare_exchange_n(p, &exp, desired, 0,
+				    __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
+	return exp;
+}
+
+static unsigned long xchg(unsigned long *p, unsigned long val)
+{
+	return __atomic_exchange_n(p, val, __ATOMIC_SEQ_CST);
+}
+
+static int mutex_lock(unsigned long *m)
+{
+	int c;
+
+	c = cmpxchg(m, 0, 1);
+	if (!c)
+		return 0;
+
+	if (c == 1)
+		c = xchg(m, 2);
+
+	while (c) {
+		sys_futex(m, FUTEX_WAIT, 2, NULL, NULL, 0);
+		c = xchg(m, 2);
+	}
+
+	return 0;
+}
+
+static int mutex_unlock(unsigned long *m)
+{
+	if (*m == 2)
+		*m = 0;
+	else if (xchg(m, 0) == 1)
+		return 0;
+
+	sys_futex(m, FUTEX_WAKE, 1, NULL, NULL, 0);
+
+	return 0;
+}
+
+static unsigned long *m1, *m2;
+
+static void futex_setup(int cpu1, int cpu2)
+{
+	int shmid;
+	void *shmaddr;
+
+	shmid = shmget(IPC_PRIVATE, getpagesize(), SHM_R | SHM_W);
+	if (shmid < 0) {
+		perror("shmget");
+		exit(1);
+	}
+
+	shmaddr = shmat(shmid, NULL, 0);
+	if (shmaddr == (char *)-1) {
+		perror("shmat");
+		shmctl(shmid, IPC_RMID, NULL);
+		exit(1);
+	}
+
+	shmctl(shmid, IPC_RMID, NULL);
+
+	m1 = shmaddr;
+	m2 = shmaddr + sizeof(*m1);
+
+	*m1 = 0;
+	*m2 = 0;
+
+	mutex_lock(m1);
+	mutex_lock(m2);
+}
+
+static void *futex_thread1(void *arg)
+{
+	signal(SIGALRM, sigalrm_handler);
+	alarm(1);
+
+	while (1) {
+		mutex_lock(m2);
+		mutex_unlock(m1);
+
+		iterations += 2;
+	}
+
+	return NULL;
+}
+
+static void *futex_thread2(void *arg)
+{
+	while (1) {
+		mutex_unlock(m2);
+		mutex_lock(m1);
+	}
+
+	return NULL;
+}
+
+static struct actions futex_actions = {
+	.setup = futex_setup,
+	.thread1 = futex_thread1,
+	.thread2 = futex_thread2,
+};
+
+static int processes;
+
+static struct option options[] = {
+	{ "test", required_argument, 0, 't' },
+	{ "process", no_argument, &processes, 1 },
+	{ "timeout", required_argument, 0, 's' },
+	{ "vdso", no_argument, &touch_vdso, 1 },
+	{ "fp", no_argument, &touch_fp, 1 },
+#ifdef __powerpc__
+	{ "altivec", no_argument, &touch_altivec, 1 },
+#endif
+	{ "vector", no_argument, &touch_vector, 1 },
+	{ 0, },
+};
+
+static void usage(void)
+{
+	fprintf(stderr, "Usage: context_switch2 <options> CPU1 CPU2\n\n");
+	fprintf(stderr, "\t\t--test=X\tpipe, futex or yield\n");
+	fprintf(stderr, "\t\t--process\tUse processes (default threads)\n");
+	fprintf(stderr, "\t\t--timeout=X\tDuration in seconds to run\n");
+	fprintf(stderr, "\t\t--vdso\t\ttouch VDSO\n");
+	fprintf(stderr, "\t\t--fp\t\ttouch FP\n");
+#ifdef __powerpc__
+	fprintf(stderr, "\t\t--altivec\ttouch altivec\n");
+#endif
+	fprintf(stderr, "\t\t--vector\ttouch vector\n");
+}
+
+int main(int argc, char *argv[])
+{
+	signed char c;
+	struct actions *actions = &pipe_actions;
+	int cpu1;
+	int cpu2;
+	static void (*start_fn)(void *(*fn)(void *), void *arg, unsigned long cpu);
+
+	while (1) {
+		int option_index = 0;
+
+		c = getopt_long(argc, argv, "", options, &option_index);
+
+		if (c == -1)
+			break;
+
+		switch (c) {
+		case 0:
+			if (options[option_index].flag != 0)
+				break;
+
+			usage();
+			exit(1);
+			break;
+
+		case 't':
+			if (!strcmp(optarg, "pipe")) {
+				actions = &pipe_actions;
+			} else if (!strcmp(optarg, "yield")) {
+				actions = &yield_actions;
+			} else if (!strcmp(optarg, "futex")) {
+				actions = &futex_actions;
+			} else {
+				usage();
+				exit(1);
+			}
+			break;
+
+		case 's':
+			timeout = atoi(optarg);
+			break;
+
+		default:
+			usage();
+			exit(1);
+		}
+	}
+
+	if (processes)
+		start_fn = start_process_on;
+	else
+		start_fn = start_thread_on;
+
+	if (((argc - optind) != 2)) {
+		usage();
+		exit(1);
+	}
+
+	/* Create a new process group so we can signal everyone for exit */
+	setpgid(getpid(), getpid());
+
+	signal(SIGUSR1, sigusr1_handler);
+
+	cpu1 = atoi(argv[optind++]);
+	cpu2 = atoi(argv[optind++]);
+
+	actions->setup(cpu1, cpu2);
+
+	start_fn(actions->thread1, NULL, cpu1);
+	start_fn(actions->thread2, NULL, cpu2);
+
+	while (1)
+		sleep(3600);
+
+	return 0;
+}

From ea0c321784565c681507e02acf900deaa1e9e952 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Wed, 2 Dec 2015 20:44:10 +1100
Subject: [PATCH 081/149] selftests/powerpc: Make context_switch do something
 with no args

For ease of use make the context_switch test do something useful when
called with no arguments.

Default to a 30 second run, using threads, doing yield, and use any
online cpu. Make it print out what it's doing to avoid confusion.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Acked-by: Anton Blanchard <anton@samba.org>
---
 .../selftests/powerpc/benchmarks/Makefile     |  1 +
 .../powerpc/benchmarks/context_switch.c       | 32 +++++++++++++------
 2 files changed, 24 insertions(+), 9 deletions(-)

diff --git a/tools/testing/selftests/powerpc/benchmarks/Makefile b/tools/testing/selftests/powerpc/benchmarks/Makefile
index 8cb7415c55aa..912445ff7ce7 100644
--- a/tools/testing/selftests/powerpc/benchmarks/Makefile
+++ b/tools/testing/selftests/powerpc/benchmarks/Makefile
@@ -6,6 +6,7 @@ all: $(TEST_PROGS)
 
 $(TEST_PROGS): ../harness.c
 
+context_switch: ../utils.c
 context_switch: LDLIBS += -lpthread
 
 include ../../lib.mk
diff --git a/tools/testing/selftests/powerpc/benchmarks/context_switch.c b/tools/testing/selftests/powerpc/benchmarks/context_switch.c
index ed21a83a0f99..d8b6d10f36a6 100644
--- a/tools/testing/selftests/powerpc/benchmarks/context_switch.c
+++ b/tools/testing/selftests/powerpc/benchmarks/context_switch.c
@@ -26,7 +26,9 @@
 #include <sys/shm.h>
 #include <linux/futex.h>
 
-static unsigned int timeout = INT_MAX;
+#include "../utils.h"
+
+static unsigned int timeout = 30;
 
 static int touch_vdso;
 struct timeval tv;
@@ -363,9 +365,9 @@ static struct option options[] = {
 static void usage(void)
 {
 	fprintf(stderr, "Usage: context_switch2 <options> CPU1 CPU2\n\n");
-	fprintf(stderr, "\t\t--test=X\tpipe, futex or yield\n");
+	fprintf(stderr, "\t\t--test=X\tpipe, futex or yield (default)\n");
 	fprintf(stderr, "\t\t--process\tUse processes (default threads)\n");
-	fprintf(stderr, "\t\t--timeout=X\tDuration in seconds to run\n");
+	fprintf(stderr, "\t\t--timeout=X\tDuration in seconds to run (default 30)\n");
 	fprintf(stderr, "\t\t--vdso\t\ttouch VDSO\n");
 	fprintf(stderr, "\t\t--fp\t\ttouch FP\n");
 #ifdef __powerpc__
@@ -377,7 +379,7 @@ static void usage(void)
 int main(int argc, char *argv[])
 {
 	signed char c;
-	struct actions *actions = &pipe_actions;
+	struct actions *actions = &yield_actions;
 	int cpu1;
 	int cpu2;
 	static void (*start_fn)(void *(*fn)(void *), void *arg, unsigned long cpu);
@@ -428,18 +430,30 @@ int main(int argc, char *argv[])
 		start_fn = start_thread_on;
 
 	if (((argc - optind) != 2)) {
-		usage();
-		exit(1);
+		cpu1 = cpu2 = pick_online_cpu();
+	} else {
+		cpu1 = atoi(argv[optind++]);
+		cpu2 = atoi(argv[optind++]);
 	}
 
+	printf("Using %s with ", processes ? "processes" : "threads");
+
+	if (actions == &pipe_actions)
+		printf("pipe");
+	else if (actions == &yield_actions)
+		printf("yield");
+	else
+		printf("futex");
+
+	printf(" on cpus %d/%d touching FP:%s altivec:%s vector:%s vdso:%s\n",
+	       cpu1, cpu2, touch_fp ?  "yes" : "no", touch_altivec ? "yes" : "no",
+	       touch_vector ? "yes" : "no", touch_vdso ? "yes" : "no");
+
 	/* Create a new process group so we can signal everyone for exit */
 	setpgid(getpid(), getpid());
 
 	signal(SIGUSR1, sigusr1_handler);
 
-	cpu1 = atoi(argv[optind++]);
-	cpu2 = atoi(argv[optind++]);
-
 	actions->setup(cpu1, cpu2);
 
 	start_fn(actions->thread1, NULL, cpu1);

From 51c21e72eb99d1136614135d633baae269893778 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Wed, 2 Dec 2015 20:44:11 +1100
Subject: [PATCH 082/149] selftests/powerpc: Make context_switch touch
 FP/altivec/vector by default

Simply because it touches more code paths that way, and therefore tests
more things.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Acked-by: Anton Blanchard <anton@samba.org>
---
 .../selftests/powerpc/benchmarks/context_switch.c    | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tools/testing/selftests/powerpc/benchmarks/context_switch.c b/tools/testing/selftests/powerpc/benchmarks/context_switch.c
index d8b6d10f36a6..7b785941adec 100644
--- a/tools/testing/selftests/powerpc/benchmarks/context_switch.c
+++ b/tools/testing/selftests/powerpc/benchmarks/context_switch.c
@@ -33,15 +33,15 @@ static unsigned int timeout = 30;
 static int touch_vdso;
 struct timeval tv;
 
-static int touch_fp;
+static int touch_fp = 1;
 double fp;
 
-static int touch_vector;
+static int touch_vector = 1;
 typedef int v4si __attribute__ ((vector_size (16)));
 v4si a, b, c;
 
 #ifdef __powerpc__
-static int touch_altivec;
+static int touch_altivec = 1;
 
 static void __attribute__((__target__("no-vsx"))) altivec_touch_fn(void)
 {
@@ -354,11 +354,11 @@ static struct option options[] = {
 	{ "process", no_argument, &processes, 1 },
 	{ "timeout", required_argument, 0, 's' },
 	{ "vdso", no_argument, &touch_vdso, 1 },
-	{ "fp", no_argument, &touch_fp, 1 },
+	{ "no-fp", no_argument, &touch_fp, 0 },
 #ifdef __powerpc__
-	{ "altivec", no_argument, &touch_altivec, 1 },
+	{ "no-altivec", no_argument, &touch_altivec, 0 },
 #endif
-	{ "vector", no_argument, &touch_vector, 1 },
+	{ "no-vector", no_argument, &touch_vector, 0 },
 	{ 0, },
 };
 

From a8da474ec18f4c4c39f83202d64d73a23b755c1d Mon Sep 17 00:00:00 2001
From: Daniel Axtens <dja@axtens.net>
Date: Mon, 7 Dec 2015 10:50:51 +1100
Subject: [PATCH 083/149] selftests/powerpc: Add script to test HMI
 functionality

HMIs (Hypervisor Management|Maintenance Interrupts) are a class of interrupt
on POWER systems.

HMI support has traditionally been exceptionally difficult to test, however
Skiboot ships a tool that, with the correct magic numbers, will inject them.

This, therefore, is a first pass at a script to inject HMIs and monitor
Linux's response. It injects an HMI on each core on every chip in turn.
It then watches dmesg to see if it's acknowledged by Linux.

On a Tuletta, I observed that we see 8 (or sometimes 9 or more) events per
injection, regardless of SMT setting, so we wait for 8 before progressing.

It sits in a new scripts/ directory in selftests/powerpc, because it's not
designed to be run as part of the regular make selftests process. In
particular, it is quite possibly going to end up garding lots of your CPUs,
so it should only be run if you know how to undo that.

CC: Mahesh J Salgaonkar <mahesh.salgaonkar@in.ibm.com>
Signed-off-by: Daniel Axtens <dja@axtens.net>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 .../testing/selftests/powerpc/scripts/hmi.sh  | 89 +++++++++++++++++++
 1 file changed, 89 insertions(+)
 create mode 100755 tools/testing/selftests/powerpc/scripts/hmi.sh

diff --git a/tools/testing/selftests/powerpc/scripts/hmi.sh b/tools/testing/selftests/powerpc/scripts/hmi.sh
new file mode 100755
index 000000000000..83fb253ae3bd
--- /dev/null
+++ b/tools/testing/selftests/powerpc/scripts/hmi.sh
@@ -0,0 +1,89 @@
+#!/bin/sh
+#
+# Copyright 2015, Daniel Axtens, IBM Corporation
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation; version 2 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+
+# do we have ./getscom, ./putscom?
+if [ -x ./getscom ] && [ -x ./putscom ]; then
+	GETSCOM=./getscom
+	PUTSCOM=./putscom
+elif which getscom > /dev/null; then
+	GETSCOM=$(which getscom)
+	PUTSCOM=$(which putscom)
+else
+	cat <<EOF
+Can't find getscom/putscom in . or \$PATH.
+See https://github.com/open-power/skiboot.
+The tool is in external/xscom-utils
+EOF
+	exit 1
+fi
+
+# We will get 8 HMI events per injection
+# todo: deal with things being offline
+expected_hmis=8
+COUNT_HMIS() {
+    dmesg | grep -c 'Harmless Hypervisor Maintenance interrupt'
+}
+
+# massively expand snooze delay, allowing injection on all cores
+ppc64_cpu --smt-snooze-delay=1000000000
+
+# when we exit, restore it
+trap "ppc64_cpu --smt-snooze-delay=100" 0 1
+
+# for each chip+core combination
+# todo - less fragile parsing
+egrep -o 'OCC: Chip [0-9a-f]+ Core [0-9a-f]' < /sys/firmware/opal/msglog |
+while read chipcore; do
+	chip=$(echo "$chipcore"|awk '{print $3}')
+	core=$(echo "$chipcore"|awk '{print $5}')
+	fir="0x1${core}013100"
+
+	# verify that Core FIR is zero as expected
+	if [ "$($GETSCOM -c 0x${chip} $fir)" != 0 ]; then
+		echo "FIR was not zero before injection for chip $chip, core $core. Aborting!"
+		echo "Result of $GETSCOM -c 0x${chip} $fir:"
+		$GETSCOM -c 0x${chip} $fir
+		echo "If you get a -5 error, the core may be in idle state. Try stress-ng."
+		echo "Otherwise, try $PUTSCOM -c 0x${chip} $fir 0"
+		exit 1
+	fi
+
+	# keep track of the number of HMIs handled
+	old_hmis=$(COUNT_HMIS)
+
+	# do injection, adding a marker to dmesg for clarity
+	echo "Injecting HMI on core $core, chip $chip" | tee /dev/kmsg
+	# inject a RegFile recoverable error
+	if ! $PUTSCOM -c 0x${chip} $fir 2000000000000000 > /dev/null; then
+		echo "Error injecting. Aborting!"
+		exit 1
+	fi
+
+	# now we want to wait for all the HMIs to be processed
+	# we expect one per thread on the core
+	i=0;
+	new_hmis=$(COUNT_HMIS)
+	while [ $new_hmis -lt $((old_hmis + expected_hmis)) ] && [ $i -lt 12 ]; do
+	    echo "Seen $((new_hmis - old_hmis)) HMI(s) out of $expected_hmis expected, sleeping"
+	    sleep 5;
+	    i=$((i + 1))
+	    new_hmis=$(COUNT_HMIS)
+	done
+	if [ $i = 12 ]; then
+	    echo "Haven't seen expected $expected_hmis recoveries after 1 min. Aborting."
+	    exit 1
+	fi
+	echo "Processed $expected_hmis events; presumed success. Check dmesg."
+	echo ""
+done

From 786842b62f81f20d14894925e8c225328ee8144b Mon Sep 17 00:00:00 2001
From: Stewart Smith <stewart@linux.vnet.ibm.com>
Date: Wed, 9 Dec 2015 17:18:18 +1100
Subject: [PATCH 084/149] powerpc/powernv: panic() on OPAL < V3

The OpenPower Abstraction Layer firmware went through a couple
of iterations in the lab before being released. What we now know
as OPAL advertises itself as OPALv3.

OPALv2 and OPALv1 never made it outside the lab, and the possibility
of anyone at all ever building a mainline kernel today and expecting
it to boot on such hardware is zero.

Signed-off-by: Stewart Smith <stewart@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/opal.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index 4296d55e88f3..faea1abaa785 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -103,11 +103,8 @@ int __init early_init_dt_scan_opal(unsigned long node,
 		powerpc_firmware_features |= FW_FEATURE_OPALv2;
 		powerpc_firmware_features |= FW_FEATURE_OPALv3;
 		pr_info("OPAL V3 detected !\n");
-	} else if (of_flat_dt_is_compatible(node, "ibm,opal-v2")) {
-		powerpc_firmware_features |= FW_FEATURE_OPALv2;
-		pr_info("OPAL V2 detected !\n");
 	} else {
-		pr_info("OPAL V1 detected !\n");
+		panic("OPAL != V3 detected, no longer supported.\n");
 	}
 
 	/* Reinit all cores with the right endian */

From 7261aafc095763b119136a562540dea7b1ccf657 Mon Sep 17 00:00:00 2001
From: Stewart Smith <stewart@linux.vnet.ibm.com>
Date: Wed, 9 Dec 2015 17:18:19 +1100
Subject: [PATCH 085/149] powerpc/powernv: Remove OPALv2 firmware define and
 references

OPALv2 only ever existed in the lab and didn't escape to the world.
All OPAL systems in the wild are OPALv3.

The probability of there being an OPALv2 system still powered on
anywhere inside IBM is approximately zero, let alone anyone
expecting to run mainline kernels.

So, start to remove references to OPALv2.

Signed-off-by: Stewart Smith <stewart@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/firmware.h    | 4 +---
 arch/powerpc/platforms/powernv/opal.c  | 8 ++------
 arch/powerpc/platforms/powernv/setup.c | 4 ----
 arch/powerpc/platforms/powernv/smp.c   | 4 ++--
 4 files changed, 5 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/include/asm/firmware.h b/arch/powerpc/include/asm/firmware.h
index e05808a328db..50af5e5ea86f 100644
--- a/arch/powerpc/include/asm/firmware.h
+++ b/arch/powerpc/include/asm/firmware.h
@@ -47,7 +47,6 @@
 #define FW_FEATURE_VPHN		ASM_CONST(0x0000000004000000)
 #define FW_FEATURE_XCMO		ASM_CONST(0x0000000008000000)
 #define FW_FEATURE_OPAL		ASM_CONST(0x0000000010000000)
-#define FW_FEATURE_OPALv2	ASM_CONST(0x0000000020000000)
 #define FW_FEATURE_SET_MODE	ASM_CONST(0x0000000040000000)
 #define FW_FEATURE_BEST_ENERGY	ASM_CONST(0x0000000080000000)
 #define FW_FEATURE_TYPE1_AFFINITY ASM_CONST(0x0000000100000000)
@@ -70,8 +69,7 @@ enum {
 		FW_FEATURE_SET_MODE | FW_FEATURE_BEST_ENERGY |
 		FW_FEATURE_TYPE1_AFFINITY | FW_FEATURE_PRRN,
 	FW_FEATURE_PSERIES_ALWAYS = 0,
-	FW_FEATURE_POWERNV_POSSIBLE = FW_FEATURE_OPAL | FW_FEATURE_OPALv2 |
-		FW_FEATURE_OPALv3,
+	FW_FEATURE_POWERNV_POSSIBLE = FW_FEATURE_OPAL | FW_FEATURE_OPALv3,
 	FW_FEATURE_POWERNV_ALWAYS = 0,
 	FW_FEATURE_PS3_POSSIBLE = FW_FEATURE_LPAR | FW_FEATURE_PS3_LV1,
 	FW_FEATURE_PS3_ALWAYS = FW_FEATURE_LPAR | FW_FEATURE_PS3_LV1,
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index faea1abaa785..5ce51d9b4ca6 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -100,7 +100,6 @@ int __init early_init_dt_scan_opal(unsigned long node,
 
 	powerpc_firmware_features |= FW_FEATURE_OPAL;
 	if (of_flat_dt_is_compatible(node, "ibm,opal-v3")) {
-		powerpc_firmware_features |= FW_FEATURE_OPALv2;
 		powerpc_firmware_features |= FW_FEATURE_OPALv3;
 		pr_info("OPAL V3 detected !\n");
 	} else {
@@ -349,7 +348,7 @@ int opal_put_chars(uint32_t vtermno, const char *data, int total_len)
 	 * enough room and be done with it
 	 */
 	spin_lock_irqsave(&opal_write_lock, flags);
-	if (firmware_has_feature(FW_FEATURE_OPALv2)) {
+	if (firmware_has_feature(FW_FEATURE_OPALv3)) {
 		rc = opal_console_write_buffer_space(vtermno, &olen);
 		len = be64_to_cpu(olen);
 		if (rc || len < total_len) {
@@ -693,10 +692,7 @@ static int __init opal_init(void)
 	}
 
 	/* Register OPAL consoles if any ports */
-	if (firmware_has_feature(FW_FEATURE_OPALv2))
-		consoles = of_find_node_by_path("/ibm,opal/consoles");
-	else
-		consoles = of_node_get(opal_node);
+	consoles = of_find_node_by_path("/ibm,opal/consoles");
 	if (consoles) {
 		for_each_child_of_node(consoles, np) {
 			if (strcmp(np->name, "serial"))
diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index a9a8fa37a555..54583fc417be 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -92,10 +92,6 @@ static void pnv_show_cpuinfo(struct seq_file *m)
 	seq_printf(m, "machine\t\t: PowerNV %s\n", model);
 	if (firmware_has_feature(FW_FEATURE_OPALv3))
 		seq_printf(m, "firmware\t: OPAL v3\n");
-	else if (firmware_has_feature(FW_FEATURE_OPALv2))
-		seq_printf(m, "firmware\t: OPAL v2\n");
-	else if (firmware_has_feature(FW_FEATURE_OPAL))
-		seq_printf(m, "firmware\t: OPAL v1\n");
 	else
 		seq_printf(m, "firmware\t: BML\n");
 	of_node_put(root);
diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index ca264833ee64..9b968a315103 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -65,10 +65,10 @@ static int pnv_smp_kick_cpu(int nr)
 	BUG_ON(nr < 0 || nr >= NR_CPUS);
 
 	/*
-	 * If we already started or OPALv2 is not supported, we just
+	 * If we already started or OPALv3 is not supported, we just
 	 * kick the CPU via the PACA
 	 */
-	if (paca[nr].cpu_start || !firmware_has_feature(FW_FEATURE_OPALv2))
+	if (paca[nr].cpu_start || !firmware_has_feature(FW_FEATURE_OPALv3))
 		goto kick;
 
 	/*

From e4d54f71d29997344b4c4c8d47708240f9f23a5c Mon Sep 17 00:00:00 2001
From: Stewart Smith <stewart@linux.vnet.ibm.com>
Date: Wed, 9 Dec 2015 17:18:20 +1100
Subject: [PATCH 086/149] powerpc/powernv: remove FW_FEATURE_OPALv3 and just
 use FW_FEATURE_OPAL

Long ago, only in the lab, there were OPALv1 and OPALv2. Now there is
just OPALv3, with nobody ever expecting anything on pre-OPALv3 to
be cared about or supported by mainline kernels.

So, let's remove FW_FEATURE_OPALv3 and instead use FW_FEATURE_OPAL
exclusively.

Signed-off-by: Stewart Smith <stewart@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/firmware.h          |  3 +-
 arch/powerpc/platforms/powernv/eeh-powernv.c |  4 +-
 arch/powerpc/platforms/powernv/idle.c        |  2 +-
 arch/powerpc/platforms/powernv/opal-xscom.c  |  2 +-
 arch/powerpc/platforms/powernv/opal.c        | 25 +++----
 arch/powerpc/platforms/powernv/pci-ioda.c    |  2 +-
 arch/powerpc/platforms/powernv/setup.c       |  8 +--
 arch/powerpc/platforms/powernv/smp.c         | 74 ++++++++------------
 drivers/cpufreq/powernv-cpufreq.c            |  2 +-
 drivers/cpuidle/cpuidle-powernv.c            |  2 +-
 10 files changed, 54 insertions(+), 70 deletions(-)

diff --git a/arch/powerpc/include/asm/firmware.h b/arch/powerpc/include/asm/firmware.h
index 50af5e5ea86f..b0629249778b 100644
--- a/arch/powerpc/include/asm/firmware.h
+++ b/arch/powerpc/include/asm/firmware.h
@@ -51,7 +51,6 @@
 #define FW_FEATURE_BEST_ENERGY	ASM_CONST(0x0000000080000000)
 #define FW_FEATURE_TYPE1_AFFINITY ASM_CONST(0x0000000100000000)
 #define FW_FEATURE_PRRN		ASM_CONST(0x0000000200000000)
-#define FW_FEATURE_OPALv3	ASM_CONST(0x0000000400000000)
 
 #ifndef __ASSEMBLY__
 
@@ -69,7 +68,7 @@ enum {
 		FW_FEATURE_SET_MODE | FW_FEATURE_BEST_ENERGY |
 		FW_FEATURE_TYPE1_AFFINITY | FW_FEATURE_PRRN,
 	FW_FEATURE_PSERIES_ALWAYS = 0,
-	FW_FEATURE_POWERNV_POSSIBLE = FW_FEATURE_OPAL | FW_FEATURE_OPALv3,
+	FW_FEATURE_POWERNV_POSSIBLE = FW_FEATURE_OPAL,
 	FW_FEATURE_POWERNV_ALWAYS = 0,
 	FW_FEATURE_PS3_POSSIBLE = FW_FEATURE_LPAR | FW_FEATURE_PS3_LV1,
 	FW_FEATURE_PS3_ALWAYS = FW_FEATURE_LPAR | FW_FEATURE_PS3_LV1,
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
index e1c90725522a..5f152b95ca0c 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -48,8 +48,8 @@ static int pnv_eeh_init(void)
 	struct pci_controller *hose;
 	struct pnv_phb *phb;
 
-	if (!firmware_has_feature(FW_FEATURE_OPALv3)) {
-		pr_warn("%s: OPALv3 is required !\n",
+	if (!firmware_has_feature(FW_FEATURE_OPAL)) {
+		pr_warn("%s: OPAL is required !\n",
 			__func__);
 		return -EINVAL;
 	}
diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
index 59d735d2e5c0..15bfbcd5debc 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -242,7 +242,7 @@ static int __init pnv_init_idle_states(void)
 	if (cpuidle_disable != IDLE_NO_OVERRIDE)
 		goto out;
 
-	if (!firmware_has_feature(FW_FEATURE_OPALv3))
+	if (!firmware_has_feature(FW_FEATURE_OPAL))
 		goto out;
 
 	power_mgt = of_find_node_by_path("/ibm,opal/power-mgt");
diff --git a/arch/powerpc/platforms/powernv/opal-xscom.c b/arch/powerpc/platforms/powernv/opal-xscom.c
index 7634d1c62299..d0ac535cf5d7 100644
--- a/arch/powerpc/platforms/powernv/opal-xscom.c
+++ b/arch/powerpc/platforms/powernv/opal-xscom.c
@@ -126,7 +126,7 @@ static const struct scom_controller opal_scom_controller = {
 
 static int opal_xscom_init(void)
 {
-	if (firmware_has_feature(FW_FEATURE_OPALv3))
+	if (firmware_has_feature(FW_FEATURE_OPAL))
 		scom_init(&opal_scom_controller);
 	return 0;
 }
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index 5ce51d9b4ca6..aad0033d65d1 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -98,10 +98,9 @@ int __init early_init_dt_scan_opal(unsigned long node,
 	pr_debug("OPAL Entry = 0x%llx (sizep=%p runtimesz=%d)\n",
 		 opal.size, sizep, runtimesz);
 
-	powerpc_firmware_features |= FW_FEATURE_OPAL;
 	if (of_flat_dt_is_compatible(node, "ibm,opal-v3")) {
-		powerpc_firmware_features |= FW_FEATURE_OPALv3;
-		pr_info("OPAL V3 detected !\n");
+		powerpc_firmware_features |= FW_FEATURE_OPAL;
+		pr_info("OPAL detected !\n");
 	} else {
 		panic("OPAL != V3 detected, no longer supported.\n");
 	}
@@ -348,17 +347,15 @@ int opal_put_chars(uint32_t vtermno, const char *data, int total_len)
 	 * enough room and be done with it
 	 */
 	spin_lock_irqsave(&opal_write_lock, flags);
-	if (firmware_has_feature(FW_FEATURE_OPALv3)) {
-		rc = opal_console_write_buffer_space(vtermno, &olen);
-		len = be64_to_cpu(olen);
-		if (rc || len < total_len) {
-			spin_unlock_irqrestore(&opal_write_lock, flags);
-			/* Closed -> drop characters */
-			if (rc)
-				return total_len;
-			opal_poll_events(NULL);
-			return -EAGAIN;
-		}
+	rc = opal_console_write_buffer_space(vtermno, &olen);
+	len = be64_to_cpu(olen);
+	if (rc || len < total_len) {
+		spin_unlock_irqrestore(&opal_write_lock, flags);
+		/* Closed -> drop characters */
+		if (rc)
+			return total_len;
+		opal_poll_events(NULL);
+		return -EAGAIN;
 	}
 
 	/* We still try to handle partial completions, though they
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 414fd1a00fda..cdd5fa942aed 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -344,7 +344,7 @@ static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb)
 		return;
 	}
 
-	if (!firmware_has_feature(FW_FEATURE_OPALv3)) {
+	if (!firmware_has_feature(FW_FEATURE_OPAL)) {
 		pr_info("  Firmware too old to support M64 window\n");
 		return;
 	}
diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index 54583fc417be..1acb0c72d923 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -90,8 +90,8 @@ static void pnv_show_cpuinfo(struct seq_file *m)
 	if (root)
 		model = of_get_property(root, "model", NULL);
 	seq_printf(m, "machine\t\t: PowerNV %s\n", model);
-	if (firmware_has_feature(FW_FEATURE_OPALv3))
-		seq_printf(m, "firmware\t: OPAL v3\n");
+	if (firmware_has_feature(FW_FEATURE_OPAL))
+		seq_printf(m, "firmware\t: OPAL\n");
 	else
 		seq_printf(m, "firmware\t: BML\n");
 	of_node_put(root);
@@ -220,9 +220,9 @@ static void pnv_kexec_cpu_down(int crash_shutdown, int secondary)
 {
 	xics_kexec_teardown_cpu(secondary);
 
-	/* On OPAL v3, we return all CPUs to firmware */
+	/* On OPAL, we return all CPUs to firmware */
 
-	if (!firmware_has_feature(FW_FEATURE_OPALv3))
+	if (!firmware_has_feature(FW_FEATURE_OPAL))
 		return;
 
 	if (secondary) {
diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index 9b968a315103..ad7b1a3dbed0 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -61,14 +61,15 @@ static int pnv_smp_kick_cpu(int nr)
 	unsigned long start_here =
 			__pa(ppc_function_entry(generic_secondary_smp_init));
 	long rc;
+	uint8_t status;
 
 	BUG_ON(nr < 0 || nr >= NR_CPUS);
 
 	/*
-	 * If we already started or OPALv3 is not supported, we just
+	 * If we already started or OPAL is not supported, we just
 	 * kick the CPU via the PACA
 	 */
-	if (paca[nr].cpu_start || !firmware_has_feature(FW_FEATURE_OPALv3))
+	if (paca[nr].cpu_start || !firmware_has_feature(FW_FEATURE_OPAL))
 		goto kick;
 
 	/*
@@ -77,55 +78,42 @@ static int pnv_smp_kick_cpu(int nr)
 	 * first time. OPAL v3 allows us to query OPAL to know if it
 	 * has the CPUs, so we do that
 	 */
-	if (firmware_has_feature(FW_FEATURE_OPALv3)) {
-		uint8_t status;
+	rc = opal_query_cpu_status(pcpu, &status);
+	if (rc != OPAL_SUCCESS) {
+		pr_warn("OPAL Error %ld querying CPU %d state\n", rc, nr);
+		return -ENODEV;
+	}
 
-		rc = opal_query_cpu_status(pcpu, &status);
+	/*
+	 * Already started, just kick it, probably coming from
+	 * kexec and spinning
+	 */
+	if (status == OPAL_THREAD_STARTED)
+		goto kick;
+
+	/*
+	 * Available/inactive, let's kick it
+	 */
+	if (status == OPAL_THREAD_INACTIVE) {
+		pr_devel("OPAL: Starting CPU %d (HW 0x%x)...\n", nr, pcpu);
+		rc = opal_start_cpu(pcpu, start_here);
 		if (rc != OPAL_SUCCESS) {
-			pr_warn("OPAL Error %ld querying CPU %d state\n",
-				rc, nr);
-			return -ENODEV;
-		}
-
-		/*
-		 * Already started, just kick it, probably coming from
-		 * kexec and spinning
-		 */
-		if (status == OPAL_THREAD_STARTED)
-			goto kick;
-
-		/*
-		 * Available/inactive, let's kick it
-		 */
-		if (status == OPAL_THREAD_INACTIVE) {
-			pr_devel("OPAL: Starting CPU %d (HW 0x%x)...\n",
-				 nr, pcpu);
-			rc = opal_start_cpu(pcpu, start_here);
-			if (rc != OPAL_SUCCESS) {
-				pr_warn("OPAL Error %ld starting CPU %d\n",
-					rc, nr);
-				return -ENODEV;
-			}
-		} else {
-			/*
-			 * An unavailable CPU (or any other unknown status)
-			 * shouldn't be started. It should also
-			 * not be in the possible map but currently it can
-			 * happen
-			 */
-			pr_devel("OPAL: CPU %d (HW 0x%x) is unavailable"
-				 " (status %d)...\n", nr, pcpu, status);
+			pr_warn("OPAL Error %ld starting CPU %d\n", rc, nr);
 			return -ENODEV;
 		}
 	} else {
 		/*
-		 * On OPAL v2, we just kick it and hope for the best,
-		 * we must not test the error from opal_start_cpu() or
-		 * we would fail to get CPUs from kexec.
+		 * An unavailable CPU (or any other unknown status)
+		 * shouldn't be started. It should also
+		 * not be in the possible map but currently it can
+		 * happen
 		 */
-		opal_start_cpu(pcpu, start_here);
+		pr_devel("OPAL: CPU %d (HW 0x%x) is unavailable"
+			 " (status %d)...\n", nr, pcpu, status);
+		return -ENODEV;
 	}
- kick:
+
+kick:
 	return smp_generic_kick_cpu(nr);
 }
 
diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c
index cb501386eb6e..547890fd9572 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -586,7 +586,7 @@ static int __init powernv_cpufreq_init(void)
 	int rc = 0;
 
 	/* Don't probe on pseries (guest) platforms */
-	if (!firmware_has_feature(FW_FEATURE_OPALv3))
+	if (!firmware_has_feature(FW_FEATURE_OPAL))
 		return -ENODEV;
 
 	/* Discover pstates from device tree and init */
diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c
index 845bafcfa792..e12dc30d8864 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -264,7 +264,7 @@ static int powernv_idle_probe(void)
 	if (cpuidle_disable != IDLE_NO_OVERRIDE)
 		return -ENODEV;
 
-	if (firmware_has_feature(FW_FEATURE_OPALv3)) {
+	if (firmware_has_feature(FW_FEATURE_OPAL)) {
 		cpuidle_state_table = powernv_states;
 		/* Device tree can indicate more idle states */
 		max_idle_state = powernv_add_idle_states();

From 209eb4e5cbaba53ab555f3e7b43aa27176f3a925 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Wed, 16 Dec 2015 21:01:42 +1100
Subject: [PATCH 087/149] powerpc/rtas: Add rtas_call_unlocked()

Most users of RTAS (Run-Time Abstraction Services) use rtas_call(),
which deals with locking as well as endian handling.

However we have two users outside of rtas.c that can't use rtas_call()
because they have different locking requirements.

The hotplug CPU code can't take the RTAS lock because the CPU would go
offline with the lock held and no other CPUs would be able to call RTAS
until the CPU came back online.

The xmon code doesn't want to take the lock because it would risk
deadlocking when we are trying to recover from a crash.

Both sites required multiple patches when we added little endian
support, proving that programmers can't do endian right.

Although that ship has sailed, we can still clean the code up by
providing an unlocked version of rtas_call() which avoids the need to
open code the logic elsewhere.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/rtas.h |  2 ++
 arch/powerpc/kernel/rtas.c      | 44 ++++++++++++++++++++++++---------
 2 files changed, 35 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
index b77ef369c0f0..6db1d6977a0d 100644
--- a/arch/powerpc/include/asm/rtas.h
+++ b/arch/powerpc/include/asm/rtas.h
@@ -338,6 +338,8 @@ extern void enter_rtas(unsigned long);
 extern int rtas_token(const char *service);
 extern int rtas_service_present(const char *service);
 extern int rtas_call(int token, int, int, int *, ...);
+void rtas_call_unlocked(struct rtas_args *args, int token, int nargs,
+			int nret, ...);
 extern void rtas_restart(char *cmd);
 extern void rtas_power_off(void);
 extern void rtas_halt(void);
diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index 5a753fae8265..fcf2d653a6fe 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -418,6 +418,36 @@ static char *__fetch_rtas_last_error(char *altbuf)
 #define get_errorlog_buffer()		NULL
 #endif
 
+
+static void
+va_rtas_call_unlocked(struct rtas_args *args, int token, int nargs, int nret,
+		      va_list list)
+{
+	int i;
+
+	args->token = cpu_to_be32(token);
+	args->nargs = cpu_to_be32(nargs);
+	args->nret  = cpu_to_be32(nret);
+	args->rets  = &(args->args[nargs]);
+
+	for (i = 0; i < nargs; ++i)
+		args->args[i] = cpu_to_be32(va_arg(list, __u32));
+
+	for (i = 0; i < nret; ++i)
+		args->rets[i] = 0;
+
+	enter_rtas(__pa(args));
+}
+
+void rtas_call_unlocked(struct rtas_args *args, int token, int nargs, int nret, ...)
+{
+	va_list list;
+
+	va_start(list, nret);
+	va_rtas_call_unlocked(args, token, nargs, nret, list);
+	va_end(list);
+}
+
 int rtas_call(int token, int nargs, int nret, int *outputs, ...)
 {
 	va_list list;
@@ -431,22 +461,14 @@ int rtas_call(int token, int nargs, int nret, int *outputs, ...)
 		return -1;
 
 	s = lock_rtas();
+
+	/* We use the global rtas args buffer */
 	rtas_args = &rtas.args;
 
-	rtas_args->token = cpu_to_be32(token);
-	rtas_args->nargs = cpu_to_be32(nargs);
-	rtas_args->nret  = cpu_to_be32(nret);
-	rtas_args->rets  = &(rtas_args->args[nargs]);
 	va_start(list, outputs);
-	for (i = 0; i < nargs; ++i)
-		rtas_args->args[i] = cpu_to_be32(va_arg(list, __u32));
+	va_rtas_call_unlocked(rtas_args, token, nargs, nret, list);
 	va_end(list);
 
-	for (i = 0; i < nret; ++i)
-		rtas_args->rets[i] = 0;
-
-	enter_rtas(__pa(rtas_args));
-
 	/* A -1 return code indicates that the last command couldn't
 	   be completed due to a hardware error. */
 	if (be32_to_cpu(rtas_args->rets[0]) == -1)

From 08eb105a7c18c917f2ed7afc5a151f0514f26460 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Tue, 24 Nov 2015 22:26:09 +1100
Subject: [PATCH 088/149] powerpc/xmon: Use rtas_call_unlocked() in xmon

Avoid open coding the logic by using rtas_call_unlocked().

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/xmon/xmon.c | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index e8c7a937955e..07a8508cb7fa 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -320,6 +320,7 @@ static inline void disable_surveillance(void)
 #ifdef CONFIG_PPC_PSERIES
 	/* Since this can't be a module, args should end up below 4GB. */
 	static struct rtas_args args;
+	int token;
 
 	/*
 	 * At this point we have got all the cpus we can into
@@ -328,17 +329,12 @@ static inline void disable_surveillance(void)
 	 * If we did try to take rtas.lock there would be a
 	 * real possibility of deadlock.
 	 */
-	args.token = rtas_token("set-indicator");
-	if (args.token == RTAS_UNKNOWN_SERVICE)
+	token = rtas_token("set-indicator");
+	if (token == RTAS_UNKNOWN_SERVICE)
 		return;
-	args.token = cpu_to_be32(args.token);
-	args.nargs = cpu_to_be32(3);
-	args.nret = cpu_to_be32(1);
-	args.rets = &args.args[3];
-	args.args[0] = cpu_to_be32(SURVEILLANCE_TOKEN);
-	args.args[1] = 0;
-	args.args[2] = 0;
-	enter_rtas(__pa(&args));
+
+	rtas_call_unlocked(&args, token, 3, 1, SURVEILLANCE_TOKEN, 0, 0);
+
 #endif /* CONFIG_PPC_PSERIES */
 }
 

From b2e8590fa1e35c38680dcb87c9d1bfdcc6c61a40 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Tue, 24 Nov 2015 22:26:10 +1100
Subject: [PATCH 089/149] powerpc/pseries: Use rtas_call_unlocked() in pseries
 hotplug

Avoid open coding the logic by using rtas_call_unlocked().

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/pseries/hotplug-cpu.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c
index 62475440fd45..86d2ecacb237 100644
--- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
+++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
@@ -88,13 +88,7 @@ void set_default_offline_state(int cpu)
 
 static void rtas_stop_self(void)
 {
-	static struct rtas_args args = {
-		.nargs = 0,
-		.nret = cpu_to_be32(1),
-		.rets = &args.args[0],
-	};
-
-	args.token = cpu_to_be32(rtas_stop_self_token);
+	static struct rtas_args args;
 
 	local_irq_disable();
 
@@ -102,7 +96,8 @@ static void rtas_stop_self(void)
 
 	printk("cpu %u (hwid %u) Ready to die...\n",
 	       smp_processor_id(), hard_smp_processor_id());
-	enter_rtas(__pa(&args));
+
+	rtas_call_unlocked(&args, rtas_stop_self_token, 0, 1, NULL);
 
 	panic("Alas, I survived.\n");
 }

From 4456f4524604be2558e5f6a8e0f7cc9ed17c783e Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Tue, 24 Nov 2015 22:26:11 +1100
Subject: [PATCH 090/149] powerpc/rtas: Use rtas_call_unlocked() in
 call_rtas_display_status()

Although call_rtas_display_status() does actually want to use the
regular RTAS locking, it doesn't want the extra logic that is in
rtas_call(), so currently it open codes the logic.

Instead we can use rtas_call_unlocked(), after taking the RTAS lock.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/rtas.c | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index fcf2d653a6fe..f4fa137292c4 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -93,21 +93,13 @@ static void unlock_rtas(unsigned long flags)
  */
 static void call_rtas_display_status(unsigned char c)
 {
-	struct rtas_args *args = &rtas.args;
 	unsigned long s;
 
 	if (!rtas.base)
 		return;
+
 	s = lock_rtas();
-
-	args->token = cpu_to_be32(10);
-	args->nargs = cpu_to_be32(1);
-	args->nret  = cpu_to_be32(1);
-	args->rets  = &(args->args[1]);
-	args->args[0] = cpu_to_be32(c);
-
-	enter_rtas(__pa(args));
-
+	rtas_call_unlocked(&rtas.args, 10, 1, 1, c);
 	unlock_rtas(s);
 }
 

From cd5cdeb6c8a42fb87644b0eb5d240f6ce6172402 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Tue, 24 Nov 2015 22:26:12 +1100
Subject: [PATCH 091/149] powerpc/rtas: Make enter_rtas() private

There are no longer any users of enter_rtas() outside of rtas.c, so make
it "private", by moving the declaration inside rtas.c. Hopefully this
will encourage people to use one of the wrappers which takes the sharp
edges off the RTAS calling sequence.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/rtas.h | 1 -
 arch/powerpc/kernel/rtas.c      | 3 +++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
index 6db1d6977a0d..51400baa8d48 100644
--- a/arch/powerpc/include/asm/rtas.h
+++ b/arch/powerpc/include/asm/rtas.h
@@ -334,7 +334,6 @@ extern void (*rtas_flash_term_hook)(int);
 
 extern struct rtas_t rtas;
 
-extern void enter_rtas(unsigned long);
 extern int rtas_token(const char *service);
 extern int rtas_service_present(const char *service);
 extern int rtas_call(int token, int, int, int *, ...);
diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index f4fa137292c4..28736ff27fea 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -44,6 +44,9 @@
 #include <asm/mmu.h>
 #include <asm/topology.h>
 
+/* This is here deliberately so it's only used in this file */
+void enter_rtas(unsigned long);
+
 struct rtas_t rtas = {
 	.lock = __ARCH_SPIN_LOCK_UNLOCKED
 };

From d6265aeaf815801ad53a95f11cea8ea752862176 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Wed, 25 Nov 2015 14:25:16 +1100
Subject: [PATCH 092/149] powerpc/kernel: Drop HMT_MEDIUM_PPR_DISCARD

HMT_MEDIUM_PPR_DISCARD is a macro which is present at the start of most
of our first level exception handlers. It conditionally executes a
HMT_MEDIUM instruction, which sets the processor priority to medium.

On modern systems, ie. Power7 and later, it is nop'ed out at boot.
All it does is make the exception vectors more cramped, and consume 4
bytes of icache.

On old systems it has the effect of boosting the processor priority at
the start of exception processing. If we were previously in the idle
loop for example, we may be at low or very low priority. This is
desirable as we want to process the exception as fast as possible.

However looking closely at the generated code, we see that in all cases
we execute another HMT_MEDIUM just four instructions later. With code
patching applied, the final code on an old (Power6) system will look
like, eg:

  c000000000000300 <data_access_pSeries>:
  c000000000000300:	7c 42 13 78	mr	r2,r2		<-
  c000000000000304:	7d b2 43 a6	mtsprg	2,r13
  c000000000000308:	7d b1 42 a6	mfsprg	r13,1
  c00000000000030c:	f9 2d 00 80	std	r9,128(r13)
  c000000000000310:	60 00 00 00	nop
  c000000000000314:	7c 42 13 78	mr	r2,r2		<-

So I suggest that the added code complexity of HMT_MEDIUM_PPR_DISCARD is
not justified by the benefit of boosting the processor priority for the
duration of four instructions, and therefore we drop it.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/exception-64s.h | 15 ---------------
 arch/powerpc/kernel/exceptions-64s.S     |  9 ---------
 2 files changed, 24 deletions(-)

diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
index 9ee10781121f..60b2bbda212d 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -129,15 +129,6 @@ BEGIN_FTR_SECTION_NESTED(941)						\
 	mtspr	SPRN_PPR,ra;						\
 END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,941)
 
-/*
- * Increase the priority on systems where PPR save/restore is not
- * implemented/ supported.
- */
-#define HMT_MEDIUM_PPR_DISCARD						\
-BEGIN_FTR_SECTION_NESTED(942)						\
-	HMT_MEDIUM;							\
-END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,0,942)  /*non P7*/		
-
 /*
  * Get an SPR into a register if the CPU has the given feature
  */
@@ -346,7 +337,6 @@ do_kvm_##n:								\
 	. = loc;					\
 	.globl label##_pSeries;				\
 label##_pSeries:					\
-	HMT_MEDIUM_PPR_DISCARD;				\
 	SET_SCRATCH0(r13);		/* save r13 */		\
 	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label##_common,	\
 				 EXC_STD, KVMTEST, vec)
@@ -362,7 +352,6 @@ label##_pSeries:						\
 	. = loc;					\
 	.globl label##_hv;				\
 label##_hv:						\
-	HMT_MEDIUM_PPR_DISCARD;				\
 	SET_SCRATCH0(r13);	/* save r13 */			\
 	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label##_common,	\
 				 EXC_HV, KVMTEST, vec)
@@ -378,7 +367,6 @@ label##_hv:						\
 	. = loc;					\
 	.globl label##_relon_pSeries;			\
 label##_relon_pSeries:					\
-	HMT_MEDIUM_PPR_DISCARD;				\
 	/* No guest interrupts come through here */	\
 	SET_SCRATCH0(r13);		/* save r13 */	\
 	EXCEPTION_RELON_PROLOG_PSERIES(PACA_EXGEN, label##_common, \
@@ -394,7 +382,6 @@ label##_relon_pSeries:						\
 	. = loc;					\
 	.globl label##_relon_hv;			\
 label##_relon_hv:					\
-	HMT_MEDIUM_PPR_DISCARD;				\
 	/* No guest interrupts come through here */	\
 	SET_SCRATCH0(r13);	/* save r13 */		\
 	EXCEPTION_RELON_PROLOG_PSERIES(PACA_EXGEN, label##_common, \
@@ -448,7 +435,6 @@ label##_relon_hv:						\
 	. = loc;							\
 	.globl label##_pSeries;						\
 label##_pSeries:							\
-	HMT_MEDIUM_PPR_DISCARD;						\
 	_MASKABLE_EXCEPTION_PSERIES(vec, label,				\
 				    EXC_STD, SOFTEN_TEST_PR)
 
@@ -466,7 +452,6 @@ label##_hv:								\
 	EXCEPTION_PROLOG_PSERIES_1(label##_common, EXC_HV);
 
 #define __MASKABLE_RELON_EXCEPTION_PSERIES(vec, label, h, extra)	\
-	HMT_MEDIUM_PPR_DISCARD;						\
 	SET_SCRATCH0(r13);    /* save r13 */				\
 	EXCEPTION_PROLOG_0(PACA_EXGEN);					\
 	__EXCEPTION_PROLOG_1(PACA_EXGEN, extra, vec);		\
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 3419cbf2ad59..0757f23d35aa 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -96,7 +96,6 @@ __start_interrupts:
 
 	.globl system_reset_pSeries;
 system_reset_pSeries:
-	HMT_MEDIUM_PPR_DISCARD
 	SET_SCRATCH0(r13)
 #ifdef CONFIG_PPC_P7_NAP
 BEGIN_FTR_SECTION
@@ -164,7 +163,6 @@ machine_check_pSeries_1:
 	 * some code path might still want to branch into the original
 	 * vector
 	 */
-	HMT_MEDIUM_PPR_DISCARD
 	SET_SCRATCH0(r13)		/* save r13 */
 #ifdef CONFIG_PPC_P7_NAP
 BEGIN_FTR_SECTION
@@ -199,7 +197,6 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE)
 	. = 0x300
 	.globl data_access_pSeries
 data_access_pSeries:
-	HMT_MEDIUM_PPR_DISCARD
 	SET_SCRATCH0(r13)
 	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, data_access_common, EXC_STD,
 				 KVMTEST, 0x300)
@@ -207,7 +204,6 @@ data_access_pSeries:
 	. = 0x380
 	.globl data_access_slb_pSeries
 data_access_slb_pSeries:
-	HMT_MEDIUM_PPR_DISCARD
 	SET_SCRATCH0(r13)
 	EXCEPTION_PROLOG_0(PACA_EXSLB)
 	EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST, 0x380)
@@ -239,7 +235,6 @@ data_access_slb_pSeries:
 	. = 0x480
 	.globl instruction_access_slb_pSeries
 instruction_access_slb_pSeries:
-	HMT_MEDIUM_PPR_DISCARD
 	SET_SCRATCH0(r13)
 	EXCEPTION_PROLOG_0(PACA_EXSLB)
 	EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST, 0x480)
@@ -269,7 +264,6 @@ instruction_access_slb_pSeries:
 	.globl hardware_interrupt_hv;
 hardware_interrupt_pSeries:
 hardware_interrupt_hv:
-	HMT_MEDIUM_PPR_DISCARD
 	BEGIN_FTR_SECTION
 		_MASKABLE_EXCEPTION_PSERIES(0x502, hardware_interrupt,
 					    EXC_HV, SOFTEN_TEST_HV)
@@ -413,7 +407,6 @@ hv_facility_unavailable_trampoline:
 	. = 0x1500
 	.global denorm_exception_hv
 denorm_exception_hv:
-	HMT_MEDIUM_PPR_DISCARD
 	mtspr	SPRN_SPRG_HSCRATCH0,r13
 	EXCEPTION_PROLOG_0(PACA_EXGEN)
 	EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, 0x1500)
@@ -527,7 +520,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
 machine_check_pSeries:
 	.globl machine_check_fwnmi
 machine_check_fwnmi:
-	HMT_MEDIUM_PPR_DISCARD
 	SET_SCRATCH0(r13)		/* save r13 */
 	EXCEPTION_PROLOG_0(PACA_EXMC)
 machine_check_pSeries_0:
@@ -711,7 +703,6 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE)
 	.globl system_reset_fwnmi
       .align 7
 system_reset_fwnmi:
-	HMT_MEDIUM_PPR_DISCARD
 	SET_SCRATCH0(r13)		/* save r13 */
 	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common, EXC_STD,
 				 NOTEST, 0x100)

From d030a4b5eb7e9514c15b84a765bcc395cc26ab40 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Wed, 25 Nov 2015 14:25:17 +1100
Subject: [PATCH 093/149] powerpc/kernel: Open code HMT_MEDIUM_LOW_HAS_PPR

HMT_MEDIUM_LOW_HAS_PPR is only used in one place, open code it.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/ppc_asm.h | 5 -----
 arch/powerpc/kernel/entry_64.S     | 6 +++++-
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h
index dd0fc18d8103..67f05d4935a0 100644
--- a/arch/powerpc/include/asm/ppc_asm.h
+++ b/arch/powerpc/include/asm/ppc_asm.h
@@ -418,11 +418,6 @@ END_FTR_SECTION_IFCLR(CPU_FTR_601)
  * PPR restore macros used in entry_64.S
  * Used for P7 or later processors
  */
-#define HMT_MEDIUM_LOW_HAS_PPR						\
-BEGIN_FTR_SECTION_NESTED(944)						\
-	HMT_MEDIUM_LOW;							\
-END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,944)
-
 #define SET_DEFAULT_THREAD_PPR(ra, rb)					\
 BEGIN_FTR_SECTION_NESTED(945)						\
 	lis	ra,INIT_PPR@highest;	/* default ppr=3 */		\
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index c8b4225a0095..651a65552ac8 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -223,7 +223,11 @@ END_FTR_SECTION_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS)
 
 	beq-	1f
 	ACCOUNT_CPU_USER_EXIT(r11, r12)
-	HMT_MEDIUM_LOW_HAS_PPR
+
+BEGIN_FTR_SECTION
+	HMT_MEDIUM_LOW
+END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
+
 	ld	r13,GPR13(r1)	/* only restore r13 if returning to usermode */
 1:	ld	r2,GPR2(r1)
 	ld	r1,GPR1(r1)

From d8725ce86c37fa750fc01f739ee4d4ced39167da Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Wed, 25 Nov 2015 14:25:18 +1100
Subject: [PATCH 094/149] powerpc/kernel: Open code SET_DEFAULT_THREAD_PPR

This is only used in one location, open code it.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/ppc_asm.h | 13 -------------
 arch/powerpc/kernel/entry_64.S     |  8 +++++++-
 2 files changed, 7 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h
index 67f05d4935a0..499d9f89435a 100644
--- a/arch/powerpc/include/asm/ppc_asm.h
+++ b/arch/powerpc/include/asm/ppc_asm.h
@@ -413,19 +413,6 @@ END_FTR_SECTION_IFCLR(CPU_FTR_601)
 	FTR_SECTION_ELSE_NESTED(848);	\
 	mtocrf (FXM), RS;		\
 	ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_NOEXECUTE, 848)
-
-/*
- * PPR restore macros used in entry_64.S
- * Used for P7 or later processors
- */
-#define SET_DEFAULT_THREAD_PPR(ra, rb)					\
-BEGIN_FTR_SECTION_NESTED(945)						\
-	lis	ra,INIT_PPR@highest;	/* default ppr=3 */		\
-	ld	rb,PACACURRENT(r13);					\
-	sldi	ra,ra,32;	/* 11- 13 bits are used for ppr */	\
-	std	ra,TASKTHREADPPR(rb);					\
-END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,945)
-
 #endif
 
 /*
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 651a65552ac8..0d525ce3717f 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -316,7 +316,13 @@ syscall_exit_work:
 	subi	r12,r12,TI_FLAGS
 
 4:	/* Anything else left to do? */
-	SET_DEFAULT_THREAD_PPR(r3, r10)		/* Set thread.ppr = 3 */
+BEGIN_FTR_SECTION
+	lis	r3,INIT_PPR@highest	/* Set thread.ppr = 3 */
+	ld	r10,PACACURRENT(r13)
+	sldi	r3,r3,32	/* bits 11-13 are used for ppr */
+	std	r3,TASKTHREADPPR(r10)
+END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
+
 	andi.	r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP)
 	beq	ret_from_except_lite
 

From 2613265cb5b07a46bc01eb67202874136efd7049 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Wed, 16 Dec 2015 21:10:22 +1100
Subject: [PATCH 095/149] powerpc/kernel: Combine vec/loc for
 STD_EXCEPTION_PSERIES

The STD_EXCEPTION_PSERIES macro takes both a vector number, and a
location (memory address). However both are always identical, so combine
them to save repeating ourselves.

This does mean an exception handler must always exist at the location in
memory that matches its vector number. But that's OK because this is the
"STD" macro (standard), which does exactly that. We have other macros
for the other cases, eg. STD_EXCEPTION_PSERIES_OOL (out of line).

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/exception-64s.h |  4 ++--
 arch/powerpc/kernel/exceptions-64s.S     | 16 ++++++++--------
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
index 60b2bbda212d..93ae809fe5ea 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -333,8 +333,8 @@ do_kvm_##n:								\
 /*
  * Exception vectors.
  */
-#define STD_EXCEPTION_PSERIES(loc, vec, label)		\
-	. = loc;					\
+#define STD_EXCEPTION_PSERIES(vec, label)		\
+	. = vec;					\
 	.globl label##_pSeries;				\
 label##_pSeries:					\
 	SET_SCRATCH0(r13);		/* save r13 */		\
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 0757f23d35aa..7716cebf4b8e 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -230,7 +230,7 @@ data_access_slb_pSeries:
 	bctr
 #endif
 
-	STD_EXCEPTION_PSERIES(0x400, 0x400, instruction_access)
+	STD_EXCEPTION_PSERIES(0x400, instruction_access)
 
 	. = 0x480
 	.globl instruction_access_slb_pSeries
@@ -274,13 +274,13 @@ hardware_interrupt_hv:
 		KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x500)
 	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
 
-	STD_EXCEPTION_PSERIES(0x600, 0x600, alignment)
+	STD_EXCEPTION_PSERIES(0x600, alignment)
 	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x600)
 
-	STD_EXCEPTION_PSERIES(0x700, 0x700, program_check)
+	STD_EXCEPTION_PSERIES(0x700, program_check)
 	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x700)
 
-	STD_EXCEPTION_PSERIES(0x800, 0x800, fp_unavailable)
+	STD_EXCEPTION_PSERIES(0x800, fp_unavailable)
 	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x800)
 
 	. = 0x900
@@ -293,7 +293,7 @@ decrementer_pSeries:
 	MASKABLE_EXCEPTION_PSERIES(0xa00, 0xa00, doorbell_super)
 	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xa00)
 
-	STD_EXCEPTION_PSERIES(0xb00, 0xb00, trap_0b)
+	STD_EXCEPTION_PSERIES(0xb00, trap_0b)
 	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xb00)
 
 	. = 0xc00
@@ -325,7 +325,7 @@ system_call_pSeries:
 	SYSCALL_PSERIES_3
 	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xc00)
 
-	STD_EXCEPTION_PSERIES(0xd00, 0xd00, single_step)
+	STD_EXCEPTION_PSERIES(0xd00, single_step)
 	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xd00)
 
 	/* At 0xe??? we have a bunch of hypervisor exceptions, we branch
@@ -401,7 +401,7 @@ hv_facility_unavailable_trampoline:
 	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0x1202)
 #endif /* CONFIG_CBE_RAS */
 
-	STD_EXCEPTION_PSERIES(0x1300, 0x1300, instruction_breakpoint)
+	STD_EXCEPTION_PSERIES(0x1300, instruction_breakpoint)
 	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_STD, 0x1300)
 
 	. = 0x1500
@@ -428,7 +428,7 @@ denorm_exception_hv:
 	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0x1602)
 #endif /* CONFIG_CBE_RAS */
 
-	STD_EXCEPTION_PSERIES(0x1700, 0x1700, altivec_assist)
+	STD_EXCEPTION_PSERIES(0x1700, altivec_assist)
 	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x1700)
 
 #ifdef CONFIG_CBE_RAS

From 7207f43665b83ed7881c5111bc45475ccf5ce48b Mon Sep 17 00:00:00 2001
From: Laurent Dufour <ldufour@linux.vnet.ibm.com>
Date: Thu, 3 Dec 2015 11:29:19 +0100
Subject: [PATCH 096/149] powerpc/mm: Add page soft dirty tracking

The user space checkpoint and restart tool (CRIU) needs page changes
to be soft tracked. This allows it to do a pre-checkpoint and then dump
only the touched pages.

This is done by using a newly assigned PTE bit (_PAGE_SOFT_DIRTY) when
the page is backed in memory, and a new _PAGE_SWP_SOFT_DIRTY bit when
the page is swapped out.

To introduce a new PTE _PAGE_SOFT_DIRTY bit value common to hash 4k
and hash 64k pte, the bits already defined in hash-*4k.h should be
shifted left by one.

The _PAGE_SWP_SOFT_DIRTY bit is dynamically put after the swap type in
the swap pte. A check is added to ensure that the bit is not
overwritten by _PAGE_HPTEFLAGS.

Signed-off-by: Laurent Dufour <ldufour@linux.vnet.ibm.com>
CC: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/Kconfig                          |  2 ++
 arch/powerpc/include/asm/book3s/64/hash-4k.h  |  2 +-
 arch/powerpc/include/asm/book3s/64/hash-64k.h |  4 +--
 arch/powerpc/include/asm/book3s/64/hash.h     | 30 +++++++++++++++----
 arch/powerpc/include/asm/book3s/64/pgtable.h  | 26 ++++++++++++++++
 5 files changed, 56 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index db49e0d796b1..6e03f85b11cd 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -559,6 +559,7 @@ choice
 
 config PPC_4K_PAGES
 	bool "4k page size"
+	select HAVE_ARCH_SOFT_DIRTY if CHECKPOINT_RESTORE && PPC_BOOK3S
 
 config PPC_16K_PAGES
 	bool "16k page size"
@@ -567,6 +568,7 @@ config PPC_16K_PAGES
 config PPC_64K_PAGES
 	bool "64k page size"
 	depends on !PPC_FSL_BOOK3E && (44x || PPC_STD_MMU_64 || PPC_BOOK3E_64)
+	select HAVE_ARCH_SOFT_DIRTY if CHECKPOINT_RESTORE && PPC_BOOK3S
 
 config PPC_256K_PAGES
 	bool "256k page size"
diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h
index e59832c94609..ea0414d6659e 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-4k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h
@@ -52,7 +52,7 @@
 			 _PAGE_F_SECOND | _PAGE_F_GIX)
 
 /* shift to put page number into pte */
-#define PTE_RPN_SHIFT	(17)
+#define PTE_RPN_SHIFT	(18)
 
 #define _PAGE_4K_PFN		0
 #ifndef __ASSEMBLY__
diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h
index 9f9942998587..9e55e3b1fef0 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
@@ -25,8 +25,8 @@
 #define PGDIR_SIZE	(1UL << PGDIR_SHIFT)
 #define PGDIR_MASK	(~(PGDIR_SIZE-1))
 
-#define _PAGE_COMBO	0x00020000 /* this is a combo 4k page */
-#define _PAGE_4K_PFN	0x00040000 /* PFN is for a single 4k page */
+#define _PAGE_COMBO	0x00040000 /* this is a combo 4k page */
+#define _PAGE_4K_PFN	0x00080000 /* PFN is for a single 4k page */
 /*
  * Used to track subpage group valid if _PAGE_COMBO is set
  * This overloads _PAGE_F_GIX and _PAGE_F_SECOND
diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h
index 8b929e531758..9e861b4378bd 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -33,6 +33,7 @@
 #define _PAGE_F_GIX_SHIFT	12
 #define _PAGE_F_SECOND		0x08000 /* Whether to use secondary hash or not */
 #define _PAGE_SPECIAL		0x10000 /* software: special page */
+#define _PAGE_SOFT_DIRTY	0x20000 /* software: software dirty tracking */
 
 /*
  * THP pages can't be special. So use the _PAGE_SPECIAL
@@ -50,7 +51,7 @@
  */
 #define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS |		\
 			 _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_SPLITTING | \
-			 _PAGE_THP_HUGE | _PAGE_PTE)
+			 _PAGE_THP_HUGE | _PAGE_PTE | _PAGE_SOFT_DIRTY)
 
 #ifdef CONFIG_PPC_64K_PAGES
 #include <asm/book3s/64/hash-64k.h>
@@ -136,14 +137,16 @@
  * pgprot changes
  */
 #define _PAGE_CHG_MASK	(PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \
-			 _PAGE_ACCESSED | _PAGE_SPECIAL | _PAGE_PTE)
+			 _PAGE_ACCESSED | _PAGE_SPECIAL | _PAGE_PTE | \
+			 _PAGE_SOFT_DIRTY)
 /*
  * Mask of bits returned by pte_pgprot()
  */
 #define PAGE_PROT_BITS	(_PAGE_GUARDED | _PAGE_COHERENT | _PAGE_NO_CACHE | \
 			 _PAGE_WRITETHRU | _PAGE_4K_PFN | \
 			 _PAGE_USER | _PAGE_ACCESSED |  \
-			 _PAGE_RW |  _PAGE_DIRTY | _PAGE_EXEC)
+			 _PAGE_RW |  _PAGE_DIRTY | _PAGE_EXEC | \
+			 _PAGE_SOFT_DIRTY)
 /*
  * We define 2 sets of base prot bits, one for basic pages (ie,
  * cacheable kernel and user pages) and one for non cacheable
@@ -339,7 +342,8 @@ static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
 static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry)
 {
 	unsigned long bits = pte_val(entry) &
-		(_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC);
+		(_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC |
+		 _PAGE_SOFT_DIRTY);
 
 	unsigned long old, tmp;
 
@@ -366,6 +370,22 @@ static inline int pte_special(pte_t pte)	{ return !!(pte_val(pte) & _PAGE_SPECIA
 static inline int pte_none(pte_t pte)		{ return (pte_val(pte) & ~_PTE_NONE_MASK) == 0; }
 static inline pgprot_t pte_pgprot(pte_t pte)	{ return __pgprot(pte_val(pte) & PAGE_PROT_BITS); }
 
+#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
+static inline bool pte_soft_dirty(pte_t pte)
+{
+	return !!(pte_val(pte) & _PAGE_SOFT_DIRTY);
+}
+static inline pte_t pte_mksoft_dirty(pte_t pte)
+{
+	return __pte(pte_val(pte) | _PAGE_SOFT_DIRTY);
+}
+
+static inline pte_t pte_clear_soft_dirty(pte_t pte)
+{
+	return __pte(pte_val(pte) & ~_PAGE_SOFT_DIRTY);
+}
+#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */
+
 #ifdef CONFIG_NUMA_BALANCING
 /*
  * These work without NUMA balancing but the kernel does not care. See the
@@ -424,7 +444,7 @@ static inline pte_t pte_mkwrite(pte_t pte)
 
 static inline pte_t pte_mkdirty(pte_t pte)
 {
-	return __pte(pte_val(pte) | _PAGE_DIRTY);
+	return __pte(pte_val(pte) | _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
 }
 
 static inline pte_t pte_mkyoung(pte_t pte)
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index a2d4e0e37067..03c1a5a21c0c 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -146,6 +146,7 @@ static inline void pgd_set(pgd_t *pgdp, unsigned long val)
 	 * We filter HPTEFLAGS on set_pte.			\
 	 */							\
 	BUILD_BUG_ON(_PAGE_HPTEFLAGS & (0x1f << _PAGE_BIT_SWAP_TYPE)); \
+	BUILD_BUG_ON(_PAGE_HPTEFLAGS & _PAGE_SWP_SOFT_DIRTY);	\
 	} while (0)
 /*
  * on pte we don't need handle RADIX_TREE_EXCEPTIONAL_SHIFT;
@@ -161,6 +162,24 @@ static inline void pgd_set(pgd_t *pgdp, unsigned long val)
 #define __pte_to_swp_entry(pte)		((swp_entry_t) { pte_val((pte)) })
 #define __swp_entry_to_pte(x)		__pte((x).val)
 
+#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
+#define _PAGE_SWP_SOFT_DIRTY   (1UL << (SWP_TYPE_BITS + _PAGE_BIT_SWAP_TYPE))
+static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
+{
+	return __pte(pte_val(pte) | _PAGE_SWP_SOFT_DIRTY);
+}
+static inline bool pte_swp_soft_dirty(pte_t pte)
+{
+	return !!(pte_val(pte) & _PAGE_SWP_SOFT_DIRTY);
+}
+static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
+{
+	return __pte(pte_val(pte) & ~_PAGE_SWP_SOFT_DIRTY);
+}
+#else
+#define _PAGE_SWP_SOFT_DIRTY	0
+#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */
+
 void pgtable_cache_add(unsigned shift, void (*ctor)(void *));
 void pgtable_cache_init(void);
 
@@ -201,6 +220,13 @@ static inline pte_t *pmdp_ptep(pmd_t *pmd)
 #define pmd_mkdirty(pmd)	pte_pmd(pte_mkdirty(pmd_pte(pmd)))
 #define pmd_mkyoung(pmd)	pte_pmd(pte_mkyoung(pmd_pte(pmd)))
 #define pmd_mkwrite(pmd)	pte_pmd(pte_mkwrite(pmd_pte(pmd)))
+
+#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
+#define pmd_soft_dirty(pmd)    pte_soft_dirty(pmd_pte(pmd))
+#define pmd_mksoft_dirty(pmd)  pte_pmd(pte_mksoft_dirty(pmd_pte(pmd)))
+#define pmd_clear_soft_dirty(pmd) pte_pmd(pte_clear_soft_dirty(pmd_pte(pmd)))
+#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */
+
 #ifdef CONFIG_NUMA_BALANCING
 static inline int pmd_protnone(pmd_t pmd)
 {

From e80c4e7ca5aed7d2fa766191bcf4e83fa411c720 Mon Sep 17 00:00:00 2001
From: Gavin Shan <gwshan@linux.vnet.ibm.com>
Date: Thu, 22 Oct 2015 12:03:08 +1100
Subject: [PATCH 097/149] powerpc/powernv: Fix M64 resource name in /proc/iomem

The name of PCI root bus's M64 resource isn't initialized properly.
When dumping "/proc/iomem", "<BAD>" is seen for those M64 resources
on PCI root buses.

   ~# cat /proc/iomem | grep -e "BAD"
   3b0000000000-3b0fefffffff : <BAD>
   3b1000000000-3b1fefffffff : <BAD>
   3c0000000000-3c0fefffffff : <BAD>
   3c1000000000-3c1fefffffff : <BAD>
   3c2000000000-3c2fefffffff : <BAD>

This fixes the issue by setting the name of PCI root bus's M64
resource to that of PHB's device node full name. With the patch,
no "<BAD>" is seen from "/proc/iomem".

Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index cdd5fa942aed..272f6566d790 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -357,6 +357,7 @@ static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb)
 	}
 
 	res = &hose->mem_resources[1];
+	res->name = dn->full_name;
 	res->start = of_translate_address(dn, r + 2);
 	res->end = res->start + of_read_number(r + 4, 2) - 1;
 	res->flags = (IORESOURCE_MEM | IORESOURCE_MEM_64 | IORESOURCE_PREFETCH);

From 94973b24d61b03f99017f836d85e7d6739741bab Mon Sep 17 00:00:00 2001
From: Alistair Popple <alistair@popple.id.au>
Date: Thu, 17 Dec 2015 13:43:11 +1100
Subject: [PATCH 098/149] Revert "powerpc/pci: Remove unused struct
 pci_dn.pcidev field"

This commit removed the pcidev field from struct pci_dn as it was no
longer in use by the kernel. However to support finding the
association of Nvlink devices to GPU devices from the device-tree this
field is required.

This reverts commit 250c7b277c65 ("powerpc/pci: Remove unused struct
pci_dn.pcidev field").

Signed-off-by: Alistair Popple <alistair@popple.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/pci-bridge.h     | 1 +
 arch/powerpc/platforms/powernv/pci-ioda.c | 1 +
 2 files changed, 2 insertions(+)

diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index 37fc53587bb4..54843ca5fa2b 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -205,6 +205,7 @@ struct pci_dn {
 
 	int	pci_ext_config_space;	/* for pci devices */
 
+	struct	pci_dev *pcidev;	/* back-pointer to the pci device */
 #ifdef CONFIG_EEH
 	struct eeh_dev *edev;		/* eeh device */
 #endif
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 272f6566d790..258ece5c0a8f 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1008,6 +1008,7 @@ static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
 				pci_name(dev));
 			continue;
 		}
+		pdn->pcidev = dev;
 		pdn->pe_number = pe->pe_number;
 		pe->dma_weight += pnv_ioda_dma_weight(dev);
 		if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)

From a84bf321401ab206baafbbfd3bfad485a1a2c3b4 Mon Sep 17 00:00:00 2001
From: Alistair Popple <alistair@popple.id.au>
Date: Thu, 17 Dec 2015 13:43:12 +1100
Subject: [PATCH 099/149] powerpc: Add __raw_rm_writeq() function

Move __raw_rm_writeq() from platforms/powernv/pci-ioda.c to
include/asm/io.h so that it can be used by other code.

Signed-off-by: Alistair Popple <alistair@popple.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/io.h             | 11 +++++++++++
 arch/powerpc/platforms/powernv/pci-ioda.c | 10 ----------
 2 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h
index 5879fde56f3c..6c1297ec374c 100644
--- a/arch/powerpc/include/asm/io.h
+++ b/arch/powerpc/include/asm/io.h
@@ -385,6 +385,17 @@ static inline void __raw_writeq(unsigned long v, volatile void __iomem *addr)
 {
 	*(volatile unsigned long __force *)PCI_FIX_ADDR(addr) = v;
 }
+
+/*
+ * Real mode version of the above. stdcix is only supposed to be used
+ * in hypervisor real mode as per the architecture spec.
+ */
+static inline void __raw_rm_writeq(u64 val, volatile void __iomem *paddr)
+{
+	__asm__ __volatile__("stdcix %0,0,%1"
+		: : "r" (val), "r" (paddr) : "memory");
+}
+
 #endif /* __powerpc64__ */
 
 /*
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 258ece5c0a8f..7a3a30ee6468 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -116,16 +116,6 @@ static int __init iommu_setup(char *str)
 }
 early_param("iommu", iommu_setup);
 
-/*
- * stdcix is only supposed to be used in hypervisor real mode as per
- * the architecture spec
- */
-static inline void __raw_rm_writeq(u64 val, volatile void __iomem *paddr)
-{
-	__asm__ __volatile__("stdcix %0,0,%1"
-		: : "r" (val), "r" (paddr) : "memory");
-}
-
 static inline bool pnv_pci_is_mem_pref_64(unsigned long flags)
 {
 	return ((flags & (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH)) ==

From 5d2aa710e697244f5504125e4aa6e2cfcf6c4791 Mon Sep 17 00:00:00 2001
From: Alistair Popple <alistair@popple.id.au>
Date: Thu, 17 Dec 2015 13:43:13 +1100
Subject: [PATCH 100/149] powerpc/powernv: Add support for Nvlink NPUs

NVLink is a high speed interconnect that is used in conjunction with a
PCI-E connection to create an interface between CPU and GPU that
provides very high data bandwidth. A PCI-E connection to a GPU is used
as the control path to initiate and report status of large data
transfers sent via the NVLink.

On IBM Power systems the NVLink processing unit (NPU) is similar to
the existing PHB3. This patch adds support for a new NPU PHB type. DMA
operations on the NPU are not supported as this patch sets the TCE
translation tables to be the same as the related GPU PCIe device for
each NVLink. Therefore all DMA operations are setup and controlled via
the PCIe device.

EEH is not presently supported for the NPU devices, although it may be
added in future.

Signed-off-by: Alistair Popple <alistair@popple.id.au>
Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/pci.h            |   4 +
 arch/powerpc/platforms/powernv/Makefile   |   2 +-
 arch/powerpc/platforms/powernv/npu-dma.c  | 348 ++++++++++++++++++++++
 arch/powerpc/platforms/powernv/pci-ioda.c | 138 ++++++++-
 arch/powerpc/platforms/powernv/pci.c      |   4 +
 arch/powerpc/platforms/powernv/pci.h      |  19 ++
 6 files changed, 502 insertions(+), 13 deletions(-)
 create mode 100644 arch/powerpc/platforms/powernv/npu-dma.c

diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h
index 3453bd8dc18f..6f8065a7d487 100644
--- a/arch/powerpc/include/asm/pci.h
+++ b/arch/powerpc/include/asm/pci.h
@@ -149,4 +149,8 @@ extern void pcibios_setup_phb_io_space(struct pci_controller *hose);
 extern void pcibios_scan_phb(struct pci_controller *hose);
 
 #endif	/* __KERNEL__ */
+
+extern struct pci_dev *pnv_pci_get_gpu_dev(struct pci_dev *npdev);
+extern struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev *gpdev, int index);
+
 #endif /* __ASM_POWERPC_PCI_H */
diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
index 1c8cdb6250e7..ee774e8a4837 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -4,7 +4,7 @@ obj-y			+= rng.o opal-elog.o opal-dump.o opal-sysparam.o opal-sensor.o
 obj-y			+= opal-msglog.o opal-hmi.o opal-power.o opal-irqchip.o
 
 obj-$(CONFIG_SMP)	+= smp.o subcore.o subcore-asm.o
-obj-$(CONFIG_PCI)	+= pci.o pci-p5ioc2.o pci-ioda.o
+obj-$(CONFIG_PCI)	+= pci.o pci-p5ioc2.o pci-ioda.o npu-dma.o
 obj-$(CONFIG_EEH)	+= eeh-powernv.o
 obj-$(CONFIG_PPC_SCOM)	+= opal-xscom.o
 obj-$(CONFIG_MEMORY_FAILURE)	+= opal-memory-errors.o
diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
new file mode 100644
index 000000000000..e85aa900f5c0
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -0,0 +1,348 @@
+/*
+ * This file implements the DMA operations for NVLink devices. The NPU
+ * devices all point to the same iommu table as the parent PCI device.
+ *
+ * Copyright Alistair Popple, IBM Corporation 2015.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+
+#include <linux/export.h>
+#include <linux/pci.h>
+#include <linux/memblock.h>
+
+#include <asm/iommu.h>
+#include <asm/pnv-pci.h>
+#include <asm/msi_bitmap.h>
+#include <asm/opal.h>
+
+#include "powernv.h"
+#include "pci.h"
+
+/*
+ * Other types of TCE cache invalidation are not functional in the
+ * hardware.
+ */
+#define TCE_KILL_INVAL_ALL PPC_BIT(0)
+
+static struct pci_dev *get_pci_dev(struct device_node *dn)
+{
+	return PCI_DN(dn)->pcidev;
+}
+
+/* Given a NPU device get the associated PCI device. */
+struct pci_dev *pnv_pci_get_gpu_dev(struct pci_dev *npdev)
+{
+	struct device_node *dn;
+	struct pci_dev *gpdev;
+
+	/* Get associated PCI device */
+	dn = of_parse_phandle(npdev->dev.of_node, "ibm,gpu", 0);
+	if (!dn)
+		return NULL;
+
+	gpdev = get_pci_dev(dn);
+	of_node_put(dn);
+
+	return gpdev;
+}
+EXPORT_SYMBOL(pnv_pci_get_gpu_dev);
+
+/* Given the real PCI device get a linked NPU device. */
+struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev *gpdev, int index)
+{
+	struct device_node *dn;
+	struct pci_dev *npdev;
+
+	/* Get associated NPU device */
+	dn = of_parse_phandle(gpdev->dev.of_node, "ibm,npu", index);
+	if (!dn)
+		return NULL;
+
+	npdev = get_pci_dev(dn);
+	of_node_put(dn);
+
+	return npdev;
+}
+EXPORT_SYMBOL(pnv_pci_get_npu_dev);
+
+#define NPU_DMA_OP_UNSUPPORTED()					\
+	dev_err_once(dev, "%s operation unsupported for NVLink devices\n", \
+		__func__)
+
+static void *dma_npu_alloc(struct device *dev, size_t size,
+			   dma_addr_t *dma_handle, gfp_t flag,
+			   struct dma_attrs *attrs)
+{
+	NPU_DMA_OP_UNSUPPORTED();
+	return NULL;
+}
+
+static void dma_npu_free(struct device *dev, size_t size,
+			 void *vaddr, dma_addr_t dma_handle,
+			 struct dma_attrs *attrs)
+{
+	NPU_DMA_OP_UNSUPPORTED();
+}
+
+static dma_addr_t dma_npu_map_page(struct device *dev, struct page *page,
+				   unsigned long offset, size_t size,
+				   enum dma_data_direction direction,
+				   struct dma_attrs *attrs)
+{
+	NPU_DMA_OP_UNSUPPORTED();
+	return 0;
+}
+
+static int dma_npu_map_sg(struct device *dev, struct scatterlist *sglist,
+			  int nelems, enum dma_data_direction direction,
+			  struct dma_attrs *attrs)
+{
+	NPU_DMA_OP_UNSUPPORTED();
+	return 0;
+}
+
+static int dma_npu_dma_supported(struct device *dev, u64 mask)
+{
+	NPU_DMA_OP_UNSUPPORTED();
+	return 0;
+}
+
+static u64 dma_npu_get_required_mask(struct device *dev)
+{
+	NPU_DMA_OP_UNSUPPORTED();
+	return 0;
+}
+
+struct dma_map_ops dma_npu_ops = {
+	.map_page		= dma_npu_map_page,
+	.map_sg			= dma_npu_map_sg,
+	.alloc			= dma_npu_alloc,
+	.free			= dma_npu_free,
+	.dma_supported		= dma_npu_dma_supported,
+	.get_required_mask	= dma_npu_get_required_mask,
+};
+
+/*
+ * Returns the PE associated with the PCI device of the given
+ * NPU. Also returns the linked PCI device via gpdev if gpdev != NULL.
+ */
+static struct pnv_ioda_pe *get_gpu_pci_dev_and_pe(struct pnv_ioda_pe *npe,
+						  struct pci_dev **gpdev)
+{
+	struct pnv_phb *phb;
+	struct pci_controller *hose;
+	struct pci_dev *pdev;
+	struct pnv_ioda_pe *pe;
+	struct pci_dn *pdn;
+
+	if (npe->flags & PNV_IODA_PE_PEER) {
+		pe = npe->peers[0];
+		pdev = pe->pdev;
+	} else {
+		pdev = pnv_pci_get_gpu_dev(npe->pdev);
+		if (!pdev)
+			return NULL;
+
+		pdn = pci_get_pdn(pdev);
+		if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
+			return NULL;
+
+		hose = pci_bus_to_host(pdev->bus);
+		phb = hose->private_data;
+		pe = &phb->ioda.pe_array[pdn->pe_number];
+	}
+
+	if (gpdev)
+		*gpdev = pdev;
+
+	return pe;
+}
+
+void pnv_npu_tce_invalidate_entire(struct pnv_ioda_pe *npe)
+{
+	struct pnv_phb *phb = npe->phb;
+
+	if (WARN_ON(phb->type != PNV_PHB_NPU ||
+		    !phb->ioda.tce_inval_reg ||
+		    !(npe->flags & PNV_IODA_PE_DEV)))
+		return;
+
+	mb(); /* Ensure previous TCE table stores are visible */
+	__raw_writeq(cpu_to_be64(TCE_KILL_INVAL_ALL),
+		phb->ioda.tce_inval_reg);
+}
+
+void pnv_npu_tce_invalidate(struct pnv_ioda_pe *npe,
+				struct iommu_table *tbl,
+				unsigned long index,
+				unsigned long npages,
+				bool rm)
+{
+	struct pnv_phb *phb = npe->phb;
+
+	/* We can only invalidate the whole cache on NPU */
+	unsigned long val = TCE_KILL_INVAL_ALL;
+
+	if (WARN_ON(phb->type != PNV_PHB_NPU ||
+		    !phb->ioda.tce_inval_reg ||
+		    !(npe->flags & PNV_IODA_PE_DEV)))
+		return;
+
+	mb(); /* Ensure previous TCE table stores are visible */
+	if (rm)
+		__raw_rm_writeq(cpu_to_be64(val),
+		  (__be64 __iomem *) phb->ioda.tce_inval_reg_phys);
+	else
+		__raw_writeq(cpu_to_be64(val),
+			phb->ioda.tce_inval_reg);
+}
+
+void pnv_npu_init_dma_pe(struct pnv_ioda_pe *npe)
+{
+	struct pnv_ioda_pe *gpe;
+	struct pci_dev *gpdev;
+	int i, avail = -1;
+
+	if (!npe->pdev || !(npe->flags & PNV_IODA_PE_DEV))
+		return;
+
+	gpe = get_gpu_pci_dev_and_pe(npe, &gpdev);
+	if (!gpe)
+		return;
+
+	for (i = 0; i < PNV_IODA_MAX_PEER_PES; i++) {
+		/* Nothing to do if the PE is already connected. */
+		if (gpe->peers[i] == npe)
+			return;
+
+		if (!gpe->peers[i])
+			avail = i;
+	}
+
+	if (WARN_ON(avail < 0))
+		return;
+
+	gpe->peers[avail] = npe;
+	gpe->flags |= PNV_IODA_PE_PEER;
+
+	/*
+	 * We assume that the NPU devices only have a single peer PE
+	 * (the GPU PCIe device PE).
+	 */
+	npe->peers[0] = gpe;
+	npe->flags |= PNV_IODA_PE_PEER;
+}
+
+/*
+ * For the NPU we want to point the TCE table at the same table as the
+ * real PCI device.
+ */
+static void pnv_npu_disable_bypass(struct pnv_ioda_pe *npe)
+{
+	struct pnv_phb *phb = npe->phb;
+	struct pci_dev *gpdev;
+	struct pnv_ioda_pe *gpe;
+	void *addr;
+	unsigned int size;
+	int64_t rc;
+
+	/*
+	 * Find the associated PCI devices and get the dma window
+	 * information from there.
+	 */
+	if (!npe->pdev || !(npe->flags & PNV_IODA_PE_DEV))
+		return;
+
+	gpe = get_gpu_pci_dev_and_pe(npe, &gpdev);
+	if (!gpe)
+		return;
+
+	addr = (void *)gpe->table_group.tables[0]->it_base;
+	size = gpe->table_group.tables[0]->it_size << 3;
+	rc = opal_pci_map_pe_dma_window(phb->opal_id, npe->pe_number,
+					npe->pe_number, 1, __pa(addr),
+					size, 0x1000);
+	if (rc != OPAL_SUCCESS)
+		pr_warn("%s: Error %lld setting DMA window on PHB#%d-PE#%d\n",
+			__func__, rc, phb->hose->global_number, npe->pe_number);
+
+	/*
+	 * We don't initialise npu_pe->tce32_table as we always use
+	 * dma_npu_ops which are nops.
+	 */
+	set_dma_ops(&npe->pdev->dev, &dma_npu_ops);
+}
+
+/*
+ * Enable/disable bypass mode on the NPU. The NPU only supports one
+ * window per link, so bypass needs to be explicitly enabled or
+ * disabled. Unlike for a PHB3 bypass and non-bypass modes can't be
+ * active at the same time.
+ */
+int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe, bool enable)
+{
+	struct pnv_phb *phb = npe->phb;
+	int64_t rc = 0;
+
+	if (phb->type != PNV_PHB_NPU || !npe->pdev)
+		return -EINVAL;
+
+	if (enable) {
+		/* Enable the bypass window */
+		phys_addr_t top = memblock_end_of_DRAM();
+
+		npe->tce_bypass_base = 0;
+		top = roundup_pow_of_two(top);
+		dev_info(&npe->pdev->dev, "Enabling bypass for PE %d\n",
+			 npe->pe_number);
+		rc = opal_pci_map_pe_dma_window_real(phb->opal_id,
+					npe->pe_number, npe->pe_number,
+					npe->tce_bypass_base, top);
+	} else {
+		/*
+		 * Disable the bypass window by replacing it with the
+		 * TCE32 window.
+		 */
+		pnv_npu_disable_bypass(npe);
+	}
+
+	return rc;
+}
+
+int pnv_npu_dma_set_mask(struct pci_dev *npdev, u64 dma_mask)
+{
+	struct pci_controller *hose = pci_bus_to_host(npdev->bus);
+	struct pnv_phb *phb = hose->private_data;
+	struct pci_dn *pdn = pci_get_pdn(npdev);
+	struct pnv_ioda_pe *npe, *gpe;
+	struct pci_dev *gpdev;
+	uint64_t top;
+	bool bypass = false;
+
+	if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
+		return -ENXIO;
+
+	/* We only do bypass if it's enabled on the linked device */
+	npe = &phb->ioda.pe_array[pdn->pe_number];
+	gpe = get_gpu_pci_dev_and_pe(npe, &gpdev);
+	if (!gpe)
+		return -ENODEV;
+
+	if (gpe->tce_bypass_enabled) {
+		top = gpe->tce_bypass_base + memblock_end_of_DRAM() - 1;
+		bypass = (dma_mask >= top);
+	}
+
+	if (bypass)
+		dev_info(&npdev->dev, "Using 64-bit DMA iommu bypass\n");
+	else
+		dev_info(&npdev->dev, "Using 32-bit DMA via iommu\n");
+
+	pnv_npu_dma_set_bypass(npe, bypass);
+	*npdev->dev.dma_mask = dma_mask;
+
+	return 0;
+}
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 7a3a30ee6468..323e1e58da93 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -771,8 +771,12 @@ static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
 		return -ENXIO;
 	}
 
-	/* Configure PELTV */
-	pnv_ioda_set_peltv(phb, pe, true);
+	/*
+	 * Configure PELTV. NPUs don't have a PELTV table so skip
+	 * configuration on them.
+	 */
+	if (phb->type != PNV_PHB_NPU)
+		pnv_ioda_set_peltv(phb, pe, true);
 
 	/* Setup reverse map */
 	for (rid = pe->rid; rid < rid_end; rid++)
@@ -915,7 +919,6 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
 }
 #endif /* CONFIG_PCI_IOV */
 
-#if 0
 static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
 {
 	struct pci_controller *hose = pci_bus_to_host(dev->bus);
@@ -932,11 +935,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
 	if (pdn->pe_number != IODA_INVALID_PE)
 		return NULL;
 
-	/* PE#0 has been pre-set */
-	if (dev->bus->number == 0)
-		pe_num = 0;
-	else
-		pe_num = pnv_ioda_alloc_pe(phb);
+	pe_num = pnv_ioda_alloc_pe(phb);
 	if (pe_num == IODA_INVALID_PE) {
 		pr_warning("%s: Not enough PE# available, disabling device\n",
 			   pci_name(dev));
@@ -954,6 +953,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
 	pci_dev_get(dev);
 	pdn->pcidev = dev;
 	pdn->pe_number = pe_num;
+	pe->flags = PNV_IODA_PE_DEV;
 	pe->pdev = dev;
 	pe->pbus = NULL;
 	pe->tce32_seg = -1;
@@ -984,7 +984,6 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
 
 	return pe;
 }
-#endif /* Useful for SRIOV case */
 
 static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
 {
@@ -1075,6 +1074,18 @@ static void pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
 	pnv_ioda_link_pe_by_weight(phb, pe);
 }
 
+static void pnv_ioda_setup_dev_PEs(struct pci_bus *bus)
+{
+	struct pci_bus *child;
+	struct pci_dev *pdev;
+
+	list_for_each_entry(pdev, &bus->devices, bus_list)
+		pnv_ioda_setup_dev_PE(pdev);
+
+	list_for_each_entry(child, &bus->children, node)
+		pnv_ioda_setup_dev_PEs(child);
+}
+
 static void pnv_ioda_setup_PEs(struct pci_bus *bus)
 {
 	struct pci_dev *dev;
@@ -1111,7 +1122,15 @@ static void pnv_pci_ioda_setup_PEs(void)
 		if (phb->reserve_m64_pe)
 			phb->reserve_m64_pe(hose->bus, NULL, true);
 
-		pnv_ioda_setup_PEs(hose->bus);
+		/*
+		 * On NPU PHB, we expect separate PEs for individual PCI
+		 * functions. PCI bus dependent PEs are required for the
+		 * remaining types of PHBs.
+		 */
+		if (phb->type == PNV_PHB_NPU)
+			pnv_ioda_setup_dev_PEs(hose->bus);
+		else
+			pnv_ioda_setup_PEs(hose->bus);
 	}
 }
 
@@ -1570,6 +1589,8 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
 	struct pnv_ioda_pe *pe;
 	uint64_t top;
 	bool bypass = false;
+	struct pci_dev *linked_npu_dev;
+	int i;
 
 	if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
 		return -ENODEV;;
@@ -1588,6 +1609,15 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
 		set_dma_ops(&pdev->dev, &dma_iommu_ops);
 	}
 	*pdev->dev.dma_mask = dma_mask;
+
+	/* Update peer npu devices */
+	if (pe->flags & PNV_IODA_PE_PEER)
+		for (i = 0; pe->peers[i]; i++) {
+			linked_npu_dev = pe->peers[i]->pdev;
+			if (dma_get_mask(&linked_npu_dev->dev) != dma_mask)
+				dma_set_mask(&linked_npu_dev->dev, dma_mask);
+		}
+
 	return 0;
 }
 
@@ -1732,12 +1762,23 @@ static inline void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_ioda_pe *pe)
 	/* 01xb - invalidate TCEs that match the specified PE# */
 	unsigned long val = (0x4ull << 60) | (pe->pe_number & 0xFF);
 	struct pnv_phb *phb = pe->phb;
+	struct pnv_ioda_pe *npe;
+	int i;
 
 	if (!phb->ioda.tce_inval_reg)
 		return;
 
 	mb(); /* Ensure above stores are visible */
 	__raw_writeq(cpu_to_be64(val), phb->ioda.tce_inval_reg);
+
+	if (pe->flags & PNV_IODA_PE_PEER)
+		for (i = 0; i < PNV_IODA_MAX_PEER_PES; i++) {
+			npe = pe->peers[i];
+			if (!npe || npe->phb->type != PNV_PHB_NPU)
+				continue;
+
+			pnv_npu_tce_invalidate_entire(npe);
+		}
 }
 
 static void pnv_pci_ioda2_do_tce_invalidate(unsigned pe_number, bool rm,
@@ -1772,15 +1813,28 @@ static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
 	struct iommu_table_group_link *tgl;
 
 	list_for_each_entry_rcu(tgl, &tbl->it_group_list, next) {
+		struct pnv_ioda_pe *npe;
 		struct pnv_ioda_pe *pe = container_of(tgl->table_group,
 				struct pnv_ioda_pe, table_group);
 		__be64 __iomem *invalidate = rm ?
 			(__be64 __iomem *)pe->phb->ioda.tce_inval_reg_phys :
 			pe->phb->ioda.tce_inval_reg;
+		int i;
 
 		pnv_pci_ioda2_do_tce_invalidate(pe->pe_number, rm,
 			invalidate, tbl->it_page_shift,
 			index, npages);
+
+		if (pe->flags & PNV_IODA_PE_PEER)
+			/* Invalidate PEs using the same TCE table */
+			for (i = 0; i < PNV_IODA_MAX_PEER_PES; i++) {
+				npe = pe->peers[i];
+				if (!npe || npe->phb->type != PNV_PHB_NPU)
+					continue;
+
+				pnv_npu_tce_invalidate(npe, tbl, index,
+							npages, rm);
+			}
 	}
 }
 
@@ -2428,10 +2482,17 @@ static void pnv_ioda_setup_dma(struct pnv_phb *phb)
 			pe_info(pe, "DMA weight %d, assigned %d DMA32 segments\n",
 				pe->dma_weight, segs);
 			pnv_pci_ioda_setup_dma_pe(phb, pe, base, segs);
-		} else {
+		} else if (phb->type == PNV_PHB_IODA2) {
 			pe_info(pe, "Assign DMA32 space\n");
 			segs = 0;
 			pnv_pci_ioda2_setup_dma_pe(phb, pe);
+		} else if (phb->type == PNV_PHB_NPU) {
+			/*
+			 * We initialise the DMA space for an NPU PHB
+			 * after setup of the PHB is complete as we
+			 * point the NPU TVT to the the same location
+			 * as the PHB3 TVT.
+			 */
 		}
 
 		remaining -= segs;
@@ -2873,6 +2934,11 @@ static void pnv_pci_ioda_setup_seg(void)
 
 	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
 		phb = hose->private_data;
+
+		/* NPU PHB does not support IO or MMIO segmentation */
+		if (phb->type == PNV_PHB_NPU)
+			continue;
+
 		list_for_each_entry(pe, &phb->ioda.pe_list, list) {
 			pnv_ioda_setup_pe_seg(hose, pe);
 		}
@@ -2912,6 +2978,27 @@ static void pnv_pci_ioda_create_dbgfs(void)
 #endif /* CONFIG_DEBUG_FS */
 }
 
+static void pnv_npu_ioda_fixup(void)
+{
+	bool enable_bypass;
+	struct pci_controller *hose, *tmp;
+	struct pnv_phb *phb;
+	struct pnv_ioda_pe *pe;
+
+	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
+		phb = hose->private_data;
+		if (phb->type != PNV_PHB_NPU)
+			continue;
+
+		list_for_each_entry(pe, &phb->ioda.pe_dma_list, dma_link) {
+			enable_bypass = dma_get_mask(&pe->pdev->dev) ==
+				DMA_BIT_MASK(64);
+			pnv_npu_init_dma_pe(pe);
+			pnv_npu_dma_set_bypass(pe, enable_bypass);
+		}
+	}
+}
+
 static void pnv_pci_ioda_fixup(void)
 {
 	pnv_pci_ioda_setup_PEs();
@@ -2924,6 +3011,9 @@ static void pnv_pci_ioda_fixup(void)
 	eeh_init();
 	eeh_addr_cache_build();
 #endif
+
+	/* Link NPU IODA tables to their PCI devices. */
+	pnv_npu_ioda_fixup();
 }
 
 /*
@@ -3038,6 +3128,19 @@ static const struct pci_controller_ops pnv_pci_ioda_controller_ops = {
        .shutdown = pnv_pci_ioda_shutdown,
 };
 
+static const struct pci_controller_ops pnv_npu_ioda_controller_ops = {
+	.dma_dev_setup = pnv_pci_dma_dev_setup,
+#ifdef CONFIG_PCI_MSI
+	.setup_msi_irqs = pnv_setup_msi_irqs,
+	.teardown_msi_irqs = pnv_teardown_msi_irqs,
+#endif
+	.enable_device_hook = pnv_pci_enable_device_hook,
+	.window_alignment = pnv_pci_window_alignment,
+	.reset_secondary_bus = pnv_pci_reset_secondary_bus,
+	.dma_set_mask = pnv_npu_dma_set_mask,
+	.shutdown = pnv_pci_ioda_shutdown,
+};
+
 static void __init pnv_pci_init_ioda_phb(struct device_node *np,
 					 u64 hub_id, int ioda_type)
 {
@@ -3093,6 +3196,8 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
 		phb->model = PNV_PHB_MODEL_P7IOC;
 	else if (of_device_is_compatible(np, "ibm,power8-pciex"))
 		phb->model = PNV_PHB_MODEL_PHB3;
+	else if (of_device_is_compatible(np, "ibm,power8-npu-pciex"))
+		phb->model = PNV_PHB_MODEL_NPU;
 	else
 		phb->model = PNV_PHB_MODEL_UNKNOWN;
 
@@ -3193,7 +3298,11 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
 	 * the child P2P bridges) can form individual PE.
 	 */
 	ppc_md.pcibios_fixup = pnv_pci_ioda_fixup;
-	hose->controller_ops = pnv_pci_ioda_controller_ops;
+
+	if (phb->type == PNV_PHB_NPU)
+		hose->controller_ops = pnv_npu_ioda_controller_ops;
+	else
+		hose->controller_ops = pnv_pci_ioda_controller_ops;
 
 #ifdef CONFIG_PCI_IOV
 	ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_iov_resources;
@@ -3228,6 +3337,11 @@ void __init pnv_pci_init_ioda2_phb(struct device_node *np)
 	pnv_pci_init_ioda_phb(np, 0, PNV_PHB_IODA2);
 }
 
+void __init pnv_pci_init_npu_phb(struct device_node *np)
+{
+	pnv_pci_init_ioda_phb(np, 0, PNV_PHB_NPU);
+}
+
 void __init pnv_pci_init_ioda_hub(struct device_node *np)
 {
 	struct device_node *phbn;
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index f2dd77234240..ff4e42d9d259 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -807,6 +807,10 @@ void __init pnv_pci_init(void)
 	for_each_compatible_node(np, NULL, "ibm,ioda2-phb")
 		pnv_pci_init_ioda2_phb(np);
 
+	/* Look for NPU PHBs */
+	for_each_compatible_node(np, NULL, "ibm,ioda2-npu-phb")
+		pnv_pci_init_npu_phb(np);
+
 	/* Setup the linkage between OF nodes and PHBs */
 	pci_devs_phb_init();
 
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index c8ff50e90766..7f56313e8d72 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -7,6 +7,7 @@ enum pnv_phb_type {
 	PNV_PHB_P5IOC2	= 0,
 	PNV_PHB_IODA1	= 1,
 	PNV_PHB_IODA2	= 2,
+	PNV_PHB_NPU	= 3,
 };
 
 /* Precise PHB model for error management */
@@ -15,6 +16,7 @@ enum pnv_phb_model {
 	PNV_PHB_MODEL_P5IOC2,
 	PNV_PHB_MODEL_P7IOC,
 	PNV_PHB_MODEL_PHB3,
+	PNV_PHB_MODEL_NPU,
 };
 
 #define PNV_PCI_DIAG_BUF_SIZE	8192
@@ -24,6 +26,7 @@ enum pnv_phb_model {
 #define PNV_IODA_PE_MASTER	(1 << 3)	/* Master PE in compound case	*/
 #define PNV_IODA_PE_SLAVE	(1 << 4)	/* Slave PE in compound case	*/
 #define PNV_IODA_PE_VF		(1 << 5)	/* PE for one VF 		*/
+#define PNV_IODA_PE_PEER	(1 << 6)	/* PE has peers			*/
 
 /* Data associated with a PE, including IOMMU tracking etc.. */
 struct pnv_phb;
@@ -31,6 +34,9 @@ struct pnv_ioda_pe {
 	unsigned long		flags;
 	struct pnv_phb		*phb;
 
+#define PNV_IODA_MAX_PEER_PES	8
+	struct pnv_ioda_pe	*peers[PNV_IODA_MAX_PEER_PES];
+
 	/* A PE can be associated with a single device or an
 	 * entire bus (& children). In the former case, pdev
 	 * is populated, in the later case, pbus is.
@@ -229,6 +235,7 @@ extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
 extern void pnv_pci_init_p5ioc2_hub(struct device_node *np);
 extern void pnv_pci_init_ioda_hub(struct device_node *np);
 extern void pnv_pci_init_ioda2_phb(struct device_node *np);
+extern void pnv_pci_init_npu_phb(struct device_node *np);
 extern void pnv_pci_ioda_tce_invalidate(struct iommu_table *tbl,
 					__be64 *startp, __be64 *endp, bool rm);
 extern void pnv_pci_reset_secondary_bus(struct pci_dev *dev);
@@ -238,4 +245,16 @@ extern void pnv_pci_dma_dev_setup(struct pci_dev *pdev);
 extern int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type);
 extern void pnv_teardown_msi_irqs(struct pci_dev *pdev);
 
+/* Nvlink functions */
+extern void pnv_npu_tce_invalidate_entire(struct pnv_ioda_pe *npe);
+extern void pnv_npu_tce_invalidate(struct pnv_ioda_pe *npe,
+				       struct iommu_table *tbl,
+				       unsigned long index,
+				       unsigned long npages,
+				       bool rm);
+extern void pnv_npu_init_dma_pe(struct pnv_ioda_pe *npe);
+extern void pnv_npu_setup_dma_pe(struct pnv_ioda_pe *npe);
+extern int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe, bool enabled);
+extern int pnv_npu_dma_set_mask(struct pci_dev *npdev, u64 dma_mask);
+
 #endif /* __POWERNV_PCI_H */

From 4450022b4952ce67d2f3006b4c38e12a0f38cd77 Mon Sep 17 00:00:00 2001
From: Alistair Popple <alistair@popple.id.au>
Date: Mon, 14 Dec 2015 14:31:24 +1100
Subject: [PATCH 101/149] powerpc/476fpe: Add support for kexec

PPC476FPE has a different PVR from previous PPC476 processors. The
kexec code checks the PVR in order to correctly set up the MMU. When
the initial support for 476FPE processors was added, the corresponding
change in the kexec code was missed. This patch simply adds the check
and solves the following bug on kexec:

kexec: Starting new kernel
Bye!
Unable to handle kernel paging request for instruction fetch
Faulting instruction address: 0xee9a50f8
cpu 0x0: Vector: 400 (Instruction Access) at [ee9d7d20]
    pc: ee9a50f8
    lr: ee9a50e4
    sp: ee9d7dd0
    msr: 21020
    current = 0xee40f000
    pid   = 960, comm = kexec
enter ? for help
[link register   ] ee9a50e4
[ee9d7dd0] c0013748 default_machine_kexec+0x58/0x70 (unreliable)
[ee9d7df0] c0012f04 machine_kexec+0x34/0x40
[ee9d7e00] c00aa1ec kernel_kexec+0x9c/0xb0
[ee9d7e20] c005d704 SyS_reboot+0x1f4/0x220
[ee9d7f40] c000db68 ret_from_syscall+0x0/0x3c

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/misc_32.S | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S
index ed3ab509faca..be8edd67f05b 100644
--- a/arch/powerpc/kernel/misc_32.S
+++ b/arch/powerpc/kernel/misc_32.S
@@ -743,6 +743,8 @@ relocate_new_kernel:
 	/* Check for 47x cores */
 	mfspr	r3,SPRN_PVR
 	srwi	r3,r3,16
+	cmplwi	cr0,r3,PVR_476FPE@h
+	beq	setup_map_47x
 	cmplwi	cr0,r3,PVR_476@h
 	beq	setup_map_47x
 	cmplwi	cr0,r3,PVR_476_ISS@h

From 1f859adb9253c201079962582253236e9b2cc3ce Mon Sep 17 00:00:00 2001
From: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Date: Fri, 23 Oct 2015 12:45:57 -0500
Subject: [PATCH 102/149] powerpc/pseries: Verify CPU doesn't exist before
 adding

When DLPAR adding a CPU we should verify that the CPU does not already
exist. Failure to do so can generate a kernel oops:

[    9.465585] kernel BUG at arch/powerpc/platforms/pseries/dlpar.c:382!
[    9.465796] Oops: Exception in kernel mode, sig: 5 [#1]

This oops can be generated by causing a probe to be performed on a cpu
by writing to the sysfs cpu probe file (/sys/devices/system/cpu/probe).
This patch adds a check for the existence of the cpu prior to probing it,
so userspace doing the wrong thing won't trigger a BUG_ON().

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/pseries/dlpar.c | 43 +++++++++++++++++++++++---
 1 file changed, 39 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c
index f244dcb4f2cf..fe6320db9255 100644
--- a/arch/powerpc/platforms/pseries/dlpar.c
+++ b/arch/powerpc/platforms/pseries/dlpar.c
@@ -381,6 +381,32 @@ out:
 
 }
 
+static bool dlpar_cpu_exists(struct device_node *parent, u32 drc_index)
+{
+	struct device_node *child = NULL;
+	u32 my_drc_index;
+	bool found;
+	int rc;
+
+	/* Assume cpu doesn't exist */
+	found = false;
+
+	for_each_child_of_node(parent, child) {
+		rc = of_property_read_u32(child, "ibm,my-drc-index",
+					  &my_drc_index);
+		if (rc)
+			continue;
+
+		if (my_drc_index == drc_index) {
+			of_node_put(child);
+			found = true;
+			break;
+		}
+	}
+
+	return found;
+}
+
 static ssize_t dlpar_cpu_probe(const char *buf, size_t count)
 {
 	struct device_node *dn, *parent;
@@ -391,14 +417,23 @@ static ssize_t dlpar_cpu_probe(const char *buf, size_t count)
 	if (rc)
 		return -EINVAL;
 
-	rc = dlpar_acquire_drc(drc_index);
-	if (rc)
-		return -EINVAL;
-
 	parent = of_find_node_by_path("/cpus");
 	if (!parent)
 		return -ENODEV;
 
+	if (dlpar_cpu_exists(parent, drc_index)) {
+		of_node_put(parent);
+		printk(KERN_WARNING "CPU with drc index %x already exists\n",
+		       drc_index);
+		return -EINVAL;
+	}
+
+	rc = dlpar_acquire_drc(drc_index);
+	if (rc) {
+		of_node_put(parent);
+		return -EINVAL;
+	}
+
 	dn = dlpar_configure_connector(cpu_to_be32(drc_index), parent);
 	of_node_put(parent);
 	if (!dn) {

From 183deeea5871a6f750ec64ab1cff85fb089d38df Mon Sep 17 00:00:00 2001
From: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Date: Wed, 16 Dec 2015 14:50:21 -0600
Subject: [PATCH 103/149] powerpc/pseries: Consolidate CPU hotplug code to
 hotplug-cpu.c

No functional changes; this patch is simply a move of the cpu hotplug
code from pseries/dlpar.c to pseries/hotplug-cpu.c. This is part of an
effort to consolidate all of the cpu hotplug code in a common place.

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/pseries/dlpar.c       | 226 +------------------
 arch/powerpc/platforms/pseries/hotplug-cpu.c | 218 ++++++++++++++++++
 2 files changed, 219 insertions(+), 225 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c
index fe6320db9255..438fdbd7e40e 100644
--- a/arch/powerpc/platforms/pseries/dlpar.c
+++ b/arch/powerpc/platforms/pseries/dlpar.c
@@ -20,7 +20,6 @@
 #include <linux/of.h>
 
 #include "of_helpers.h"
-#include "offline_states.h"
 #include "pseries.h"
 
 #include <asm/prom.h>
@@ -338,220 +337,6 @@ int dlpar_release_drc(u32 drc_index)
 	return 0;
 }
 
-#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
-
-static int dlpar_online_cpu(struct device_node *dn)
-{
-	int rc = 0;
-	unsigned int cpu;
-	int len, nthreads, i;
-	const __be32 *intserv;
-	u32 thread;
-
-	intserv = of_get_property(dn, "ibm,ppc-interrupt-server#s", &len);
-	if (!intserv)
-		return -EINVAL;
-
-	nthreads = len / sizeof(u32);
-
-	cpu_maps_update_begin();
-	for (i = 0; i < nthreads; i++) {
-		thread = be32_to_cpu(intserv[i]);
-		for_each_present_cpu(cpu) {
-			if (get_hard_smp_processor_id(cpu) != thread)
-				continue;
-			BUG_ON(get_cpu_current_state(cpu)
-					!= CPU_STATE_OFFLINE);
-			cpu_maps_update_done();
-			rc = device_online(get_cpu_device(cpu));
-			if (rc)
-				goto out;
-			cpu_maps_update_begin();
-
-			break;
-		}
-		if (cpu == num_possible_cpus())
-			printk(KERN_WARNING "Could not find cpu to online "
-			       "with physical id 0x%x\n", thread);
-	}
-	cpu_maps_update_done();
-
-out:
-	return rc;
-
-}
-
-static bool dlpar_cpu_exists(struct device_node *parent, u32 drc_index)
-{
-	struct device_node *child = NULL;
-	u32 my_drc_index;
-	bool found;
-	int rc;
-
-	/* Assume cpu doesn't exist */
-	found = false;
-
-	for_each_child_of_node(parent, child) {
-		rc = of_property_read_u32(child, "ibm,my-drc-index",
-					  &my_drc_index);
-		if (rc)
-			continue;
-
-		if (my_drc_index == drc_index) {
-			of_node_put(child);
-			found = true;
-			break;
-		}
-	}
-
-	return found;
-}
-
-static ssize_t dlpar_cpu_probe(const char *buf, size_t count)
-{
-	struct device_node *dn, *parent;
-	u32 drc_index;
-	int rc;
-
-	rc = kstrtou32(buf, 0, &drc_index);
-	if (rc)
-		return -EINVAL;
-
-	parent = of_find_node_by_path("/cpus");
-	if (!parent)
-		return -ENODEV;
-
-	if (dlpar_cpu_exists(parent, drc_index)) {
-		of_node_put(parent);
-		printk(KERN_WARNING "CPU with drc index %x already exists\n",
-		       drc_index);
-		return -EINVAL;
-	}
-
-	rc = dlpar_acquire_drc(drc_index);
-	if (rc) {
-		of_node_put(parent);
-		return -EINVAL;
-	}
-
-	dn = dlpar_configure_connector(cpu_to_be32(drc_index), parent);
-	of_node_put(parent);
-	if (!dn) {
-		dlpar_release_drc(drc_index);
-		return -EINVAL;
-	}
-
-	rc = dlpar_attach_node(dn);
-	if (rc) {
-		dlpar_release_drc(drc_index);
-		dlpar_free_cc_nodes(dn);
-		return rc;
-	}
-
-	rc = dlpar_online_cpu(dn);
-	if (rc)
-		return rc;
-
-	return count;
-}
-
-static int dlpar_offline_cpu(struct device_node *dn)
-{
-	int rc = 0;
-	unsigned int cpu;
-	int len, nthreads, i;
-	const __be32 *intserv;
-	u32 thread;
-
-	intserv = of_get_property(dn, "ibm,ppc-interrupt-server#s", &len);
-	if (!intserv)
-		return -EINVAL;
-
-	nthreads = len / sizeof(u32);
-
-	cpu_maps_update_begin();
-	for (i = 0; i < nthreads; i++) {
-		thread = be32_to_cpu(intserv[i]);
-		for_each_present_cpu(cpu) {
-			if (get_hard_smp_processor_id(cpu) != thread)
-				continue;
-
-			if (get_cpu_current_state(cpu) == CPU_STATE_OFFLINE)
-				break;
-
-			if (get_cpu_current_state(cpu) == CPU_STATE_ONLINE) {
-				set_preferred_offline_state(cpu, CPU_STATE_OFFLINE);
-				cpu_maps_update_done();
-				rc = device_offline(get_cpu_device(cpu));
-				if (rc)
-					goto out;
-				cpu_maps_update_begin();
-				break;
-
-			}
-
-			/*
-			 * The cpu is in CPU_STATE_INACTIVE.
-			 * Upgrade it's state to CPU_STATE_OFFLINE.
-			 */
-			set_preferred_offline_state(cpu, CPU_STATE_OFFLINE);
-			BUG_ON(plpar_hcall_norets(H_PROD, thread)
-								!= H_SUCCESS);
-			__cpu_die(cpu);
-			break;
-		}
-		if (cpu == num_possible_cpus())
-			printk(KERN_WARNING "Could not find cpu to offline "
-			       "with physical id 0x%x\n", thread);
-	}
-	cpu_maps_update_done();
-
-out:
-	return rc;
-
-}
-
-static ssize_t dlpar_cpu_release(const char *buf, size_t count)
-{
-	struct device_node *dn;
-	u32 drc_index;
-	int rc;
-
-	dn = of_find_node_by_path(buf);
-	if (!dn)
-		return -EINVAL;
-
-	rc = of_property_read_u32(dn, "ibm,my-drc-index", &drc_index);
-	if (rc) {
-		of_node_put(dn);
-		return -EINVAL;
-	}
-
-	rc = dlpar_offline_cpu(dn);
-	if (rc) {
-		of_node_put(dn);
-		return -EINVAL;
-	}
-
-	rc = dlpar_release_drc(drc_index);
-	if (rc) {
-		of_node_put(dn);
-		return rc;
-	}
-
-	rc = dlpar_detach_node(dn);
-	if (rc) {
-		dlpar_acquire_drc(drc_index);
-		return rc;
-	}
-
-	of_node_put(dn);
-
-	return count;
-}
-
-#endif /* CONFIG_ARCH_CPU_PROBE_RELEASE */
-
 static int handle_dlpar_errorlog(struct pseries_hp_errorlog *hp_elog)
 {
 	int rc;
@@ -659,16 +444,7 @@ static CLASS_ATTR(dlpar, S_IWUSR, NULL, dlpar_store);
 
 static int __init pseries_dlpar_init(void)
 {
-	int rc;
-
-#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
-	ppc_md.cpu_probe = dlpar_cpu_probe;
-	ppc_md.cpu_release = dlpar_cpu_release;
-#endif /* CONFIG_ARCH_CPU_PROBE_RELEASE */
-
-	rc = sysfs_create_file(kernel_kobj, &class_attr_dlpar.attr);
-
-	return rc;
+	return sysfs_create_file(kernel_kobj, &class_attr_dlpar.attr);
 }
 machine_device_initcall(pseries, pseries_dlpar_init);
 
diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c
index 86d2ecacb237..66d8c2c64aa9 100644
--- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
+++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
@@ -32,6 +32,7 @@
 #include <asm/xics.h>
 #include <asm/plpar_wrappers.h>
 
+#include "pseries.h"
 #include "offline_states.h"
 
 /* This version can't take the spinlock, because it never returns */
@@ -334,6 +335,218 @@ static void pseries_remove_processor(struct device_node *np)
 	cpu_maps_update_done();
 }
 
+#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
+
+static int dlpar_online_cpu(struct device_node *dn)
+{
+	int rc = 0;
+	unsigned int cpu;
+	int len, nthreads, i;
+	const __be32 *intserv;
+	u32 thread;
+
+	intserv = of_get_property(dn, "ibm,ppc-interrupt-server#s", &len);
+	if (!intserv)
+		return -EINVAL;
+
+	nthreads = len / sizeof(u32);
+
+	cpu_maps_update_begin();
+	for (i = 0; i < nthreads; i++) {
+		thread = be32_to_cpu(intserv[i]);
+		for_each_present_cpu(cpu) {
+			if (get_hard_smp_processor_id(cpu) != thread)
+				continue;
+			BUG_ON(get_cpu_current_state(cpu)
+					!= CPU_STATE_OFFLINE);
+			cpu_maps_update_done();
+			rc = device_online(get_cpu_device(cpu));
+			if (rc)
+				goto out;
+			cpu_maps_update_begin();
+
+			break;
+		}
+		if (cpu == num_possible_cpus())
+			printk(KERN_WARNING "Could not find cpu to online "
+			       "with physical id 0x%x\n", thread);
+	}
+	cpu_maps_update_done();
+
+out:
+	return rc;
+
+}
+
+static bool dlpar_cpu_exists(struct device_node *parent, u32 drc_index)
+{
+	struct device_node *child = NULL;
+	u32 my_drc_index;
+	bool found;
+	int rc;
+
+	/* Assume cpu doesn't exist */
+	found = false;
+
+	for_each_child_of_node(parent, child) {
+		rc = of_property_read_u32(child, "ibm,my-drc-index",
+					  &my_drc_index);
+		if (rc)
+			continue;
+
+		if (my_drc_index == drc_index) {
+			of_node_put(child);
+			found = true;
+			break;
+		}
+	}
+
+	return found;
+}
+
+static ssize_t dlpar_cpu_probe(const char *buf, size_t count)
+{
+	struct device_node *dn, *parent;
+	u32 drc_index;
+	int rc;
+
+	rc = kstrtou32(buf, 0, &drc_index);
+	if (rc)
+		return -EINVAL;
+
+	parent = of_find_node_by_path("/cpus");
+	if (!parent)
+		return -ENODEV;
+
+	if (dlpar_cpu_exists(parent, drc_index)) {
+		of_node_put(parent);
+		printk(KERN_WARNING "CPU with drc index %x already exists\n",
+		       drc_index);
+		return -EINVAL;
+	}
+
+	rc = dlpar_acquire_drc(drc_index);
+	if (rc) {
+		of_node_put(parent);
+		return -EINVAL;
+	}
+
+	dn = dlpar_configure_connector(cpu_to_be32(drc_index), parent);
+	of_node_put(parent);
+	if (!dn)
+		return -EINVAL;
+
+	rc = dlpar_attach_node(dn);
+	if (rc) {
+		dlpar_release_drc(drc_index);
+		dlpar_free_cc_nodes(dn);
+		return rc;
+	}
+
+	rc = dlpar_online_cpu(dn);
+	if (rc)
+		return rc;
+
+	return count;
+}
+
+static int dlpar_offline_cpu(struct device_node *dn)
+{
+	int rc = 0;
+	unsigned int cpu;
+	int len, nthreads, i;
+	const __be32 *intserv;
+	u32 thread;
+
+	intserv = of_get_property(dn, "ibm,ppc-interrupt-server#s", &len);
+	if (!intserv)
+		return -EINVAL;
+
+	nthreads = len / sizeof(u32);
+
+	cpu_maps_update_begin();
+	for (i = 0; i < nthreads; i++) {
+		thread = be32_to_cpu(intserv[i]);
+		for_each_present_cpu(cpu) {
+			if (get_hard_smp_processor_id(cpu) != thread)
+				continue;
+
+			if (get_cpu_current_state(cpu) == CPU_STATE_OFFLINE)
+				break;
+
+			if (get_cpu_current_state(cpu) == CPU_STATE_ONLINE) {
+				set_preferred_offline_state(cpu,
+							    CPU_STATE_OFFLINE);
+				cpu_maps_update_done();
+				rc = device_offline(get_cpu_device(cpu));
+				if (rc)
+					goto out;
+				cpu_maps_update_begin();
+				break;
+
+			}
+
+			/*
+			 * The cpu is in CPU_STATE_INACTIVE.
+			 * Upgrade it's state to CPU_STATE_OFFLINE.
+			 */
+			set_preferred_offline_state(cpu, CPU_STATE_OFFLINE);
+			BUG_ON(plpar_hcall_norets(H_PROD, thread)
+								!= H_SUCCESS);
+			__cpu_die(cpu);
+			break;
+		}
+		if (cpu == num_possible_cpus())
+			printk(KERN_WARNING "Could not find cpu to offline with physical id 0x%x\n", thread);
+	}
+	cpu_maps_update_done();
+
+out:
+	return rc;
+
+}
+
+static ssize_t dlpar_cpu_release(const char *buf, size_t count)
+{
+	struct device_node *dn;
+	u32 drc_index;
+	int rc;
+
+	dn = of_find_node_by_path(buf);
+	if (!dn)
+		return -EINVAL;
+
+	rc = of_property_read_u32(dn, "ibm,my-drc-index", &drc_index);
+	if (rc) {
+		of_node_put(dn);
+		return -EINVAL;
+	}
+
+	rc = dlpar_offline_cpu(dn);
+	if (rc) {
+		of_node_put(dn);
+		return -EINVAL;
+	}
+
+	rc = dlpar_release_drc(drc_index);
+	if (rc) {
+		of_node_put(dn);
+		return rc;
+	}
+
+	rc = dlpar_detach_node(dn);
+	if (rc) {
+		dlpar_acquire_drc(drc_index);
+		return rc;
+	}
+
+	of_node_put(dn);
+
+	return count;
+}
+
+#endif /* CONFIG_ARCH_CPU_PROBE_RELEASE */
+
 static int pseries_smp_notifier(struct notifier_block *nb,
 				unsigned long action, void *data)
 {
@@ -380,6 +593,11 @@ static int __init pseries_cpu_hotplug_init(void)
 	int cpu;
 	int qcss_tok;
 
+#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
+	ppc_md.cpu_probe = dlpar_cpu_probe;
+	ppc_md.cpu_release = dlpar_cpu_release;
+#endif /* CONFIG_ARCH_CPU_PROBE_RELEASE */
+
 	for_each_node_by_name(np, "interrupt-controller") {
 		typep = of_get_property(np, "compatible", NULL);
 		if (strstr(typep, "open-pic")) {

From d98389f375329b7a37d0e9211a1216d9141d7a5f Mon Sep 17 00:00:00 2001
From: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Date: Wed, 16 Dec 2015 14:51:26 -0600
Subject: [PATCH 104/149] powerpc/pseries: Factor out common cpu hotplug code

Re-factor the cpu hotplug code to support doing cpu hotplug completely in
the kernel and using the existing sysfs probe/release interfaces. This
patch pulls out pieces of existing cpu hotplug code into common routines,
dlpar_cpu_add() and dlpar_cpu_remove(), to be used by both interfaces.
There are no functional changes introduced.

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/pseries/hotplug-cpu.c | 70 +++++++++++---------
 1 file changed, 39 insertions(+), 31 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c
index 66d8c2c64aa9..6fb28cf229e7 100644
--- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
+++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
@@ -335,8 +335,6 @@ static void pseries_remove_processor(struct device_node *np)
 	cpu_maps_update_done();
 }
 
-#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
-
 static int dlpar_online_cpu(struct device_node *dn)
 {
 	int rc = 0;
@@ -404,16 +402,11 @@ static bool dlpar_cpu_exists(struct device_node *parent, u32 drc_index)
 	return found;
 }
 
-static ssize_t dlpar_cpu_probe(const char *buf, size_t count)
+static ssize_t dlpar_cpu_add(u32 drc_index)
 {
 	struct device_node *dn, *parent;
-	u32 drc_index;
 	int rc;
 
-	rc = kstrtou32(buf, 0, &drc_index);
-	if (rc)
-		return -EINVAL;
-
 	parent = of_find_node_by_path("/cpus");
 	if (!parent)
 		return -ENODEV;
@@ -444,10 +437,7 @@ static ssize_t dlpar_cpu_probe(const char *buf, size_t count)
 	}
 
 	rc = dlpar_online_cpu(dn);
-	if (rc)
-		return rc;
-
-	return count;
+	return rc;
 }
 
 static int dlpar_offline_cpu(struct device_node *dn)
@@ -506,6 +496,41 @@ out:
 
 }
 
+static ssize_t dlpar_cpu_remove(struct device_node *dn, u32 drc_index)
+{
+	int rc;
+
+	rc = dlpar_offline_cpu(dn);
+	if (rc)
+		return -EINVAL;
+
+	rc = dlpar_release_drc(drc_index);
+	if (rc)
+		return rc;
+
+	rc = dlpar_detach_node(dn);
+	if (rc)
+		dlpar_acquire_drc(drc_index);
+
+	return rc;
+}
+
+#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
+
+static ssize_t dlpar_cpu_probe(const char *buf, size_t count)
+{
+	u32 drc_index;
+	int rc;
+
+	rc = kstrtou32(buf, 0, &drc_index);
+	if (rc)
+		return -EINVAL;
+
+	rc = dlpar_cpu_add(drc_index);
+
+	return rc ? rc : count;
+}
+
 static ssize_t dlpar_cpu_release(const char *buf, size_t count)
 {
 	struct device_node *dn;
@@ -522,27 +547,10 @@ static ssize_t dlpar_cpu_release(const char *buf, size_t count)
 		return -EINVAL;
 	}
 
-	rc = dlpar_offline_cpu(dn);
-	if (rc) {
-		of_node_put(dn);
-		return -EINVAL;
-	}
-
-	rc = dlpar_release_drc(drc_index);
-	if (rc) {
-		of_node_put(dn);
-		return rc;
-	}
-
-	rc = dlpar_detach_node(dn);
-	if (rc) {
-		dlpar_acquire_drc(drc_index);
-		return rc;
-	}
-
+	rc = dlpar_cpu_remove(dn, drc_index);
 	of_node_put(dn);
 
-	return count;
+	return rc ? rc : count;
 }
 
 #endif /* CONFIG_ARCH_CPU_PROBE_RELEASE */

From e666ae0b10aaa1c961c928558bafc28bc049ac87 Mon Sep 17 00:00:00 2001
From: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Date: Wed, 16 Dec 2015 14:52:39 -0600
Subject: [PATCH 105/149] powerpc/pseries: Update CPU hotplug error recovery

Update the cpu dlpar add/remove paths to do better error recovery when
a failure occurs during the add/remove operation.

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/pseries/hotplug-cpu.c | 76 ++++++++++++++++----
 1 file changed, 63 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c
index 6fb28cf229e7..a54aee982589 100644
--- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
+++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
@@ -18,6 +18,8 @@
  *      2 of the License, or (at your option) any later version.
  */
 
+#define pr_fmt(fmt)     "pseries-hotplug-cpu: " fmt
+
 #include <linux/kernel.h>
 #include <linux/interrupt.h>
 #include <linux/delay.h>
@@ -405,38 +407,67 @@ static bool dlpar_cpu_exists(struct device_node *parent, u32 drc_index)
 static ssize_t dlpar_cpu_add(u32 drc_index)
 {
 	struct device_node *dn, *parent;
-	int rc;
+	int rc, saved_rc;
+
+	pr_debug("Attempting to add CPU, drc index: %x\n", drc_index);
 
 	parent = of_find_node_by_path("/cpus");
-	if (!parent)
+	if (!parent) {
+		pr_warn("Failed to find CPU root node \"/cpus\"\n");
 		return -ENODEV;
+	}
 
 	if (dlpar_cpu_exists(parent, drc_index)) {
 		of_node_put(parent);
-		printk(KERN_WARNING "CPU with drc index %x already exists\n",
-		       drc_index);
+		pr_warn("CPU with drc index %x already exists\n", drc_index);
 		return -EINVAL;
 	}
 
 	rc = dlpar_acquire_drc(drc_index);
 	if (rc) {
+		pr_warn("Failed to acquire DRC, rc: %d, drc index: %x\n",
+			rc, drc_index);
 		of_node_put(parent);
 		return -EINVAL;
 	}
 
 	dn = dlpar_configure_connector(cpu_to_be32(drc_index), parent);
 	of_node_put(parent);
-	if (!dn)
+	if (!dn) {
+		pr_warn("Failed call to configure-connector, drc index: %x\n",
+			drc_index);
+		dlpar_release_drc(drc_index);
 		return -EINVAL;
+	}
 
 	rc = dlpar_attach_node(dn);
 	if (rc) {
-		dlpar_release_drc(drc_index);
-		dlpar_free_cc_nodes(dn);
-		return rc;
+		saved_rc = rc;
+		pr_warn("Failed to attach node %s, rc: %d, drc index: %x\n",
+			dn->name, rc, drc_index);
+
+		rc = dlpar_release_drc(drc_index);
+		if (!rc)
+			dlpar_free_cc_nodes(dn);
+
+		return saved_rc;
 	}
 
 	rc = dlpar_online_cpu(dn);
+	if (rc) {
+		saved_rc = rc;
+		pr_warn("Failed to online cpu %s, rc: %d, drc index: %x\n",
+			dn->name, rc, drc_index);
+
+		rc = dlpar_detach_node(dn);
+		if (!rc)
+			dlpar_release_drc(drc_index);
+
+		return saved_rc;
+	}
+
+	pr_debug("Successfully added CPU %s, drc index: %x\n", dn->name,
+		 drc_index);
 	return rc;
 }
 
@@ -500,19 +531,38 @@ static ssize_t dlpar_cpu_remove(struct device_node *dn, u32 drc_index)
 {
 	int rc;
 
+	pr_debug("Attempting to remove CPU %s, drc index: %x\n",
+		 dn->name, drc_index);
+
 	rc = dlpar_offline_cpu(dn);
-	if (rc)
+	if (rc) {
+		pr_warn("Failed to offline CPU %s, rc: %d\n", dn->name, rc);
 		return -EINVAL;
+	}
 
 	rc = dlpar_release_drc(drc_index);
-	if (rc)
+	if (rc) {
+		pr_warn("Failed to release drc (%x) for CPU %s, rc: %d\n",
+			drc_index, dn->name, rc);
+		dlpar_online_cpu(dn);
 		return rc;
+	}
 
 	rc = dlpar_detach_node(dn);
-	if (rc)
-		dlpar_acquire_drc(drc_index);
+	if (rc) {
+		int saved_rc = rc;
 
-	return rc;
+		pr_warn("Failed to detach CPU %s, rc: %d\n", dn->name, rc);
+
+		rc = dlpar_acquire_drc(drc_index);
+		if (!rc)
+			dlpar_online_cpu(dn);
+
+		return saved_rc;
+	}
+
+	pr_debug("Successfully removed CPU, drc index: %x\n", drc_index);
+	return 0;
 }
 
 #ifdef CONFIG_ARCH_CPU_PROBE_RELEASE

From ac71380071d19d4ac7cd5f9fe4168d7109902cd5 Mon Sep 17 00:00:00 2001
From: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Date: Wed, 16 Dec 2015 14:54:05 -0600
Subject: [PATCH 106/149] powerpc/pseries: Add CPU dlpar remove functionality

Add the ability to dlpar remove CPUs via hotplug rtas events, either by
specifying the drc-index of the CPU to remove or providing a count of cpus
to remove.

To remove multiple cpus in a single request we create a list of possible
DR (Dynamic Reconfiguration) cpus and their drc indexes that can be
removed.  We can then traverse the list, remove each cpu, and easily clean
up in case of failure.

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/pseries/hotplug-cpu.c | 147 +++++++++++++++++++
 arch/powerpc/platforms/pseries/pseries.h     |   9 ++
 2 files changed, 156 insertions(+)

diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c
index a54aee982589..86c7ae3db50e 100644
--- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
+++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
@@ -26,6 +26,7 @@
 #include <linux/sched.h>	/* for idle_task_exit */
 #include <linux/cpu.h>
 #include <linux/of.h>
+#include <linux/slab.h>
 #include <asm/prom.h>
 #include <asm/rtas.h>
 #include <asm/firmware.h>
@@ -565,6 +566,152 @@ static ssize_t dlpar_cpu_remove(struct device_node *dn, u32 drc_index)
 	return 0;
 }
 
+static struct device_node *cpu_drc_index_to_dn(u32 drc_index)
+{
+	struct device_node *dn;
+	u32 my_index;
+	int rc;
+
+	for_each_node_by_type(dn, "cpu") {
+		rc = of_property_read_u32(dn, "ibm,my-drc-index", &my_index);
+		if (rc)
+			continue;
+
+		if (my_index == drc_index)
+			break;
+	}
+
+	return dn;
+}
+
+static int dlpar_cpu_remove_by_index(u32 drc_index)
+{
+	struct device_node *dn;
+	int rc;
+
+	dn = cpu_drc_index_to_dn(drc_index);
+	if (!dn) {
+		pr_warn("Cannot find CPU (drc index %x) to remove\n",
+			drc_index);
+		return -ENODEV;
+	}
+
+	rc = dlpar_cpu_remove(dn, drc_index);
+	of_node_put(dn);
+	return rc;
+}
+
+static int find_dlpar_cpus_to_remove(u32 *cpu_drcs, int cpus_to_remove)
+{
+	struct device_node *dn;
+	int cpus_found = 0;
+	int rc;
+
+	/* We want to find cpus_to_remove + 1 CPUs to ensure we do not
+	 * remove the last CPU.
+	 */
+	for_each_node_by_type(dn, "cpu") {
+		cpus_found++;
+
+		if (cpus_found > cpus_to_remove) {
+			of_node_put(dn);
+			break;
+		}
+
+		/* Note that cpus_found is always 1 ahead of the index
+		 * into the cpu_drcs array, so we use cpus_found - 1
+		 */
+		rc = of_property_read_u32(dn, "ibm,my-drc-index",
+					  &cpu_drcs[cpus_found - 1]);
+		if (rc) {
+			pr_warn("Error occurred getting drc-index for %s\n",
+				dn->name);
+			of_node_put(dn);
+			return -1;
+		}
+	}
+
+	if (cpus_found < cpus_to_remove) {
+		pr_warn("Failed to find enough CPUs (%d of %d) to remove\n",
+			cpus_found, cpus_to_remove);
+	} else if (cpus_found == cpus_to_remove) {
+		pr_warn("Cannot remove all CPUs\n");
+	}
+
+	return cpus_found;
+}
+
+/* Remove @cpus_to_remove CPUs; on failure re-add those already removed. */
+static int dlpar_cpu_remove_by_count(u32 cpus_to_remove)
+{
+	u32 *cpu_drcs;
+	int cpus_found, cpus_removed = 0, i, rc;
+
+	pr_debug("Attempting to hot-remove %d CPUs\n", cpus_to_remove);
+
+	cpu_drcs = kcalloc(cpus_to_remove, sizeof(*cpu_drcs), GFP_KERNEL);
+	if (!cpu_drcs)
+		return -EINVAL;
+
+	cpus_found = find_dlpar_cpus_to_remove(cpu_drcs, cpus_to_remove);
+	/* test the sign first: compared with a u32, -1 would look huge */
+	if (cpus_found < 0 || (u32)cpus_found <= cpus_to_remove) {
+		kfree(cpu_drcs);
+		return -EINVAL;
+	}
+
+	for (i = 0; i < cpus_to_remove; i++) {
+		rc = dlpar_cpu_remove_by_index(cpu_drcs[i]);
+		if (rc)
+			break;
+
+		cpus_removed++;
+	}
+
+	if (cpus_removed != cpus_to_remove) {
+		pr_warn("CPU hot-remove failed, adding back removed CPUs\n");
+
+		for (i = 0; i < cpus_removed; i++)
+			dlpar_cpu_add(cpu_drcs[i]);
+
+		rc = -EINVAL;
+	} else {
+		rc = 0;
+	}
+
+	kfree(cpu_drcs);
+	return rc;
+}
+
+int dlpar_cpu(struct pseries_hp_errorlog *hp_elog)
+{
+	u32 count, drc_index;
+	int rc;
+
+	count = hp_elog->_drc_u.drc_count;
+	drc_index = hp_elog->_drc_u.drc_index;
+
+	lock_device_hotplug();
+
+	switch (hp_elog->action) {
+	case PSERIES_HP_ELOG_ACTION_REMOVE:
+		if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_COUNT)
+			rc = dlpar_cpu_remove_by_count(count);
+		else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_INDEX)
+			rc = dlpar_cpu_remove_by_index(drc_index);
+		else
+			rc = -EINVAL;
+		break;
+	default:
+		pr_err("Invalid action (%d) specified\n", hp_elog->action);
+		rc = -EINVAL;
+		break;
+	}
+
+	unlock_device_hotplug();
+	return rc;
+}
+
 #ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
 
 static ssize_t dlpar_cpu_probe(const char *buf, size_t count)
diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h
index 8411c27293e4..7aa83f00ac62 100644
--- a/arch/powerpc/platforms/pseries/pseries.h
+++ b/arch/powerpc/platforms/pseries/pseries.h
@@ -73,6 +73,15 @@ static inline int dlpar_memory(struct pseries_hp_errorlog *hp_elog)
 }
 #endif
 
+#ifdef CONFIG_HOTPLUG_CPU
+int dlpar_cpu(struct pseries_hp_errorlog *hp_elog);
+#else
+static inline int dlpar_cpu(struct pseries_hp_errorlog *hp_elog)
+{
+	return -EOPNOTSUPP;
+}
+#endif
+
 /* PCI root bridge prepare function override for pseries */
 struct pci_host_bridge;
 int pseries_root_bridge_prepare(struct pci_host_bridge *bridge);

From 90edf184b9b7275d248f1b9902733a0000e4ecf8 Mon Sep 17 00:00:00 2001
From: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Date: Wed, 16 Dec 2015 14:55:07 -0600
Subject: [PATCH 107/149] powerpc/pseries: Add CPU dlpar add functionality

Add the ability to hotplug add cpus via rtas hotplug events by either
specifying the drc index of the CPU to add, or providing a count of the
number of CPUs to add.

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/pseries/hotplug-cpu.c | 116 +++++++++++++++++++
 1 file changed, 116 insertions(+)

diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c
index 86c7ae3db50e..32274f72fe3f 100644
--- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
+++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
@@ -405,6 +405,27 @@ static bool dlpar_cpu_exists(struct device_node *parent, u32 drc_index)
 	return found;
 }
 
+static bool valid_cpu_drc_index(struct device_node *parent, u32 drc_index)
+{
+	bool found = false;
+	int rc, index;
+
+	index = 1;	/* entry 0 holds the entry count, not a drc index */
+	while (!found) {
+		u32 drc;
+
+		rc = of_property_read_u32_index(parent, "ibm,drc-indexes",
+						index++, &drc);
+		if (rc)
+			break;
+
+		if (drc == drc_index)
+			found = true;
+	}
+
+	return found;
+}
+
 static ssize_t dlpar_cpu_add(u32 drc_index)
 {
 	struct device_node *dn, *parent;
@@ -424,6 +445,12 @@ static ssize_t dlpar_cpu_add(u32 drc_index)
 		return -EINVAL;
 	}
 
+	if (!valid_cpu_drc_index(parent, drc_index)) {
+		of_node_put(parent);
+		pr_warn("Cannot find CPU (drc index %x) to add.\n", drc_index);
+		return -EINVAL;
+	}
+
 	rc = dlpar_acquire_drc(drc_index);
 	if (rc) {
 		pr_warn("Failed to acquire DRC, rc: %d, drc index: %x\n",
@@ -683,6 +710,87 @@ static int dlpar_cpu_remove_by_count(u32 cpus_to_remove)
 	return rc;
 }
 
+/* Find up to @cpus_to_add addable CPU drcs; returns how many, -1 on error. */
+static int find_dlpar_cpus_to_add(u32 *cpu_drcs, u32 cpus_to_add)
+{
+	struct device_node *parent;
+	int index, rc, cpus_found = 0;
+
+	parent = of_find_node_by_path("/cpus");
+	if (!parent) {
+		pr_warn("Could not find CPU root node in device tree\n");
+		/* cpu_drcs belongs to the caller, which frees it itself */
+		return -1;
+	}
+
+	/* Search the ibm,drc-indexes array for possible CPU drcs to
+	 * add. Note that the format of the ibm,drc-indexes array is
+	 * the number of entries in the array followed by the array
+	 * of drc values so we start looking at index = 1.
+	 */
+	index = 1;
+	while (cpus_found < cpus_to_add) {
+		u32 drc;
+
+		rc = of_property_read_u32_index(parent, "ibm,drc-indexes",
+						index++, &drc);
+		if (rc)
+			break;
+
+		if (dlpar_cpu_exists(parent, drc))
+			continue;
+
+		cpu_drcs[cpus_found++] = drc;
+	}
+
+	of_node_put(parent);
+	return cpus_found;
+}
+
+/* Add @cpus_to_add CPUs; on failure remove the ones already added. */
+static int dlpar_cpu_add_by_count(u32 cpus_to_add)
+{
+	u32 *cpu_drcs;
+	int cpus_found, cpus_added = 0, i, rc;
+
+	pr_debug("Attempting to hot-add %d CPUs\n", cpus_to_add);
+
+	cpu_drcs = kcalloc(cpus_to_add, sizeof(*cpu_drcs), GFP_KERNEL);
+	if (!cpu_drcs)
+		return -EINVAL;
+
+	cpus_found = find_dlpar_cpus_to_add(cpu_drcs, cpus_to_add);
+	/* test the sign first: compared with a u32, -1 would look huge */
+	if (cpus_found < 0 || (u32)cpus_found < cpus_to_add) {
+		pr_warn("Failed to find enough CPUs (%d of %d) to add\n",
+			cpus_found, cpus_to_add);
+		kfree(cpu_drcs);
+		return -EINVAL;
+	}
+
+	for (i = 0; i < cpus_to_add; i++) {
+		rc = dlpar_cpu_add(cpu_drcs[i]);
+		if (rc)
+			break;
+
+		cpus_added++;
+	}
+
+	if (cpus_added < cpus_to_add) {
+		pr_warn("CPU hot-add failed, removing any added CPUs\n");
+
+		for (i = 0; i < cpus_added; i++)
+			dlpar_cpu_remove_by_index(cpu_drcs[i]);
+
+		rc = -EINVAL;
+	} else {
+		rc = 0;
+	}
+
+	kfree(cpu_drcs);
+	return rc;
+}
+
 int dlpar_cpu(struct pseries_hp_errorlog *hp_elog)
 {
 	u32 count, drc_index;
@@ -702,6 +810,14 @@ int dlpar_cpu(struct pseries_hp_errorlog *hp_elog)
 		else
 			rc = -EINVAL;
 		break;
+	case PSERIES_HP_ELOG_ACTION_ADD:
+		if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_COUNT)
+			rc = dlpar_cpu_add_by_count(count);
+		else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_INDEX)
+			rc = dlpar_cpu_add(drc_index);
+		else
+			rc = -EINVAL;
+		break;
 	default:
 		pr_err("Invalid action (%d) specified\n", hp_elog->action);
 		rc = -EINVAL;

From e9d764f803964a54ca7da4a67d124fe824ebd80a Mon Sep 17 00:00:00 2001
From: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Date: Wed, 16 Dec 2015 14:56:02 -0600
Subject: [PATCH 108/149] powerpc/pseries: Enable kernel CPU dlpar from sysfs

Enable new kernel cpu hotplug functionality by allowing cpu dlpar requests
to be initiated from sysfs.

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/pseries/dlpar.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c
index 438fdbd7e40e..2b93ae8d557a 100644
--- a/arch/powerpc/platforms/pseries/dlpar.c
+++ b/arch/powerpc/platforms/pseries/dlpar.c
@@ -356,6 +356,9 @@ static int handle_dlpar_errorlog(struct pseries_hp_errorlog *hp_elog)
 	case PSERIES_HP_ELOG_RESOURCE_MEM:
 		rc = dlpar_memory(hp_elog);
 		break;
+	case PSERIES_HP_ELOG_RESOURCE_CPU:
+		rc = dlpar_cpu(hp_elog);
+		break;
 	default:
 		pr_warn_ratelimited("Invalid resource (%d) specified\n",
 				    hp_elog->resource);
@@ -385,6 +388,9 @@ static ssize_t dlpar_store(struct class *class, struct class_attribute *attr,
 	if (!strncmp(arg, "memory", 6)) {
 		hp_elog->resource = PSERIES_HP_ELOG_RESOURCE_MEM;
 		arg += strlen("memory ");
+	} else if (!strncmp(arg, "cpu", 3)) {
+		hp_elog->resource = PSERIES_HP_ELOG_RESOURCE_CPU;
+		arg += strlen("cpu ");
 	} else {
 		pr_err("Invalid resource specified: \"%s\"\n", buf);
 		rc = -EINVAL;

From 1b855e167b90fcb353977c08932d0a52eb8ae5b9 Mon Sep 17 00:00:00 2001
From: Daniel Axtens <dja@axtens.net>
Date: Thu, 17 Dec 2015 19:41:00 +1100
Subject: [PATCH 109/149] powerpc: Add missing calls to va_end()

cppcheck picked up that there were a couple of missing va_end()
calls in functions using va_start().

Signed-off-by: Daniel Axtens <dja@axtens.net>
Reviewed-by: Russell Currey <ruscur@russell.cc>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/prom_init.c              | 1 +
 arch/powerpc/platforms/powermac/bootx_init.c | 1 +
 2 files changed, 2 insertions(+)

diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index 92dea8df6b26..da5192590c44 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -389,6 +389,7 @@ static void __init prom_printf(const char *format, ...)
 			break;
 		}
 	}
+	va_end(args);
 }
 
 
diff --git a/arch/powerpc/platforms/powermac/bootx_init.c b/arch/powerpc/platforms/powermac/bootx_init.c
index 76f5013c35e5..c3c9bbb3573a 100644
--- a/arch/powerpc/platforms/powermac/bootx_init.c
+++ b/arch/powerpc/platforms/powermac/bootx_init.c
@@ -84,6 +84,7 @@ static void __init bootx_printf(const char *format, ...)
 			break;
 		}
 	}
+	va_end(args);
 }
 #else /* CONFIG_BOOTX_TEXT */
 static void __init bootx_printf(const char *format, ...) {}

From c395465da68bfc3a238d5bc15f862e33e6e9ecec Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Wed, 28 Oct 2015 15:54:06 +1100
Subject: [PATCH 110/149] powerpc: Add function to copy mm_context_t to the
 paca

This adds a function to copy the mm->context to the paca.  This is
only a basic conversion for now but will be used more extensively in
the next patch.

This also adds #ifdef CONFIG_PPC_BOOK3S around this code since it's
not used elsewhere.

Signed-off-by: Michael Neuling <mikey@neuling.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/paca.h   | 11 +++++++++++
 arch/powerpc/kernel/asm-offsets.c |  2 ++
 arch/powerpc/mm/hash_utils_64.c   |  5 +++--
 arch/powerpc/mm/slb.c             |  2 +-
 arch/powerpc/mm/slice.c           |  3 +--
 5 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 70bd4381f8e6..1cc6e0828907 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -131,7 +131,9 @@ struct paca_struct {
 	struct tlb_core_data tcd;
 #endif /* CONFIG_PPC_BOOK3E */
 
+#ifdef CONFIG_PPC_BOOK3S
 	mm_context_t context;
+#endif
 
 	/*
 	 * then miscellaneous read-write fields
@@ -194,6 +196,15 @@ struct paca_struct {
 #endif
 };
 
+#ifdef CONFIG_PPC_BOOK3S
+static inline void copy_mm_to_paca(mm_context_t *context)
+{
+	get_paca()->context = *context;
+}
+#else
+static inline void copy_mm_to_paca(mm_context_t *context){}
+#endif
+
 extern struct paca_struct *paca;
 extern void initialise_paca(struct paca_struct *new_paca, int cpu);
 extern void setup_paca(struct paca_struct *new_paca);
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 221d584d089f..9db7be292bf3 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -185,6 +185,7 @@ int main(void)
 	DEFINE(PACAKMSR, offsetof(struct paca_struct, kernel_msr));
 	DEFINE(PACASOFTIRQEN, offsetof(struct paca_struct, soft_enabled));
 	DEFINE(PACAIRQHAPPENED, offsetof(struct paca_struct, irq_happened));
+#ifdef CONFIG_PPC_BOOK3S
 	DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id));
 #ifdef CONFIG_PPC_MM_SLICES
 	DEFINE(PACALOWSLICESPSIZE, offsetof(struct paca_struct,
@@ -193,6 +194,7 @@ int main(void)
 					    context.high_slices_psize));
 	DEFINE(MMUPSIZEDEFSIZE, sizeof(struct mmu_psize_def));
 #endif /* CONFIG_PPC_MM_SLICES */
+#endif
 
 #ifdef CONFIG_PPC_BOOK3E
 	DEFINE(PACAPGD, offsetof(struct paca_struct, pgd));
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 4233dcccbaf7..03279eac0957 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -882,7 +882,8 @@ void demote_segment_4k(struct mm_struct *mm, unsigned long addr)
 	slice_set_range_psize(mm, addr, 1, MMU_PAGE_4K);
 	copro_flush_all_slbs(mm);
 	if ((get_paca_psize(addr) != MMU_PAGE_4K) && (current->mm == mm)) {
-		get_paca()->context = mm->context;
+
+		copy_mm_to_paca(&mm->context);
 		slb_flush_and_rebolt();
 	}
 }
@@ -949,7 +950,7 @@ static void check_paca_psize(unsigned long ea, struct mm_struct *mm,
 {
 	if (user_region) {
 		if (psize != get_paca_psize(ea)) {
-			get_paca()->context = mm->context;
+			copy_mm_to_paca(&mm->context);
 			slb_flush_and_rebolt();
 		}
 	} else if (get_paca()->vmalloc_sllp !=
diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
index 515730e499fe..825b6873391f 100644
--- a/arch/powerpc/mm/slb.c
+++ b/arch/powerpc/mm/slb.c
@@ -228,7 +228,7 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
 		asm volatile("slbie %0" : : "r" (slbie_data));
 
 	get_paca()->slb_cache_ptr = 0;
-	get_paca()->context = mm->context;
+	copy_mm_to_paca(&mm->context);
 
 	/*
 	 * preload some userspace segments into the SLB.
diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index 0f432a702870..42954f0b47ac 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -185,8 +185,7 @@ static void slice_flush_segments(void *parm)
 	if (mm != current->active_mm)
 		return;
 
-	/* update the paca copy of the context struct */
-	get_paca()->context = current->active_mm->context;
+	copy_mm_to_paca(&current->active_mm->context);
 
 	local_irq_save(flags);
 	slb_flush_and_rebolt();

From de2dd0eb30af55d3893979d5641c50c7a8969c99 Mon Sep 17 00:00:00 2001
From: Zhao Qiang <qiang.zhao@freescale.com>
Date: Mon, 30 Nov 2015 10:48:52 +0800
Subject: [PATCH 111/149] genalloc:support memory-allocation with
 bytes-alignment to genalloc

Byte alignment is required to manage some special RAM, so add
gen_pool_first_fit_align to genalloc. Also add gen_pool_alloc_algo so
that a user with more than one algo can pass the algo, and the data it
needs (e.g. for gen_pool_first_fit_align), on each call; gen_pool_alloc
becomes a wrapper around it.

Signed-off-by: Zhao Qiang <qiang.zhao@freescale.com>
Signed-off-by: Scott Wood <scottwood@freescale.com>
---
 include/linux/genalloc.h | 27 +++++++++++++++---
 lib/genalloc.c           | 61 ++++++++++++++++++++++++++++++++++++----
 2 files changed, 78 insertions(+), 10 deletions(-)

diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h
index 7ff168d06967..3c676ce46ee0 100644
--- a/include/linux/genalloc.h
+++ b/include/linux/genalloc.h
@@ -30,10 +30,12 @@
 #ifndef __GENALLOC_H__
 #define __GENALLOC_H__
 
+#include <linux/types.h>
 #include <linux/spinlock_types.h>
 
 struct device;
 struct device_node;
+struct gen_pool;
 
 /**
  * Allocation callback function type definition
@@ -47,7 +49,7 @@ typedef unsigned long (*genpool_algo_t)(unsigned long *map,
 			unsigned long size,
 			unsigned long start,
 			unsigned int nr,
-			void *data);
+			void *data, struct gen_pool *pool);
 
 /*
  *  General purpose special memory pool descriptor.
@@ -75,6 +77,13 @@ struct gen_pool_chunk {
 	unsigned long bits[0];		/* bitmap for allocating memory chunk */
 };
 
+/*
+ *  gen_pool data descriptor for gen_pool_first_fit_align.
+ */
+struct genpool_data_align {
+	int align;		/* alignment by bytes for starting address */
+};
+
 extern struct gen_pool *gen_pool_create(int, int);
 extern phys_addr_t gen_pool_virt_to_phys(struct gen_pool *pool, unsigned long);
 extern int gen_pool_add_virt(struct gen_pool *, unsigned long, phys_addr_t,
@@ -98,6 +107,8 @@ static inline int gen_pool_add(struct gen_pool *pool, unsigned long addr,
 }
 extern void gen_pool_destroy(struct gen_pool *);
 extern unsigned long gen_pool_alloc(struct gen_pool *, size_t);
+extern unsigned long gen_pool_alloc_algo(struct gen_pool *, size_t,
+		genpool_algo_t algo, void *data);
 extern void *gen_pool_dma_alloc(struct gen_pool *pool, size_t size,
 		dma_addr_t *dma);
 extern void gen_pool_free(struct gen_pool *, unsigned long, size_t);
@@ -110,14 +121,22 @@ extern void gen_pool_set_algo(struct gen_pool *pool, genpool_algo_t algo,
 		void *data);
 
 extern unsigned long gen_pool_first_fit(unsigned long *map, unsigned long size,
-		unsigned long start, unsigned int nr, void *data);
+		unsigned long start, unsigned int nr, void *data,
+		struct gen_pool *pool);
+
+extern unsigned long gen_pool_first_fit_align(unsigned long *map,
+		unsigned long size, unsigned long start, unsigned int nr,
+		void *data, struct gen_pool *pool);
+
 
 extern unsigned long gen_pool_first_fit_order_align(unsigned long *map,
 		unsigned long size, unsigned long start, unsigned int nr,
-		void *data);
+		void *data, struct gen_pool *pool);
 
 extern unsigned long gen_pool_best_fit(unsigned long *map, unsigned long size,
-		unsigned long start, unsigned int nr, void *data);
+		unsigned long start, unsigned int nr, void *data,
+		struct gen_pool *pool);
+
 
 extern struct gen_pool *devm_gen_pool_create(struct device *dev,
 		int min_alloc_order, int nid, const char *name);
diff --git a/lib/genalloc.c b/lib/genalloc.c
index 116a166b096f..b8cf89d9e17d 100644
--- a/lib/genalloc.c
+++ b/lib/genalloc.c
@@ -269,6 +269,25 @@ EXPORT_SYMBOL(gen_pool_destroy);
  * NMI-safe cmpxchg implementation.
  */
 unsigned long gen_pool_alloc(struct gen_pool *pool, size_t size)
+{
+	return gen_pool_alloc_algo(pool, size, pool->algo, pool->data);
+}
+EXPORT_SYMBOL(gen_pool_alloc);
+
+/**
+ * gen_pool_alloc_algo - allocate special memory from the pool
+ * @pool: pool to allocate from
+ * @size: number of bytes to allocate from the pool
+ * @algo: algorithm passed from caller
+ * @data: data passed to algorithm
+ *
+ * Allocate the requested number of bytes from the specified pool.
+ * Uses the pool allocation function (with first-fit algorithm by default).
+ * Can not be used in NMI handler on architectures without
+ * NMI-safe cmpxchg implementation.
+ */
+unsigned long gen_pool_alloc_algo(struct gen_pool *pool, size_t size,
+		genpool_algo_t algo, void *data)
 {
 	struct gen_pool_chunk *chunk;
 	unsigned long addr = 0;
@@ -290,8 +309,8 @@ unsigned long gen_pool_alloc(struct gen_pool *pool, size_t size)
 
 		end_bit = chunk_size(chunk) >> order;
 retry:
-		start_bit = pool->algo(chunk->bits, end_bit, start_bit, nbits,
-				pool->data);
+		start_bit = algo(chunk->bits, end_bit, start_bit,
+				 nbits, data, pool);
 		if (start_bit >= end_bit)
 			continue;
 		remain = bitmap_set_ll(chunk->bits, start_bit, nbits);
@@ -310,7 +329,7 @@ retry:
 	rcu_read_unlock();
 	return addr;
 }
-EXPORT_SYMBOL(gen_pool_alloc);
+EXPORT_SYMBOL(gen_pool_alloc_algo);
 
 /**
  * gen_pool_dma_alloc - allocate special memory from the pool for DMA usage
@@ -501,14 +520,41 @@ EXPORT_SYMBOL(gen_pool_set_algo);
  * @start: The bitnumber to start searching at
  * @nr: The number of zeroed bits we're looking for
  * @data: additional data - unused
+ * @pool: pool to find the fit region memory from
  */
 unsigned long gen_pool_first_fit(unsigned long *map, unsigned long size,
-		unsigned long start, unsigned int nr, void *data)
+		unsigned long start, unsigned int nr, void *data,
+		struct gen_pool *pool)
 {
 	return bitmap_find_next_zero_area(map, size, start, nr, 0);
 }
 EXPORT_SYMBOL(gen_pool_first_fit);
 
+/**
+ * gen_pool_first_fit_align - find the first available region
+ * of memory matching the size requirement (alignment constraint)
+ * @map: The address to base the search on
+ * @size: The bitmap size in bits
+ * @start: The bitnumber to start searching at
+ * @nr: The number of zeroed bits we're looking for
+ * @data: data for alignment
+ * @pool: pool to get order from
+ */
+unsigned long gen_pool_first_fit_align(unsigned long *map, unsigned long size,
+		unsigned long start, unsigned int nr, void *data,
+		struct gen_pool *pool)
+{
+	struct genpool_data_align *alignment;
+	unsigned long align_mask;
+	int order;
+
+	alignment = data;
+	order = pool->min_alloc_order;
+	align_mask = ((alignment->align + (1UL << order) - 1) >> order) - 1;
+	return bitmap_find_next_zero_area(map, size, start, nr, align_mask);
+}
+EXPORT_SYMBOL(gen_pool_first_fit_align);
+
 /**
  * gen_pool_first_fit_order_align - find the first available region
  * of memory matching the size requirement. The region will be aligned
@@ -518,10 +564,11 @@ EXPORT_SYMBOL(gen_pool_first_fit);
  * @start: The bitnumber to start searching at
  * @nr: The number of zeroed bits we're looking for
  * @data: additional data - unused
+ * @pool: pool to find the fit region memory from
  */
 unsigned long gen_pool_first_fit_order_align(unsigned long *map,
 		unsigned long size, unsigned long start,
-		unsigned int nr, void *data)
+		unsigned int nr, void *data, struct gen_pool *pool)
 {
 	unsigned long align_mask = roundup_pow_of_two(nr) - 1;
 
@@ -537,12 +584,14 @@ EXPORT_SYMBOL(gen_pool_first_fit_order_align);
  * @start: The bitnumber to start searching at
  * @nr: The number of zeroed bits we're looking for
  * @data: additional data - unused
+ * @pool: pool to find the fit region memory from
  *
  * Iterate over the bitmap to find the smallest free region
  * which we can allocate the memory.
  */
 unsigned long gen_pool_best_fit(unsigned long *map, unsigned long size,
-		unsigned long start, unsigned int nr, void *data)
+		unsigned long start, unsigned int nr, void *data,
+		struct gen_pool *pool)
 {
 	unsigned long start_bit = size;
 	unsigned long len = size + 1;

From b26981c8f743d3cb64a6907eb1f5c6c4ba6ca672 Mon Sep 17 00:00:00 2001
From: Zhao Qiang <qiang.zhao@freescale.com>
Date: Mon, 30 Nov 2015 10:48:53 +0800
Subject: [PATCH 112/149] genalloc:support allocating specific region

Add a new algo for genalloc that reserves a specific region of
memory.

Signed-off-by: Zhao Qiang <qiang.zhao@freescale.com>
Signed-off-by: Scott Wood <scottwood@freescale.com>
---
 include/linux/genalloc.h | 11 +++++++++++
 lib/genalloc.c           | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+)

diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h
index 3c676ce46ee0..29d4385903d4 100644
--- a/include/linux/genalloc.h
+++ b/include/linux/genalloc.h
@@ -84,6 +84,13 @@ struct genpool_data_align {
 	int align;		/* alignment by bytes for starting address */
 };
 
+/*
+ *  gen_pool data descriptor for gen_pool_fixed_alloc.
+ */
+struct genpool_data_fixed {
+	unsigned long offset;		/* The offset of the specific region */
+};
+
 extern struct gen_pool *gen_pool_create(int, int);
 extern phys_addr_t gen_pool_virt_to_phys(struct gen_pool *pool, unsigned long);
 extern int gen_pool_add_virt(struct gen_pool *, unsigned long, phys_addr_t,
@@ -124,6 +131,10 @@ extern unsigned long gen_pool_first_fit(unsigned long *map, unsigned long size,
 		unsigned long start, unsigned int nr, void *data,
 		struct gen_pool *pool);
 
+extern unsigned long gen_pool_fixed_alloc(unsigned long *map,
+		unsigned long size, unsigned long start, unsigned int nr,
+		void *data, struct gen_pool *pool);
+
 extern unsigned long gen_pool_first_fit_align(unsigned long *map,
 		unsigned long size, unsigned long start, unsigned int nr,
 		void *data, struct gen_pool *pool);
diff --git a/lib/genalloc.c b/lib/genalloc.c
index b8cf89d9e17d..5ec83cd93284 100644
--- a/lib/genalloc.c
+++ b/lib/genalloc.c
@@ -555,6 +555,38 @@ unsigned long gen_pool_first_fit_align(unsigned long *map, unsigned long size,
 }
 EXPORT_SYMBOL(gen_pool_first_fit_align);
 
+/**
+ * gen_pool_fixed_alloc - reserve a specific region
+ * @map: The address to base the search on
+ * @size: The bitmap size in bits
+ * @start: The bitnumber to start searching at
+ * @nr: The number of zeroed bits we're looking for
+ * @data: struct genpool_data_fixed with the offset of the region to reserve
+ * @pool: pool to get order from
+ */
+unsigned long gen_pool_fixed_alloc(unsigned long *map, unsigned long size,
+		unsigned long start, unsigned int nr, void *data,
+		struct gen_pool *pool)
+{
+	struct genpool_data_fixed *fixed_data;
+	int order;
+	unsigned long offset_bit;
+	unsigned long start_bit;
+
+	fixed_data = data;
+	order = pool->min_alloc_order;
+	offset_bit = fixed_data->offset >> order;
+	if (WARN_ON(fixed_data->offset & (1UL << order - 1)))
+		return size;
+
+	start_bit = bitmap_find_next_zero_area(map, size,
+			start + offset_bit, nr, 0);
+	if (start_bit != offset_bit)
+		start_bit = size;
+	return start_bit;
+}
+EXPORT_SYMBOL(gen_pool_fixed_alloc);
+
 /**
  * gen_pool_first_fit_order_align - find the first available region
  * of memory matching the size requirement. The region will be aligned

From 0e6e01ff694ee222acc5a9184211678473c948e3 Mon Sep 17 00:00:00 2001
From: Zhao Qiang <qiang.zhao@freescale.com>
Date: Mon, 30 Nov 2015 10:48:54 +0800
Subject: [PATCH 113/149] CPM/QE: use genalloc to manage CPM/QE muram

Use genalloc to manage CPM/QE muram instead of rheap.

Signed-off-by: Zhao Qiang <qiang.zhao@freescale.com>
Signed-off-by: Scott Wood <scottwood@freescale.com>
---
 arch/powerpc/include/asm/cpm.h   |   3 +
 arch/powerpc/platforms/Kconfig   |   4 +-
 arch/powerpc/sysdev/cpm_common.c | 126 +++++++++++++++++++++----------
 lib/genalloc.c                   |   2 +-
 4 files changed, 94 insertions(+), 41 deletions(-)

diff --git a/arch/powerpc/include/asm/cpm.h b/arch/powerpc/include/asm/cpm.h
index 4398a6cdcf53..46e86b50abf3 100644
--- a/arch/powerpc/include/asm/cpm.h
+++ b/arch/powerpc/include/asm/cpm.h
@@ -2,6 +2,7 @@
 #define __CPM_H
 
 #include <linux/compiler.h>
+#include <linux/genalloc.h>
 #include <linux/types.h>
 #include <linux/errno.h>
 #include <linux/of.h>
@@ -161,6 +162,8 @@ int cpm_muram_init(void);
 unsigned long cpm_muram_alloc(unsigned long size, unsigned long align);
 int cpm_muram_free(unsigned long offset);
 unsigned long cpm_muram_alloc_fixed(unsigned long offset, unsigned long size);
+unsigned long cpm_muram_alloc_common(unsigned long size, genpool_algo_t algo,
+				     void *data);
 void __iomem *cpm_muram_addr(unsigned long offset);
 unsigned long cpm_muram_offset(void __iomem *addr);
 dma_addr_t cpm_muram_dma(void __iomem *addr);
diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig
index b7f9c408bf24..57069eb8f093 100644
--- a/arch/powerpc/platforms/Kconfig
+++ b/arch/powerpc/platforms/Kconfig
@@ -275,7 +275,7 @@ config TAU_AVERAGE
 config QUICC_ENGINE
 	bool "Freescale QUICC Engine (QE) Support"
 	depends on FSL_SOC && PPC32
-	select PPC_LIB_RHEAP
+	select GENERIC_ALLOCATOR
 	select CRC32
 	help
 	  The QUICC Engine (QE) is a new generation of communications
@@ -295,7 +295,6 @@ config CPM2
 	bool "Enable support for the CPM2 (Communications Processor Module)"
 	depends on (FSL_SOC_BOOKE && PPC32) || 8260
 	select CPM
-	select PPC_LIB_RHEAP
 	select PPC_PCI_CHOICE
 	select ARCH_REQUIRE_GPIOLIB
 	help
@@ -325,6 +324,7 @@ config FSL_ULI1575
 
 config CPM
 	bool
+	select GENERIC_ALLOCATOR
 
 config OF_RTC
 	bool
diff --git a/arch/powerpc/sysdev/cpm_common.c b/arch/powerpc/sysdev/cpm_common.c
index e00a5ee58fd7..fcc83cd9cc2f 100644
--- a/arch/powerpc/sysdev/cpm_common.c
+++ b/arch/powerpc/sysdev/cpm_common.c
@@ -17,6 +17,7 @@
  * published by the Free Software Foundation.
  */
 
+#include <linux/genalloc.h>
 #include <linux/init.h>
 #include <linux/of_device.h>
 #include <linux/spinlock.h>
@@ -27,7 +28,6 @@
 
 #include <asm/udbg.h>
 #include <asm/io.h>
-#include <asm/rheap.h>
 #include <asm/cpm.h>
 
 #include <mm/mmu_decl.h>
@@ -65,14 +65,22 @@ void __init udbg_init_cpm(void)
 }
 #endif
 
+static struct gen_pool *muram_pool;
 static spinlock_t cpm_muram_lock;
-static rh_block_t cpm_boot_muram_rh_block[16];
-static rh_info_t cpm_muram_info;
 static u8 __iomem *muram_vbase;
 static phys_addr_t muram_pbase;
 
-/* Max address size we deal with */
+struct muram_block {
+	struct list_head head;
+	unsigned long start;
+	int size;
+};
+
+static LIST_HEAD(muram_block_list);
+
+/* max address size we deal with */
 #define OF_MAX_ADDR_CELLS	4
+#define GENPOOL_OFFSET		(4096 * 8)
 
 int cpm_muram_init(void)
 {
@@ -87,50 +95,51 @@ int cpm_muram_init(void)
 		return 0;
 
 	spin_lock_init(&cpm_muram_lock);
-	/* initialize the info header */
-	rh_init(&cpm_muram_info, 1,
-	        sizeof(cpm_boot_muram_rh_block) /
-	        sizeof(cpm_boot_muram_rh_block[0]),
-	        cpm_boot_muram_rh_block);
-
 	np = of_find_compatible_node(NULL, NULL, "fsl,cpm-muram-data");
 	if (!np) {
 		/* try legacy bindings */
 		np = of_find_node_by_name(NULL, "data-only");
 		if (!np) {
-			printk(KERN_ERR "Cannot find CPM muram data node");
+			pr_err("Cannot find CPM muram data node");
 			ret = -ENODEV;
-			goto out;
+			goto out_muram;
 		}
 	}
 
+	muram_pool = gen_pool_create(0, -1);
 	muram_pbase = of_translate_address(np, zero);
 	if (muram_pbase == (phys_addr_t)OF_BAD_ADDR) {
-		printk(KERN_ERR "Cannot translate zero through CPM muram node");
+		pr_err("Cannot translate zero through CPM muram node");
 		ret = -ENODEV;
-		goto out;
+		goto out_pool;
 	}
 
 	while (of_address_to_resource(np, i++, &r) == 0) {
 		if (r.end > max)
 			max = r.end;
-
-		rh_attach_region(&cpm_muram_info, r.start - muram_pbase,
-				 resource_size(&r));
+		ret = gen_pool_add(muram_pool, r.start - muram_pbase +
+				   GENPOOL_OFFSET, resource_size(&r), -1);
+		if (ret) {
+			pr_err("QE: couldn't add muram to pool!\n");
+			goto out_pool;
+		}
 	}
 
 	muram_vbase = ioremap(muram_pbase, max - muram_pbase + 1);
 	if (!muram_vbase) {
-		printk(KERN_ERR "Cannot map CPM muram");
+		pr_err("Cannot map QE muram");
 		ret = -ENOMEM;
+		goto out_pool;
 	}
-
-out:
+	goto out_muram;
+out_pool:
+	gen_pool_destroy(muram_pool);
+out_muram:
 	of_node_put(np);
 	return ret;
 }
 
-/**
+/*
  * cpm_muram_alloc - allocate the requested size worth of multi-user ram
  * @size: number of bytes to allocate
  * @align: requested alignment, in bytes
@@ -143,14 +152,13 @@ unsigned long cpm_muram_alloc(unsigned long size, unsigned long align)
 {
 	unsigned long start;
 	unsigned long flags;
+	struct genpool_data_align muram_pool_data;
 
 	spin_lock_irqsave(&cpm_muram_lock, flags);
-	cpm_muram_info.alignment = align;
-	start = rh_alloc(&cpm_muram_info, size, "commproc");
-	if (!IS_ERR_VALUE(start))
-		memset_io(cpm_muram_addr(start), 0, size);
+	muram_pool_data.align = align;
+	start = cpm_muram_alloc_common(size, gen_pool_first_fit_align,
+				       &muram_pool_data);
 	spin_unlock_irqrestore(&cpm_muram_lock, flags);
-
 	return start;
 }
 EXPORT_SYMBOL(cpm_muram_alloc);
@@ -161,23 +169,31 @@ EXPORT_SYMBOL(cpm_muram_alloc);
  */
 int cpm_muram_free(unsigned long offset)
 {
-	int ret;
 	unsigned long flags;
+	int size;
+	struct muram_block *tmp;
 
+	size = 0;
 	spin_lock_irqsave(&cpm_muram_lock, flags);
-	ret = rh_free(&cpm_muram_info, offset);
+	list_for_each_entry(tmp, &muram_block_list, head) {
+		if (tmp->start == offset) {
+			size = tmp->size;
+			list_del(&tmp->head);
+			kfree(tmp);
+			break;
+		}
+	}
+	gen_pool_free(muram_pool, offset + GENPOOL_OFFSET, size);
 	spin_unlock_irqrestore(&cpm_muram_lock, flags);
-
-	return ret;
+	return size;
 }
 EXPORT_SYMBOL(cpm_muram_free);
 
-/**
+/*
  * cpm_muram_alloc_fixed - reserve a specific region of multi-user ram
- * @offset: the offset into the muram area to reserve
- * @size: the number of bytes to reserve
- *
- * This function returns "start" on success, -ENOMEM on failure.
+ * @offset: offset of allocation start address
+ * @size: number of bytes to allocate
+ * This function returns an offset into the muram area
  * Use cpm_dpram_addr() to get the virtual address of the area.
  * Use cpm_muram_free() to free the allocation.
  */
@@ -185,16 +201,50 @@ unsigned long cpm_muram_alloc_fixed(unsigned long offset, unsigned long size)
 {
 	unsigned long start;
 	unsigned long flags;
+	struct genpool_data_fixed muram_pool_data_fixed;
 
 	spin_lock_irqsave(&cpm_muram_lock, flags);
-	cpm_muram_info.alignment = 1;
-	start = rh_alloc_fixed(&cpm_muram_info, offset, size, "commproc");
+	muram_pool_data_fixed.offset = offset + GENPOOL_OFFSET;
+	start = cpm_muram_alloc_common(size, gen_pool_fixed_alloc,
+				       &muram_pool_data_fixed);
 	spin_unlock_irqrestore(&cpm_muram_lock, flags);
-
 	return start;
 }
 EXPORT_SYMBOL(cpm_muram_alloc_fixed);
 
+/*
+ * cpm_muram_alloc_common - cpm_muram_alloc common code
+ * @size: number of bytes to allocate
+ * @algo: algorithm for alloc.
+ * @data: data for genalloc's algorithm.
+ *
+ * This function returns an offset into the muram area.
+ */
+unsigned long cpm_muram_alloc_common(unsigned long size, genpool_algo_t algo,
+				     void *data)
+{
+	struct muram_block *entry;
+	unsigned long start;
+
+	start = gen_pool_alloc_algo(muram_pool, size, algo, data);
+	if (!start)
+		goto out2;
+	start = start - GENPOOL_OFFSET;
+	memset_io(cpm_muram_addr(start), 0, size);
+	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry)
+		goto out1;
+	entry->start = start;
+	entry->size = size;
+	list_add(&entry->head, &muram_block_list);
+
+	return start;
+out1:
+	gen_pool_free(muram_pool, start, size);
+out2:
+	return (unsigned long)-ENOMEM;
+}
+
 /**
  * cpm_muram_addr - turn a muram offset into a virtual address
  * @offset: muram offset to convert
diff --git a/lib/genalloc.c b/lib/genalloc.c
index 5ec83cd93284..0a1139644d32 100644
--- a/lib/genalloc.c
+++ b/lib/genalloc.c
@@ -576,7 +576,7 @@ unsigned long gen_pool_fixed_alloc(unsigned long *map, unsigned long size,
 	fixed_data = data;
 	order = pool->min_alloc_order;
 	offset_bit = fixed_data->offset >> order;
-	if (WARN_ON(fixed_data->offset & (1UL << order - 1)))
+	if (WARN_ON(fixed_data->offset & ((1UL << order) - 1)))
 		return size;
 
 	start_bit = bitmap_find_next_zero_area(map, size,

From 1291e49e893703e04e129fe2e17e87af40757bf1 Mon Sep 17 00:00:00 2001
From: Zhao Qiang <qiang.zhao@freescale.com>
Date: Mon, 30 Nov 2015 10:48:55 +0800
Subject: [PATCH 114/149] QE/CPM: move muram management functions to qe_common

QE and CPM have the same muram, so they use the same management
functions. Now that QE supports both ARM and PowerPC, it is necessary
to move QE to "drivers/soc". Move the muram management functions from
cpm_common to qe_common in preparation for moving QE code to "drivers/soc".

Signed-off-by: Zhao Qiang <qiang.zhao@freescale.com>
Signed-off-by: Scott Wood <scottwood@freescale.com>
---
 arch/powerpc/include/asm/cpm.h         |  47 +----
 arch/powerpc/include/asm/qe.h          |  50 ++++++
 arch/powerpc/sysdev/Makefile           |   1 +
 arch/powerpc/sysdev/cpm_common.c       | 208 +---------------------
 arch/powerpc/sysdev/qe_lib/Makefile    |   4 +-
 arch/powerpc/sysdev/qe_lib/qe_common.c | 235 +++++++++++++++++++++++++
 6 files changed, 290 insertions(+), 255 deletions(-)
 create mode 100644 arch/powerpc/sysdev/qe_lib/qe_common.c

diff --git a/arch/powerpc/include/asm/cpm.h b/arch/powerpc/include/asm/cpm.h
index 46e86b50abf3..0958028cf31a 100644
--- a/arch/powerpc/include/asm/cpm.h
+++ b/arch/powerpc/include/asm/cpm.h
@@ -2,10 +2,10 @@
 #define __CPM_H
 
 #include <linux/compiler.h>
-#include <linux/genalloc.h>
 #include <linux/types.h>
 #include <linux/errno.h>
 #include <linux/of.h>
+#include <asm/qe.h>
 
 /*
  * SPI Parameter RAM common to QE and CPM.
@@ -156,51 +156,6 @@ typedef struct cpm_buf_desc {
  */
 #define BD_I2C_START		(0x0400)
 
-int cpm_muram_init(void);
-
-#if defined(CONFIG_CPM) || defined(CONFIG_QUICC_ENGINE)
-unsigned long cpm_muram_alloc(unsigned long size, unsigned long align);
-int cpm_muram_free(unsigned long offset);
-unsigned long cpm_muram_alloc_fixed(unsigned long offset, unsigned long size);
-unsigned long cpm_muram_alloc_common(unsigned long size, genpool_algo_t algo,
-				     void *data);
-void __iomem *cpm_muram_addr(unsigned long offset);
-unsigned long cpm_muram_offset(void __iomem *addr);
-dma_addr_t cpm_muram_dma(void __iomem *addr);
-#else
-static inline unsigned long cpm_muram_alloc(unsigned long size,
-					    unsigned long align)
-{
-	return -ENOSYS;
-}
-
-static inline int cpm_muram_free(unsigned long offset)
-{
-	return -ENOSYS;
-}
-
-static inline unsigned long cpm_muram_alloc_fixed(unsigned long offset,
-						  unsigned long size)
-{
-	return -ENOSYS;
-}
-
-static inline void __iomem *cpm_muram_addr(unsigned long offset)
-{
-	return NULL;
-}
-
-static inline unsigned long cpm_muram_offset(void __iomem *addr)
-{
-	return -ENOSYS;
-}
-
-static inline dma_addr_t cpm_muram_dma(void __iomem *addr)
-{
-	return 0;
-}
-#endif /* defined(CONFIG_CPM) || defined(CONFIG_QUICC_ENGINE) */
-
 #ifdef CONFIG_CPM
 int cpm_command(u32 command, u8 opcode);
 #else
diff --git a/arch/powerpc/include/asm/qe.h b/arch/powerpc/include/asm/qe.h
index 32b9bfa0c9bd..ceeaf91854b5 100644
--- a/arch/powerpc/include/asm/qe.h
+++ b/arch/powerpc/include/asm/qe.h
@@ -16,11 +16,16 @@
 #define _ASM_POWERPC_QE_H
 #ifdef __KERNEL__
 
+#include <linux/compiler.h>
+#include <linux/genalloc.h>
 #include <linux/spinlock.h>
 #include <linux/errno.h>
 #include <linux/err.h>
 #include <asm/cpm.h>
 #include <asm/immap_qe.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/types.h>
 
 #define QE_NUM_OF_SNUM	256	/* There are 256 serial number in QE */
 #define QE_NUM_OF_BRGS	16
@@ -92,6 +97,51 @@ extern void qe_reset(void);
 static inline void qe_reset(void) {}
 #endif
 
+int cpm_muram_init(void);
+
+#if defined(CONFIG_CPM) || defined(CONFIG_QUICC_ENGINE)
+unsigned long cpm_muram_alloc(unsigned long size, unsigned long align);
+int cpm_muram_free(unsigned long offset);
+unsigned long cpm_muram_alloc_fixed(unsigned long offset, unsigned long size);
+unsigned long cpm_muram_alloc_common(unsigned long size, genpool_algo_t algo,
+				     void *data);
+void __iomem *cpm_muram_addr(unsigned long offset);
+unsigned long cpm_muram_offset(void __iomem *addr);
+dma_addr_t cpm_muram_dma(void __iomem *addr);
+#else
+static inline unsigned long cpm_muram_alloc(unsigned long size,
+					    unsigned long align)
+{
+	return -ENOSYS;
+}
+
+static inline int cpm_muram_free(unsigned long offset)
+{
+	return -ENOSYS;
+}
+
+static inline unsigned long cpm_muram_alloc_fixed(unsigned long offset,
+						  unsigned long size)
+{
+	return -ENOSYS;
+}
+
+static inline void __iomem *cpm_muram_addr(unsigned long offset)
+{
+	return NULL;
+}
+
+static inline unsigned long cpm_muram_offset(void __iomem *addr)
+{
+	return -ENOSYS;
+}
+
+static inline dma_addr_t cpm_muram_dma(void __iomem *addr)
+{
+	return 0;
+}
+#endif /* defined(CONFIG_CPM) || defined(CONFIG_QUICC_ENGINE) */
+
 /* QE PIO */
 #define QE_PIO_PINS 32
 
diff --git a/arch/powerpc/sysdev/Makefile b/arch/powerpc/sysdev/Makefile
index 5b492a6438ff..f1d47498ddf2 100644
--- a/arch/powerpc/sysdev/Makefile
+++ b/arch/powerpc/sysdev/Makefile
@@ -27,6 +27,7 @@ obj-$(CONFIG_SIMPLE_GPIO)	+= simple_gpio.o
 obj-$(CONFIG_FSL_RIO)		+= fsl_rio.o fsl_rmu.o
 obj-$(CONFIG_TSI108_BRIDGE)	+= tsi108_pci.o tsi108_dev.o
 obj-$(CONFIG_QUICC_ENGINE)	+= qe_lib/
+obj-$(CONFIG_CPM)		+= qe_lib/
 mv64x60-$(CONFIG_PCI)		+= mv64x60_pci.o
 obj-$(CONFIG_MV64X60)		+= $(mv64x60-y) mv64x60_pic.o mv64x60_dev.o \
 				   mv64x60_udbg.o
diff --git a/arch/powerpc/sysdev/cpm_common.c b/arch/powerpc/sysdev/cpm_common.c
index fcc83cd9cc2f..6993aa8e7242 100644
--- a/arch/powerpc/sysdev/cpm_common.c
+++ b/arch/powerpc/sysdev/cpm_common.c
@@ -17,7 +17,6 @@
  * published by the Free Software Foundation.
  */
 
-#include <linux/genalloc.h>
 #include <linux/init.h>
 #include <linux/of_device.h>
 #include <linux/spinlock.h>
@@ -29,6 +28,7 @@
 #include <asm/udbg.h>
 #include <asm/io.h>
 #include <asm/cpm.h>
+#include <asm/qe.h>
 
 #include <mm/mmu_decl.h>
 
@@ -65,212 +65,6 @@ void __init udbg_init_cpm(void)
 }
 #endif
 
-static struct gen_pool *muram_pool;
-static spinlock_t cpm_muram_lock;
-static u8 __iomem *muram_vbase;
-static phys_addr_t muram_pbase;
-
-struct muram_block {
-	struct list_head head;
-	unsigned long start;
-	int size;
-};
-
-static LIST_HEAD(muram_block_list);
-
-/* max address size we deal with */
-#define OF_MAX_ADDR_CELLS	4
-#define GENPOOL_OFFSET		(4096 * 8)
-
-int cpm_muram_init(void)
-{
-	struct device_node *np;
-	struct resource r;
-	u32 zero[OF_MAX_ADDR_CELLS] = {};
-	resource_size_t max = 0;
-	int i = 0;
-	int ret = 0;
-
-	if (muram_pbase)
-		return 0;
-
-	spin_lock_init(&cpm_muram_lock);
-	np = of_find_compatible_node(NULL, NULL, "fsl,cpm-muram-data");
-	if (!np) {
-		/* try legacy bindings */
-		np = of_find_node_by_name(NULL, "data-only");
-		if (!np) {
-			pr_err("Cannot find CPM muram data node");
-			ret = -ENODEV;
-			goto out_muram;
-		}
-	}
-
-	muram_pool = gen_pool_create(0, -1);
-	muram_pbase = of_translate_address(np, zero);
-	if (muram_pbase == (phys_addr_t)OF_BAD_ADDR) {
-		pr_err("Cannot translate zero through CPM muram node");
-		ret = -ENODEV;
-		goto out_pool;
-	}
-
-	while (of_address_to_resource(np, i++, &r) == 0) {
-		if (r.end > max)
-			max = r.end;
-		ret = gen_pool_add(muram_pool, r.start - muram_pbase +
-				   GENPOOL_OFFSET, resource_size(&r), -1);
-		if (ret) {
-			pr_err("QE: couldn't add muram to pool!\n");
-			goto out_pool;
-		}
-	}
-
-	muram_vbase = ioremap(muram_pbase, max - muram_pbase + 1);
-	if (!muram_vbase) {
-		pr_err("Cannot map QE muram");
-		ret = -ENOMEM;
-		goto out_pool;
-	}
-	goto out_muram;
-out_pool:
-	gen_pool_destroy(muram_pool);
-out_muram:
-	of_node_put(np);
-	return ret;
-}
-
-/*
- * cpm_muram_alloc - allocate the requested size worth of multi-user ram
- * @size: number of bytes to allocate
- * @align: requested alignment, in bytes
- *
- * This function returns an offset into the muram area.
- * Use cpm_dpram_addr() to get the virtual address of the area.
- * Use cpm_muram_free() to free the allocation.
- */
-unsigned long cpm_muram_alloc(unsigned long size, unsigned long align)
-{
-	unsigned long start;
-	unsigned long flags;
-	struct genpool_data_align muram_pool_data;
-
-	spin_lock_irqsave(&cpm_muram_lock, flags);
-	muram_pool_data.align = align;
-	start = cpm_muram_alloc_common(size, gen_pool_first_fit_align,
-				       &muram_pool_data);
-	spin_unlock_irqrestore(&cpm_muram_lock, flags);
-	return start;
-}
-EXPORT_SYMBOL(cpm_muram_alloc);
-
-/**
- * cpm_muram_free - free a chunk of multi-user ram
- * @offset: The beginning of the chunk as returned by cpm_muram_alloc().
- */
-int cpm_muram_free(unsigned long offset)
-{
-	unsigned long flags;
-	int size;
-	struct muram_block *tmp;
-
-	size = 0;
-	spin_lock_irqsave(&cpm_muram_lock, flags);
-	list_for_each_entry(tmp, &muram_block_list, head) {
-		if (tmp->start == offset) {
-			size = tmp->size;
-			list_del(&tmp->head);
-			kfree(tmp);
-			break;
-		}
-	}
-	gen_pool_free(muram_pool, offset + GENPOOL_OFFSET, size);
-	spin_unlock_irqrestore(&cpm_muram_lock, flags);
-	return size;
-}
-EXPORT_SYMBOL(cpm_muram_free);
-
-/*
- * cpm_muram_alloc_fixed - reserve a specific region of multi-user ram
- * @offset: offset of allocation start address
- * @size: number of bytes to allocate
- * This function returns an offset into the muram area
- * Use cpm_dpram_addr() to get the virtual address of the area.
- * Use cpm_muram_free() to free the allocation.
- */
-unsigned long cpm_muram_alloc_fixed(unsigned long offset, unsigned long size)
-{
-	unsigned long start;
-	unsigned long flags;
-	struct genpool_data_fixed muram_pool_data_fixed;
-
-	spin_lock_irqsave(&cpm_muram_lock, flags);
-	muram_pool_data_fixed.offset = offset + GENPOOL_OFFSET;
-	start = cpm_muram_alloc_common(size, gen_pool_fixed_alloc,
-				       &muram_pool_data_fixed);
-	spin_unlock_irqrestore(&cpm_muram_lock, flags);
-	return start;
-}
-EXPORT_SYMBOL(cpm_muram_alloc_fixed);
-
-/*
- * cpm_muram_alloc_common - cpm_muram_alloc common code
- * @size: number of bytes to allocate
- * @algo: algorithm for alloc.
- * @data: data for genalloc's algorithm.
- *
- * This function returns an offset into the muram area.
- */
-unsigned long cpm_muram_alloc_common(unsigned long size, genpool_algo_t algo,
-				     void *data)
-{
-	struct muram_block *entry;
-	unsigned long start;
-
-	start = gen_pool_alloc_algo(muram_pool, size, algo, data);
-	if (!start)
-		goto out2;
-	start = start - GENPOOL_OFFSET;
-	memset_io(cpm_muram_addr(start), 0, size);
-	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
-	if (!entry)
-		goto out1;
-	entry->start = start;
-	entry->size = size;
-	list_add(&entry->head, &muram_block_list);
-
-	return start;
-out1:
-	gen_pool_free(muram_pool, start, size);
-out2:
-	return (unsigned long)-ENOMEM;
-}
-
-/**
- * cpm_muram_addr - turn a muram offset into a virtual address
- * @offset: muram offset to convert
- */
-void __iomem *cpm_muram_addr(unsigned long offset)
-{
-	return muram_vbase + offset;
-}
-EXPORT_SYMBOL(cpm_muram_addr);
-
-unsigned long cpm_muram_offset(void __iomem *addr)
-{
-	return addr - (void __iomem *)muram_vbase;
-}
-EXPORT_SYMBOL(cpm_muram_offset);
-
-/**
- * cpm_muram_dma - turn a muram virtual address into a DMA address
- * @offset: virtual address from cpm_muram_addr() to convert
- */
-dma_addr_t cpm_muram_dma(void __iomem *addr)
-{
-	return muram_pbase + ((u8 __iomem *)addr - muram_vbase);
-}
-EXPORT_SYMBOL(cpm_muram_dma);
-
 #if defined(CONFIG_CPM2) || defined(CONFIG_8xx_GPIO)
 
 struct cpm2_ioports {
diff --git a/arch/powerpc/sysdev/qe_lib/Makefile b/arch/powerpc/sysdev/qe_lib/Makefile
index f1855c185291..ffac5410c5c7 100644
--- a/arch/powerpc/sysdev/qe_lib/Makefile
+++ b/arch/powerpc/sysdev/qe_lib/Makefile
@@ -1,8 +1,8 @@
 #
 # Makefile for the linux ppc-specific parts of QE
 #
-obj-$(CONFIG_QUICC_ENGINE)+= qe.o qe_ic.o qe_io.o
-
+obj-$(CONFIG_QUICC_ENGINE)+= qe.o qe_common.o qe_ic.o qe_io.o
+obj-$(CONFIG_CPM)	+= qe_common.o
 obj-$(CONFIG_UCC)	+= ucc.o
 obj-$(CONFIG_UCC_SLOW)	+= ucc_slow.o
 obj-$(CONFIG_UCC_FAST)	+= ucc_fast.o
diff --git a/arch/powerpc/sysdev/qe_lib/qe_common.c b/arch/powerpc/sysdev/qe_lib/qe_common.c
new file mode 100644
index 000000000000..b90043f1503b
--- /dev/null
+++ b/arch/powerpc/sysdev/qe_lib/qe_common.c
@@ -0,0 +1,235 @@
+/*
+ * Common CPM code
+ *
+ * Author: Scott Wood <scottwood@freescale.com>
+ *
+ * Copyright 2007-2008,2010 Freescale Semiconductor, Inc.
+ *
+ * Some parts derived from commproc.c/cpm2_common.c, which is:
+ * Copyright (c) 1997 Dan Malek (dmalek@jlc.net)
+ * Copyright (c) 1999-2001 Dan Malek <dan@embeddedalley.com>
+ * Copyright (c) 2000 MontaVista Software, Inc (source@mvista.com)
+ * 2006 (c) MontaVista Software, Inc.
+ * Vitaly Bordug <vbordug@ru.mvista.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ */
+#include <linux/genalloc.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/of_device.h>
+#include <linux/spinlock.h>
+#include <linux/export.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/slab.h>
+#include <linux/io.h>
+#include <asm/qe.h>
+
+static struct gen_pool *muram_pool;
+static spinlock_t cpm_muram_lock;
+static u8 __iomem *muram_vbase;
+static phys_addr_t muram_pbase;
+
+struct muram_block {
+	struct list_head head;
+	unsigned long start;
+	int size;
+};
+
+static LIST_HEAD(muram_block_list);
+
+/* max address size we deal with */
+#define OF_MAX_ADDR_CELLS	4
+#define GENPOOL_OFFSET		(4096 * 8)
+
+int cpm_muram_init(void)
+{
+	struct device_node *np;
+	struct resource r;
+	u32 zero[OF_MAX_ADDR_CELLS] = {};
+	resource_size_t max = 0;
+	int i = 0;
+	int ret = 0;
+
+	if (muram_pbase)
+		return 0;
+
+	spin_lock_init(&cpm_muram_lock);
+	np = of_find_compatible_node(NULL, NULL, "fsl,cpm-muram-data");
+	if (!np) {
+		/* try legacy bindings */
+		np = of_find_node_by_name(NULL, "data-only");
+		if (!np) {
+			pr_err("Cannot find CPM muram data node");
+			ret = -ENODEV;
+			goto out_muram;
+		}
+	}
+
+	muram_pool = gen_pool_create(0, -1);
+	muram_pbase = of_translate_address(np, zero);
+	if (muram_pbase == (phys_addr_t)OF_BAD_ADDR) {
+		pr_err("Cannot translate zero through CPM muram node");
+		ret = -ENODEV;
+		goto out_pool;
+	}
+
+	while (of_address_to_resource(np, i++, &r) == 0) {
+		if (r.end > max)
+			max = r.end;
+		ret = gen_pool_add(muram_pool, r.start - muram_pbase +
+				   GENPOOL_OFFSET, resource_size(&r), -1);
+		if (ret) {
+			pr_err("QE: couldn't add muram to pool!\n");
+			goto out_pool;
+		}
+	}
+
+	muram_vbase = ioremap(muram_pbase, max - muram_pbase + 1);
+	if (!muram_vbase) {
+		pr_err("Cannot map QE muram");
+		ret = -ENOMEM;
+		goto out_pool;
+	}
+	goto out_muram;
+out_pool:
+	gen_pool_destroy(muram_pool);
+out_muram:
+	of_node_put(np);
+	return ret;
+}
+
+/*
+ * cpm_muram_alloc - allocate the requested size worth of multi-user ram
+ * @size: number of bytes to allocate
+ * @align: requested alignment, in bytes
+ *
+ * This function returns an offset into the muram area.
+ * Use cpm_muram_addr() to get the virtual address of the area.
+ * Use cpm_muram_free() to free the allocation.
+ */
+unsigned long cpm_muram_alloc(unsigned long size, unsigned long align)
+{
+	unsigned long start;
+	unsigned long flags;
+	struct genpool_data_align muram_pool_data;
+
+	spin_lock_irqsave(&cpm_muram_lock, flags);
+	muram_pool_data.align = align;
+	start = cpm_muram_alloc_common(size, gen_pool_first_fit_align,
+				       &muram_pool_data);
+	spin_unlock_irqrestore(&cpm_muram_lock, flags);
+	return start;
+}
+EXPORT_SYMBOL(cpm_muram_alloc);
+
+/**
+ * cpm_muram_free - free a chunk of multi-user ram
+ * @offset: The beginning of the chunk as returned by cpm_muram_alloc().
+ */
+int cpm_muram_free(unsigned long offset)
+{
+	unsigned long flags;
+	int size;
+	struct muram_block *tmp;
+
+	size = 0;
+	spin_lock_irqsave(&cpm_muram_lock, flags);
+	list_for_each_entry(tmp, &muram_block_list, head) {
+		if (tmp->start == offset) {
+			size = tmp->size;
+			list_del(&tmp->head);
+			kfree(tmp);
+			break;
+		}
+	}
+	gen_pool_free(muram_pool, offset + GENPOOL_OFFSET, size);
+	spin_unlock_irqrestore(&cpm_muram_lock, flags);
+	return size;
+}
+EXPORT_SYMBOL(cpm_muram_free);
+
+/*
+ * cpm_muram_alloc_fixed - reserve a specific region of multi-user ram
+ * @offset: offset of allocation start address
+ * @size: number of bytes to allocate
+ * This function returns an offset into the muram area
+ * Use cpm_muram_addr() to get the virtual address of the area.
+ * Use cpm_muram_free() to free the allocation.
+ */
+unsigned long cpm_muram_alloc_fixed(unsigned long offset, unsigned long size)
+{
+	unsigned long start;
+	unsigned long flags;
+	struct genpool_data_fixed muram_pool_data_fixed;
+
+	spin_lock_irqsave(&cpm_muram_lock, flags);
+	muram_pool_data_fixed.offset = offset + GENPOOL_OFFSET;
+	start = cpm_muram_alloc_common(size, gen_pool_fixed_alloc,
+				       &muram_pool_data_fixed);
+	spin_unlock_irqrestore(&cpm_muram_lock, flags);
+	return start;
+}
+EXPORT_SYMBOL(cpm_muram_alloc_fixed);
+
+/*
+ * cpm_muram_alloc_common - cpm_muram_alloc common code
+ * @size: number of bytes to allocate
+ * @algo: algorithm for alloc.
+ * @data: data for genalloc's algorithm.
+ * The callers in this file hold cpm_muram_lock with IRQs off: do not sleep.
+ * Returns an offset into the muram area, or -ENOMEM cast to unsigned long.
+ */
+unsigned long cpm_muram_alloc_common(unsigned long size, genpool_algo_t algo,
+				     void *data)
+{
+	struct muram_block *entry;
+	unsigned long start;
+
+	start = gen_pool_alloc_algo(muram_pool, size, algo, data);
+	if (!start)
+		goto out2;
+	start = start - GENPOOL_OFFSET;	/* pool address -> muram offset */
+	memset_io(cpm_muram_addr(start), 0, size);
+	entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
+	if (!entry)
+		goto out1;
+	entry->start = start;
+	entry->size = size;
+	list_add(&entry->head, &muram_block_list);
+
+	return start;
+out1:	/* hand the block back at its biased pool address */
+	gen_pool_free(muram_pool, start + GENPOOL_OFFSET, size);
+out2:
+	return (unsigned long)-ENOMEM;
+}
+
+/**
+ * cpm_muram_addr - turn a muram offset into a virtual address
+ * @offset: muram offset to convert
+ */
+void __iomem *cpm_muram_addr(unsigned long offset)
+{
+	return muram_vbase + offset;
+}
+EXPORT_SYMBOL(cpm_muram_addr);
+
+unsigned long cpm_muram_offset(void __iomem *addr)
+{
+	return addr - (void __iomem *)muram_vbase;
+}
+EXPORT_SYMBOL(cpm_muram_offset);
+
+/**
+ * cpm_muram_dma - turn a muram virtual address into a DMA address
+ * @addr: virtual address from cpm_muram_addr() to convert
+ */
+dma_addr_t cpm_muram_dma(void __iomem *addr)
+{
+	return muram_pbase + ((u8 __iomem *)addr - muram_vbase);
+}
+EXPORT_SYMBOL(cpm_muram_dma);

From 302c059f2e7bac7342f912bc77ff5bd6490c8edd Mon Sep 17 00:00:00 2001
From: Zhao Qiang <qiang.zhao@freescale.com>
Date: Mon, 30 Nov 2015 10:48:56 +0800
Subject: [PATCH 115/149] QE: use subsys_initcall to init qe

Use subsys_initcall to initialize QE so that it also works on the ARM
architecture. Remove the qe_reset() calls from the PowerPC platform files.

Signed-off-by: Zhao Qiang <qiang.zhao@freescale.com>
Signed-off-by: Scott Wood <scottwood@freescale.com>
---
 arch/powerpc/platforms/83xx/km83xx.c      |  2 --
 arch/powerpc/platforms/83xx/mpc832x_mds.c |  2 --
 arch/powerpc/platforms/83xx/mpc832x_rdb.c |  2 --
 arch/powerpc/platforms/83xx/mpc836x_mds.c |  2 --
 arch/powerpc/platforms/83xx/mpc836x_rdk.c |  3 ---
 arch/powerpc/platforms/85xx/common.c      |  1 -
 arch/powerpc/sysdev/qe_lib/qe.c           | 13 +++++++++++++
 7 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/platforms/83xx/km83xx.c b/arch/powerpc/platforms/83xx/km83xx.c
index bf4c4473abb9..ae1115813844 100644
--- a/arch/powerpc/platforms/83xx/km83xx.c
+++ b/arch/powerpc/platforms/83xx/km83xx.c
@@ -136,8 +136,6 @@ static void __init mpc83xx_km_setup_arch(void)
 	mpc83xx_setup_pci();
 
 #ifdef CONFIG_QUICC_ENGINE
-	qe_reset();
-
 	np = of_find_node_by_name(NULL, "par_io");
 	if (np != NULL) {
 		par_io_init(np);
diff --git a/arch/powerpc/platforms/83xx/mpc832x_mds.c b/arch/powerpc/platforms/83xx/mpc832x_mds.c
index 8d762203eeff..aacc43f64246 100644
--- a/arch/powerpc/platforms/83xx/mpc832x_mds.c
+++ b/arch/powerpc/platforms/83xx/mpc832x_mds.c
@@ -74,8 +74,6 @@ static void __init mpc832x_sys_setup_arch(void)
 	mpc83xx_setup_pci();
 
 #ifdef CONFIG_QUICC_ENGINE
-	qe_reset();
-
 	if ((np = of_find_node_by_name(NULL, "par_io")) != NULL) {
 		par_io_init(np);
 		of_node_put(np);
diff --git a/arch/powerpc/platforms/83xx/mpc832x_rdb.c b/arch/powerpc/platforms/83xx/mpc832x_rdb.c
index eff5baabc3fb..0c7a43e1c390 100644
--- a/arch/powerpc/platforms/83xx/mpc832x_rdb.c
+++ b/arch/powerpc/platforms/83xx/mpc832x_rdb.c
@@ -203,8 +203,6 @@ static void __init mpc832x_rdb_setup_arch(void)
 	mpc83xx_setup_pci();
 
 #ifdef CONFIG_QUICC_ENGINE
-	qe_reset();
-
 	if ((np = of_find_node_by_name(NULL, "par_io")) != NULL) {
 		par_io_init(np);
 		of_node_put(np);
diff --git a/arch/powerpc/platforms/83xx/mpc836x_mds.c b/arch/powerpc/platforms/83xx/mpc836x_mds.c
index 1a26d2f83401..eb24abdf1ae7 100644
--- a/arch/powerpc/platforms/83xx/mpc836x_mds.c
+++ b/arch/powerpc/platforms/83xx/mpc836x_mds.c
@@ -82,8 +82,6 @@ static void __init mpc836x_mds_setup_arch(void)
 	mpc83xx_setup_pci();
 
 #ifdef CONFIG_QUICC_ENGINE
-	qe_reset();
-
 	if ((np = of_find_node_by_name(NULL, "par_io")) != NULL) {
 		par_io_init(np);
 		of_node_put(np);
diff --git a/arch/powerpc/platforms/83xx/mpc836x_rdk.c b/arch/powerpc/platforms/83xx/mpc836x_rdk.c
index b63b42d11d6c..823e370ed212 100644
--- a/arch/powerpc/platforms/83xx/mpc836x_rdk.c
+++ b/arch/powerpc/platforms/83xx/mpc836x_rdk.c
@@ -35,9 +35,6 @@ static void __init mpc836x_rdk_setup_arch(void)
 		ppc_md.progress("mpc836x_rdk_setup_arch()", 0);
 
 	mpc83xx_setup_pci();
-#ifdef CONFIG_QUICC_ENGINE
-	qe_reset();
-#endif
 }
 
 /*
diff --git a/arch/powerpc/platforms/85xx/common.c b/arch/powerpc/platforms/85xx/common.c
index 23791de7b688..18bca203e01a 100644
--- a/arch/powerpc/platforms/85xx/common.c
+++ b/arch/powerpc/platforms/85xx/common.c
@@ -105,7 +105,6 @@ void __init mpc85xx_qe_init(void)
 		return;
 	}
 
-	qe_reset();
 	of_node_put(np);
 
 }
diff --git a/arch/powerpc/sysdev/qe_lib/qe.c b/arch/powerpc/sysdev/qe_lib/qe.c
index c2518cdb7ddb..88ae5c7ff4bb 100644
--- a/arch/powerpc/sysdev/qe_lib/qe.c
+++ b/arch/powerpc/sysdev/qe_lib/qe.c
@@ -671,6 +671,19 @@ unsigned int qe_get_num_of_snums(void)
 }
 EXPORT_SYMBOL(qe_get_num_of_snums);
 
+static int __init qe_init(void)
+{
+	struct device_node *np;
+
+	np = of_find_compatible_node(NULL, NULL, "fsl,qe");
+	if (!np)
+		return -ENODEV;
+	qe_reset();
+	of_node_put(np);
+	return 0;
+}
+subsys_initcall(qe_init);
+
 #if defined(CONFIG_SUSPEND) && defined(CONFIG_PPC_85xx)
 static int qe_resume(struct platform_device *ofdev)
 {

From 7aa1aa6ecec2af19d9aa85430ce3e56119e21626 Mon Sep 17 00:00:00 2001
From: Zhao Qiang <qiang.zhao@freescale.com>
Date: Mon, 30 Nov 2015 10:48:57 +0800
Subject: [PATCH 116/149] QE: Move QE from arch/powerpc to drivers/soc

LS1 has a QE and an ARM CPU.
Move QE from arch/powerpc to drivers/soc/fsl
so that it can be used on both PowerPC and ARM.

Signed-off-by: Zhao Qiang <qiang.zhao@freescale.com>
Signed-off-by: Scott Wood <scottwood@freescale.com>
---
 MAINTAINERS                                           |  5 +++--
 arch/powerpc/Kconfig                                  |  2 --
 arch/powerpc/include/asm/cpm.h                        |  2 +-
 arch/powerpc/platforms/83xx/km83xx.c                  |  4 ++--
 arch/powerpc/platforms/83xx/misc.c                    |  2 +-
 arch/powerpc/platforms/83xx/mpc832x_mds.c             |  4 ++--
 arch/powerpc/platforms/83xx/mpc832x_rdb.c             |  4 ++--
 arch/powerpc/platforms/83xx/mpc836x_mds.c             |  4 ++--
 arch/powerpc/platforms/83xx/mpc836x_rdk.c             |  4 ++--
 arch/powerpc/platforms/85xx/common.c                  |  2 +-
 arch/powerpc/platforms/85xx/corenet_generic.c         |  2 +-
 arch/powerpc/platforms/85xx/mpc85xx_mds.c             |  4 ++--
 arch/powerpc/platforms/85xx/mpc85xx_rdb.c             |  4 ++--
 arch/powerpc/platforms/85xx/twr_p102x.c               |  4 ++--
 arch/powerpc/platforms/Kconfig                        | 11 -----------
 arch/powerpc/sysdev/Makefile                          |  2 --
 arch/powerpc/sysdev/cpm_common.c                      |  2 +-
 drivers/net/ethernet/freescale/fsl_pq_mdio.c          |  2 +-
 drivers/net/ethernet/freescale/ucc_geth.c             |  8 ++++----
 drivers/net/ethernet/freescale/ucc_geth.h             |  8 ++++----
 drivers/soc/Kconfig                                   |  1 +
 drivers/soc/Makefile                                  |  1 +
 drivers/soc/fsl/Makefile                              |  6 ++++++
 .../sysdev/qe_lib => drivers/soc/fsl/qe}/Kconfig      | 11 +++++++++++
 .../sysdev/qe_lib => drivers/soc/fsl/qe}/Makefile     |  0
 .../sysdev/qe_lib => drivers/soc/fsl/qe}/gpio.c       |  2 +-
 .../powerpc/sysdev/qe_lib => drivers/soc/fsl/qe}/qe.c |  4 ++--
 .../sysdev/qe_lib => drivers/soc/fsl/qe}/qe_common.c  |  2 +-
 .../sysdev/qe_lib => drivers/soc/fsl/qe}/qe_ic.c      |  5 +++--
 .../sysdev/qe_lib => drivers/soc/fsl/qe}/qe_ic.h      |  4 ++--
 .../sysdev/qe_lib => drivers/soc/fsl/qe}/qe_io.c      |  2 +-
 .../sysdev/qe_lib => drivers/soc/fsl/qe}/ucc.c        |  6 +++---
 .../sysdev/qe_lib => drivers/soc/fsl/qe}/ucc_fast.c   |  8 ++++----
 .../sysdev/qe_lib => drivers/soc/fsl/qe}/ucc_slow.c   |  8 ++++----
 .../sysdev/qe_lib => drivers/soc/fsl/qe}/usb.c        |  4 ++--
 drivers/spi/spi-fsl-cpm.c                             |  2 +-
 drivers/tty/serial/ucc_uart.c                         |  2 +-
 drivers/usb/gadget/udc/fsl_qe_udc.c                   |  2 +-
 drivers/usb/host/fhci-hcd.c                           |  2 +-
 drivers/usb/host/fhci-hub.c                           |  2 +-
 drivers/usb/host/fhci-sched.c                         |  2 +-
 drivers/usb/host/fhci.h                               |  4 ++--
 .../include/asm => include/soc/fsl/qe}/immap_qe.h     |  0
 {arch/powerpc/include/asm => include/soc/fsl/qe}/qe.h |  2 +-
 .../include/asm => include/soc/fsl/qe}/qe_ic.h        |  0
 .../powerpc/include/asm => include/soc/fsl/qe}/ucc.h  |  4 ++--
 .../include/asm => include/soc/fsl/qe}/ucc_fast.h     |  6 +++---
 .../include/asm => include/soc/fsl/qe}/ucc_slow.h     |  6 +++---
 48 files changed, 92 insertions(+), 86 deletions(-)
 create mode 100644 drivers/soc/fsl/Makefile
 rename {arch/powerpc/sysdev/qe_lib => drivers/soc/fsl/qe}/Kconfig (54%)
 rename {arch/powerpc/sysdev/qe_lib => drivers/soc/fsl/qe}/Makefile (100%)
 rename {arch/powerpc/sysdev/qe_lib => drivers/soc/fsl/qe}/gpio.c (99%)
 rename {arch/powerpc/sysdev/qe_lib => drivers/soc/fsl/qe}/qe.c (99%)
 rename {arch/powerpc/sysdev/qe_lib => drivers/soc/fsl/qe}/qe_common.c (99%)
 rename {arch/powerpc/sysdev/qe_lib => drivers/soc/fsl/qe}/qe_ic.c (99%)
 rename {arch/powerpc/sysdev/qe_lib => drivers/soc/fsl/qe}/qe_ic.h (97%)
 rename {arch/powerpc/sysdev/qe_lib => drivers/soc/fsl/qe}/qe_io.c (99%)
 rename {arch/powerpc/sysdev/qe_lib => drivers/soc/fsl/qe}/ucc.c (98%)
 rename {arch/powerpc/sysdev/qe_lib => drivers/soc/fsl/qe}/ucc_fast.c (98%)
 rename {arch/powerpc/sysdev/qe_lib => drivers/soc/fsl/qe}/ucc_slow.c (98%)
 rename {arch/powerpc/sysdev/qe_lib => drivers/soc/fsl/qe}/usb.c (96%)
 rename {arch/powerpc/include/asm => include/soc/fsl/qe}/immap_qe.h (100%)
 rename {arch/powerpc/include/asm => include/soc/fsl/qe}/qe.h (99%)
 rename {arch/powerpc/include/asm => include/soc/fsl/qe}/qe_ic.h (100%)
 rename {arch/powerpc/include/asm => include/soc/fsl/qe}/ucc.h (96%)
 rename {arch/powerpc/include/asm => include/soc/fsl/qe}/ucc_fast.h (98%)
 rename {arch/powerpc/include/asm => include/soc/fsl/qe}/ucc_slow.h (99%)

diff --git a/MAINTAINERS b/MAINTAINERS
index 050d0e77a2cf..8099527abccf 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4489,8 +4489,9 @@ F:	include/linux/fs_enet_pd.h
 FREESCALE QUICC ENGINE LIBRARY
 L:	linuxppc-dev@lists.ozlabs.org
 S:	Orphan
-F:	arch/powerpc/sysdev/qe_lib/
-F:	arch/powerpc/include/asm/*qe.h
+F:	drivers/soc/fsl/qe/
+F:	include/soc/fsl/*qe*.h
+F:	include/soc/fsl/*ucc*.h
 
 FREESCALE USB PERIPHERAL DRIVERS
 M:	Li Yang <leoli@freescale.com>
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 6e03f85b11cd..405ce42c8ff7 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -1076,8 +1076,6 @@ source "drivers/Kconfig"
 
 source "fs/Kconfig"
 
-source "arch/powerpc/sysdev/qe_lib/Kconfig"
-
 source "lib/Kconfig"
 
 source "arch/powerpc/Kconfig.debug"
diff --git a/arch/powerpc/include/asm/cpm.h b/arch/powerpc/include/asm/cpm.h
index 0958028cf31a..2c5c5b476804 100644
--- a/arch/powerpc/include/asm/cpm.h
+++ b/arch/powerpc/include/asm/cpm.h
@@ -5,7 +5,7 @@
 #include <linux/types.h>
 #include <linux/errno.h>
 #include <linux/of.h>
-#include <asm/qe.h>
+#include <soc/fsl/qe/qe.h>
 
 /*
  * SPI Parameter RAM common to QE and CPM.
diff --git a/arch/powerpc/platforms/83xx/km83xx.c b/arch/powerpc/platforms/83xx/km83xx.c
index ae1115813844..4bc6bbbe9ada 100644
--- a/arch/powerpc/platforms/83xx/km83xx.c
+++ b/arch/powerpc/platforms/83xx/km83xx.c
@@ -37,8 +37,8 @@
 #include <asm/udbg.h>
 #include <sysdev/fsl_soc.h>
 #include <sysdev/fsl_pci.h>
-#include <asm/qe.h>
-#include <asm/qe_ic.h>
+#include <soc/fsl/qe/qe.h>
+#include <soc/fsl/qe/qe_ic.h>
 
 #include "mpc83xx.h"
 
diff --git a/arch/powerpc/platforms/83xx/misc.c b/arch/powerpc/platforms/83xx/misc.c
index ef9d01a049c1..7e923cad56cf 100644
--- a/arch/powerpc/platforms/83xx/misc.c
+++ b/arch/powerpc/platforms/83xx/misc.c
@@ -17,7 +17,7 @@
 #include <asm/io.h>
 #include <asm/hw_irq.h>
 #include <asm/ipic.h>
-#include <asm/qe_ic.h>
+#include <soc/fsl/qe/qe_ic.h>
 #include <sysdev/fsl_soc.h>
 #include <sysdev/fsl_pci.h>
 
diff --git a/arch/powerpc/platforms/83xx/mpc832x_mds.c b/arch/powerpc/platforms/83xx/mpc832x_mds.c
index aacc43f64246..a973b2ae5df6 100644
--- a/arch/powerpc/platforms/83xx/mpc832x_mds.c
+++ b/arch/powerpc/platforms/83xx/mpc832x_mds.c
@@ -36,8 +36,8 @@
 #include <asm/udbg.h>
 #include <sysdev/fsl_soc.h>
 #include <sysdev/fsl_pci.h>
-#include <asm/qe.h>
-#include <asm/qe_ic.h>
+#include <soc/fsl/qe/qe.h>
+#include <soc/fsl/qe/qe_ic.h>
 
 #include "mpc83xx.h"
 
diff --git a/arch/powerpc/platforms/83xx/mpc832x_rdb.c b/arch/powerpc/platforms/83xx/mpc832x_rdb.c
index 0c7a43e1c390..ea2b87d202ca 100644
--- a/arch/powerpc/platforms/83xx/mpc832x_rdb.c
+++ b/arch/powerpc/platforms/83xx/mpc832x_rdb.c
@@ -25,8 +25,8 @@
 #include <asm/time.h>
 #include <asm/ipic.h>
 #include <asm/udbg.h>
-#include <asm/qe.h>
-#include <asm/qe_ic.h>
+#include <soc/fsl/qe/qe.h>
+#include <soc/fsl/qe/qe_ic.h>
 #include <sysdev/fsl_soc.h>
 #include <sysdev/fsl_pci.h>
 
diff --git a/arch/powerpc/platforms/83xx/mpc836x_mds.c b/arch/powerpc/platforms/83xx/mpc836x_mds.c
index eb24abdf1ae7..dd70b85f56d4 100644
--- a/arch/powerpc/platforms/83xx/mpc836x_mds.c
+++ b/arch/powerpc/platforms/83xx/mpc836x_mds.c
@@ -44,8 +44,8 @@
 #include <sysdev/fsl_soc.h>
 #include <sysdev/fsl_pci.h>
 #include <sysdev/simple_gpio.h>
-#include <asm/qe.h>
-#include <asm/qe_ic.h>
+#include <soc/fsl/qe/qe.h>
+#include <soc/fsl/qe/qe_ic.h>
 
 #include "mpc83xx.h"
 
diff --git a/arch/powerpc/platforms/83xx/mpc836x_rdk.c b/arch/powerpc/platforms/83xx/mpc836x_rdk.c
index 823e370ed212..4cd7153a6c88 100644
--- a/arch/powerpc/platforms/83xx/mpc836x_rdk.c
+++ b/arch/powerpc/platforms/83xx/mpc836x_rdk.c
@@ -20,8 +20,8 @@
 #include <asm/time.h>
 #include <asm/ipic.h>
 #include <asm/udbg.h>
-#include <asm/qe.h>
-#include <asm/qe_ic.h>
+#include <soc/fsl/qe/qe.h>
+#include <soc/fsl/qe/qe_ic.h>
 #include <sysdev/fsl_soc.h>
 #include <sysdev/fsl_pci.h>
 
diff --git a/arch/powerpc/platforms/85xx/common.c b/arch/powerpc/platforms/85xx/common.c
index 18bca203e01a..949f22c86e61 100644
--- a/arch/powerpc/platforms/85xx/common.c
+++ b/arch/powerpc/platforms/85xx/common.c
@@ -9,7 +9,7 @@
 #include <linux/of_irq.h>
 #include <linux/of_platform.h>
 
-#include <asm/qe.h>
+#include <soc/fsl/qe/qe.h>
 #include <sysdev/cpm2_pic.h>
 
 #include "mpc85xx.h"
diff --git a/arch/powerpc/platforms/85xx/corenet_generic.c b/arch/powerpc/platforms/85xx/corenet_generic.c
index 46d05c94add6..a2b0bc859de0 100644
--- a/arch/powerpc/platforms/85xx/corenet_generic.c
+++ b/arch/powerpc/platforms/85xx/corenet_generic.c
@@ -27,7 +27,7 @@
 #include <asm/udbg.h>
 #include <asm/mpic.h>
 #include <asm/ehv_pic.h>
-#include <asm/qe_ic.h>
+#include <soc/fsl/qe/qe_ic.h>
 
 #include <linux/of_platform.h>
 #include <sysdev/fsl_soc.h>
diff --git a/arch/powerpc/platforms/85xx/mpc85xx_mds.c b/arch/powerpc/platforms/85xx/mpc85xx_mds.c
index f0be439ceaaa..f61cbe235581 100644
--- a/arch/powerpc/platforms/85xx/mpc85xx_mds.c
+++ b/arch/powerpc/platforms/85xx/mpc85xx_mds.c
@@ -48,8 +48,8 @@
 #include <sysdev/fsl_soc.h>
 #include <sysdev/fsl_pci.h>
 #include <sysdev/simple_gpio.h>
-#include <asm/qe.h>
-#include <asm/qe_ic.h>
+#include <soc/fsl/qe/qe.h>
+#include <soc/fsl/qe/qe_ic.h>
 #include <asm/mpic.h>
 #include <asm/swiotlb.h>
 #include "smp.h"
diff --git a/arch/powerpc/platforms/85xx/mpc85xx_rdb.c b/arch/powerpc/platforms/85xx/mpc85xx_rdb.c
index 50dcc00a0f5a..3f4dad133338 100644
--- a/arch/powerpc/platforms/85xx/mpc85xx_rdb.c
+++ b/arch/powerpc/platforms/85xx/mpc85xx_rdb.c
@@ -26,8 +26,8 @@
 #include <asm/prom.h>
 #include <asm/udbg.h>
 #include <asm/mpic.h>
-#include <asm/qe.h>
-#include <asm/qe_ic.h>
+#include <soc/fsl/qe/qe.h>
+#include <soc/fsl/qe/qe_ic.h>
 
 #include <sysdev/fsl_soc.h>
 #include <sysdev/fsl_pci.h>
diff --git a/arch/powerpc/platforms/85xx/twr_p102x.c b/arch/powerpc/platforms/85xx/twr_p102x.c
index 892e613519cc..71bc255b4324 100644
--- a/arch/powerpc/platforms/85xx/twr_p102x.c
+++ b/arch/powerpc/platforms/85xx/twr_p102x.c
@@ -22,8 +22,8 @@
 #include <asm/pci-bridge.h>
 #include <asm/udbg.h>
 #include <asm/mpic.h>
-#include <asm/qe.h>
-#include <asm/qe_ic.h>
+#include <soc/fsl/qe/qe.h>
+#include <soc/fsl/qe/qe_ic.h>
 
 #include <sysdev/fsl_soc.h>
 #include <sysdev/fsl_pci.h>
diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig
index 57069eb8f093..46a3533d3acb 100644
--- a/arch/powerpc/platforms/Kconfig
+++ b/arch/powerpc/platforms/Kconfig
@@ -272,17 +272,6 @@ config TAU_AVERAGE
 
 	  If in doubt, say N here.
 
-config QUICC_ENGINE
-	bool "Freescale QUICC Engine (QE) Support"
-	depends on FSL_SOC && PPC32
-	select GENERIC_ALLOCATOR
-	select CRC32
-	help
-	  The QUICC Engine (QE) is a new generation of communications
-	  coprocessors on Freescale embedded CPUs (akin to CPM in older chips).
-	  Selecting this option means that you wish to build a kernel
-	  for a machine with a QE coprocessor.
-
 config QE_GPIO
 	bool "QE GPIO support"
 	depends on QUICC_ENGINE
diff --git a/arch/powerpc/sysdev/Makefile b/arch/powerpc/sysdev/Makefile
index f1d47498ddf2..bd6bd729969c 100644
--- a/arch/powerpc/sysdev/Makefile
+++ b/arch/powerpc/sysdev/Makefile
@@ -26,8 +26,6 @@ obj-$(CONFIG_FSL_85XX_CACHE_SRAM)	+= fsl_85xx_l2ctlr.o fsl_85xx_cache_sram.o
 obj-$(CONFIG_SIMPLE_GPIO)	+= simple_gpio.o
 obj-$(CONFIG_FSL_RIO)		+= fsl_rio.o fsl_rmu.o
 obj-$(CONFIG_TSI108_BRIDGE)	+= tsi108_pci.o tsi108_dev.o
-obj-$(CONFIG_QUICC_ENGINE)	+= qe_lib/
-obj-$(CONFIG_CPM)		+= qe_lib/
 mv64x60-$(CONFIG_PCI)		+= mv64x60_pci.o
 obj-$(CONFIG_MV64X60)		+= $(mv64x60-y) mv64x60_pic.o mv64x60_dev.o \
 				   mv64x60_udbg.o
diff --git a/arch/powerpc/sysdev/cpm_common.c b/arch/powerpc/sysdev/cpm_common.c
index 6993aa8e7242..9d32465eddb1 100644
--- a/arch/powerpc/sysdev/cpm_common.c
+++ b/arch/powerpc/sysdev/cpm_common.c
@@ -28,7 +28,7 @@
 #include <asm/udbg.h>
 #include <asm/io.h>
 #include <asm/cpm.h>
-#include <asm/qe.h>
+#include <soc/fsl/qe/qe.h>
 
 #include <mm/mmu_decl.h>
 
diff --git a/drivers/net/ethernet/freescale/fsl_pq_mdio.c b/drivers/net/ethernet/freescale/fsl_pq_mdio.c
index 55c36230e176..d0a6fa6d4f3e 100644
--- a/drivers/net/ethernet/freescale/fsl_pq_mdio.c
+++ b/drivers/net/ethernet/freescale/fsl_pq_mdio.c
@@ -29,7 +29,7 @@
 
 #include <asm/io.h>
 #if IS_ENABLED(CONFIG_UCC_GETH)
-#include <asm/ucc.h>	/* for ucc_set_qe_mux_mii_mng() */
+#include <soc/fsl/qe/ucc.h>
 #endif
 
 #include "gianfar.h"
diff --git a/drivers/net/ethernet/freescale/ucc_geth.c b/drivers/net/ethernet/freescale/ucc_geth.c
index 650f7888e32b..c30b72e02a1a 100644
--- a/drivers/net/ethernet/freescale/ucc_geth.c
+++ b/drivers/net/ethernet/freescale/ucc_geth.c
@@ -40,10 +40,10 @@
 #include <asm/uaccess.h>
 #include <asm/irq.h>
 #include <asm/io.h>
-#include <asm/immap_qe.h>
-#include <asm/qe.h>
-#include <asm/ucc.h>
-#include <asm/ucc_fast.h>
+#include <soc/fsl/qe/immap_qe.h>
+#include <soc/fsl/qe/qe.h>
+#include <soc/fsl/qe/ucc.h>
+#include <soc/fsl/qe/ucc_fast.h>
 #include <asm/machdep.h>
 
 #include "ucc_geth.h"
diff --git a/drivers/net/ethernet/freescale/ucc_geth.h b/drivers/net/ethernet/freescale/ucc_geth.h
index 75f337163ce3..5da19b440a6a 100644
--- a/drivers/net/ethernet/freescale/ucc_geth.h
+++ b/drivers/net/ethernet/freescale/ucc_geth.h
@@ -22,11 +22,11 @@
 #include <linux/list.h>
 #include <linux/if_ether.h>
 
-#include <asm/immap_qe.h>
-#include <asm/qe.h>
+#include <soc/fsl/qe/immap_qe.h>
+#include <soc/fsl/qe/qe.h>
 
-#include <asm/ucc.h>
-#include <asm/ucc_fast.h>
+#include <soc/fsl/qe/ucc.h>
+#include <soc/fsl/qe/ucc_fast.h>
 
 #define DRV_DESC "QE UCC Gigabit Ethernet Controller"
 #define DRV_NAME "ucc_geth"
diff --git a/drivers/soc/Kconfig b/drivers/soc/Kconfig
index 4e853ed2c82b..ad0df75fab6e 100644
--- a/drivers/soc/Kconfig
+++ b/drivers/soc/Kconfig
@@ -1,6 +1,7 @@
 menu "SOC (System On Chip) specific Drivers"
 
 source "drivers/soc/brcmstb/Kconfig"
+source "drivers/soc/fsl/qe/Kconfig"
 source "drivers/soc/mediatek/Kconfig"
 source "drivers/soc/qcom/Kconfig"
 source "drivers/soc/rockchip/Kconfig"
diff --git a/drivers/soc/Makefile b/drivers/soc/Makefile
index f2ba2e932ae1..9536b804424a 100644
--- a/drivers/soc/Makefile
+++ b/drivers/soc/Makefile
@@ -4,6 +4,7 @@
 
 obj-$(CONFIG_SOC_BRCMSTB)	+= brcmstb/
 obj-$(CONFIG_MACH_DOVE)		+= dove/
+obj-y				+= fsl/
 obj-$(CONFIG_ARCH_MEDIATEK)	+= mediatek/
 obj-$(CONFIG_ARCH_QCOM)		+= qcom/
 obj-$(CONFIG_ARCH_ROCKCHIP)		+= rockchip/
diff --git a/drivers/soc/fsl/Makefile b/drivers/soc/fsl/Makefile
new file mode 100644
index 000000000000..203307fd92c1
--- /dev/null
+++ b/drivers/soc/fsl/Makefile
@@ -0,0 +1,6 @@
+#
+# Makefile for the Linux Kernel SOC fsl specific device drivers
+#
+
+obj-$(CONFIG_QUICC_ENGINE)		+= qe/
+obj-$(CONFIG_CPM)			+= qe/
diff --git a/arch/powerpc/sysdev/qe_lib/Kconfig b/drivers/soc/fsl/qe/Kconfig
similarity index 54%
rename from arch/powerpc/sysdev/qe_lib/Kconfig
rename to drivers/soc/fsl/qe/Kconfig
index 3c251993bacd..20978f2058a6 100644
--- a/arch/powerpc/sysdev/qe_lib/Kconfig
+++ b/drivers/soc/fsl/qe/Kconfig
@@ -2,6 +2,17 @@
 # QE Communication options
 #
 
+config QUICC_ENGINE
+	bool "Freescale QUICC Engine (QE) Support"
+	depends on FSL_SOC && PPC32
+	select GENERIC_ALLOCATOR
+	select CRC32
+	help
+	  The QUICC Engine (QE) is a new generation of communications
+	  coprocessors on Freescale embedded CPUs (akin to CPM in older chips).
+	  Selecting this option means that you wish to build a kernel
+	  for a machine with a QE coprocessor.
+
 config UCC_SLOW
 	bool
 	default y if SERIAL_QE
diff --git a/arch/powerpc/sysdev/qe_lib/Makefile b/drivers/soc/fsl/qe/Makefile
similarity index 100%
rename from arch/powerpc/sysdev/qe_lib/Makefile
rename to drivers/soc/fsl/qe/Makefile
diff --git a/arch/powerpc/sysdev/qe_lib/gpio.c b/drivers/soc/fsl/qe/gpio.c
similarity index 99%
rename from arch/powerpc/sysdev/qe_lib/gpio.c
rename to drivers/soc/fsl/qe/gpio.c
index 521e67a49dc4..aa5c11acf212 100644
--- a/arch/powerpc/sysdev/qe_lib/gpio.c
+++ b/drivers/soc/fsl/qe/gpio.c
@@ -21,7 +21,7 @@
 #include <linux/gpio.h>
 #include <linux/slab.h>
 #include <linux/export.h>
-#include <asm/qe.h>
+#include <soc/fsl/qe/qe.h>
 
 struct qe_gpio_chip {
 	struct of_mm_gpio_chip mm_gc;
diff --git a/arch/powerpc/sysdev/qe_lib/qe.c b/drivers/soc/fsl/qe/qe.c
similarity index 99%
rename from arch/powerpc/sysdev/qe_lib/qe.c
rename to drivers/soc/fsl/qe/qe.c
index 88ae5c7ff4bb..709fc63809e5 100644
--- a/arch/powerpc/sysdev/qe_lib/qe.c
+++ b/drivers/soc/fsl/qe/qe.c
@@ -31,8 +31,8 @@
 #include <asm/irq.h>
 #include <asm/page.h>
 #include <asm/pgtable.h>
-#include <asm/immap_qe.h>
-#include <asm/qe.h>
+#include <soc/fsl/qe/immap_qe.h>
+#include <soc/fsl/qe/qe.h>
 #include <asm/prom.h>
 #include <asm/rheap.h>
 
diff --git a/arch/powerpc/sysdev/qe_lib/qe_common.c b/drivers/soc/fsl/qe/qe_common.c
similarity index 99%
rename from arch/powerpc/sysdev/qe_lib/qe_common.c
rename to drivers/soc/fsl/qe/qe_common.c
index b90043f1503b..419fa5b7be4d 100644
--- a/arch/powerpc/sysdev/qe_lib/qe_common.c
+++ b/drivers/soc/fsl/qe/qe_common.c
@@ -26,7 +26,7 @@
 #include <linux/of_address.h>
 #include <linux/slab.h>
 #include <linux/io.h>
-#include <asm/qe.h>
+#include <soc/fsl/qe/qe.h>
 
 static struct gen_pool *muram_pool;
 static spinlock_t cpm_muram_lock;
diff --git a/arch/powerpc/sysdev/qe_lib/qe_ic.c b/drivers/soc/fsl/qe/qe_ic.c
similarity index 99%
rename from arch/powerpc/sysdev/qe_lib/qe_ic.c
rename to drivers/soc/fsl/qe/qe_ic.c
index ef36f16f9f6f..b77d01ff8330 100644
--- a/arch/powerpc/sysdev/qe_lib/qe_ic.c
+++ b/drivers/soc/fsl/qe/qe_ic.c
@@ -14,6 +14,8 @@
  * option) any later version.
  */
 
+#include <linux/of_irq.h>
+#include <linux/of_address.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/errno.h>
@@ -26,8 +28,7 @@
 #include <linux/spinlock.h>
 #include <asm/irq.h>
 #include <asm/io.h>
-#include <asm/prom.h>
-#include <asm/qe_ic.h>
+#include <soc/fsl/qe/qe_ic.h>
 
 #include "qe_ic.h"
 
diff --git a/arch/powerpc/sysdev/qe_lib/qe_ic.h b/drivers/soc/fsl/qe/qe_ic.h
similarity index 97%
rename from arch/powerpc/sysdev/qe_lib/qe_ic.h
rename to drivers/soc/fsl/qe/qe_ic.h
index efef7ab9b753..926a2ed42319 100644
--- a/arch/powerpc/sysdev/qe_lib/qe_ic.h
+++ b/drivers/soc/fsl/qe/qe_ic.h
@@ -1,5 +1,5 @@
 /*
- * arch/powerpc/sysdev/qe_lib/qe_ic.h
+ * drivers/soc/fsl/qe/qe_ic.h
  *
  * QUICC ENGINE Interrupt Controller Header
  *
@@ -16,7 +16,7 @@
 #ifndef _POWERPC_SYSDEV_QE_IC_H
 #define _POWERPC_SYSDEV_QE_IC_H
 
-#include <asm/qe_ic.h>
+#include <soc/fsl/qe/qe_ic.h>
 
 #define NR_QE_IC_INTS		64
 
diff --git a/arch/powerpc/sysdev/qe_lib/qe_io.c b/drivers/soc/fsl/qe/qe_io.c
similarity index 99%
rename from arch/powerpc/sysdev/qe_lib/qe_io.c
rename to drivers/soc/fsl/qe/qe_io.c
index 7ea0174f6d3d..7ae59abc7863 100644
--- a/arch/powerpc/sysdev/qe_lib/qe_io.c
+++ b/drivers/soc/fsl/qe/qe_io.c
@@ -21,7 +21,7 @@
 #include <linux/ioport.h>
 
 #include <asm/io.h>
-#include <asm/qe.h>
+#include <soc/fsl/qe/qe.h>
 #include <asm/prom.h>
 #include <sysdev/fsl_soc.h>
 
diff --git a/arch/powerpc/sysdev/qe_lib/ucc.c b/drivers/soc/fsl/qe/ucc.c
similarity index 98%
rename from arch/powerpc/sysdev/qe_lib/ucc.c
rename to drivers/soc/fsl/qe/ucc.c
index 621575b7e84a..b59d3358f9bd 100644
--- a/arch/powerpc/sysdev/qe_lib/ucc.c
+++ b/drivers/soc/fsl/qe/ucc.c
@@ -21,9 +21,9 @@
 
 #include <asm/irq.h>
 #include <asm/io.h>
-#include <asm/immap_qe.h>
-#include <asm/qe.h>
-#include <asm/ucc.h>
+#include <soc/fsl/qe/immap_qe.h>
+#include <soc/fsl/qe/qe.h>
+#include <soc/fsl/qe/ucc.h>
 
 int ucc_set_qe_mux_mii_mng(unsigned int ucc_num)
 {
diff --git a/arch/powerpc/sysdev/qe_lib/ucc_fast.c b/drivers/soc/fsl/qe/ucc_fast.c
similarity index 98%
rename from arch/powerpc/sysdev/qe_lib/ucc_fast.c
rename to drivers/soc/fsl/qe/ucc_fast.c
index 65aaf15032ae..a7689310fe40 100644
--- a/arch/powerpc/sysdev/qe_lib/ucc_fast.c
+++ b/drivers/soc/fsl/qe/ucc_fast.c
@@ -21,11 +21,11 @@
 #include <linux/export.h>
 
 #include <asm/io.h>
-#include <asm/immap_qe.h>
-#include <asm/qe.h>
+#include <soc/fsl/qe/immap_qe.h>
+#include <soc/fsl/qe/qe.h>
 
-#include <asm/ucc.h>
-#include <asm/ucc_fast.h>
+#include <soc/fsl/qe/ucc.h>
+#include <soc/fsl/qe/ucc_fast.h>
 
 void ucc_fast_dump_regs(struct ucc_fast_private * uccf)
 {
diff --git a/arch/powerpc/sysdev/qe_lib/ucc_slow.c b/drivers/soc/fsl/qe/ucc_slow.c
similarity index 98%
rename from arch/powerpc/sysdev/qe_lib/ucc_slow.c
rename to drivers/soc/fsl/qe/ucc_slow.c
index 5f91628209eb..9334bdbd9b30 100644
--- a/arch/powerpc/sysdev/qe_lib/ucc_slow.c
+++ b/drivers/soc/fsl/qe/ucc_slow.c
@@ -21,11 +21,11 @@
 #include <linux/export.h>
 
 #include <asm/io.h>
-#include <asm/immap_qe.h>
-#include <asm/qe.h>
+#include <soc/fsl/qe/immap_qe.h>
+#include <soc/fsl/qe/qe.h>
 
-#include <asm/ucc.h>
-#include <asm/ucc_slow.h>
+#include <soc/fsl/qe/ucc.h>
+#include <soc/fsl/qe/ucc_slow.h>
 
 u32 ucc_slow_get_qe_cr_subblock(int uccs_num)
 {
diff --git a/arch/powerpc/sysdev/qe_lib/usb.c b/drivers/soc/fsl/qe/usb.c
similarity index 96%
rename from arch/powerpc/sysdev/qe_lib/usb.c
rename to drivers/soc/fsl/qe/usb.c
index 27f23bd15eb6..111f7ab80f04 100644
--- a/arch/powerpc/sysdev/qe_lib/usb.c
+++ b/drivers/soc/fsl/qe/usb.c
@@ -17,8 +17,8 @@
 #include <linux/errno.h>
 #include <linux/export.h>
 #include <linux/io.h>
-#include <asm/immap_qe.h>
-#include <asm/qe.h>
+#include <soc/fsl/qe/immap_qe.h>
+#include <soc/fsl/qe/qe.h>
 
 int qe_usb_clock_set(enum qe_clock clk, int rate)
 {
diff --git a/drivers/spi/spi-fsl-cpm.c b/drivers/spi/spi-fsl-cpm.c
index 896add8cfd3b..8f7b26ec181e 100644
--- a/drivers/spi/spi-fsl-cpm.c
+++ b/drivers/spi/spi-fsl-cpm.c
@@ -16,7 +16,7 @@
  * option) any later version.
  */
 #include <asm/cpm.h>
-#include <asm/qe.h>
+#include <soc/fsl/qe/qe.h>
 #include <linux/dma-mapping.h>
 #include <linux/fsl_devices.h>
 #include <linux/kernel.h>
diff --git a/drivers/tty/serial/ucc_uart.c b/drivers/tty/serial/ucc_uart.c
index 73190f5d2832..1a7dc3c590b1 100644
--- a/drivers/tty/serial/ucc_uart.c
+++ b/drivers/tty/serial/ucc_uart.c
@@ -31,7 +31,7 @@
 #include <linux/dma-mapping.h>
 
 #include <linux/fs_uart_pd.h>
-#include <asm/ucc_slow.h>
+#include <soc/fsl/qe/ucc_slow.h>
 
 #include <linux/firmware.h>
 #include <asm/reg.h>
diff --git a/drivers/usb/gadget/udc/fsl_qe_udc.c b/drivers/usb/gadget/udc/fsl_qe_udc.c
index 5fb6f8b4f0b4..53c0692f1b09 100644
--- a/drivers/usb/gadget/udc/fsl_qe_udc.c
+++ b/drivers/usb/gadget/udc/fsl_qe_udc.c
@@ -38,7 +38,7 @@
 #include <linux/usb/ch9.h>
 #include <linux/usb/gadget.h>
 #include <linux/usb/otg.h>
-#include <asm/qe.h>
+#include <soc/fsl/qe/qe.h>
 #include <asm/cpm.h>
 #include <asm/dma.h>
 #include <asm/reg.h>
diff --git a/drivers/usb/host/fhci-hcd.c b/drivers/usb/host/fhci-hcd.c
index c6cebb96fd21..0960f41f945a 100644
--- a/drivers/usb/host/fhci-hcd.c
+++ b/drivers/usb/host/fhci-hcd.c
@@ -31,7 +31,7 @@
 #include <linux/of_platform.h>
 #include <linux/of_gpio.h>
 #include <linux/slab.h>
-#include <asm/qe.h>
+#include <soc/fsl/qe/qe.h>
 #include <asm/fsl_gtm.h>
 #include "fhci.h"
 
diff --git a/drivers/usb/host/fhci-hub.c b/drivers/usb/host/fhci-hub.c
index 3bacdd7befe9..60d55eb3de0d 100644
--- a/drivers/usb/host/fhci-hub.c
+++ b/drivers/usb/host/fhci-hub.c
@@ -24,7 +24,7 @@
 #include <linux/usb.h>
 #include <linux/usb/hcd.h>
 #include <linux/gpio.h>
-#include <asm/qe.h>
+#include <soc/fsl/qe/qe.h>
 #include "fhci.h"
 
 /* virtual root hub specific descriptor */
diff --git a/drivers/usb/host/fhci-sched.c b/drivers/usb/host/fhci-sched.c
index 95ca5986e672..a9609a336efe 100644
--- a/drivers/usb/host/fhci-sched.c
+++ b/drivers/usb/host/fhci-sched.c
@@ -25,7 +25,7 @@
 #include <linux/io.h>
 #include <linux/usb.h>
 #include <linux/usb/hcd.h>
-#include <asm/qe.h>
+#include <soc/fsl/qe/qe.h>
 #include <asm/fsl_gtm.h>
 #include "fhci.h"
 
diff --git a/drivers/usb/host/fhci.h b/drivers/usb/host/fhci.h
index 154e6a007727..3fc82c1c3c73 100644
--- a/drivers/usb/host/fhci.h
+++ b/drivers/usb/host/fhci.h
@@ -27,8 +27,8 @@
 #include <linux/io.h>
 #include <linux/usb.h>
 #include <linux/usb/hcd.h>
-#include <asm/qe.h>
-#include <asm/immap_qe.h>
+#include <soc/fsl/qe/qe.h>
+#include <soc/fsl/qe/immap_qe.h>
 
 #define USB_CLOCK	48000000
 
diff --git a/arch/powerpc/include/asm/immap_qe.h b/include/soc/fsl/qe/immap_qe.h
similarity index 100%
rename from arch/powerpc/include/asm/immap_qe.h
rename to include/soc/fsl/qe/immap_qe.h
diff --git a/arch/powerpc/include/asm/qe.h b/include/soc/fsl/qe/qe.h
similarity index 99%
rename from arch/powerpc/include/asm/qe.h
rename to include/soc/fsl/qe/qe.h
index ceeaf91854b5..c7fa36c335c9 100644
--- a/arch/powerpc/include/asm/qe.h
+++ b/include/soc/fsl/qe/qe.h
@@ -22,7 +22,7 @@
 #include <linux/errno.h>
 #include <linux/err.h>
 #include <asm/cpm.h>
-#include <asm/immap_qe.h>
+#include <soc/fsl/qe/immap_qe.h>
 #include <linux/of.h>
 #include <linux/of_address.h>
 #include <linux/types.h>
diff --git a/arch/powerpc/include/asm/qe_ic.h b/include/soc/fsl/qe/qe_ic.h
similarity index 100%
rename from arch/powerpc/include/asm/qe_ic.h
rename to include/soc/fsl/qe/qe_ic.h
diff --git a/arch/powerpc/include/asm/ucc.h b/include/soc/fsl/qe/ucc.h
similarity index 96%
rename from arch/powerpc/include/asm/ucc.h
rename to include/soc/fsl/qe/ucc.h
index 6927ac26516e..894f14cbb044 100644
--- a/arch/powerpc/include/asm/ucc.h
+++ b/include/soc/fsl/qe/ucc.h
@@ -15,8 +15,8 @@
 #ifndef __UCC_H__
 #define __UCC_H__
 
-#include <asm/immap_qe.h>
-#include <asm/qe.h>
+#include <soc/fsl/qe/immap_qe.h>
+#include <soc/fsl/qe/qe.h>
 
 #define STATISTICS
 
diff --git a/arch/powerpc/include/asm/ucc_fast.h b/include/soc/fsl/qe/ucc_fast.h
similarity index 98%
rename from arch/powerpc/include/asm/ucc_fast.h
rename to include/soc/fsl/qe/ucc_fast.h
index 72ea9bab07df..df8ea7958c63 100644
--- a/arch/powerpc/include/asm/ucc_fast.h
+++ b/include/soc/fsl/qe/ucc_fast.h
@@ -16,10 +16,10 @@
 
 #include <linux/kernel.h>
 
-#include <asm/immap_qe.h>
-#include <asm/qe.h>
+#include <soc/fsl/qe/immap_qe.h>
+#include <soc/fsl/qe/qe.h>
 
-#include <asm/ucc.h>
+#include <soc/fsl/qe/ucc.h>
 
 /* Receive BD's status */
 #define R_E	0x80000000	/* buffer empty */
diff --git a/arch/powerpc/include/asm/ucc_slow.h b/include/soc/fsl/qe/ucc_slow.h
similarity index 99%
rename from arch/powerpc/include/asm/ucc_slow.h
rename to include/soc/fsl/qe/ucc_slow.h
index 233ef5fe5fde..6c0573a0825c 100644
--- a/arch/powerpc/include/asm/ucc_slow.h
+++ b/include/soc/fsl/qe/ucc_slow.h
@@ -17,10 +17,10 @@
 
 #include <linux/kernel.h>
 
-#include <asm/immap_qe.h>
-#include <asm/qe.h>
+#include <soc/fsl/qe/immap_qe.h>
+#include <soc/fsl/qe/qe.h>
 
-#include <asm/ucc.h>
+#include <soc/fsl/qe/ucc.h>
 
 /* transmit BD's status */
 #define T_R	0x80000000	/* ready bit */

From 230dd6059a97965de8464db293c3e852da395986 Mon Sep 17 00:00:00 2001
From: Harninder Rai <harninder.rai@freescale.com>
Date: Thu, 5 Nov 2015 11:15:59 +0800
Subject: [PATCH 117/149] powerpc/fsl: Add PCI node in device tree of
 bsc9132qds

Signed-off-by: Harninder Rai <harninder.rai@freescale.com>
Signed-off-by: Minghuan Lian <Minghuan.Lian@freescale.com>
Signed-off-by: Hou Zhiqiang <B48286@freescale.com>
Signed-off-by: Scott Wood <scottwood@freescale.com>
---
 arch/powerpc/boot/dts/fsl/bsc9132qds.dts      | 15 ++++++++++
 arch/powerpc/boot/dts/fsl/bsc9132si-post.dtsi | 28 +++++++++++++++++++
 arch/powerpc/boot/dts/fsl/bsc9132si-pre.dtsi  |  1 +
 3 files changed, 44 insertions(+)

diff --git a/arch/powerpc/boot/dts/fsl/bsc9132qds.dts b/arch/powerpc/boot/dts/fsl/bsc9132qds.dts
index 70882ade606d..56e6f1337e96 100644
--- a/arch/powerpc/boot/dts/fsl/bsc9132qds.dts
+++ b/arch/powerpc/boot/dts/fsl/bsc9132qds.dts
@@ -29,6 +29,21 @@
 	soc: soc@ff700000 {
 		ranges = <0x0 0x0 0xff700000 0x100000>;
 	};
+
+	pci0: pcie@ff70a000 {
+		reg = <0 0xff70a000 0 0x1000>;
+		ranges = <0x2000000 0x0 0x90000000 0 0x90000000 0x0 0x20000000
+			  0x1000000 0x0 0x00000000 0 0xc0010000 0x0 0x10000>;
+		pcie@0 {
+			ranges = <0x2000000 0x0 0x90000000
+				  0x2000000 0x0 0x90000000
+				  0x0 0x20000000
+
+				  0x1000000 0x0 0x0
+				  0x1000000 0x0 0x0
+				  0x0 0x100000>;
+		};
+	};
 };
 
 /include/ "bsc9132qds.dtsi"
diff --git a/arch/powerpc/boot/dts/fsl/bsc9132si-post.dtsi b/arch/powerpc/boot/dts/fsl/bsc9132si-post.dtsi
index c72307198140..b5f071574e83 100644
--- a/arch/powerpc/boot/dts/fsl/bsc9132si-post.dtsi
+++ b/arch/powerpc/boot/dts/fsl/bsc9132si-post.dtsi
@@ -40,6 +40,34 @@
 	interrupts = <16 2 0 0 20 2 0 0>;
 };
 
+/* controller at 0xa000 */
+&pci0 {
+	compatible = "fsl,bsc9132-pcie", "fsl,qoriq-pcie-v2.2";
+	device_type = "pci";
+	#size-cells = <2>;
+	#address-cells = <3>;
+	bus-range = <0 255>;
+	interrupts = <16 2 0 0>;
+
+	pcie@0 {
+		reg = <0 0 0 0 0>;
+		#interrupt-cells = <1>;
+		#size-cells = <2>;
+		#address-cells = <3>;
+		device_type = "pci";
+		interrupts = <16 2 0 0>;
+		interrupt-map-mask = <0xf800 0 0 7>;
+
+		interrupt-map = <
+			/* IDSEL 0x0 */
+			0000 0x0 0x0 0x1 &mpic 0x0 0x2 0x0 0x0
+			0000 0x0 0x0 0x2 &mpic 0x1 0x2 0x0 0x0
+			0000 0x0 0x0 0x3 &mpic 0x2 0x2 0x0 0x0
+			0000 0x0 0x0 0x4 &mpic 0x3 0x2 0x0 0x0
+			>;
+	};
+};
+
 &soc {
 	#address-cells = <1>;
 	#size-cells = <1>;
diff --git a/arch/powerpc/boot/dts/fsl/bsc9132si-pre.dtsi b/arch/powerpc/boot/dts/fsl/bsc9132si-pre.dtsi
index 301a9dba5790..90f7949fe312 100644
--- a/arch/powerpc/boot/dts/fsl/bsc9132si-pre.dtsi
+++ b/arch/powerpc/boot/dts/fsl/bsc9132si-pre.dtsi
@@ -45,6 +45,7 @@
 		serial0 = &serial0;
 		ethernet0 = &enet0;
 		ethernet1 = &enet1;
+		pci0 = &pci0;
 	};
 
 	cpus {

From 720d7aebcdffda29aa71e12f3b806dbf3aa20761 Mon Sep 17 00:00:00 2001
From: Harninder Rai <harninder.rai@freescale.com>
Date: Thu, 5 Nov 2015 11:16:00 +0800
Subject: [PATCH 118/149] powerpc/85xx: Add PCIe controller support for
 bsc9132qds
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. Use machine_arch_initcall to hook mpc85xx_common_publish_devices.
This ensures that PCI controllers have been probed and added to the
hose_list before pcibios_init() is called.
2. Add a workaround for erratum A-005434.
For the BSC9132, PEX_PEXIWARn[TRGT] for all windows defaults to 0xF,
which is mapped to CCSRBAR. However, for other products, 0xF is
mapped to the local memory. Therefore, for the BSC9132, any default
PCI Express access to the local memory (DDR) will now access the
CCSRBAR. This patch changes the mapping of targets of inbound windows
PEX_PEXIWARn[TRGT] to the Local address space – 0x0 (from 0xF).

Signed-off-by: Harninder Rai <harninder.rai@freescale.com>
Signed-off-by: Minghuan Lian <Minghuan.Lian@freescale.com>
Signed-off-by: Hou Zhiqiang <B48286@freescale.com>
Signed-off-by: Scott Wood <scottwood@freescale.com>
---
 arch/powerpc/platforms/85xx/bsc913x_qds.c |  8 +++++++-
 arch/powerpc/sysdev/fsl_pci.c             | 13 +++++++++++++
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/85xx/bsc913x_qds.c b/arch/powerpc/platforms/85xx/bsc913x_qds.c
index f0927e58af25..dcfafd6b91ee 100644
--- a/arch/powerpc/platforms/85xx/bsc913x_qds.c
+++ b/arch/powerpc/platforms/85xx/bsc913x_qds.c
@@ -17,6 +17,7 @@
 #include <linux/pci.h>
 #include <asm/mpic.h>
 #include <sysdev/fsl_soc.h>
+#include <sysdev/fsl_pci.h>
 #include <asm/udbg.h>
 
 #include "mpc85xx.h"
@@ -46,10 +47,12 @@ static void __init bsc913x_qds_setup_arch(void)
 	mpc85xx_smp_init();
 #endif
 
+	fsl_pci_assign_primary();
+
 	pr_info("bsc913x board from Freescale Semiconductor\n");
 }
 
-machine_device_initcall(bsc9132_qds, mpc85xx_common_publish_devices);
+machine_arch_initcall(bsc9132_qds, mpc85xx_common_publish_devices);
 
 /*
  * Called very early, device-tree isn't unflattened
@@ -67,6 +70,9 @@ define_machine(bsc9132_qds) {
 	.probe			= bsc9132_qds_probe,
 	.setup_arch		= bsc913x_qds_setup_arch,
 	.init_IRQ		= bsc913x_qds_pic_init,
+#ifdef CONFIG_PCI
+	.pcibios_fixup_bus	= fsl_pcibios_fixup_bus,
+#endif
 	.get_irq		= mpic_get_irq,
 	.restart		= fsl_rstcr_restart,
 	.calibrate_decr		= generic_calibrate_decr,
diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c
index 610f472f91d1..79976e51b8ad 100644
--- a/arch/powerpc/sysdev/fsl_pci.c
+++ b/arch/powerpc/sysdev/fsl_pci.c
@@ -216,6 +216,19 @@ static void setup_pci_atmu(struct pci_controller *hose)
 	 */
 	setup_inbound = !is_kdump();
 
+	if (of_device_is_compatible(hose->dn, "fsl,bsc9132-pcie")) {
+		/*
+		 * BSC9132 Rev1.0 has an issue where all the PEX inbound
+		 * windows have implemented the default target value as 0xf
+		 * for CCSR space. In all Freescale legacy devices the target
+		 * of 0xf is reserved for local memory space. 9132 Rev1.0
+		 * now has local memory space mapped to target 0x0 instead of
+		 * 0xf. Hence adding a workaround to remove the target 0xf
+		 * defined for memory space from Inbound window attributes.
+		 */
+		piwar &= ~PIWAR_TGI_LOCAL;
+	}
+
 	if (early_find_capability(hose, 0, 0, PCI_CAP_ID_EXP)) {
 		if (in_be32(&pci->block_rev1) >= PCIE_IP_REV_2_2) {
 			win_idx = 2;

From 4e9de5e9701f4f9206fc25249729881f8394850d Mon Sep 17 00:00:00 2001
From: Igal Liberman <igal.liberman@freescale.com>
Date: Thu, 5 Nov 2015 12:23:06 +0200
Subject: [PATCH 119/149] powerpc/mpc85xx: Update B4 FMan MURAM size

FMan V3H has 2 different MURAM sizes:
    In B4860/4420 the MURAM size is 512KB.
    In T4240 and T2080 the MURAM size is 384KB.

The MURAM size in FMan V3H device tree is 384KB.
This patch updates the MURAM size for B4 to 512KB.

Signed-off-by: Igal Liberman <igal.liberman@freescale.com>
Signed-off-by: Scott Wood <scottwood@freescale.com>
---
 arch/powerpc/boot/dts/fsl/b4si-post.dtsi | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/arch/powerpc/boot/dts/fsl/b4si-post.dtsi b/arch/powerpc/boot/dts/fsl/b4si-post.dtsi
index 74866ac52f39..1b33f5157c8a 100644
--- a/arch/powerpc/boot/dts/fsl/b4si-post.dtsi
+++ b/arch/powerpc/boot/dts/fsl/b4si-post.dtsi
@@ -474,6 +474,11 @@
 	fman@400000 {
 		interrupts = <96 2 0 0>, <16 2 1 30>;
 
+		muram@0 {
+			compatible = "fsl,fman-muram";
+			reg = <0x0 0x80000>;
+		};
+
 		enet0: ethernet@e0000 {
 		};
 

From 433c858a61ac4192f821b71cd1370886e8395863 Mon Sep 17 00:00:00 2001
From: Daniel Walker <danielwa@cisco.com>
Date: Thu, 5 Nov 2015 16:31:21 -0800
Subject: [PATCH 120/149] powerpc/85xx: mpc85xx ADS: remove pci exclude

This code was reworked in commit,

905e75c46dba5f3061049277e4eb7110beedba43

That change removed the fsl_add_bridge() call, which originally was above
the addition of the pci_exclude_device function. I think the assumption
was that pci_exclude_device would prevent changes to the bridge PCI config
after the bridge had been added. It seems it wasn't fully tested on
MPC85xx ADS, because once fsl_add_bridge() is moved, pci_exclude_device is
already set in the machine description and you can never update the PCI
config, since the exclude prevents it. This disrupts things like DMA.

This issue was extensively debugged by David Beazley.

Cc: xe-kernel@external.cisco.com
Cc: dbeazley@cisco.com
Cc: dwalker@fifo99.com
Signed-off-by: Daniel Walker <danielwa@cisco.com>
Signed-off-by: Scott Wood <scottwood@freescale.com>
---
 arch/powerpc/platforms/85xx/mpc85xx_ads.c | 15 ---------------
 1 file changed, 15 deletions(-)

diff --git a/arch/powerpc/platforms/85xx/mpc85xx_ads.c b/arch/powerpc/platforms/85xx/mpc85xx_ads.c
index 7d12a19aa7ee..de72a5f464b1 100644
--- a/arch/powerpc/platforms/85xx/mpc85xx_ads.c
+++ b/arch/powerpc/platforms/85xx/mpc85xx_ads.c
@@ -36,17 +36,6 @@
 
 #include "mpc85xx.h"
 
-#ifdef CONFIG_PCI
-static int mpc85xx_exclude_device(struct pci_controller *hose,
-				   u_char bus, u_char devfn)
-{
-	if (bus == 0 && PCI_SLOT(devfn) == 0)
-		return PCIBIOS_DEVICE_NOT_FOUND;
-	else
-		return PCIBIOS_SUCCESSFUL;
-}
-#endif /* CONFIG_PCI */
-
 static void __init mpc85xx_ads_pic_init(void)
 {
 	struct mpic *mpic = mpic_alloc(NULL, 0, MPIC_BIG_ENDIAN,
@@ -145,10 +134,6 @@ static void __init mpc85xx_ads_setup_arch(void)
 	init_ioports();
 #endif
 
-#ifdef CONFIG_PCI
-	ppc_md.pci_exclude_device = mpc85xx_exclude_device;
-#endif
-
 	fsl_pci_assign_primary();
 }
 

From 9d68e7accf28d0ffe5bf44aae4e64d744fa97337 Mon Sep 17 00:00:00 2001
From: li pengbo <Pengbo.Li@freescale.com>
Date: Thu, 19 Nov 2015 10:52:04 +0800
Subject: [PATCH 121/149] powerpc/85xx: Enable TWR_P102x in
 mpc85xx_basic_defconfig

Enable TWR_P102x option by default in mpc85xx_basic_defconfig to support
p1025twr board.

Signed-off-by: Pengbo Li <Pengbo.Li@freescale.com>
Signed-off-by: Scott Wood <scottwood@freescale.com>
---
 arch/powerpc/configs/mpc85xx_basic_defconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/configs/mpc85xx_basic_defconfig b/arch/powerpc/configs/mpc85xx_basic_defconfig
index 850bd195d0e8..b1593fe6f70b 100644
--- a/arch/powerpc/configs/mpc85xx_basic_defconfig
+++ b/arch/powerpc/configs/mpc85xx_basic_defconfig
@@ -12,6 +12,7 @@ CONFIG_P1010_RDB=y
 CONFIG_P1022_DS=y
 CONFIG_P1022_RDK=y
 CONFIG_P1023_RDB=y
+CONFIG_TWR_P102x=y
 CONFIG_SBC8548=y
 CONFIG_SOCRATES=y
 CONFIG_STX_GP3=y

From 2749539b4bf8231b3a20ad759ec8c559ec29292c Mon Sep 17 00:00:00 2001
From: Scott Wood <scottwood@freescale.com>
Date: Fri, 4 Dec 2015 16:31:13 -0600
Subject: [PATCH 122/149] powerpc/e6500: add locking to hugetlb

e6500 has threads but does not have TLB write conditional.  Thus,
the hugetlb code needs to take the same lock that the normal TLB miss
handlers take, to ensure that the tlbsx and tlbwe are atomic.

Signed-off-by: Scott Wood <scottwood@freescale.com>
---
 arch/powerpc/mm/hugetlbpage-book3e.c | 46 ++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/arch/powerpc/mm/hugetlbpage-book3e.c b/arch/powerpc/mm/hugetlbpage-book3e.c
index ba47aaf33a4b..7e6d0880813f 100644
--- a/arch/powerpc/mm/hugetlbpage-book3e.c
+++ b/arch/powerpc/mm/hugetlbpage-book3e.c
@@ -51,6 +51,48 @@ static inline int mmu_get_tsize(int psize)
 	return mmu_psize_defs[psize].enc;
 }
 
+#if defined(CONFIG_PPC_FSL_BOOK3E) && defined(CONFIG_PPC64)
+#include <asm/paca.h>
+
+static inline void book3e_tlb_lock(void)
+{
+	struct paca_struct *paca = get_paca();
+	unsigned long tmp;
+	int token = smp_processor_id() + 1;
+
+	asm volatile("1: lbarx %0, 0, %1;"
+		     "cmpwi %0, 0;"
+		     "bne 2f;"
+		     "stbcx. %2, 0, %1;"
+		     "bne 1b;"
+		     "b 3f;"
+		     "2: lbzx %0, 0, %1;"
+		     "cmpwi %0, 0;"
+		     "bne 2b;"
+		     "b 1b;"
+		     "3:"
+		     : "=&r" (tmp)
+		     : "r" (&paca->tcd_ptr->lock), "r" (token)
+		     : "memory");
+}
+
+static inline void book3e_tlb_unlock(void)
+{
+	struct paca_struct *paca = get_paca();
+
+	isync();
+	paca->tcd_ptr->lock = 0;
+}
+#else
+static inline void book3e_tlb_lock(void)
+{
+}
+
+static inline void book3e_tlb_unlock(void)
+{
+}
+#endif
+
 static inline int book3e_tlb_exists(unsigned long ea, unsigned long pid)
 {
 	int found = 0;
@@ -109,7 +151,10 @@ void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea,
 	 */
 	local_irq_save(flags);
 
+	book3e_tlb_lock();
+
 	if (unlikely(book3e_tlb_exists(ea, mm->context.id))) {
+		book3e_tlb_unlock();
 		local_irq_restore(flags);
 		return;
 	}
@@ -141,6 +186,7 @@ void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea,
 
 	asm volatile ("tlbwe");
 
+	book3e_tlb_unlock();
 	local_irq_restore(flags);
 }
 

From b0417a2870eff30cc102fbc34982b6ce06ce8c6f Mon Sep 17 00:00:00 2001
From: Zhao Qiang <qiang.zhao@freescale.com>
Date: Tue, 15 Dec 2015 10:41:18 +0800
Subject: [PATCH 123/149] powerpc/p1010rdb: Update dts for pcie interrupt-map

p1010rdb uses irq[4:5] for INTA and INTB to PCIe; these are active-high,
so set the interrupt-map accordingly.

Signed-off-by: Zhao Qiang <qiang.zhao@freescale.com>
Signed-off-by: Scott Wood <scottwood@freescale.com>
---
 arch/powerpc/boot/dts/fsl/p1010rdb.dtsi | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/arch/powerpc/boot/dts/fsl/p1010rdb.dtsi b/arch/powerpc/boot/dts/fsl/p1010rdb.dtsi
index 0f0ced69835a..14b629505038 100644
--- a/arch/powerpc/boot/dts/fsl/p1010rdb.dtsi
+++ b/arch/powerpc/boot/dts/fsl/p1010rdb.dtsi
@@ -215,3 +215,19 @@
 		phy-connection-type = "sgmii";
 	};
 };
+
+&pci0 {
+	pcie@0 {
+		interrupt-map = <
+			/* IDSEL 0x0 */
+			/*
+			 *irq[4:5] are active-high
+			 *irq[6:7] are active-low
+			 */
+			0000 0x0 0x0 0x1 &mpic 0x4 0x2 0x0 0x0
+			0000 0x0 0x0 0x2 &mpic 0x5 0x2 0x0 0x0
+			0000 0x0 0x0 0x3 &mpic 0x6 0x1 0x0 0x0
+			0000 0x0 0x0 0x4 &mpic 0x7 0x1 0x0 0x0
+			>;
+	};
+};

From 479f6a7fc64722d82e24db15b2d6ae7f2882377a Mon Sep 17 00:00:00 2001
From: Raghav Dogra <raghav@freescale.com>
Date: Fri, 30 Oct 2015 11:52:02 +0530
Subject: [PATCH 124/149] powerpc/fsl_lbc: removal of dead code

The condition check is not used.

Signed-off-by: Raghav Dogra <raghav@freescale.com>
Signed-off-by: Scott Wood <scottwood@freescale.com>
---
 arch/powerpc/sysdev/fsl_lbc.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/powerpc/sysdev/fsl_lbc.c b/arch/powerpc/sysdev/fsl_lbc.c
index 38138cf8d33e..47f781059eeb 100644
--- a/arch/powerpc/sysdev/fsl_lbc.c
+++ b/arch/powerpc/sysdev/fsl_lbc.c
@@ -243,8 +243,6 @@ static irqreturn_t fsl_lbc_ctrl_irq(int irqno, void *data)
 	if (status & LTESR_CS)
 		dev_err(ctrl->dev, "Chip select error: "
 			"LTESR 0x%08X\n", status);
-	if (status & LTESR_UPM)
-		;
 	if (status & LTESR_FCT) {
 		dev_err(ctrl->dev, "FCM command time-out: "
 			"LTESR 0x%08X\n", status);

From 2330770797afa822652b541d81a17f0e04bcf598 Mon Sep 17 00:00:00 2001
From: Hongtao Jia <hongtao.jia@freescale.com>
Date: Tue, 24 Nov 2015 14:52:44 +0800
Subject: [PATCH 125/149] dt-bindings: Add QorIQ TMU thermal bindings

Add bindings documentation for TMU (Thermal Monitoring Unit) on QorIQ
platform.

Signed-off-by: Jia Hongtao <hongtao.jia@freescale.com>
Reviewed-by: Scott Wood <scottwood@freescale.com>
Acked-by: Rob Herring <robh@kernel.org>
Signed-off-by: Scott Wood <scottwood@freescale.com>
---
 .../bindings/thermal/qoriq-thermal.txt        | 63 +++++++++++++++++++
 1 file changed, 63 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/thermal/qoriq-thermal.txt

diff --git a/Documentation/devicetree/bindings/thermal/qoriq-thermal.txt b/Documentation/devicetree/bindings/thermal/qoriq-thermal.txt
new file mode 100644
index 000000000000..66223d561972
--- /dev/null
+++ b/Documentation/devicetree/bindings/thermal/qoriq-thermal.txt
@@ -0,0 +1,63 @@
+* Thermal Monitoring Unit (TMU) on Freescale QorIQ SoCs
+
+Required properties:
+- compatible : Must include "fsl,qoriq-tmu". The version of the device is
+	determined by the TMU IP Block Revision Register (IPBRR0) at
+	offset 0x0BF8.
+	Table of correspondences between IPBRR0 values and example  chips:
+		Value           Device
+		----------      -----
+		0x01900102      T1040
+- reg : Address range of TMU registers.
+- interrupts : Contains the interrupt for TMU.
+- fsl,tmu-range : The values to be programmed into TTRnCR, as specified by
+	the SoC reference manual. The first cell is TTR0CR, the second is
+	TTR1CR, etc.
+- fsl,tmu-calibration : A list of cell pairs containing temperature
+	calibration data, as specified by the SoC reference manual.
+	The first cell of each pair is the value to be written to TTCFGR,
+	and the second is the value to be written to TSCFGR.
+
+Example:
+
+tmu@f0000 {
+	compatible = "fsl,qoriq-tmu";
+	reg = <0xf0000 0x1000>;
+	interrupts = <18 2 0 0>;
+	fsl,tmu-range = <0x000a0000 0x00090026 0x0008004a 0x0001006a>;
+	fsl,tmu-calibration = <0x00000000 0x00000025
+			       0x00000001 0x00000028
+			       0x00000002 0x0000002d
+			       0x00000003 0x00000031
+			       0x00000004 0x00000036
+			       0x00000005 0x0000003a
+			       0x00000006 0x00000040
+			       0x00000007 0x00000044
+			       0x00000008 0x0000004a
+			       0x00000009 0x0000004f
+			       0x0000000a 0x00000054
+
+			       0x00010000 0x0000000d
+			       0x00010001 0x00000013
+			       0x00010002 0x00000019
+			       0x00010003 0x0000001f
+			       0x00010004 0x00000025
+			       0x00010005 0x0000002d
+			       0x00010006 0x00000033
+			       0x00010007 0x00000043
+			       0x00010008 0x0000004b
+			       0x00010009 0x00000053
+
+			       0x00020000 0x00000010
+			       0x00020001 0x00000017
+			       0x00020002 0x0000001f
+			       0x00020003 0x00000029
+			       0x00020004 0x00000031
+			       0x00020005 0x0000003c
+			       0x00020006 0x00000042
+			       0x00020007 0x0000004d
+			       0x00020008 0x00000056
+
+			       0x00030000 0x00000012
+			       0x00030001 0x0000001d>;
+};

From be489a3936349c5f68c8001f31580d697c474b98 Mon Sep 17 00:00:00 2001
From: Hongtao Jia <hongtao.jia@freescale.com>
Date: Tue, 24 Nov 2015 14:52:46 +0800
Subject: [PATCH 126/149] powerpc/mpc85xx: Add TMU device tree support for
 T1040/T1042

Also add nodes and properties for thermal management support. Meanwhile,
preprocessor support is needed in order to use the thermal OF framework.

Signed-off-by: Jia Hongtao <hongtao.jia@freescale.com>
Reviewed-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Scott Wood <scottwood@freescale.com>
---
 arch/powerpc/boot/dts/fsl/t1040d4rdb.dts    |  2 +-
 arch/powerpc/boot/dts/fsl/t1040qds.dts      |  2 +-
 arch/powerpc/boot/dts/fsl/t1040rdb.dts      |  2 +-
 arch/powerpc/boot/dts/fsl/t1040si-post.dtsi | 94 +++++++++++++++++++++
 arch/powerpc/boot/dts/fsl/t1042d4rdb.dts    |  2 +-
 arch/powerpc/boot/dts/fsl/t1042qds.dts      |  2 +-
 arch/powerpc/boot/dts/fsl/t1042rdb.dts      |  2 +-
 arch/powerpc/boot/dts/fsl/t1042rdb_pi.dts   |  2 +-
 arch/powerpc/boot/dts/fsl/t1042si-post.dtsi |  2 +-
 arch/powerpc/boot/dts/fsl/t104xsi-pre.dtsi  |  4 +
 10 files changed, 106 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/boot/dts/fsl/t1040d4rdb.dts b/arch/powerpc/boot/dts/fsl/t1040d4rdb.dts
index 681746efd31d..fb6bc02ebb60 100644
--- a/arch/powerpc/boot/dts/fsl/t1040d4rdb.dts
+++ b/arch/powerpc/boot/dts/fsl/t1040d4rdb.dts
@@ -43,4 +43,4 @@
 	interrupt-parent = <&mpic>;
 };
 
-/include/ "t1040si-post.dtsi"
+#include "t1040si-post.dtsi"
diff --git a/arch/powerpc/boot/dts/fsl/t1040qds.dts b/arch/powerpc/boot/dts/fsl/t1040qds.dts
index 4d298659468c..5f76edc7838c 100644
--- a/arch/powerpc/boot/dts/fsl/t1040qds.dts
+++ b/arch/powerpc/boot/dts/fsl/t1040qds.dts
@@ -43,4 +43,4 @@
 	interrupt-parent = <&mpic>;
 };
 
-/include/ "t1040si-post.dtsi"
+#include "t1040si-post.dtsi"
diff --git a/arch/powerpc/boot/dts/fsl/t1040rdb.dts b/arch/powerpc/boot/dts/fsl/t1040rdb.dts
index 8f9e65b47515..cf194154bbdc 100644
--- a/arch/powerpc/boot/dts/fsl/t1040rdb.dts
+++ b/arch/powerpc/boot/dts/fsl/t1040rdb.dts
@@ -45,4 +45,4 @@
 	};
 };
 
-/include/ "t1040si-post.dtsi"
+#include "t1040si-post.dtsi"
diff --git a/arch/powerpc/boot/dts/fsl/t1040si-post.dtsi b/arch/powerpc/boot/dts/fsl/t1040si-post.dtsi
index d30b3de1cfc5..e0f4da554774 100644
--- a/arch/powerpc/boot/dts/fsl/t1040si-post.dtsi
+++ b/arch/powerpc/boot/dts/fsl/t1040si-post.dtsi
@@ -32,6 +32,8 @@
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
+#include <dt-bindings/thermal/thermal.h>
+
 &bman_fbpr {
 	compatible = "fsl,bman-fbpr";
 	alloc-ranges = <0 0 0x10000 0>;
@@ -484,6 +486,98 @@
 		reg	   = <0xea000 0x4000>;
 	};
 
+	tmu: tmu@f0000 {
+		compatible = "fsl,qoriq-tmu";
+		reg = <0xf0000 0x1000>;
+		interrupts = <18 2 0 0>;
+		fsl,tmu-range = <0xa0000 0x90026 0x8004a 0x1006a>;
+		fsl,tmu-calibration = <0x00000000 0x00000025
+				       0x00000001 0x00000028
+				       0x00000002 0x0000002d
+				       0x00000003 0x00000031
+				       0x00000004 0x00000036
+				       0x00000005 0x0000003a
+				       0x00000006 0x00000040
+				       0x00000007 0x00000044
+				       0x00000008 0x0000004a
+				       0x00000009 0x0000004f
+				       0x0000000a 0x00000054
+
+				       0x00010000 0x0000000d
+				       0x00010001 0x00000013
+				       0x00010002 0x00000019
+				       0x00010003 0x0000001f
+				       0x00010004 0x00000025
+				       0x00010005 0x0000002d
+				       0x00010006 0x00000033
+				       0x00010007 0x00000043
+				       0x00010008 0x0000004b
+				       0x00010009 0x00000053
+
+				       0x00020000 0x00000010
+				       0x00020001 0x00000017
+				       0x00020002 0x0000001f
+				       0x00020003 0x00000029
+				       0x00020004 0x00000031
+				       0x00020005 0x0000003c
+				       0x00020006 0x00000042
+				       0x00020007 0x0000004d
+				       0x00020008 0x00000056
+
+				       0x00030000 0x00000012
+				       0x00030001 0x0000001d>;
+		#thermal-sensor-cells = <0>;
+	};
+
+	thermal-zones {
+		cpu_thermal: cpu-thermal {
+			polling-delay-passive = <1000>;
+			polling-delay = <5000>;
+
+			thermal-sensors = <&tmu>;
+
+			trips {
+				cpu_alert: cpu-alert {
+					temperature = <85000>;
+					hysteresis = <2000>;
+					type = "passive";
+				};
+				cpu_crit: cpu-crit {
+					temperature = <95000>;
+					hysteresis = <2000>;
+					type = "critical";
+				};
+			};
+
+			cooling-maps {
+				map0 {
+					trip = <&cpu_alert>;
+					cooling-device =
+						<&cpu0 THERMAL_NO_LIMIT
+							THERMAL_NO_LIMIT>;
+				};
+				map1 {
+					trip = <&cpu_alert>;
+					cooling-device =
+						<&cpu1 THERMAL_NO_LIMIT
+							THERMAL_NO_LIMIT>;
+				};
+				map2 {
+					trip = <&cpu_alert>;
+					cooling-device =
+						<&cpu2 THERMAL_NO_LIMIT
+							THERMAL_NO_LIMIT>;
+				};
+				map3 {
+					trip = <&cpu_alert>;
+					cooling-device =
+						<&cpu3 THERMAL_NO_LIMIT
+							THERMAL_NO_LIMIT>;
+				};
+			};
+		};
+	};
+
 	scfg: global-utilities@fc000 {
 		compatible = "fsl,t1040-scfg";
 		reg = <0xfc000 0x1000>;
diff --git a/arch/powerpc/boot/dts/fsl/t1042d4rdb.dts b/arch/powerpc/boot/dts/fsl/t1042d4rdb.dts
index b245b31b8279..2a5a90dd272e 100644
--- a/arch/powerpc/boot/dts/fsl/t1042d4rdb.dts
+++ b/arch/powerpc/boot/dts/fsl/t1042d4rdb.dts
@@ -50,4 +50,4 @@
 	};
 };
 
-/include/ "t1040si-post.dtsi"
+#include "t1042si-post.dtsi"
diff --git a/arch/powerpc/boot/dts/fsl/t1042qds.dts b/arch/powerpc/boot/dts/fsl/t1042qds.dts
index 4ab9bbe7c5c5..90a4a73bb905 100644
--- a/arch/powerpc/boot/dts/fsl/t1042qds.dts
+++ b/arch/powerpc/boot/dts/fsl/t1042qds.dts
@@ -43,4 +43,4 @@
 	interrupt-parent = <&mpic>;
 };
 
-/include/ "t1042si-post.dtsi"
+#include "t1042si-post.dtsi"
diff --git a/arch/powerpc/boot/dts/fsl/t1042rdb.dts b/arch/powerpc/boot/dts/fsl/t1042rdb.dts
index 67af56bc5ee9..8d908e795e4d 100644
--- a/arch/powerpc/boot/dts/fsl/t1042rdb.dts
+++ b/arch/powerpc/boot/dts/fsl/t1042rdb.dts
@@ -45,4 +45,4 @@
 	};
 };
 
-/include/ "t1042si-post.dtsi"
+#include "t1042si-post.dtsi"
diff --git a/arch/powerpc/boot/dts/fsl/t1042rdb_pi.dts b/arch/powerpc/boot/dts/fsl/t1042rdb_pi.dts
index 2f67677530a4..98c001019d6a 100644
--- a/arch/powerpc/boot/dts/fsl/t1042rdb_pi.dts
+++ b/arch/powerpc/boot/dts/fsl/t1042rdb_pi.dts
@@ -54,4 +54,4 @@
 	};
 };
 
-/include/ "t1042si-post.dtsi"
+#include "t1042si-post.dtsi"
diff --git a/arch/powerpc/boot/dts/fsl/t1042si-post.dtsi b/arch/powerpc/boot/dts/fsl/t1042si-post.dtsi
index 319b74f29724..a5544f93689c 100644
--- a/arch/powerpc/boot/dts/fsl/t1042si-post.dtsi
+++ b/arch/powerpc/boot/dts/fsl/t1042si-post.dtsi
@@ -32,6 +32,6 @@
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-/include/ "t1040si-post.dtsi"
+#include "t1040si-post.dtsi"
 
 /* Place holder for ethernet related device tree nodes */
diff --git a/arch/powerpc/boot/dts/fsl/t104xsi-pre.dtsi b/arch/powerpc/boot/dts/fsl/t104xsi-pre.dtsi
index fcfa38ae5e02..6db0ee8b1384 100644
--- a/arch/powerpc/boot/dts/fsl/t104xsi-pre.dtsi
+++ b/arch/powerpc/boot/dts/fsl/t104xsi-pre.dtsi
@@ -76,6 +76,7 @@
 			reg = <0>;
 			clocks = <&mux0>;
 			next-level-cache = <&L2_1>;
+			#cooling-cells = <2>;
 			L2_1: l2-cache {
 				next-level-cache = <&cpc>;
 			};
@@ -85,6 +86,7 @@
 			reg = <1>;
 			clocks = <&mux1>;
 			next-level-cache = <&L2_2>;
+			#cooling-cells = <2>;
 			L2_2: l2-cache {
 				next-level-cache = <&cpc>;
 			};
@@ -94,6 +96,7 @@
 			reg = <2>;
 			clocks = <&mux2>;
 			next-level-cache = <&L2_3>;
+			#cooling-cells = <2>;
 			L2_3: l2-cache {
 				next-level-cache = <&cpc>;
 			};
@@ -103,6 +106,7 @@
 			reg = <3>;
 			clocks = <&mux3>;
 			next-level-cache = <&L2_4>;
+			#cooling-cells = <2>;
 			L2_4: l2-cache {
 				next-level-cache = <&cpc>;
 			};

From 3045e409e403b35ea4e30393a97cb913c745b38d Mon Sep 17 00:00:00 2001
From: Hongtao Jia <hongtao.jia@freescale.com>
Date: Tue, 24 Nov 2015 14:52:47 +0800
Subject: [PATCH 127/149] powerpc/mpc85xx: Add TMU device tree support for
 T1023/T1024

Also add nodes and properties for thermal management support. Meanwhile,
preprocessor support is needed in order to use the thermal OF framework.

Signed-off-by: Jia Hongtao <hongtao.jia@freescale.com>
Reviewed-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Scott Wood <scottwood@freescale.com>
---
 arch/powerpc/boot/dts/fsl/t1023rdb.dts      |  2 +-
 arch/powerpc/boot/dts/fsl/t1023si-post.dtsi | 86 +++++++++++++++++++++
 arch/powerpc/boot/dts/fsl/t1024qds.dts      |  2 +-
 arch/powerpc/boot/dts/fsl/t1024rdb.dts      |  2 +-
 arch/powerpc/boot/dts/fsl/t1024si-post.dtsi |  2 +-
 arch/powerpc/boot/dts/fsl/t102xsi-pre.dtsi  |  2 +
 6 files changed, 92 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/boot/dts/fsl/t1023rdb.dts b/arch/powerpc/boot/dts/fsl/t1023rdb.dts
index 2b2fff4a12a2..6bd842beb1dc 100644
--- a/arch/powerpc/boot/dts/fsl/t1023rdb.dts
+++ b/arch/powerpc/boot/dts/fsl/t1023rdb.dts
@@ -159,4 +159,4 @@
 	};
 };
 
-/include/ "t1023si-post.dtsi"
+#include "t1023si-post.dtsi"
diff --git a/arch/powerpc/boot/dts/fsl/t1023si-post.dtsi b/arch/powerpc/boot/dts/fsl/t1023si-post.dtsi
index 518ddaa8da2d..99e421df79d4 100644
--- a/arch/powerpc/boot/dts/fsl/t1023si-post.dtsi
+++ b/arch/powerpc/boot/dts/fsl/t1023si-post.dtsi
@@ -32,6 +32,8 @@
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
+#include <dt-bindings/thermal/thermal.h>
+
 &ifc {
 	#address-cells = <2>;
 	#size-cells = <1>;
@@ -275,6 +277,90 @@
 		reg = <0xea000 0x4000>;
 	};
 
+	tmu: tmu@f0000 {
+		compatible = "fsl,qoriq-tmu";
+		reg = <0xf0000 0x1000>;
+		interrupts = <18 2 0 0>;
+		fsl,tmu-range = <0xb0000 0xa0026 0x80048 0x30061>;
+		fsl,tmu-calibration = <0x00000000 0x0000000f
+				       0x00000001 0x00000017
+				       0x00000002 0x0000001e
+				       0x00000003 0x00000026
+				       0x00000004 0x0000002e
+				       0x00000005 0x00000035
+				       0x00000006 0x0000003d
+				       0x00000007 0x00000044
+				       0x00000008 0x0000004c
+				       0x00000009 0x00000053
+				       0x0000000a 0x0000005b
+				       0x0000000b 0x00000064
+
+				       0x00010000 0x00000011
+				       0x00010001 0x0000001c
+				       0x00010002 0x00000024
+				       0x00010003 0x0000002b
+				       0x00010004 0x00000034
+				       0x00010005 0x00000039
+				       0x00010006 0x00000042
+				       0x00010007 0x0000004c
+				       0x00010008 0x00000051
+				       0x00010009 0x0000005a
+				       0x0001000a 0x00000063
+
+				       0x00020000 0x00000013
+				       0x00020001 0x00000019
+				       0x00020002 0x00000024
+				       0x00020003 0x0000002c
+				       0x00020004 0x00000035
+				       0x00020005 0x0000003d
+				       0x00020006 0x00000046
+				       0x00020007 0x00000050
+				       0x00020008 0x00000059
+
+				       0x00030000 0x00000002
+				       0x00030001 0x0000000d
+				       0x00030002 0x00000019
+				       0x00030003 0x00000024>;
+		#thermal-sensor-cells = <0>;
+	};
+
+	thermal-zones {
+		cpu_thermal: cpu-thermal {
+			polling-delay-passive = <1000>;
+			polling-delay = <5000>;
+
+			thermal-sensors = <&tmu>;
+
+			trips {
+				cpu_alert: cpu-alert {
+					temperature = <85000>;
+					hysteresis = <2000>;
+					type = "passive";
+				};
+				cpu_crit: cpu-crit {
+					temperature = <95000>;
+					hysteresis = <2000>;
+					type = "critical";
+				};
+			};
+
+			cooling-maps {
+				map0 {
+					trip = <&cpu_alert>;
+					cooling-device =
+						<&cpu0 THERMAL_NO_LIMIT
+							THERMAL_NO_LIMIT>;
+				};
+				map1 {
+					trip = <&cpu_alert>;
+					cooling-device =
+						<&cpu1 THERMAL_NO_LIMIT
+							THERMAL_NO_LIMIT>;
+				};
+			};
+		};
+	};
+
 	scfg: global-utilities@fc000 {
 		compatible = "fsl,t1023-scfg";
 		reg = <0xfc000 0x1000>;
diff --git a/arch/powerpc/boot/dts/fsl/t1024qds.dts b/arch/powerpc/boot/dts/fsl/t1024qds.dts
index 43cd5b50cd0a..6a3581b8e1f8 100644
--- a/arch/powerpc/boot/dts/fsl/t1024qds.dts
+++ b/arch/powerpc/boot/dts/fsl/t1024qds.dts
@@ -248,4 +248,4 @@
 	};
 };
 
-/include/ "t1024si-post.dtsi"
+#include "t1024si-post.dtsi"
diff --git a/arch/powerpc/boot/dts/fsl/t1024rdb.dts b/arch/powerpc/boot/dts/fsl/t1024rdb.dts
index 429d8c73650a..0ccc7d03335e 100644
--- a/arch/powerpc/boot/dts/fsl/t1024rdb.dts
+++ b/arch/powerpc/boot/dts/fsl/t1024rdb.dts
@@ -188,4 +188,4 @@
 	};
 };
 
-/include/ "t1024si-post.dtsi"
+#include "t1024si-post.dtsi"
diff --git a/arch/powerpc/boot/dts/fsl/t1024si-post.dtsi b/arch/powerpc/boot/dts/fsl/t1024si-post.dtsi
index 95e3af8d768e..bb480346a58d 100644
--- a/arch/powerpc/boot/dts/fsl/t1024si-post.dtsi
+++ b/arch/powerpc/boot/dts/fsl/t1024si-post.dtsi
@@ -32,7 +32,7 @@
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-/include/ "t1023si-post.dtsi"
+#include "t1023si-post.dtsi"
 
 / {
 	aliases {
diff --git a/arch/powerpc/boot/dts/fsl/t102xsi-pre.dtsi b/arch/powerpc/boot/dts/fsl/t102xsi-pre.dtsi
index 3e1528abf3f4..9d08a363bab3 100644
--- a/arch/powerpc/boot/dts/fsl/t102xsi-pre.dtsi
+++ b/arch/powerpc/boot/dts/fsl/t102xsi-pre.dtsi
@@ -76,6 +76,7 @@
 			reg = <0>;
 			clocks = <&mux0>;
 			next-level-cache = <&L2_1>;
+			#cooling-cells = <2>;
 			L2_1: l2-cache {
 				next-level-cache = <&cpc>;
 			};
@@ -85,6 +86,7 @@
 			reg = <1>;
 			clocks = <&mux1>;
 			next-level-cache = <&L2_2>;
+			#cooling-cells = <2>;
 			L2_2: l2-cache {
 				next-level-cache = <&cpc>;
 			};

From 2fc251a8dda56b71ec491bee4c6897e3e12c0739 Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Fri, 11 Dec 2015 09:34:42 +1100
Subject: [PATCH 128/149] powerpc: Copy only required pieces of the
 mm_context_t to the paca

Currently we copy the whole mm_context_t to the paca but only access a
few bits of it.  This is wasteful of space in the paca and also takes
quite some time in the hot path of context switching.

This patch pulls in only the required bits from the mm_context_t to
the paca and on context switch, copies only those.

Benchmarking this (On top of Anton's recent MSR context switching
changes [1]) using processes and yield shows an improvement of almost
3% on POWER8:

  http://ozlabs.org/~anton/junkcode/context_switch2.c
  ./context_switch2 --test=yield --process 0 0

1. https://lists.ozlabs.org/pipermail/linuxppc-dev/2015-October/135700.html

Signed-off-by: Michael Neuling <mikey@neuling.org>
[mpe: Rename paca fields to be mm_ctx_foo rather than context_foo]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/paca.h   | 18 ++++++++++++++++--
 arch/powerpc/kernel/asm-offsets.c |  8 ++++----
 arch/powerpc/mm/hash_utils_64.c   |  4 ++--
 3 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 1cc6e0828907..ef78c288c712 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -16,6 +16,7 @@
 
 #ifdef CONFIG_PPC64
 
+#include <linux/string.h>
 #include <asm/types.h>
 #include <asm/lppaca.h>
 #include <asm/mmu.h>
@@ -132,7 +133,13 @@ struct paca_struct {
 #endif /* CONFIG_PPC_BOOK3E */
 
 #ifdef CONFIG_PPC_BOOK3S
-	mm_context_t context;
+	mm_context_id_t mm_ctx_id;
+#ifdef CONFIG_PPC_MM_SLICES
+	u64 mm_ctx_low_slices_psize;
+	unsigned char mm_ctx_high_slices_psize[SLICE_ARRAY_SIZE];
+#else
+	u16 mm_ctx_sllp;
+#endif
 #endif
 
 	/*
@@ -199,7 +206,14 @@ struct paca_struct {
 #ifdef CONFIG_PPC_BOOK3S
 static inline void copy_mm_to_paca(mm_context_t *context)
 {
-	get_paca()->context = *context;
+	get_paca()->mm_ctx_id = context->id;
+#ifdef CONFIG_PPC_MM_SLICES
+	get_paca()->mm_ctx_low_slices_psize = context->low_slices_psize;
+	memcpy(&get_paca()->mm_ctx_high_slices_psize,
+	       &context->high_slices_psize, SLICE_ARRAY_SIZE);
+#else
+	get_paca()->mm_ctx_sllp = context->sllp;
+#endif
 }
 #else
 static inline void copy_mm_to_paca(mm_context_t *context){}
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 9db7be292bf3..07cebc3514f3 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -186,12 +186,12 @@ int main(void)
 	DEFINE(PACASOFTIRQEN, offsetof(struct paca_struct, soft_enabled));
 	DEFINE(PACAIRQHAPPENED, offsetof(struct paca_struct, irq_happened));
 #ifdef CONFIG_PPC_BOOK3S
-	DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id));
+	DEFINE(PACACONTEXTID, offsetof(struct paca_struct, mm_ctx_id));
 #ifdef CONFIG_PPC_MM_SLICES
 	DEFINE(PACALOWSLICESPSIZE, offsetof(struct paca_struct,
-					    context.low_slices_psize));
+					    mm_ctx_low_slices_psize));
 	DEFINE(PACAHIGHSLICEPSIZE, offsetof(struct paca_struct,
-					    context.high_slices_psize));
+					    mm_ctx_high_slices_psize));
 	DEFINE(MMUPSIZEDEFSIZE, sizeof(struct mmu_psize_def));
 #endif /* CONFIG_PPC_MM_SLICES */
 #endif
@@ -224,7 +224,7 @@ int main(void)
 #ifdef CONFIG_PPC_MM_SLICES
 	DEFINE(MMUPSIZESLLP, offsetof(struct mmu_psize_def, sllp));
 #else
-	DEFINE(PACACONTEXTSLLP, offsetof(struct paca_struct, context.sllp));
+	DEFINE(PACACONTEXTSLLP, offsetof(struct paca_struct, mm_ctx_sllp));
 #endif /* CONFIG_PPC_MM_SLICES */
 	DEFINE(PACA_EXGEN, offsetof(struct paca_struct, exgen));
 	DEFINE(PACA_EXMC, offsetof(struct paca_struct, exmc));
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 03279eac0957..db744576d730 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -853,11 +853,11 @@ static unsigned int get_paca_psize(unsigned long addr)
 	unsigned long index, mask_index;
 
 	if (addr < SLICE_LOW_TOP) {
-		lpsizes = get_paca()->context.low_slices_psize;
+		lpsizes = get_paca()->mm_ctx_low_slices_psize;
 		index = GET_LOW_SLICE_INDEX(addr);
 		return (lpsizes >> (index * 4)) & 0xF;
 	}
-	hpsizes = get_paca()->context.high_slices_psize;
+	hpsizes = get_paca()->mm_ctx_high_slices_psize;
 	index = GET_HIGH_SLICE_INDEX(addr);
 	mask_index = index & 0x1;
 	return (hpsizes[index >> 1] >> (mask_index * 4)) & 0xF;

From affddff69c55eb68969448f35f59054a370bc7c1 Mon Sep 17 00:00:00 2001
From: Russell Currey <ruscur@russell.cc>
Date: Fri, 27 Nov 2015 17:23:07 +1100
Subject: [PATCH 129/149] powerpc/powernv: Add a kmsg_dumper that flushes
 console output on panic

On BMC machines, console output is controlled by the OPAL firmware and is
only flushed when its pollers are called.  When the kernel is in a panic
state, it no longer calls these pollers and thus console output does not
completely flush, causing some output from the panic to be lost.

Output is only actually lost when the kernel is configured to not power off
or reboot after panic (i.e. CONFIG_PANIC_TIMEOUT is set to 0) since OPAL
flushes the console buffer as part of its power down routines.  Before this
patch, however, only partial output would be printed during the timeout wait.

This patch adds a new kmsg_dumper which gets called at panic time to ensure
panic output is not lost.  It accomplishes this by calling OPAL_CONSOLE_FLUSH
in the OPAL API, and if that is not available, the pollers are called enough
times to (hopefully) completely flush the buffer.

The flushing mechanism will only affect output printed at and before the
kmsg_dump call in kernel/panic.c:panic().  As such, the "end Kernel panic"
message may still be truncated as follows:

>Call Trace:
>[c000000f1f603b00] [c0000000008e9458] dump_stack+0x90/0xbc (unreliable)
>[c000000f1f603b30] [c0000000008e7e78] panic+0xf8/0x2c4
>[c000000f1f603bc0] [c000000000be4860] mount_block_root+0x288/0x33c
>[c000000f1f603c80] [c000000000be4d14] prepare_namespace+0x1f4/0x254
>[c000000f1f603d00] [c000000000be43e8] kernel_init_freeable+0x318/0x350
>[c000000f1f603dc0] [c00000000000bd74] kernel_init+0x24/0x130
>[c000000f1f603e30] [c0000000000095b0] ret_from_kernel_thread+0x5c/0xac
>---[ end Kernel panic - not

This functionality is implemented as a kmsg_dumper as it seems to be the
most sensible way to introduce platform-specific functionality to the
panic function.

Signed-off-by: Russell Currey <ruscur@russell.cc>
Reviewed-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/opal-api.h           |  3 +-
 arch/powerpc/include/asm/opal.h               |  3 +
 arch/powerpc/platforms/powernv/Makefile       |  1 +
 arch/powerpc/platforms/powernv/opal-kmsg.c    | 68 +++++++++++++++++++
 .../powerpc/platforms/powernv/opal-wrappers.S |  1 +
 arch/powerpc/platforms/powernv/opal.c         |  3 +
 6 files changed, 78 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/platforms/powernv/opal-kmsg.c

diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h
index 8374afed9d0a..f8faaaeeca1e 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -157,7 +157,8 @@
 #define OPAL_LEDS_GET_INDICATOR			114
 #define OPAL_LEDS_SET_INDICATOR			115
 #define OPAL_CEC_REBOOT2			116
-#define OPAL_LAST				116
+#define OPAL_CONSOLE_FLUSH			117
+#define OPAL_LAST				117
 
 /* Device tree flags */
 
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 800115910e43..a5fd407213b6 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -35,6 +35,7 @@ int64_t opal_console_read(int64_t term_number, __be64 *length,
 			  uint8_t *buffer);
 int64_t opal_console_write_buffer_space(int64_t term_number,
 					__be64 *length);
+void opal_console_flush(void);
 int64_t opal_rtc_read(__be32 *year_month_day,
 		      __be64 *hour_minute_second_millisecond);
 int64_t opal_rtc_write(uint32_t year_month_day,
@@ -262,6 +263,8 @@ extern int opal_resync_timebase(void);
 
 extern void opal_lpc_init(void);
 
+extern void opal_kmsg_init(void);
+
 extern int opal_event_request(unsigned int opal_event_nr);
 
 struct opal_sg_list *opal_vmalloc_to_sg_list(void *vmalloc_addr,
diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
index ee774e8a4837..f1516b5ecec9 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -2,6 +2,7 @@ obj-y			+= setup.o opal-wrappers.o opal.o opal-async.o idle.o
 obj-y			+= opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o
 obj-y			+= rng.o opal-elog.o opal-dump.o opal-sysparam.o opal-sensor.o
 obj-y			+= opal-msglog.o opal-hmi.o opal-power.o opal-irqchip.o
+obj-y			+= opal-kmsg.o
 
 obj-$(CONFIG_SMP)	+= smp.o subcore.o subcore-asm.o
 obj-$(CONFIG_PCI)	+= pci.o pci-p5ioc2.o pci-ioda.o npu-dma.o
diff --git a/arch/powerpc/platforms/powernv/opal-kmsg.c b/arch/powerpc/platforms/powernv/opal-kmsg.c
new file mode 100644
index 000000000000..bd3b2ee1ba1d
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-kmsg.c
@@ -0,0 +1,68 @@
+/*
+ * kmsg dumper that ensures the OPAL console fully flushes panic messages
+ *
+ * Author: Russell Currey <ruscur@russell.cc>
+ *
+ * Copyright 2015 IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ */
+
+#include <linux/kmsg_dump.h>
+
+#include <asm/opal.h>
+#include <asm/opal-api.h>
+
+/*
+ * Console output is controlled by OPAL firmware.  The kernel regularly calls
+ * OPAL_POLL_EVENTS, which flushes some console output.  In a panic state,
+ * however, the kernel no longer calls OPAL_POLL_EVENTS and the panic message
+ * may not be completely printed.  This function does not actually dump the
+ * message, it just ensures that OPAL completely flushes the console buffer.
+ */
+static void force_opal_console_flush(struct kmsg_dumper *dumper,
+				     enum kmsg_dump_reason reason)
+{
+	int i;
+
+	/*
+	 * Outside of a panic context the pollers will continue to run,
+	 * so we don't need to do any special flushing.
+	 */
+	if (reason != KMSG_DUMP_PANIC)
+		return;
+
+	if (opal_check_token(OPAL_CONSOLE_FLUSH)) {
+		opal_console_flush();
+	} else {
+		/*
+		 * If OPAL_CONSOLE_FLUSH is not implemented in the firmware,
+		 * the console can still be flushed by calling the polling
+		 * function enough times to flush the buffer.  We don't know
+		 * how much output still needs to be flushed, but we can be
+		 * generous since the kernel is in panic and doesn't need
+		 * to do much else.
+		 */
+		printk(KERN_NOTICE "opal: OPAL_CONSOLE_FLUSH missing.\n");
+		for (i = 0; i < 1024; i++) {
+			opal_poll_events(NULL);
+		}
+	}
+}
+
+static struct kmsg_dumper opal_kmsg_dumper = {
+	.dump = force_opal_console_flush
+};
+
+void __init opal_kmsg_init(void)
+{
+	int rc;
+
+	/* Add our dumper to the list */
+	rc = kmsg_dump_register(&opal_kmsg_dumper);
+	if (rc != 0)
+		pr_err("opal: kmsg_dump_register failed; returned %d\n", rc);
+}
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
index b7a464fef7a7..e45b88a5d7e0 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -301,3 +301,4 @@ OPAL_CALL(opal_flash_erase,			OPAL_FLASH_ERASE);
 OPAL_CALL(opal_prd_msg,				OPAL_PRD_MSG);
 OPAL_CALL(opal_leds_get_ind,			OPAL_LEDS_GET_INDICATOR);
 OPAL_CALL(opal_leds_set_ind,			OPAL_LEDS_SET_INDICATOR);
+OPAL_CALL(opal_console_flush,			OPAL_CONSOLE_FLUSH);
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index aad0033d65d1..b349dd3e76ea 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -748,6 +748,9 @@ static int __init opal_init(void)
 	opal_pdev_init(opal_node, "ibm,opal-flash");
 	opal_pdev_init(opal_node, "ibm,opal-prd");
 
+	/* Initialise OPAL kmsg dumper for flushing console on panic */
+	opal_kmsg_init();
+
 	return 0;
 }
 machine_subsys_initcall(powernv, opal_init);

From 57a9039052aadf5833c40ab494d30d3755660a48 Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Fri, 18 Dec 2015 21:46:04 +1100
Subject: [PATCH 130/149] powerpc/powernv: Only delay opal_rtc_read() retry
 when necessary

Only delay after opal_rtc_read() when it reports busy and we are going
to retry.

This has the advantage of possibly saving a massive 10ms off booting!

Kudos to Stewart for noticing.

Signed-off-by: Michael Neuling <mikey@neuling.org>
Reviewed-by: Stewart Smith <stewart@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/opal-rtc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/opal-rtc.c b/arch/powerpc/platforms/powernv/opal-rtc.c
index 1b149c92fca1..f8868864f373 100644
--- a/arch/powerpc/platforms/powernv/opal-rtc.c
+++ b/arch/powerpc/platforms/powernv/opal-rtc.c
@@ -50,7 +50,7 @@ unsigned long __init opal_get_boot_time(void)
 		rc = opal_rtc_read(&__y_m_d, &__h_m_s_ms);
 		if (rc == OPAL_BUSY_EVENT)
 			opal_poll_events(NULL);
-		else
+		else if (rc == OPAL_BUSY)
 			mdelay(10);
 	}
 	if (rc != OPAL_SUCCESS)

From 759fb100b22473bebc46a0f10ca1af5acd79439d Mon Sep 17 00:00:00 2001
From: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
Date: Mon, 21 Dec 2015 17:38:41 +1100
Subject: [PATCH 131/149] powerpc: Fix style of self-test config prompts

A few of the config prompts for powerpc self-tests have periods at the
end, which is inconsistent with the rest of the prompts. Remove the
periods.

Signed-off-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/Kconfig.debug | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug
index 3a510f4a6b68..77e2cefe47eb 100644
--- a/arch/powerpc/Kconfig.debug
+++ b/arch/powerpc/Kconfig.debug
@@ -64,17 +64,17 @@ config PPC_EMULATED_STATS
 	  emulated.
 
 config CODE_PATCHING_SELFTEST
-	bool "Run self-tests of the code-patching code."
+	bool "Run self-tests of the code-patching code"
 	depends on DEBUG_KERNEL
 	default n
 
 config FTR_FIXUP_SELFTEST
-	bool "Run self-tests of the feature-fixup code."
+	bool "Run self-tests of the feature-fixup code"
 	depends on DEBUG_KERNEL
 	default n
 
 config MSI_BITMAP_SELFTEST
-	bool "Run self-tests of the MSI bitmap code."
+	bool "Run self-tests of the MSI bitmap code"
 	depends on DEBUG_KERNEL
 	default n
 

From dc3799bb9ab2666fa19081121f05a4e573ecae12 Mon Sep 17 00:00:00 2001
From: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
Date: Mon, 21 Dec 2015 18:28:37 +1100
Subject: [PATCH 132/149] powerpc/powernv: Fix minor off-by-one error in
 opal_mce_check_early_recovery()

Fix off-by-one error in opal_mce_check_early_recovery() when checking
whether the NIP falls within OPAL space.

Signed-off-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/opal.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index b349dd3e76ea..81f4a3ab8743 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -548,7 +548,7 @@ bool opal_mce_check_early_recovery(struct pt_regs *regs)
 		goto out;
 
 	if ((regs->nip >= opal.base) &&
-			(regs->nip <= (opal.base + opal.size)))
+			(regs->nip < (opal.base + opal.size)))
 		recover_addr = find_recovery_address(regs->nip);
 
 	/*

From 44451d4d8f0e35153b3e7e3d40e198f2cf9ac36a Mon Sep 17 00:00:00 2001
From: Scott Wood <oss@buserror.net>
Date: Thu, 31 Dec 2015 12:57:26 -0600
Subject: [PATCH 133/149] MAINTAINERS: Update Scott Wood's e-mail address

Freescale is now NXP.  I still work there, but I won't be using their
mail system for Linux development.

Signed-off-by: Scott Wood <oss@buserror.net>
---
 MAINTAINERS | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 8099527abccf..124dd3795475 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6429,7 +6429,7 @@ S:	Maintained
 F:	arch/powerpc/platforms/8xx/
 
 LINUX FOR POWERPC EMBEDDED PPC83XX AND PPC85XX
-M:	Scott Wood <scottwood@freescale.com>
+M:	Scott Wood <oss@buserror.net>
 M:	Kumar Gala <galak@kernel.crashing.org>
 W:	http://www.penguinppc.org/
 L:	linuxppc-dev@lists.ozlabs.org

From 7b8ad495d59280b634a7b546f4cdf58cf4d65f61 Mon Sep 17 00:00:00 2001
From: Vaibhav Jain <vaibhav@linux.vnet.ibm.com>
Date: Tue, 24 Nov 2015 16:26:18 +0530
Subject: [PATCH 134/149] cxl: Fix DSI misses when the context owning task
 exits

Presently when a user-space process issues the CXL_IOCTL_START_WORK ioctl
we store the pid of the current task_struct and use it to get a pointer
to the mm_struct of the process while processing page or segment faults
from the capi card. However, this causes issues when the thread that
originally issued the start-work ioctl exits, in which case the stored
pid is no longer valid and the cxl driver is unable to handle faults as
the mm_struct corresponding to the process is no longer accessible.

This patch fixes this issue by using the mm_struct of the next alive
task in the thread group. This is done by iterating over all the tasks
in the thread group starting from thread group leader and calling
get_task_mm on each one of them. When a valid mm_struct is obtained the
pid of the associated task is stored in the context replacing the
exiting one for handling future faults.

The patch introduces a new function named get_mem_context that checks
whether the task pointed to by ctx->pid is dead. If so, it performs the
steps described above. Also, a new variable cxl_context.glpid is
introduced which stores the pid of the thread group leader associated
with the context owning task.

Reported-by: Matthew R. Ochs <mrochs@linux.vnet.ibm.com>
Reported-by: Frank Haverkamp <HAVERKAM@de.ibm.com>
Suggested-by: Ian Munsie <imunsie@au1.ibm.com>
Signed-off-by: Vaibhav Jain <vaibhav@linux.vnet.ibm.com>
Acked-by: Ian Munsie <imunsie@au1.ibm.com>
Reviewed-by: Frederic Barrat <fbarrat@linux.vnet.ibm.com>
Reviewed-by: Matthew R. Ochs <mrochs@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 drivers/misc/cxl/api.c     |   2 +-
 drivers/misc/cxl/context.c |   6 +-
 drivers/misc/cxl/cxl.h     |   3 +
 drivers/misc/cxl/fault.c   | 129 +++++++++++++++++++++++++++----------
 drivers/misc/cxl/file.c    |   6 +-
 5 files changed, 109 insertions(+), 37 deletions(-)

diff --git a/drivers/misc/cxl/api.c b/drivers/misc/cxl/api.c
index a6543aefa299..ea3eeb7011e1 100644
--- a/drivers/misc/cxl/api.c
+++ b/drivers/misc/cxl/api.c
@@ -172,7 +172,7 @@ int cxl_start_context(struct cxl_context *ctx, u64 wed,
 
 	if (task) {
 		ctx->pid = get_task_pid(task, PIDTYPE_PID);
-		get_pid(ctx->pid);
+		ctx->glpid = get_task_pid(task->group_leader, PIDTYPE_PID);
 		kernel = false;
 	}
 
diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c
index 6dde7a9d6a7e..262b88eac414 100644
--- a/drivers/misc/cxl/context.c
+++ b/drivers/misc/cxl/context.c
@@ -42,7 +42,7 @@ int cxl_context_init(struct cxl_context *ctx, struct cxl_afu *afu, bool master,
 	spin_lock_init(&ctx->sste_lock);
 	ctx->afu = afu;
 	ctx->master = master;
-	ctx->pid = NULL; /* Set in start work ioctl */
+	ctx->pid = ctx->glpid = NULL; /* Set in start work ioctl */
 	mutex_init(&ctx->mapping_lock);
 	ctx->mapping = mapping;
 
@@ -217,7 +217,11 @@ int __detach_context(struct cxl_context *ctx)
 	WARN_ON(cxl_detach_process(ctx) &&
 		cxl_adapter_link_ok(ctx->afu->adapter));
 	flush_work(&ctx->fault_work); /* Only needed for dedicated process */
+
+	/* release the reference to the group leader and mm handling pid */
 	put_pid(ctx->pid);
+	put_pid(ctx->glpid);
+
 	cxl_ctx_put();
 	return 0;
 }
diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h
index 25ae57fa79b0..a521bc72cec2 100644
--- a/drivers/misc/cxl/cxl.h
+++ b/drivers/misc/cxl/cxl.h
@@ -445,6 +445,9 @@ struct cxl_context {
 	unsigned int sst_size, sst_lru;
 
 	wait_queue_head_t wq;
+	/* pid of the group leader associated with the pid */
+	struct pid *glpid;
+	/* use mm context associated with this pid for ds faults */
 	struct pid *pid;
 	spinlock_t lock; /* Protects pending_irq_mask, pending_fault and fault_addr */
 	/* Only used in PR mode */
diff --git a/drivers/misc/cxl/fault.c b/drivers/misc/cxl/fault.c
index 25a5418c55cb..81c3f75b7330 100644
--- a/drivers/misc/cxl/fault.c
+++ b/drivers/misc/cxl/fault.c
@@ -166,13 +166,92 @@ static void cxl_handle_page_fault(struct cxl_context *ctx,
 	cxl_ack_irq(ctx, CXL_PSL_TFC_An_R, 0);
 }
 
+/*
+ * Returns the mm_struct corresponding to the context ctx via ctx->pid
+ * In case the task has exited we use the task group leader accessible
+ * via ctx->glpid to find the next task in the thread group that has a
+ * valid  mm_struct associated with it. If a task with valid mm_struct
+ * is found the ctx->pid is updated to use the task struct for subsequent
+ * translations. In case no valid mm_struct is found in the task group to
+ * service the fault a NULL is returned.
+ */
+static struct mm_struct *get_mem_context(struct cxl_context *ctx)
+{
+	struct task_struct *task = NULL;
+	struct mm_struct *mm = NULL;
+	struct pid *old_pid = ctx->pid;
+
+	if (old_pid == NULL) {
+		pr_warn("%s: Invalid context for pe=%d\n",
+			 __func__, ctx->pe);
+		return NULL;
+	}
+
+	task = get_pid_task(old_pid, PIDTYPE_PID);
+
+	/*
+	 * pid_alive may look racy but this saves us from costly
+	 * get_task_mm when the task is a zombie. In worst case
+	 * we may think a task is alive, which is about to die
+	 * but get_task_mm will return NULL.
+	 */
+	if (task != NULL && pid_alive(task))
+		mm = get_task_mm(task);
+
+	/* release the task struct that was taken earlier */
+	if (task)
+		put_task_struct(task);
+	else
+		pr_devel("%s: Context owning pid=%i for pe=%i dead\n",
+			__func__, pid_nr(old_pid), ctx->pe);
+
+	/*
+	 * If we couldn't find the mm context then use the group
+	 * leader to iterate over the task group and find a task
+	 * that gives us mm_struct.
+	 */
+	if (unlikely(mm == NULL && ctx->glpid != NULL)) {
+
+		rcu_read_lock();
+		task = pid_task(ctx->glpid, PIDTYPE_PID);
+		if (task)
+			do {
+				mm = get_task_mm(task);
+				if (mm) {
+					ctx->pid = get_task_pid(task,
+								PIDTYPE_PID);
+					break;
+				}
+				task = next_thread(task);
+			} while (task && !thread_group_leader(task));
+		rcu_read_unlock();
+
+		/* check if we switched pid */
+		if (ctx->pid != old_pid) {
+			if (mm)
+				pr_devel("%s:pe=%i switch pid %i->%i\n",
+					 __func__, ctx->pe, pid_nr(old_pid),
+					 pid_nr(ctx->pid));
+			else
+				pr_devel("%s:Cannot find mm for pid=%i\n",
+					 __func__, pid_nr(old_pid));
+
+			/* drop the reference to older pid */
+			put_pid(old_pid);
+		}
+	}
+
+	return mm;
+}
+
+
+
 void cxl_handle_fault(struct work_struct *fault_work)
 {
 	struct cxl_context *ctx =
 		container_of(fault_work, struct cxl_context, fault_work);
 	u64 dsisr = ctx->dsisr;
 	u64 dar = ctx->dar;
-	struct task_struct *task = NULL;
 	struct mm_struct *mm = NULL;
 
 	if (cxl_p2n_read(ctx->afu, CXL_PSL_DSISR_An) != dsisr ||
@@ -195,17 +274,17 @@ void cxl_handle_fault(struct work_struct *fault_work)
 		"DSISR: %#llx DAR: %#llx\n", ctx->pe, dsisr, dar);
 
 	if (!ctx->kernel) {
-		if (!(task = get_pid_task(ctx->pid, PIDTYPE_PID))) {
-			pr_devel("cxl_handle_fault unable to get task %i\n",
-				 pid_nr(ctx->pid));
+
+		mm = get_mem_context(ctx);
+		/* indicates all the thread in task group have exited */
+		if (mm == NULL) {
+			pr_devel("%s: unable to get mm for pe=%d pid=%i\n",
+				 __func__, ctx->pe, pid_nr(ctx->pid));
 			cxl_ack_ae(ctx);
 			return;
-		}
-		if (!(mm = get_task_mm(task))) {
-			pr_devel("cxl_handle_fault unable to get mm %i\n",
-				 pid_nr(ctx->pid));
-			cxl_ack_ae(ctx);
-			goto out;
+		} else {
+			pr_devel("Handling page fault for pe=%d pid=%i\n",
+				 ctx->pe, pid_nr(ctx->pid));
 		}
 	}
 
@@ -218,33 +297,22 @@ void cxl_handle_fault(struct work_struct *fault_work)
 
 	if (mm)
 		mmput(mm);
-out:
-	if (task)
-		put_task_struct(task);
 }
 
 static void cxl_prefault_one(struct cxl_context *ctx, u64 ea)
 {
-	int rc;
-	struct task_struct *task;
 	struct mm_struct *mm;
 
-	if (!(task = get_pid_task(ctx->pid, PIDTYPE_PID))) {
-		pr_devel("cxl_prefault_one unable to get task %i\n",
-			 pid_nr(ctx->pid));
-		return;
-	}
-	if (!(mm = get_task_mm(task))) {
+	mm = get_mem_context(ctx);
+	if (mm == NULL) {
 		pr_devel("cxl_prefault_one unable to get mm %i\n",
 			 pid_nr(ctx->pid));
-		put_task_struct(task);
 		return;
 	}
 
-	rc = cxl_fault_segment(ctx, mm, ea);
+	cxl_fault_segment(ctx, mm, ea);
 
 	mmput(mm);
-	put_task_struct(task);
 }
 
 static u64 next_segment(u64 ea, u64 vsid)
@@ -263,18 +331,13 @@ static void cxl_prefault_vma(struct cxl_context *ctx)
 	struct copro_slb slb;
 	struct vm_area_struct *vma;
 	int rc;
-	struct task_struct *task;
 	struct mm_struct *mm;
 
-	if (!(task = get_pid_task(ctx->pid, PIDTYPE_PID))) {
-		pr_devel("cxl_prefault_vma unable to get task %i\n",
-			 pid_nr(ctx->pid));
-		return;
-	}
-	if (!(mm = get_task_mm(task))) {
+	mm = get_mem_context(ctx);
+	if (mm == NULL) {
 		pr_devel("cxl_prefault_vm unable to get mm %i\n",
 			 pid_nr(ctx->pid));
-		goto out1;
+		return;
 	}
 
 	down_read(&mm->mmap_sem);
@@ -295,8 +358,6 @@ static void cxl_prefault_vma(struct cxl_context *ctx)
 	up_read(&mm->mmap_sem);
 
 	mmput(mm);
-out1:
-	put_task_struct(task);
 }
 
 void cxl_prefault(struct cxl_context *ctx, u64 wed)
diff --git a/drivers/misc/cxl/file.c b/drivers/misc/cxl/file.c
index 5cc14599837d..783337d22f36 100644
--- a/drivers/misc/cxl/file.c
+++ b/drivers/misc/cxl/file.c
@@ -201,8 +201,12 @@ static long afu_ioctl_start_work(struct cxl_context *ctx,
 	 * where a process (master, some daemon, etc) has opened the chardev on
 	 * behalf of another process, so the AFU's mm gets bound to the process
 	 * that performs this ioctl and not the process that opened the file.
+	 * Also we grab the PID of the group leader so that if the task that
+	 * has performed the attach operation exits the mm context of the
+	 * process is still accessible.
 	 */
-	ctx->pid = get_pid(get_task_pid(current, PIDTYPE_PID));
+	ctx->pid = get_task_pid(current, PIDTYPE_PID);
+	ctx->glpid = get_task_pid(current->group_leader, PIDTYPE_PID);
 
 	trace_cxl_attach(ctx, work.work_element_descriptor, work.num_interrupts, amr);
 

From c33e54fafacaf83b3e345aae0e241c1f152224a0 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Sat, 9 Jan 2016 08:25:01 +1100
Subject: [PATCH 135/149] powerpc: Fix build break due to paca mm_context_t
 changes

Commit 2fc251a8dda5 ("powerpc: Copy only required pieces of the
mm_context_t to the paca") broke the build for CONFIG_PPC_STD_MMU_64=y
and CONFIG_PPC_MM_SLICES=n.

That only happens for a kernel built with 4K pages and HUGETLB disabled,
which is why we missed it.

Fix it by adding a mm_ctx_user_psize member to the paca and populating
it in the appropriate places.

Fixes: 2fc251a8dda5 ("powerpc: Copy only required pieces of the mm_context_t to the paca")
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/paca.h | 2 ++
 arch/powerpc/mm/hash_utils_64.c | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index ef78c288c712..546540b91095 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -138,6 +138,7 @@ struct paca_struct {
 	u64 mm_ctx_low_slices_psize;
 	unsigned char mm_ctx_high_slices_psize[SLICE_ARRAY_SIZE];
 #else
+	u16 mm_ctx_user_psize;
 	u16 mm_ctx_sllp;
 #endif
 #endif
@@ -212,6 +213,7 @@ static inline void copy_mm_to_paca(mm_context_t *context)
 	memcpy(&get_paca()->mm_ctx_high_slices_psize,
 	       &context->high_slices_psize, SLICE_ARRAY_SIZE);
 #else
+	get_paca()->mm_ctx_user_psize = context->user_psize;
 	get_paca()->mm_ctx_sllp = context->sllp;
 #endif
 }
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index db744576d730..ba59d5977f34 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -866,7 +866,7 @@ static unsigned int get_paca_psize(unsigned long addr)
 #else
 unsigned int get_paca_psize(unsigned long addr)
 {
-	return get_paca()->context.user_psize;
+	return get_paca()->mm_ctx_user_psize;
 }
 #endif
 

From 35de3b1aa16842214e0cd7c6036daf4619294314 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Tue, 8 Dec 2015 13:50:56 -0500
Subject: [PATCH 136/149] powerpc: Implement save_stack_trace_regs() to enable
 kprobe stack tracing

It has come to my attention that kprobe event stack tracing does not
work on powerpc. You can see with the following:

  # cd /sys/kernel/debug/tracing
  # echo stacktrace > trace_options
  # echo 'p kfree' > kprobe_events
  # echo 1 > events/kprobes/enable

Will print the following warning:
  save_stack_trace_regs() not implemented yet.

Although save_stack_trace() (which normal event stack traces use) is
implemented, save_stack_trace_regs() which kprobe events use is not.
This is a cheap attempt to implement that function.

Note, this may have issues if a task tries to get a stack trace from
another task with its regs, because it just passes in "current" to
save_context_stack(). But this does solve the issue with stack tracing
kprobe events.

Reported-by: Chunyu Hu <chuhu@redhat.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/stacktrace.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c
index ea43a347a104..4f24606afc3f 100644
--- a/arch/powerpc/kernel/stacktrace.c
+++ b/arch/powerpc/kernel/stacktrace.c
@@ -61,3 +61,10 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
 	save_context_stack(trace, tsk->thread.ksp, tsk, 0);
 }
 EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
+
+void
+save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace)
+{
+	save_context_stack(trace, regs->gpr[1], current, 0);
+}
+EXPORT_SYMBOL_GPL(save_stack_trace_regs);

From b0eab5b29a55fd9f31fad28df520337545c813ef Mon Sep 17 00:00:00 2001
From: Russell Currey <ruscur@russell.cc>
Date: Fri, 8 Jan 2016 16:16:47 +1100
Subject: [PATCH 137/149] powerpc/powernv: Remove misleading comment in pci.c

PCI in powernv now supports quite a bit more than p5ioc2, so remove the
outdated comment.

Signed-off-by: Russell Currey <ruscur@russell.cc>
Acked-by: Stewart Smith <stewart@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/pci.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index ff4e42d9d259..2f55c86df703 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -1,8 +1,6 @@
 /*
  * Support PCI/PCIe on PowerNV platforms
  *
- * Currently supports only P5IOC2
- *
  * Copyright 2011 Benjamin Herrenschmidt, IBM Corp.
  *
  * This program is free software; you can redistribute it and/or

From 419dbd5e1ff0e45a6e1d28c1f7b74d121d2a56e7 Mon Sep 17 00:00:00 2001
From: Alistair Popple <alistair@popple.id.au>
Date: Fri, 8 Jan 2016 11:35:09 +1100
Subject: [PATCH 138/149] powerpc/powernv: Fix update of NVLink DMA mask

The emulated NVLink PCI devices share the same IODA2 TCE tables but only
support a single TVT (instead of the normal two for PCI devices). This
requires the kernel to manually replace windows with either the bypass
or non-bypass window depending on what the driver has requested.

Unfortunately an incorrect optimisation was made in
pnv_pci_ioda_dma_set_mask() which caused updating of some NPU device PEs
to be skipped in certain configurations due to an incorrect assumption
that a NULL peer PE in the array indicated there were no more peers
present. This patch fixes the problem by ensuring all peer PEs are
updated.

Fixes: 5d2aa710e697 ("powerpc/powernv: Add support for Nvlink NPUs")
Signed-off-by: Alistair Popple <alistair@popple.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 323e1e58da93..458133f3c020 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1612,7 +1612,10 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
 
 	/* Update peer npu devices */
 	if (pe->flags & PNV_IODA_PE_PEER)
-		for (i = 0; pe->peers[i]; i++) {
+		for (i = 0; i < PNV_IODA_MAX_PEER_PES; i++) {
+			if (!pe->peers[i])
+				continue;
+
 			linked_npu_dev = pe->peers[i]->pdev;
 			if (dma_get_mask(&linked_npu_dev->dev) != dma_mask)
 				dma_set_mask(&linked_npu_dev->dev, dma_mask);

From b521549a09ddfac3bed38e261168cda92d04ce81 Mon Sep 17 00:00:00 2001
From: Alistair Popple <alistair@popple.id.au>
Date: Mon, 11 Jan 2016 16:53:49 +1100
Subject: [PATCH 139/149] powerpc/powernv: Change NPU PE# assignment

The P8+ hardware supports four partitionable endpoints (PEs); however,
the hardware reports all errors as occurring on PE#0. This means we
need to reserve this PE for error handling (EEH) and not assign it to
an NPU device, implying that some devices will need to share PEs.

This patch changes the PE assignment for NPU devices such that NPU
devices which connect to the same GPU are assigned to the same
PE#.

Signed-off-by: Alistair Popple <alistair@popple.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 73 ++++++++++++++++++++---
 1 file changed, 66 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 458133f3c020..0b625272f3ca 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1074,16 +1074,75 @@ static void pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
 	pnv_ioda_link_pe_by_weight(phb, pe);
 }
 
-static void pnv_ioda_setup_dev_PEs(struct pci_bus *bus)
+static struct pnv_ioda_pe *pnv_ioda_setup_npu_PE(struct pci_dev *npu_pdev)
+{
+	int pe_num, found_pe = false, rc;
+	long rid;
+	struct pnv_ioda_pe *pe;
+	struct pci_dev *gpu_pdev;
+	struct pci_dn *npu_pdn;
+	struct pci_controller *hose = pci_bus_to_host(npu_pdev->bus);
+	struct pnv_phb *phb = hose->private_data;
+
+	/*
+	 * Due to a hardware errata PE#0 on the NPU is reserved for
+	 * error handling. This means we only have three PEs remaining
+	 * which need to be assigned to four links, implying some
+	 * links must share PEs.
+	 *
+	 * To achieve this we assign PEs such that NPUs linking the
+	 * same GPU get assigned the same PE.
+	 */
+	gpu_pdev = pnv_pci_get_gpu_dev(npu_pdev);
+	for (pe_num = 0; pe_num < phb->ioda.total_pe; pe_num++) {
+		pe = &phb->ioda.pe_array[pe_num];
+		if (!pe->pdev)
+			continue;
+
+		if (pnv_pci_get_gpu_dev(pe->pdev) == gpu_pdev) {
+			/*
+			 * This device has the same peer GPU so should
+			 * be assigned the same PE as the existing
+			 * peer NPU.
+			 */
+			dev_info(&npu_pdev->dev,
+				"Associating to existing PE %d\n", pe_num);
+			pci_dev_get(npu_pdev);
+			npu_pdn = pci_get_pdn(npu_pdev);
+			rid = npu_pdev->bus->number << 8 | npu_pdn->devfn;
+			npu_pdn->pcidev = npu_pdev;
+			npu_pdn->pe_number = pe_num;
+			pe->dma_weight += pnv_ioda_dma_weight(npu_pdev);
+			phb->ioda.pe_rmap[rid] = pe->pe_number;
+
+			/* Map the PE to this link */
+			rc = opal_pci_set_pe(phb->opal_id, pe_num, rid,
+					OpalPciBusAll,
+					OPAL_COMPARE_RID_DEVICE_NUMBER,
+					OPAL_COMPARE_RID_FUNCTION_NUMBER,
+					OPAL_MAP_PE);
+			WARN_ON(rc != OPAL_SUCCESS);
+			found_pe = true;
+			break;
+		}
+	}
+
+	if (!found_pe)
+		/*
+		 * Could not find an existing PE so allocate a new
+		 * one.
+		 */
+		return pnv_ioda_setup_dev_PE(npu_pdev);
+	else
+		return pe;
+}
+
+static void pnv_ioda_setup_npu_PEs(struct pci_bus *bus)
 {
-	struct pci_bus *child;
 	struct pci_dev *pdev;
 
 	list_for_each_entry(pdev, &bus->devices, bus_list)
-		pnv_ioda_setup_dev_PE(pdev);
-
-	list_for_each_entry(child, &bus->children, node)
-		pnv_ioda_setup_dev_PEs(child);
+		pnv_ioda_setup_npu_PE(pdev);
 }
 
 static void pnv_ioda_setup_PEs(struct pci_bus *bus)
@@ -1128,7 +1187,7 @@ static void pnv_pci_ioda_setup_PEs(void)
 		 * remaining types of PHBs.
 		 */
 		if (phb->type == PNV_PHB_NPU)
-			pnv_ioda_setup_dev_PEs(hose->bus);
+			pnv_ioda_setup_npu_PEs(hose->bus);
 		else
 			pnv_ioda_setup_PEs(hose->bus);
 	}

From 08f48f3234a79bca86c2283a166aec83bf52b265 Mon Sep 17 00:00:00 2001
From: Alistair Popple <alistair@popple.id.au>
Date: Mon, 11 Jan 2016 16:53:50 +1100
Subject: [PATCH 140/149] powerpc/powernv: Reserve PE#0 on NPU

P8+ hardware reports all errors on PE#0. This patch ensures PE#0 is
not assigned to NPU devices so that it can be used for EEH.

Signed-off-by: Alistair Popple <alistair@popple.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 0b625272f3ca..573ae1994097 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1186,9 +1186,11 @@ static void pnv_pci_ioda_setup_PEs(void)
 		 * functions. PCI bus dependent PEs are required for the
 		 * remaining types of PHBs.
 		 */
-		if (phb->type == PNV_PHB_NPU)
+		if (phb->type == PNV_PHB_NPU) {
+			/* PE#0 is needed for error reporting */
+			pnv_ioda_reserve_pe(phb, 0);
 			pnv_ioda_setup_npu_PEs(hose->bus);
-		else
+		} else
 			pnv_ioda_setup_PEs(hose->bus);
 	}
 }

From e708c24cd01ce80b1609d8baccee40ccc3608a01 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Mon, 11 Jan 2016 13:59:04 +1100
Subject: [PATCH 141/149] powerpc: Add HWCAP bits for Power9

In order to support Power9 we need two new HWCAP bits. We are merging
these ahead of the cputable entry so that glibc can start referring to
them.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/uapi/asm/cputable.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/powerpc/include/uapi/asm/cputable.h b/arch/powerpc/include/uapi/asm/cputable.h
index 43686043e297..8dde19962a5b 100644
--- a/arch/powerpc/include/uapi/asm/cputable.h
+++ b/arch/powerpc/include/uapi/asm/cputable.h
@@ -43,5 +43,7 @@
 #define PPC_FEATURE2_TAR		0x04000000
 #define PPC_FEATURE2_VEC_CRYPTO		0x02000000
 #define PPC_FEATURE2_HTM_NOSC		0x01000000
+#define PPC_FEATURE2_ARCH_3_00		0x00800000 /* ISA 3.00 */
+#define PPC_FEATURE2_HAS_IEEE128	0x00400000 /* VSX IEEE Binary Float 128-bit */
 
 #endif /* _UAPI__ASM_POWERPC_CPUTABLE_H */

From aa09545589ceeff884421d8eb38d04963190afbe Mon Sep 17 00:00:00 2001
From: Brian Norris <computersforpeace@gmail.com>
Date: Fri, 8 Jan 2016 10:30:09 -0800
Subject: [PATCH 142/149] cxl: fix build for GCC 4.6.x

GCC 4.6.3 does not support -Wno-unused-const-variable. Instead, use the
kbuild infrastructure that checks if this option exists.

Fixes: 2cd55c68c0a4 ("cxl: Fix build failure due to -Wunused-variable behaviour change")
Suggested-by: Michal Marek <mmarek@suse.com>
Suggested-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Brian Norris <computersforpeace@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 drivers/misc/cxl/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/misc/cxl/Makefile b/drivers/misc/cxl/Makefile
index 6982f603fadc..ab6f392d3504 100644
--- a/drivers/misc/cxl/Makefile
+++ b/drivers/misc/cxl/Makefile
@@ -1,4 +1,4 @@
-ccflags-y := -Werror -Wno-unused-const-variable
+ccflags-y := -Werror $(call cc-disable-warning, unused-const-variable)
 
 cxl-y				+= main.o file.o irq.o fault.o native.o
 cxl-y				+= context.o sysfs.o debugfs.o pci.o trace.o

From 57f7c3932516b9f7908d9b0a24396112d0f4ca55 Mon Sep 17 00:00:00 2001
From: Brian Norris <computersforpeace@gmail.com>
Date: Fri, 8 Jan 2016 10:30:10 -0800
Subject: [PATCH 143/149] cxl: use -Werror only with CONFIG_PPC_WERROR

Some developers really like to have -Werror enabled for their code, as
it helps to ensure warning free code. Others don't want -Werror, as it
(for example) can cause problems when newer (or older) compilers have
different sets of warnings, or new warnings can appear just when turning
up the warning level (e.g., make W=1 or W=2). Thus, it seems prudent to
have the use of -Werror be configurable.

It so happens that cxl is only built on PowerPC, and PowerPC already
has a nice set of Kconfig options for this, under CONFIG_PPC_WERROR. So
let's use that, and the world is a happy place again! (Note that
PPC_WERROR defaults to =y, so the common case compile should still be
enforcing -Werror.)

Fixes: d3d73f4b38a8 ("cxl: Compile with -Werror")
Signed-off-by: Brian Norris <computersforpeace@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 drivers/misc/cxl/Makefile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/misc/cxl/Makefile b/drivers/misc/cxl/Makefile
index ab6f392d3504..be2ac5ce349f 100644
--- a/drivers/misc/cxl/Makefile
+++ b/drivers/misc/cxl/Makefile
@@ -1,4 +1,5 @@
-ccflags-y := -Werror $(call cc-disable-warning, unused-const-variable)
+ccflags-y			:= $(call cc-disable-warning, unused-const-variable)
+ccflags-$(CONFIG_PPC_WERROR)	+= -Werror
 
 cxl-y				+= main.o file.o irq.o fault.o native.o
 cxl-y				+= context.o sysfs.o debugfs.o pci.o trace.o

From 68adb7bfd66504e97364651fb7dac3f9c8aa8561 Mon Sep 17 00:00:00 2001
From: Uma Krishnan <ukrishn@linux.vnet.ibm.com>
Date: Mon, 7 Dec 2015 16:03:32 -0600
Subject: [PATCH 144/149] cxl: Enable PCI device ID for future IBM CXL adapter

Add support for future IBM Coherent Accelerator (CXL) device
with ID of 0x0601.

Signed-off-by: Uma Krishnan <ukrishn@linux.vnet.ibm.com>
Reviewed-by: Matthew R. Ochs <mrochs@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 drivers/misc/cxl/pci.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c
index 85761d7eb333..4c1903f781fc 100644
--- a/drivers/misc/cxl/pci.c
+++ b/drivers/misc/cxl/pci.c
@@ -138,6 +138,7 @@ static const struct pci_device_id cxl_pci_tbl[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_IBM, 0x0477), },
 	{ PCI_DEVICE(PCI_VENDOR_ID_IBM, 0x044b), },
 	{ PCI_DEVICE(PCI_VENDOR_ID_IBM, 0x04cf), },
+	{ PCI_DEVICE(PCI_VENDOR_ID_IBM, 0x0601), },
 	{ PCI_DEVICE_CLASS(0x120000, ~0), },
 
 	{ }

From 44734f23de2465c3c0d39e4a16df7735b23fd142 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 11 Jan 2016 21:19:34 +0530
Subject: [PATCH 145/149] powerpc/mm: Fix _PAGE_PTE breaking swapoff

Core kernel expects swp_entry_t to consist of only swap type and swap
offset. We should not leak pte bits into swp_entry_t. This breaks
swapoff, which uses the swap type and offset to build a swp_entry_t and
later compare that to the swp_entry_t obtained from linux page table
pte. Leaking pte bits into swp_entry_t breaks that comparison and
results in us looping in try_to_unuse.

The stack trace can be anywhere below try_to_unuse() in mm/swapfile.c,
since swapoff is circling around and around that function, reading from
each used swap block into a page, then trying to find where that page
belongs, looking at every non-file pte of every mm that ever swapped.

Fixes: 6a119eae942c ("powerpc/mm: Add a _PAGE_PTE bit")
Reported-by: Hugh Dickins <hughd@google.com>
Suggested-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Acked-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/64/pgtable.h | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 03c1a5a21c0c..8e040c42e931 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -158,9 +158,14 @@ static inline void pgd_set(pgd_t *pgdp, unsigned long val)
 #define __swp_entry(type, offset)	((swp_entry_t) { \
 					((type) << _PAGE_BIT_SWAP_TYPE) \
 					| ((offset) << PTE_RPN_SHIFT) })
-
-#define __pte_to_swp_entry(pte)		((swp_entry_t) { pte_val((pte)) })
-#define __swp_entry_to_pte(x)		__pte((x).val)
+/*
+ * swp_entry_t must be independent of pte bits. We build a swp_entry_t from
+ * swap type and offset we get from swap and convert that to pte to find a
+ * matching pte in linux page table.
+ * Clear bits not found in swap entries here.
+ */
+#define __pte_to_swp_entry(pte)	((swp_entry_t) { pte_val((pte)) & ~_PAGE_PTE })
+#define __swp_entry_to_pte(x)	__pte((x).val | _PAGE_PTE)
 
 #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
 #define _PAGE_SWP_SOFT_DIRTY   (1UL << (SWP_TYPE_BITS + _PAGE_BIT_SWAP_TYPE))

From 2f10f1a7884e97a68e52c4b6f7866e29cf3fe7e6 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Sat, 9 Jan 2016 16:54:59 -0800
Subject: [PATCH 146/149] powerpc/mm: fix _PAGE_SWP_SOFT_DIRTY breaking swapoff

Swapoff after swapping hangs on the G5, when CONFIG_CHECKPOINT_RESTORE=y
but CONFIG_MEM_SOFT_DIRTY is not set.  That's because the non-zero
_PAGE_SWP_SOFT_DIRTY bit, added by CONFIG_HAVE_ARCH_SOFT_DIRTY=y, is not
discounted when CONFIG_MEM_SOFT_DIRTY is not set: so swap ptes cannot be
recognized.

(I suspect that the peculiar dependence of HAVE_ARCH_SOFT_DIRTY on
CHECKPOINT_RESTORE in arch/powerpc/Kconfig comes from an incomplete
attempt to solve this problem.)

It's true that the relationship between CONFIG_HAVE_ARCH_SOFT_DIRTY and
CONFIG_MEM_SOFT_DIRTY is too confusing, and it's true that swapoff
should be made more robust; but nevertheless, fix up the powerpc ifdefs
as x86_64 and s390 (which met the same problem) have them, defining the
bits as 0 if CONFIG_MEM_SOFT_DIRTY is not set.

Fixes: 7207f43665b8 ("powerpc/mm: Add page soft dirty tracking")
Signed-off-by: Hugh Dickins <hughd@google.com>
Reviewed-by: Cyrill Gorcunov <gorcunov@openvz.org>
Acked-by: Laurent Dufour <ldufour@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/64/hash.h    | 5 +++++
 arch/powerpc/include/asm/book3s/64/pgtable.h | 9 ++++++---
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h
index 9e861b4378bd..2ff8b3df553d 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -33,7 +33,12 @@
 #define _PAGE_F_GIX_SHIFT	12
 #define _PAGE_F_SECOND		0x08000 /* Whether to use secondary hash or not */
 #define _PAGE_SPECIAL		0x10000 /* software: special page */
+
+#ifdef CONFIG_MEM_SOFT_DIRTY
 #define _PAGE_SOFT_DIRTY	0x20000 /* software: software dirty tracking */
+#else
+#define _PAGE_SOFT_DIRTY	0x00000
+#endif
 
 /*
  * THP pages can't be special. So use the _PAGE_SPECIAL
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 8e040c42e931..b3a5badab69f 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -167,8 +167,13 @@ static inline void pgd_set(pgd_t *pgdp, unsigned long val)
 #define __pte_to_swp_entry(pte)	((swp_entry_t) { pte_val((pte)) & ~_PAGE_PTE })
 #define __swp_entry_to_pte(x)	__pte((x).val | _PAGE_PTE)
 
-#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
+#ifdef CONFIG_MEM_SOFT_DIRTY
 #define _PAGE_SWP_SOFT_DIRTY   (1UL << (SWP_TYPE_BITS + _PAGE_BIT_SWAP_TYPE))
+#else
+#define _PAGE_SWP_SOFT_DIRTY	0UL
+#endif /* CONFIG_MEM_SOFT_DIRTY */
+
+#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
 static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
 {
 	return __pte(pte_val(pte) | _PAGE_SWP_SOFT_DIRTY);
@@ -181,8 +186,6 @@ static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
 {
 	return __pte(pte_val(pte) & ~_PAGE_SWP_SOFT_DIRTY);
 }
-#else
-#define _PAGE_SWP_SOFT_DIRTY	0
 #endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */
 
 void pgtable_cache_add(unsigned shift, void (*ctor)(void *));

From c88c5d43732a0356f99e5e4d1ad62ab1ea516b81 Mon Sep 17 00:00:00 2001
From: Russell Currey <ruscur@russell.cc>
Date: Wed, 13 Jan 2016 12:04:32 +1100
Subject: [PATCH 147/149] powerpc/powernv: Fix OPAL_CONSOLE_FLUSH prototype and
 usages

The recently added OPAL API call, OPAL_CONSOLE_FLUSH, originally took no
parameters and returned nothing.  The call was updated to accept the
terminal number to flush, and returned various values depending on the
state of the output buffer.

The prototype has been updated and its usage in the OPAL kmsg dumper has
been modified to support its new behaviour as an incremental flush.

Signed-off-by: Russell Currey <ruscur@russell.cc>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/opal.h            | 2 +-
 arch/powerpc/platforms/powernv/opal-kmsg.c | 9 ++++++++-
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index a5fd407213b6..07a99e638449 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -35,7 +35,7 @@ int64_t opal_console_read(int64_t term_number, __be64 *length,
 			  uint8_t *buffer);
 int64_t opal_console_write_buffer_space(int64_t term_number,
 					__be64 *length);
-void opal_console_flush(void);
+int64_t opal_console_flush(int64_t term_number);
 int64_t opal_rtc_read(__be32 *year_month_day,
 		      __be64 *hour_minute_second_millisecond);
 int64_t opal_rtc_write(uint32_t year_month_day,
diff --git a/arch/powerpc/platforms/powernv/opal-kmsg.c b/arch/powerpc/platforms/powernv/opal-kmsg.c
index bd3b2ee1ba1d..6f1214d4de92 100644
--- a/arch/powerpc/platforms/powernv/opal-kmsg.c
+++ b/arch/powerpc/platforms/powernv/opal-kmsg.c
@@ -27,6 +27,7 @@ static void force_opal_console_flush(struct kmsg_dumper *dumper,
 				     enum kmsg_dump_reason reason)
 {
 	int i;
+	int64_t ret;
 
 	/*
 	 * Outside of a panic context the pollers will continue to run,
@@ -36,7 +37,13 @@ static void force_opal_console_flush(struct kmsg_dumper *dumper,
 		return;
 
 	if (opal_check_token(OPAL_CONSOLE_FLUSH)) {
-		opal_console_flush();
+		ret = opal_console_flush(0);
+
+		if (ret == OPAL_UNSUPPORTED || ret == OPAL_PARAMETER)
+			return;
+
+		/* Incrementally flush until there's nothing left */
+		while (opal_console_flush(0) != OPAL_SUCCESS);
 	} else {
 		/*
 		 * If OPAL_CONSOLE_FLUSH is not implemented in the firmware,

From 2e50c4bef77511b42cc226865d6bc568fa7f8769 Mon Sep 17 00:00:00 2001
From: Ulrich Weigand <ulrich.weigand@de.ibm.com>
Date: Tue, 12 Jan 2016 23:14:22 +1100
Subject: [PATCH 148/149] scripts/recordmcount.pl: support data in text section
 on powerpc

If a text section starts out with a data blob before the first
function start label, disassembly parsing done in recordmcount.pl
gets confused on powerpc, leading to creation of corrupted module
objects.

This was not a problem so far since the compiler would never create
such text sections.  However, this has changed with a recent change
in GCC 6 to support distances of > 2GB between a function and its
associated TOC in the ELFv2 ABI, exposing this problem.

There is already code in recordmcount.pl to handle such data blobs
on the sparc64 platform.  This patch uses the same method to handle
those on powerpc as well.

Cc: stable@vger.kernel.org
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ulrich Weigand <ulrich.weigand@de.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 scripts/recordmcount.pl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl
index 826470d7f000..96e2486a6fc4 100755
--- a/scripts/recordmcount.pl
+++ b/scripts/recordmcount.pl
@@ -263,7 +263,8 @@ if ($arch eq "x86_64") {
 
 } elsif ($arch eq "powerpc") {
     $local_regex = "^[0-9a-fA-F]+\\s+t\\s+(\\.?\\S+)";
-    $function_regex = "^([0-9a-fA-F]+)\\s+<(\\.?.*?)>:";
+    # See comment in the sparc64 section for why we use '\w'.
+    $function_regex = "^([0-9a-fA-F]+)\\s+<(\\.?\\w*?)>:";
     $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\s\\.?_mcount\$";
 
     if ($bits == 64) {

From a61674bdfc7c2bf909c4010699607b62b69b7bec Mon Sep 17 00:00:00 2001
From: Ulrich Weigand <ulrich.weigand@de.ibm.com>
Date: Tue, 12 Jan 2016 23:14:23 +1100
Subject: [PATCH 149/149] powerpc/module: Handle R_PPC64_ENTRY relocations

GCC 6 will include changes to generated code with -mcmodel=large,
which is used to build kernel modules on powerpc64le.  This was
necessary because the large model is supposed to allow arbitrary
sizes and locations of the code and data sections, but the ELFv2
global entry point prolog still made the unconditional assumption
that the TOC associated with any particular function can be found
within 2 GB of the function entry point:

func:
	addis r2,r12,(.TOC.-func)@ha
	addi  r2,r2,(.TOC.-func)@l
	.localentry func, .-func

To remove this assumption, GCC will now generate instead this global
entry point prolog sequence when using -mcmodel=large:

	.quad .TOC.-func
func:
	.reloc ., R_PPC64_ENTRY
	ld    r2, -8(r12)
	add   r2, r2, r12
	.localentry func, .-func

The new .reloc triggers an optimization in the linker that will
replace this new prolog with the original code (see above) if the
linker determines that the distance between .TOC. and func is in
range after all.

Since this new relocation is now present in module object files,
the kernel module loader is required to handle it too.  This
patch adds support for the new relocation and implements the
same optimization done by the GNU linker.

Cc: stable@vger.kernel.org
Signed-off-by: Ulrich Weigand <ulrich.weigand@de.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/uapi/asm/elf.h |  2 ++
 arch/powerpc/kernel/module_64.c     | 27 +++++++++++++++++++++++++++
 2 files changed, 29 insertions(+)

diff --git a/arch/powerpc/include/uapi/asm/elf.h b/arch/powerpc/include/uapi/asm/elf.h
index 59dad113897b..c2d21d11c2d2 100644
--- a/arch/powerpc/include/uapi/asm/elf.h
+++ b/arch/powerpc/include/uapi/asm/elf.h
@@ -295,6 +295,8 @@ do {									\
 #define R_PPC64_TLSLD		108
 #define R_PPC64_TOCSAVE		109
 
+#define R_PPC64_ENTRY		118
+
 #define R_PPC64_REL16		249
 #define R_PPC64_REL16_LO	250
 #define R_PPC64_REL16_HI	251
diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c
index 68384514506b..59663af9315f 100644
--- a/arch/powerpc/kernel/module_64.c
+++ b/arch/powerpc/kernel/module_64.c
@@ -635,6 +635,33 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
 			 */
 			break;
 
+		case R_PPC64_ENTRY:
+			/*
+			 * Optimize ELFv2 large code model entry point if
+			 * the TOC is within 2GB range of current location.
+			 */
+			value = my_r2(sechdrs, me) - (unsigned long)location;
+			if (value + 0x80008000 > 0xffffffff)
+				break;
+			/*
+			 * Check for the large code model prolog sequence:
+		         *	ld r2, ...(r12)
+			 *	add r2, r2, r12
+			 */
+			if ((((uint32_t *)location)[0] & ~0xfffc)
+			    != 0xe84c0000)
+				break;
+			if (((uint32_t *)location)[1] != 0x7c426214)
+				break;
+			/*
+			 * If found, replace it with:
+			 *	addis r2, r12, (.TOC.-func)@ha
+			 *	addi r2, r2, (.TOC.-func)@l
+			 */
+			((uint32_t *)location)[0] = 0x3c4c0000 + PPC_HA(value);
+			((uint32_t *)location)[1] = 0x38420000 + PPC_LO(value);
+			break;
+
 		case R_PPC64_REL16_HA:
 			/* Subtract location pointer */
 			value -= (unsigned long)location;