2019-05-27 14:55:05 +08:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-or-later
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Copyright (C) 2001 Mike Corrigan & Dave Engebretsen, IBM Corporation
|
|
|
|
*
|
2006-04-29 11:51:59 +08:00
|
|
|
* Rewrite, cleanup:
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
2005-11-21 16:12:32 +08:00
|
|
|
* Copyright (C) 2004 Olof Johansson <olof@lixom.net>, IBM Corporation
|
2006-04-29 11:51:59 +08:00
|
|
|
* Copyright (C) 2006 Olof Johansson <olof@lixom.net>
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
|
|
|
* Dynamic DMA mapping support, pSeries-specific parts, both SMP and LPAR.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/init.h>
|
|
|
|
#include <linux/types.h>
|
|
|
|
#include <linux/slab.h>
|
|
|
|
#include <linux/mm.h>
|
2012-07-26 05:20:03 +08:00
|
|
|
#include <linux/memblock.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <linux/spinlock.h>
|
|
|
|
#include <linux/string.h>
|
|
|
|
#include <linux/pci.h>
|
|
|
|
#include <linux/dma-mapping.h>
|
2008-10-23 04:39:04 +08:00
|
|
|
#include <linux/crash_dump.h>
|
2011-02-10 17:10:47 +08:00
|
|
|
#include <linux/memory.h>
|
2012-10-03 00:57:57 +08:00
|
|
|
#include <linux/of.h>
|
2015-06-05 14:34:56 +08:00
|
|
|
#include <linux/iommu.h>
|
2015-06-05 14:35:09 +08:00
|
|
|
#include <linux/rculist.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <asm/io.h>
|
|
|
|
#include <asm/prom.h>
|
|
|
|
#include <asm/rtas.h>
|
|
|
|
#include <asm/iommu.h>
|
|
|
|
#include <asm/pci-bridge.h>
|
|
|
|
#include <asm/machdep.h>
|
2005-08-03 12:35:25 +08:00
|
|
|
#include <asm/firmware.h>
|
2005-09-20 11:45:41 +08:00
|
|
|
#include <asm/tce.h>
|
2005-09-28 00:50:25 +08:00
|
|
|
#include <asm/ppc-pci.h>
|
2005-11-07 10:18:13 +08:00
|
|
|
#include <asm/udbg.h>
|
2011-02-10 17:10:47 +08:00
|
|
|
#include <asm/mmzone.h>
|
2013-08-22 17:53:52 +08:00
|
|
|
#include <asm/plpar_wrappers.h>
|
2005-11-03 12:33:31 +08:00
|
|
|
|
2015-03-31 13:00:50 +08:00
|
|
|
#include "pseries.h"
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2020-08-05 11:04:52 +08:00
|
|
|
/*
 * Named indices for dynamic DMA window (DDW) operations.
 * DDW_APPLICABLE_SIZE follows the last operation index and therefore
 * equals the number of entries enumerated here.
 */
enum {
	DDW_QUERY_PE_DMA_WIN  = 0,
	DDW_CREATE_PE_DMA_WIN = 1,
	DDW_REMOVE_PE_DMA_WIN = 2,

	DDW_APPLICABLE_SIZE
};
|
|
|
|
|
powerpc/pseries/iommu: Update call to ibm,query-pe-dma-windows
From LoPAR level 2.8, "ibm,ddw-extensions" index 3 can make the number of
outputs from "ibm,query-pe-dma-windows" go from 5 to 6.
This change of output size is meant to expand the address size of
largest_available_block PE TCE from 32-bit to 64-bit, which ends up
shifting page_size and migration_capable.
This requires changing
ddw_query_response->largest_available_block from u32 to u64, and manually
assigning the values from the buffer into this struct, according to
output size.
Also, a routine was created to help read the ddw extensions as
suggested by LoPAR: first reading the size of the extension array from
index 0, checking if the property exists, and then returning its value.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Tested-by: David Dai <zdai@linux.vnet.ibm.com>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200805030455.123024-3-leobras.c@gmail.com
2020-08-05 11:04:53 +08:00
|
|
|
/*
 * Named indices for DDW extensions. DDW_EXT_SIZE is index 0; the
 * remaining entries name individual extensions.
 */
enum {
	DDW_EXT_SIZE = 0,
	DDW_EXT_RESET_DMA_WIN = 1,
	DDW_EXT_QUERY_OUT_SIZE = 2
};
|
|
|
|
|
2021-08-17 14:39:21 +08:00
|
|
|
static struct iommu_table *iommu_pseries_alloc_table(int node)
|
2015-06-05 14:35:08 +08:00
|
|
|
{
|
2017-10-19 02:48:52 +08:00
|
|
|
struct iommu_table *tbl;
|
2015-06-05 14:35:08 +08:00
|
|
|
|
|
|
|
tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, node);
|
|
|
|
if (!tbl)
|
2021-08-17 14:39:21 +08:00
|
|
|
return NULL;
|
2015-06-05 14:35:08 +08:00
|
|
|
|
2015-06-05 14:35:09 +08:00
|
|
|
INIT_LIST_HEAD_RCU(&tbl->it_group_list);
|
2017-03-22 12:21:50 +08:00
|
|
|
kref_init(&tbl->it_kref);
|
2021-08-17 14:39:21 +08:00
|
|
|
return tbl;
|
|
|
|
}
|
2015-06-05 14:35:09 +08:00
|
|
|
|
2021-08-17 14:39:21 +08:00
|
|
|
static struct iommu_table_group *iommu_pseries_alloc_group(int node)
|
|
|
|
{
|
|
|
|
struct iommu_table_group *table_group;
|
|
|
|
|
|
|
|
table_group = kzalloc_node(sizeof(*table_group), GFP_KERNEL, node);
|
|
|
|
if (!table_group)
|
|
|
|
return NULL;
|
2015-06-05 14:35:08 +08:00
|
|
|
|
2021-08-17 14:39:21 +08:00
|
|
|
table_group->tables[0] = iommu_pseries_alloc_table(node);
|
|
|
|
if (table_group->tables[0])
|
|
|
|
return table_group;
|
2015-06-05 14:35:08 +08:00
|
|
|
|
2017-10-19 02:48:52 +08:00
|
|
|
kfree(table_group);
|
2015-06-05 14:35:08 +08:00
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void iommu_pseries_free_group(struct iommu_table_group *table_group,
|
2015-06-05 14:34:56 +08:00
|
|
|
const char *node_name)
|
|
|
|
{
|
2015-06-05 14:35:08 +08:00
|
|
|
struct iommu_table *tbl;
|
|
|
|
|
|
|
|
if (!table_group)
|
|
|
|
return;
|
|
|
|
|
2015-06-05 14:35:09 +08:00
|
|
|
tbl = table_group->tables[0];
|
2015-06-05 14:34:56 +08:00
|
|
|
#ifdef CONFIG_IOMMU_API
|
2015-06-05 14:35:08 +08:00
|
|
|
if (table_group->group) {
|
|
|
|
iommu_group_put(table_group->group);
|
|
|
|
BUG_ON(table_group->group);
|
2015-06-05 14:34:56 +08:00
|
|
|
}
|
|
|
|
#endif
|
2017-03-22 12:21:50 +08:00
|
|
|
iommu_tce_table_put(tbl);
|
2015-06-05 14:35:08 +08:00
|
|
|
|
|
|
|
kfree(table_group);
|
2015-06-05 14:34:56 +08:00
|
|
|
}
|
|
|
|
|
2008-07-24 02:31:16 +08:00
|
|
|
static int tce_build_pSeries(struct iommu_table *tbl, long index,
|
2006-04-29 11:51:59 +08:00
|
|
|
long npages, unsigned long uaddr,
|
2008-07-16 03:51:47 +08:00
|
|
|
enum dma_data_direction direction,
|
2016-08-04 04:46:00 +08:00
|
|
|
unsigned long attrs)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2006-04-29 11:51:59 +08:00
|
|
|
u64 proto_tce;
|
2019-04-07 10:48:08 +08:00
|
|
|
__be64 *tcep;
|
2006-04-29 11:51:59 +08:00
|
|
|
u64 rpn;
|
powerpc/pseries/iommu: Replace hard-coded page shift
Some functions assume IOMMU page size can only be 4K (pageshift == 12).
Update them to accept any page size passed, so we can use 64K pages.
In the process, some defines like TCE_SHIFT were made obsolete, and then
removed.
IODA3 Revision 3.0_prd1 (OpenPowerFoundation), Figures 3.4 and 3.5 show
a RPN of 52-bit, and considers a 12-bit pageshift, so there should be
no need of using TCE_RPN_MASK, which masks out any bit after 40 in rpn.
It's usage removed from tce_build_pSeries(), tce_build_pSeriesLP(), and
tce_buildmulti_pSeriesLP().
Most places had a tbl struct, so using tbl->it_page_shift was simple.
tce_free_pSeriesLP() was a special case, since callers not always have a
tbl struct, so adding a tceshift parameter seems the right thing to do.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: Frederic Barrat <fbarrat@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210817063929.38701-2-leobras.c@gmail.com
2021-08-17 14:39:19 +08:00
|
|
|
const unsigned long tceshift = tbl->it_page_shift;
|
|
|
|
const unsigned long pagesize = IOMMU_PAGE_SIZE(tbl);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2006-04-29 11:51:59 +08:00
|
|
|
proto_tce = TCE_PCI_READ; // Read allowed
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
if (direction != DMA_TO_DEVICE)
|
2006-04-29 11:51:59 +08:00
|
|
|
proto_tce |= TCE_PCI_WRITE;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2019-04-07 10:48:08 +08:00
|
|
|
tcep = ((__be64 *)tbl->it_base) + index;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
while (npages--) {
|
2010-07-12 12:36:09 +08:00
|
|
|
/* can't move this out since we might cross MEMBLOCK boundary */
|
powerpc/pseries/iommu: Replace hard-coded page shift
Some functions assume IOMMU page size can only be 4K (pageshift == 12).
Update them to accept any page size passed, so we can use 64K pages.
In the process, some defines like TCE_SHIFT were made obsolete, and then
removed.
IODA3 Revision 3.0_prd1 (OpenPowerFoundation), Figures 3.4 and 3.5 show
a RPN of 52-bit, and considers a 12-bit pageshift, so there should be
no need of using TCE_RPN_MASK, which masks out any bit after 40 in rpn.
It's usage removed from tce_build_pSeries(), tce_build_pSeriesLP(), and
tce_buildmulti_pSeriesLP().
Most places had a tbl struct, so using tbl->it_page_shift was simple.
tce_free_pSeriesLP() was a special case, since callers not always have a
tbl struct, so adding a tceshift parameter seems the right thing to do.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: Frederic Barrat <fbarrat@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210817063929.38701-2-leobras.c@gmail.com
2021-08-17 14:39:19 +08:00
|
|
|
rpn = __pa(uaddr) >> tceshift;
|
|
|
|
*tcep = cpu_to_be64(proto_tce | rpn << tceshift);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
powerpc/pseries/iommu: Replace hard-coded page shift
Some functions assume IOMMU page size can only be 4K (pageshift == 12).
Update them to accept any page size passed, so we can use 64K pages.
In the process, some defines like TCE_SHIFT were made obsolete, and then
removed.
IODA3 Revision 3.0_prd1 (OpenPowerFoundation), Figures 3.4 and 3.5 show
a RPN of 52-bit, and considers a 12-bit pageshift, so there should be
no need of using TCE_RPN_MASK, which masks out any bit after 40 in rpn.
It's usage removed from tce_build_pSeries(), tce_build_pSeriesLP(), and
tce_buildmulti_pSeriesLP().
Most places had a tbl struct, so using tbl->it_page_shift was simple.
tce_free_pSeriesLP() was a special case, since callers not always have a
tbl struct, so adding a tceshift parameter seems the right thing to do.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: Frederic Barrat <fbarrat@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210817063929.38701-2-leobras.c@gmail.com
2021-08-17 14:39:19 +08:00
|
|
|
uaddr += pagesize;
|
2006-04-29 11:51:59 +08:00
|
|
|
tcep++;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2008-07-24 02:31:16 +08:00
|
|
|
return 0;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
static void tce_free_pSeries(struct iommu_table *tbl, long index, long npages)
|
|
|
|
{
|
2019-04-07 10:48:08 +08:00
|
|
|
__be64 *tcep;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2019-04-07 10:48:08 +08:00
|
|
|
tcep = ((__be64 *)tbl->it_base) + index;
|
2006-04-29 11:51:59 +08:00
|
|
|
|
|
|
|
while (npages--)
|
|
|
|
*(tcep++) = 0;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2006-06-23 14:35:10 +08:00
|
|
|
static unsigned long tce_get_pseries(struct iommu_table *tbl, long index)
|
|
|
|
{
|
2013-10-17 20:21:15 +08:00
|
|
|
__be64 *tcep;
|
2006-06-23 14:35:10 +08:00
|
|
|
|
2013-10-17 20:21:15 +08:00
|
|
|
tcep = ((__be64 *)tbl->it_base) + index;
|
2006-06-23 14:35:10 +08:00
|
|
|
|
2013-10-17 20:21:15 +08:00
|
|
|
return be64_to_cpu(*tcep);
|
2006-06-23 14:35:10 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
powerpc/pseries/iommu: Replace hard-coded page shift
Some functions assume IOMMU page size can only be 4K (pageshift == 12).
Update them to accept any page size passed, so we can use 64K pages.
In the process, some defines like TCE_SHIFT were made obsolete, and then
removed.
IODA3 Revision 3.0_prd1 (OpenPowerFoundation), Figures 3.4 and 3.5 show
an RPN of 52 bits and assume a 12-bit page shift, so there should be
no need to use TCE_RPN_MASK, which masks out any bit after 40 in rpn.
Its usage was removed from tce_build_pSeries(), tce_build_pSeriesLP(), and
tce_buildmulti_pSeriesLP().
Most places had a tbl struct, so using tbl->it_page_shift was simple.
tce_free_pSeriesLP() was a special case, since callers not always have a
tbl struct, so adding a tceshift parameter seems the right thing to do.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: Frederic Barrat <fbarrat@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210817063929.38701-2-leobras.c@gmail.com
2021-08-17 14:39:19 +08:00
|
|
|
/* Forward declarations: both are used before their definitions below. */
static void tce_free_pSeriesLP(unsigned long liobn, long tcenum, long tceshift,
			       long npages);
static void tce_freemulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
				    long npages);
|
|
|
|
|
2019-12-16 12:19:22 +08:00
|
|
|
static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift,
|
2005-04-17 06:20:36 +08:00
|
|
|
long npages, unsigned long uaddr,
|
2008-07-16 03:51:47 +08:00
|
|
|
enum dma_data_direction direction,
|
2016-08-04 04:46:00 +08:00
|
|
|
unsigned long attrs)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2008-07-24 02:31:16 +08:00
|
|
|
u64 rc = 0;
|
2006-04-29 11:51:59 +08:00
|
|
|
u64 proto_tce, tce;
|
|
|
|
u64 rpn;
|
2008-07-24 02:31:16 +08:00
|
|
|
int ret = 0;
|
|
|
|
long tcenum_start = tcenum, npages_start = npages;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2019-12-16 12:19:22 +08:00
|
|
|
rpn = __pa(uaddr) >> tceshift;
|
2006-04-29 11:51:59 +08:00
|
|
|
proto_tce = TCE_PCI_READ;
|
2005-04-17 06:20:36 +08:00
|
|
|
if (direction != DMA_TO_DEVICE)
|
2006-04-29 11:51:59 +08:00
|
|
|
proto_tce |= TCE_PCI_WRITE;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
while (npages--) {
|
powerpc/pseries/iommu: Replace hard-coded page shift
Some functions assume IOMMU page size can only be 4K (pageshift == 12).
Update them to accept any page size passed, so we can use 64K pages.
In the process, some defines like TCE_SHIFT were made obsolete, and then
removed.
IODA3 Revision 3.0_prd1 (OpenPowerFoundation), Figures 3.4 and 3.5 show
a RPN of 52-bit, and considers a 12-bit pageshift, so there should be
no need of using TCE_RPN_MASK, which masks out any bit after 40 in rpn.
It's usage removed from tce_build_pSeries(), tce_build_pSeriesLP(), and
tce_buildmulti_pSeriesLP().
Most places had a tbl struct, so using tbl->it_page_shift was simple.
tce_free_pSeriesLP() was a special case, since callers not always have a
tbl struct, so adding a tceshift parameter seems the right thing to do.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: Frederic Barrat <fbarrat@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210817063929.38701-2-leobras.c@gmail.com
2021-08-17 14:39:19 +08:00
|
|
|
tce = proto_tce | rpn << tceshift;
|
2019-12-16 12:19:22 +08:00
|
|
|
rc = plpar_tce_put((u64)liobn, (u64)tcenum << tceshift, tce);
|
2006-04-29 11:51:59 +08:00
|
|
|
|
2008-07-24 02:31:16 +08:00
|
|
|
if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {
|
|
|
|
ret = (int)rc;
|
powerpc/pseries/iommu: Replace hard-coded page shift
Some functions assume IOMMU page size can only be 4K (pageshift == 12).
Update them to accept any page size passed, so we can use 64K pages.
In the process, some defines like TCE_SHIFT were made obsolete, and then
removed.
IODA3 Revision 3.0_prd1 (OpenPowerFoundation), Figures 3.4 and 3.5 show
a RPN of 52-bit, and considers a 12-bit pageshift, so there should be
no need of using TCE_RPN_MASK, which masks out any bit after 40 in rpn.
It's usage removed from tce_build_pSeries(), tce_build_pSeriesLP(), and
tce_buildmulti_pSeriesLP().
Most places had a tbl struct, so using tbl->it_page_shift was simple.
tce_free_pSeriesLP() was a special case, since callers not always have a
tbl struct, so adding a tceshift parameter seems the right thing to do.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: Frederic Barrat <fbarrat@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210817063929.38701-2-leobras.c@gmail.com
2021-08-17 14:39:19 +08:00
|
|
|
tce_free_pSeriesLP(liobn, tcenum_start, tceshift,
|
2008-07-24 02:31:16 +08:00
|
|
|
(npages_start - (npages + 1)));
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
if (rc && printk_ratelimit()) {
|
2009-01-06 22:26:03 +08:00
|
|
|
printk("tce_build_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc);
|
2019-12-16 12:19:22 +08:00
|
|
|
printk("\tindex = 0x%llx\n", (u64)liobn);
|
2009-01-06 22:26:03 +08:00
|
|
|
printk("\ttcenum = 0x%llx\n", (u64)tcenum);
|
|
|
|
printk("\ttce val = 0x%llx\n", tce );
|
2014-10-13 16:41:40 +08:00
|
|
|
dump_stack();
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2006-04-29 11:51:59 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
tcenum++;
|
2006-04-29 11:51:59 +08:00
|
|
|
rpn++;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2008-07-24 02:31:16 +08:00
|
|
|
return ret;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2013-10-17 20:21:15 +08:00
|
|
|
/*
 * Per-CPU scratch page in which tce_buildmulti_pSeriesLP() stages TCEs
 * before handing them to plpar_tce_put_indirect(). It is allocated
 * lazily on first use by that function.
 */
static DEFINE_PER_CPU(__be64 *, tce_page);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-07-24 02:31:16 +08:00
|
|
|
static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
|
2005-04-17 06:20:36 +08:00
|
|
|
long npages, unsigned long uaddr,
|
2008-07-16 03:51:47 +08:00
|
|
|
enum dma_data_direction direction,
|
2016-08-04 04:46:00 +08:00
|
|
|
unsigned long attrs)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2008-07-24 02:31:16 +08:00
|
|
|
u64 rc = 0;
|
2006-04-29 11:51:59 +08:00
|
|
|
u64 proto_tce;
|
2013-10-17 20:21:15 +08:00
|
|
|
__be64 *tcep;
|
2006-04-29 11:51:59 +08:00
|
|
|
u64 rpn;
|
2005-04-17 06:20:36 +08:00
|
|
|
long l, limit;
|
2008-07-24 02:31:16 +08:00
|
|
|
long tcenum_start = tcenum, npages_start = npages;
|
|
|
|
int ret = 0;
|
2012-06-04 03:42:13 +08:00
|
|
|
unsigned long flags;
|
powerpc/pseries/iommu: Replace hard-coded page shift
Some functions assume IOMMU page size can only be 4K (pageshift == 12).
Update them to accept any page size passed, so we can use 64K pages.
In the process, some defines like TCE_SHIFT were made obsolete, and then
removed.
IODA3 Revision 3.0_prd1 (OpenPowerFoundation), Figures 3.4 and 3.5 show
a RPN of 52-bit, and considers a 12-bit pageshift, so there should be
no need of using TCE_RPN_MASK, which masks out any bit after 40 in rpn.
It's usage removed from tce_build_pSeries(), tce_build_pSeriesLP(), and
tce_buildmulti_pSeriesLP().
Most places had a tbl struct, so using tbl->it_page_shift was simple.
tce_free_pSeriesLP() was a special case, since callers not always have a
tbl struct, so adding a tceshift parameter seems the right thing to do.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: Frederic Barrat <fbarrat@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210817063929.38701-2-leobras.c@gmail.com
2021-08-17 14:39:19 +08:00
|
|
|
const unsigned long tceshift = tbl->it_page_shift;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2019-12-16 12:19:23 +08:00
|
|
|
if ((npages == 1) || !firmware_has_feature(FW_FEATURE_PUT_TCE_IND)) {
|
2019-12-16 12:19:22 +08:00
|
|
|
return tce_build_pSeriesLP(tbl->it_index, tcenum,
|
powerpc/pseries/iommu: Replace hard-coded page shift
Some functions assume IOMMU page size can only be 4K (pageshift == 12).
Update them to accept any page size passed, so we can use 64K pages.
In the process, some defines like TCE_SHIFT were made obsolete, and then
removed.
IODA3 Revision 3.0_prd1 (OpenPowerFoundation), Figures 3.4 and 3.5 show
a RPN of 52-bit, and considers a 12-bit pageshift, so there should be
no need of using TCE_RPN_MASK, which masks out any bit after 40 in rpn.
It's usage removed from tce_build_pSeries(), tce_build_pSeriesLP(), and
tce_buildmulti_pSeriesLP().
Most places had a tbl struct, so using tbl->it_page_shift was simple.
tce_free_pSeriesLP() was a special case, since callers not always have a
tbl struct, so adding a tceshift parameter seems the right thing to do.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: Frederic Barrat <fbarrat@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210817063929.38701-2-leobras.c@gmail.com
2021-08-17 14:39:19 +08:00
|
|
|
tceshift, npages, uaddr,
|
2008-07-24 02:31:16 +08:00
|
|
|
direction, attrs);
|
2008-05-08 12:27:23 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2012-06-04 03:42:13 +08:00
|
|
|
local_irq_save(flags); /* to protect tcep and the page behind it */
|
|
|
|
|
powerpc: Replace __get_cpu_var uses
This still has not been merged and now powerpc is the only arch that does
not have this change. Sorry about missing linuxppc-dev before.
V2->V2
- Fix up to work against 3.18-rc1
__get_cpu_var() is used for multiple purposes in the kernel source. One of
them is address calculation via the form &__get_cpu_var(x). This calculates
the address for the instance of the percpu variable of the current processor
based on an offset.
Other use cases are for storing and retrieving data from the current
processors percpu area. __get_cpu_var() can be used as an lvalue when
writing data or on the right side of an assignment.
__get_cpu_var() is defined as :
__get_cpu_var() always only does an address determination. However, store
and retrieve operations could use a segment prefix (or global register on
other platforms) to avoid the address calculation.
this_cpu_write() and this_cpu_read() can directly take an offset into a
percpu area and use optimized assembly code to read and write per cpu
variables.
This patch converts __get_cpu_var into either an explicit address
calculation using this_cpu_ptr() or into a use of this_cpu operations that
use the offset. Thereby address calculations are avoided and less registers
are used when code is generated.
At the end of the patch set all uses of __get_cpu_var have been removed so
the macro is removed too.
The patch set includes passes over all arches as well. Once these operations
are used throughout then specialized macros can be defined in non -x86
arches as well in order to optimize per cpu access by f.e. using a global
register that may be set to the per cpu base.
Transformations done to __get_cpu_var()
1. Determine the address of the percpu instance of the current processor.
DEFINE_PER_CPU(int, y);
int *x = &__get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(&y);
2. Same as #1 but this time an array structure is involved.
DEFINE_PER_CPU(int, y[20]);
int *x = __get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(y);
3. Retrieve the content of the current processors instance of a per cpu
variable.
DEFINE_PER_CPU(int, y);
int x = __get_cpu_var(y)
Converts to
int x = __this_cpu_read(y);
4. Retrieve the content of a percpu struct
DEFINE_PER_CPU(struct mystruct, y);
struct mystruct x = __get_cpu_var(y);
Converts to
memcpy(&x, this_cpu_ptr(&y), sizeof(x));
5. Assignment to a per cpu variable
DEFINE_PER_CPU(int, y)
__get_cpu_var(y) = x;
Converts to
__this_cpu_write(y, x);
6. Increment/Decrement etc of a per cpu variable
DEFINE_PER_CPU(int, y);
__get_cpu_var(y)++
Converts to
__this_cpu_inc(y)
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
CC: Paul Mackerras <paulus@samba.org>
Signed-off-by: Christoph Lameter <cl@linux.com>
[mpe: Fix build errors caused by set/or_softirq_pending(), and rework
assignment in __set_breakpoint() to use memcpy().]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2014-10-22 04:23:25 +08:00
|
|
|
tcep = __this_cpu_read(tce_page);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* This is safe to do since interrupts are off when we're called
|
|
|
|
* from iommu_alloc{,_sg}()
|
|
|
|
*/
|
|
|
|
if (!tcep) {
|
2013-10-17 20:21:15 +08:00
|
|
|
tcep = (__be64 *)__get_free_page(GFP_ATOMIC);
|
2005-04-17 06:20:36 +08:00
|
|
|
/* If allocation fails, fall back to the loop implementation */
|
2008-05-08 12:27:23 +08:00
|
|
|
if (!tcep) {
|
2012-06-04 03:42:13 +08:00
|
|
|
local_irq_restore(flags);
|
2019-12-16 12:19:22 +08:00
|
|
|
return tce_build_pSeriesLP(tbl->it_index, tcenum,
|
powerpc/pseries/iommu: Replace hard-coded page shift
Some functions assume IOMMU page size can only be 4K (pageshift == 12).
Update them to accept any page size passed, so we can use 64K pages.
In the process, some defines like TCE_SHIFT were made obsolete, and then
removed.
IODA3 Revision 3.0_prd1 (OpenPowerFoundation), Figures 3.4 and 3.5 show
a RPN of 52-bit, and considers a 12-bit pageshift, so there should be
no need of using TCE_RPN_MASK, which masks out any bit after 40 in rpn.
It's usage removed from tce_build_pSeries(), tce_build_pSeriesLP(), and
tce_buildmulti_pSeriesLP().
Most places had a tbl struct, so using tbl->it_page_shift was simple.
tce_free_pSeriesLP() was a special case, since callers not always have a
tbl struct, so adding a tceshift parameter seems the right thing to do.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: Frederic Barrat <fbarrat@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210817063929.38701-2-leobras.c@gmail.com
2021-08-17 14:39:19 +08:00
|
|
|
tceshift,
|
2019-12-16 12:19:22 +08:00
|
|
|
npages, uaddr, direction, attrs);
|
2008-05-08 12:27:23 +08:00
|
|
|
}
|
powerpc: Replace __get_cpu_var uses
This still has not been merged and now powerpc is the only arch that does
not have this change. Sorry about missing linuxppc-dev before.
V2->V2
- Fix up to work against 3.18-rc1
__get_cpu_var() is used for multiple purposes in the kernel source. One of
them is address calculation via the form &__get_cpu_var(x). This calculates
the address for the instance of the percpu variable of the current processor
based on an offset.
Other use cases are for storing and retrieving data from the current
processors percpu area. __get_cpu_var() can be used as an lvalue when
writing data or on the right side of an assignment.
__get_cpu_var() is defined as :
__get_cpu_var() always only does an address determination. However, store
and retrieve operations could use a segment prefix (or global register on
other platforms) to avoid the address calculation.
this_cpu_write() and this_cpu_read() can directly take an offset into a
percpu area and use optimized assembly code to read and write per cpu
variables.
This patch converts __get_cpu_var into either an explicit address
calculation using this_cpu_ptr() or into a use of this_cpu operations that
use the offset. Thereby address calculations are avoided and less registers
are used when code is generated.
At the end of the patch set all uses of __get_cpu_var have been removed so
the macro is removed too.
The patch set includes passes over all arches as well. Once these operations
are used throughout then specialized macros can be defined in non -x86
arches as well in order to optimize per cpu access by f.e. using a global
register that may be set to the per cpu base.
Transformations done to __get_cpu_var()
1. Determine the address of the percpu instance of the current processor.
DEFINE_PER_CPU(int, y);
int *x = &__get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(&y);
2. Same as #1 but this time an array structure is involved.
DEFINE_PER_CPU(int, y[20]);
int *x = __get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(y);
3. Retrieve the content of the current processors instance of a per cpu
variable.
DEFINE_PER_CPU(int, y);
int x = __get_cpu_var(y)
Converts to
int x = __this_cpu_read(y);
4. Retrieve the content of a percpu struct
DEFINE_PER_CPU(struct mystruct, y);
struct mystruct x = __get_cpu_var(y);
Converts to
memcpy(&x, this_cpu_ptr(&y), sizeof(x));
5. Assignment to a per cpu variable
DEFINE_PER_CPU(int, y)
__get_cpu_var(y) = x;
Converts to
__this_cpu_write(y, x);
6. Increment/Decrement etc of a per cpu variable
DEFINE_PER_CPU(int, y);
__get_cpu_var(y)++
Converts to
__this_cpu_inc(y)
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
CC: Paul Mackerras <paulus@samba.org>
Signed-off-by: Christoph Lameter <cl@linux.com>
[mpe: Fix build errors caused by set/or_softirq_pending(), and rework
assignment in __set_breakpoint() to use memcpy().]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2014-10-22 04:23:25 +08:00
|
|
|
__this_cpu_write(tce_page, tcep);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
powerpc/pseries/iommu: Replace hard-coded page shift
Some functions assume IOMMU page size can only be 4K (pageshift == 12).
Update them to accept any page size passed, so we can use 64K pages.
In the process, some defines like TCE_SHIFT were made obsolete, and then
removed.
IODA3 Revision 3.0_prd1 (OpenPowerFoundation), Figures 3.4 and 3.5 show
a RPN of 52-bit, and considers a 12-bit pageshift, so there should be
no need of using TCE_RPN_MASK, which masks out any bit after 40 in rpn.
It's usage removed from tce_build_pSeries(), tce_build_pSeriesLP(), and
tce_buildmulti_pSeriesLP().
Most places had a tbl struct, so using tbl->it_page_shift was simple.
tce_free_pSeriesLP() was a special case, since callers not always have a
tbl struct, so adding a tceshift parameter seems the right thing to do.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: Frederic Barrat <fbarrat@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210817063929.38701-2-leobras.c@gmail.com
2021-08-17 14:39:19 +08:00
|
|
|
rpn = __pa(uaddr) >> tceshift;
|
2006-04-29 11:51:59 +08:00
|
|
|
proto_tce = TCE_PCI_READ;
|
2005-04-17 06:20:36 +08:00
|
|
|
if (direction != DMA_TO_DEVICE)
|
2006-04-29 11:51:59 +08:00
|
|
|
proto_tce |= TCE_PCI_WRITE;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* We can map max one pageful of TCEs at a time */
|
|
|
|
do {
|
|
|
|
/*
|
|
|
|
* Set up the page with TCE data, looping through and setting
|
|
|
|
* the values.
|
|
|
|
*/
|
2006-04-29 11:51:59 +08:00
|
|
|
limit = min_t(long, npages, 4096/TCE_ENTRY_SIZE);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
for (l = 0; l < limit; l++) {
|
powerpc/pseries/iommu: Replace hard-coded page shift
Some functions assume IOMMU page size can only be 4K (pageshift == 12).
Update them to accept any page size passed, so we can use 64K pages.
In the process, some defines like TCE_SHIFT were made obsolete, and then
removed.
IODA3 Revision 3.0_prd1 (OpenPowerFoundation), Figures 3.4 and 3.5 show
a RPN of 52-bit, and considers a 12-bit pageshift, so there should be
no need of using TCE_RPN_MASK, which masks out any bit after 40 in rpn.
It's usage removed from tce_build_pSeries(), tce_build_pSeriesLP(), and
tce_buildmulti_pSeriesLP().
Most places had a tbl struct, so using tbl->it_page_shift was simple.
tce_free_pSeriesLP() was a special case, since callers not always have a
tbl struct, so adding a tceshift parameter seems the right thing to do.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: Frederic Barrat <fbarrat@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210817063929.38701-2-leobras.c@gmail.com
2021-08-17 14:39:19 +08:00
|
|
|
tcep[l] = cpu_to_be64(proto_tce | rpn << tceshift);
|
2006-04-29 11:51:59 +08:00
|
|
|
rpn++;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
rc = plpar_tce_put_indirect((u64)tbl->it_index,
|
powerpc/pseries/iommu: Replace hard-coded page shift
Some functions assume IOMMU page size can only be 4K (pageshift == 12).
Update them to accept any page size passed, so we can use 64K pages.
In the process, some defines like TCE_SHIFT were made obsolete, and then
removed.
IODA3 Revision 3.0_prd1 (OpenPowerFoundation), Figures 3.4 and 3.5 show
a RPN of 52-bit, and considers a 12-bit pageshift, so there should be
no need of using TCE_RPN_MASK, which masks out any bit after 40 in rpn.
It's usage removed from tce_build_pSeries(), tce_build_pSeriesLP(), and
tce_buildmulti_pSeriesLP().
Most places had a tbl struct, so using tbl->it_page_shift was simple.
tce_free_pSeriesLP() was a special case, since callers not always have a
tbl struct, so adding a tceshift parameter seems the right thing to do.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: Frederic Barrat <fbarrat@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210817063929.38701-2-leobras.c@gmail.com
2021-08-17 14:39:19 +08:00
|
|
|
(u64)tcenum << tceshift,
|
2012-07-26 05:19:57 +08:00
|
|
|
(u64)__pa(tcep),
|
2005-04-17 06:20:36 +08:00
|
|
|
limit);
|
|
|
|
|
|
|
|
npages -= limit;
|
|
|
|
tcenum += limit;
|
|
|
|
} while (npages > 0 && !rc);
|
|
|
|
|
2012-06-04 03:42:13 +08:00
|
|
|
local_irq_restore(flags);
|
|
|
|
|
2008-07-24 02:31:16 +08:00
|
|
|
if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {
|
|
|
|
ret = (int)rc;
|
|
|
|
tce_freemulti_pSeriesLP(tbl, tcenum_start,
|
|
|
|
(npages_start - (npages + limit)));
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
if (rc && printk_ratelimit()) {
|
2009-01-06 22:26:03 +08:00
|
|
|
printk("tce_buildmulti_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc);
|
|
|
|
printk("\tindex = 0x%llx\n", (u64)tbl->it_index);
|
|
|
|
printk("\tnpages = 0x%llx\n", (u64)npages);
|
|
|
|
printk("\ttce[0] val = 0x%llx\n", tcep[0]);
|
2014-10-13 16:41:40 +08:00
|
|
|
dump_stack();
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2008-07-24 02:31:16 +08:00
|
|
|
return ret;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
powerpc/pseries/iommu: Replace hard-coded page shift
Some functions assume IOMMU page size can only be 4K (pageshift == 12).
Update them to accept any page size passed, so we can use 64K pages.
In the process, some defines like TCE_SHIFT were made obsolete, and then
removed.
IODA3 Revision 3.0_prd1 (OpenPowerFoundation), Figures 3.4 and 3.5 show
an RPN of 52 bits and assume a 12-bit page shift, so there should be
no need to use TCE_RPN_MASK, which masks out any bit after 40 in rpn.
Its usage was removed from tce_build_pSeries(), tce_build_pSeriesLP(), and
tce_buildmulti_pSeriesLP().
Most places had a tbl struct, so using tbl->it_page_shift was simple.
tce_free_pSeriesLP() was a special case, since callers not always have a
tbl struct, so adding a tceshift parameter seems the right thing to do.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: Frederic Barrat <fbarrat@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210817063929.38701-2-leobras.c@gmail.com
2021-08-17 14:39:19 +08:00
|
|
|
static void tce_free_pSeriesLP(unsigned long liobn, long tcenum, long tceshift,
|
|
|
|
long npages)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
u64 rc;
|
|
|
|
|
|
|
|
while (npages--) {
|
powerpc/pseries/iommu: Replace hard-coded page shift
Some functions assume IOMMU page size can only be 4K (pageshift == 12).
Update them to accept any page size passed, so we can use 64K pages.
In the process, some defines like TCE_SHIFT were made obsolete, and then
removed.
IODA3 Revision 3.0_prd1 (OpenPowerFoundation), Figures 3.4 and 3.5 show
a RPN of 52-bit, and considers a 12-bit pageshift, so there should be
no need of using TCE_RPN_MASK, which masks out any bit after 40 in rpn.
It's usage removed from tce_build_pSeries(), tce_build_pSeriesLP(), and
tce_buildmulti_pSeriesLP().
Most places had a tbl struct, so using tbl->it_page_shift was simple.
tce_free_pSeriesLP() was a special case, since callers not always have a
tbl struct, so adding a tceshift parameter seems the right thing to do.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: Frederic Barrat <fbarrat@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210817063929.38701-2-leobras.c@gmail.com
2021-08-17 14:39:19 +08:00
|
|
|
rc = plpar_tce_put((u64)liobn, (u64)tcenum << tceshift, 0);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
if (rc && printk_ratelimit()) {
|
2009-01-06 22:26:03 +08:00
|
|
|
printk("tce_free_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc);
|
2019-12-16 12:19:22 +08:00
|
|
|
printk("\tindex = 0x%llx\n", (u64)liobn);
|
2009-01-06 22:26:03 +08:00
|
|
|
printk("\ttcenum = 0x%llx\n", (u64)tcenum);
|
2014-10-13 16:41:40 +08:00
|
|
|
dump_stack();
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
tcenum++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
static void tce_freemulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages)
|
|
|
|
{
|
|
|
|
u64 rc;
|
|
|
|
|
2019-12-16 12:19:23 +08:00
|
|
|
if (!firmware_has_feature(FW_FEATURE_STUFF_TCE))
|
powerpc/pseries/iommu: Replace hard-coded page shift
Some functions assume IOMMU page size can only be 4K (pageshift == 12).
Update them to accept any page size passed, so we can use 64K pages.
In the process, some defines like TCE_SHIFT were made obsolete, and then
removed.
IODA3 Revision 3.0_prd1 (OpenPowerFoundation), Figures 3.4 and 3.5 show
a RPN of 52-bit, and considers a 12-bit pageshift, so there should be
no need of using TCE_RPN_MASK, which masks out any bit after 40 in rpn.
It's usage removed from tce_build_pSeries(), tce_build_pSeriesLP(), and
tce_buildmulti_pSeriesLP().
Most places had a tbl struct, so using tbl->it_page_shift was simple.
tce_free_pSeriesLP() was a special case, since callers not always have a
tbl struct, so adding a tceshift parameter seems the right thing to do.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: Frederic Barrat <fbarrat@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210817063929.38701-2-leobras.c@gmail.com
2021-08-17 14:39:19 +08:00
|
|
|
return tce_free_pSeriesLP(tbl->it_index, tcenum,
|
|
|
|
tbl->it_page_shift, npages);
|
2015-06-05 14:35:06 +08:00
|
|
|
|
powerpc/pseries/iommu: Replace hard-coded page shift
Some functions assume IOMMU page size can only be 4K (pageshift == 12).
Update them to accept any page size passed, so we can use 64K pages.
In the process, some defines like TCE_SHIFT were made obsolete, and then
removed.
IODA3 Revision 3.0_prd1 (OpenPowerFoundation), Figures 3.4 and 3.5 show
a RPN of 52-bit, and considers a 12-bit pageshift, so there should be
no need of using TCE_RPN_MASK, which masks out any bit after 40 in rpn.
It's usage removed from tce_build_pSeries(), tce_build_pSeriesLP(), and
tce_buildmulti_pSeriesLP().
Most places had a tbl struct, so using tbl->it_page_shift was simple.
tce_free_pSeriesLP() was a special case, since callers not always have a
tbl struct, so adding a tceshift parameter seems the right thing to do.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: Frederic Barrat <fbarrat@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210817063929.38701-2-leobras.c@gmail.com
2021-08-17 14:39:19 +08:00
|
|
|
rc = plpar_tce_stuff((u64)tbl->it_index,
|
|
|
|
(u64)tcenum << tbl->it_page_shift, 0, npages);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
if (rc && printk_ratelimit()) {
|
|
|
|
printk("tce_freemulti_pSeriesLP: plpar_tce_stuff failed\n");
|
2009-01-06 22:26:03 +08:00
|
|
|
printk("\trc = %lld\n", rc);
|
|
|
|
printk("\tindex = 0x%llx\n", (u64)tbl->it_index);
|
|
|
|
printk("\tnpages = 0x%llx\n", (u64)npages);
|
2014-10-13 16:41:40 +08:00
|
|
|
dump_stack();
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2006-06-23 14:35:10 +08:00
|
|
|
static unsigned long tce_get_pSeriesLP(struct iommu_table *tbl, long tcenum)
|
|
|
|
{
|
|
|
|
u64 rc;
|
|
|
|
unsigned long tce_ret;
|
|
|
|
|
powerpc/pseries/iommu: Replace hard-coded page shift
Some functions assume IOMMU page size can only be 4K (pageshift == 12).
Update them to accept any page size passed, so we can use 64K pages.
In the process, some defines like TCE_SHIFT were made obsolete, and then
removed.
IODA3 Revision 3.0_prd1 (OpenPowerFoundation), Figures 3.4 and 3.5 show
a RPN of 52-bit, and considers a 12-bit pageshift, so there should be
no need of using TCE_RPN_MASK, which masks out any bit after 40 in rpn.
It's usage removed from tce_build_pSeries(), tce_build_pSeriesLP(), and
tce_buildmulti_pSeriesLP().
Most places had a tbl struct, so using tbl->it_page_shift was simple.
tce_free_pSeriesLP() was a special case, since callers not always have a
tbl struct, so adding a tceshift parameter seems the right thing to do.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: Frederic Barrat <fbarrat@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210817063929.38701-2-leobras.c@gmail.com
2021-08-17 14:39:19 +08:00
|
|
|
rc = plpar_tce_get((u64)tbl->it_index,
|
|
|
|
(u64)tcenum << tbl->it_page_shift, &tce_ret);
|
2006-06-23 14:35:10 +08:00
|
|
|
|
|
|
|
if (rc && printk_ratelimit()) {
|
2009-01-06 22:26:03 +08:00
|
|
|
printk("tce_get_pSeriesLP: plpar_tce_get failed. rc=%lld\n", rc);
|
|
|
|
printk("\tindex = 0x%llx\n", (u64)tbl->it_index);
|
|
|
|
printk("\ttcenum = 0x%llx\n", (u64)tcenum);
|
2014-10-13 16:41:40 +08:00
|
|
|
dump_stack();
|
2006-06-23 14:35:10 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
return tce_ret;
|
|
|
|
}
|
|
|
|
|
2011-03-31 09:57:33 +08:00
|
|
|
/*
 * Description of one dynamic DMA window.
 *
 * This is compatible with cells for the device tree property: every field is
 * declared big-endian and has to go through be32_to_cpu()/be64_to_cpu()
 * before it is used as a CPU value.
 */
struct dynamic_dma_window_prop {
	__be32 liobn; /* tce table number */
	__be64 dma_base; /* address hi,lo */
	__be32 tce_shift; /* ilog2(tce_page_size) */
	__be32 window_shift; /* ilog2(tce_window_size) */
};
|
|
|
|
|
2021-08-17 14:39:29 +08:00
|
|
|
/*
 * Ties a device-tree node to the dynamic DMA window property describing its
 * window.  The embedded list_head lets instances be chained on a list
 * (presumably dma_win_list -- confirm against the users of this struct).
 */
struct dma_win {
	struct device_node *device; /* node the window belongs to */
	const struct dynamic_dma_window_prop *prop; /* window description (big-endian fields) */
	struct list_head list; /* list linkage */
};
|
|
|
|
|
|
|
|
/* Dynamic DMA Window support */

/*
 * CPU-endian description of the DDW resources available.
 * NOTE(review): presumably filled from the output of the
 * "ibm,query-pe-dma-windows" RTAS call -- the code doing so is outside this
 * section, confirm there.  Note that largest_available_block is 64-bit while
 * the remaining fields are 32-bit.
 */
struct ddw_query_response {
	u32 windows_available;
	u64 largest_available_block;
	u32 page_size;
	u32 migration_capable;
};
|
|
|
|
|
|
|
|
/*
 * CPU-endian result of creating a dynamic DMA window.
 * NOTE(review): addr_hi/addr_lo are presumably the upper and lower 32 bits
 * of the new window's base address -- confirm against the code that
 * consumes them.
 */
struct ddw_create_response {
	u32 liobn; /* tce table number of the new window */
	u32 addr_hi;
	u32 addr_lo;
};
|
|
|
|
|
2021-08-17 14:39:29 +08:00
|
|
|
/* List head for DMA windows (presumably of struct dma_win -- confirm). */
static LIST_HEAD(dma_win_list);
/* prevents races between memory on/offline and window creation */
static DEFINE_SPINLOCK(dma_win_list_lock);
/* protects initializing window twice for same device */
static DEFINE_MUTEX(dma_win_init_mutex);
/*
 * Device-tree property names that pci_dma_find() looks for, next to
 * "ibm,dma-window", when searching for a node that carries a DMA window.
 */
#define DIRECT64_PROPNAME "linux,direct64-ddr-window-info"
#define DMA64_PROPNAME "linux,dma64-ddr-window-info"
|
2011-02-10 17:10:47 +08:00
|
|
|
|
|
|
|
static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn,
|
|
|
|
unsigned long num_pfn, const void *arg)
|
|
|
|
{
|
|
|
|
const struct dynamic_dma_window_prop *maprange = arg;
|
|
|
|
int rc;
|
|
|
|
u64 tce_size, num_tce, dma_offset, next;
|
|
|
|
u32 tce_shift;
|
|
|
|
long limit;
|
|
|
|
|
|
|
|
tce_shift = be32_to_cpu(maprange->tce_shift);
|
|
|
|
tce_size = 1ULL << tce_shift;
|
|
|
|
next = start_pfn << PAGE_SHIFT;
|
|
|
|
num_tce = num_pfn << PAGE_SHIFT;
|
|
|
|
|
|
|
|
/* round back to the beginning of the tce page size */
|
|
|
|
num_tce += next & (tce_size - 1);
|
|
|
|
next &= ~(tce_size - 1);
|
|
|
|
|
|
|
|
/* covert to number of tces */
|
|
|
|
num_tce |= tce_size - 1;
|
|
|
|
num_tce >>= tce_shift;
|
|
|
|
|
|
|
|
do {
|
|
|
|
/*
|
|
|
|
* Set up the page with TCE data, looping through and setting
|
|
|
|
* the values.
|
|
|
|
*/
|
|
|
|
limit = min_t(long, num_tce, 512);
|
|
|
|
dma_offset = next + be64_to_cpu(maprange->dma_base);
|
|
|
|
|
|
|
|
rc = plpar_tce_stuff((u64)be32_to_cpu(maprange->liobn),
|
|
|
|
dma_offset,
|
|
|
|
0, limit);
|
2013-01-18 17:16:24 +08:00
|
|
|
next += limit * tce_size;
|
2011-02-10 17:10:47 +08:00
|
|
|
num_tce -= limit;
|
|
|
|
} while (num_tce > 0 && !rc);
|
|
|
|
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Create read/write TCEs in the window described by @arg (a struct
 * dynamic_dma_window_prop) for the @num_pfn page frames starting at
 * @start_pfn.
 *
 * Without FW_FEATURE_PUT_TCE_IND the work is handed to
 * tce_build_pSeriesLP().  Otherwise the TCEs are staged in a per-CPU page
 * and pushed with plpar_tce_put_indirect(), at most 4096/TCE_ENTRY_SIZE
 * entries per call, with local interrupts disabled for the duration.
 *
 * Returns 0 on success, -ENOMEM if the staging page cannot be allocated,
 * otherwise the status of the failing call.  On failure nothing is undone
 * here; the caller is expected to clear the whole range.
 */
static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn,
		unsigned long num_pfn, const void *arg)
{
	const struct dynamic_dma_window_prop *maprange = arg;
	u64 tce_size, num_tce, dma_offset, next, proto_tce, liobn;
	__be64 *tcep;
	u32 tce_shift;
	u64 rc = 0;
	long l, limit;

	if (!firmware_has_feature(FW_FEATURE_PUT_TCE_IND)) {
		/* No indirect put available: fall back to tce_build_pSeriesLP(). */
		unsigned long tceshift = be32_to_cpu(maprange->tce_shift);
		unsigned long dmastart = (start_pfn << PAGE_SHIFT) +
				be64_to_cpu(maprange->dma_base);
		unsigned long tcenum = dmastart >> tceshift;
		unsigned long npages = num_pfn << PAGE_SHIFT >> tceshift;
		void *uaddr = __va(start_pfn << PAGE_SHIFT);

		return tce_build_pSeriesLP(be32_to_cpu(maprange->liobn),
				tcenum, tceshift, npages, (unsigned long) uaddr,
				DMA_BIDIRECTIONAL, 0);
	}

	local_irq_disable(); /* to protect tcep and the page behind it */
	tcep = __this_cpu_read(tce_page);

	if (!tcep) {
		/* First use on this CPU: allocate the page and cache it in tce_page. */
		tcep = (__be64 *)__get_free_page(GFP_ATOMIC);
		if (!tcep) {
			local_irq_enable();
			return -ENOMEM;
		}
		__this_cpu_write(tce_page, tcep);
	}

	proto_tce = TCE_PCI_READ | TCE_PCI_WRITE;

	liobn = (u64)be32_to_cpu(maprange->liobn);
	tce_shift = be32_to_cpu(maprange->tce_shift);
	tce_size = 1ULL << tce_shift;
	next = start_pfn << PAGE_SHIFT;
	num_tce = num_pfn << PAGE_SHIFT;

	/* round back to the beginning of the tce page size */
	num_tce += next & (tce_size - 1);
	next &= ~(tce_size - 1);

	/*
	 * convert to number of tces
	 *
	 * NOTE(review): the OR only sets bits that the shift below discards,
	 * so a trailing partial TCE page is not counted -- confirm whether
	 * rounding up was intended.
	 */
	num_tce |= tce_size - 1;
	num_tce >>= tce_shift;

	/* We can map max one pageful of TCEs at a time */
	do {
		/*
		 * Set up the page with TCE data, looping through and setting
		 * the values.
		 */
		limit = min_t(long, num_tce, 4096/TCE_ENTRY_SIZE);
		/* Window offset of the first entry of this batch. */
		dma_offset = next + be64_to_cpu(maprange->dma_base);

		for (l = 0; l < limit; l++) {
			tcep[l] = cpu_to_be64(proto_tce | next);
			next += tce_size;
		}

		rc = plpar_tce_put_indirect(liobn,
					    dma_offset,
					    (u64)__pa(tcep),
					    limit);

		num_tce -= limit;
	} while (num_tce > 0 && !rc);

	/* error cleanup: caller will clear whole range */

	local_irq_enable();
	/* rc is a u64 here while the function returns int. */
	return rc;
}
|
|
|
|
|
|
|
|
/*
 * Adapter around tce_setrange_multi_pSeriesLP() that takes a non-const @arg
 * and forwards all three arguments unchanged.
 * NOTE(review): presumably exists to match a memory-range walker callback
 * signature -- confirm at the call site.
 */
static int tce_setrange_multi_pSeriesLP_walk(unsigned long start_pfn,
		unsigned long num_pfn, void *arg)
{
	return tce_setrange_multi_pSeriesLP(start_pfn, num_pfn, arg);
}
|
|
|
|
|
2021-08-17 14:39:25 +08:00
|
|
|
static void iommu_table_setparms_common(struct iommu_table *tbl, unsigned long busno,
|
|
|
|
unsigned long liobn, unsigned long win_addr,
|
|
|
|
unsigned long window_size, unsigned long page_shift,
|
|
|
|
void *base, struct iommu_table_ops *table_ops)
|
|
|
|
{
|
|
|
|
tbl->it_busno = busno;
|
|
|
|
tbl->it_index = liobn;
|
|
|
|
tbl->it_offset = win_addr >> page_shift;
|
|
|
|
tbl->it_size = window_size >> page_shift;
|
|
|
|
tbl->it_page_shift = page_shift;
|
|
|
|
tbl->it_base = (unsigned long)base;
|
|
|
|
tbl->it_blocksize = 16;
|
|
|
|
tbl->it_type = TCE_PCI;
|
|
|
|
tbl->it_ops = table_ops;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Tentative definition: the initialised table follows later in this file. */
struct iommu_table_ops iommu_table_pseries_ops;
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
static void iommu_table_setparms(struct pci_controller *phb,
|
|
|
|
struct device_node *dn,
|
2006-04-29 11:51:59 +08:00
|
|
|
struct iommu_table *tbl)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
struct device_node *node;
|
2016-07-08 14:37:10 +08:00
|
|
|
const unsigned long *basep;
|
2006-10-05 11:28:00 +08:00
|
|
|
const u32 *sizep;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2021-08-17 14:39:25 +08:00
|
|
|
/* Test if we are going over 2GB of DMA space */
|
|
|
|
if (phb->dma_window_base_cur + phb->dma_window_size > SZ_2G) {
|
|
|
|
udbg_printf("PCI_DMA: Unexpected number of IOAs under this PHB.\n");
|
|
|
|
panic("PCI_DMA: Unexpected number of IOAs under this PHB.\n");
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2021-08-17 14:39:25 +08:00
|
|
|
node = phb->dn;
|
2007-04-03 20:26:41 +08:00
|
|
|
basep = of_get_property(node, "linux,tce-base", NULL);
|
|
|
|
sizep = of_get_property(node, "linux,tce-size", NULL);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (basep == NULL || sizep == NULL) {
|
2017-08-21 23:16:47 +08:00
|
|
|
printk(KERN_ERR "PCI_DMA: iommu_table_setparms: %pOF has "
|
|
|
|
"missing tce entries !\n", dn);
|
2005-04-17 06:20:36 +08:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2021-08-17 14:39:25 +08:00
|
|
|
iommu_table_setparms_common(tbl, phb->bus->number, 0, phb->dma_window_base_cur,
|
|
|
|
phb->dma_window_size, IOMMU_PAGE_SHIFT_4K,
|
|
|
|
__va(*basep), &iommu_table_pseries_ops);
|
2006-06-23 14:35:10 +08:00
|
|
|
|
2008-10-23 04:39:04 +08:00
|
|
|
if (!is_kdump_kernel())
|
2008-10-22 01:38:10 +08:00
|
|
|
memset((void *)tbl->it_base, 0, *sizep);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
phb->dma_window_base_cur += phb->dma_window_size;
|
|
|
|
}
|
|
|
|
|
2021-08-17 14:39:25 +08:00
|
|
|
/* Tentative definition: the initialised table follows later in this file. */
struct iommu_table_ops iommu_table_lpar_multi_ops;
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* iommu_table_setparms_lpar
|
|
|
|
*
|
|
|
|
* Function: On pSeries LPAR systems, return TCE table info, given a pci bus.
|
|
|
|
*/
|
|
|
|
static void iommu_table_setparms_lpar(struct pci_controller *phb,
|
|
|
|
struct device_node *dn,
|
|
|
|
struct iommu_table *tbl,
|
2017-03-24 14:37:21 +08:00
|
|
|
struct iommu_table_group *table_group,
|
2013-08-07 00:01:36 +08:00
|
|
|
const __be32 *dma_window)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2021-08-17 14:39:25 +08:00
|
|
|
unsigned long offset, size, liobn;
|
2006-05-18 16:06:37 +08:00
|
|
|
|
2021-08-17 14:39:25 +08:00
|
|
|
of_parse_dma_window(dn, dma_window, &liobn, &offset, &size);
|
|
|
|
|
|
|
|
iommu_table_setparms_common(tbl, phb->bus->number, liobn, offset, size, IOMMU_PAGE_SHIFT_4K, NULL,
|
|
|
|
&iommu_table_lpar_multi_ops);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2017-03-24 14:37:21 +08:00
|
|
|
|
|
|
|
table_group->tce32_start = offset;
|
|
|
|
table_group->tce32_size = size;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2015-06-05 14:35:06 +08:00
|
|
|
/* Callbacks installed on tables set up by iommu_table_setparms(). */
struct iommu_table_ops iommu_table_pseries_ops = {
	.set = tce_build_pSeries,
	.clear = tce_free_pSeries,
	.get = tce_get_pseries
};
|
|
|
|
|
2006-11-11 14:25:02 +08:00
|
|
|
/*
 * Choose the per-slot DMA window size for the PHB behind root bus @bus.
 *
 * Non-root buses are ignored.  If no "isa" node sits below this PHB the
 * 2GB space is split into power-of-two windows, one per direct child.
 * Otherwise a 128MB table starting at 128MB is set up for the PHB itself
 * and the per-child window is the largest power of two such that all
 * children fit in 1.75GB.
 */
static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
{
	struct device_node *dn;
	struct iommu_table *tbl;
	struct device_node *isa_dn, *isa_dn_orig;
	struct device_node *tmp;
	struct pci_dn *pci;
	int children;

	dn = pci_bus_to_OF_node(bus);

	pr_debug("pci_dma_bus_setup_pSeries: setting up bus %pOF\n", dn);

	if (bus->self) {
		/* This is not a root bus, any setup will be done for the
		 * device-side of the bridge in iommu_dev_setup_pSeries().
		 *
		 * NOTE(review): no iommu_dev_setup_pSeries() is visible in
		 * this section; pci_dma_dev_setup_pSeries() is probably
		 * meant -- confirm.
		 */
		return;
	}
	pci = PCI_DN(dn);

	/* Check if the ISA bus on the system is under
	 * this PHB.
	 */
	isa_dn = isa_dn_orig = of_find_node_by_type(NULL, "isa");

	/* Walk up from the ISA node; isa_dn ends as dn if found, else NULL. */
	while (isa_dn && isa_dn != dn)
		isa_dn = isa_dn->parent;

	/* From here on isa_dn is only tested for NULL, never dereferenced. */
	of_node_put(isa_dn_orig);

	/* Count number of direct PCI children of the PHB. */
	for (children = 0, tmp = dn->child; tmp; tmp = tmp->sibling)
		children++;

	pr_debug("Children: %d\n", children);

	/* Calculate amount of DMA window per slot. Each window must be
	 * a power of two (due to pci_alloc_consistent requirements).
	 *
	 * Keep 256MB aside for PHBs with ISA.
	 */

	if (!isa_dn) {
		/* No ISA/IDE - just set window size and return */
		pci->phb->dma_window_size = 0x80000000ul; /* To be divided */

		while (pci->phb->dma_window_size * children > 0x80000000ul)
			pci->phb->dma_window_size >>= 1;
		pr_debug("No ISA/IDE, window size is 0x%llx\n",
			 pci->phb->dma_window_size);
		pci->phb->dma_window_base_cur = 0;

		return;
	}

	/* If we have ISA, then we probably have an IDE
	 * controller too. Allocate a 128MB table but
	 * skip the first 128MB to avoid stepping on ISA
	 * space.
	 */
	pci->phb->dma_window_size = 0x8000000ul;
	pci->phb->dma_window_base_cur = 0x8000000ul;

	/*
	 * NOTE(review): the result of iommu_pseries_alloc_group() is
	 * dereferenced without a check -- confirm it cannot return NULL here.
	 */
	pci->table_group = iommu_pseries_alloc_group(pci->phb->node);
	tbl = pci->table_group->tables[0];

	/* Uses the 128MB size/base set above and advances dma_window_base_cur. */
	iommu_table_setparms(pci->phb, dn, tbl);

	if (!iommu_init_table(tbl, pci->phb->node, 0, 0))
		panic("Failed to initialize iommu table");

	/* Divide the rest (1.75GB) among the children */
	pci->phb->dma_window_size = 0x80000000ul;
	while (pci->phb->dma_window_size * children > 0x70000000ul)
		pci->phb->dma_window_size >>= 1;

	pr_debug("ISA/IDE, window size is 0x%llx\n", pci->phb->dma_window_size);
}
|
|
|
|
|
2017-03-24 14:37:21 +08:00
|
|
|
#ifdef CONFIG_IOMMU_API
/*
 * Replace entry @index of @tbl and hand the previous contents back.
 *
 * On entry *@tce and *@direction describe the new entry.  The old entry is
 * read with plpar_tce_get() and the new one written with plpar_tce_put(),
 * both while holding the table's large_pool lock with interrupts saved.
 * If both calls succeed, *@direction is set from the old entry and *@tce
 * to the old entry with the read/write permission bits cleared.
 *
 * Returns 0 on success, otherwise the status of the failing call, in which
 * case *@tce and *@direction are left unmodified.
 */
static int tce_exchange_pseries(struct iommu_table *tbl, long index,
				unsigned long *tce,
				enum dma_data_direction *direction)
{
	unsigned long ioba = (unsigned long) index << tbl->it_page_shift;
	u64 proto_tce = iommu_direction_to_tce_perm(*direction);
	unsigned long newtce = *tce | proto_tce;
	unsigned long oldtce = 0;
	unsigned long flags;
	long rc;

	spin_lock_irqsave(&tbl->large_pool.lock, flags);

	rc = plpar_tce_get((u64)tbl->it_index, ioba, &oldtce);
	if (rc)
		goto out_unlock;

	rc = plpar_tce_put((u64)tbl->it_index, ioba, newtce);
	if (rc)
		goto out_unlock;

	*direction = iommu_tce_direction(oldtce);
	*tce = oldtce & ~(TCE_PCI_READ | TCE_PCI_WRITE);

out_unlock:
	spin_unlock_irqrestore(&tbl->large_pool.lock, flags);

	return rc;
}
#endif
|
|
|
|
|
2015-06-05 14:35:06 +08:00
|
|
|
/*
 * Callbacks installed on tables set up by iommu_table_setparms_lpar().
 * The exchange callback is only provided when CONFIG_IOMMU_API is enabled.
 */
struct iommu_table_ops iommu_table_lpar_multi_ops = {
	.set = tce_buildmulti_pSeriesLP,
#ifdef CONFIG_IOMMU_API
	.xchg_no_kill = tce_exchange_pseries,
#endif
	.clear = tce_freemulti_pSeriesLP,
	.get = tce_get_pSeriesLP
};
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2022-06-29 14:06:14 +08:00
|
|
|
/*
|
|
|
|
* Find nearest ibm,dma-window (default DMA window) or direct DMA window or
|
|
|
|
* dynamic 64bit DMA window, walking up the device tree.
|
|
|
|
*/
|
|
|
|
static struct device_node *pci_dma_find(struct device_node *dn,
|
|
|
|
const __be32 **dma_window)
|
|
|
|
{
|
|
|
|
const __be32 *dw = NULL;
|
|
|
|
|
|
|
|
for ( ; dn && PCI_DN(dn); dn = dn->parent) {
|
|
|
|
dw = of_get_property(dn, "ibm,dma-window", NULL);
|
|
|
|
if (dw) {
|
|
|
|
if (dma_window)
|
|
|
|
*dma_window = dw;
|
|
|
|
return dn;
|
|
|
|
}
|
|
|
|
dw = of_get_property(dn, DIRECT64_PROPNAME, NULL);
|
|
|
|
if (dw)
|
|
|
|
return dn;
|
|
|
|
dw = of_get_property(dn, DMA64_PROPNAME, NULL);
|
|
|
|
if (dw)
|
|
|
|
return dn;
|
|
|
|
}
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2006-11-11 14:25:02 +08:00
|
|
|
static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
struct iommu_table *tbl;
|
|
|
|
struct device_node *dn, *pdn;
|
2005-09-06 11:17:54 +08:00
|
|
|
struct pci_dn *ppci;
|
2013-08-07 00:01:36 +08:00
|
|
|
const __be32 *dma_window = NULL;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
dn = pci_bus_to_OF_node(bus);
|
|
|
|
|
2017-08-21 23:16:47 +08:00
|
|
|
pr_debug("pci_dma_bus_setup_pSeriesLP: setting up bus %pOF\n",
|
|
|
|
dn);
|
2006-11-11 14:25:02 +08:00
|
|
|
|
2022-06-29 14:06:14 +08:00
|
|
|
pdn = pci_dma_find(dn, &dma_window);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2022-06-29 14:06:14 +08:00
|
|
|
if (dma_window == NULL)
|
2008-04-24 13:13:19 +08:00
|
|
|
pr_debug(" no ibm,dma-window property !\n");
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2005-12-06 09:37:35 +08:00
|
|
|
ppci = PCI_DN(pdn);
|
2006-11-11 14:25:02 +08:00
|
|
|
|
2017-08-21 23:16:47 +08:00
|
|
|
pr_debug(" parent is %pOF, iommu_table: 0x%p\n",
|
|
|
|
pdn, ppci->table_group);
|
2006-11-11 14:25:02 +08:00
|
|
|
|
2015-06-05 14:35:08 +08:00
|
|
|
if (!ppci->table_group) {
|
|
|
|
ppci->table_group = iommu_pseries_alloc_group(ppci->phb->node);
|
|
|
|
tbl = ppci->table_group->tables[0];
|
2022-06-29 14:06:14 +08:00
|
|
|
if (dma_window) {
|
|
|
|
iommu_table_setparms_lpar(ppci->phb, pdn, tbl,
|
|
|
|
ppci->table_group, dma_window);
|
2021-08-17 14:39:25 +08:00
|
|
|
|
2022-06-29 14:06:14 +08:00
|
|
|
if (!iommu_init_table(tbl, ppci->phb->node, 0, 0))
|
|
|
|
panic("Failed to initialize iommu table");
|
|
|
|
}
|
2015-06-05 14:35:08 +08:00
|
|
|
iommu_register_group(ppci->table_group,
|
|
|
|
pci_domain_nr(bus), 0);
|
|
|
|
pr_debug(" created table: %p\n", ppci->table_group);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2006-11-11 14:25:02 +08:00
|
|
|
static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2006-11-11 14:25:02 +08:00
|
|
|
struct device_node *dn;
|
2005-09-22 00:55:31 +08:00
|
|
|
struct iommu_table *tbl;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-04-24 13:13:19 +08:00
|
|
|
pr_debug("pci_dma_dev_setup_pSeries: %s\n", pci_name(dev));
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2010-04-14 07:12:56 +08:00
|
|
|
dn = dev->dev.of_node;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2005-09-22 00:55:31 +08:00
|
|
|
/* If we're the direct child of a root bus, then we need to allocate
|
|
|
|
* an iommu table ourselves. The bus setup code should have setup
|
|
|
|
* the window sizes already.
|
|
|
|
*/
|
|
|
|
if (!dev->bus->self) {
|
2006-11-11 14:25:02 +08:00
|
|
|
struct pci_controller *phb = PCI_DN(dn)->phb;
|
|
|
|
|
2008-04-24 13:13:19 +08:00
|
|
|
pr_debug(" --> first child, no bridge. Allocating iommu table.\n");
|
2015-06-05 14:35:08 +08:00
|
|
|
PCI_DN(dn)->table_group = iommu_pseries_alloc_group(phb->node);
|
|
|
|
tbl = PCI_DN(dn)->table_group->tables[0];
|
2006-11-11 14:25:02 +08:00
|
|
|
iommu_table_setparms(phb, dn, tbl);
|
2021-08-17 14:39:25 +08:00
|
|
|
|
2021-02-16 11:33:07 +08:00
|
|
|
if (!iommu_init_table(tbl, phb->node, 0, 0))
|
|
|
|
panic("Failed to initialize iommu table");
|
|
|
|
|
2015-06-05 14:34:54 +08:00
|
|
|
set_iommu_table_base(&dev->dev, tbl);
|
2005-09-22 00:55:31 +08:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* If this device is further down the bus tree, search upwards until
|
|
|
|
* an already allocated iommu table is found and use that.
|
|
|
|
*/
|
|
|
|
|
2015-06-05 14:35:08 +08:00
|
|
|
while (dn && PCI_DN(dn) && PCI_DN(dn)->table_group == NULL)
|
2005-04-17 06:20:36 +08:00
|
|
|
dn = dn->parent;
|
|
|
|
|
2018-12-19 16:52:20 +08:00
|
|
|
if (dn && PCI_DN(dn))
|
2015-06-05 14:35:08 +08:00
|
|
|
set_iommu_table_base(&dev->dev,
|
|
|
|
PCI_DN(dn)->table_group->tables[0]);
|
2018-12-19 16:52:20 +08:00
|
|
|
else
|
2006-11-11 14:25:02 +08:00
|
|
|
printk(KERN_WARNING "iommu: Device %s has no iommu table\n",
|
|
|
|
pci_name(dev));
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2011-02-10 17:10:47 +08:00
|
|
|
static int __read_mostly disable_ddw;
|
|
|
|
|
|
|
|
static int __init disable_ddw_setup(char *str)
|
|
|
|
{
|
|
|
|
disable_ddw = 1;
|
|
|
|
printk(KERN_INFO "ppc iommu: disabling ddw.\n");
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
early_param("disable_ddw", disable_ddw_setup);
|
|
|
|
|
powerpc/pseries/iommu: Add ddw_property_create() and refactor enable_ddw()
Code used to create a ddw property that was previously scattered in
enable_ddw() is now gathered in ddw_property_create(), which deals with
allocation and filling the property, letting it ready for
of_property_add(), which now occurs in sequence.
This created an opportunity to reorganize the second part of enable_ddw():
Without this patch enable_ddw() does, in order:
kzalloc() property & members, create_ddw(), fill ddwprop inside property,
ddw_list_new_entry(), do tce_setrange_multi_pSeriesLP_walk in all memory,
of_add_property(), and list_add().
With this patch enable_ddw() does, in order:
create_ddw(), ddw_property_create(), of_add_property(),
ddw_list_new_entry(), do tce_setrange_multi_pSeriesLP_walk in all memory,
and list_add().
This change requires of_remove_property() in case anything fails after
of_add_property(), but we get to do tce_setrange_multi_pSeriesLP_walk
in all memory, which looks the most expensive operation, only if
everything else succeeds.
Also, the error path got remove_ddw() replaced by a new helper
__remove_dma_window(), which only removes the new DDW with an rtas-call.
For this, a new helper clean_dma_window() was needed to clean anything
that could left if walk_system_ram_range() fails.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210817063929.38701-7-leobras.c@gmail.com
2021-08-17 14:39:24 +08:00
|
|
|
static void clean_dma_window(struct device_node *np, struct dynamic_dma_window_prop *dwp)
|
2011-02-10 17:10:47 +08:00
|
|
|
{
|
2020-08-05 11:04:54 +08:00
|
|
|
int ret;
|
2014-09-25 14:39:18 +08:00
|
|
|
|
2011-02-10 17:10:47 +08:00
|
|
|
ret = tce_clearrange_multi_pSeriesLP(0,
|
|
|
|
1ULL << (be32_to_cpu(dwp->window_shift) - PAGE_SHIFT), dwp);
|
|
|
|
if (ret)
|
2016-10-25 12:00:08 +08:00
|
|
|
pr_warn("%pOF failed to clear tces in window.\n",
|
|
|
|
np);
|
2011-02-10 17:10:47 +08:00
|
|
|
else
|
2017-08-21 23:16:47 +08:00
|
|
|
pr_debug("%pOF successfully cleared tces in window.\n",
|
|
|
|
np);
|
powerpc/pseries/iommu: Add ddw_property_create() and refactor enable_ddw()
Code used to create a ddw property that was previously scattered in
enable_ddw() is now gathered in ddw_property_create(), which deals with
allocation and filling the property, letting it ready for
of_property_add(), which now occurs in sequence.
This created an opportunity to reorganize the second part of enable_ddw():
Without this patch enable_ddw() does, in order:
kzalloc() property & members, create_ddw(), fill ddwprop inside property,
ddw_list_new_entry(), do tce_setrange_multi_pSeriesLP_walk in all memory,
of_add_property(), and list_add().
With this patch enable_ddw() does, in order:
create_ddw(), ddw_property_create(), of_add_property(),
ddw_list_new_entry(), do tce_setrange_multi_pSeriesLP_walk in all memory,
and list_add().
This change requires of_remove_property() in case anything fails after
of_add_property(), but we get to do tce_setrange_multi_pSeriesLP_walk
in all memory, which looks the most expensive operation, only if
everything else succeeds.
Also, the error path got remove_ddw() replaced by a new helper
__remove_dma_window(), which only removes the new DDW with an rtas-call.
For this, a new helper clean_dma_window() was needed to clean anything
that could left if walk_system_ram_range() fails.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210817063929.38701-7-leobras.c@gmail.com
2021-08-17 14:39:24 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Call only if DMA window is clean.
|
|
|
|
*/
|
|
|
|
static void __remove_dma_window(struct device_node *np, u32 *ddw_avail, u64 liobn)
|
|
|
|
{
|
|
|
|
int ret;
|
2011-02-10 17:10:47 +08:00
|
|
|
|
2020-08-05 11:04:52 +08:00
|
|
|
ret = rtas_call(ddw_avail[DDW_REMOVE_PE_DMA_WIN], 1, 1, NULL, liobn);
|
2014-01-11 07:09:38 +08:00
|
|
|
if (ret)
|
powerpc/pseries/iommu: Add ddw_property_create() and refactor enable_ddw()
Code used to create a ddw property that was previously scattered in
enable_ddw() is now gathered in ddw_property_create(), which deals with
allocation and filling the property, letting it ready for
of_property_add(), which now occurs in sequence.
This created an opportunity to reorganize the second part of enable_ddw():
Without this patch enable_ddw() does, in order:
kzalloc() property & members, create_ddw(), fill ddwprop inside property,
ddw_list_new_entry(), do tce_setrange_multi_pSeriesLP_walk in all memory,
of_add_property(), and list_add().
With this patch enable_ddw() does, in order:
create_ddw(), ddw_property_create(), of_add_property(),
ddw_list_new_entry(), do tce_setrange_multi_pSeriesLP_walk in all memory,
and list_add().
This change requires of_remove_property() in case anything fails after
of_add_property(), but we get to do tce_setrange_multi_pSeriesLP_walk
in all memory, which looks the most expensive operation, only if
everything else succeeds.
Also, the error path got remove_ddw() replaced by a new helper
__remove_dma_window(), which only removes the new DDW with an rtas-call.
For this, a new helper clean_dma_window() was needed to clean anything
that could left if walk_system_ram_range() fails.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210817063929.38701-7-leobras.c@gmail.com
2021-08-17 14:39:24 +08:00
|
|
|
pr_warn("%pOF: failed to remove DMA window: rtas returned "
|
2014-01-11 07:09:38 +08:00
|
|
|
"%d to ibm,remove-pe-dma-window(%x) %llx\n",
|
2020-08-05 11:04:52 +08:00
|
|
|
np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
|
2014-01-11 07:09:38 +08:00
|
|
|
else
|
powerpc/pseries/iommu: Add ddw_property_create() and refactor enable_ddw()
Code used to create a ddw property that was previously scattered in
enable_ddw() is now gathered in ddw_property_create(), which deals with
allocation and filling the property, letting it ready for
of_property_add(), which now occurs in sequence.
This created an opportunity to reorganize the second part of enable_ddw():
Without this patch enable_ddw() does, in order:
kzalloc() property & members, create_ddw(), fill ddwprop inside property,
ddw_list_new_entry(), do tce_setrange_multi_pSeriesLP_walk in all memory,
of_add_property(), and list_add().
With this patch enable_ddw() does, in order:
create_ddw(), ddw_property_create(), of_add_property(),
ddw_list_new_entry(), do tce_setrange_multi_pSeriesLP_walk in all memory,
and list_add().
This change requires of_remove_property() in case anything fails after
of_add_property(), but we get to do tce_setrange_multi_pSeriesLP_walk
in all memory, which looks the most expensive operation, only if
everything else succeeds.
Also, the error path got remove_ddw() replaced by a new helper
__remove_dma_window(), which only removes the new DDW with an rtas-call.
For this, a new helper clean_dma_window() was needed to clean anything
that could left if walk_system_ram_range() fails.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210817063929.38701-7-leobras.c@gmail.com
2021-08-17 14:39:24 +08:00
|
|
|
pr_debug("%pOF: successfully removed DMA window: rtas returned "
|
2014-01-11 07:09:38 +08:00
|
|
|
"%d to ibm,remove-pe-dma-window(%x) %llx\n",
|
2020-08-05 11:04:52 +08:00
|
|
|
np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
|
2020-08-05 11:04:54 +08:00
|
|
|
}
|
|
|
|
|
powerpc/pseries/iommu: Add ddw_property_create() and refactor enable_ddw()
Code used to create a ddw property that was previously scattered in
enable_ddw() is now gathered in ddw_property_create(), which deals with
allocation and filling the property, letting it ready for
of_property_add(), which now occurs in sequence.
This created an opportunity to reorganize the second part of enable_ddw():
Without this patch enable_ddw() does, in order:
kzalloc() property & members, create_ddw(), fill ddwprop inside property,
ddw_list_new_entry(), do tce_setrange_multi_pSeriesLP_walk in all memory,
of_add_property(), and list_add().
With this patch enable_ddw() does, in order:
create_ddw(), ddw_property_create(), of_add_property(),
ddw_list_new_entry(), do tce_setrange_multi_pSeriesLP_walk in all memory,
and list_add().
This change requires of_remove_property() in case anything fails after
of_add_property(), but we get to do tce_setrange_multi_pSeriesLP_walk
in all memory, which looks the most expensive operation, only if
everything else succeeds.
Also, the error path got remove_ddw() replaced by a new helper
__remove_dma_window(), which only removes the new DDW with an rtas-call.
For this, a new helper clean_dma_window() was needed to clean anything
that could left if walk_system_ram_range() fails.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210817063929.38701-7-leobras.c@gmail.com
2021-08-17 14:39:24 +08:00
|
|
|
/*
 * Tear down the window described by the property @win of node @np: first
 * clear its TCEs, then have the window itself removed using the LIOBN stored
 * in the property value.
 */
static void remove_dma_window(struct device_node *np, u32 *ddw_avail,
			      struct property *win)
{
	struct dynamic_dma_window_prop *prop = win->value;
	u64 liobn = (u64)be32_to_cpu(prop->liobn);

	clean_dma_window(np, prop);
	__remove_dma_window(np, ddw_avail, liobn);
}
|
|
|
|
|
2021-08-17 14:39:26 +08:00
|
|
|
/*
 * Remove the dynamic DMA window recorded in property @win_name of @np.
 *
 * @np:          node holding the window property.
 * @remove_prop: when true, the property itself is removed afterwards.
 * @win_name:    name of the window property.
 *
 * The window is only torn down when the property is at least as large as
 * struct dynamic_dma_window_prop. A failure to remove the property is logged
 * but not reported to the caller.
 *
 * Returns -EINVAL if @np has no property called @win_name, 0 in every other
 * case (including when "ibm,ddw-applicable" cannot be read).
 */
static int remove_ddw(struct device_node *np, bool remove_prop, const char *win_name)
{
	u32 ddw_avail[DDW_APPLICABLE_SIZE];
	struct property *prop;
	int rc;

	prop = of_find_property(np, win_name, NULL);
	if (!prop)
		return -EINVAL;

	rc = of_property_read_u32_array(np, "ibm,ddw-applicable",
					&ddw_avail[0], DDW_APPLICABLE_SIZE);
	if (rc)
		return 0;

	if (prop->length >= sizeof(struct dynamic_dma_window_prop))
		remove_dma_window(np, ddw_avail, prop);

	if (remove_prop) {
		rc = of_remove_property(np, prop);
		if (rc)
			pr_warn("%pOF: failed to remove DMA window property: %d\n",
				np, rc);
	}

	return 0;
}
|
2011-02-10 17:10:47 +08:00
|
|
|
|
2021-08-17 14:39:23 +08:00
|
|
|
/*
 * Look @pdn up in dma_win_list and, if a window was already created for it,
 * copy that window's configuration out.
 *
 * @pdn:          device node to search for.
 * @dma_addr:     receives the window's DMA base on a match.
 * @window_shift: receives the window's shift on a match.
 *
 * The list is walked under dma_win_list_lock. The output parameters are left
 * untouched when nothing matches.
 *
 * Returns true if an entry for @pdn was found.
 */
static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr, int *window_shift)
{
	const struct dynamic_dma_window_prop *prop;
	struct dma_win *entry;
	bool hit = false;

	spin_lock(&dma_win_list_lock);
	list_for_each_entry(entry, &dma_win_list, list) {
		if (entry->device != pdn)
			continue;

		/* Already created a window for this node: reuse its config. */
		prop = entry->prop;
		*dma_addr = be64_to_cpu(prop->dma_base);
		*window_shift = be32_to_cpu(prop->window_shift);
		hit = true;
		break;
	}
	spin_unlock(&dma_win_list_lock);

	return hit;
}
|
|
|
|
|
2021-08-17 14:39:29 +08:00
|
|
|
static struct dma_win *ddw_list_new_entry(struct device_node *pdn,
|
|
|
|
const struct dynamic_dma_window_prop *dma64)
|
2021-08-17 14:39:22 +08:00
|
|
|
{
|
2021-08-17 14:39:29 +08:00
|
|
|
struct dma_win *window;
|
2021-08-17 14:39:22 +08:00
|
|
|
|
|
|
|
window = kzalloc(sizeof(*window), GFP_KERNEL);
|
|
|
|
if (!window)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
window->device = pdn;
|
|
|
|
window->prop = dma64;
|
|
|
|
|
|
|
|
return window;
|
|
|
|
}
|
|
|
|
|
2021-08-17 14:39:27 +08:00
|
|
|
static void find_existing_ddw_windows_named(const char *name)
|
2011-02-10 17:10:47 +08:00
|
|
|
{
|
2014-01-11 07:10:41 +08:00
|
|
|
int len;
|
2011-05-11 20:24:59 +08:00
|
|
|
struct device_node *pdn;
|
2021-08-17 14:39:29 +08:00
|
|
|
struct dma_win *window;
|
2021-08-17 14:39:27 +08:00
|
|
|
const struct dynamic_dma_window_prop *dma64;
|
2011-05-11 20:24:59 +08:00
|
|
|
|
2021-08-17 14:39:27 +08:00
|
|
|
for_each_node_with_property(pdn, name) {
|
|
|
|
dma64 = of_get_property(pdn, name, &len);
|
|
|
|
if (!dma64 || len < sizeof(*dma64)) {
|
|
|
|
remove_ddw(pdn, true, name);
|
2014-01-11 07:10:41 +08:00
|
|
|
continue;
|
|
|
|
}
|
2011-05-11 20:24:59 +08:00
|
|
|
|
2021-08-17 14:39:27 +08:00
|
|
|
window = ddw_list_new_entry(pdn, dma64);
|
2021-10-14 15:56:04 +08:00
|
|
|
if (!window) {
|
|
|
|
of_node_put(pdn);
|
2021-08-17 14:39:22 +08:00
|
|
|
break;
|
2021-10-14 15:56:04 +08:00
|
|
|
}
|
2021-08-17 14:39:22 +08:00
|
|
|
|
2021-08-17 14:39:29 +08:00
|
|
|
spin_lock(&dma_win_list_lock);
|
|
|
|
list_add(&window->list, &dma_win_list);
|
|
|
|
spin_unlock(&dma_win_list_lock);
|
2011-02-10 17:10:47 +08:00
|
|
|
}
|
2021-08-17 14:39:27 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static int find_existing_ddw_windows(void)
|
|
|
|
{
|
|
|
|
if (!firmware_has_feature(FW_FEATURE_LPAR))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
find_existing_ddw_windows_named(DIRECT64_PROPNAME);
|
powerpc/pseries/iommu: Make use of DDW for indirect mapping
So far it's assumed possible to map the guest RAM 1:1 to the bus, which
works with a small number of devices. SRIOV changes it as the user can
configure hundreds VFs and since phyp preallocates TCEs and does not
allow IOMMU pages bigger than 64K, it has to limit the number of TCEs
per a PE to limit waste of physical pages.
As of today, if the assumed direct mapping is not possible, DDW creation
is skipped and the default DMA window "ibm,dma-window" is used instead.
By using DDW, indirect mapping can get more TCEs than available for the
default DMA window, and also get access to using much larger pagesizes
(16MB as implemented in qemu vs 4k from default DMA window), causing a
significant increase on the maximum amount of memory that can be IOMMU
mapped at the same time.
Indirect mapping will only be used if direct mapping is not a
possibility.
For indirect mapping, it's necessary to re-create the iommu_table with
the new DMA window parameters, so iommu_alloc() can use it.
Removing the default DMA window for using DDW with indirect mapping
is only allowed if there is no current IOMMU memory allocated in
the iommu_table. enable_ddw() is aborted otherwise.
Even though there won't be both direct and indirect mappings at the
same time, we can't reuse the DIRECT64_PROPNAME property name, or else
an older kexec()ed kernel can assume direct mapping, and skip
iommu_alloc(), causing undesirable behavior.
So a new property name DMA64_PROPNAME "linux,dma64-ddr-window-info"
was created to represent a DDW that does not allow direct mapping.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210817063929.38701-11-leobras.c@gmail.com
2021-08-17 14:39:28 +08:00
|
|
|
find_existing_ddw_windows_named(DMA64_PROPNAME);
|
2011-02-10 17:10:47 +08:00
|
|
|
|
2011-05-11 20:24:59 +08:00
|
|
|
return 0;
|
2011-02-10 17:10:47 +08:00
|
|
|
}
|
2011-05-11 20:24:59 +08:00
|
|
|
machine_arch_initcall(pseries, find_existing_ddw_windows);
|
2011-02-10 17:10:47 +08:00
|
|
|
|
powerpc/pseries/iommu: Update call to ibm, query-pe-dma-windows
>From LoPAR level 2.8, "ibm,ddw-extensions" index 3 can make the number of
outputs from "ibm,query-pe-dma-windows" go from 5 to 6.
This change of output size is meant to expand the address size of
largest_available_block PE TCE from 32-bit to 64-bit, which ends up
shifting page_size and migration_capable.
This ends up requiring the update of
ddw_query_response->largest_available_block from u32 to u64, and manually
assigning the values from the buffer into this struct, according to
output size.
Also, a routine was created for helping reading the ddw extensions as
suggested by LoPAR: First reading the size of the extension array from
index 0, checking if the property exists, and then returning it's value.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Tested-by: David Dai <zdai@linux.vnet.ibm.com>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200805030455.123024-3-leobras.c@gmail.com
2020-08-05 11:04:53 +08:00
|
|
|
/**
|
|
|
|
* ddw_read_ext - Get the value of an DDW extension
|
|
|
|
* @np: device node from which the extension value is to be read.
|
|
|
|
* @extnum: index number of the extension.
|
|
|
|
* @value: pointer to return value, modified when extension is available.
|
|
|
|
*
|
|
|
|
* Checks if "ibm,ddw-extensions" exists for this node, and get the value
|
|
|
|
* on index 'extnum'.
|
|
|
|
* It can be used only to check if a property exists, passing value == NULL.
|
|
|
|
*
|
|
|
|
* Returns:
|
|
|
|
* 0 if extension successfully read
|
|
|
|
* -EINVAL if the "ibm,ddw-extensions" does not exist,
|
|
|
|
* -ENODATA if "ibm,ddw-extensions" does not have a value, and
|
|
|
|
* -EOVERFLOW if "ibm,ddw-extensions" does not contain this extension.
|
|
|
|
*/
|
|
|
|
static inline int ddw_read_ext(const struct device_node *np, int extnum,
|
|
|
|
u32 *value)
|
|
|
|
{
|
|
|
|
static const char propname[] = "ibm,ddw-extensions";
|
|
|
|
u32 count;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
ret = of_property_read_u32_index(np, propname, DDW_EXT_SIZE, &count);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
if (count < extnum)
|
|
|
|
return -EOVERFLOW;
|
|
|
|
|
|
|
|
if (!value)
|
|
|
|
value = &count;
|
|
|
|
|
|
|
|
return of_property_read_u32_index(np, propname, extnum, value);
|
|
|
|
}
|
|
|
|
|
2011-05-11 20:25:00 +08:00
|
|
|
static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail,
|
powerpc/pseries/iommu: Update call to ibm, query-pe-dma-windows
>From LoPAR level 2.8, "ibm,ddw-extensions" index 3 can make the number of
outputs from "ibm,query-pe-dma-windows" go from 5 to 6.
This change of output size is meant to expand the address size of
largest_available_block PE TCE from 32-bit to 64-bit, which ends up
shifting page_size and migration_capable.
This ends up requiring the update of
ddw_query_response->largest_available_block from u32 to u64, and manually
assigning the values from the buffer into this struct, according to
output size.
Also, a routine was created for helping reading the ddw extensions as
suggested by LoPAR: First reading the size of the extension array from
index 0, checking if the property exists, and then returning it's value.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Tested-by: David Dai <zdai@linux.vnet.ibm.com>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200805030455.123024-3-leobras.c@gmail.com
2020-08-05 11:04:53 +08:00
|
|
|
struct ddw_query_response *query,
|
|
|
|
struct device_node *parent)
|
2011-02-10 17:10:47 +08:00
|
|
|
{
|
2016-04-12 03:17:23 +08:00
|
|
|
struct device_node *dn;
|
|
|
|
struct pci_dn *pdn;
|
powerpc/pseries/iommu: Update call to ibm, query-pe-dma-windows
>From LoPAR level 2.8, "ibm,ddw-extensions" index 3 can make the number of
outputs from "ibm,query-pe-dma-windows" go from 5 to 6.
This change of output size is meant to expand the address size of
largest_available_block PE TCE from 32-bit to 64-bit, which ends up
shifting page_size and migration_capable.
This ends up requiring the update of
ddw_query_response->largest_available_block from u32 to u64, and manually
assigning the values from the buffer into this struct, according to
output size.
Also, a routine was created for helping reading the ddw extensions as
suggested by LoPAR: First reading the size of the extension array from
index 0, checking if the property exists, and then returning it's value.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Tested-by: David Dai <zdai@linux.vnet.ibm.com>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200805030455.123024-3-leobras.c@gmail.com
2020-08-05 11:04:53 +08:00
|
|
|
u32 cfg_addr, ext_query, query_out[5];
|
2011-02-10 17:10:47 +08:00
|
|
|
u64 buid;
|
powerpc/pseries/iommu: Update call to ibm, query-pe-dma-windows
>From LoPAR level 2.8, "ibm,ddw-extensions" index 3 can make the number of
outputs from "ibm,query-pe-dma-windows" go from 5 to 6.
This change of output size is meant to expand the address size of
largest_available_block PE TCE from 32-bit to 64-bit, which ends up
shifting page_size and migration_capable.
This ends up requiring the update of
ddw_query_response->largest_available_block from u32 to u64, and manually
assigning the values from the buffer into this struct, according to
output size.
Also, a routine was created for helping reading the ddw extensions as
suggested by LoPAR: First reading the size of the extension array from
index 0, checking if the property exists, and then returning it's value.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Tested-by: David Dai <zdai@linux.vnet.ibm.com>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200805030455.123024-3-leobras.c@gmail.com
2020-08-05 11:04:53 +08:00
|
|
|
int ret, out_sz;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* From LoPAR level 2.8, "ibm,ddw-extensions" index 3 can rule how many
|
|
|
|
* output parameters ibm,query-pe-dma-windows will have, ranging from
|
|
|
|
* 5 to 6.
|
|
|
|
*/
|
|
|
|
ret = ddw_read_ext(parent, DDW_EXT_QUERY_OUT_SIZE, &ext_query);
|
|
|
|
if (!ret && ext_query == 1)
|
|
|
|
out_sz = 6;
|
|
|
|
else
|
|
|
|
out_sz = 5;
|
2011-02-10 17:10:47 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Get the config address and phb buid of the PE window.
|
|
|
|
* Rely on eeh to retrieve this for us.
|
|
|
|
* Retrieve them from the pci device, not the node with the
|
|
|
|
* dma-window property
|
|
|
|
*/
|
2016-04-12 03:17:23 +08:00
|
|
|
dn = pci_device_to_OF_node(dev);
|
|
|
|
pdn = PCI_DN(dn);
|
|
|
|
buid = pdn->phb->buid;
|
2016-05-26 07:56:07 +08:00
|
|
|
cfg_addr = ((pdn->busno << 16) | (pdn->devfn << 8));
|
2012-03-21 05:30:28 +08:00
|
|
|
|
powerpc/pseries/iommu: Update call to ibm, query-pe-dma-windows
>From LoPAR level 2.8, "ibm,ddw-extensions" index 3 can make the number of
outputs from "ibm,query-pe-dma-windows" go from 5 to 6.
This change of output size is meant to expand the address size of
largest_available_block PE TCE from 32-bit to 64-bit, which ends up
shifting page_size and migration_capable.
This ends up requiring the update of
ddw_query_response->largest_available_block from u32 to u64, and manually
assigning the values from the buffer into this struct, according to
output size.
Also, a routine was created for helping reading the ddw extensions as
suggested by LoPAR: First reading the size of the extension array from
index 0, checking if the property exists, and then returning it's value.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Tested-by: David Dai <zdai@linux.vnet.ibm.com>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200805030455.123024-3-leobras.c@gmail.com
2020-08-05 11:04:53 +08:00
|
|
|
ret = rtas_call(ddw_avail[DDW_QUERY_PE_DMA_WIN], 3, out_sz, query_out,
|
2020-08-05 11:04:52 +08:00
|
|
|
cfg_addr, BUID_HI(buid), BUID_LO(buid));
|
powerpc/pseries/iommu: Update call to ibm, query-pe-dma-windows
>From LoPAR level 2.8, "ibm,ddw-extensions" index 3 can make the number of
outputs from "ibm,query-pe-dma-windows" go from 5 to 6.
This change of output size is meant to expand the address size of
largest_available_block PE TCE from 32-bit to 64-bit, which ends up
shifting page_size and migration_capable.
This ends up requiring the update of
ddw_query_response->largest_available_block from u32 to u64, and manually
assigning the values from the buffer into this struct, according to
output size.
Also, a routine was created for helping reading the ddw extensions as
suggested by LoPAR: First reading the size of the extension array from
index 0, checking if the property exists, and then returning it's value.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Tested-by: David Dai <zdai@linux.vnet.ibm.com>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200805030455.123024-3-leobras.c@gmail.com
2020-08-05 11:04:53 +08:00
|
|
|
|
|
|
|
switch (out_sz) {
|
|
|
|
case 5:
|
|
|
|
query->windows_available = query_out[0];
|
|
|
|
query->largest_available_block = query_out[1];
|
|
|
|
query->page_size = query_out[2];
|
|
|
|
query->migration_capable = query_out[3];
|
|
|
|
break;
|
|
|
|
case 6:
|
|
|
|
query->windows_available = query_out[0];
|
|
|
|
query->largest_available_block = ((u64)query_out[1] << 32) |
|
|
|
|
query_out[2];
|
|
|
|
query->page_size = query_out[3];
|
|
|
|
query->migration_capable = query_out[4];
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2022-06-01 12:01:17 +08:00
|
|
|
dev_info(&dev->dev, "ibm,query-pe-dma-windows(%x) %x %x %x returned %d, lb=%llx ps=%x wn=%d\n",
|
|
|
|
ddw_avail[DDW_QUERY_PE_DMA_WIN], cfg_addr, BUID_HI(buid),
|
|
|
|
BUID_LO(buid), ret, query->largest_available_block,
|
|
|
|
query->page_size, query->windows_available);
|
|
|
|
|
2011-02-10 17:10:47 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2011-05-11 20:25:00 +08:00
|
|
|
static int create_ddw(struct pci_dev *dev, const u32 *ddw_avail,
|
2011-02-10 17:10:47 +08:00
|
|
|
struct ddw_create_response *create, int page_shift,
|
|
|
|
int window_shift)
|
|
|
|
{
|
2016-04-12 03:17:23 +08:00
|
|
|
struct device_node *dn;
|
|
|
|
struct pci_dn *pdn;
|
2011-02-10 17:10:47 +08:00
|
|
|
u32 cfg_addr;
|
|
|
|
u64 buid;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Get the config address and phb buid of the PE window.
|
|
|
|
* Rely on eeh to retrieve this for us.
|
|
|
|
* Retrieve them from the pci device, not the node with the
|
|
|
|
* dma-window property
|
|
|
|
*/
|
2016-04-12 03:17:23 +08:00
|
|
|
dn = pci_device_to_OF_node(dev);
|
|
|
|
pdn = PCI_DN(dn);
|
|
|
|
buid = pdn->phb->buid;
|
2016-05-26 07:56:07 +08:00
|
|
|
cfg_addr = ((pdn->busno << 16) | (pdn->devfn << 8));
|
2011-02-10 17:10:47 +08:00
|
|
|
|
|
|
|
do {
|
|
|
|
/* extra outputs are LIOBN and dma-addr (hi, lo) */
|
2020-08-05 11:04:52 +08:00
|
|
|
ret = rtas_call(ddw_avail[DDW_CREATE_PE_DMA_WIN], 5, 4,
|
|
|
|
(u32 *)create, cfg_addr, BUID_HI(buid),
|
|
|
|
BUID_LO(buid), page_shift, window_shift);
|
2011-02-10 17:10:47 +08:00
|
|
|
} while (rtas_busy_delay(ret));
|
|
|
|
dev_info(&dev->dev,
|
|
|
|
"ibm,create-pe-dma-window(%x) %x %x %x %x %x returned %d "
|
2020-08-05 11:04:52 +08:00
|
|
|
"(liobn = 0x%x starting addr = %x %x)\n",
|
|
|
|
ddw_avail[DDW_CREATE_PE_DMA_WIN], cfg_addr, BUID_HI(buid),
|
|
|
|
BUID_LO(buid), page_shift, window_shift, ret, create->liobn,
|
|
|
|
create->addr_hi, create->addr_lo);
|
2011-02-10 17:10:47 +08:00
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
powerpc/pseries: close DDW race between functions of adapter
Given a PCI device with multiple functions in a DDW capable slot, the
following situation can be encountered: When the first function sets a
64-bit DMA mask, enable_ddw() will be called and we can fail to properly
configure DDW (the most common reason being the new DMA window's size is
not large enough to map all of an LPAR's memory). With the recent
changes to DDW, we remove the base window in order to determine if the
new window is of sufficient size to cover an LPAR's memory. We correctly
replace the base window if we find that not to be the case. However,
once we go through and re-configured 32-bit DMA via the IOMMU, the next
function of the adapter will go through the same process. And since DDW
is a characteristic of the slot itself, we are most likely going to fail
again. But to determine we are going to fail the second slot, we again
remove the base window -- but that is now in-use by the first
function/driver, which might be issuing I/O already.
To close this window, keep a list of all the failed struct device_nodes
that have failed to configure DDW. If the current device_node is in that
list, just fail out immediately and fall back to 32-bit DMA without
doing any DDW manipulation.
Signed-off-by: Nishanth Aravamudan <nacc@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
2013-03-07 20:33:03 +08:00
|
|
|
struct failed_ddw_pdn {
|
|
|
|
struct device_node *pdn;
|
|
|
|
struct list_head list;
|
|
|
|
};
|
|
|
|
|
|
|
|
static LIST_HEAD(failed_ddw_pdn_list);
|
|
|
|
|
2018-12-19 16:52:18 +08:00
|
|
|
static phys_addr_t ddw_memory_hotplug_max(void)
|
|
|
|
{
|
|
|
|
phys_addr_t max_addr = memory_hotplug_max();
|
|
|
|
struct device_node *memory;
|
|
|
|
|
|
|
|
for_each_node_by_type(memory, "memory") {
|
|
|
|
unsigned long start, size;
|
2019-04-07 10:48:08 +08:00
|
|
|
int n_mem_addr_cells, n_mem_size_cells, len;
|
2018-12-19 16:52:18 +08:00
|
|
|
const __be32 *memcell_buf;
|
|
|
|
|
|
|
|
memcell_buf = of_get_property(memory, "reg", &len);
|
|
|
|
if (!memcell_buf || len <= 0)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
n_mem_addr_cells = of_n_addr_cells(memory);
|
|
|
|
n_mem_size_cells = of_n_size_cells(memory);
|
|
|
|
|
|
|
|
start = of_read_number(memcell_buf, n_mem_addr_cells);
|
|
|
|
memcell_buf += n_mem_addr_cells;
|
|
|
|
size = of_read_number(memcell_buf, n_mem_size_cells);
|
|
|
|
memcell_buf += n_mem_size_cells;
|
|
|
|
|
|
|
|
max_addr = max_t(phys_addr_t, max_addr, start + size);
|
|
|
|
}
|
|
|
|
|
|
|
|
return max_addr;
|
|
|
|
}
|
|
|
|
|
powerpc/pseries/iommu: Allow bigger 64bit window by removing default DMA window
On LoPAR "DMA Window Manipulation Calls", it's recommended to remove the
default DMA window for the device, before attempting to configure a DDW,
in order to make the maximum resources available for the next DDW to be
created.
This is a requirement for using DDW on devices in which hypervisor
allows only one DMA window.
If setting up a new DDW fails anywhere after the removal of this
default DMA window, it's needed to restore the default DMA window.
For this, an implementation of ibm,reset-pe-dma-windows rtas call is
needed:
Platforms supporting the DDW option starting with LoPAR level 2.7 implement
ibm,ddw-extensions. The first extension available (index 2) carries the
token for ibm,reset-pe-dma-windows rtas call, which is used to restore
the default DMA window for a device, if it has been deleted.
It does so by resetting the TCE table allocation for the PE to its
boot time value, available in "ibm,dma-window" device tree node.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Tested-by: David Dai <zdai@linux.vnet.ibm.com>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200805030455.123024-5-leobras.c@gmail.com
2020-08-05 11:04:55 +08:00
|
|
|
/*
|
|
|
|
* Platforms supporting the DDW option starting with LoPAR level 2.7 implement
|
|
|
|
* ibm,ddw-extensions, which carries the rtas token for
|
|
|
|
* ibm,reset-pe-dma-windows.
|
|
|
|
* That rtas-call can be used to restore the default DMA window for the device.
|
|
|
|
*/
|
|
|
|
static void reset_dma_window(struct pci_dev *dev, struct device_node *par_dn)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
u32 cfg_addr, reset_dma_win;
|
|
|
|
u64 buid;
|
|
|
|
struct device_node *dn;
|
|
|
|
struct pci_dn *pdn;
|
|
|
|
|
|
|
|
ret = ddw_read_ext(par_dn, DDW_EXT_RESET_DMA_WIN, &reset_dma_win);
|
|
|
|
if (ret)
|
|
|
|
return;
|
|
|
|
|
|
|
|
dn = pci_device_to_OF_node(dev);
|
|
|
|
pdn = PCI_DN(dn);
|
|
|
|
buid = pdn->phb->buid;
|
|
|
|
cfg_addr = (pdn->busno << 16) | (pdn->devfn << 8);
|
|
|
|
|
|
|
|
ret = rtas_call(reset_dma_win, 3, 1, NULL, cfg_addr, BUID_HI(buid),
|
|
|
|
BUID_LO(buid));
|
|
|
|
if (ret)
|
|
|
|
dev_info(&dev->dev,
|
|
|
|
"ibm,reset-pe-dma-windows(%x) %x %x %x returned %d ",
|
|
|
|
reset_dma_win, cfg_addr, BUID_HI(buid), BUID_LO(buid),
|
|
|
|
ret);
|
|
|
|
}
|
|
|
|
|
2021-04-09 04:19:16 +08:00
|
|
|
/* Return largest page shift based on "IO Page Sizes" output of ibm,query-pe-dma-window. */
|
|
|
|
static int iommu_get_page_shift(u32 query_page_size)
|
|
|
|
{
|
2021-10-06 12:47:35 +08:00
|
|
|
/* Supported IO page-sizes according to LoPAR, note that 2M is out of order */
|
2021-04-09 04:19:16 +08:00
|
|
|
const int shift[] = {
|
|
|
|
__builtin_ctzll(SZ_4K), __builtin_ctzll(SZ_64K), __builtin_ctzll(SZ_16M),
|
|
|
|
__builtin_ctzll(SZ_32M), __builtin_ctzll(SZ_64M), __builtin_ctzll(SZ_128M),
|
2021-10-06 12:47:35 +08:00
|
|
|
__builtin_ctzll(SZ_256M), __builtin_ctzll(SZ_16G), __builtin_ctzll(SZ_2M)
|
2021-04-09 04:19:16 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
int i = ARRAY_SIZE(shift) - 1;
|
2021-10-06 12:47:35 +08:00
|
|
|
int ret = 0;
|
2021-04-09 04:19:16 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* On LoPAR, ibm,query-pe-dma-window outputs "IO Page Sizes" using a bit field:
|
|
|
|
* - bit 31 means 4k pages are supported,
|
|
|
|
* - bit 30 means 64k pages are supported, and so on.
|
|
|
|
* Larger pagesizes map more memory with the same amount of TCEs, so start probing them.
|
|
|
|
*/
|
|
|
|
for (; i >= 0 ; i--) {
|
|
|
|
if (query_page_size & (1 << i))
|
2021-10-06 12:47:35 +08:00
|
|
|
ret = max(ret, shift[i]);
|
2021-04-09 04:19:16 +08:00
|
|
|
}
|
|
|
|
|
2021-10-06 12:47:35 +08:00
|
|
|
return ret;
|
2021-04-09 04:19:16 +08:00
|
|
|
}
|
|
|
|
|
powerpc/pseries/iommu: Add ddw_property_create() and refactor enable_ddw()
Code used to create a ddw property that was previously scattered in
enable_ddw() is now gathered in ddw_property_create(), which deals with
allocation and filling the property, leaving it ready for
of_property_add(), which now occurs in sequence.
This created an opportunity to reorganize the second part of enable_ddw():
Without this patch enable_ddw() does, in order:
kzalloc() property & members, create_ddw(), fill ddwprop inside property,
ddw_list_new_entry(), do tce_setrange_multi_pSeriesLP_walk in all memory,
of_add_property(), and list_add().
With this patch enable_ddw() does, in order:
create_ddw(), ddw_property_create(), of_add_property(),
ddw_list_new_entry(), do tce_setrange_multi_pSeriesLP_walk in all memory,
and list_add().
This change requires of_remove_property() in case anything fails after
of_add_property(), but we get to do tce_setrange_multi_pSeriesLP_walk
in all memory, which looks the most expensive operation, only if
everything else succeeds.
Also, the error path got remove_ddw() replaced by a new helper
__remove_dma_window(), which only removes the new DDW with an rtas-call.
For this, a new helper clean_dma_window() was needed to clean anything
that could be left if walk_system_ram_range() fails.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210817063929.38701-7-leobras.c@gmail.com
2021-08-17 14:39:24 +08:00
|
|
|
/*
 * Build a device-tree property describing a dynamic DMA window.
 *
 * The property is named with a copy of @propname and its value is a freshly
 * allocated struct dynamic_dma_window_prop holding @liobn, @dma_addr,
 * @page_shift and @window_shift converted to big-endian.
 *
 * Returns the new property, or NULL if any allocation fails (everything
 * allocated so far is freed in that case). Nothing is freed on success, so
 * the returned property and its members are left to the caller.
 */
static struct property *ddw_property_create(const char *propname, u32 liobn, u64 dma_addr,
					    u32 page_shift, u32 window_shift)
{
	struct dynamic_dma_window_prop *win;
	struct property *prop;

	prop = kzalloc(sizeof(*prop), GFP_KERNEL);
	if (!prop)
		return NULL;

	/* Attempt both allocations before checking; kfree(NULL) is a no-op. */
	prop->name = kstrdup(propname, GFP_KERNEL);
	win = kzalloc(sizeof(*win), GFP_KERNEL);
	if (!prop->name || !win)
		goto err_free;

	win->liobn = cpu_to_be32(liobn);
	win->dma_base = cpu_to_be64(dma_addr);
	win->tce_shift = cpu_to_be32(page_shift);
	win->window_shift = cpu_to_be32(window_shift);

	prop->value = win;
	prop->length = sizeof(*win);

	return prop;

err_free:
	kfree(prop->name);
	kfree(win);
	kfree(prop);
	return NULL;
}
|
|
|
|
|
2011-02-10 17:10:47 +08:00
|
|
|
/*
|
|
|
|
* If the PE supports dynamic dma windows, and there is space for a table
|
|
|
|
* that can map all pages in a linear offset, then setup such a table,
|
|
|
|
* and record the dma-offset in the struct device.
|
|
|
|
*
|
|
|
|
* dev: the pci device we are checking
|
|
|
|
* pdn: the parent pe node with the ibm,dma_window property
|
|
|
|
* Future: also check if we can remap the base window for our base page size
|
|
|
|
*
|
2021-08-17 14:39:23 +08:00
|
|
|
* returns true if it can map all pages (direct mapping), false otherwise.
|
2011-02-10 17:10:47 +08:00
|
|
|
*/
|
2021-08-17 14:39:23 +08:00
|
|
|
static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
|
2011-02-10 17:10:47 +08:00
|
|
|
{
|
2020-10-29 09:52:41 +08:00
|
|
|
int len = 0, ret;
|
|
|
|
int max_ram_len = order_base_2(ddw_memory_hotplug_max());
|
2011-02-10 17:10:47 +08:00
|
|
|
struct ddw_query_response query;
|
|
|
|
struct ddw_create_response create;
|
|
|
|
int page_shift;
|
powerpc/pseries/iommu: Add ddw_property_create() and refactor enable_ddw()
Code used to create a ddw property that was previously scattered in
enable_ddw() is now gathered in ddw_property_create(), which deals with
allocation and filling the property, letting it ready for
of_property_add(), which now occurs in sequence.
This created an opportunity to reorganize the second part of enable_ddw():
Without this patch enable_ddw() does, in order:
kzalloc() property & members, create_ddw(), fill ddwprop inside property,
ddw_list_new_entry(), do tce_setrange_multi_pSeriesLP_walk in all memory,
of_add_property(), and list_add().
With this patch enable_ddw() does, in order:
create_ddw(), ddw_property_create(), of_add_property(),
ddw_list_new_entry(), do tce_setrange_multi_pSeriesLP_walk in all memory,
and list_add().
This change requires of_remove_property() in case anything fails after
of_add_property(), but we get to do tce_setrange_multi_pSeriesLP_walk
in all memory, which looks the most expensive operation, only if
everything else succeeds.
Also, the error path got remove_ddw() replaced by a new helper
__remove_dma_window(), which only removes the new DDW with an rtas-call.
For this, a new helper clean_dma_window() was needed to clean anything
that could left if walk_system_ram_range() fails.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210817063929.38701-7-leobras.c@gmail.com
2021-08-17 14:39:24 +08:00
|
|
|
u64 win_addr;
|
powerpc/pseries/iommu: Make use of DDW for indirect mapping
So far it's assumed possible to map the guest RAM 1:1 to the bus, which
works with a small number of devices. SRIOV changes it as the user can
configure hundreds VFs and since phyp preallocates TCEs and does not
allow IOMMU pages bigger than 64K, it has to limit the number of TCEs
per a PE to limit waste of physical pages.
As of today, if the assumed direct mapping is not possible, DDW creation
is skipped and the default DMA window "ibm,dma-window" is used instead.
By using DDW, indirect mapping can get more TCEs than available for the
default DMA window, and also get access to using much larger pagesizes
(16MB as implemented in qemu vs 4k from default DMA window), causing a
significant increase on the maximum amount of memory that can be IOMMU
mapped at the same time.
Indirect mapping will only be used if direct mapping is not a
possibility.
For indirect mapping, it's necessary to re-create the iommu_table with
the new DMA window parameters, so iommu_alloc() can use it.
Removing the default DMA window for using DDW with indirect mapping
is only allowed if there is no current IOMMU memory allocated in
the iommu_table. enable_ddw() is aborted otherwise.
Even though there won't be both direct and indirect mappings at the
same time, we can't reuse the DIRECT64_PROPNAME property name, or else
an older kexec()ed kernel can assume direct mapping, and skip
iommu_alloc(), causing undesirable behavior.
So a new property name DMA64_PROPNAME "linux,dma64-ddr-window-info"
was created to represent a DDW that does not allow direct mapping.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210817063929.38701-11-leobras.c@gmail.com
2021-08-17 14:39:28 +08:00
|
|
|
const char *win_name;
|
2011-02-10 17:10:47 +08:00
|
|
|
struct device_node *dn;
|
2020-08-05 11:04:52 +08:00
|
|
|
u32 ddw_avail[DDW_APPLICABLE_SIZE];
|
2021-08-17 14:39:29 +08:00
|
|
|
struct dma_win *window;
|
2011-05-06 21:27:30 +08:00
|
|
|
struct property *win64;
|
powerpc/pseries: close DDW race between functions of adapter
Given a PCI device with multiple functions in a DDW capable slot, the
following situation can be encountered: When the first function sets a
64-bit DMA mask, enable_ddw() will be called and we can fail to properly
configure DDW (the most common reason being the new DMA window's size is
not large enough to map all of an LPAR's memory). With the recent
changes to DDW, we remove the base window in order to determine if the
new window is of sufficient size to cover an LPAR's memory. We correctly
replace the base window if we find that not to be the case. However,
once we go through and re-configured 32-bit DMA via the IOMMU, the next
function of the adapter will go through the same process. And since DDW
is a characteristic of the slot itself, we are most likely going to fail
again. But to determine we are going to fail the second slot, we again
remove the base window -- but that is now in-use by the first
function/driver, which might be issuing I/O already.
To close this window, keep a list of all the failed struct device_nodes
that have failed to configure DDW. If the current device_node is in that
list, just fail out immediately and fall back to 32-bit DMA without
doing any DDW manipulation.
Signed-off-by: Nishanth Aravamudan <nacc@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
2013-03-07 20:33:03 +08:00
|
|
|
struct failed_ddw_pdn *fpdn;
|
powerpc/pseries/iommu: Make use of DDW for indirect mapping
So far it's assumed possible to map the guest RAM 1:1 to the bus, which
works with a small number of devices. SRIOV changes it as the user can
configure hundreds VFs and since phyp preallocates TCEs and does not
allow IOMMU pages bigger than 64K, it has to limit the number of TCEs
per a PE to limit waste of physical pages.
As of today, if the assumed direct mapping is not possible, DDW creation
is skipped and the default DMA window "ibm,dma-window" is used instead.
By using DDW, indirect mapping can get more TCEs than available for the
default DMA window, and also get access to using much larger pagesizes
(16MB as implemented in qemu vs 4k from default DMA window), causing a
significant increase on the maximum amount of memory that can be IOMMU
mapped at the same time.
Indirect mapping will only be used if direct mapping is not a
possibility.
For indirect mapping, it's necessary to re-create the iommu_table with
the new DMA window parameters, so iommu_alloc() can use it.
Removing the default DMA window for using DDW with indirect mapping
is only allowed if there is no current IOMMU memory allocated in
the iommu_table. enable_ddw() is aborted otherwise.
Even though there won't be both direct and indirect mappings at the
same time, we can't reuse the DIRECT64_PROPNAME property name, or else
an older kexec()ed kernel can assume direct mapping, and skip
iommu_alloc(), causing undesirable behavior.
So a new property name DMA64_PROPNAME "linux,dma64-ddr-window-info"
was created to represent a DDW that does not allow direct mapping.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210817063929.38701-11-leobras.c@gmail.com
2021-08-17 14:39:28 +08:00
|
|
|
bool default_win_removed = false, direct_mapping = false;
|
2020-10-29 09:52:41 +08:00
|
|
|
bool pmem_present;
|
powerpc/pseries/iommu: Make use of DDW for indirect mapping
So far it's assumed possible to map the guest RAM 1:1 to the bus, which
works with a small number of devices. SRIOV changes it as the user can
configure hundreds VFs and since phyp preallocates TCEs and does not
allow IOMMU pages bigger than 64K, it has to limit the number of TCEs
per a PE to limit waste of physical pages.
As of today, if the assumed direct mapping is not possible, DDW creation
is skipped and the default DMA window "ibm,dma-window" is used instead.
By using DDW, indirect mapping can get more TCEs than available for the
default DMA window, and also get access to using much larger pagesizes
(16MB as implemented in qemu vs 4k from default DMA window), causing a
significant increase on the maximum amount of memory that can be IOMMU
mapped at the same time.
Indirect mapping will only be used if direct mapping is not a
possibility.
For indirect mapping, it's necessary to re-create the iommu_table with
the new DMA window parameters, so iommu_alloc() can use it.
Removing the default DMA window for using DDW with indirect mapping
is only allowed if there is no current IOMMU memory allocated in
the iommu_table. enable_ddw() is aborted otherwise.
Even though there won't be both direct and indirect mappings at the
same time, we can't reuse the DIRECT64_PROPNAME property name, or else
an older kexec()ed kernel can assume direct mapping, and skip
iommu_alloc(), causing undesirable behavior.
So a new property name DMA64_PROPNAME "linux,dma64-ddr-window-info"
was created to represent a DDW that does not allow direct mapping.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210817063929.38701-11-leobras.c@gmail.com
2021-08-17 14:39:28 +08:00
|
|
|
struct pci_dn *pci = PCI_DN(pdn);
|
2022-06-29 14:06:14 +08:00
|
|
|
struct property *default_win = NULL;
|
2020-10-29 09:52:41 +08:00
|
|
|
|
|
|
|
dn = of_find_node_by_type(NULL, "ibm,pmemory");
|
|
|
|
pmem_present = dn != NULL;
|
|
|
|
of_node_put(dn);
|
2011-02-10 17:10:47 +08:00
|
|
|
|
2021-08-17 14:39:29 +08:00
|
|
|
mutex_lock(&dma_win_init_mutex);
|
2011-02-10 17:10:47 +08:00
|
|
|
|
2021-08-17 14:39:23 +08:00
|
|
|
if (find_existing_ddw(pdn, &dev->dev.archdata.dma_offset, &len)) {
|
powerpc/pseries/iommu: Make use of DDW for indirect mapping
So far it's assumed possible to map the guest RAM 1:1 to the bus, which
works with a small number of devices. SRIOV changes it as the user can
configure hundreds VFs and since phyp preallocates TCEs and does not
allow IOMMU pages bigger than 64K, it has to limit the number of TCEs
per a PE to limit waste of physical pages.
As of today, if the assumed direct mapping is not possible, DDW creation
is skipped and the default DMA window "ibm,dma-window" is used instead.
By using DDW, indirect mapping can get more TCEs than available for the
default DMA window, and also get access to using much larger pagesizes
(16MB as implemented in qemu vs 4k from default DMA window), causing a
significant increase on the maximum amount of memory that can be IOMMU
mapped at the same time.
Indirect mapping will only be used if direct mapping is not a
possibility.
For indirect mapping, it's necessary to re-create the iommu_table with
the new DMA window parameters, so iommu_alloc() can use it.
Removing the default DMA window for using DDW with indirect mapping
is only allowed if there is no current IOMMU memory allocated in
the iommu_table. enable_ddw() is aborted otherwise.
Even though there won't be both direct and indirect mappings at the
same time, we can't reuse the DIRECT64_PROPNAME property name, or else
an older kexec()ed kernel can assume direct mapping, and skip
iommu_alloc(), causing undesirable behavior.
So a new property name DMA64_PROPNAME "linux,dma64-ddr-window-info"
was created to represent a DDW that does not allow direct mapping.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210817063929.38701-11-leobras.c@gmail.com
2021-08-17 14:39:28 +08:00
|
|
|
direct_mapping = (len >= max_ram_len);
|
2011-02-10 17:10:47 +08:00
|
|
|
goto out_unlock;
|
2021-08-17 14:39:23 +08:00
|
|
|
}
|
2011-02-10 17:10:47 +08:00
|
|
|
|
powerpc/pseries: close DDW race between functions of adapter
Given a PCI device with multiple functions in a DDW capable slot, the
following situation can be encountered: When the first function sets a
64-bit DMA mask, enable_ddw() will be called and we can fail to properly
configure DDW (the most common reason being the new DMA window's size is
not large enough to map all of an LPAR's memory). With the recent
changes to DDW, we remove the base window in order to determine if the
new window is of sufficient size to cover an LPAR's memory. We correctly
replace the base window if we find that not to be the case. However,
once we go through and re-configured 32-bit DMA via the IOMMU, the next
function of the adapter will go through the same process. And since DDW
is a characteristic of the slot itself, we are most likely going to fail
again. But to determine we are going to fail the second slot, we again
remove the base window -- but that is now in-use by the first
function/driver, which might be issuing I/O already.
To close this window, keep a list of all the failed struct device_nodes
that have failed to configure DDW. If the current device_node is in that
list, just fail out immediately and fall back to 32-bit DMA without
doing any DDW manipulation.
Signed-off-by: Nishanth Aravamudan <nacc@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
2013-03-07 20:33:03 +08:00
|
|
|
/*
|
|
|
|
* If we already went through this for a previous function of
|
|
|
|
* the same device and failed, we don't want to muck with the
|
|
|
|
* DMA window again, as it will race with in-flight operations
|
|
|
|
* and can lead to EEHs. The above mutex protects access to the
|
|
|
|
* list.
|
|
|
|
*/
|
|
|
|
list_for_each_entry(fpdn, &failed_ddw_pdn_list, list) {
|
2017-08-21 23:16:47 +08:00
|
|
|
if (fpdn->pdn == pdn)
|
powerpc/pseries: close DDW race between functions of adapter
Given a PCI device with multiple functions in a DDW capable slot, the
following situation can be encountered: When the first function sets a
64-bit DMA mask, enable_ddw() will be called and we can fail to properly
configure DDW (the most common reason being the new DMA window's size is
not large enough to map all of an LPAR's memory). With the recent
changes to DDW, we remove the base window in order to determine if the
new window is of sufficient size to cover an LPAR's memory. We correctly
replace the base window if we find that not to be the case. However,
once we go through and re-configured 32-bit DMA via the IOMMU, the next
function of the adapter will go through the same process. And since DDW
is a characteristic of the slot itself, we are most likely going to fail
again. But to determine we are going to fail the second slot, we again
remove the base window -- but that is now in-use by the first
function/driver, which might be issuing I/O already.
To close this window, keep a list of all the failed struct device_nodes
that have failed to configure DDW. If the current device_node is in that
list, just fail out immediately and fall back to 32-bit DMA without
doing any DDW manipulation.
Signed-off-by: Nishanth Aravamudan <nacc@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
2013-03-07 20:33:03 +08:00
|
|
|
goto out_unlock;
|
|
|
|
}
|
|
|
|
|
2011-02-10 17:10:47 +08:00
|
|
|
/*
|
|
|
|
* the ibm,ddw-applicable property holds the tokens for:
|
|
|
|
* ibm,query-pe-dma-window
|
|
|
|
* ibm,create-pe-dma-window
|
|
|
|
* ibm,remove-pe-dma-window
|
|
|
|
* for the given node in that order.
|
|
|
|
* the property is actually in the parent, not the PE
|
|
|
|
*/
|
2014-09-25 14:39:18 +08:00
|
|
|
ret = of_property_read_u32_array(pdn, "ibm,ddw-applicable",
|
2020-08-05 11:04:52 +08:00
|
|
|
&ddw_avail[0], DDW_APPLICABLE_SIZE);
|
2014-09-25 14:39:18 +08:00
|
|
|
if (ret)
|
2014-01-11 07:09:38 +08:00
|
|
|
goto out_failed;
|
2012-05-15 15:04:32 +08:00
|
|
|
|
2014-01-11 07:09:38 +08:00
|
|
|
/*
|
2011-02-10 17:10:47 +08:00
|
|
|
* Query if there is a second window of size to map the
|
|
|
|
* whole partition. Query returns number of windows, largest
|
|
|
|
* block assigned to PE (partition endpoint), and two bitmasks
|
|
|
|
* of page sizes: supported and supported for migrate-dma.
|
|
|
|
*/
|
|
|
|
dn = pci_device_to_OF_node(dev);
|
powerpc/pseries/iommu: Update call to ibm, query-pe-dma-windows
From LoPAR level 2.8, "ibm,ddw-extensions" index 3 can make the number of
outputs from "ibm,query-pe-dma-windows" go from 5 to 6.
This change of output size is meant to expand the address size of
largest_available_block PE TCE from 32-bit to 64-bit, which ends up
shifting page_size and migration_capable.
This ends up requiring the update of
ddw_query_response->largest_available_block from u32 to u64, and manually
assigning the values from the buffer into this struct, according to
output size.
Also, a routine was created for helping reading the ddw extensions as
suggested by LoPAR: First reading the size of the extension array from
index 0, checking if the property exists, and then returning it's value.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Tested-by: David Dai <zdai@linux.vnet.ibm.com>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200805030455.123024-3-leobras.c@gmail.com
2020-08-05 11:04:53 +08:00
|
|
|
ret = query_ddw(dev, ddw_avail, &query, pdn);
|
2011-02-10 17:10:47 +08:00
|
|
|
if (ret != 0)
|
2014-01-11 07:09:38 +08:00
|
|
|
goto out_failed;
|
2011-02-10 17:10:47 +08:00
|
|
|
|
powerpc/pseries/iommu: Allow bigger 64bit window by removing default DMA window
On LoPAR "DMA Window Manipulation Calls", it's recommended to remove the
default DMA window for the device, before attempting to configure a DDW,
in order to make the maximum resources available for the next DDW to be
created.
This is a requirement for using DDW on devices in which hypervisor
allows only one DMA window.
If setting up a new DDW fails anywhere after the removal of this
default DMA window, it's needed to restore the default DMA window.
For this, an implementation of ibm,reset-pe-dma-windows rtas call is
needed:
Platforms supporting the DDW option starting with LoPAR level 2.7 implement
ibm,ddw-extensions. The first extension available (index 2) carries the
token for ibm,reset-pe-dma-windows rtas call, which is used to restore
the default DMA window for a device, if it has been deleted.
It does so by resetting the TCE table allocation for the PE to it's
boot time value, available in "ibm,dma-window" device tree node.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Tested-by: David Dai <zdai@linux.vnet.ibm.com>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200805030455.123024-5-leobras.c@gmail.com
2020-08-05 11:04:55 +08:00
|
|
|
/*
|
|
|
|
* If there is no window available, remove the default DMA window,
|
|
|
|
* if it's present. This will make all the resources available to the
|
|
|
|
* new DDW window.
|
|
|
|
* If anything fails after this, we need to restore it, so also check
|
|
|
|
* for extensions presence.
|
|
|
|
*/
|
2011-02-10 17:10:47 +08:00
|
|
|
if (query.windows_available == 0) {
|
powerpc/pseries/iommu: Allow bigger 64bit window by removing default DMA window
On LoPAR "DMA Window Manipulation Calls", it's recommended to remove the
default DMA window for the device, before attempting to configure a DDW,
in order to make the maximum resources available for the next DDW to be
created.
This is a requirement for using DDW on devices in which hypervisor
allows only one DMA window.
If setting up a new DDW fails anywhere after the removal of this
default DMA window, it's needed to restore the default DMA window.
For this, an implementation of ibm,reset-pe-dma-windows rtas call is
needed:
Platforms supporting the DDW option starting with LoPAR level 2.7 implement
ibm,ddw-extensions. The first extension available (index 2) carries the
token for ibm,reset-pe-dma-windows rtas call, which is used to restore
the default DMA window for a device, if it has been deleted.
It does so by resetting the TCE table allocation for the PE to it's
boot time value, available in "ibm,dma-window" device tree node.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Tested-by: David Dai <zdai@linux.vnet.ibm.com>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200805030455.123024-5-leobras.c@gmail.com
2020-08-05 11:04:55 +08:00
|
|
|
int reset_win_ext;
|
|
|
|
|
2021-10-20 21:23:14 +08:00
|
|
|
/* DDW + IOMMU on single window may fail if there is any allocation */
|
2022-06-29 14:06:14 +08:00
|
|
|
if (iommu_table_in_use(pci->table_group->tables[0])) {
|
2021-10-20 21:23:14 +08:00
|
|
|
dev_warn(&dev->dev, "current IOMMU table in use, can't be replaced.\n");
|
|
|
|
goto out_failed;
|
|
|
|
}
|
|
|
|
|
powerpc/pseries/iommu: Allow bigger 64bit window by removing default DMA window
On LoPAR "DMA Window Manipulation Calls", it's recommended to remove the
default DMA window for the device, before attempting to configure a DDW,
in order to make the maximum resources available for the next DDW to be
created.
This is a requirement for using DDW on devices in which hypervisor
allows only one DMA window.
If setting up a new DDW fails anywhere after the removal of this
default DMA window, it's needed to restore the default DMA window.
For this, an implementation of ibm,reset-pe-dma-windows rtas call is
needed:
Platforms supporting the DDW option starting with LoPAR level 2.7 implement
ibm,ddw-extensions. The first extension available (index 2) carries the
token for ibm,reset-pe-dma-windows rtas call, which is used to restore
the default DMA window for a device, if it has been deleted.
It does so by resetting the TCE table allocation for the PE to its
boot time value, available in "ibm,dma-window" device tree node.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Tested-by: David Dai <zdai@linux.vnet.ibm.com>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200805030455.123024-5-leobras.c@gmail.com
2020-08-05 11:04:55 +08:00
|
|
|
default_win = of_find_property(pdn, "ibm,dma-window", NULL);
|
|
|
|
if (!default_win)
|
|
|
|
goto out_failed;
|
|
|
|
|
|
|
|
reset_win_ext = ddw_read_ext(pdn, DDW_EXT_RESET_DMA_WIN, NULL);
|
|
|
|
if (reset_win_ext)
|
|
|
|
goto out_failed;
|
|
|
|
|
|
|
|
remove_dma_window(pdn, ddw_avail, default_win);
|
|
|
|
default_win_removed = true;
|
|
|
|
|
|
|
|
/* Query again, to check if the window is available */
|
|
|
|
ret = query_ddw(dev, ddw_avail, &query, pdn);
|
|
|
|
if (ret != 0)
|
|
|
|
goto out_failed;
|
|
|
|
|
|
|
|
if (query.windows_available == 0) {
|
|
|
|
/* no windows are available for this device. */
|
|
|
|
dev_dbg(&dev->dev, "no free dynamic windows");
|
|
|
|
goto out_failed;
|
|
|
|
}
|
2011-02-10 17:10:47 +08:00
|
|
|
}
|
2021-04-09 04:19:16 +08:00
|
|
|
|
|
|
|
page_shift = iommu_get_page_shift(query.page_size);
|
|
|
|
if (!page_shift) {
|
2021-08-17 14:39:29 +08:00
|
|
|
dev_dbg(&dev->dev, "no supported page size in mask %x",
|
|
|
|
query.page_size);
|
2014-01-11 07:09:38 +08:00
|
|
|
goto out_failed;
|
2011-02-10 17:10:47 +08:00
|
|
|
}
|
powerpc/pseries/iommu: Make use of DDW for indirect mapping
So far it's assumed possible to map the guest RAM 1:1 to the bus, which
works with a small number of devices. SRIOV changes this, as the user can
configure hundreds of VFs, and since phyp preallocates TCEs and does not
allow IOMMU pages bigger than 64K, it has to limit the number of TCEs
per PE to limit the waste of physical pages.
As of today, if the assumed direct mapping is not possible, DDW creation
is skipped and the default DMA window "ibm,dma-window" is used instead.
By using DDW, indirect mapping can get more TCEs than available for the
default DMA window, and also get access to using much larger pagesizes
(16MB as implemented in qemu vs 4k from default DMA window), causing a
significant increase on the maximum amount of memory that can be IOMMU
mapped at the same time.
Indirect mapping will only be used if direct mapping is not a
possibility.
For indirect mapping, it's necessary to re-create the iommu_table with
the new DMA window parameters, so iommu_alloc() can use it.
Removing the default DMA window for using DDW with indirect mapping
is only allowed if there is no current IOMMU memory allocated in
the iommu_table. enable_ddw() is aborted otherwise.
Even though there won't be both direct and indirect mappings at the
same time, we can't reuse the DIRECT64_PROPNAME property name, or else
an older kexec()ed kernel can assume direct mapping, and skip
iommu_alloc(), causing undesirable behavior.
So a new property name DMA64_PROPNAME "linux,dma64-ddr-window-info"
was created to represent a DDW that does not allow direct mapping.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210817063929.38701-11-leobras.c@gmail.com
2021-08-17 14:39:28 +08:00
|
|
|
|
|
|
|
|
2020-10-29 09:52:41 +08:00
|
|
|
/*
|
|
|
|
* The "ibm,pmemory" can appear anywhere in the address space.
|
|
|
|
* Assuming it is still backed by page structs, try MAX_PHYSMEM_BITS
|
|
|
|
* for the upper limit and fallback to max RAM otherwise but this
|
|
|
|
* disables device::dma_ops_bypass.
|
|
|
|
*/
|
|
|
|
len = max_ram_len;
|
|
|
|
if (pmem_present) {
|
|
|
|
if (query.largest_available_block >=
|
|
|
|
(1ULL << (MAX_PHYSMEM_BITS - page_shift)))
|
2021-04-20 12:54:04 +08:00
|
|
|
len = MAX_PHYSMEM_BITS;
|
2020-10-29 09:52:41 +08:00
|
|
|
else
|
|
|
|
dev_info(&dev->dev, "Skipping ibm,pmemory");
|
|
|
|
}
|
|
|
|
|
powerpc/pseries/iommu: Make use of DDW for indirect mapping
So far it's assumed possible to map the guest RAM 1:1 to the bus, which
works with a small number of devices. SRIOV changes this, as the user can
configure hundreds of VFs, and since phyp preallocates TCEs and does not
allow IOMMU pages bigger than 64K, it has to limit the number of TCEs
per PE to limit the waste of physical pages.
As of today, if the assumed direct mapping is not possible, DDW creation
is skipped and the default DMA window "ibm,dma-window" is used instead.
By using DDW, indirect mapping can get more TCEs than available for the
default DMA window, and also get access to using much larger pagesizes
(16MB as implemented in qemu vs 4k from default DMA window), causing a
significant increase on the maximum amount of memory that can be IOMMU
mapped at the same time.
Indirect mapping will only be used if direct mapping is not a
possibility.
For indirect mapping, it's necessary to re-create the iommu_table with
the new DMA window parameters, so iommu_alloc() can use it.
Removing the default DMA window for using DDW with indirect mapping
is only allowed if there is no current IOMMU memory allocated in
the iommu_table. enable_ddw() is aborted otherwise.
Even though there won't be both direct and indirect mappings at the
same time, we can't reuse the DIRECT64_PROPNAME property name, or else
an older kexec()ed kernel can assume direct mapping, and skip
iommu_alloc(), causing undesirable behavior.
So a new property name DMA64_PROPNAME "linux,dma64-ddr-window-info"
was created to represent a DDW that does not allow direct mapping.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210817063929.38701-11-leobras.c@gmail.com
2021-08-17 14:39:28 +08:00
|
|
|
/* check if the available block * number of ptes will map everything */
|
2020-10-29 09:52:41 +08:00
|
|
|
if (query.largest_available_block < (1ULL << (len - page_shift))) {
|
|
|
|
dev_dbg(&dev->dev,
|
|
|
|
"can't map partition max 0x%llx with %llu %llu-sized pages\n",
|
|
|
|
1ULL << len,
|
|
|
|
query.largest_available_block,
|
|
|
|
1ULL << page_shift);
|
powerpc/pseries/iommu: Make use of DDW for indirect mapping
So far it's assumed possible to map the guest RAM 1:1 to the bus, which
works with a small number of devices. SRIOV changes this, as the user can
configure hundreds of VFs, and since phyp preallocates TCEs and does not
allow IOMMU pages bigger than 64K, it has to limit the number of TCEs
per PE to limit the waste of physical pages.
As of today, if the assumed direct mapping is not possible, DDW creation
is skipped and the default DMA window "ibm,dma-window" is used instead.
By using DDW, indirect mapping can get more TCEs than available for the
default DMA window, and also get access to using much larger pagesizes
(16MB as implemented in qemu vs 4k from default DMA window), causing a
significant increase on the maximum amount of memory that can be IOMMU
mapped at the same time.
Indirect mapping will only be used if direct mapping is not a
possibility.
For indirect mapping, it's necessary to re-create the iommu_table with
the new DMA window parameters, so iommu_alloc() can use it.
Removing the default DMA window for using DDW with indirect mapping
is only allowed if there is no current IOMMU memory allocated in
the iommu_table. enable_ddw() is aborted otherwise.
Even though there won't be both direct and indirect mappings at the
same time, we can't reuse the DIRECT64_PROPNAME property name, or else
an older kexec()ed kernel can assume direct mapping, and skip
iommu_alloc(), causing undesirable behavior.
So a new property name DMA64_PROPNAME "linux,dma64-ddr-window-info"
was created to represent a DDW that does not allow direct mapping.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210817063929.38701-11-leobras.c@gmail.com
2021-08-17 14:39:28 +08:00
|
|
|
|
|
|
|
len = order_base_2(query.largest_available_block << page_shift);
|
|
|
|
win_name = DMA64_PROPNAME;
|
|
|
|
} else {
|
2021-11-08 12:03:19 +08:00
|
|
|
direct_mapping = !default_win_removed ||
|
|
|
|
(len == MAX_PHYSMEM_BITS) ||
|
|
|
|
(!pmem_present && (len == max_ram_len));
|
|
|
|
win_name = direct_mapping ? DIRECT64_PROPNAME : DMA64_PROPNAME;
|
2011-02-10 17:10:47 +08:00
|
|
|
}
|
|
|
|
|
2011-05-11 20:25:00 +08:00
|
|
|
ret = create_ddw(dev, ddw_avail, &create, page_shift, len);
|
2011-02-10 17:10:47 +08:00
|
|
|
if (ret != 0)
|
powerpc/pseries/iommu: Add ddw_property_create() and refactor enable_ddw()
Code used to create a ddw property that was previously scattered in
enable_ddw() is now gathered in ddw_property_create(), which deals with
allocation and filling the property, leaving it ready for
of_add_property(), which now occurs in sequence.
This created an opportunity to reorganize the second part of enable_ddw():
Without this patch enable_ddw() does, in order:
kzalloc() property & members, create_ddw(), fill ddwprop inside property,
ddw_list_new_entry(), do tce_setrange_multi_pSeriesLP_walk in all memory,
of_add_property(), and list_add().
With this patch enable_ddw() does, in order:
create_ddw(), ddw_property_create(), of_add_property(),
ddw_list_new_entry(), do tce_setrange_multi_pSeriesLP_walk in all memory,
and list_add().
This change requires of_remove_property() in case anything fails after
of_add_property(), but we get to do tce_setrange_multi_pSeriesLP_walk
in all memory, which looks the most expensive operation, only if
everything else succeeds.
Also, the error path got remove_ddw() replaced by a new helper
__remove_dma_window(), which only removes the new DDW with an rtas-call.
For this, a new helper clean_dma_window() was needed to clean anything
that could be left if walk_system_ram_range() fails.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210817063929.38701-7-leobras.c@gmail.com
2021-08-17 14:39:24 +08:00
|
|
|
goto out_failed;
|
2011-02-10 17:10:47 +08:00
|
|
|
|
2017-08-21 23:16:47 +08:00
|
|
|
dev_dbg(&dev->dev, "created tce table LIOBN 0x%x for %pOF\n",
|
|
|
|
create.liobn, dn);
|
2011-02-10 17:10:47 +08:00
|
|
|
|
powerpc/pseries/iommu: Add ddw_property_create() and refactor enable_ddw()
Code used to create a ddw property that was previously scattered in
enable_ddw() is now gathered in ddw_property_create(), which deals with
allocation and filling the property, leaving it ready for
of_add_property(), which now occurs in sequence.
This created an opportunity to reorganize the second part of enable_ddw():
Without this patch enable_ddw() does, in order:
kzalloc() property & members, create_ddw(), fill ddwprop inside property,
ddw_list_new_entry(), do tce_setrange_multi_pSeriesLP_walk in all memory,
of_add_property(), and list_add().
With this patch enable_ddw() does, in order:
create_ddw(), ddw_property_create(), of_add_property(),
ddw_list_new_entry(), do tce_setrange_multi_pSeriesLP_walk in all memory,
and list_add().
This change requires of_remove_property() in case anything fails after
of_add_property(), but we get to do tce_setrange_multi_pSeriesLP_walk
in all memory, which looks the most expensive operation, only if
everything else succeeds.
Also, the error path got remove_ddw() replaced by a new helper
__remove_dma_window(), which only removes the new DDW with an rtas-call.
For this, a new helper clean_dma_window() was needed to clean anything
that could be left if walk_system_ram_range() fails.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210817063929.38701-7-leobras.c@gmail.com
2021-08-17 14:39:24 +08:00
|
|
|
win_addr = ((u64)create.addr_hi << 32) | create.addr_lo;
|
powerpc/pseries/iommu: Make use of DDW for indirect mapping
So far it's assumed possible to map the guest RAM 1:1 to the bus, which
works with a small number of devices. SRIOV changes this, as the user can
configure hundreds of VFs, and since phyp preallocates TCEs and does not
allow IOMMU pages bigger than 64K, it has to limit the number of TCEs
per PE to limit the waste of physical pages.
As of today, if the assumed direct mapping is not possible, DDW creation
is skipped and the default DMA window "ibm,dma-window" is used instead.
By using DDW, indirect mapping can get more TCEs than available for the
default DMA window, and also get access to using much larger pagesizes
(16MB as implemented in qemu vs 4k from default DMA window), causing a
significant increase on the maximum amount of memory that can be IOMMU
mapped at the same time.
Indirect mapping will only be used if direct mapping is not a
possibility.
For indirect mapping, it's necessary to re-create the iommu_table with
the new DMA window parameters, so iommu_alloc() can use it.
Removing the default DMA window for using DDW with indirect mapping
is only allowed if there is no current IOMMU memory allocated in
the iommu_table. enable_ddw() is aborted otherwise.
Even though there won't be both direct and indirect mappings at the
same time, we can't reuse the DIRECT64_PROPNAME property name, or else
an older kexec()ed kernel can assume direct mapping, and skip
iommu_alloc(), causing undesirable behavior.
So a new property name DMA64_PROPNAME "linux,dma64-ddr-window-info"
was created to represent a DDW that does not allow direct mapping.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210817063929.38701-11-leobras.c@gmail.com
2021-08-17 14:39:28 +08:00
|
|
|
win64 = ddw_property_create(win_name, create.liobn, win_addr, page_shift, len);
|
|
|
|
|
powerpc/pseries/iommu: Add ddw_property_create() and refactor enable_ddw()
Code used to create a ddw property that was previously scattered in
enable_ddw() is now gathered in ddw_property_create(), which deals with
allocation and filling the property, leaving it ready for
of_add_property(), which now occurs in sequence.
This created an opportunity to reorganize the second part of enable_ddw():
Without this patch enable_ddw() does, in order:
kzalloc() property & members, create_ddw(), fill ddwprop inside property,
ddw_list_new_entry(), do tce_setrange_multi_pSeriesLP_walk in all memory,
of_add_property(), and list_add().
With this patch enable_ddw() does, in order:
create_ddw(), ddw_property_create(), of_add_property(),
ddw_list_new_entry(), do tce_setrange_multi_pSeriesLP_walk in all memory,
and list_add().
This change requires of_remove_property() in case anything fails after
of_add_property(), but we get to do tce_setrange_multi_pSeriesLP_walk
in all memory, which looks the most expensive operation, only if
everything else succeeds.
Also, the error path got remove_ddw() replaced by a new helper
__remove_dma_window(), which only removes the new DDW with an rtas-call.
For this, a new helper clean_dma_window() was needed to clean anything
that could be left if walk_system_ram_range() fails.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210817063929.38701-7-leobras.c@gmail.com
2021-08-17 14:39:24 +08:00
|
|
|
if (!win64) {
|
|
|
|
dev_info(&dev->dev,
|
|
|
|
"couldn't allocate property, property name, or value\n");
|
|
|
|
goto out_remove_win;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = of_add_property(pdn, win64);
|
|
|
|
if (ret) {
|
2021-08-17 14:39:29 +08:00
|
|
|
dev_err(&dev->dev, "unable to add DMA window property for %pOF: %d",
|
powerpc/pseries/iommu: Add ddw_property_create() and refactor enable_ddw()
Code used to create a ddw property that was previously scattered in
enable_ddw() is now gathered in ddw_property_create(), which deals with
allocation and filling the property, leaving it ready for
of_add_property(), which now occurs in sequence.
This created an opportunity to reorganize the second part of enable_ddw():
Without this patch enable_ddw() does, in order:
kzalloc() property & members, create_ddw(), fill ddwprop inside property,
ddw_list_new_entry(), do tce_setrange_multi_pSeriesLP_walk in all memory,
of_add_property(), and list_add().
With this patch enable_ddw() does, in order:
create_ddw(), ddw_property_create(), of_add_property(),
ddw_list_new_entry(), do tce_setrange_multi_pSeriesLP_walk in all memory,
and list_add().
This change requires of_remove_property() in case anything fails after
of_add_property(), but we get to do tce_setrange_multi_pSeriesLP_walk
in all memory, which looks the most expensive operation, only if
everything else succeeds.
Also, the error path got remove_ddw() replaced by a new helper
__remove_dma_window(), which only removes the new DDW with an rtas-call.
For this, a new helper clean_dma_window() was needed to clean anything
that could be left if walk_system_ram_range() fails.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210817063929.38701-7-leobras.c@gmail.com
2021-08-17 14:39:24 +08:00
|
|
|
pdn, ret);
|
|
|
|
goto out_free_prop;
|
|
|
|
}
|
|
|
|
|
|
|
|
window = ddw_list_new_entry(pdn, win64->value);
|
2011-02-10 17:10:47 +08:00
|
|
|
if (!window)
|
powerpc/pseries/iommu: Add ddw_property_create() and refactor enable_ddw()
Code used to create a ddw property that was previously scattered in
enable_ddw() is now gathered in ddw_property_create(), which deals with
allocation and filling the property, leaving it ready for
of_add_property(), which now occurs in sequence.
This created an opportunity to reorganize the second part of enable_ddw():
Without this patch enable_ddw() does, in order:
kzalloc() property & members, create_ddw(), fill ddwprop inside property,
ddw_list_new_entry(), do tce_setrange_multi_pSeriesLP_walk in all memory,
of_add_property(), and list_add().
With this patch enable_ddw() does, in order:
create_ddw(), ddw_property_create(), of_add_property(),
ddw_list_new_entry(), do tce_setrange_multi_pSeriesLP_walk in all memory,
and list_add().
This change requires of_remove_property() in case anything fails after
of_add_property(), but we get to do tce_setrange_multi_pSeriesLP_walk
in all memory, which looks the most expensive operation, only if
everything else succeeds.
Also, the error path got remove_ddw() replaced by a new helper
__remove_dma_window(), which only removes the new DDW with an rtas-call.
For this, a new helper clean_dma_window() was needed to clean anything
that could be left if walk_system_ram_range() fails.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210817063929.38701-7-leobras.c@gmail.com
2021-08-17 14:39:24 +08:00
|
|
|
goto out_del_prop;
|
2011-02-10 17:10:47 +08:00
|
|
|
|
powerpc/pseries/iommu: Make use of DDW for indirect mapping
So far it's assumed possible to map the guest RAM 1:1 to the bus, which
works with a small number of devices. SRIOV changes this, as the user can
configure hundreds of VFs, and since phyp preallocates TCEs and does not
allow IOMMU pages bigger than 64K, it has to limit the number of TCEs
per PE to limit the waste of physical pages.
As of today, if the assumed direct mapping is not possible, DDW creation
is skipped and the default DMA window "ibm,dma-window" is used instead.
By using DDW, indirect mapping can get more TCEs than available for the
default DMA window, and also get access to using much larger pagesizes
(16MB as implemented in qemu vs 4k from default DMA window), causing a
significant increase on the maximum amount of memory that can be IOMMU
mapped at the same time.
Indirect mapping will only be used if direct mapping is not a
possibility.
For indirect mapping, it's necessary to re-create the iommu_table with
the new DMA window parameters, so iommu_alloc() can use it.
Removing the default DMA window for using DDW with indirect mapping
is only allowed if there is no current IOMMU memory allocated in
the iommu_table. enable_ddw() is aborted otherwise.
Even though there won't be both direct and indirect mappings at the
same time, we can't reuse the DIRECT64_PROPNAME property name, or else
an older kexec()ed kernel can assume direct mapping, and skip
iommu_alloc(), causing undesirable behavior.
So a new property name DMA64_PROPNAME "linux,dma64-ddr-window-info"
was created to represent a DDW that does not allow direct mapping.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210817063929.38701-11-leobras.c@gmail.com
2021-08-17 14:39:28 +08:00
|
|
|
if (direct_mapping) {
|
|
|
|
/* DDW maps the whole partition, so enable direct DMA mapping */
|
|
|
|
ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> PAGE_SHIFT,
|
|
|
|
win64->value, tce_setrange_multi_pSeriesLP_walk);
|
|
|
|
if (ret) {
|
2021-08-17 14:39:29 +08:00
|
|
|
dev_info(&dev->dev, "failed to map DMA window for %pOF: %d\n",
|
powerpc/pseries/iommu: Make use of DDW for indirect mapping
So far it's assumed possible to map the guest RAM 1:1 to the bus, which
works with a small number of devices. SRIOV changes this, as the user can
configure hundreds of VFs, and since phyp preallocates TCEs and does not
allow IOMMU pages bigger than 64K, it has to limit the number of TCEs
per PE to limit the waste of physical pages.
As of today, if the assumed direct mapping is not possible, DDW creation
is skipped and the default DMA window "ibm,dma-window" is used instead.
By using DDW, indirect mapping can get more TCEs than available for the
default DMA window, and also get access to using much larger pagesizes
(16MB as implemented in qemu vs 4k from default DMA window), causing a
significant increase on the maximum amount of memory that can be IOMMU
mapped at the same time.
Indirect mapping will only be used if direct mapping is not a
possibility.
For indirect mapping, it's necessary to re-create the iommu_table with
the new DMA window parameters, so iommu_alloc() can use it.
Removing the default DMA window for using DDW with indirect mapping
is only allowed if there is no current IOMMU memory allocated in
the iommu_table. enable_ddw() is aborted otherwise.
Even though there won't be both direct and indirect mappings at the
same time, we can't reuse the DIRECT64_PROPNAME property name, or else
an older kexec()ed kernel can assume direct mapping, and skip
iommu_alloc(), causing undesirable behavior.
So a new property name DMA64_PROPNAME "linux,dma64-ddr-window-info"
was created to represent a DDW that does not allow direct mapping.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210817063929.38701-11-leobras.c@gmail.com
2021-08-17 14:39:28 +08:00
|
|
|
dn, ret);
|
2011-02-10 17:10:47 +08:00
|
|
|
|
2021-11-08 12:03:18 +08:00
|
|
|
/* Make sure to clean DDW if any TCE was set*/
|
|
|
|
clean_dma_window(pdn, win64->value);
|
powerpc/pseries/iommu: Make use of DDW for indirect mapping
So far it's assumed possible to map the guest RAM 1:1 to the bus, which
works with a small number of devices. SRIOV changes this, as the user can
configure hundreds of VFs, and since phyp preallocates TCEs and does not
allow IOMMU pages bigger than 64K, it has to limit the number of TCEs
per PE to limit the waste of physical pages.
As of today, if the assumed direct mapping is not possible, DDW creation
is skipped and the default DMA window "ibm,dma-window" is used instead.
By using DDW, indirect mapping can get more TCEs than available for the
default DMA window, and also get access to using much larger pagesizes
(16MB as implemented in qemu vs 4k from default DMA window), causing a
significant increase on the maximum amount of memory that can be IOMMU
mapped at the same time.
Indirect mapping will only be used if direct mapping is not a
possibility.
For indirect mapping, it's necessary to re-create the iommu_table with
the new DMA window parameters, so iommu_alloc() can use it.
Removing the default DMA window for using DDW with indirect mapping
is only allowed if there is no current IOMMU memory allocated in
the iommu_table. enable_ddw() is aborted otherwise.
Even though there won't be both direct and indirect mappings at the
same time, we can't reuse the DIRECT64_PROPNAME property name, or else
an older kexec()ed kernel can assume direct mapping, and skip
iommu_alloc(), causing undesirable behavior.
So a new property name DMA64_PROPNAME "linux,dma64-ddr-window-info"
was created to represent a DDW that does not allow direct mapping.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210817063929.38701-11-leobras.c@gmail.com
2021-08-17 14:39:28 +08:00
|
|
|
goto out_del_list;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
struct iommu_table *newtbl;
|
|
|
|
int i;
|
2021-10-20 21:23:15 +08:00
|
|
|
unsigned long start = 0, end = 0;
|
powerpc/pseries/iommu: Make use of DDW for indirect mapping
So far it's assumed possible to map the guest RAM 1:1 to the bus, which
works with a small number of devices. SRIOV changes this, as the user can
configure hundreds of VFs, and since phyp preallocates TCEs and does not
allow IOMMU pages bigger than 64K, it has to limit the number of TCEs
per PE to limit the waste of physical pages.
As of today, if the assumed direct mapping is not possible, DDW creation
is skipped and the default DMA window "ibm,dma-window" is used instead.
By using DDW, indirect mapping can get more TCEs than available for the
default DMA window, and also get access to using much larger pagesizes
(16MB as implemented in qemu vs 4k from default DMA window), causing a
significant increase on the maximum amount of memory that can be IOMMU
mapped at the same time.
Indirect mapping will only be used if direct mapping is not a
possibility.
For indirect mapping, it's necessary to re-create the iommu_table with
the new DMA window parameters, so iommu_alloc() can use it.
Removing the default DMA window for using DDW with indirect mapping
is only allowed if there is no current IOMMU memory allocated in
the iommu_table. enable_ddw() is aborted otherwise.
Even though there won't be both direct and indirect mappings at the
same time, we can't reuse the DIRECT64_PROPNAME property name, or else
an older kexec()ed kernel can assume direct mapping, and skip
iommu_alloc(), causing undesirable behavior.
So a new property name DMA64_PROPNAME "linux,dma64-ddr-window-info"
was created to represent a DDW that does not allow direct mapping.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210817063929.38701-11-leobras.c@gmail.com
2021-08-17 14:39:28 +08:00
|
|
|
|
|
|
|
for (i = 0; i < ARRAY_SIZE(pci->phb->mem_resources); i++) {
|
|
|
|
const unsigned long mask = IORESOURCE_MEM_64 | IORESOURCE_MEM;
|
|
|
|
|
|
|
|
/* Look for MMIO32 */
|
2021-10-20 21:23:15 +08:00
|
|
|
if ((pci->phb->mem_resources[i].flags & mask) == IORESOURCE_MEM) {
|
|
|
|
start = pci->phb->mem_resources[i].start;
|
|
|
|
end = pci->phb->mem_resources[i].end;
|
powerpc/pseries/iommu: Make use of DDW for indirect mapping
So far it's assumed possible to map the guest RAM 1:1 to the bus, which
works with a small number of devices. SRIOV changes it as the user can
configure hundreds of VFs and since phyp preallocates TCEs and does not
allow IOMMU pages bigger than 64K, it has to limit the number of TCEs
per a PE to limit waste of physical pages.
As of today, if the assumed direct mapping is not possible, DDW creation
is skipped and the default DMA window "ibm,dma-window" is used instead.
By using DDW, indirect mapping can get more TCEs than available for the
default DMA window, and also get access to using much larger pagesizes
(16MB as implemented in qemu vs 4k from default DMA window), causing a
significant increase on the maximum amount of memory that can be IOMMU
mapped at the same time.
Indirect mapping will only be used if direct mapping is not a
possibility.
For indirect mapping, it's necessary to re-create the iommu_table with
the new DMA window parameters, so iommu_alloc() can use it.
Removing the default DMA window for using DDW with indirect mapping
is only allowed if there is no current IOMMU memory allocated in
the iommu_table. enable_ddw() is aborted otherwise.
Even though there won't be both direct and indirect mappings at the
same time, we can't reuse the DIRECT64_PROPNAME property name, or else
an older kexec()ed kernel can assume direct mapping, and skip
iommu_alloc(), causing undesirable behavior.
So a new property name DMA64_PROPNAME "linux,dma64-ddr-window-info"
was created to represent a DDW that does not allow direct mapping.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210817063929.38701-11-leobras.c@gmail.com
2021-08-17 14:39:28 +08:00
|
|
|
break;
|
2021-10-20 21:23:15 +08:00
|
|
|
}
|
powerpc/pseries/iommu: Make use of DDW for indirect mapping
So far it's assumed possible to map the guest RAM 1:1 to the bus, which
works with a small number of devices. SRIOV changes it as the user can
configure hundreds of VFs and since phyp preallocates TCEs and does not
allow IOMMU pages bigger than 64K, it has to limit the number of TCEs
per a PE to limit waste of physical pages.
As of today, if the assumed direct mapping is not possible, DDW creation
is skipped and the default DMA window "ibm,dma-window" is used instead.
By using DDW, indirect mapping can get more TCEs than available for the
default DMA window, and also get access to using much larger pagesizes
(16MB as implemented in qemu vs 4k from default DMA window), causing a
significant increase on the maximum amount of memory that can be IOMMU
mapped at the same time.
Indirect mapping will only be used if direct mapping is not a
possibility.
For indirect mapping, it's necessary to re-create the iommu_table with
the new DMA window parameters, so iommu_alloc() can use it.
Removing the default DMA window for using DDW with indirect mapping
is only allowed if there is no current IOMMU memory allocated in
the iommu_table. enable_ddw() is aborted otherwise.
Even though there won't be both direct and indirect mappings at the
same time, we can't reuse the DIRECT64_PROPNAME property name, or else
an older kexec()ed kernel can assume direct mapping, and skip
iommu_alloc(), causing undesirable behavior.
So a new property name DMA64_PROPNAME "linux,dma64-ddr-window-info"
was created to represent a DDW that does not allow direct mapping.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210817063929.38701-11-leobras.c@gmail.com
2021-08-17 14:39:28 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* New table for using DDW instead of the default DMA window */
|
|
|
|
newtbl = iommu_pseries_alloc_table(pci->phb->node);
|
|
|
|
if (!newtbl) {
|
|
|
|
dev_dbg(&dev->dev, "couldn't create new IOMMU table\n");
|
|
|
|
goto out_del_list;
|
|
|
|
}
|
|
|
|
|
|
|
|
iommu_table_setparms_common(newtbl, pci->phb->bus->number, create.liobn, win_addr,
|
|
|
|
1UL << len, page_shift, NULL, &iommu_table_lpar_multi_ops);
|
2021-10-20 21:23:15 +08:00
|
|
|
iommu_init_table(newtbl, pci->phb->node, start, end);
|
powerpc/pseries/iommu: Make use of DDW for indirect mapping
So far it's assumed possible to map the guest RAM 1:1 to the bus, which
works with a small number of devices. SRIOV changes it as the user can
configure hundreds of VFs and since phyp preallocates TCEs and does not
allow IOMMU pages bigger than 64K, it has to limit the number of TCEs
per a PE to limit waste of physical pages.
As of today, if the assumed direct mapping is not possible, DDW creation
is skipped and the default DMA window "ibm,dma-window" is used instead.
By using DDW, indirect mapping can get more TCEs than available for the
default DMA window, and also get access to using much larger pagesizes
(16MB as implemented in qemu vs 4k from default DMA window), causing a
significant increase on the maximum amount of memory that can be IOMMU
mapped at the same time.
Indirect mapping will only be used if direct mapping is not a
possibility.
For indirect mapping, it's necessary to re-create the iommu_table with
the new DMA window parameters, so iommu_alloc() can use it.
Removing the default DMA window for using DDW with indirect mapping
is only allowed if there is no current IOMMU memory allocated in
the iommu_table. enable_ddw() is aborted otherwise.
Even though there won't be both direct and indirect mappings at the
same time, we can't reuse the DIRECT64_PROPNAME property name, or else
an older kexec()ed kernel can assume direct mapping, and skip
iommu_alloc(), causing undesirable behavior.
So a new property name DMA64_PROPNAME "linux,dma64-ddr-window-info"
was created to represent a DDW that does not allow direct mapping.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210817063929.38701-11-leobras.c@gmail.com
2021-08-17 14:39:28 +08:00
|
|
|
|
|
|
|
pci->table_group->tables[1] = newtbl;
|
|
|
|
|
|
|
|
set_iommu_table_base(&dev->dev, newtbl);
|
2011-02-10 17:10:47 +08:00
|
|
|
}
|
|
|
|
|
2022-06-29 14:06:14 +08:00
|
|
|
if (default_win_removed) {
|
|
|
|
iommu_tce_table_put(pci->table_group->tables[0]);
|
|
|
|
pci->table_group->tables[0] = NULL;
|
|
|
|
|
|
|
|
/* default_win is valid here because default_win_removed == true */
|
|
|
|
of_remove_property(pdn, default_win);
|
|
|
|
dev_info(&dev->dev, "Removed default DMA window for %pOF\n", pdn);
|
|
|
|
}
|
|
|
|
|
2021-08-17 14:39:29 +08:00
|
|
|
spin_lock(&dma_win_list_lock);
|
|
|
|
list_add(&window->list, &dma_win_list);
|
|
|
|
spin_unlock(&dma_win_list_lock);
|
2011-02-10 17:10:47 +08:00
|
|
|
|
powerpc/pseries/iommu: Add ddw_property_create() and refactor enable_ddw()
Code used to create a ddw property that was previously scattered in
enable_ddw() is now gathered in ddw_property_create(), which deals with
allocation and filling the property, letting it ready for
of_property_add(), which now occurs in sequence.
This created an opportunity to reorganize the second part of enable_ddw():
Without this patch enable_ddw() does, in order:
kzalloc() property & members, create_ddw(), fill ddwprop inside property,
ddw_list_new_entry(), do tce_setrange_multi_pSeriesLP_walk in all memory,
of_add_property(), and list_add().
With this patch enable_ddw() does, in order:
create_ddw(), ddw_property_create(), of_add_property(),
ddw_list_new_entry(), do tce_setrange_multi_pSeriesLP_walk in all memory,
and list_add().
This change requires of_remove_property() in case anything fails after
of_add_property(), but we get to do tce_setrange_multi_pSeriesLP_walk
in all memory, which looks the most expensive operation, only if
everything else succeeds.
Also, the error path got remove_ddw() replaced by a new helper
__remove_dma_window(), which only removes the new DDW with an rtas-call.
For this, a new helper clean_dma_window() was needed to clean anything
that could be left behind if walk_system_ram_range() fails.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210817063929.38701-7-leobras.c@gmail.com
2021-08-17 14:39:24 +08:00
|
|
|
dev->dev.archdata.dma_offset = win_addr;
|
2011-02-10 17:10:47 +08:00
|
|
|
goto out_unlock;
|
|
|
|
|
powerpc/pseries/iommu: Add ddw_property_create() and refactor enable_ddw()
Code used to create a ddw property that was previously scattered in
enable_ddw() is now gathered in ddw_property_create(), which deals with
allocation and filling the property, letting it ready for
of_property_add(), which now occurs in sequence.
This created an opportunity to reorganize the second part of enable_ddw():
Without this patch enable_ddw() does, in order:
kzalloc() property & members, create_ddw(), fill ddwprop inside property,
ddw_list_new_entry(), do tce_setrange_multi_pSeriesLP_walk in all memory,
of_add_property(), and list_add().
With this patch enable_ddw() does, in order:
create_ddw(), ddw_property_create(), of_add_property(),
ddw_list_new_entry(), do tce_setrange_multi_pSeriesLP_walk in all memory,
and list_add().
This change requires of_remove_property() in case anything fails after
of_add_property(), but we get to do tce_setrange_multi_pSeriesLP_walk
in all memory, which looks the most expensive operation, only if
everything else succeeds.
Also, the error path got remove_ddw() replaced by a new helper
__remove_dma_window(), which only removes the new DDW with an rtas-call.
For this, a new helper clean_dma_window() was needed to clean anything
that could be left behind if walk_system_ram_range() fails.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210817063929.38701-7-leobras.c@gmail.com
2021-08-17 14:39:24 +08:00
|
|
|
out_del_list:
|
2011-08-08 09:18:00 +08:00
|
|
|
kfree(window);
|
|
|
|
|
powerpc/pseries/iommu: Add ddw_property_create() and refactor enable_ddw()
Code used to create a ddw property that was previously scattered in
enable_ddw() is now gathered in ddw_property_create(), which deals with
allocation and filling the property, letting it ready for
of_property_add(), which now occurs in sequence.
This created an opportunity to reorganize the second part of enable_ddw():
Without this patch enable_ddw() does, in order:
kzalloc() property & members, create_ddw(), fill ddwprop inside property,
ddw_list_new_entry(), do tce_setrange_multi_pSeriesLP_walk in all memory,
of_add_property(), and list_add().
With this patch enable_ddw() does, in order:
create_ddw(), ddw_property_create(), of_add_property(),
ddw_list_new_entry(), do tce_setrange_multi_pSeriesLP_walk in all memory,
and list_add().
This change requires of_remove_property() in case anything fails after
of_add_property(), but we get to do tce_setrange_multi_pSeriesLP_walk
in all memory, which looks the most expensive operation, only if
everything else succeeds.
Also, the error path got remove_ddw() replaced by a new helper
__remove_dma_window(), which only removes the new DDW with an rtas-call.
For this, a new helper clean_dma_window() was needed to clean anything
that could be left behind if walk_system_ram_range() fails.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210817063929.38701-7-leobras.c@gmail.com
2021-08-17 14:39:24 +08:00
|
|
|
out_del_prop:
|
|
|
|
of_remove_property(pdn, win64);
|
2011-02-10 17:10:47 +08:00
|
|
|
|
|
|
|
out_free_prop:
|
|
|
|
kfree(win64->name);
|
|
|
|
kfree(win64->value);
|
|
|
|
kfree(win64);
|
|
|
|
|
powerpc/pseries/iommu: Add ddw_property_create() and refactor enable_ddw()
Code used to create a ddw property that was previously scattered in
enable_ddw() is now gathered in ddw_property_create(), which deals with
allocation and filling the property, letting it ready for
of_property_add(), which now occurs in sequence.
This created an opportunity to reorganize the second part of enable_ddw():
Without this patch enable_ddw() does, in order:
kzalloc() property & members, create_ddw(), fill ddwprop inside property,
ddw_list_new_entry(), do tce_setrange_multi_pSeriesLP_walk in all memory,
of_add_property(), and list_add().
With this patch enable_ddw() does, in order:
create_ddw(), ddw_property_create(), of_add_property(),
ddw_list_new_entry(), do tce_setrange_multi_pSeriesLP_walk in all memory,
and list_add().
This change requires of_remove_property() in case anything fails after
of_add_property(), but we get to do tce_setrange_multi_pSeriesLP_walk
in all memory, which looks the most expensive operation, only if
everything else succeeds.
Also, the error path got remove_ddw() replaced by a new helper
__remove_dma_window(), which only removes the new DDW with an rtas-call.
For this, a new helper clean_dma_window() was needed to clean anything
that could be left behind if walk_system_ram_range() fails.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210817063929.38701-7-leobras.c@gmail.com
2021-08-17 14:39:24 +08:00
|
|
|
out_remove_win:
|
|
|
|
/* DDW is clean, so it's ok to call this directly. */
|
|
|
|
__remove_dma_window(pdn, ddw_avail, create.liobn);
|
|
|
|
|
2014-01-11 07:09:38 +08:00
|
|
|
out_failed:
|
powerpc/pseries/iommu: Allow bigger 64bit window by removing default DMA window
On LoPAR "DMA Window Manipulation Calls", it's recommended to remove the
default DMA window for the device, before attempting to configure a DDW,
in order to make the maximum resources available for the next DDW to be
created.
This is a requirement for using DDW on devices in which hypervisor
allows only one DMA window.
If setting up a new DDW fails anywhere after the removal of this
default DMA window, it's needed to restore the default DMA window.
For this, an implementation of ibm,reset-pe-dma-windows rtas call is
needed:
Platforms supporting the DDW option starting with LoPAR level 2.7 implement
ibm,ddw-extensions. The first extension available (index 2) carries the
token for ibm,reset-pe-dma-windows rtas call, which is used to restore
the default DMA window for a device, if it has been deleted.
It does so by resetting the TCE table allocation for the PE to its
boot time value, available in "ibm,dma-window" device tree node.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Tested-by: David Dai <zdai@linux.vnet.ibm.com>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200805030455.123024-5-leobras.c@gmail.com
2020-08-05 11:04:55 +08:00
|
|
|
if (default_win_removed)
|
|
|
|
reset_dma_window(dev, pdn);
|
2012-05-15 15:04:32 +08:00
|
|
|
|
powerpc/pseries: close DDW race between functions of adapter
Given a PCI device with multiple functions in a DDW capable slot, the
following situation can be encountered: When the first function sets a
64-bit DMA mask, enable_ddw() will be called and we can fail to properly
configure DDW (the most common reason being the new DMA window's size is
not large enough to map all of an LPAR's memory). With the recent
changes to DDW, we remove the base window in order to determine if the
new window is of sufficient size to cover an LPAR's memory. We correctly
replace the base window if we find that not to be the case. However,
once we go through and re-configured 32-bit DMA via the IOMMU, the next
function of the adapter will go through the same process. And since DDW
is a characteristic of the slot itself, we are most likely going to fail
again. But to determine we are going to fail the second slot, we again
remove the base window -- but that is now in-use by the first
function/driver, which might be issuing I/O already.
To close this window, keep a list of all the failed struct device_nodes
that have failed to configure DDW. If the current device_node is in that
list, just fail out immediately and fall back to 32-bit DMA without
doing any DDW manipulation.
Signed-off-by: Nishanth Aravamudan <nacc@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
2013-03-07 20:33:03 +08:00
|
|
|
fpdn = kzalloc(sizeof(*fpdn), GFP_KERNEL);
|
|
|
|
if (!fpdn)
|
|
|
|
goto out_unlock;
|
|
|
|
fpdn->pdn = pdn;
|
|
|
|
list_add(&fpdn->list, &failed_ddw_pdn_list);
|
|
|
|
|
2011-02-10 17:10:47 +08:00
|
|
|
out_unlock:
|
2021-08-17 14:39:29 +08:00
|
|
|
mutex_unlock(&dma_win_init_mutex);
|
2020-10-29 09:52:41 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If we have persistent memory and the window size is only as big
|
|
|
|
* as RAM, then we failed to create a window to cover persistent
|
|
|
|
* memory and need to set the DMA limit.
|
|
|
|
*/
|
2021-11-08 12:03:18 +08:00
|
|
|
if (pmem_present && direct_mapping && len == max_ram_len)
|
2021-08-17 14:39:23 +08:00
|
|
|
dev->dev.bus_dma_limit = dev->dev.archdata.dma_offset + (1ULL << len);
|
2020-10-29 09:52:41 +08:00
|
|
|
|
2021-11-08 12:03:18 +08:00
|
|
|
return direct_mapping;
|
2011-02-10 17:10:47 +08:00
|
|
|
}
|
|
|
|
|
2006-11-11 14:25:02 +08:00
|
|
|
static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
struct device_node *pdn, *dn;
|
|
|
|
struct iommu_table *tbl;
|
2013-08-07 00:01:36 +08:00
|
|
|
const __be32 *dma_window = NULL;
|
2005-09-06 11:17:54 +08:00
|
|
|
struct pci_dn *pci;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-04-24 13:13:19 +08:00
|
|
|
pr_debug("pci_dma_dev_setup_pSeriesLP: %s\n", pci_name(dev));
|
2006-11-11 14:25:02 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* dev setup for LPAR is a little tricky, since the device tree might
|
2011-03-31 09:57:33 +08:00
|
|
|
* contain the dma-window properties per-device and not necessarily
|
2005-04-17 06:20:36 +08:00
|
|
|
* for the bus. So we need to search upwards in the tree until we
|
|
|
|
* either hit a dma-window property, OR find a parent with a table
|
|
|
|
* already allocated.
|
|
|
|
*/
|
|
|
|
dn = pci_device_to_OF_node(dev);
|
2017-08-21 23:16:47 +08:00
|
|
|
pr_debug(" node is %pOF\n", dn);
|
2006-10-30 13:15:59 +08:00
|
|
|
|
2022-06-29 14:06:14 +08:00
|
|
|
pdn = pci_dma_find(dn, &dma_window);
|
2007-04-11 04:11:23 +08:00
|
|
|
if (!pdn || !PCI_DN(pdn)) {
|
|
|
|
printk(KERN_WARNING "pci_dma_dev_setup_pSeriesLP: "
|
2017-08-21 23:16:47 +08:00
|
|
|
"no DMA window found for pci dev=%s dn=%pOF\n",
|
|
|
|
pci_name(dev), dn);
|
2007-04-11 04:11:23 +08:00
|
|
|
return;
|
|
|
|
}
|
2017-08-21 23:16:47 +08:00
|
|
|
pr_debug(" parent is %pOF\n", pdn);
|
2006-11-11 14:25:02 +08:00
|
|
|
|
2005-12-06 09:37:35 +08:00
|
|
|
pci = PCI_DN(pdn);
|
2015-06-05 14:35:08 +08:00
|
|
|
if (!pci->table_group) {
|
|
|
|
pci->table_group = iommu_pseries_alloc_group(pci->phb->node);
|
|
|
|
tbl = pci->table_group->tables[0];
|
2017-03-24 14:37:21 +08:00
|
|
|
iommu_table_setparms_lpar(pci->phb, pdn, tbl,
|
|
|
|
pci->table_group, dma_window);
|
2021-08-17 14:39:25 +08:00
|
|
|
|
2019-07-18 13:11:39 +08:00
|
|
|
iommu_init_table(tbl, pci->phb->node, 0, 0);
|
2015-06-05 14:35:08 +08:00
|
|
|
iommu_register_group(pci->table_group,
|
|
|
|
pci_domain_nr(pci->phb->bus), 0);
|
|
|
|
pr_debug(" created table: %p\n", pci->table_group);
|
2007-05-10 13:16:27 +08:00
|
|
|
} else {
|
2015-06-05 14:35:08 +08:00
|
|
|
pr_debug(" found DMA window, table: %p\n", pci->table_group);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2015-06-05 14:35:08 +08:00
|
|
|
set_iommu_table_base(&dev->dev, pci->table_group->tables[0]);
|
2018-12-19 16:52:21 +08:00
|
|
|
iommu_add_device(pci->table_group, &dev->dev);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2011-02-10 17:10:47 +08:00
|
|
|
|
2019-02-13 15:01:07 +08:00
|
|
|
/*
 * Decide whether @pdev may bypass the IOMMU (LPAR case).
 *
 * Returns false straight away unless the device asks for a full 64-bit
 * DMA mask.  Otherwise the device-tree node owning the DMA window is
 * located and the result of enable_ddw() for that node is returned.
 */
static bool iommu_bypass_supported_pSeriesLP(struct pci_dev *pdev, u64 dma_mask)
{
	struct device_node *dev_node = pci_device_to_OF_node(pdev);
	struct device_node *win_node;

	/* A new window is only attempted when 64-bit DMA is requested */
	if (dma_mask < DMA_BIT_MASK(64))
		return false;

	dev_dbg(&pdev->dev, "node is %pOF\n", dev_node);

	/*
	 * The dma-window properties may sit on the device itself or on one
	 * of its ancestors, so walk up the tree until a node with a
	 * dma-window property, or a parent with a table already allocated,
	 * is found.
	 */
	win_node = pci_dma_find(dev_node, NULL);
	if (!win_node || !PCI_DN(win_node))
		return false;

	return enable_ddw(pdev, win_node);
}
|
|
|
|
|
2011-02-10 17:10:47 +08:00
|
|
|
static int iommu_mem_notifier(struct notifier_block *nb, unsigned long action,
|
|
|
|
void *data)
|
|
|
|
{
|
2021-08-17 14:39:29 +08:00
|
|
|
struct dma_win *window;
|
2011-02-10 17:10:47 +08:00
|
|
|
struct memory_notify *arg = data;
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
switch (action) {
|
|
|
|
case MEM_GOING_ONLINE:
|
2021-08-17 14:39:29 +08:00
|
|
|
spin_lock(&dma_win_list_lock);
|
|
|
|
list_for_each_entry(window, &dma_win_list, list) {
|
2011-02-10 17:10:47 +08:00
|
|
|
ret |= tce_setrange_multi_pSeriesLP(arg->start_pfn,
|
|
|
|
arg->nr_pages, window->prop);
|
|
|
|
/* XXX log error */
|
|
|
|
}
|
2021-08-17 14:39:29 +08:00
|
|
|
spin_unlock(&dma_win_list_lock);
|
2011-02-10 17:10:47 +08:00
|
|
|
break;
|
|
|
|
case MEM_CANCEL_ONLINE:
|
|
|
|
case MEM_OFFLINE:
|
2021-08-17 14:39:29 +08:00
|
|
|
spin_lock(&dma_win_list_lock);
|
|
|
|
list_for_each_entry(window, &dma_win_list, list) {
|
2011-02-10 17:10:47 +08:00
|
|
|
ret |= tce_clearrange_multi_pSeriesLP(arg->start_pfn,
|
|
|
|
arg->nr_pages, window->prop);
|
|
|
|
/* XXX log error */
|
|
|
|
}
|
2021-08-17 14:39:29 +08:00
|
|
|
spin_unlock(&dma_win_list_lock);
|
2011-02-10 17:10:47 +08:00
|
|
|
break;
|
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (ret && action != MEM_CANCEL_ONLINE)
|
|
|
|
return NOTIFY_BAD;
|
|
|
|
|
|
|
|
return NOTIFY_OK;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct notifier_block iommu_mem_nb = {
|
|
|
|
.notifier_call = iommu_mem_notifier,
|
|
|
|
};
|
|
|
|
|
2014-11-25 01:58:01 +08:00
|
|
|
static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long action, void *data)
|
2007-03-04 14:04:44 +08:00
|
|
|
{
|
|
|
|
int err = NOTIFY_OK;
|
2014-11-25 01:58:01 +08:00
|
|
|
struct of_reconfig_data *rd = data;
|
|
|
|
struct device_node *np = rd->dn;
|
2007-03-04 14:04:44 +08:00
|
|
|
struct pci_dn *pci = PCI_DN(np);
|
2021-08-17 14:39:29 +08:00
|
|
|
struct dma_win *window;
|
2007-03-04 14:04:44 +08:00
|
|
|
|
|
|
|
switch (action) {
|
2012-10-03 00:57:57 +08:00
|
|
|
case OF_RECONFIG_DETACH_NODE:
|
2014-08-11 17:16:20 +08:00
|
|
|
/*
|
|
|
|
* Removing the property will invoke the reconfig
|
|
|
|
* notifier again, which causes dead-lock on the
|
|
|
|
* read-write semaphore of the notifier chain. So
|
|
|
|
* we have to remove the property when releasing
|
|
|
|
* the device node.
|
|
|
|
*/
|
powerpc/pseries/iommu: Make use of DDW for indirect mapping
So far it's assumed possible to map the guest RAM 1:1 to the bus, which
works with a small number of devices. SRIOV changes it as the user can
configure hundreds VFs and since phyp preallocates TCEs and does not
allow IOMMU pages bigger than 64K, it has to limit the number of TCEs
per a PE to limit waste of physical pages.
As of today, if the assumed direct mapping is not possible, DDW creation
is skipped and the default DMA window "ibm,dma-window" is used instead.
By using DDW, indirect mapping can get more TCEs than available for the
default DMA window, and also get access to using much larger pagesizes
(16MB as implemented in qemu vs 4k from default DMA window), causing a
significant increase on the maximum amount of memory that can be IOMMU
mapped at the same time.
Indirect mapping will only be used if direct mapping is not a
possibility.
For indirect mapping, it's necessary to re-create the iommu_table with
the new DMA window parameters, so iommu_alloc() can use it.
Removing the default DMA window for using DDW with indirect mapping
is only allowed if there is no current IOMMU memory allocated in
the iommu_table. enable_ddw() is aborted otherwise.
Even though there won't be both direct and indirect mappings at the
same time, we can't reuse the DIRECT64_PROPNAME property name, or else
an older kexec()ed kernel can assume direct mapping, and skip
iommu_alloc(), causing undesirable behavior.
So a new property name DMA64_PROPNAME "linux,dma64-ddr-window-info"
was created to represent a DDW that does not allow direct mapping.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210817063929.38701-11-leobras.c@gmail.com
2021-08-17 14:39:28 +08:00
|
|
|
if (remove_ddw(np, false, DIRECT64_PROPNAME))
|
|
|
|
remove_ddw(np, false, DMA64_PROPNAME);
|
|
|
|
|
2015-06-05 14:35:08 +08:00
|
|
|
if (pci && pci->table_group)
|
|
|
|
iommu_pseries_free_group(pci->table_group,
|
2015-06-05 14:34:56 +08:00
|
|
|
np->full_name);
|
2011-02-10 17:10:47 +08:00
|
|
|
|
2021-08-17 14:39:29 +08:00
|
|
|
spin_lock(&dma_win_list_lock);
|
|
|
|
list_for_each_entry(window, &dma_win_list, list) {
|
2011-02-10 17:10:47 +08:00
|
|
|
if (window->device == np) {
|
|
|
|
list_del(&window->list);
|
|
|
|
kfree(window);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2021-08-17 14:39:29 +08:00
|
|
|
spin_unlock(&dma_win_list_lock);
|
2007-03-04 14:04:44 +08:00
|
|
|
break;
|
|
|
|
default:
|
|
|
|
err = NOTIFY_DONE;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct notifier_block iommu_reconfig_nb = {
|
|
|
|
.notifier_call = iommu_reconfig_notifier,
|
|
|
|
};
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* These are called very early. */
|
2021-12-17 06:00:27 +08:00
|
|
|
/*
 * Early pSeries IOMMU initialisation.
 *
 * Does nothing when the "linux,iommu-off" property is present on the
 * chosen node.  Otherwise installs the DMA bus/device setup hooks (the
 * LPAR variants when FW_FEATURE_LPAR is available, the non-LPAR ones
 * otherwise), registers the OF reconfig and memory notifiers, and sets
 * dma_iommu_ops as the PCI DMA ops.
 */
void __init iommu_init_early_pSeries(void)
{
	if (of_chosen && of_get_property(of_chosen, "linux,iommu-off", NULL))
		return;

	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
		pseries_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pSeries;
		pseries_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pSeries;
	} else {
		pseries_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pSeriesLP;
		pseries_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pSeriesLP;

		/* The bypass hook is only offered when DDW is not disabled */
		if (!disable_ddw)
			pseries_pci_controller_ops.iommu_bypass_supported =
				iommu_bypass_supported_pSeriesLP;
	}

	of_reconfig_notifier_register(&iommu_reconfig_nb);
	register_memory_notifier(&iommu_mem_nb);

	set_pci_dma_ops(&dma_iommu_ops);
}
|
|
|
|
|
2010-09-28 23:33:12 +08:00
|
|
|
static int __init disable_multitce(char *str)
|
|
|
|
{
|
|
|
|
if (strcmp(str, "off") == 0 &&
|
|
|
|
firmware_has_feature(FW_FEATURE_LPAR) &&
|
2019-12-16 12:19:23 +08:00
|
|
|
(firmware_has_feature(FW_FEATURE_PUT_TCE_IND) ||
|
|
|
|
firmware_has_feature(FW_FEATURE_STUFF_TCE))) {
|
2010-09-28 23:33:12 +08:00
|
|
|
printk(KERN_INFO "Disabling MULTITCE firmware feature\n");
|
2019-12-16 12:19:23 +08:00
|
|
|
powerpc_firmware_features &=
|
|
|
|
~(FW_FEATURE_PUT_TCE_IND | FW_FEATURE_STUFF_TCE);
|
2010-09-28 23:33:12 +08:00
|
|
|
}
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
__setup("multitce=", disable_multitce);
|
2015-02-22 03:00:50 +08:00
|
|
|
|
2018-12-19 16:52:21 +08:00
|
|
|
static int tce_iommu_bus_notifier(struct notifier_block *nb,
|
|
|
|
unsigned long action, void *data)
|
|
|
|
{
|
|
|
|
struct device *dev = data;
|
|
|
|
|
|
|
|
switch (action) {
|
|
|
|
case BUS_NOTIFY_DEL_DEVICE:
|
|
|
|
iommu_del_device(dev);
|
|
|
|
return 0;
|
|
|
|
default:
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct notifier_block tce_iommu_bus_nb = {
|
|
|
|
.notifier_call = tce_iommu_bus_notifier,
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * Register tce_iommu_bus_nb on the PCI bus type.  The registration result
 * is not checked; the initcall always reports success (0).
 */
static int __init tce_iommu_bus_notifier_init(void)
{
	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);

	return 0;
}

machine_subsys_initcall_sync(pseries, tce_iommu_bus_notifier_init);
|