From 472d26df5e8075eda677b6be730e0fbf434ff2a8 Mon Sep 17 00:00:00 2001
From: Xiaotao Yin
Date: Tue, 10 Dec 2019 12:27:04 +0800
Subject: [PATCH 1/8] iommu/iova: Init the struct iova to fix the possible memleak

During an ethernet (Marvell octeontx2) set ring buffer test:

  ethtool -G eth1 rx <rx ring size> tx <tx ring size>

the following kmemleak report appears sometimes:

unreferenced object 0xffff000b85421340 (size 64):
  comm "ethtool", pid 867, jiffies 4295323539 (age 550.500s)
  hex dump (first 64 bytes):
    80 13 42 85 0b 00 ff ff ff ff ff ff ff ff ff ff  ..B.............
    00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
    ff ff ff ff ff ff ff ff 00 00 00 00 00 00 00 00  ................
    00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  backtrace:
    [<000000001b204ddf>] kmem_cache_alloc+0x1b0/0x350
    [<00000000d9ef2e50>] alloc_iova+0x3c/0x168
    [<00000000ea30f99d>] alloc_iova_fast+0x7c/0x2d8
    [<00000000b8bb2f1f>] iommu_dma_alloc_iova.isra.0+0x12c/0x138
    [<000000002f1a43b5>] __iommu_dma_map+0x8c/0xf8
    [<00000000ecde7899>] iommu_dma_map_page+0x98/0xf8
    [<0000000082004e59>] otx2_alloc_rbuf+0xf4/0x158
    [<000000002b107f6b>] otx2_rq_aura_pool_init+0x110/0x270
    [<00000000c3d563c7>] otx2_open+0x15c/0x734
    [<00000000a2f5f3a8>] otx2_dev_open+0x3c/0x68
    [<00000000456a98b5>] otx2_set_ringparam+0x1ac/0x1d4
    [<00000000f2fbb819>] dev_ethtool+0xb84/0x2028
    [<0000000069b67c5a>] dev_ioctl+0x248/0x3a0
    [<00000000af38663a>] sock_ioctl+0x280/0x638
    [<000000002582384c>] do_vfs_ioctl+0x8b0/0xa80
    [<000000004e1a2c02>] ksys_ioctl+0x84/0xb8

The reason: alloc_iova_mem() does not zero-initialise the allocation, so
pfn_lo can happen to equal IOVA_ANCHOR by chance. When
__alloc_and_insert_iova_range() then returns -ENOMEM (iova32_full), the
new_iova is not freed, because free_iova_mem() skips any iova whose
pfn_lo equals IOVA_ANCHOR.

Fixes: bb68b2fbfbd6 ("iommu/iova: Add rbtree anchor node")
Signed-off-by: Xiaotao Yin
Reviewed-by: Robin Murphy
Signed-off-by: Joerg Roedel
---
 drivers/iommu/iova.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 41c605b0058f..c7a914b9bbbc 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -233,7 +233,7 @@ static DEFINE_MUTEX(iova_cache_mutex);
 
 struct iova *alloc_iova_mem(void)
 {
-        return kmem_cache_alloc(iova_cache, GFP_ATOMIC);
+        return kmem_cache_zalloc(iova_cache, GFP_ATOMIC);
 }
 EXPORT_SYMBOL(alloc_iova_mem);
 

From bd036d2fdd374fa252abfc221e1a1280eee42f89 Mon Sep 17 00:00:00 2001
From: Robin Murphy
Date: Wed, 11 Dec 2019 18:33:26 +0000
Subject: [PATCH 2/8] iommu/dma: Rationalise types for DMA masks

Since iommu_dma_alloc_iova() combines incoming masks with the u64 bus
limit, it makes more sense to pass them around in their native u64
rather than converting to dma_addr_t early. Do that, and resolve the
remaining type discrepancy against the domain geometry with a cheeky
cast to keep things simple.
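
For illustration only (not part of the patch): a standalone userspace
sketch of the truncation this guards against. The narrow_dma_addr_t
typedef and the helper names are invented for the example; the point is
that a 64-bit mask passed through a possibly-32-bit dma_addr_t parameter
is already clipped before the bus limit is even considered, while doing
the arithmetic in u64 keeps it intact.

#include <stdint.h>
#include <stdio.h>

typedef uint32_t narrow_dma_addr_t;    /* stand-in for a 32-bit dma_addr_t */

static uint64_t limit_in_u64(uint64_t mask, uint64_t bus_limit)
{
        /* min_not_zero()-style combine, done entirely in u64 */
        return (bus_limit && bus_limit < mask) ? bus_limit : mask;
}

static uint64_t limit_in_dma_addr(narrow_dma_addr_t mask, uint64_t bus_limit)
{
        /* the mask was truncated at the call boundary */
        return (bus_limit && bus_limit < mask) ? bus_limit : mask;
}

int main(void)
{
        uint64_t mask = UINT64_MAX;    /* e.g. DMA_BIT_MASK(64) */

        printf("u64 path:        %#llx\n",
               (unsigned long long)limit_in_u64(mask, 0));
        printf("dma_addr_t path: %#llx\n",
               (unsigned long long)limit_in_dma_addr(mask, 0));
        return 0;
}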
Signed-off-by: Robin Murphy
Reviewed-by: Christoph Hellwig
Tested-by: Nathan Chancellor # build
Reviewed-by: Nicolas Saenz Julienne
Signed-off-by: Joerg Roedel
---
 drivers/iommu/dma-iommu.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 0cc702a70a96..6e573d1cb8bf 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -399,7 +399,7 @@ static int dma_info_to_prot(enum dma_data_direction dir, bool coherent,
 }
 
 static dma_addr_t iommu_dma_alloc_iova(struct iommu_domain *domain,
-                size_t size, dma_addr_t dma_limit, struct device *dev)
+                size_t size, u64 dma_limit, struct device *dev)
 {
         struct iommu_dma_cookie *cookie = domain->iova_cookie;
         struct iova_domain *iovad = &cookie->iovad;
@@ -424,7 +424,7 @@ static dma_addr_t iommu_dma_alloc_iova(struct iommu_domain *domain,
         dma_limit = min_not_zero(dma_limit, dev->bus_dma_limit);
 
         if (domain->geometry.force_aperture)
-                dma_limit = min(dma_limit, domain->geometry.aperture_end);
+                dma_limit = min(dma_limit, (u64)domain->geometry.aperture_end);
 
         /* Try to get PCI devices a SAC address */
         if (dma_limit > DMA_BIT_MASK(32) && dev_is_pci(dev))
@@ -477,7 +477,7 @@ static void __iommu_dma_unmap(struct device *dev, dma_addr_t dma_addr,
 }
 
 static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys,
-                size_t size, int prot, dma_addr_t dma_mask)
+                size_t size, int prot, u64 dma_mask)
 {
         struct iommu_domain *domain = iommu_get_dma_domain(dev);
         struct iommu_dma_cookie *cookie = domain->iova_cookie;

From d8018a0e9195ba9f0fb9cf0fd3843807c8b952d5 Mon Sep 17 00:00:00 2001
From: Alex Williamson
Date: Wed, 11 Dec 2019 13:28:29 -0700
Subject: [PATCH 3/8] iommu/vt-d: Set ISA bridge reserved region as relaxable

Commit d850c2ee5fe2 ("iommu/vt-d: Expose ISA direct mapping region via
iommu_get_resv_regions") created a direct-mapped reserved memory region
in order to replace the static identity mapping of the ISA address
space; that static identity mapping was then removed in commit
df4f3c603aeb ("iommu/vt-d: Remove static identity map code").

According to the history of this code and the Kconfig option
surrounding it, this direct mapping exists for the benefit of legacy
ISA drivers that are not compatible with the DMA API.

In conjunction with commit 9b77e5c79840 ("vfio/type1: check dma map
request is within a valid iova range") this change introduced a
regression where the vfio IOMMU backend enforces reserved memory
regions per IOMMU group, preventing userspace from creating IOMMU
mappings that conflict with prescribed reserved regions.

A necessary prerequisite for the vfio change was the introduction of
"relaxable" direct mappings in commit adfd37382090 ("iommu: Introduce
IOMMU_RESV_DIRECT_RELAXABLE reserved memory regions"). These relaxable
direct mappings provide the same identity-mapping support in the
default domain, but also indicate that the reservation is software
imposed and may be relaxed under some conditions, such as device
assignment.

Convert the ISA bridge direct-mapped reserved region to relaxable to
reflect that the restriction is self-imposed and need not be enforced
by drivers such as vfio.
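
For illustration only (not part of the patch): a hypothetical
consumer-side helper showing the distinction the conversion relies on.
The function name is invented and this is not the actual vfio code; it
only sketches that IOMMU_RESV_DIRECT regions are firmware-mandated and
must stay excluded from userspace IOVA ranges, while
IOMMU_RESV_DIRECT_RELAXABLE regions are software-imposed and may be
relaxed once the device is assigned away from its default domain.

#include <linux/iommu.h>

static bool must_exclude_from_userspace(const struct iommu_resv_region *region)
{
        /* Firmware-mandated direct mappings must always be honoured. */
        if (region->type == IOMMU_RESV_DIRECT)
                return true;

        /* A relaxable direct mapping only matters while the device uses
         * its default domain, so an assignment path may map over it. */
        if (region->type == IOMMU_RESV_DIRECT_RELAXABLE)
                return false;

        /* Be conservative about any other region type. */
        return true;
}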
Fixes: 1c5c59fbad20 ("iommu/vt-d: Differentiate relaxable and non relaxable RMRRs")
Cc: stable@vger.kernel.org # v5.3+
Link: https://lore.kernel.org/linux-iommu/20191211082304.2d4fab45@x1.home
Reported-by: cprt
Tested-by: cprt
Signed-off-by: Alex Williamson
Acked-by: Lu Baolu
Reviewed-by: Eric Auger
Tested-by: Jerry Snitselaar
Reviewed-by: Jerry Snitselaar
Signed-off-by: Joerg Roedel
---
 drivers/iommu/intel-iommu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 0c8d81f56a30..6eb0dd7489a1 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -5737,7 +5737,7 @@ static void intel_iommu_get_resv_regions(struct device *device,
 
                 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
                         reg = iommu_alloc_resv_region(0, 1UL << 24, 0,
-                                                      IOMMU_RESV_DIRECT);
+                                                      IOMMU_RESV_DIRECT_RELAXABLE);
                         if (reg)
                                 list_add_tail(&reg->list, head);
                 }

From 75d18385394f56db76845d91a192532aba421875 Mon Sep 17 00:00:00 2001
From: Lu Baolu
Date: Wed, 11 Dec 2019 09:40:15 +0800
Subject: [PATCH 4/8] iommu/vt-d: Fix dmar pte read access not set error

If the default DMA domain of a group doesn't fit a device, the device
will still sit in the group but use a private identity domain. When
map/unmap/iova_to_phys requests come through the IOMMU API, the driver
should still serve them; otherwise, other devices in the same group
will be impacted. Since the identity domain has already been mapped
with the whole available memory space and the RMRRs, we don't need to
worry about the impact on it.

Link: https://www.spinics.net/lists/iommu/msg40416.html
Cc: Jerry Snitselaar
Reported-by: Jerry Snitselaar
Fixes: 942067f1b6b97 ("iommu/vt-d: Identify default domains replaced with private")
Cc: stable@vger.kernel.org # v5.3+
Signed-off-by: Lu Baolu
Reviewed-by: Jerry Snitselaar
Tested-by: Jerry Snitselaar
Signed-off-by: Joerg Roedel
---
 drivers/iommu/intel-iommu.c | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 6eb0dd7489a1..c8ced7712a8e 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -5478,9 +5478,6 @@ static int intel_iommu_map(struct iommu_domain *domain,
         int prot = 0;
         int ret;
 
-        if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
-                return -EINVAL;
-
         if (iommu_prot & IOMMU_READ)
                 prot |= DMA_PTE_READ;
         if (iommu_prot & IOMMU_WRITE)
@@ -5523,8 +5520,6 @@ static size_t intel_iommu_unmap(struct iommu_domain *domain,
         /* Cope with horrid API which requires us to unmap more than the
            size argument if it happens to be a large-page mapping. */
         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
-        if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
-                return 0;
 
         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
@@ -5556,9 +5551,6 @@ static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
         int level = 0;
         u64 phys = 0;
 
-        if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
-                return 0;
-
         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
         if (pte)
                 phys = dma_pte_addr(pte);

From d360211524bece6db9920f32c91808235290b51c Mon Sep 17 00:00:00 2001
From: Jerry Snitselaar
Date: Tue, 10 Dec 2019 11:56:06 -0700
Subject: [PATCH 5/8] iommu: set group default domain before creating direct mappings

iommu_group_create_direct_mappings uses group->default_domain, but
right after it is called, request_default_domain_for_dev calls
iommu_domain_free for the default domain, and sets the group default
domain to a different domain. Move the
iommu_group_create_direct_mappings call to after the group default
domain is set, so the direct mappings get associated with that domain.

Cc: Joerg Roedel
Cc: Lu Baolu
Cc: iommu@lists.linux-foundation.org
Cc: stable@vger.kernel.org
Fixes: 7423e01741dd ("iommu: Add API to request DMA domain for device")
Signed-off-by: Jerry Snitselaar
Reviewed-by: Lu Baolu
Signed-off-by: Joerg Roedel
---
 drivers/iommu/iommu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index db7bfd4f2d20..fa908179b80b 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -2282,13 +2282,13 @@ request_default_domain_for_dev(struct device *dev, unsigned long type)
                 goto out;
         }
 
-        iommu_group_create_direct_mappings(group, dev);
-
         /* Make the domain the default for this group */
         if (group->default_domain)
                 iommu_domain_free(group->default_domain);
         group->default_domain = domain;
 
+        iommu_group_create_direct_mappings(group, dev);
+
         dev_info(dev, "Using iommu %s mapping\n",
                  type == IOMMU_DOMAIN_DMA ? "dma" : "direct");
 

From cde9319e884eb6267a0df446f3c131fe1108defb Mon Sep 17 00:00:00 2001
From: Jerry Snitselaar
Date: Thu, 12 Dec 2019 22:36:42 -0700
Subject: [PATCH 6/8] iommu/vt-d: Allocate reserved region for ISA with correct permission

Currently the reserved region for ISA is allocated with no permissions.
If a dma domain is being used, mapping this region will fail. Set the
permissions to DMA_PTE_READ|DMA_PTE_WRITE.
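
For illustration only (not part of the patch): a simplified stand-in
for the direct-mapping step, with an invented function name, showing
why the prot bits matter. The reserved region's prot flows straight
into the mapping request for the DMA default domain, so a region
allocated with prot == 0 asks for a PTE with neither read nor write
access, which the VT-d driver rejects.

#include <linux/iommu.h>

static int direct_map_resv_region(struct iommu_domain *domain,
                                  const struct iommu_resv_region *entry)
{
        /* entry->prot == 0 means "no read, no write" here. */
        return iommu_map(domain, entry->start, entry->start,
                         entry->length, entry->prot);
}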
Cc: Joerg Roedel
Cc: Lu Baolu
Cc: iommu@lists.linux-foundation.org
Cc: stable@vger.kernel.org # v5.3+
Fixes: d850c2ee5fe2 ("iommu/vt-d: Expose ISA direct mapping region via iommu_get_resv_regions")
Signed-off-by: Jerry Snitselaar
Acked-by: Lu Baolu
Signed-off-by: Joerg Roedel
---
 drivers/iommu/intel-iommu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index c8ced7712a8e..42966611a192 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -5728,7 +5728,7 @@ static void intel_iommu_get_resv_regions(struct device *device,
                 struct pci_dev *pdev = to_pci_dev(device);
 
                 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
-                        reg = iommu_alloc_resv_region(0, 1UL << 24, 0,
+                        reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
                                                       IOMMU_RESV_DIRECT_RELAXABLE);
                         if (reg)
                                 list_add_tail(&reg->list, head);

From f81b846dcd9a1e6d120f73970a9a98b7fcaaffba Mon Sep 17 00:00:00 2001
From: Lu Baolu
Date: Wed, 20 Nov 2019 14:10:16 +0800
Subject: [PATCH 7/8] iommu/vt-d: Remove incorrect PSI capability check

The PSI (Page Selective Invalidation) bit in the capability register is
only valid for second-level translation. Intel IOMMUs supporting
scalable mode must support page/address selective IOTLB invalidation
for first-level translation. Remove the PSI capability check in the SVA
cache invalidation code.

Fixes: 8744daf4b0699 ("iommu/vt-d: Remove global page flush support")
Cc: Jacob Pan
Signed-off-by: Lu Baolu
Signed-off-by: Joerg Roedel
---
 drivers/iommu/intel-svm.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
index 9b159132405d..dca88f9fdf29 100644
--- a/drivers/iommu/intel-svm.c
+++ b/drivers/iommu/intel-svm.c
@@ -104,11 +104,7 @@ static void intel_flush_svm_range_dev (struct intel_svm *svm, struct intel_svm_d
 {
         struct qi_desc desc;
 
-        /*
-         * Do PASID granu IOTLB invalidation if page selective capability is
-         * not available.
-         */
-        if (pages == -1 || !cap_pgsel_inv(svm->iommu->cap)) {
+        if (pages == -1) {
                 desc.qw0 = QI_EIOTLB_PASID(svm->pasid) |
                         QI_EIOTLB_DID(sdev->did) |
                         QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) |

From c18647900ec864d401ba09b3bbd5b34f331f8d26 Mon Sep 17 00:00:00 2001
From: Robin Murphy
Date: Mon, 9 Dec 2019 19:47:25 +0000
Subject: [PATCH 8/8] iommu/dma: Relax locking in iommu_dma_prepare_msi()

Since commit ece6e6f0218b ("iommu/dma-iommu: Split
iommu_dma_map_msi_msg() in two parts"), iommu_dma_prepare_msi() should
no longer have to worry about preempting itself, nor about being called
in atomic context at all. Thus we can downgrade the IRQ-safe locking to
a simple mutex to avoid angering the new might_sleep() check in
iommu_map().
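
For illustration only (not part of the patch): a minimal kernel-style
sketch of the general pattern being applied, with invented names. Once
a path is guaranteed to run only in sleepable context, an IRQ-safe
spinlock can become a mutex and GFP_ATOMIC allocations can become
GFP_KERNEL.

#include <linux/mutex.h>
#include <linux/slab.h>

static DEFINE_MUTEX(prepare_lock);      /* invented lock, illustration only */

static void *prepare_cookie(size_t size)
{
        void *cookie;

        /* Both the lock and the allocation may now sleep, so no IRQ
         * state needs saving and restoring around the critical section. */
        mutex_lock(&prepare_lock);
        cookie = kzalloc(size, GFP_KERNEL);
        mutex_unlock(&prepare_lock);

        return cookie;
}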
Reported-by: Qian Cai
Tested-by: Jean-Philippe Brucker
Signed-off-by: Robin Murphy
Signed-off-by: Joerg Roedel
---
 drivers/iommu/dma-iommu.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 6e573d1cb8bf..c363294b3bb9 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -19,6 +19,7 @@
 #include <linux/iova.h>
 #include <linux/irq.h>
 #include <linux/mm.h>
+#include <linux/mutex.h>
 #include <linux/pci.h>
 #include <linux/scatterlist.h>
 #include <linux/vmalloc.h>
@@ -44,7 +45,6 @@ struct iommu_dma_cookie {
                 dma_addr_t              msi_iova;
         };
         struct list_head                msi_page_list;
-        spinlock_t                      msi_lock;
 
         /* Domain for flush queue callback; NULL if flush queue not in use */
         struct iommu_domain             *fq_domain;
@@ -63,7 +63,6 @@ static struct iommu_dma_cookie *cookie_alloc(enum iommu_dma_cookie_type type)
 
         cookie = kzalloc(sizeof(*cookie), GFP_KERNEL);
         if (cookie) {
-                spin_lock_init(&cookie->msi_lock);
                 INIT_LIST_HEAD(&cookie->msi_page_list);
                 cookie->type = type;
         }
@@ -1176,7 +1175,7 @@ static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev,
                 if (msi_page->phys == msi_addr)
                         return msi_page;
 
-        msi_page = kzalloc(sizeof(*msi_page), GFP_ATOMIC);
+        msi_page = kzalloc(sizeof(*msi_page), GFP_KERNEL);
         if (!msi_page)
                 return NULL;
 
@@ -1206,7 +1205,7 @@ int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr)
         struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
         struct iommu_dma_cookie *cookie;
         struct iommu_dma_msi_page *msi_page;
-        unsigned long flags;
+        static DEFINE_MUTEX(msi_prepare_lock); /* see below */
 
         if (!domain || !domain->iova_cookie) {
                 desc->iommu_cookie = NULL;
@@ -1216,13 +1215,13 @@ int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr)
         cookie = domain->iova_cookie;
 
         /*
-         * We disable IRQs to rule out a possible inversion against
-         * irq_desc_lock if, say, someone tries to retarget the affinity
-         * of an MSI from within an IPI handler.
+         * In fact the whole prepare operation should already be serialised by
+         * irq_domain_mutex further up the callchain, but that's pretty subtle
+         * on its own, so consider this locking as failsafe documentation...
          */
-        spin_lock_irqsave(&cookie->msi_lock, flags);
+        mutex_lock(&msi_prepare_lock);
         msi_page = iommu_dma_get_msi_page(dev, msi_addr, domain);
-        spin_unlock_irqrestore(&cookie->msi_lock, flags);
+        mutex_unlock(&msi_prepare_lock);
 
         msi_desc_set_iommu_cookie(desc, msi_page);