From eedb0b12d091a21909b5e84d9f3e5e649305bd12 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 Jan 2021 14:53:22 +0100 Subject: [PATCH 01/10] dma-mapping: add a dma_mmap_pages helper Add a helper to map memory allocated using dma_alloc_pages into a user address space, similar to the dma_alloc_attrs function for coherent allocations. Signed-off-by: Christoph Hellwig Reviewed-by: Tomasz Figa Tested-by: Ricardo Ribalda --- Documentation/core-api/dma-api.rst | 10 ++++++++++ include/linux/dma-mapping.h | 2 ++ kernel/dma/mapping.c | 13 +++++++++++++ 3 files changed, 25 insertions(+) diff --git a/Documentation/core-api/dma-api.rst b/Documentation/core-api/dma-api.rst index e6d23f117308..157a474ae544 100644 --- a/Documentation/core-api/dma-api.rst +++ b/Documentation/core-api/dma-api.rst @@ -563,6 +563,16 @@ Free a region of memory previously allocated using dma_alloc_pages(). dev, size, dma_handle and dir must all be the same as those passed into dma_alloc_pages(). page must be the pointer returned by dma_alloc_pages(). +:: + + int + dma_mmap_pages(struct device *dev, struct vm_area_struct *vma, + size_t size, struct page *page) + +Map an allocation returned from dma_alloc_pages() into a user address space. +dev and size must be the same as those passed into dma_alloc_pages(). +page must be the pointer returned by dma_alloc_pages(). 
+ :: void * diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 2a984cb4d1e0..2b8dce756e1f 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -263,6 +263,8 @@ struct page *dma_alloc_pages(struct device *dev, size_t size, dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp); void dma_free_pages(struct device *dev, size_t size, struct page *page, dma_addr_t dma_handle, enum dma_data_direction dir); +int dma_mmap_pages(struct device *dev, struct vm_area_struct *vma, + size_t size, struct page *page); static inline void *dma_alloc_noncoherent(struct device *dev, size_t size, dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp) diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index b6a633679933..9ce86c77651c 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -517,6 +517,19 @@ void dma_free_pages(struct device *dev, size_t size, struct page *page, } EXPORT_SYMBOL_GPL(dma_free_pages); +int dma_mmap_pages(struct device *dev, struct vm_area_struct *vma, + size_t size, struct page *page) +{ + unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; + + if (vma->vm_pgoff >= count || vma_pages(vma) > count - vma->vm_pgoff) + return -ENXIO; + return remap_pfn_range(vma, vma->vm_start, + page_to_pfn(page) + vma->vm_pgoff, + vma_pages(vma) << PAGE_SHIFT, vma->vm_page_prot); +} +EXPORT_SYMBOL_GPL(dma_mmap_pages); + int dma_supported(struct device *dev, u64 mask) { const struct dma_map_ops *ops = get_dma_ops(dev); From 198c50e2ccff5c78ddbe0cb01593ac32458deb69 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Feb 2021 18:12:02 +0100 Subject: [PATCH 02/10] dma-mapping: refactor dma_{alloc,free}_pages Factor out internal versions without the dma_debug calls in preparation for callers that will need different dma_debug calls. Note that this changes the dma_debug calls to get the non-page-aligned size values, but as long as alloc and free agree on one variant we are fine. 
Signed-off-by: Christoph Hellwig Reviewed-by: Tomasz Figa Tested-by: Ricardo Ribalda --- kernel/dma/mapping.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 9ce86c77651c..07f964ebcda1 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -477,11 +477,10 @@ void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr, } EXPORT_SYMBOL(dma_free_attrs); -struct page *dma_alloc_pages(struct device *dev, size_t size, +static struct page *__dma_alloc_pages(struct device *dev, size_t size, dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp) { const struct dma_map_ops *ops = get_dma_ops(dev); - struct page *page; if (WARN_ON_ONCE(!dev->coherent_dma_mask)) return NULL; @@ -490,31 +489,41 @@ struct page *dma_alloc_pages(struct device *dev, size_t size, size = PAGE_ALIGN(size); if (dma_alloc_direct(dev, ops)) - page = dma_direct_alloc_pages(dev, size, dma_handle, dir, gfp); - else if (ops->alloc_pages) - page = ops->alloc_pages(dev, size, dma_handle, dir, gfp); - else + return dma_direct_alloc_pages(dev, size, dma_handle, dir, gfp); + if (!ops->alloc_pages) return NULL; + return ops->alloc_pages(dev, size, dma_handle, dir, gfp); +} - debug_dma_map_page(dev, page, 0, size, dir, *dma_handle); +struct page *dma_alloc_pages(struct device *dev, size_t size, + dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp) +{ + struct page *page = __dma_alloc_pages(dev, size, dma_handle, dir, gfp); + if (page) + debug_dma_map_page(dev, page, 0, size, dir, *dma_handle); return page; } EXPORT_SYMBOL_GPL(dma_alloc_pages); -void dma_free_pages(struct device *dev, size_t size, struct page *page, +static void __dma_free_pages(struct device *dev, size_t size, struct page *page, dma_addr_t dma_handle, enum dma_data_direction dir) { const struct dma_map_ops *ops = get_dma_ops(dev); size = PAGE_ALIGN(size); - debug_dma_unmap_page(dev, dma_handle, size, dir); - if 
(dma_alloc_direct(dev, ops)) dma_direct_free_pages(dev, size, page, dma_handle, dir); else if (ops->free_pages) ops->free_pages(dev, size, page, dma_handle, dir); } + +void dma_free_pages(struct device *dev, size_t size, struct page *page, + dma_addr_t dma_handle, enum dma_data_direction dir) +{ + debug_dma_unmap_page(dev, dma_handle, size, dir); + __dma_free_pages(dev, size, page, dma_handle, dir); +} EXPORT_SYMBOL_GPL(dma_free_pages); int dma_mmap_pages(struct device *dev, struct vm_area_struct *vma, From 7d5b5738d1514e9dd8ed452660e2a4d25beb9483 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 Jan 2021 14:54:18 +0100 Subject: [PATCH 03/10] dma-mapping: add a dma_alloc_noncontiguous API Add a new API that returns a potentially virtually non-contiguous sg_table and a DMA address. This API is only properly implemented for dma-iommu and will simply return a contiguous chunk as a fallback. The intent is that drivers can use this API if either: - no kernel mapping or only temporary kernel mappings are required. That is as a better replacement for DMA_ATTR_NO_KERNEL_MAPPING - a kernel mapping is required for cached and DMA mapped pages, but the driver also needs the pages to e.g. map them to userspace. 
In that sense it is a replacement for some aspects of the recently removed and never fully implemented DMA_ATTR_NON_CONSISTENT Signed-off-by: Christoph Hellwig Reviewed-by: Tomasz Figa Tested-by: Ricardo Ribalda --- Documentation/core-api/dma-api.rst | 78 +++++++++++++++++++++ include/linux/dma-map-ops.h | 19 ++++++ include/linux/dma-mapping.h | 32 +++++++++ kernel/dma/mapping.c | 106 +++++++++++++++++++++++++++++ 4 files changed, 235 insertions(+) diff --git a/Documentation/core-api/dma-api.rst b/Documentation/core-api/dma-api.rst index 157a474ae544..00a1d4fa3f9e 100644 --- a/Documentation/core-api/dma-api.rst +++ b/Documentation/core-api/dma-api.rst @@ -594,6 +594,84 @@ dev, size, dma_handle and dir must all be the same as those passed into dma_alloc_noncoherent(). cpu_addr must be the virtual address returned by dma_alloc_noncoherent(). +:: + + struct sg_table * + dma_alloc_noncontiguous(struct device *dev, size_t size, + enum dma_data_direction dir, gfp_t gfp, + unsigned long attrs); + +This routine allocates bytes of non-coherent and possibly non-contiguous +memory. It returns a pointer to struct sg_table that describes the allocated +and DMA mapped memory, or NULL if the allocation failed. The resulting memory +can be used for struct page mapped into a scatterlist are suitable for. + +The return sg_table is guaranteed to have 1 single DMA mapped segment as +indicated by sgt->nents, but it might have multiple CPU side segments as +indicated by sgt->orig_nents. + +The dir parameter specified if data is read and/or written by the device, +see dma_map_single() for details. + +The gfp parameter allows the caller to specify the ``GFP_`` flags (see +kmalloc()) for the allocation, but rejects flags used to specify a memory +zone such as GFP_DMA or GFP_HIGHMEM. + +The attrs argument must be either 0 or DMA_ATTR_ALLOC_SINGLE_PAGES. 
+ +Before giving the memory to the device, dma_sync_sgtable_for_device() needs +to be called, and before reading memory written by the device, +dma_sync_sgtable_for_cpu(), just like for streaming DMA mappings that are +reused. + +:: + + void + dma_free_noncontiguous(struct device *dev, size_t size, + struct sg_table *sgt, + enum dma_data_direction dir) + +Free memory previously allocated using dma_alloc_noncontiguous(). dev, size, +and dir must all be the same as those passed into dma_alloc_noncontiguous(). +sgt must be the pointer returned by dma_alloc_noncontiguous(). + +:: + + void * + dma_vmap_noncontiguous(struct device *dev, size_t size, + struct sg_table *sgt) + +Return a contiguous kernel mapping for an allocation returned from +dma_alloc_noncontiguous(). dev and size must be the same as those passed into +dma_alloc_noncontiguous(). sgt must be the pointer returned by +dma_alloc_noncontiguous(). + +Once a non-contiguous allocation is mapped using this function, the +flush_kernel_vmap_range() and invalidate_kernel_vmap_range() APIs must be used +to manage the coherency between the kernel mapping, the device and user space +mappings (if any). + +:: + + void + dma_vunmap_noncontiguous(struct device *dev, void *vaddr) + +Unmap a kernel mapping returned by dma_vmap_noncontiguous(). dev must be the +same as the one passed into dma_alloc_noncontiguous(). vaddr must be the pointer +returned by dma_vmap_noncontiguous(). + + +:: + + int + dma_mmap_noncontiguous(struct device *dev, struct vm_area_struct *vma, + size_t size, struct sg_table *sgt) + +Map an allocation returned from dma_alloc_noncontiguous() into a user address +space. dev and size must be the same as those passed into +dma_alloc_noncontiguous(). sgt must be the pointer returned by +dma_alloc_noncontiguous(). 
+ :: int diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index 51872e736e7b..0d53a96a3d64 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -22,6 +22,11 @@ struct dma_map_ops { gfp_t gfp); void (*free_pages)(struct device *dev, size_t size, struct page *vaddr, dma_addr_t dma_handle, enum dma_data_direction dir); + struct sg_table *(*alloc_noncontiguous)(struct device *dev, size_t size, + enum dma_data_direction dir, gfp_t gfp, + unsigned long attrs); + void (*free_noncontiguous)(struct device *dev, size_t size, + struct sg_table *sgt, enum dma_data_direction dir); int (*mmap)(struct device *, struct vm_area_struct *, void *, dma_addr_t, size_t, unsigned long attrs); @@ -198,6 +203,20 @@ static inline int dma_mmap_from_global_coherent(struct vm_area_struct *vma, } #endif /* CONFIG_DMA_DECLARE_COHERENT */ +/* + * This is the actual return value from the ->alloc_noncontiguous method. + * The users of the DMA API should only care about the sg_table, but to make + * the DMA-API internal vmaping and freeing easier we stash away the page + * array as well (except for the fallback case). This can go away any time, + * e.g. when a vmap-variant that takes a scatterlist comes along. 
+ */ +struct dma_sgt_handle { + struct sg_table sgt; + struct page **pages; +}; +#define sgt_handle(sgt) \ + container_of((sgt), struct dma_sgt_handle, sgt) + int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs); diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 2b8dce756e1f..954847f9a3e0 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -144,6 +144,15 @@ u64 dma_get_required_mask(struct device *dev); size_t dma_max_mapping_size(struct device *dev); bool dma_need_sync(struct device *dev, dma_addr_t dma_addr); unsigned long dma_get_merge_boundary(struct device *dev); +struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size, + enum dma_data_direction dir, gfp_t gfp, unsigned long attrs); +void dma_free_noncontiguous(struct device *dev, size_t size, + struct sg_table *sgt, enum dma_data_direction dir); +void *dma_vmap_noncontiguous(struct device *dev, size_t size, + struct sg_table *sgt); +void dma_vunmap_noncontiguous(struct device *dev, void *vaddr); +int dma_mmap_noncontiguous(struct device *dev, struct vm_area_struct *vma, + size_t size, struct sg_table *sgt); #else /* CONFIG_HAS_DMA */ static inline dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, size_t offset, size_t size, @@ -257,6 +266,29 @@ static inline unsigned long dma_get_merge_boundary(struct device *dev) { return 0; } +static inline struct sg_table *dma_alloc_noncontiguous(struct device *dev, + size_t size, enum dma_data_direction dir, gfp_t gfp, + unsigned long attrs) +{ + return NULL; +} +static inline void dma_free_noncontiguous(struct device *dev, size_t size, + struct sg_table *sgt, enum dma_data_direction dir) +{ +} +static inline void *dma_vmap_noncontiguous(struct device *dev, size_t size, + struct sg_table *sgt) +{ + return NULL; +} +static inline void dma_vunmap_noncontiguous(struct device *dev, void *vaddr) +{ +} 
+static inline int dma_mmap_noncontiguous(struct device *dev, + struct vm_area_struct *vma, size_t size, struct sg_table *sgt) +{ + return -EINVAL; +} #endif /* CONFIG_HAS_DMA */ struct page *dma_alloc_pages(struct device *dev, size_t size, diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 07f964ebcda1..2b06a809d0b9 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -539,6 +539,112 @@ int dma_mmap_pages(struct device *dev, struct vm_area_struct *vma, } EXPORT_SYMBOL_GPL(dma_mmap_pages); +static struct sg_table *alloc_single_sgt(struct device *dev, size_t size, + enum dma_data_direction dir, gfp_t gfp) +{ + struct sg_table *sgt; + struct page *page; + + sgt = kmalloc(sizeof(*sgt), gfp); + if (!sgt) + return NULL; + if (sg_alloc_table(sgt, 1, gfp)) + goto out_free_sgt; + page = __dma_alloc_pages(dev, size, &sgt->sgl->dma_address, dir, gfp); + if (!page) + goto out_free_table; + sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0); + sg_dma_len(sgt->sgl) = sgt->sgl->length; + return sgt; +out_free_table: + sg_free_table(sgt); +out_free_sgt: + kfree(sgt); + return NULL; +} + +struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size, + enum dma_data_direction dir, gfp_t gfp, unsigned long attrs) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + struct sg_table *sgt; + + if (WARN_ON_ONCE(attrs & ~DMA_ATTR_ALLOC_SINGLE_PAGES)) + return NULL; + + if (ops && ops->alloc_noncontiguous) + sgt = ops->alloc_noncontiguous(dev, size, dir, gfp, attrs); + else + sgt = alloc_single_sgt(dev, size, dir, gfp); + + if (sgt) { + sgt->nents = 1; + debug_dma_map_sg(dev, sgt->sgl, sgt->orig_nents, 1, dir); + } + return sgt; +} +EXPORT_SYMBOL_GPL(dma_alloc_noncontiguous); + +static void free_single_sgt(struct device *dev, size_t size, + struct sg_table *sgt, enum dma_data_direction dir) +{ + __dma_free_pages(dev, size, sg_page(sgt->sgl), sgt->sgl->dma_address, + dir); + sg_free_table(sgt); + kfree(sgt); +} + +void dma_free_noncontiguous(struct 
device *dev, size_t size, + struct sg_table *sgt, enum dma_data_direction dir) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + debug_dma_unmap_sg(dev, sgt->sgl, sgt->orig_nents, dir); + if (ops && ops->free_noncontiguous) + ops->free_noncontiguous(dev, size, sgt, dir); + else + free_single_sgt(dev, size, sgt, dir); +} +EXPORT_SYMBOL_GPL(dma_free_noncontiguous); + +void *dma_vmap_noncontiguous(struct device *dev, size_t size, + struct sg_table *sgt) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; + + if (ops && ops->alloc_noncontiguous) + return vmap(sgt_handle(sgt)->pages, count, VM_MAP, PAGE_KERNEL); + return page_address(sg_page(sgt->sgl)); +} +EXPORT_SYMBOL_GPL(dma_vmap_noncontiguous); + +void dma_vunmap_noncontiguous(struct device *dev, void *vaddr) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + if (ops && ops->alloc_noncontiguous) + vunmap(vaddr); +} +EXPORT_SYMBOL_GPL(dma_vunmap_noncontiguous); + +int dma_mmap_noncontiguous(struct device *dev, struct vm_area_struct *vma, + size_t size, struct sg_table *sgt) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + if (ops && ops->alloc_noncontiguous) { + unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; + + if (vma->vm_pgoff >= count || + vma_pages(vma) > count - vma->vm_pgoff) + return -ENXIO; + return vm_map_pages(vma, sgt_handle(sgt)->pages, count); + } + return dma_mmap_pages(dev, vma, size, sg_page(sgt->sgl)); +} +EXPORT_SYMBOL_GPL(dma_mmap_noncontiguous); + int dma_supported(struct device *dev, u64 mask) { const struct dma_map_ops *ops = get_dma_ops(dev); From 8230ce9a4e206fa1be17d66245f87cae2935d7d2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 Jan 2021 14:44:15 +0100 Subject: [PATCH 04/10] dma-iommu: refactor iommu_dma_alloc_remap Split out a new helper that only allocates a sg_table worth of memory without mapping it into contiguous kernel address space. 
Signed-off-by: Christoph Hellwig Reviewed-by: Tomasz Figa Tested-by: Ricardo Ribalda --- drivers/iommu/dma-iommu.c | 69 ++++++++++++++++++++------------------- 1 file changed, 36 insertions(+), 33 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index af765c813cc8..ec1abad156db 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -650,23 +650,12 @@ static struct page **__iommu_dma_alloc_pages(struct device *dev, return pages; } -/** - * iommu_dma_alloc_remap - Allocate and map a buffer contiguous in IOVA space - * @dev: Device to allocate memory for. Must be a real device - * attached to an iommu_dma_domain - * @size: Size of buffer in bytes - * @dma_handle: Out argument for allocated DMA handle - * @gfp: Allocation flags - * @prot: pgprot_t to use for the remapped mapping - * @attrs: DMA attributes for this allocation - * - * If @size is less than PAGE_SIZE, then a full CPU page will be allocated, +/* + * If size is less than PAGE_SIZE, then a full CPU page will be allocated, * but an IOMMU which supports smaller pages might not map the whole thing. - * - * Return: Mapped virtual address, or NULL on failure. 
*/ -static void *iommu_dma_alloc_remap(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp, pgprot_t prot, +static struct page **__iommu_dma_alloc_noncontiguous(struct device *dev, + size_t size, struct sg_table *sgt, gfp_t gfp, pgprot_t prot, unsigned long attrs) { struct iommu_domain *domain = iommu_get_dma_domain(dev); @@ -676,11 +665,7 @@ static void *iommu_dma_alloc_remap(struct device *dev, size_t size, int ioprot = dma_info_to_prot(DMA_BIDIRECTIONAL, coherent, attrs); unsigned int count, min_size, alloc_sizes = domain->pgsize_bitmap; struct page **pages; - struct sg_table sgt; dma_addr_t iova; - void *vaddr; - - *dma_handle = DMA_MAPPING_ERROR; if (static_branch_unlikely(&iommu_deferred_attach_enabled) && iommu_deferred_attach(dev, domain)) @@ -707,34 +692,26 @@ static void *iommu_dma_alloc_remap(struct device *dev, size_t size, if (!iova) goto out_free_pages; - if (sg_alloc_table_from_pages(&sgt, pages, count, 0, size, GFP_KERNEL)) + if (sg_alloc_table_from_pages(sgt, pages, count, 0, size, GFP_KERNEL)) goto out_free_iova; if (!(ioprot & IOMMU_CACHE)) { struct scatterlist *sg; int i; - for_each_sg(sgt.sgl, sg, sgt.orig_nents, i) + for_each_sg(sgt->sgl, sg, sgt->orig_nents, i) arch_dma_prep_coherent(sg_page(sg), sg->length); } - if (iommu_map_sg_atomic(domain, iova, sgt.sgl, sgt.orig_nents, ioprot) + if (iommu_map_sg_atomic(domain, iova, sgt->sgl, sgt->orig_nents, ioprot) < size) goto out_free_sg; - vaddr = dma_common_pages_remap(pages, size, prot, - __builtin_return_address(0)); - if (!vaddr) - goto out_unmap; + sgt->sgl->dma_address = iova; + return pages; - *dma_handle = iova; - sg_free_table(&sgt); - return vaddr; - -out_unmap: - __iommu_dma_unmap(dev, iova, size); out_free_sg: - sg_free_table(&sgt); + sg_free_table(sgt); out_free_iova: iommu_dma_free_iova(cookie, iova, size, NULL); out_free_pages: @@ -742,6 +719,32 @@ out_free_pages: return NULL; } +static void *iommu_dma_alloc_remap(struct device *dev, size_t size, + dma_addr_t 
*dma_handle, gfp_t gfp, pgprot_t prot, + unsigned long attrs) +{ + struct page **pages; + struct sg_table sgt; + void *vaddr; + + pages = __iommu_dma_alloc_noncontiguous(dev, size, &sgt, gfp, prot, + attrs); + if (!pages) + return NULL; + *dma_handle = sgt.sgl->dma_address; + sg_free_table(&sgt); + vaddr = dma_common_pages_remap(pages, size, prot, + __builtin_return_address(0)); + if (!vaddr) + goto out_unmap; + return vaddr; + +out_unmap: + __iommu_dma_unmap(dev, *dma_handle, size); + __iommu_dma_free_pages(pages, PAGE_ALIGN(size) >> PAGE_SHIFT); + return NULL; +} + static void iommu_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction dir) { From e817ee5f2f95ca58a3b961ae4acfd3885e830b9c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 Jan 2021 14:47:29 +0100 Subject: [PATCH 05/10] dma-iommu: implement ->alloc_noncontiguous Implement support for allocating a non-contiguous DMA region. Signed-off-by: Christoph Hellwig Reviewed-by: Tomasz Figa Tested-by: Ricardo Ribalda --- drivers/iommu/dma-iommu.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index ec1abad156db..1946422e4ac7 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -708,6 +708,7 @@ static struct page **__iommu_dma_alloc_noncontiguous(struct device *dev, goto out_free_sg; sgt->sgl->dma_address = iova; + sgt->sgl->dma_length = size; return pages; out_free_sg: @@ -745,6 +746,37 @@ out_unmap: return NULL; } +#ifdef CONFIG_DMA_REMAP +static struct sg_table *iommu_dma_alloc_noncontiguous(struct device *dev, + size_t size, enum dma_data_direction dir, gfp_t gfp, + unsigned long attrs) +{ + struct dma_sgt_handle *sh; + + sh = kmalloc(sizeof(*sh), gfp); + if (!sh) + return NULL; + + sh->pages = __iommu_dma_alloc_noncontiguous(dev, size, &sh->sgt, gfp, + PAGE_KERNEL, attrs); + if (!sh->pages) { + kfree(sh); + return NULL; + } + 
return &sh->sgt; +} + +static void iommu_dma_free_noncontiguous(struct device *dev, size_t size, + struct sg_table *sgt, enum dma_data_direction dir) +{ + struct dma_sgt_handle *sh = sgt_handle(sgt); + + __iommu_dma_unmap(dev, sgt->sgl->dma_address, size); + __iommu_dma_free_pages(sh->pages, PAGE_ALIGN(size) >> PAGE_SHIFT); + sg_free_table(&sh->sgt); +} +#endif /* CONFIG_DMA_REMAP */ + static void iommu_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction dir) { @@ -1261,6 +1293,10 @@ static const struct dma_map_ops iommu_dma_ops = { .free = iommu_dma_free, .alloc_pages = dma_common_alloc_pages, .free_pages = dma_common_free_pages, +#ifdef CONFIG_DMA_REMAP + .alloc_noncontiguous = iommu_dma_alloc_noncontiguous, + .free_noncontiguous = iommu_dma_free_noncontiguous, +#endif .mmap = iommu_dma_mmap, .get_sgtable = iommu_dma_get_sgtable, .map_page = iommu_dma_map_page, From 20e1dbf2bbe2431072571000ed31dfef09359c08 Mon Sep 17 00:00:00 2001 From: Ricardo Ribalda Date: Sat, 13 Mar 2021 00:55:20 +0100 Subject: [PATCH 06/10] media: uvcvideo: Use dma_alloc_noncontiguous API On architectures where there is no coherent caching such as ARM use the dma_alloc_noncontiguous API and handle manually the cache flushing using dma_sync_sgtable(). If the architecture has coherent cache, the API falls back to dma_alloc_pages, so we can remove the coherent caching code-path from the driver, making it simpler. With this patch on the affected architectures we can measure up to 20x performance improvement in uvc_video_copy_data_work(). 
Eg: aarch64 with an external usb camera NON_CONTIGUOUS frames: 999 packets: 999 empty: 0 (0 %) errors: 0 invalid: 0 pts: 0 early, 0 initial, 999 ok scr: 0 count ok, 0 diff ok sof: 2048 <= sof <= 0, freq 0.000 kHz bytes 67034480 : duration 33303 FPS: 29.99 URB: 523446/4993 uS/qty: 104.836 avg 132.532 std 13.230 min 831.094 max (uS) header: 76564/4993 uS/qty: 15.334 avg 15.229 std 3.438 min 186.875 max (uS) latency: 468945/4992 uS/qty: 93.939 avg 132.577 std 9.531 min 824.010 max (uS) decode: 54161/4993 uS/qty: 10.847 avg 6.313 std 1.614 min 111.458 max (uS) raw decode speed: 9.931 Gbits/s raw URB handling speed: 1.025 Gbits/s throughput: 16.102 Mbits/s URB decode CPU usage 0.162600 % COHERENT frames: 999 packets: 999 empty: 0 (0 %) errors: 0 invalid: 0 pts: 0 early, 0 initial, 999 ok scr: 0 count ok, 0 diff ok sof: 2048 <= sof <= 0, freq 0.000 kHz bytes 54683536 : duration 33302 FPS: 29.99 URB: 1478135/4000 uS/qty: 369.533 avg 390.357 std 22.968 min 3337.865 max (uS) header: 79761/4000 uS/qty: 19.940 avg 18.495 std 1.875 min 336.719 max (uS) latency: 281077/4000 uS/qty: 70.269 avg 83.102 std 5.104 min 735.000 max (uS) decode: 1197057/4000 uS/qty: 299.264 avg 318.080 std 1.615 min 2806.667 max (uS) raw decode speed: 365.470 Mbits/s raw URB handling speed: 295.986 Mbits/s throughput: 13.136 Mbits/s URB decode CPU usage 3.594500 % In non-affected architectures we see no significant impact. 
Eg: x86 with an external usb camera NON_CONTIGUOUS frames: 999 packets: 999 empty: 0 (0 %) errors: 0 invalid: 0 pts: 0 early, 0 initial, 999 ok scr: 0 count ok, 0 diff ok sof: 2048 <= sof <= 0, freq 0.000 kHz bytes 70179056 : duration 33301 FPS: 29.99 URB: 288901/4897 uS/qty: 58.995 avg 26.022 std 4.319 min 253.853 max (uS) header: 54792/4897 uS/qty: 11.189 avg 6.218 std 0.620 min 61.750 max (uS) latency: 236602/4897 uS/qty: 48.315 avg 24.244 std 1.764 min 240.924 max (uS) decode: 52298/4897 uS/qty: 10.679 avg 8.299 std 1.638 min 108.861 max (uS) raw decode speed: 10.796 Gbits/s raw URB handling speed: 1.949 Gbits/s throughput: 16.859 Mbits/s URB decode CPU usage 0.157000 % COHERENT frames: 999 packets: 999 empty: 0 (0 %) errors: 0 invalid: 0 pts: 0 early, 0 initial, 999 ok scr: 0 count ok, 0 diff ok sof: 2048 <= sof <= 0, freq 0.000 kHz bytes 71818320 : duration 33301 FPS: 29.99 URB: 321021/5000 uS/qty: 64.204 avg 23.001 std 10.430 min 268.837 max (uS) header: 54308/5000 uS/qty: 10.861 avg 5.104 std 0.778 min 54.736 max (uS) latency: 268799/5000 uS/qty: 53.759 avg 21.827 std 6.095 min 255.153 max (uS) decode: 52222/5000 uS/qty: 10.444 avg 7.137 std 1.874 min 71.103 max (uS) raw decode speed: 11.048 Gbits/s raw URB handling speed: 1.789 Gbits/s throughput: 17.253 Mbits/s URB decode CPU usage 0.156800 % Signed-off-by: Ricardo Ribalda Reviewed-by: Laurent Pinchart Reviewed-by: Tomasz Figa Signed-off-by: Christoph Hellwig --- drivers/media/usb/uvc/uvc_video.c | 94 +++++++++++++++++++++++-------- drivers/media/usb/uvc/uvcvideo.h | 5 +- 2 files changed, 73 insertions(+), 26 deletions(-) diff --git a/drivers/media/usb/uvc/uvc_video.c b/drivers/media/usb/uvc/uvc_video.c index f2f565281e63..a777b389a66e 100644 --- a/drivers/media/usb/uvc/uvc_video.c +++ b/drivers/media/usb/uvc/uvc_video.c @@ -6,11 +6,14 @@ * Laurent Pinchart (laurent.pinchart@ideasonboard.com) */ +#include +#include #include #include #include #include #include +#include #include #include #include @@ 
-1096,6 +1099,29 @@ static int uvc_video_decode_start(struct uvc_streaming *stream, return data[0]; } +static inline enum dma_data_direction uvc_stream_dir( + struct uvc_streaming *stream) +{ + if (stream->type == V4L2_BUF_TYPE_VIDEO_CAPTURE) + return DMA_FROM_DEVICE; + else + return DMA_TO_DEVICE; +} + +static inline struct device *uvc_stream_to_dmadev(struct uvc_streaming *stream) +{ + return bus_to_hcd(stream->dev->udev->bus)->self.sysdev; +} + +static int uvc_submit_urb(struct uvc_urb *uvc_urb, gfp_t mem_flags) +{ + /* Sync DMA. */ + dma_sync_sgtable_for_device(uvc_stream_to_dmadev(uvc_urb->stream), + uvc_urb->sgt, + uvc_stream_dir(uvc_urb->stream)); + return usb_submit_urb(uvc_urb->urb, mem_flags); +} + /* * uvc_video_decode_data_work: Asynchronous memcpy processing * @@ -1117,7 +1143,7 @@ static void uvc_video_copy_data_work(struct work_struct *work) uvc_queue_buffer_release(op->buf); } - ret = usb_submit_urb(uvc_urb->urb, GFP_KERNEL); + ret = uvc_submit_urb(uvc_urb, GFP_KERNEL); if (ret < 0) dev_err(&uvc_urb->stream->intf->dev, "Failed to resubmit video URB (%d).\n", ret); @@ -1537,6 +1563,12 @@ static void uvc_video_complete(struct urb *urb) /* Re-initialise the URB async work. */ uvc_urb->async_operations = 0; + /* Sync DMA and invalidate vmap range. */ + dma_sync_sgtable_for_cpu(uvc_stream_to_dmadev(uvc_urb->stream), + uvc_urb->sgt, uvc_stream_dir(stream)); + invalidate_kernel_vmap_range(uvc_urb->buffer, + uvc_urb->stream->urb_size); + /* * Process the URB headers, and optionally queue expensive memcpy tasks * to be deferred to a work queue. @@ -1545,7 +1577,7 @@ static void uvc_video_complete(struct urb *urb) /* If no async work is needed, resubmit the URB immediately. 
*/ if (!uvc_urb->async_operations) { - ret = usb_submit_urb(uvc_urb->urb, GFP_ATOMIC); + ret = uvc_submit_urb(uvc_urb, GFP_ATOMIC); if (ret < 0) dev_err(&stream->intf->dev, "Failed to resubmit video URB (%d).\n", ret); @@ -1560,24 +1592,49 @@ static void uvc_video_complete(struct urb *urb) */ static void uvc_free_urb_buffers(struct uvc_streaming *stream) { + struct device *dma_dev = uvc_stream_to_dmadev(stream); struct uvc_urb *uvc_urb; for_each_uvc_urb(uvc_urb, stream) { if (!uvc_urb->buffer) continue; -#ifndef CONFIG_DMA_NONCOHERENT - usb_free_coherent(stream->dev->udev, stream->urb_size, - uvc_urb->buffer, uvc_urb->dma); -#else - kfree(uvc_urb->buffer); -#endif + dma_vunmap_noncontiguous(dma_dev, uvc_urb->buffer); + dma_free_noncontiguous(dma_dev, stream->urb_size, uvc_urb->sgt, + uvc_stream_dir(stream)); + uvc_urb->buffer = NULL; + uvc_urb->sgt = NULL; } stream->urb_size = 0; } +static bool uvc_alloc_urb_buffer(struct uvc_streaming *stream, + struct uvc_urb *uvc_urb, gfp_t gfp_flags) +{ + struct device *dma_dev = uvc_stream_to_dmadev(stream); + + uvc_urb->sgt = dma_alloc_noncontiguous(dma_dev, stream->urb_size, + uvc_stream_dir(stream), + gfp_flags, 0); + if (!uvc_urb->sgt) + return false; + uvc_urb->dma = uvc_urb->sgt->sgl->dma_address; + + uvc_urb->buffer = dma_vmap_noncontiguous(dma_dev, stream->urb_size, + uvc_urb->sgt); + if (!uvc_urb->buffer) { + dma_free_noncontiguous(dma_dev, stream->urb_size, + uvc_urb->sgt, + uvc_stream_dir(stream)); + uvc_urb->sgt = NULL; + return false; + } + + return true; +} + /* * Allocate transfer buffers. This function can be called with buffers * already allocated when resuming from suspend, in which case it will @@ -1608,19 +1665,12 @@ static int uvc_alloc_urb_buffers(struct uvc_streaming *stream, /* Retry allocations until one succeed. 
*/ for (; npackets > 1; npackets /= 2) { + stream->urb_size = psize * npackets; + for (i = 0; i < UVC_URBS; ++i) { struct uvc_urb *uvc_urb = &stream->uvc_urb[i]; - stream->urb_size = psize * npackets; -#ifndef CONFIG_DMA_NONCOHERENT - uvc_urb->buffer = usb_alloc_coherent( - stream->dev->udev, stream->urb_size, - gfp_flags | __GFP_NOWARN, &uvc_urb->dma); -#else - uvc_urb->buffer = - kmalloc(stream->urb_size, gfp_flags | __GFP_NOWARN); -#endif - if (!uvc_urb->buffer) { + if (!uvc_alloc_urb_buffer(stream, uvc_urb, gfp_flags)) { uvc_free_urb_buffers(stream); break; } @@ -1730,12 +1780,8 @@ static int uvc_init_video_isoc(struct uvc_streaming *stream, urb->context = uvc_urb; urb->pipe = usb_rcvisocpipe(stream->dev->udev, ep->desc.bEndpointAddress); -#ifndef CONFIG_DMA_NONCOHERENT urb->transfer_flags = URB_ISO_ASAP | URB_NO_TRANSFER_DMA_MAP; urb->transfer_dma = uvc_urb->dma; -#else - urb->transfer_flags = URB_ISO_ASAP; -#endif urb->interval = ep->desc.bInterval; urb->transfer_buffer = uvc_urb->buffer; urb->complete = uvc_video_complete; @@ -1795,10 +1841,8 @@ static int uvc_init_video_bulk(struct uvc_streaming *stream, usb_fill_bulk_urb(urb, stream->dev->udev, pipe, uvc_urb->buffer, size, uvc_video_complete, uvc_urb); -#ifndef CONFIG_DMA_NONCOHERENT urb->transfer_flags = URB_NO_TRANSFER_DMA_MAP; urb->transfer_dma = uvc_urb->dma; -#endif uvc_urb->urb = urb; } @@ -1895,7 +1939,7 @@ static int uvc_video_start_transfer(struct uvc_streaming *stream, /* Submit the URBs. 
*/ for_each_uvc_urb(uvc_urb, stream) { - ret = usb_submit_urb(uvc_urb->urb, gfp_flags); + ret = uvc_submit_urb(uvc_urb, gfp_flags); if (ret < 0) { dev_err(&stream->intf->dev, "Failed to submit URB %u (%d).\n", diff --git a/drivers/media/usb/uvc/uvcvideo.h b/drivers/media/usb/uvc/uvcvideo.h index 97df5ecd66c9..cce5e38133cd 100644 --- a/drivers/media/usb/uvc/uvcvideo.h +++ b/drivers/media/usb/uvc/uvcvideo.h @@ -219,6 +219,7 @@ */ struct gpio_desc; +struct sg_table; struct uvc_device; /* TODO: Put the most frequently accessed fields at the beginning of @@ -545,7 +546,8 @@ struct uvc_copy_op { * @urb: the URB described by this context structure * @stream: UVC streaming context * @buffer: memory storage for the URB - * @dma: DMA coherent addressing for the urb_buffer + * @dma: Allocated DMA handle + * @sgt: sgt_table with the urb locations in memory * @async_operations: counter to indicate the number of copy operations * @copy_operations: work descriptors for asynchronous copy operations * @work: work queue entry for asynchronous decode @@ -556,6 +558,7 @@ struct uvc_urb { char *buffer; dma_addr_t dma; + struct sg_table *sgt; unsigned int async_operations; struct uvc_copy_op copy_operations[UVC_MAX_PACKETS]; From 84fcfbdadbfdd86c9a43a52703203e05fe7efd92 Mon Sep 17 00:00:00 2001 From: Wang Qing Date: Fri, 12 Mar 2021 10:19:12 +0800 Subject: [PATCH 07/10] dma-mapping: remove a pointless empty line in dma_alloc_coherent Signed-off-by: Wang Qing Signed-off-by: Christoph Hellwig --- include/linux/dma-mapping.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 954847f9a3e0..e9d19b974f26 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -435,7 +435,6 @@ static inline void dma_sync_sgtable_for_device(struct device *dev, static inline void *dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp) { - return dma_alloc_attrs(dev, size, dma_handle, gfp, (gfp & 
__GFP_NOWARN) ? DMA_ATTR_NO_WARN : 0); } From 42e4eefb089f12ea900062ecdcc7ca10c3423a05 Mon Sep 17 00:00:00 2001 From: Hao Fang Date: Tue, 30 Mar 2021 14:33:48 +0800 Subject: [PATCH 08/10] dma-mapping: benchmark: use the correct HiSilicon copyright s/Hisilicon/HiSilicon/g. It should use capital S, according to https://www.hisilicon.com/en/terms-of-use. Signed-off-by: Hao Fang Acked-by: Barry Song Signed-off-by: Christoph Hellwig --- kernel/dma/map_benchmark.c | 2 +- tools/testing/selftests/dma/dma_map_benchmark.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c index e0e64f8b0739..00d6549a5495 100644 --- a/kernel/dma/map_benchmark.c +++ b/kernel/dma/map_benchmark.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (C) 2020 Hisilicon Limited. + * Copyright (C) 2020 HiSilicon Limited. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/tools/testing/selftests/dma/dma_map_benchmark.c b/tools/testing/selftests/dma/dma_map_benchmark.c index fb23ce9617ea..b492bed0936d 100644 --- a/tools/testing/selftests/dma/dma_map_benchmark.c +++ b/tools/testing/selftests/dma/dma_map_benchmark.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (C) 2020 Hisilicon Limited. + * Copyright (C) 2020 HiSilicon Limited. 
*/ #include From ca947482b0b30443e6da1f0f5ba7244e34a4f65a Mon Sep 17 00:00:00 2001 From: Xiang Chen Date: Thu, 18 Mar 2021 17:29:30 +0800 Subject: [PATCH 09/10] dma-mapping: benchmark: Add support for multi-pages map/unmap Currently the dma-map benchmark only supports mapping/unmapping one page at a time, but there are other scenarios which need support for multi-page map/unmap: for multi-page interfaces such as dma_alloc_coherent() and dma_map_sg(), the time spent on a multi-page map/unmap is not the time of a single page * npages (it is not linear), as it may use block description instead of page description when the size is suitable, such as 2M/1G, and it can also send a single TLB invalidation command to invalidate multiple pages instead of sending one multiple times when RIL is enabled (which will shorten the time of unmap). So it is necessary to add support for multi-pages map/unmap. Add a parameter "-g" to support multi-pages map/unmap. Signed-off-by: Xiang Chen Acked-by: Barry Song Signed-off-by: Christoph Hellwig --- kernel/dma/map_benchmark.c | 21 ++++++++++++------- .../testing/selftests/dma/dma_map_benchmark.c | 20 ++++++++++++++---- 2 files changed, 30 insertions(+), 11 deletions(-) diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c index 00d6549a5495..9b9af1bd6be3 100644 --- a/kernel/dma/map_benchmark.c +++ b/kernel/dma/map_benchmark.c @@ -38,7 +38,8 @@ struct map_benchmark { __u32 dma_bits; /* DMA addressing capability */ __u32 dma_dir; /* DMA data direction */ __u32 dma_trans_ns; /* time for DMA transmission in ns */ - __u8 expansion[80]; /* For future use */ + __u32 granule; /* how many PAGE_SIZE will do map/unmap once a time */ + __u8 expansion[76]; /* For future use */ }; struct map_benchmark_data { @@ -58,9 +59,11 @@ static int map_benchmark_thread(void *data) void *buf; dma_addr_t dma_addr; struct map_benchmark_data *map = data; + int npages = map->bparam.granule; + u64 size = npages * PAGE_SIZE; int ret = 0; - buf = (void
*)__get_free_page(GFP_KERNEL); + buf = alloc_pages_exact(size, GFP_KERNEL); if (!buf) return -ENOMEM; @@ -76,10 +79,10 @@ static int map_benchmark_thread(void *data) * 66 means evertything goes well! 66 is lucky. */ if (map->dir != DMA_FROM_DEVICE) - memset(buf, 0x66, PAGE_SIZE); + memset(buf, 0x66, size); map_stime = ktime_get(); - dma_addr = dma_map_single(map->dev, buf, PAGE_SIZE, map->dir); + dma_addr = dma_map_single(map->dev, buf, size, map->dir); if (unlikely(dma_mapping_error(map->dev, dma_addr))) { pr_err("dma_map_single failed on %s\n", dev_name(map->dev)); @@ -93,7 +96,7 @@ static int map_benchmark_thread(void *data) ndelay(map->bparam.dma_trans_ns); unmap_stime = ktime_get(); - dma_unmap_single(map->dev, dma_addr, PAGE_SIZE, map->dir); + dma_unmap_single(map->dev, dma_addr, size, map->dir); unmap_etime = ktime_get(); unmap_delta = ktime_sub(unmap_etime, unmap_stime); @@ -112,7 +115,7 @@ static int map_benchmark_thread(void *data) } out: - free_page((unsigned long)buf); + free_pages_exact(buf, size); return ret; } @@ -203,7 +206,6 @@ static long map_benchmark_ioctl(struct file *file, unsigned int cmd, struct map_benchmark_data *map = file->private_data; void __user *argp = (void __user *)arg; u64 old_dma_mask; - int ret; if (copy_from_user(&map->bparam, argp, sizeof(map->bparam))) @@ -234,6 +236,11 @@ static long map_benchmark_ioctl(struct file *file, unsigned int cmd, return -EINVAL; } + if (map->bparam.granule < 1 || map->bparam.granule > 1024) { + pr_err("invalid granule size\n"); + return -EINVAL; + } + switch (map->bparam.dma_dir) { case DMA_MAP_BIDIRECTIONAL: map->dir = DMA_BIDIRECTIONAL; diff --git a/tools/testing/selftests/dma/dma_map_benchmark.c b/tools/testing/selftests/dma/dma_map_benchmark.c index b492bed0936d..485dff51bad2 100644 --- a/tools/testing/selftests/dma/dma_map_benchmark.c +++ b/tools/testing/selftests/dma/dma_map_benchmark.c @@ -40,7 +40,8 @@ struct map_benchmark { __u32 dma_bits; /* DMA addressing capability */ __u32 dma_dir; /* 
DMA data direction */ __u32 dma_trans_ns; /* time for DMA transmission in ns */ - __u8 expansion[80]; /* For future use */ + __u32 granule; /* how many PAGE_SIZE will do map/unmap once a time */ + __u8 expansion[76]; /* For future use */ }; int main(int argc, char **argv) @@ -51,11 +52,13 @@ int main(int argc, char **argv) int threads = 1, seconds = 20, node = -1; /* default dma mask 32bit, bidirectional DMA */ int bits = 32, xdelay = 0, dir = DMA_MAP_BIDIRECTIONAL; + /* default granule 1 PAGESIZE */ + int granule = 1; int cmd = DMA_MAP_BENCHMARK; char *p; - while ((opt = getopt(argc, argv, "t:s:n:b:d:x:")) != -1) { + while ((opt = getopt(argc, argv, "t:s:n:b:d:x:g:")) != -1) { switch (opt) { case 't': threads = atoi(optarg); @@ -75,6 +78,9 @@ int main(int argc, char **argv) case 'x': xdelay = atoi(optarg); break; + case 'g': + granule = atoi(optarg); + break; default: return -1; } @@ -110,6 +116,11 @@ int main(int argc, char **argv) exit(1); } + if (granule < 1 || granule > 1024) { + fprintf(stderr, "invalid granule size\n"); + exit(1); + } + fd = open("/sys/kernel/debug/dma_map_benchmark", O_RDWR); if (fd == -1) { perror("open"); @@ -123,14 +134,15 @@ int main(int argc, char **argv) map.dma_bits = bits; map.dma_dir = dir; map.dma_trans_ns = xdelay; + map.granule = granule; if (ioctl(fd, cmd, &map)) { perror("ioctl"); exit(1); } - printf("dma mapping benchmark: threads:%d seconds:%d node:%d dir:%s\n", - threads, seconds, node, dir[directions]); + printf("dma mapping benchmark: threads:%d seconds:%d node:%d dir:%s granule: %d\n", + threads, seconds, node, dir[directions], granule); printf("average map latency(us):%.1f standard deviation:%.1f\n", map.avg_map_100ns/10.0, map.map_stddev/10.0); printf("average unmap latency(us):%.1f standard deviation:%.1f\n", From a7f3d3d3600c8ed119eb0d2483de0062ce2e3707 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 26 Mar 2021 22:03:05 +0100 Subject: [PATCH 10/10] dma-mapping: add unlikely hint to error path in 
dma_mapping_error Zillions of drivers use the unlikely() hint when checking the result of dma_mapping_error(). This is an inline function anyway, so we can move the hint into the function and remove it from drivers over time. Signed-off-by: Heiner Kallweit Reviewed-by: Robin Murphy Signed-off-by: Christoph Hellwig --- include/linux/dma-mapping.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index e9d19b974f26..183e7103a66d 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -95,7 +95,7 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) { debug_dma_mapping_error(dev, dma_addr); - if (dma_addr == DMA_MAPPING_ERROR) + if (unlikely(dma_addr == DMA_MAPPING_ERROR)) return -ENOMEM; return 0; }