dma-mapping updates for Linux 5.3

- move the USB special case that bounced DMA through a device bar into the USB code instead of handling it in the common DMA code (Laurentiu Tudor and Fredrik Noring) - don't dip into the global CMA pool for single page allocations (Nicolin Chen) - fix a crash when allocating memory for the atomic pool failed during boot (Florian Fainelli) - move support for MIPS-style uncached segments to the common code and use that for MIPS and nios2 (me) - make support for DMA_ATTR_NON_CONSISTENT and DMA_ATTR_NO_KERNEL_MAPPING generic (me) - convert nds32 to the generic remapping allocator (me) -----BEGIN PGP SIGNATURE----- iQI/BAABCgApFiEEgdbnc3r/njty3Iq9D55TZVIEUYMFAl0nPqgLHGhjaEBsc3Qu ZGUACgkQD55TZVIEUYNj2hAAxIv2O3wv6V5xhzWwOVo8e/xW1ZLlGAF0/z92u0do 32Tm8jkdAGjZDnyxam7qisMSIjCNykpauQzVVxyUNBRSsn1V5t7KSaH3/OXCOVcr x2VWBirxGO2BbRseaCBjIcA/2qna+VIDGFcNXCtf6rM00YUK6qaJzkMwBKQAeYcM uJMJkaf8qaW4hygLJP8axXiGFdIJyFNLAlJ+ok6kYsJHHJNceOp0bo3CDa2mJBK9 IhraK2zVkyE5EQkQM5cE/Kw1ppPelUKUkHwjgM4wpz2b18WbLu11nKP0hmUcvKRQ heY8xWiKxN0QTgS03ou7EVylyrSAE4dIKgzuA4VO32QCGsWypcAg4iU6s5TX6p9g tZEW2ckE6wbmRdQPyKoDpZg299/eQjRHc4MAA1yinT8tFMokw2tk8Fq1FWyltwL1 8EiP5oNs2qUNvNgqUresl6/f6YOacFi1Q6IhgBVj6d6lyhMhlsHfW4w1XA1siv/I 6l4qJbLohYab6hY7i+mBOd8iG/KrAlr4P6admnv2jDchswbb5t2j+ABE9xv++PFi u1HFqMlxqdWQaXGca2UeCUxUjkwO9N+kHpP+VRz+6D2b64dtCWSu8CN23sYXm2tO ubWIlrQQZPhhMkoFg7XqKSTacd+ut+SXN9Nxsyv548ETV0l1xbiLRHIbhyoIESD5 RAI= =01Fr -----END PGP SIGNATURE----- Merge tag 'dma-mapping-5.3' of git://git.infradead.org/users/hch/dma-mapping Pull dma-mapping updates from Christoph Hellwig: - move the USB special case that bounced DMA through a device bar into the USB code instead of handling it in the common DMA code (Laurentiu Tudor and Fredrik Noring) - don't dip into the global CMA pool for single page allocations (Nicolin Chen) - fix a crash when allocating memory for the atomic pool failed during boot (Florian Fainelli) - move support for MIPS-style uncached segments to the common code and use that for MIPS and 
nios2 (me) - make support for DMA_ATTR_NON_CONSISTENT and DMA_ATTR_NO_KERNEL_MAPPING generic (me) - convert nds32 to the generic remapping allocator (me) * tag 'dma-mapping-5.3' of git://git.infradead.org/users/hch/dma-mapping: (29 commits) dma-mapping: mark dma_alloc_need_uncached as __always_inline MIPS: only select ARCH_HAS_UNCACHED_SEGMENT for non-coherent platforms usb: host: Fix excessive alignment restriction for local memory allocations lib/genalloc.c: Add algorithm, align and zeroed family of DMA allocators nios2: use the generic uncached segment support in dma-direct nds32: use the generic remapping allocator for coherent DMA allocations arc: use the generic remapping allocator for coherent DMA allocations dma-direct: handle DMA_ATTR_NO_KERNEL_MAPPING in common code dma-direct: handle DMA_ATTR_NON_CONSISTENT in common code dma-mapping: add a dma_alloc_need_uncached helper openrisc: remove the partial DMA_ATTR_NON_CONSISTENT support arc: remove the partial DMA_ATTR_NON_CONSISTENT support arm-nommu: remove the partial DMA_ATTR_NON_CONSISTENT support ARM: dma-mapping: allow larger DMA mask than supported dma-mapping: truncate dma masks to what dma_addr_t can hold iommu/dma: Apply dma_{alloc,free}_contiguous functions dma-remap: Avoid de-referencing NULL atomic_pool MIPS: use the generic uncached segment support in dma-direct dma-direct: provide generic support for uncached kernel segments au1100fb: fix DMA API abuse ...
2019-07-12 15:13:55 -07:00 · 2019-07-12 15:13:55 -07:00 · 9e3a25dc99
parent 9787aed57d 15ffe5e1ac
commit 9e3a25dc99
41 changed files with 519 additions and 658 deletions
--- a/arch/Kconfig
+++ b/arch/Kconfig
@ -260,6 +260,14 @@ config ARCH_HAS_SET_MEMORY
 config ARCH_HAS_SET_DIRECT_MAP
 	bool
 #
 # Select if arch has an uncached kernel segment and provides the
 # uncached_kernel_address / cached_kernel_address symbols to use it
 #
 config ARCH_HAS_UNCACHED_SEGMENT
 	select ARCH_HAS_DMA_PREP_COHERENT
 	bool
 # Select if arch init_task must go in the __init_task_data section
 config ARCH_TASK_STRUCT_ON_STACK
       bool
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@ -7,6 +7,7 @@ config ARC
 	def_bool y
 	select ARC_TIMERS
 	select ARCH_HAS_DMA_COHERENT_TO_PFN
 	select ARCH_HAS_DMA_PREP_COHERENT
 	select ARCH_HAS_PTE_SPECIAL
 	select ARCH_HAS_SETUP_DMA_OPS
 	select ARCH_HAS_SYNC_DMA_FOR_CPU
@ -16,6 +17,7 @@ config ARC
 	select BUILDTIME_EXTABLE_SORT
 	select CLONE_BACKWARDS
 	select COMMON_CLK
 	select DMA_DIRECT_REMAP
 	select GENERIC_ATOMIC64 if !ISA_ARCV2 || !(ARC_HAS_LL64 && ARC_HAS_LLSC)
 	select GENERIC_CLOCKEVENTS
 	select GENERIC_FIND_FIRST_BIT
--- a/arch/arc/mm/dma.c
+++ b/arch/arc/mm/dma.c
@ -8,51 +8,15 @@
 #include <asm/cacheflush.h>
 /*
- * ARCH specific callbacks for generic noncoherent DMA ops (dma/noncoherent.c)
+ * ARCH specific callbacks for generic noncoherent DMA ops
 *  - hardware IOC not available (or "dma-coherent" not set for device in DT)
 *  - But still handle both coherent and non-coherent requests from caller
 *
 * For DMA coherent hardware (IOC) generic code suffices
 */
-void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
+
-		gfp_t gfp, unsigned long attrs)
+void arch_dma_prep_coherent(struct page *page, size_t size)
 {
 	unsigned long order = get_order(size);
 	struct page *page;
 	phys_addr_t paddr;
 	void *kvaddr;
 	bool need_coh = !(attrs & DMA_ATTR_NON_CONSISTENT);
 	/*
 	 * __GFP_HIGHMEM flag is cleared by upper layer functions
 	 * (in include/linux/dma-mapping.h) so we should never get a
 	 * __GFP_HIGHMEM here.
 	 */
 	BUG_ON(gfp & __GFP_HIGHMEM);
 	page = alloc_pages(gfp | __GFP_ZERO, order);
 	if (!page)
 		return NULL;
 	/* This is linear addr (0x8000_0000 based) */
 	paddr = page_to_phys(page);
 	*dma_handle = paddr;
 	/*
 	 * A coherent buffer needs MMU mapping to enforce non-cachability.
 	 * kvaddr is kernel Virtual address (0x7000_0000 based).
 	 */
 	if (need_coh) {
 		kvaddr = ioremap_nocache(paddr, size);
 		if (kvaddr == NULL) {
 			__free_pages(page, order);
 			return NULL;
 		}
 	} else {
 		kvaddr = (void *)(u32)paddr;
 	}
 	/*
 	 * Evict any existing L1 and/or L2 lines for the backing page
 	 * in case it was used earlier as a normal "cached" page.
@ -63,28 +27,7 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
 	 * Currently flush_cache_vmap nukes the L1 cache completely which
 	 * will be optimized as a separate commit
 	 */
-	if (need_coh)
+	dma_cache_wback_inv(page_to_phys(page), size);
 		dma_cache_wback_inv(paddr, size);
 	return kvaddr;
 }
 /*
 * Release a DMA buffer: unless DMA_ATTR_NON_CONSISTENT is set in @attrs,
 * the mapping at @vaddr is torn down with iounmap(); the backing pages,
 * located from @dma_handle, are then returned with __free_pages().
 */
 void arch_dma_free(struct device *dev, size_t size, void *vaddr,
 		dma_addr_t dma_handle, unsigned long attrs)
 {
 	bool was_remapped = !(attrs & DMA_ATTR_NON_CONSISTENT);
 	phys_addr_t phys = dma_handle;
 	struct page *first = virt_to_page(phys);
 	if (was_remapped)
 		iounmap((void __force __iomem *)vaddr);
 	__free_pages(first, get_order(size));
 }
 /*
 * Convert a DMA handle to a page frame number via __phys_to_pfn().
 * @dev and @cpu_addr are not consulted.
 */
 long arch_dma_coherent_to_pfn(struct device *dev, void *cpu_addr,
 		dma_addr_t dma_addr)
 {
 	long pfn = __phys_to_pfn(dma_addr);
 	return pfn;
 }
 /*
@ -161,3 +104,9 @@ void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
 	dev_info(dev, "use %sncoherent DMA ops\n",
 		 dev->dma_coherent ? "" : "non");
 }
 /*
 * Postcore initcall: create the atomic DMA pool using GFP_KERNEL
 * allocations and the non-cached variant of PAGE_KERNEL.
 */
 static int __init atomic_pool_init(void)
 {
 	pgprot_t uncached_prot = pgprot_noncached(PAGE_KERNEL);
 	return dma_atomic_pool_init(GFP_KERNEL, uncached_prot);
 }
 postcore_initcall(atomic_pool_init);
--- a/arch/arm/mm/dma-mapping-nommu.c
+++ b/arch/arm/mm/dma-mapping-nommu.c
@ -35,18 +35,7 @@ static void *arm_nommu_dma_alloc(struct device *dev, size_t size,
 				 unsigned long attrs)
 {
-	void *ret;
+	void *ret = dma_alloc_from_global_coherent(size, dma_handle);
 	/*
 	 * Try generic allocator first if we are advertised that
 	 * consistency is not required.
 	 */
 	if (attrs & DMA_ATTR_NON_CONSISTENT)
 		return dma_direct_alloc_pages(dev, size, dma_handle, gfp,
 				attrs);
 	ret = dma_alloc_from_global_coherent(size, dma_handle);
 	/*
 	 * dma_alloc_from_global_coherent() may fail because:
@ -66,16 +55,9 @@ static void arm_nommu_dma_free(struct device *dev, size_t size,
 			       void *cpu_addr, dma_addr_t dma_addr,
 			       unsigned long attrs)
 {
-	if (attrs & DMA_ATTR_NON_CONSISTENT) {
+	int ret = dma_release_from_global_coherent(get_order(size), cpu_addr);
 		dma_direct_free_pages(dev, size, cpu_addr, dma_addr, attrs);
 	} else {
 		int ret = dma_release_from_global_coherent(get_order(size),
 							   cpu_addr);
 	WARN_ON_ONCE(ret == 0);
 	}
 	return;
 }
 static int arm_nommu_dma_mmap(struct device *dev, struct vm_area_struct *vma,
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@ -216,25 +216,7 @@ EXPORT_SYMBOL(arm_coherent_dma_ops);
 static int __dma_supported(struct device *dev, u64 mask, bool warn)
 {
-	unsigned long max_dma_pfn;
+	unsigned long max_dma_pfn = min(max_pfn, arm_dma_pfn_limit);
 	/*
 	 * If the mask allows for more memory than we can address,
 	 * and we actually have that much memory, then we must
 	 * indicate that DMA to this device is not supported.
 	 */
 	if (sizeof(mask) != sizeof(dma_addr_t) &&
 	    mask > (dma_addr_t)~0 &&
 	    dma_to_pfn(dev, ~0) < max_pfn - 1) {
 		if (warn) {
 			dev_warn(dev, "Coherent DMA mask %#llx is larger than dma_addr_t allows\n",
 				 mask);
 			dev_warn(dev, "Driver did not use or check the return value from dma_set_coherent_mask()?\n");
 		}
 		return 0;
 	}
 	max_dma_pfn = min(max_pfn, arm_dma_pfn_limit);
 	/*
 	 * Translate the device's DMA mask to a PFN limit.  This
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@ -1121,6 +1121,7 @@ config DMA_NONCOHERENT
 	bool
 	select ARCH_HAS_DMA_MMAP_PGPROT
 	select ARCH_HAS_SYNC_DMA_FOR_DEVICE
 	select ARCH_HAS_UNCACHED_SEGMENT
 	select NEED_DMA_MAP_STATE
 	select ARCH_HAS_DMA_COHERENT_TO_PFN
 	select DMA_NONCOHERENT_CACHE_SYNC
--- a/arch/mips/include/asm/page.h
+++ b/arch/mips/include/asm/page.h
@ -258,9 +258,6 @@ extern bool __virt_addr_valid(const volatile void *kaddr);
 	 ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0) | \
 	 VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
 #define UNCAC_ADDR(addr)	(UNCAC_BASE + __pa(addr))
 #define CAC_ADDR(addr)		((unsigned long)__va((addr) - UNCAC_BASE))
 #include <asm-generic/memory_model.h>
 #include <asm-generic/getorder.h>
--- a/arch/mips/jazz/jazzdma.c
+++ b/arch/mips/jazz/jazzdma.c
@ -575,10 +575,6 @@ static void *jazz_dma_alloc(struct device *dev, size_t size,
 		return NULL;
 	}
 	if (!(attrs & DMA_ATTR_NON_CONSISTENT)) {
 		dma_cache_wback_inv((unsigned long)ret, size);
 		ret = (void *)UNCAC_ADDR(ret);
 	}
 	return ret;
 }
@ -586,8 +582,6 @@ static void jazz_dma_free(struct device *dev, size_t size, void *vaddr,
 		dma_addr_t dma_handle, unsigned long attrs)
 {
 	vdma_free(dma_handle);
 	if (!(attrs & DMA_ATTR_NON_CONSISTENT))
 		vaddr = (void *)CAC_ADDR((unsigned long)vaddr);
 	dma_direct_free_pages(dev, size, vaddr, dma_handle, attrs);
 }
--- a/arch/mips/mm/cache.c
+++ b/arch/mips/mm/cache.c
@ -62,8 +62,6 @@ void (*_dma_cache_wback_inv)(unsigned long start, unsigned long size);
 void (*_dma_cache_wback)(unsigned long start, unsigned long size);
 void (*_dma_cache_inv)(unsigned long start, unsigned long size);
 EXPORT_SYMBOL(_dma_cache_wback_inv);
 #endif /* CONFIG_DMA_NONCOHERENT */
 /*
--- a/arch/mips/mm/dma-noncoherent.c
+++ b/arch/mips/mm/dma-noncoherent.c
@ -44,33 +44,25 @@ static inline bool cpu_needs_post_dma_flush(struct device *dev)
 	}
 }
-void *arch_dma_alloc(struct device *dev, size_t size,
+void arch_dma_prep_coherent(struct page *page, size_t size)
 		dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
 {
-	void *ret;
+	dma_cache_wback_inv((unsigned long)page_address(page), size);
 	ret = dma_direct_alloc_pages(dev, size, dma_handle, gfp, attrs);
 	if (ret && !(attrs & DMA_ATTR_NON_CONSISTENT)) {
 		dma_cache_wback_inv((unsigned long) ret, size);
 		ret = (void *)UNCAC_ADDR(ret);
 	}
 	return ret;
 }
-void arch_dma_free(struct device *dev, size_t size, void *cpu_addr,
+void *uncached_kernel_address(void *addr)
 		dma_addr_t dma_addr, unsigned long attrs)
 {
-	if (!(attrs & DMA_ATTR_NON_CONSISTENT))
+	return (void *)(__pa(addr) + UNCAC_BASE);
-		cpu_addr = (void *)CAC_ADDR((unsigned long)cpu_addr);
+}
-	dma_direct_free_pages(dev, size, cpu_addr, dma_addr, attrs);
+
 /*
 * Translate @addr back to its cached counterpart: run it through __va()
 * and subtract UNCAC_BASE from the result (pointer arithmetic on the
 * value __va() yields -- presumably a void *, TODO confirm).
 */
 void *cached_kernel_address(void *addr)
 {
 	void *shifted = __va(addr);
 	return shifted - UNCAC_BASE;
 }
 long arch_dma_coherent_to_pfn(struct device *dev, void *cpu_addr,
 		dma_addr_t dma_addr)
 {
-	unsigned long addr = CAC_ADDR((unsigned long)cpu_addr);
+	return page_to_pfn(virt_to_page(cached_kernel_address(cpu_addr)));
 	return page_to_pfn(virt_to_page((void *)addr));
 }
 pgprot_t arch_dma_mmap_pgprot(struct device *dev, pgprot_t prot,
--- a/arch/nds32/Kconfig
+++ b/arch/nds32/Kconfig
@ -7,12 +7,14 @@
 config NDS32
 	def_bool y
 	select ARCH_32BIT_OFF_T
 	select ARCH_HAS_DMA_PREP_COHERENT
 	select ARCH_HAS_SYNC_DMA_FOR_CPU
 	select ARCH_HAS_SYNC_DMA_FOR_DEVICE
 	select ARCH_WANT_FRAME_POINTERS if FTRACE
 	select CLKSRC_MMIO
 	select CLONE_BACKWARDS
 	select COMMON_CLK
 	select DMA_DIRECT_REMAP
 	select GENERIC_ATOMIC64
 	select GENERIC_CPU_DEVICES
 	select GENERIC_CLOCKEVENTS
--- a/arch/nds32/kernel/dma.c
+++ b/arch/nds32/kernel/dma.c
@ -3,327 +3,13 @@
 #include <linux/types.h>
 #include <linux/mm.h>
 #include <linux/string.h>
 #include <linux/dma-noncoherent.h>
 #include <linux/io.h>
 #include <linux/cache.h>
 #include <linux/highmem.h>
 #include <linux/slab.h>
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 #include <asm/proc-fns.h>
 /*
 * This is the page table (2MB) covering uncached, DMA consistent allocations
 */
 static pte_t *consistent_pte;
 static DEFINE_RAW_SPINLOCK(consistent_lock);
 /*
 * VM region handling support.
 *
 * This should become something generic, handling VM region allocations for
 * vmalloc and similar (ioremap, module space, etc).
 *
 * I envisage vmalloc()'s supporting vm_struct becoming:
 *
 *  struct vm_struct {
 *    struct vm_region	region;
 *    unsigned long	flags;
 *    struct page	**pages;
 *    unsigned int	nr_pages;
 *    unsigned long	phys_addr;
 *  };
 *
 * get_vm_area() would then call vm_region_alloc with an appropriate
 * struct vm_region head (eg):
 *
 *  struct vm_region vmalloc_head = {
 *	.vm_list	= LIST_HEAD_INIT(vmalloc_head.vm_list),
 *	.vm_start	= VMALLOC_START,
 *	.vm_end		= VMALLOC_END,
 *  };
 *
 * However, vmalloc_head.vm_start is variable (typically, it is dependent on
 * the amount of RAM found at boot time.)  I would imagine that get_vm_area()
 * would have to initialise this each time prior to calling vm_region_alloc().
 */
 /*
 * Bookkeeping record for one address range inside the consistent mapping
 * area.  Records are created by vm_region_alloc() and looked up by
 * vm_region_find().
 */
 struct arch_vm_region {
 	/* Link in the list that hangs off the head's vm_list. */
 	struct list_head vm_list;
 	/* First address of the range. */
 	unsigned long vm_start;
 	/* End address of the range (vm_region_alloc() sets it to start + size). */
 	unsigned long vm_end;
 	/* First backing page; filled in by arch_dma_alloc(). */
 	struct page *vm_pages;
 };
 /*
 * List head for all allocated regions; its vm_start/vm_end bound the
 * search window (CONSISTENT_BASE .. CONSISTENT_END) used by
 * vm_region_alloc().
 */
 static struct arch_vm_region consistent_head = {
 	.vm_list = LIST_HEAD_INIT(consistent_head.vm_list),
 	.vm_start = CONSISTENT_BASE,
 	.vm_end = CONSISTENT_END,
 };
 /*
 * Reserve @size bytes of address space between head->vm_start and
 * head->vm_end using a first-fit walk over the regions already on
 * head->vm_list.
 *
 * @head: list head whose vm_start/vm_end bound the search
 * @size: number of bytes to reserve
 * @gfp:  allocation flags passed to kmalloc() for the new record
 *
 * Returns the new region (linked into the list, vm_start/vm_end set) or
 * NULL if the record could not be allocated or no gap was found.
 */
 static struct arch_vm_region *vm_region_alloc(struct arch_vm_region *head,
 					      size_t size, int gfp)
 {
 	/* 'end' is the highest start address at which @size still fits. */
 	unsigned long addr = head->vm_start, end = head->vm_end - size;
 	unsigned long flags;
 	struct arch_vm_region *c, *new;
 	/* Allocate the record before taking the lock. */
 	new = kmalloc(sizeof(struct arch_vm_region), gfp);
 	if (!new)
 		goto out;
 	raw_spin_lock_irqsave(&consistent_lock, flags);
 	list_for_each_entry(c, &head->vm_list, vm_list) {
 		/* addr + size wrapped around: nothing can fit. */
 		if ((addr + size) < addr)
 			goto nospc;
 		/* The gap in front of region 'c' is large enough. */
 		if ((addr + size) <= c->vm_start)
 			goto found;
 		/* Otherwise continue the search right after region 'c'. */
 		addr = c->vm_end;
 		if (addr > end)
 			goto nospc;
 	}
 	/*
 	 * Falling out of the loop also lands on 'found': 'c' is then whatever
 	 * list_for_each_entry() leaves behind at termination, and the new
 	 * entry is linked in front of that.
 	 */
 found:
 	/*
 	 * Insert this entry _before_ the one we found.
 	 */
 	list_add_tail(&new->vm_list, &c->vm_list);
 	new->vm_start = addr;
 	new->vm_end = addr + size;
 	raw_spin_unlock_irqrestore(&consistent_lock, flags);
 	return new;
 nospc:
 	/* No room: drop the lock, then free the unused record. */
 	raw_spin_unlock_irqrestore(&consistent_lock, flags);
 	kfree(new);
 out:
 	return NULL;
 }
 static struct arch_vm_region *vm_region_find(struct arch_vm_region *head,
 					     unsigned long addr)
 {
 	struct arch_vm_region *c;
 	list_for_each_entry(c, &head->vm_list, vm_list) {
 		if (c->vm_start == addr)
 			goto out;
 	}
 	c = NULL;
 out:
 	return c;
 }
 /*
 * Allocate a zeroed buffer and map it with non-cached page protection
 * into the consistent mapping region.
 *
 * @dev:    device the buffer is for; may be NULL, in which case an
 *          all-ones mask is assumed
 * @size:   requested size in bytes, rounded up to whole pages
 * @handle: receives page_to_phys() of the first page on success and ~0
 *          on the failure paths that go through no_page
 * @gfp:    allocation flags; GFP_DMA is added when the coherent mask is
 *          not 0xffffffff
 * @attrs:  not consulted by this implementation
 *
 * Returns the start address of the new mapping, or NULL on failure.  If
 * consistent_pte has not been set up yet, NULL is returned without
 * touching *handle.
 */
 void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
 		gfp_t gfp, unsigned long attrs)
 {
 	struct page *page;
 	struct arch_vm_region *c;
 	unsigned long order;
 	u64 mask = ~0ULL, limit;
 	pgprot_t prot = pgprot_noncached(PAGE_KERNEL);
 	if (!consistent_pte) {
 		pr_err("%s: not initialized\n", __func__);
 		dump_stack();
 		return NULL;
 	}
 	if (dev) {
 		mask = dev->coherent_dma_mask;
 		/*
 		 * Sanity check the DMA mask - it must be non-zero, and
 		 * must be able to be satisfied by a DMA allocation.
 		 */
 		if (mask == 0) {
 			dev_warn(dev, "coherent DMA mask is unset\n");
 			goto no_page;
 		}
 	}
 	/*
 	 * Sanity check the allocation size.
 	 */
 	size = PAGE_ALIGN(size);
 	/* Lowest zero bit of the mask; 0 when the mask is all ones. */
 	limit = (mask + 1) & ~mask;
 	if ((limit && size >= limit) ||
 	    size >= (CONSISTENT_END - CONSISTENT_BASE)) {
 		/* size is a size_t, so it must be printed with the 'z' modifier. */
 		pr_warn("coherent allocation too big "
 			"(requested %#zx mask %#llx)\n", size, mask);
 		goto no_page;
 	}
 	order = get_order(size);
 	if (mask != 0xffffffff)
 		gfp |= GFP_DMA;
 	page = alloc_pages(gfp, order);
 	if (!page)
 		goto no_page;
 	/*
 	 * Invalidate any data that might be lurking in the
 	 * kernel direct-mapped region for device DMA.
 	 */
 	{
 		unsigned long kaddr = (unsigned long)page_address(page);
 		memset(page_address(page), 0, size);
 		cpu_dma_wbinval_range(kaddr, kaddr + size);
 	}
 	/*
 	 * Allocate a virtual address in the consistent mapping region.
 	 */
 	c = vm_region_alloc(&consistent_head, size,
 			    gfp & ~(__GFP_DMA | __GFP_HIGHMEM));
 	if (c) {
 		pte_t *pte = consistent_pte + CONSISTENT_OFFSET(c->vm_start);
 		struct page *end = page + (1 << order);
 		c->vm_pages = page;
 		/*
 		 * Set the "dma handle"
 		 */
 		*handle = page_to_phys(page);
 		/* Map one page per iteration until the aligned size is used up. */
 		do {
 			BUG_ON(!pte_none(*pte));
 			/*
 			 * x86 does not mark the pages reserved...
 			 */
 			SetPageReserved(page);
 			set_pte(pte, mk_pte(page, prot));
 			page++;
 			pte++;
 		} while (size -= PAGE_SIZE);
 		/*
 		 * Free the otherwise unused pages.
 		 */
 		while (page < end) {
 			__free_page(page);
 			page++;
 		}
 		return (void *)c->vm_start;
 	}
 	/* page is known to be non-NULL here, so it can be freed unconditionally. */
 	__free_pages(page, order);
 no_page:
 	*handle = ~0;
 	return NULL;
 }
 /*
 * Undo arch_dma_alloc(): find the region that starts at @cpu_addr, clear
 * its page table entries, free the pages they pointed to, flush the
 * kernel TLB range and drop the region record.
 *
 * @dev:      not consulted
 * @size:     size passed at allocation time; rounded up to whole pages.
 *            If it disagrees with the recorded region size, an error is
 *            logged and the recorded size is used instead.
 * @cpu_addr: address previously returned by arch_dma_alloc()
 * @handle:   not consulted
 * @attrs:    not consulted
 *
 * An unknown @cpu_addr is reported with pr_err()/dump_stack() and
 * otherwise ignored.
 */
 void arch_dma_free(struct device *dev, size_t size, void *cpu_addr,
 		dma_addr_t handle, unsigned long attrs)
 {
 	struct arch_vm_region *c;
 	unsigned long flags, addr;
 	pte_t *ptep;
 	size = PAGE_ALIGN(size);
 	raw_spin_lock_irqsave(&consistent_lock, flags);
 	c = vm_region_find(&consistent_head, (unsigned long)cpu_addr);
 	if (!c)
 		goto no_area;
 	if ((c->vm_end - c->vm_start) != size) {
 		/* Both values are unsigned: unsigned long and size_t. */
 		pr_err("%s: freeing wrong coherent size (%lu != %zu)\n",
 		       __func__, c->vm_end - c->vm_start, size);
 		dump_stack();
 		size = c->vm_end - c->vm_start;
 	}
 	ptep = consistent_pte + CONSISTENT_OFFSET(c->vm_start);
 	addr = c->vm_start;
 	/* One page table entry per iteration until the region is covered. */
 	do {
 		pte_t pte = ptep_get_and_clear(&init_mm, addr, ptep);
 		unsigned long pfn;
 		ptep++;
 		addr += PAGE_SIZE;
 		if (!pte_none(pte) && pte_present(pte)) {
 			pfn = pte_pfn(pte);
 			if (pfn_valid(pfn)) {
 				struct page *page = pfn_to_page(pfn);
 				/*
 				 * x86 does not mark the pages reserved...
 				 */
 				ClearPageReserved(page);
 				__free_page(page);
 				continue;
 			}
 		}
 		pr_crit("%s: bad page in kernel page table\n", __func__);
 	} while (size -= PAGE_SIZE);
 	flush_tlb_kernel_range(c->vm_start, c->vm_end);
 	list_del(&c->vm_list);
 	raw_spin_unlock_irqrestore(&consistent_lock, flags);
 	/* The record is freed only after the lock has been released. */
 	kfree(c);
 	return;
 no_area:
 	raw_spin_unlock_irqrestore(&consistent_lock, flags);
 	pr_err("%s: trying to free invalid coherent area: %p\n",
 	       __func__, cpu_addr);
 	dump_stack();
 }
 /*
 * Core initcall that prepares the consistent allocation area: allocate
 * the pmd and the kernel pte table covering CONSISTENT_BASE and record
 * the pte table in consistent_pte.  Returns 0 on success, -ENOMEM if
 * either allocation fails.
 */
 static int __init consistent_init(void)
 {
 	pgd_t *pgd = pgd_offset(&init_mm, CONSISTENT_BASE);
 	pmd_t *pmd = pmd_alloc(&init_mm, pgd, CONSISTENT_BASE);
 	pte_t *pte;
 	if (!pmd) {
 		pr_err("%s: no pmd tables\n", __func__);
 		return -ENOMEM;
 	}
 	/*
 	 * The first-level mapping may already have been created elsewhere,
 	 * so a populated pmd is deliberately not warned about.
 	 */
 	pte = pte_alloc_kernel(pmd, CONSISTENT_BASE);
 	if (!pte)
 		return -ENOMEM;
 	consistent_pte = pte;
 	return 0;
 }
 core_initcall(consistent_init);
 static inline void cache_op(phys_addr_t paddr, size_t size,
 		void (*fn)(unsigned long start, unsigned long end))
 {
@ -389,3 +75,14 @@ void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr,
 		BUG();
 	}
 }
 /*
 * Run cpu_dma_wbinval_range over the @size bytes starting at the
 * physical address of @page, via cache_op().
 */
 void arch_dma_prep_coherent(struct page *page, size_t size)
 {
 	phys_addr_t start = page_to_phys(page);
 	cache_op(start, size, cpu_dma_wbinval_range);
 }
 /*
 * Postcore initcall: initialise the atomic DMA pool with GFP_KERNEL and
 * a non-cached PAGE_KERNEL protection.
 */
 static int __init atomic_pool_init(void)
 {
 	pgprot_t pool_prot = pgprot_noncached(PAGE_KERNEL);
 	return dma_atomic_pool_init(GFP_KERNEL, pool_prot);
 }
 postcore_initcall(atomic_pool_init);
--- a/arch/nios2/Kconfig
+++ b/arch/nios2/Kconfig
@ -4,6 +4,7 @@ config NIOS2
 	select ARCH_32BIT_OFF_T
 	select ARCH_HAS_SYNC_DMA_FOR_CPU
 	select ARCH_HAS_SYNC_DMA_FOR_DEVICE
 	select ARCH_HAS_UNCACHED_SEGMENT
 	select ARCH_NO_SWAP
 	select TIMER_OF
 	select GENERIC_ATOMIC64
--- a/arch/nios2/include/asm/page.h
+++ b/arch/nios2/include/asm/page.h
@ -101,12 +101,6 @@ static inline bool pfn_valid(unsigned long pfn)
 # define VM_DATA_DEFAULT_FLAGS	(VM_READ | VM_WRITE | \
 				 VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
 # define UNCAC_ADDR(addr)	\
 	((void *)((unsigned)(addr) | CONFIG_NIOS2_IO_REGION_BASE))
 # define CAC_ADDR(addr)		\
 	((void *)(((unsigned)(addr) & ~CONFIG_NIOS2_IO_REGION_BASE) |	\
 		CONFIG_NIOS2_KERNEL_REGION_BASE))
 #include <asm-generic/memory_model.h>
 #include <asm-generic/getorder.h>
--- a/arch/nios2/mm/dma-mapping.c
+++ b/arch/nios2/mm/dma-mapping.c
@ -60,32 +60,28 @@ void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr,
 	}
 }
-void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
+void arch_dma_prep_coherent(struct page *page, size_t size)
 		gfp_t gfp, unsigned long attrs)
 {
-	void *ret;
+	unsigned long start = (unsigned long)page_address(page);
-	/* optimized page clearing */
+	flush_dcache_range(start, start + size);
 	gfp |= __GFP_ZERO;
 	if (dev == NULL || (dev->coherent_dma_mask < 0xffffffff))
 		gfp |= GFP_DMA;
 	ret = (void *) __get_free_pages(gfp, get_order(size));
 	if (ret != NULL) {
 		*dma_handle = virt_to_phys(ret);
 		flush_dcache_range((unsigned long) ret,
 			(unsigned long) ret + size);
 		ret = UNCAC_ADDR(ret);
 	}
 	return ret;
 }
-void arch_dma_free(struct device *dev, size_t size, void *vaddr,
+void *uncached_kernel_address(void *ptr)
 		dma_addr_t dma_handle, unsigned long attrs)
 {
-	unsigned long addr = (unsigned long) CAC_ADDR((unsigned long) vaddr);
+	unsigned long addr = (unsigned long)ptr;
-	free_pages(addr, get_order(size));
+	addr |= CONFIG_NIOS2_IO_REGION_BASE;
 	return (void *)ptr;
 }
 /*
 * Translate a pointer into the cached kernel region: clear the
 * CONFIG_NIOS2_IO_REGION_BASE bits and set the
 * CONFIG_NIOS2_KERNEL_REGION_BASE bits of @ptr.
 *
 * Returns the translated address.
 */
 void *cached_kernel_address(void *ptr)
 {
 	unsigned long addr = (unsigned long)ptr;
 	addr &= ~CONFIG_NIOS2_IO_REGION_BASE;
 	addr |= CONFIG_NIOS2_KERNEL_REGION_BASE;
 	/* Hand back the adjusted address, not the untouched input pointer. */
 	return (void *)addr;
 }
--- a/arch/openrisc/kernel/dma.c
+++ b/arch/openrisc/kernel/dma.c
@ -94,7 +94,6 @@ arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
 	va = (unsigned long)page;
 	if ((attrs & DMA_ATTR_NON_CONSISTENT) == 0) {
 	/*
 	 * We need to iterate through the pages, clearing the dcache for
 	 * them and setting the cache-inhibit bit.
@ -103,7 +102,6 @@ arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
 		free_pages_exact(page, size);
 		return NULL;
 	}
 	}
 	return (void *)va;
 }
@ -118,10 +116,8 @@ arch_dma_free(struct device *dev, size_t size, void *vaddr,
 		.mm = &init_mm
 	};
 	if ((attrs & DMA_ATTR_NON_CONSISTENT) == 0) {
 	/* walk_page_range shouldn't be able to fail here */
 	WARN_ON(walk_page_range(va, va + size, &walk));
 	}
 	free_pages_exact(vaddr, size);
 }
--- a/arch/parisc/kernel/pci-dma.c
+++ b/arch/parisc/kernel/pci-dma.c
@ -394,17 +394,20 @@ pcxl_dma_init(void)
 __initcall(pcxl_dma_init);
-static void *pcxl_dma_alloc(struct device *dev, size_t size,
+void *arch_dma_alloc(struct device *dev, size_t size,
-		dma_addr_t *dma_handle, gfp_t flag, unsigned long attrs)
+		dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
 {
 	unsigned long vaddr;
 	unsigned long paddr;
 	int order;
 	if (boot_cpu_data.cpu_type != pcxl2 && boot_cpu_data.cpu_type != pcxl)
 		return NULL;
 	order = get_order(size);
 	size = 1 << (order + PAGE_SHIFT);
 	vaddr = pcxl_alloc_range(size);
-	paddr = __get_free_pages(flag | __GFP_ZERO, order);
+	paddr = __get_free_pages(gfp | __GFP_ZERO, order);
 	flush_kernel_dcache_range(paddr, size);
 	paddr = __pa(paddr);
 	map_uncached_pages(vaddr, size, paddr);
@ -421,44 +424,19 @@ static void *pcxl_dma_alloc(struct device *dev, size_t size,
 	return (void *)vaddr;
 }
 /*
 * Allocation path that only serves DMA_ATTR_NON_CONSISTENT requests:
 * returns zeroed pages from __get_free_pages() and stores their physical
 * address in *dma_handle.  Returns NULL when the attribute is not set or
 * the page allocation fails (*dma_handle is left untouched then).
 */
 static void *pcx_dma_alloc(struct device *dev, size_t size,
 		dma_addr_t *dma_handle, gfp_t flag, unsigned long attrs)
 {
 	unsigned long pages;
 	if (!(attrs & DMA_ATTR_NON_CONSISTENT))
 		return NULL;
 	pages = __get_free_pages(flag | __GFP_ZERO, get_order(size));
 	if (!pages)
 		return NULL;
 	*dma_handle = (dma_addr_t)virt_to_phys((void *)pages);
 	return (void *)pages;
 }
 /*
 * Dispatch on the boot CPU type: pcxl and pcxl2 use pcxl_dma_alloc(),
 * every other CPU type uses pcx_dma_alloc().
 */
 void *arch_dma_alloc(struct device *dev, size_t size,
 		dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
 {
 	bool is_pcxl = boot_cpu_data.cpu_type == pcxl2 ||
 		       boot_cpu_data.cpu_type == pcxl;
 	if (!is_pcxl)
 		return pcx_dma_alloc(dev, size, dma_handle, gfp, attrs);
 	return pcxl_dma_alloc(dev, size, dma_handle, gfp, attrs);
 }
 void arch_dma_free(struct device *dev, size_t size, void *vaddr,
 		dma_addr_t dma_handle, unsigned long attrs)
 {
 	int order = get_order(size);
-	if (boot_cpu_data.cpu_type == pcxl2 || boot_cpu_data.cpu_type == pcxl) {
+	WARN_ON_ONCE(boot_cpu_data.cpu_type != pcxl2 &&
 		     boot_cpu_data.cpu_type != pcxl);
 	size = 1 << (order + PAGE_SHIFT);
 	unmap_uncached_pages((unsigned long)vaddr, size);
 	pcxl_free_range((unsigned long)vaddr, size);
-		vaddr = __va(dma_handle);
+	free_pages((unsigned long)__va(dma_handle), order);
 	}
 	free_pages((unsigned long)vaddr, get_order(size));
 }
 void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr,
--- a/arch/xtensa/kernel/pci-dma.c
+++ b/arch/xtensa/kernel/pci-dma.c
@ -163,10 +163,6 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
 	*handle = phys_to_dma(dev, page_to_phys(page));
 	if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) {
 		return page;
 	}
 #ifdef CONFIG_MMU
 	if (PageHighMem(page)) {
 		void *p;
@ -192,9 +188,7 @@ void arch_dma_free(struct device *dev, size_t size, void *vaddr,
 	unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT;
 	struct page *page;
-	if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) {
+	if (platform_vaddr_uncached(vaddr)) {
 		page = vaddr;
 	} else if (platform_vaddr_uncached(vaddr)) {
 		page = virt_to_page(platform_vaddr_to_cached(vaddr));
 	} else {
 #ifdef CONFIG_MMU
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@ -951,8 +951,8 @@ static void __iommu_dma_free(struct device *dev, size_t size, void *cpu_addr)
 	if (pages)
 		__iommu_dma_free_pages(pages, count);
-	if (page && !dma_release_from_contiguous(dev, page, count))
+	if (page)
-		__free_pages(page, get_order(alloc_size));
+		dma_free_contiguous(dev, page, alloc_size);
 }
 static void iommu_dma_free(struct device *dev, size_t size, void *cpu_addr,
@ -970,12 +970,7 @@ static void *iommu_dma_alloc_pages(struct device *dev, size_t size,
 	struct page *page = NULL;
 	void *cpu_addr;
-	if (gfpflags_allow_blocking(gfp))
+	page = dma_alloc_contiguous(dev, alloc_size, gfp);
 		page = dma_alloc_from_contiguous(dev, alloc_size >> PAGE_SHIFT,
 						 get_order(alloc_size),
 						 gfp & __GFP_NOWARN);
 	if (!page)
 		page = alloc_pages(gfp, get_order(alloc_size));
 	if (!page)
 		return NULL;
@ -997,8 +992,7 @@ static void *iommu_dma_alloc_pages(struct device *dev, size_t size,
 	memset(cpu_addr, 0, alloc_size);
 	return cpu_addr;
 out_free_pages:
-	if (!dma_release_from_contiguous(dev, page, alloc_size >> PAGE_SHIFT))
+	dma_free_contiguous(dev, page, alloc_size);
 		__free_pages(page, get_order(alloc_size));
 	return NULL;
 }
--- a/drivers/usb/Kconfig
+++ b/drivers/usb/Kconfig
@ -45,6 +45,7 @@ config USB_ARCH_HAS_HCD
 config USB
 	tristate "Support for Host-side USB"
 	depends on USB_ARCH_HAS_HCD
 	select GENERIC_ALLOCATOR
 	select USB_COMMON
 	select NLS  # for UTF-8 strings
 	---help---
--- a/drivers/usb/core/buffer.c
+++ b/drivers/usb/core/buffer.c
@ -16,6 +16,7 @@
 #include <linux/io.h>
 #include <linux/dma-mapping.h>
 #include <linux/dmapool.h>
 #include <linux/genalloc.h>
 #include <linux/usb.h>
 #include <linux/usb/hcd.h>
@ -67,7 +68,7 @@ int hcd_buffer_create(struct usb_hcd *hcd)
 	if (!IS_ENABLED(CONFIG_HAS_DMA) ||
 	    (!is_device_dma_capable(hcd->self.sysdev) &&
-	     !(hcd->driver->flags & HCD_LOCAL_MEM)))
+	     !hcd->localmem_pool))
 		return 0;
 	for (i = 0; i < HCD_BUFFER_POOLS; i++) {
@ -124,10 +125,12 @@ void *hcd_buffer_alloc(
 	if (size == 0)
 		return NULL;
 	if (hcd->localmem_pool)
 		return gen_pool_dma_alloc(hcd->localmem_pool, size, dma);
 	/* some USB hosts just use PIO */
 	if (!IS_ENABLED(CONFIG_HAS_DMA) ||
-	    (!is_device_dma_capable(bus->sysdev) &&
+	    !is_device_dma_capable(bus->sysdev)) {
 	     !(hcd->driver->flags & HCD_LOCAL_MEM))) {
 		*dma = ~(dma_addr_t) 0;
 		return kmalloc(size, mem_flags);
 	}
@ -152,9 +155,13 @@ void hcd_buffer_free(
 	if (!addr)
 		return;
 	if (hcd->localmem_pool) {
 		gen_pool_free(hcd->localmem_pool, (unsigned long)addr, size);
 		return;
 	}
 	if (!IS_ENABLED(CONFIG_HAS_DMA) ||
-	    (!is_device_dma_capable(bus->sysdev) &&
+	    !is_device_dma_capable(bus->sysdev)) {
 	     !(hcd->driver->flags & HCD_LOCAL_MEM))) {
 		kfree(addr);
 		return;
 	}
--- a/drivers/usb/core/hcd.c
+++ b/drivers/usb/core/hcd.c
@ -29,6 +29,8 @@
 #include <linux/workqueue.h>
 #include <linux/pm_runtime.h>
 #include <linux/types.h>
 #include <linux/genalloc.h>
 #include <linux/io.h>
 #include <linux/phy/phy.h>
 #include <linux/usb.h>
@ -1345,14 +1347,14 @@ EXPORT_SYMBOL_GPL(usb_hcd_unlink_urb_from_ep);
 * using regular system memory - like pci devices doing bus mastering.
 *
 * To support host controllers with limited dma capabilities we provide dma
- * bounce buffers. This feature can be enabled using the HCD_LOCAL_MEM flag.
+ * bounce buffers. This feature can be enabled by initializing
 * hcd->localmem_pool using usb_hcd_setup_local_mem().
 * For this to work properly the host controller code must first use the
 * function dma_declare_coherent_memory() to point out which memory area
 * that should be used for dma allocations.
 *
- * The HCD_LOCAL_MEM flag then tells the usb code to allocate all data for
+ * The initialized hcd->localmem_pool then tells the usb code to allocate all
- * dma using dma_alloc_coherent() which in turn allocates from the memory
+ * data for dma using the genalloc API.
 * area pointed out with dma_declare_coherent_memory().
 *
 * So, to summarize...
 *
@ -1362,9 +1364,6 @@ EXPORT_SYMBOL_GPL(usb_hcd_unlink_urb_from_ep);
 *   (a) "normal" kernel memory is no good, and
 *   (b) there's not enough to share
 *
 * - The only *portable* hook for such stuff in the
 *   DMA framework is dma_declare_coherent_memory()
 *
 * - So we use that, even though the primary requirement
 *   is that the memory be "local" (hence addressable
 *   by that device), not "coherent".
@ -1531,7 +1530,7 @@ int usb_hcd_map_urb_for_dma(struct usb_hcd *hcd, struct urb *urb,
 						urb->setup_dma))
 				return -EAGAIN;
 			urb->transfer_flags |= URB_SETUP_MAP_SINGLE;
-		} else if (hcd->driver->flags & HCD_LOCAL_MEM) {
+		} else if (hcd->localmem_pool) {
 			ret = hcd_alloc_coherent(
 					urb->dev->bus, mem_flags,
 					&urb->setup_dma,
@ -1601,7 +1600,7 @@ int usb_hcd_map_urb_for_dma(struct usb_hcd *hcd, struct urb *urb,
 				else
 					urb->transfer_flags |= URB_DMA_MAP_SINGLE;
 			}
-		} else if (hcd->driver->flags & HCD_LOCAL_MEM) {
+		} else if (hcd->localmem_pool) {
 			ret = hcd_alloc_coherent(
 					urb->dev->bus, mem_flags,
 					&urb->transfer_dma,
@ -3039,6 +3038,40 @@ usb_hcd_platform_shutdown(struct platform_device *dev)
 }
 EXPORT_SYMBOL_GPL(usb_hcd_platform_shutdown);
 /**
  * usb_hcd_setup_local_mem - set up a genalloc pool over HC-local memory
  * @hcd:	host controller whose localmem_pool is being initialized
  * @phys_addr:	CPU physical address of the local memory, used for the mapping
  * @dma:	address of the same memory as the controller sees it
  * @size:	size of the local memory region in bytes
  *
  * Creates a device-managed gen_pool, maps the region write-combined with
  * devm_memremap() and adds the whole mapping to the pool as one chunk.
  * hcd->localmem_pool is written only once all of that has succeeded, so the
  * "if (hcd->localmem_pool)" tests used by the HCD code never see an
  * ERR_PTR() value or a pool without backing memory.
  *
  * Return: 0 on success, a negative errno otherwise.
  */
 int usb_hcd_setup_local_mem(struct usb_hcd *hcd, phys_addr_t phys_addr,
 			    dma_addr_t dma, size_t size)
 {
 	struct device *sysdev = hcd->self.sysdev;
 	struct gen_pool *pool;
 	void *local_mem;
 	int err;
 	/* min_alloc_order of 4: see the genalloc API for the granularity */
 	pool = devm_gen_pool_create(sysdev, 4, dev_to_node(sysdev),
 				    dev_name(sysdev));
 	if (IS_ERR(pool))
 		return PTR_ERR(pool);
 	/* devm_memremap() signals failure with ERR_PTR(), never with NULL */
 	local_mem = devm_memremap(sysdev, phys_addr, size, MEMREMAP_WC);
 	if (IS_ERR(local_mem))
 		return PTR_ERR(local_mem);
 	/*
 	 * Here we pass a dma_addr_t but the arg type is a phys_addr_t.
 	 * It's not backed by system memory and thus there's no kernel mapping
 	 * for it.
 	 */
 	err = gen_pool_add_virt(pool, (unsigned long)local_mem,
 				dma, size, dev_to_node(sysdev));
 	if (err < 0) {
 		dev_err(sysdev, "gen_pool_add_virt failed with %d\n",
 			err);
 		return err;
 	}
 	hcd->localmem_pool = pool;
 	return 0;
 }
 EXPORT_SYMBOL_GPL(usb_hcd_setup_local_mem);
 /*-------------------------------------------------------------------------*/
 #if IS_ENABLED(CONFIG_USB_MON)
--- a/drivers/usb/host/ehci-hcd.c
+++ b/drivers/usb/host/ehci-hcd.c
@ -559,7 +559,7 @@ static int ehci_init(struct usb_hcd *hcd)
 	ehci->command = temp;
 	/* Accept arbitrarily long scatter-gather lists */
-	if (!(hcd->driver->flags & HCD_LOCAL_MEM))
+	if (!hcd->localmem_pool)
 		hcd->self.sg_tablesize = ~0;
 	/* Prepare for unlinking active QHs */
--- a/drivers/usb/host/fotg210-hcd.c
+++ b/drivers/usb/host/fotg210-hcd.c
@ -4996,7 +4996,7 @@ static int hcd_fotg210_init(struct usb_hcd *hcd)
 	fotg210->command = temp;
 	/* Accept arbitrarily long scatter-gather lists */
-	if (!(hcd->driver->flags & HCD_LOCAL_MEM))
+	if (!hcd->localmem_pool)
 		hcd->self.sg_tablesize = ~0;
 	return 0;
 }
--- a/drivers/usb/host/ohci-hcd.c
+++ b/drivers/usb/host/ohci-hcd.c
@ -40,6 +40,7 @@
 #include <linux/dmapool.h>
 #include <linux/workqueue.h>
 #include <linux/debugfs.h>
 #include <linux/genalloc.h>
 #include <asm/io.h>
 #include <asm/irq.h>
@ -447,7 +448,7 @@ static int ohci_init (struct ohci_hcd *ohci)
 	struct usb_hcd *hcd = ohci_to_hcd(ohci);
 	/* Accept arbitrarily long scatter-gather lists */
-	if (!(hcd->driver->flags & HCD_LOCAL_MEM))
+	if (!hcd->localmem_pool)
 		hcd->self.sg_tablesize = ~0;
 	if (distrust_firmware)
@ -505,8 +506,15 @@ static int ohci_init (struct ohci_hcd *ohci)
 	timer_setup(&ohci->io_watchdog, io_watchdog_func, 0);
 	ohci->prev_frame_no = IO_WATCHDOG_OFF;
-	ohci->hcca = dma_alloc_coherent (hcd->self.controller,
+	if (hcd->localmem_pool)
-			sizeof(*ohci->hcca), &ohci->hcca_dma, GFP_KERNEL);
+		ohci->hcca = gen_pool_dma_alloc_align(hcd->localmem_pool,
 						sizeof(*ohci->hcca),
 						&ohci->hcca_dma, 256);
 	else
 		ohci->hcca = dma_alloc_coherent(hcd->self.controller,
 						sizeof(*ohci->hcca),
 						&ohci->hcca_dma,
 						GFP_KERNEL);
 	if (!ohci->hcca)
 		return -ENOMEM;
@ -990,8 +998,13 @@ static void ohci_stop (struct usb_hcd *hcd)
 	remove_debug_files (ohci);
 	ohci_mem_cleanup (ohci);
 	if (ohci->hcca) {
-		dma_free_coherent (hcd->self.controller,
+		if (hcd->localmem_pool)
-				sizeof *ohci->hcca,
+			gen_pool_free(hcd->localmem_pool,
 				      (unsigned long)ohci->hcca,
 				      sizeof(*ohci->hcca));
 		else
 			dma_free_coherent(hcd->self.controller,
 					  sizeof(*ohci->hcca),
 					  ohci->hcca, ohci->hcca_dma);
 		ohci->hcca = NULL;
 		ohci->hcca_dma = 0;
--- a/drivers/usb/host/ohci-mem.c
+++ b/drivers/usb/host/ohci-mem.c
@ -36,6 +36,13 @@ static void ohci_hcd_init (struct ohci_hcd *ohci)
 static int ohci_mem_init (struct ohci_hcd *ohci)
 {
 	/*
 	 * HCs with local memory allocate from localmem_pool so there's
 	 * no need to create the below dma pools.
 	 */
 	if (ohci_to_hcd(ohci)->localmem_pool)
 		return 0;
 	ohci->td_cache = dma_pool_create ("ohci_td",
 		ohci_to_hcd(ohci)->self.controller,
 		sizeof (struct td),
@ -84,8 +91,13 @@ td_alloc (struct ohci_hcd *hc, gfp_t mem_flags)
 {
 	dma_addr_t	dma;
 	struct td	*td;
 	struct usb_hcd	*hcd = ohci_to_hcd(hc);
-	td = dma_pool_zalloc (hc->td_cache, mem_flags, &dma);
+	if (hcd->localmem_pool)
 		td = gen_pool_dma_zalloc_align(hcd->localmem_pool,
 				sizeof(*td), &dma, 32);
 	else
 		td = dma_pool_zalloc(hc->td_cache, mem_flags, &dma);
 	if (td) {
 		/* in case hc fetches it, make it look dead */
 		td->hwNextTD = cpu_to_hc32 (hc, dma);
@ -99,6 +111,7 @@ static void
 td_free (struct ohci_hcd *hc, struct td *td)
 {
 	struct td	**prev = &hc->td_hash [TD_HASH_FUNC (td->td_dma)];
 	struct usb_hcd	*hcd = ohci_to_hcd(hc);
 	while (*prev && *prev != td)
 		prev = &(*prev)->td_hash;
@ -106,7 +119,12 @@ td_free (struct ohci_hcd *hc, struct td *td)
 		*prev = td->td_hash;
 	else if ((td->hwINFO & cpu_to_hc32(hc, TD_DONE)) != 0)
 		ohci_dbg (hc, "no hash for td %p\n", td);
-	dma_pool_free (hc->td_cache, td, td->td_dma);
+
 	if (hcd->localmem_pool)
 		gen_pool_free(hcd->localmem_pool, (unsigned long)td,
 			      sizeof(*td));
 	else
 		dma_pool_free(hc->td_cache, td, td->td_dma);
 }
 /*-------------------------------------------------------------------------*/
@ -117,8 +135,13 @@ ed_alloc (struct ohci_hcd *hc, gfp_t mem_flags)
 {
 	dma_addr_t	dma;
 	struct ed	*ed;
 	struct usb_hcd	*hcd = ohci_to_hcd(hc);
-	ed = dma_pool_zalloc (hc->ed_cache, mem_flags, &dma);
+	if (hcd->localmem_pool)
 		ed = gen_pool_dma_zalloc_align(hcd->localmem_pool,
 				sizeof(*ed), &dma, 16);
 	else
 		ed = dma_pool_zalloc(hc->ed_cache, mem_flags, &dma);
 	if (ed) {
 		INIT_LIST_HEAD (&ed->td_list);
 		ed->dma = dma;
@ -129,6 +152,12 @@ ed_alloc (struct ohci_hcd *hc, gfp_t mem_flags)
 static void
 ed_free (struct ohci_hcd *hc, struct ed *ed)
 {
-	dma_pool_free (hc->ed_cache, ed, ed->dma);
+	struct usb_hcd	*hcd = ohci_to_hcd(hc);
 	if (hcd->localmem_pool)
 		gen_pool_free(hcd->localmem_pool, (unsigned long)ed,
 			      sizeof(*ed));
 	else
 		dma_pool_free(hc->ed_cache, ed, ed->dma);
 }
--- a/drivers/usb/host/ohci-sm501.c
+++ b/drivers/usb/host/ohci-sm501.c
@ -49,7 +49,7 @@ static const struct hc_driver ohci_sm501_hc_driver = {
 	 * generic hardware linkage
 	 */
 	.irq =			ohci_irq,
-	.flags =		HCD_USB11 | HCD_MEMORY | HCD_LOCAL_MEM,
+	.flags =		HCD_USB11 | HCD_MEMORY,
 	/*
 	 * basic lifecycle operations
@ -110,40 +110,18 @@ static int ohci_hcd_sm501_drv_probe(struct platform_device *pdev)
 		goto err0;
 	}
 	/* The sm501 chip is equipped with local memory that may be used
 	 * by on-chip devices such as the video controller and the usb host.
 	 * This driver uses dma_declare_coherent_memory() to make sure
 	 * usb allocations with dma_alloc_coherent() allocate from
 	 * this local memory. The dma_handle returned by dma_alloc_coherent()
 	 * will be an offset starting from 0 for the first local memory byte.
 	 *
 	 * So as long as data is allocated using dma_alloc_coherent() all is
 	 * fine. This is however not always the case - buffers may be allocated
 	 * using kmalloc() - so the usb core needs to be told that it must copy
 	 * data into our local memory if the buffers happen to be placed in
 	 * regular memory. The HCD_LOCAL_MEM flag does just that.
 	 */
 	retval = dma_declare_coherent_memory(dev, mem->start,
 					 mem->start - mem->parent->start,
 					 resource_size(mem));
 	if (retval) {
 		dev_err(dev, "cannot declare coherent memory\n");
 		goto err1;
 	}
 	/* allocate, reserve and remap resources for registers */
 	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	if (res == NULL) {
 		dev_err(dev, "no resource definition for registers\n");
 		retval = -ENOENT;
-		goto err2;
+		goto err1;
 	}
 	hcd = usb_create_hcd(driver, &pdev->dev, dev_name(&pdev->dev));
 	if (!hcd) {
 		retval = -ENOMEM;
-		goto err2;
+		goto err1;
 	}
 	hcd->rsrc_start = res->start;
@ -164,6 +142,25 @@ static int ohci_hcd_sm501_drv_probe(struct platform_device *pdev)
 	ohci_hcd_init(hcd_to_ohci(hcd));
 	/* The sm501 chip is equipped with local memory that may be used
 	 * by on-chip devices such as the video controller and the usb host.
 	 * This driver uses genalloc so that usb allocations with
 	 * gen_pool_dma_alloc() allocate from this local memory. The dma_handle
 	 * returned by gen_pool_dma_alloc() will be an offset starting from 0
 	 * for the first local memory byte.
 	 *
 	 * So as long as data is allocated using gen_pool_dma_alloc() all is
 	 * fine. This is however not always the case - buffers may be allocated
 	 * using kmalloc() - so the usb core needs to be told that it must copy
 	 * data into our local memory if the buffers happen to be placed in
 	 * regular memory. A non-null hcd->localmem_pool initialized by the
 	 * call to usb_hcd_setup_local_mem() below does just that.
 	 */
 	if (usb_hcd_setup_local_mem(hcd, mem->start,
 				    mem->start - mem->parent->start,
 				    resource_size(mem)) < 0)
 		goto err5;
 	retval = usb_add_hcd(hcd, irq, IRQF_SHARED);
 	if (retval)
 		goto err5;
@ -181,8 +178,6 @@ err4:
 	release_mem_region(hcd->rsrc_start, hcd->rsrc_len);
 err3:
 	usb_put_hcd(hcd);
 err2:
 	dma_release_declared_memory(dev);
 err1:
 	release_mem_region(mem->start, resource_size(mem));
 err0:
@ -197,7 +192,6 @@ static int ohci_hcd_sm501_drv_remove(struct platform_device *pdev)
 	usb_remove_hcd(hcd);
 	release_mem_region(hcd->rsrc_start, hcd->rsrc_len);
 	usb_put_hcd(hcd);
 	dma_release_declared_memory(&pdev->dev);
 	mem = platform_get_resource(pdev, IORESOURCE_MEM, 1);
 	if (mem)
 		release_mem_region(mem->start, resource_size(mem));
--- a/drivers/usb/host/ohci-tmio.c
+++ b/drivers/usb/host/ohci-tmio.c
@ -153,7 +153,7 @@ static const struct hc_driver ohci_tmio_hc_driver = {
 	/* generic hardware linkage */
 	.irq =			ohci_irq,
-	.flags =		HCD_USB11 | HCD_MEMORY | HCD_LOCAL_MEM,
+	.flags =		HCD_USB11 | HCD_MEMORY,
 	/* basic lifecycle operations */
 	.start =		ohci_tmio_start,
@ -224,11 +224,6 @@ static int ohci_hcd_tmio_drv_probe(struct platform_device *dev)
 		goto err_ioremap_regs;
 	}
 	ret = dma_declare_coherent_memory(&dev->dev, sram->start, sram->start,
 				resource_size(sram));
 	if (ret)
 		goto err_dma_declare;
 	if (cell->enable) {
 		ret = cell->enable(dev);
 		if (ret)
@ -239,6 +234,11 @@ static int ohci_hcd_tmio_drv_probe(struct platform_device *dev)
 	ohci = hcd_to_ohci(hcd);
 	ohci_hcd_init(ohci);
 	ret = usb_hcd_setup_local_mem(hcd, sram->start, sram->start,
 				      resource_size(sram));
 	if (ret < 0)
 		goto err_enable;
 	ret = usb_add_hcd(hcd, irq, 0);
 	if (ret)
 		goto err_add_hcd;
@ -254,8 +254,6 @@ err_add_hcd:
 	if (cell->disable)
 		cell->disable(dev);
 err_enable:
 	dma_release_declared_memory(&dev->dev);
 err_dma_declare:
 	iounmap(hcd->regs);
 err_ioremap_regs:
 	iounmap(tmio->ccr);
@ -276,7 +274,6 @@ static int ohci_hcd_tmio_drv_remove(struct platform_device *dev)
 	tmio_stop_hc(dev);
 	if (cell->disable)
 		cell->disable(dev);
 	dma_release_declared_memory(&dev->dev);
 	iounmap(hcd->regs);
 	iounmap(tmio->ccr);
 	usb_put_hcd(hcd);
--- a/drivers/usb/host/ohci.h
+++ b/drivers/usb/host/ohci.h
@ -385,6 +385,8 @@ struct ohci_hcd {
 	/*
 	 * memory management for queue data structures
 	 *
 	 * @td_cache and @ed_cache are %NULL if &usb_hcd.localmem_pool is used.
 	 */
 	struct dma_pool		*td_cache;
 	struct dma_pool		*ed_cache;
--- a/drivers/usb/host/uhci-hcd.c
+++ b/drivers/usb/host/uhci-hcd.c
@ -581,7 +581,7 @@ static int uhci_start(struct usb_hcd *hcd)
 	hcd->uses_new_polling = 1;
 	/* Accept arbitrarily long scatter-gather lists */
-	if (!(hcd->driver->flags & HCD_LOCAL_MEM))
+	if (!hcd->localmem_pool)
 		hcd->self.sg_tablesize = ~0;
 	spin_lock_init(&uhci->lock);
--- a/drivers/video/fbdev/au1100fb.c
+++ b/drivers/video/fbdev/au1100fb.c
@ -340,14 +340,12 @@ int au1100fb_fb_pan_display(struct fb_var_screeninfo *var, struct fb_info *fbi)
 */
 int au1100fb_fb_mmap(struct fb_info *fbi, struct vm_area_struct *vma)
 {
-	struct au1100fb_device *fbdev;
+	struct au1100fb_device *fbdev = to_au1100fb_device(fbi);
 	fbdev = to_au1100fb_device(fbi);
 	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
 	pgprot_val(vma->vm_page_prot) |= (6 << 9); //CCA=6
-	return vm_iomap_memory(vma, fbdev->fb_phys, fbdev->fb_len);
+	return dma_mmap_coherent(fbdev->dev, vma, fbdev->fb_mem, fbdev->fb_phys,
 			fbdev->fb_len);
 }
 static struct fb_ops au1100fb_ops =
@ -412,7 +410,6 @@ static int au1100fb_drv_probe(struct platform_device *dev)
 {
 	struct au1100fb_device *fbdev;
 	struct resource *regs_res;
 	unsigned long page;
 	struct clk *c;
 	/* Allocate new device private */
@ -424,6 +421,7 @@ static int au1100fb_drv_probe(struct platform_device *dev)
 		goto failed;
 	platform_set_drvdata(dev, (void *)fbdev);
 	fbdev->dev = &dev->dev;
 	/* Allocate region for our registers and map them */
 	regs_res = platform_get_resource(dev, IORESOURCE_MEM, 0);
@ -472,20 +470,6 @@ static int au1100fb_drv_probe(struct platform_device *dev)
 	au1100fb_fix.smem_start = fbdev->fb_phys;
 	au1100fb_fix.smem_len = fbdev->fb_len;
 	/*
 	 * Set page reserved so that mmap will work. This is necessary
 	 * since we'll be remapping normal memory.
 	 */
 	for (page = (unsigned long)fbdev->fb_mem;
 	     page < PAGE_ALIGN((unsigned long)fbdev->fb_mem + fbdev->fb_len);
 	     page += PAGE_SIZE) {
 #ifdef CONFIG_DMA_NONCOHERENT
 		SetPageReserved(virt_to_page(CAC_ADDR((void *)page)));
 #else
 		SetPageReserved(virt_to_page(page));
 #endif
 	}
 	print_dbg("Framebuffer memory map at %p", fbdev->fb_mem);
 	print_dbg("phys=0x%08x, size=%dK", fbdev->fb_phys, fbdev->fb_len / 1024);
--- a/drivers/video/fbdev/au1100fb.h
+++ b/drivers/video/fbdev/au1100fb.h
@ -110,6 +110,7 @@ struct au1100fb_device {
 	dma_addr_t    		fb_phys;
 	int			panel_idx;
 	struct clk		*lcdclk;
 	struct device		*dev;
 };
 /********************************************************************/
--- a/include/linux/dma-contiguous.h
+++ b/include/linux/dma-contiguous.h
@ -50,6 +50,7 @@
 #ifdef __KERNEL__
 #include <linux/device.h>
 #include <linux/mm.h>
 struct cma;
 struct page;
@ -111,6 +112,8 @@ struct page *dma_alloc_from_contiguous(struct device *dev, size_t count,
 				       unsigned int order, bool no_warn);
 bool dma_release_from_contiguous(struct device *dev, struct page *pages,
 				 int count);
 struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp);
 void dma_free_contiguous(struct device *dev, struct page *page, size_t size);
 #else
@ -153,6 +156,22 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages,
 	return false;
 }
 /* Use fallback alloc() and free() when CONFIG_DMA_CMA=n */
 static inline struct page *dma_alloc_contiguous(struct device *dev, size_t size,
 		gfp_t gfp)
 {
 	/* without CMA every request goes straight to the page allocator */
 	size_t order = get_order(PAGE_ALIGN(size));
 	int nid = NUMA_NO_NODE;
 	/* a NULL @dev carries no node preference */
 	if (dev)
 		nid = dev_to_node(dev);
 	return alloc_pages_node(nid, gfp, order);
 }
 static inline void dma_free_contiguous(struct device *dev, struct page *page,
 		size_t size)
 {
 	/* @dev is unused here: the pages came from alloc_pages_node() */
 	unsigned int order = get_order(size);
 	__free_pages(page, order);
 }
 #endif
 #endif
--- a/include/linux/dma-noncoherent.h
+++ b/include/linux/dma-noncoherent.h
@ -20,6 +20,22 @@ static inline bool dev_is_dma_coherent(struct device *dev)
 }
 #endif /* CONFIG_ARCH_HAS_DMA_COHERENCE_H */
 /*
  * Tell whether an allocation for @dev with the given @attrs has to be handed
  * out uncached. The answer is "no" when the device is DMA coherent, when the
  * caller asked for DMA_ATTR_NO_KERNEL_MAPPING, or when
  * CONFIG_DMA_NONCOHERENT_CACHE_SYNC is enabled and the caller asked for
  * DMA_ATTR_NON_CONSISTENT; in every other case it is "yes".
  */
 static __always_inline bool dma_alloc_need_uncached(struct device *dev,
 		unsigned long attrs)
 {
 	return !dev_is_dma_coherent(dev) &&
 	       !(attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
 	       !(IS_ENABLED(CONFIG_DMA_NONCOHERENT_CACHE_SYNC) &&
 		 (attrs & DMA_ATTR_NON_CONSISTENT));
 }
 void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
 		gfp_t gfp, unsigned long attrs);
 void arch_dma_free(struct device *dev, size_t size, void *cpu_addr,
@ -80,4 +96,7 @@ static inline void arch_dma_prep_coherent(struct page *page, size_t size)
 }
 #endif /* CONFIG_ARCH_HAS_DMA_PREP_COHERENT */
 void *uncached_kernel_address(void *addr);
 void *cached_kernel_address(void *addr);
 #endif /* _LINUX_DMA_NONCOHERENT_H */
--- a/include/linux/genalloc.h
+++ b/include/linux/genalloc.h
@ -155,6 +155,15 @@ static inline unsigned long gen_pool_alloc(struct gen_pool *pool, size_t size)
 extern void *gen_pool_dma_alloc(struct gen_pool *pool, size_t size,
 		dma_addr_t *dma);
 extern void *gen_pool_dma_alloc_algo(struct gen_pool *pool, size_t size,
 		dma_addr_t *dma, genpool_algo_t algo, void *data);
 extern void *gen_pool_dma_alloc_align(struct gen_pool *pool, size_t size,
 		dma_addr_t *dma, int align);
 extern void *gen_pool_dma_zalloc(struct gen_pool *pool, size_t size, dma_addr_t *dma);
 extern void *gen_pool_dma_zalloc_algo(struct gen_pool *pool, size_t size,
 		dma_addr_t *dma, genpool_algo_t algo, void *data);
 extern void *gen_pool_dma_zalloc_align(struct gen_pool *pool, size_t size,
 		dma_addr_t *dma, int align);
 extern void gen_pool_free_owner(struct gen_pool *pool, unsigned long addr,
 		size_t size, void **owner);
 static inline void gen_pool_free(struct gen_pool *pool, unsigned long addr,
--- a/include/linux/usb/hcd.h
+++ b/include/linux/usb/hcd.h
@ -216,6 +216,9 @@ struct usb_hcd {
 #define	HC_IS_RUNNING(state) ((state) & __ACTIVE)
 #define	HC_IS_SUSPENDED(state) ((state) & __SUSPEND)
 	/* memory pool for HCs having local memory, or %NULL */
 	struct gen_pool         *localmem_pool;
 	/* more shared queuing code would be good; it should support
 	 * smarter scheduling, handle transaction translators, etc;
 	 * input size of periodic table to an interrupt scheduler.
@ -253,7 +256,6 @@ struct hc_driver {
 	int	flags;
 #define	HCD_MEMORY	0x0001		/* HC regs use memory (else I/O) */
 #define	HCD_LOCAL_MEM	0x0002		/* HC needs local memory */
 #define	HCD_SHARED	0x0004		/* Two (or more) usb_hcds share HW */
 #define	HCD_USB11	0x0010		/* USB 1.1 */
 #define	HCD_USB2	0x0020		/* USB 2.0 */
@ -461,6 +463,8 @@ extern int usb_add_hcd(struct usb_hcd *hcd,
 		unsigned int irqnum, unsigned long irqflags);
 extern void usb_remove_hcd(struct usb_hcd *hcd);
 extern int usb_hcd_find_raw_port_number(struct usb_hcd *hcd, int port1);
 int usb_hcd_setup_local_mem(struct usb_hcd *hcd, phys_addr_t phys_addr,
 			    dma_addr_t dma, size_t size);
 struct platform_device;
 extern void usb_hcd_platform_shutdown(struct platform_device *dev);
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@ -214,6 +214,62 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages,
 	return cma_release(dev_get_cma_area(dev), pages, count);
 }
 /**
 * dma_alloc_contiguous() - allocate contiguous pages
 * @dev:   Pointer to device for which the allocation is performed.
 * @size:  Requested allocation size.
 * @gfp:   Allocation flags.
 *
 * This function allocates contiguous memory buffer for specified device. It
 * first tries to use device specific contiguous memory area if available or
 * the default global one, then tries a fallback allocation of normal pages.
 *
 * Note that it bypasses one-page size of allocations from the global area as
 * the addresses within one page are always contiguous, so there is no need
 * to waste CMA pages for that kind; it also helps reduce fragmentations.
 *
 * Return: the page obtained from cma_alloc() or, failing that, from
 * alloc_pages_node(); %NULL if neither produced one.
 */
 struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)
 {
 	int node = dev ? dev_to_node(dev) : NUMA_NO_NODE;
 	size_t count = PAGE_ALIGN(size) >> PAGE_SHIFT;
 	size_t align = get_order(PAGE_ALIGN(size));
 	struct page *page = NULL;
 	struct cma *cma = NULL;
 	if (dev && dev->cma_area)
 		cma = dev->cma_area;
 	else if (count > 1)
 		cma = dma_contiguous_default_area;
 	/* CMA can be used only in the context which permits sleeping */
 	if (cma && gfpflags_allow_blocking(gfp)) {
 		/*
 		 * Clamp a private copy: @align doubles as the page order of
 		 * the fallback below and must keep covering the whole request.
 		 */
 		size_t cma_align = min_t(size_t, align, CONFIG_CMA_ALIGNMENT);
 		page = cma_alloc(cma, count, cma_align, gfp & __GFP_NOWARN);
 	}
 	/* Fallback allocation of normal pages */
 	if (!page)
 		page = alloc_pages_node(node, gfp, align);
 	return page;
 }
 /**
 * dma_free_contiguous() - release allocated pages
 * @dev:   Pointer to device for which the pages were allocated.
 * @page:  Pointer to the allocated pages.
 * @size:  Size of allocated pages.
 *
 * This function releases memory allocated by dma_alloc_contiguous(). As the
 * cma_release returns false when provided pages do not belong to contiguous
 * area and true otherwise, this function then does a fallback __free_pages()
 * upon a false-return.
 *
 * @size is rounded up to whole pages before it is turned into a page count,
 * matching the count dma_alloc_contiguous() passes to cma_alloc().
 */
 void dma_free_contiguous(struct device *dev, struct page *page, size_t size)
 {
 	unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
 	if (!cma_release(dev_get_cma_area(dev), page, count))
 		__free_pages(page, get_order(size));
 }
 /*
 * Support for reserved memory regions defined in device tree
 */
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@ -96,8 +96,6 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size)
 struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
 		dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
 {
 	unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
 	int page_order = get_order(size);
 	struct page *page = NULL;
 	u64 phys_mask;
@ -109,20 +107,9 @@ struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
 	gfp |= __dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
 			&phys_mask);
 again:
-	/* CMA can be used only in the context which permits sleeping */
+	page = dma_alloc_contiguous(dev, size, gfp);
 	if (gfpflags_allow_blocking(gfp)) {
 		page = dma_alloc_from_contiguous(dev, count, page_order,
 						 gfp & __GFP_NOWARN);
 	if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
-			dma_release_from_contiguous(dev, page, count);
+		dma_free_contiguous(dev, page, size);
 			page = NULL;
 		}
 	}
 	if (!page)
 		page = alloc_pages_node(dev_to_node(dev), gfp, page_order);
 	if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
 		__free_pages(page, page_order);
 		page = NULL;
 		if (IS_ENABLED(CONFIG_ZONE_DMA32) &&
@ -151,10 +138,18 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size,
 	if (!page)
 		return NULL;
 	if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) {
 		/* remove any dirty cache lines on the kernel alias */
 		if (!PageHighMem(page))
 			arch_dma_prep_coherent(page, size);
 		/* return the page pointer as the opaque cookie */
 		return page;
 	}
 	if (PageHighMem(page)) {
 		/*
 		 * Depending on the cma= arguments and per-arch setup
-		 * dma_alloc_from_contiguous could return highmem pages.
+		 * dma_alloc_contiguous could return highmem pages.
 		 * Without remapping there is no way to return them here,
 		 * so log an error and fail.
 		 */
@ -171,15 +166,19 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size,
 		*dma_handle = phys_to_dma(dev, page_to_phys(page));
 	}
 	memset(ret, 0, size);
 	if (IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) &&
 	    dma_alloc_need_uncached(dev, attrs)) {
 		arch_dma_prep_coherent(page, size);
 		ret = uncached_kernel_address(ret);
 	}
 	return ret;
 }
 void __dma_direct_free_pages(struct device *dev, size_t size, struct page *page)
 {
-	unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+	dma_free_contiguous(dev, page, size);
 	if (!dma_release_from_contiguous(dev, page, count))
 		__free_pages(page, get_order(size));
 }
 void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr,
@ -187,15 +186,26 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr,
 {
 	unsigned int page_order = get_order(size);
 	if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) {
 		/* cpu_addr is a struct page cookie, not a kernel address */
 		__dma_direct_free_pages(dev, size, cpu_addr);
 		return;
 	}
 	if (force_dma_unencrypted())
 		set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order);
 	if (IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) &&
 	    dma_alloc_need_uncached(dev, attrs))
 		cpu_addr = cached_kernel_address(cpu_addr);
 	__dma_direct_free_pages(dev, size, virt_to_page(cpu_addr));
 }
 void *dma_direct_alloc(struct device *dev, size_t size,
 		dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
 {
-	if (!dev_is_dma_coherent(dev))
+	if (!IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) &&
 	    dma_alloc_need_uncached(dev, attrs))
 		return arch_dma_alloc(dev, size, dma_handle, gfp, attrs);
 	return dma_direct_alloc_pages(dev, size, dma_handle, gfp, attrs);
 }
@ -203,7 +213,8 @@ void *dma_direct_alloc(struct device *dev, size_t size,
 void dma_direct_free(struct device *dev, size_t size,
 		void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs)
 {
-	if (!dev_is_dma_coherent(dev))
+	if (!IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) &&
 	    dma_alloc_need_uncached(dev, attrs))
 		arch_dma_free(dev, size, cpu_addr, dma_addr, attrs);
 	else
 		dma_direct_free_pages(dev, size, cpu_addr, dma_addr, attrs);
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@ -317,6 +317,12 @@ void arch_dma_set_mask(struct device *dev, u64 mask);
 int dma_set_mask(struct device *dev, u64 mask)
 {
 	/*
 	 * Truncate the mask to the actually supported dma_addr_t width to
 	 * avoid generating unsupportable addresses.
 	 */
 	mask = (dma_addr_t)mask;
 	if (!dev->dma_mask || !dma_supported(dev, mask))
 		return -EIO;
@ -330,6 +336,12 @@ EXPORT_SYMBOL(dma_set_mask);
 #ifndef CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK
 int dma_set_coherent_mask(struct device *dev, u64 mask)
 {
 	/*
 	 * Truncate the mask to the actually supported dma_addr_t width to
 	 * avoid generating unsupportable addresses.
 	 */
 	mask = (dma_addr_t)mask;
 	if (!dma_supported(dev, mask))
 		return -EIO;
--- a/kernel/dma/remap.c
+++ b/kernel/dma/remap.c
@ -158,6 +158,9 @@ out:
 bool dma_in_atomic_pool(void *start, size_t size)
 {
 	/* with no atomic pool set up, no address can belong to it */
 	if (likely(atomic_pool))
 		return addr_in_gen_pool(atomic_pool, (unsigned long)start,
 					size);
 	return false;
 }
@ -199,8 +202,7 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
 	size = PAGE_ALIGN(size);
-	if (!gfpflags_allow_blocking(flags) &&
+	if (!gfpflags_allow_blocking(flags)) {
 	    !(attrs & DMA_ATTR_NO_KERNEL_MAPPING)) {
 		ret = dma_alloc_from_pool(size, &page, flags);
 		if (!ret)
 			return NULL;
@ -214,11 +216,6 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
 	/* remove any dirty cache lines on the kernel alias */
 	arch_dma_prep_coherent(page, size);
 	if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) {
 		ret = page; /* opaque cookie */
 		goto done;
 	}
 	/* create a coherent mapping */
 	ret = dma_common_contiguous_remap(page, size, VM_USERMAP,
 			arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs),
@ -237,10 +234,7 @@ done:
 void arch_dma_free(struct device *dev, size_t size, void *vaddr,
 		dma_addr_t dma_handle, unsigned long attrs)
 {
-	if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) {
+	if (!dma_free_from_pool(vaddr, PAGE_ALIGN(size))) {
 		/* vaddr is a struct page cookie, not a kernel address */
 		__dma_direct_free_pages(dev, size, vaddr);
 	} else if (!dma_free_from_pool(vaddr, PAGE_ALIGN(size))) {
 		phys_addr_t phys = dma_to_phys(dev, dma_handle);
 		struct page *page = pfn_to_page(__phys_to_pfn(phys));
--- a/lib/genalloc.c
+++ b/lib/genalloc.c
@ -327,21 +327,45 @@ EXPORT_SYMBOL(gen_pool_alloc_algo_owner);
 * gen_pool_dma_alloc - allocate special memory from the pool for DMA usage
 * @pool: pool to allocate from
 * @size: number of bytes to allocate from the pool
- * @dma: dma-view physical address return value.  Use NULL if unneeded.
+ * @dma: dma-view physical address return value.  Use %NULL if unneeded.
 *
 * Allocate the requested number of bytes from the specified pool.
 * Uses the pool allocation function (with first-fit algorithm by default).
 * Can not be used in NMI handler on architectures without
 * NMI-safe cmpxchg implementation.
 *
 * Return: virtual address of the allocated memory, or %NULL on failure
 */
 void *gen_pool_dma_alloc(struct gen_pool *pool, size_t size, dma_addr_t *dma)
 {
 	/*
 	 * pool->algo and pool->data are read to build the arguments of the
 	 * call below, i.e. before the NULL test inside
 	 * gen_pool_dma_alloc_algo() can run, so reject a NULL @pool here.
 	 */
 	if (!pool)
 		return NULL;
 	return gen_pool_dma_alloc_algo(pool, size, dma, pool->algo, pool->data);
 }
 EXPORT_SYMBOL(gen_pool_dma_alloc);
 /**
 * gen_pool_dma_alloc_algo - allocate special memory from the pool for DMA
 * usage with the given pool algorithm
 * @pool: pool to allocate from
 * @size: number of bytes to allocate from the pool
 * @dma: DMA-view physical address return value. Use %NULL if unneeded.
 * @algo: algorithm passed from caller
 * @data: data passed to algorithm
 *
 * Allocate the requested number of bytes from the specified pool. Uses the
 * given pool allocation function. Can not be used in NMI handler on
 * architectures without NMI-safe cmpxchg implementation.
 *
 * Return: virtual address of the allocated memory, or %NULL on failure
 */
 void *gen_pool_dma_alloc_algo(struct gen_pool *pool, size_t size,
 		dma_addr_t *dma, genpool_algo_t algo, void *data)
 {
 	unsigned long vaddr;
 	if (!pool)
 		return NULL;
-	vaddr = gen_pool_alloc(pool, size);
+	vaddr = gen_pool_alloc_algo(pool, size, algo, data);
 	if (!vaddr)
 		return NULL;
@@ -350,7 +374,102 @@ void *gen_pool_dma_alloc(struct gen_pool *pool, size_t size, dma_addr_t *dma)
 	return (void *)vaddr;
 }
-EXPORT_SYMBOL(gen_pool_dma_alloc);
+EXPORT_SYMBOL(gen_pool_dma_alloc_algo);
 /**
 * gen_pool_dma_alloc_align - aligned allocation of special memory from the
 * pool for DMA usage
 * @pool: pool to allocate from
 * @size: number of bytes to allocate from the pool
 * @dma: DMA-view physical address return value. Use %NULL if unneeded.
 * @align: alignment in bytes for starting address
 *
 * Thin wrapper around gen_pool_dma_alloc_algo(): the request is forwarded
 * with gen_pool_first_fit_align as the allocation algorithm and @align
 * handed over through a struct genpool_data_align. Can not be used in NMI
 * handler on architectures without NMI-safe cmpxchg implementation.
 *
 * Return: virtual address of the allocated memory, or %NULL on failure
 */
 void *gen_pool_dma_alloc_align(struct gen_pool *pool, size_t size,
 		dma_addr_t *dma, int align)
 {
 	/* Per-call argument block consumed by gen_pool_first_fit_align. */
 	struct genpool_data_align constraint = { .align = align };
 	void *vaddr;
 	vaddr = gen_pool_dma_alloc_algo(pool, size, dma,
 					gen_pool_first_fit_align, &constraint);
 	return vaddr;
 }
 EXPORT_SYMBOL(gen_pool_dma_alloc_align);
 /**
 * gen_pool_dma_zalloc - allocate special zeroed memory from the pool for
 * DMA usage
 * @pool: pool to allocate from. A %NULL pool is rejected.
 * @size: number of bytes to allocate from the pool
 * @dma: dma-view physical address return value.  Use %NULL if unneeded.
 *
 * Allocate the requested number of zeroed bytes from the specified pool.
 * Uses the pool allocation function (with first-fit algorithm by default).
 * Can not be used in NMI handler on architectures without
 * NMI-safe cmpxchg implementation.
 *
 * Return: virtual address of the allocated zeroed memory, or %NULL on failure
 * (including when @pool is %NULL)
 */
 void *gen_pool_dma_zalloc(struct gen_pool *pool, size_t size, dma_addr_t *dma)
 {
 	/*
 	 * pool->algo and pool->data are read while building the call below,
 	 * so @pool has to be validated before the call is made rather than
 	 * inside the callee.
 	 */
 	if (!pool)
 		return NULL;
 	return gen_pool_dma_zalloc_algo(pool, size, dma, pool->algo, pool->data);
 }
 EXPORT_SYMBOL(gen_pool_dma_zalloc);
 /**
 * gen_pool_dma_zalloc_algo - zeroing variant of gen_pool_dma_alloc_algo()
 * @pool: pool to allocate from
 * @size: number of bytes to allocate from the pool
 * @dma: DMA-view physical address return value. Use %NULL if unneeded.
 * @algo: algorithm passed from caller
 * @data: data passed to algorithm
 *
 * Obtain @size bytes through gen_pool_dma_alloc_algo() using the supplied
 * @algo and @data, then clear the whole allocation with memset() before
 * handing it back. Can not be used in NMI handler on architectures without
 * NMI-safe cmpxchg implementation.
 *
 * Return: virtual address of the allocated zeroed memory, or %NULL on failure
 */
 void *gen_pool_dma_zalloc_algo(struct gen_pool *pool, size_t size,
 		dma_addr_t *dma, genpool_algo_t algo, void *data)
 {
 	void *buf;
 	buf = gen_pool_dma_alloc_algo(pool, size, dma, algo, data);
 	/* Nothing to clear when the underlying allocation failed. */
 	if (!buf)
 		return NULL;
 	memset(buf, 0, size);
 	return buf;
 }
 EXPORT_SYMBOL(gen_pool_dma_zalloc_algo);
 /**
 * gen_pool_dma_zalloc_align - aligned allocation of special zeroed memory
 * from the pool for DMA usage
 * @pool: pool to allocate from
 * @size: number of bytes to allocate from the pool
 * @dma: DMA-view physical address return value. Use %NULL if unneeded.
 * @align: alignment in bytes for starting address
 *
 * Thin wrapper around gen_pool_dma_zalloc_algo(): the request is forwarded
 * with gen_pool_first_fit_align as the allocation algorithm and @align
 * handed over through a struct genpool_data_align. Can not be used in NMI
 * handler on architectures without NMI-safe cmpxchg implementation.
 *
 * Return: virtual address of the allocated zeroed memory, or %NULL on failure
 */
 void *gen_pool_dma_zalloc_align(struct gen_pool *pool, size_t size,
 		dma_addr_t *dma, int align)
 {
 	/* Per-call argument block consumed by gen_pool_first_fit_align. */
 	struct genpool_data_align constraint = { .align = align };
 	void *vaddr;
 	vaddr = gen_pool_dma_zalloc_algo(pool, size, dma,
 					 gen_pool_first_fit_align, &constraint);
 	return vaddr;
 }
 EXPORT_SYMBOL(gen_pool_dma_zalloc_align);
 /**
 * gen_pool_free - free allocated special memory back to the pool