Merge branch 'hmm-devmem-cleanup.4' into rdma.git hmm
Christoph Hellwig says: ==================== Below is a series that cleans up the dev_pagemap interface so that it is more easily usable, which removes the need to wrap it in hmm and thus allowing to kill a lot of code Changes since v3: - pull in "mm/swap: Fix release_pages() when releasing devmap pages" and rebase the other patches on top of that - fold the hmm_devmem_add_resource into the DEVICE_PUBLIC memory removal patch - remove _vm_normal_page as it isn't needed without DEVICE_PUBLIC memory - pick up various ACKs Changes since v2: - fix nvdimm kunit build - add a new memory type for device dax - fix a few issues in intermediate patches that didn't show up in the end result - incorporate feedback from Michal Hocko, including killing of the DEVICE_PUBLIC memory type entirely Changes since v1: - rebase - also switch p2pdma to the internal refcount - add type checking for pgmap->type - rename the migrate method to migrate_to_ram - cleanup the altmap_valid flag - various tidbits from the reviews ==================== Conflicts resolved by: - Keeping Ira's version of the code in swap.c - Using the delete for the section in hmm.rst - Using the delete for the devmap code in hmm.c and .h * branch 'hmm-devmem-cleanup.4': (24 commits) mm: don't select MIGRATE_VMA_HELPER from HMM_MIRROR mm: remove the HMM config option mm: sort out the DEVICE_PRIVATE Kconfig mess mm: simplify ZONE_DEVICE page private data mm: remove hmm_devmem_add mm: remove hmm_vma_alloc_locked_page nouveau: use devm_memremap_pages directly nouveau: use alloc_page_vma directly PCI/P2PDMA: use the dev_pagemap internal refcount device-dax: use the dev_pagemap internal refcount memremap: provide an optional internal refcount in struct dev_pagemap memremap: replace the altmap_valid field with a PGMAP_ALTMAP_VALID flag memremap: remove the data field in struct dev_pagemap memremap: add a migrate_to_ram method to struct dev_pagemap_ops memremap: lift the devmap_enable manipulation into devm_memremap_pages 
memremap: pass a struct dev_pagemap to ->kill and ->cleanup memremap: move dev_pagemap callbacks into a separate structure memremap: validate the pagemap type passed to devm_memremap_pages mm: factor out a devm_request_free_mem_region helper mm: export alloc_pages_vma ... Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
This commit is contained in:
commit
cc5dfd59e3
|
@ -336,33 +336,6 @@ directly using struct page for device memory which left most kernel code paths
|
|||
unaware of the difference. We only need to make sure that no one ever tries to
|
||||
map those pages from the CPU side.
|
||||
|
||||
HMM provides a set of helpers to register and hotplug device memory as a new
|
||||
region needing a struct page. This is offered through a very simple API::
|
||||
|
||||
struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
|
||||
struct device *device,
|
||||
unsigned long size);
|
||||
void hmm_devmem_remove(struct hmm_devmem *devmem);
|
||||
|
||||
The hmm_devmem_ops is where most of the important things are::
|
||||
|
||||
struct hmm_devmem_ops {
|
||||
void (*free)(struct hmm_devmem *devmem, struct page *page);
|
||||
vm_fault_t (*fault)(struct hmm_devmem *devmem,
|
||||
struct vm_area_struct *vma,
|
||||
unsigned long addr,
|
||||
struct page *page,
|
||||
unsigned flags,
|
||||
pmd_t *pmdp);
|
||||
};
|
||||
|
||||
The first callback (free()) happens when the last reference on a device page is
|
||||
dropped. This means the device page is now free and no longer used by anyone.
|
||||
The second callback happens whenever the CPU tries to access a device page
|
||||
which it cannot do. This second callback must trigger a migration back to
|
||||
system memory.
|
||||
|
||||
|
||||
Migration to and from device memory
|
||||
===================================
|
||||
|
||||
|
|
|
@ -131,17 +131,9 @@ void __ref arch_remove_memory(int nid, u64 start, u64 size,
|
|||
{
|
||||
unsigned long start_pfn = start >> PAGE_SHIFT;
|
||||
unsigned long nr_pages = size >> PAGE_SHIFT;
|
||||
struct page *page;
|
||||
struct page *page = pfn_to_page(start_pfn) + vmem_altmap_offset(altmap);
|
||||
int ret;
|
||||
|
||||
/*
|
||||
* If we have an altmap then we need to skip over any reserved PFNs
|
||||
* when querying the zone.
|
||||
*/
|
||||
page = pfn_to_page(start_pfn);
|
||||
if (altmap)
|
||||
page += vmem_altmap_offset(altmap);
|
||||
|
||||
__remove_pages(page_zone(page), start_pfn, nr_pages, altmap);
|
||||
|
||||
/* Remove htab bolted mappings for this section of memory */
|
||||
|
|
|
@ -1213,13 +1213,9 @@ void __ref arch_remove_memory(int nid, u64 start, u64 size,
|
|||
{
|
||||
unsigned long start_pfn = start >> PAGE_SHIFT;
|
||||
unsigned long nr_pages = size >> PAGE_SHIFT;
|
||||
struct page *page = pfn_to_page(start_pfn);
|
||||
struct zone *zone;
|
||||
struct page *page = pfn_to_page(start_pfn) + vmem_altmap_offset(altmap);
|
||||
struct zone *zone = page_zone(page);
|
||||
|
||||
/* With altmap the first mapped page is offset from @start */
|
||||
if (altmap)
|
||||
page += vmem_altmap_offset(altmap);
|
||||
zone = page_zone(page);
|
||||
__remove_pages(zone, start_pfn, nr_pages, altmap);
|
||||
kernel_physical_mapping_remove(start, start + size);
|
||||
}
|
||||
|
|
|
@ -43,8 +43,6 @@ struct dax_region {
|
|||
* @target_node: effective numa node if dev_dax memory range is onlined
|
||||
* @dev - device core
|
||||
* @pgmap - pgmap for memmap setup / lifetime (driver owned)
|
||||
* @ref: pgmap reference count (driver owned)
|
||||
* @cmp: @ref final put completion (driver owned)
|
||||
*/
|
||||
struct dev_dax {
|
||||
struct dax_region *region;
|
||||
|
@ -52,8 +50,6 @@ struct dev_dax {
|
|||
int target_node;
|
||||
struct device dev;
|
||||
struct dev_pagemap pgmap;
|
||||
struct percpu_ref ref;
|
||||
struct completion cmp;
|
||||
};
|
||||
|
||||
static inline struct dev_dax *to_dev_dax(struct device *dev)
|
||||
|
|
|
@ -14,37 +14,6 @@
|
|||
#include "dax-private.h"
|
||||
#include "bus.h"
|
||||
|
||||
static struct dev_dax *ref_to_dev_dax(struct percpu_ref *ref)
|
||||
{
|
||||
return container_of(ref, struct dev_dax, ref);
|
||||
}
|
||||
|
||||
static void dev_dax_percpu_release(struct percpu_ref *ref)
|
||||
{
|
||||
struct dev_dax *dev_dax = ref_to_dev_dax(ref);
|
||||
|
||||
dev_dbg(&dev_dax->dev, "%s\n", __func__);
|
||||
complete(&dev_dax->cmp);
|
||||
}
|
||||
|
||||
static void dev_dax_percpu_exit(struct percpu_ref *ref)
|
||||
{
|
||||
struct dev_dax *dev_dax = ref_to_dev_dax(ref);
|
||||
|
||||
dev_dbg(&dev_dax->dev, "%s\n", __func__);
|
||||
wait_for_completion(&dev_dax->cmp);
|
||||
percpu_ref_exit(ref);
|
||||
}
|
||||
|
||||
static void dev_dax_percpu_kill(struct percpu_ref *data)
|
||||
{
|
||||
struct percpu_ref *ref = data;
|
||||
struct dev_dax *dev_dax = ref_to_dev_dax(ref);
|
||||
|
||||
dev_dbg(&dev_dax->dev, "%s\n", __func__);
|
||||
percpu_ref_kill(ref);
|
||||
}
|
||||
|
||||
static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma,
|
||||
const char *func)
|
||||
{
|
||||
|
@ -459,15 +428,7 @@ int dev_dax_probe(struct device *dev)
|
|||
return -EBUSY;
|
||||
}
|
||||
|
||||
init_completion(&dev_dax->cmp);
|
||||
rc = percpu_ref_init(&dev_dax->ref, dev_dax_percpu_release, 0,
|
||||
GFP_KERNEL);
|
||||
if (rc)
|
||||
return rc;
|
||||
|
||||
dev_dax->pgmap.ref = &dev_dax->ref;
|
||||
dev_dax->pgmap.kill = dev_dax_percpu_kill;
|
||||
dev_dax->pgmap.cleanup = dev_dax_percpu_exit;
|
||||
dev_dax->pgmap.type = MEMORY_DEVICE_DEVDAX;
|
||||
addr = devm_memremap_pages(dev, &dev_dax->pgmap);
|
||||
if (IS_ERR(addr))
|
||||
return PTR_ERR(addr);
|
||||
|
|
|
@ -16,7 +16,7 @@ struct dev_dax *__dax_pmem_probe(struct device *dev, enum dev_dax_subsys subsys)
|
|||
struct dev_dax *dev_dax;
|
||||
struct nd_namespace_io *nsio;
|
||||
struct dax_region *dax_region;
|
||||
struct dev_pagemap pgmap = { 0 };
|
||||
struct dev_pagemap pgmap = { };
|
||||
struct nd_namespace_common *ndns;
|
||||
struct nd_dax *nd_dax = to_nd_dax(dev);
|
||||
struct nd_pfn *nd_pfn = &nd_dax->nd_pfn;
|
||||
|
|
|
@ -84,11 +84,11 @@ config DRM_NOUVEAU_BACKLIGHT
|
|||
|
||||
config DRM_NOUVEAU_SVM
|
||||
bool "(EXPERIMENTAL) Enable SVM (Shared Virtual Memory) support"
|
||||
depends on ARCH_HAS_HMM
|
||||
depends on DEVICE_PRIVATE
|
||||
depends on DRM_NOUVEAU
|
||||
depends on HMM_MIRROR
|
||||
depends on STAGING
|
||||
select HMM_MIRROR
|
||||
select DEVICE_PRIVATE
|
||||
select MIGRATE_VMA_HELPER
|
||||
default n
|
||||
help
|
||||
Say Y here if you want to enable experimental support for
|
||||
|
|
|
@ -72,7 +72,8 @@ struct nouveau_dmem_migrate {
|
|||
};
|
||||
|
||||
struct nouveau_dmem {
|
||||
struct hmm_devmem *devmem;
|
||||
struct nouveau_drm *drm;
|
||||
struct dev_pagemap pagemap;
|
||||
struct nouveau_dmem_migrate migrate;
|
||||
struct list_head chunk_free;
|
||||
struct list_head chunk_full;
|
||||
|
@ -80,6 +81,11 @@ struct nouveau_dmem {
|
|||
struct mutex mutex;
|
||||
};
|
||||
|
||||
static inline struct nouveau_dmem *page_to_dmem(struct page *page)
|
||||
{
|
||||
return container_of(page->pgmap, struct nouveau_dmem, pagemap);
|
||||
}
|
||||
|
||||
struct nouveau_dmem_fault {
|
||||
struct nouveau_drm *drm;
|
||||
struct nouveau_fence *fence;
|
||||
|
@ -96,14 +102,10 @@ struct nouveau_migrate {
|
|||
unsigned long dma_nr;
|
||||
};
|
||||
|
||||
static void
|
||||
nouveau_dmem_free(struct hmm_devmem *devmem, struct page *page)
|
||||
static void nouveau_dmem_page_free(struct page *page)
|
||||
{
|
||||
struct nouveau_dmem_chunk *chunk;
|
||||
unsigned long idx;
|
||||
|
||||
chunk = (void *)hmm_devmem_page_get_drvdata(page);
|
||||
idx = page_to_pfn(page) - chunk->pfn_first;
|
||||
struct nouveau_dmem_chunk *chunk = page->zone_device_data;
|
||||
unsigned long idx = page_to_pfn(page) - chunk->pfn_first;
|
||||
|
||||
/*
|
||||
* FIXME:
|
||||
|
@ -148,11 +150,12 @@ nouveau_dmem_fault_alloc_and_copy(struct vm_area_struct *vma,
|
|||
if (!spage || !(src_pfns[i] & MIGRATE_PFN_MIGRATE))
|
||||
continue;
|
||||
|
||||
dpage = hmm_vma_alloc_locked_page(vma, addr);
|
||||
dpage = alloc_page_vma(GFP_HIGHUSER, vma, addr);
|
||||
if (!dpage) {
|
||||
dst_pfns[i] = MIGRATE_PFN_ERROR;
|
||||
continue;
|
||||
}
|
||||
lock_page(dpage);
|
||||
|
||||
dst_pfns[i] = migrate_pfn(page_to_pfn(dpage)) |
|
||||
MIGRATE_PFN_LOCKED;
|
||||
|
@ -194,7 +197,7 @@ nouveau_dmem_fault_alloc_and_copy(struct vm_area_struct *vma,
|
|||
|
||||
dst_addr = fault->dma[fault->npages++];
|
||||
|
||||
chunk = (void *)hmm_devmem_page_get_drvdata(spage);
|
||||
chunk = spage->zone_device_data;
|
||||
src_addr = page_to_pfn(spage) - chunk->pfn_first;
|
||||
src_addr = (src_addr << PAGE_SHIFT) + chunk->bo->bo.offset;
|
||||
|
||||
|
@ -259,29 +262,21 @@ static const struct migrate_vma_ops nouveau_dmem_fault_migrate_ops = {
|
|||
.finalize_and_map = nouveau_dmem_fault_finalize_and_map,
|
||||
};
|
||||
|
||||
static vm_fault_t
|
||||
nouveau_dmem_fault(struct hmm_devmem *devmem,
|
||||
struct vm_area_struct *vma,
|
||||
unsigned long addr,
|
||||
const struct page *page,
|
||||
unsigned int flags,
|
||||
pmd_t *pmdp)
|
||||
static vm_fault_t nouveau_dmem_migrate_to_ram(struct vm_fault *vmf)
|
||||
{
|
||||
struct drm_device *drm_dev = dev_get_drvdata(devmem->device);
|
||||
struct nouveau_dmem *dmem = page_to_dmem(vmf->page);
|
||||
unsigned long src[1] = {0}, dst[1] = {0};
|
||||
struct nouveau_dmem_fault fault = {0};
|
||||
struct nouveau_dmem_fault fault = { .drm = dmem->drm };
|
||||
int ret;
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* FIXME what we really want is to find some heuristic to migrate more
|
||||
* than just one page on CPU fault. When such fault happens it is very
|
||||
* likely that more surrounding page will CPU fault too.
|
||||
*/
|
||||
fault.drm = nouveau_drm(drm_dev);
|
||||
ret = migrate_vma(&nouveau_dmem_fault_migrate_ops, vma, addr,
|
||||
addr + PAGE_SIZE, src, dst, &fault);
|
||||
ret = migrate_vma(&nouveau_dmem_fault_migrate_ops, vmf->vma,
|
||||
vmf->address, vmf->address + PAGE_SIZE,
|
||||
src, dst, &fault);
|
||||
if (ret)
|
||||
return VM_FAULT_SIGBUS;
|
||||
|
||||
|
@ -291,10 +286,9 @@ nouveau_dmem_fault(struct hmm_devmem *devmem,
|
|||
return 0;
|
||||
}
|
||||
|
||||
static const struct hmm_devmem_ops
|
||||
nouveau_dmem_devmem_ops = {
|
||||
.free = nouveau_dmem_free,
|
||||
.fault = nouveau_dmem_fault,
|
||||
static const struct dev_pagemap_ops nouveau_dmem_pagemap_ops = {
|
||||
.page_free = nouveau_dmem_page_free,
|
||||
.migrate_to_ram = nouveau_dmem_migrate_to_ram,
|
||||
};
|
||||
|
||||
static int
|
||||
|
@ -580,7 +574,8 @@ void
|
|||
nouveau_dmem_init(struct nouveau_drm *drm)
|
||||
{
|
||||
struct device *device = drm->dev->dev;
|
||||
unsigned long i, size;
|
||||
struct resource *res;
|
||||
unsigned long i, size, pfn_first;
|
||||
int ret;
|
||||
|
||||
/* This only make sense on PASCAL or newer */
|
||||
|
@ -590,6 +585,7 @@ nouveau_dmem_init(struct nouveau_drm *drm)
|
|||
if (!(drm->dmem = kzalloc(sizeof(*drm->dmem), GFP_KERNEL)))
|
||||
return;
|
||||
|
||||
drm->dmem->drm = drm;
|
||||
mutex_init(&drm->dmem->mutex);
|
||||
INIT_LIST_HEAD(&drm->dmem->chunk_free);
|
||||
INIT_LIST_HEAD(&drm->dmem->chunk_full);
|
||||
|
@ -599,11 +595,8 @@ nouveau_dmem_init(struct nouveau_drm *drm)
|
|||
|
||||
/* Initialize migration dma helpers before registering memory */
|
||||
ret = nouveau_dmem_migrate_init(drm);
|
||||
if (ret) {
|
||||
kfree(drm->dmem);
|
||||
drm->dmem = NULL;
|
||||
return;
|
||||
}
|
||||
if (ret)
|
||||
goto out_free;
|
||||
|
||||
/*
|
||||
* FIXME we need some kind of policy to decide how much VRAM we
|
||||
|
@ -611,14 +604,16 @@ nouveau_dmem_init(struct nouveau_drm *drm)
|
|||
* and latter if we want to do thing like over commit then we
|
||||
* could revisit this.
|
||||
*/
|
||||
drm->dmem->devmem = hmm_devmem_add(&nouveau_dmem_devmem_ops,
|
||||
device, size);
|
||||
if (IS_ERR(drm->dmem->devmem)) {
|
||||
kfree(drm->dmem);
|
||||
drm->dmem = NULL;
|
||||
return;
|
||||
}
|
||||
res = devm_request_free_mem_region(device, &iomem_resource, size);
|
||||
if (IS_ERR(res))
|
||||
goto out_free;
|
||||
drm->dmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
|
||||
drm->dmem->pagemap.res = *res;
|
||||
drm->dmem->pagemap.ops = &nouveau_dmem_pagemap_ops;
|
||||
if (IS_ERR(devm_memremap_pages(device, &drm->dmem->pagemap)))
|
||||
goto out_free;
|
||||
|
||||
pfn_first = res->start >> PAGE_SHIFT;
|
||||
for (i = 0; i < (size / DMEM_CHUNK_SIZE); ++i) {
|
||||
struct nouveau_dmem_chunk *chunk;
|
||||
struct page *page;
|
||||
|
@ -631,17 +626,19 @@ nouveau_dmem_init(struct nouveau_drm *drm)
|
|||
}
|
||||
|
||||
chunk->drm = drm;
|
||||
chunk->pfn_first = drm->dmem->devmem->pfn_first;
|
||||
chunk->pfn_first += (i * DMEM_CHUNK_NPAGES);
|
||||
chunk->pfn_first = pfn_first + (i * DMEM_CHUNK_NPAGES);
|
||||
list_add_tail(&chunk->list, &drm->dmem->chunk_empty);
|
||||
|
||||
page = pfn_to_page(chunk->pfn_first);
|
||||
for (j = 0; j < DMEM_CHUNK_NPAGES; ++j, ++page) {
|
||||
hmm_devmem_page_set_drvdata(page, (long)chunk);
|
||||
}
|
||||
for (j = 0; j < DMEM_CHUNK_NPAGES; ++j, ++page)
|
||||
page->zone_device_data = chunk;
|
||||
}
|
||||
|
||||
NV_INFO(drm, "DMEM: registered %ldMB of device memory\n", size >> 20);
|
||||
return;
|
||||
out_free:
|
||||
kfree(drm->dmem);
|
||||
drm->dmem = NULL;
|
||||
}
|
||||
|
||||
static void
|
||||
|
@ -697,7 +694,7 @@ nouveau_dmem_migrate_alloc_and_copy(struct vm_area_struct *vma,
|
|||
if (!dpage || dst_pfns[i] == MIGRATE_PFN_ERROR)
|
||||
continue;
|
||||
|
||||
chunk = (void *)hmm_devmem_page_get_drvdata(dpage);
|
||||
chunk = dpage->zone_device_data;
|
||||
dst_addr = page_to_pfn(dpage) - chunk->pfn_first;
|
||||
dst_addr = (dst_addr << PAGE_SHIFT) + chunk->bo->bo.offset;
|
||||
|
||||
|
@ -832,13 +829,7 @@ out:
|
|||
static inline bool
|
||||
nouveau_dmem_page(struct nouveau_drm *drm, struct page *page)
|
||||
{
|
||||
if (!is_device_private_page(page))
|
||||
return false;
|
||||
|
||||
if (drm->dmem->devmem != page->pgmap->data)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
return is_device_private_page(page) && drm->dmem == page_to_dmem(page);
|
||||
}
|
||||
|
||||
void
|
||||
|
@ -867,7 +858,7 @@ nouveau_dmem_convert_pfn(struct nouveau_drm *drm,
|
|||
continue;
|
||||
}
|
||||
|
||||
chunk = (void *)hmm_devmem_page_get_drvdata(page);
|
||||
chunk = page->zone_device_data;
|
||||
addr = page_to_pfn(page) - chunk->pfn_first;
|
||||
addr = (addr + chunk->bo->bo.mem.start) << PAGE_SHIFT;
|
||||
|
||||
|
|
|
@ -622,7 +622,6 @@ static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap)
|
|||
if (offset < reserve)
|
||||
return -EINVAL;
|
||||
nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns);
|
||||
pgmap->altmap_valid = false;
|
||||
} else if (nd_pfn->mode == PFN_MODE_PMEM) {
|
||||
nd_pfn->npfns = PFN_SECTION_ALIGN_UP((resource_size(res)
|
||||
- offset) / PAGE_SIZE);
|
||||
|
@ -634,7 +633,7 @@ static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap)
|
|||
memcpy(altmap, &__altmap, sizeof(*altmap));
|
||||
altmap->free = PHYS_PFN(offset - reserve);
|
||||
altmap->alloc = 0;
|
||||
pgmap->altmap_valid = true;
|
||||
pgmap->flags |= PGMAP_ALTMAP_VALID;
|
||||
} else
|
||||
return -ENXIO;
|
||||
|
||||
|
|
|
@ -303,24 +303,24 @@ static const struct attribute_group *pmem_attribute_groups[] = {
|
|||
NULL,
|
||||
};
|
||||
|
||||
static void __pmem_release_queue(struct percpu_ref *ref)
|
||||
static void pmem_pagemap_cleanup(struct dev_pagemap *pgmap)
|
||||
{
|
||||
struct request_queue *q;
|
||||
struct request_queue *q =
|
||||
container_of(pgmap->ref, struct request_queue, q_usage_counter);
|
||||
|
||||
q = container_of(ref, typeof(*q), q_usage_counter);
|
||||
blk_cleanup_queue(q);
|
||||
}
|
||||
|
||||
static void pmem_release_queue(void *ref)
|
||||
static void pmem_release_queue(void *pgmap)
|
||||
{
|
||||
__pmem_release_queue(ref);
|
||||
pmem_pagemap_cleanup(pgmap);
|
||||
}
|
||||
|
||||
static void pmem_freeze_queue(struct percpu_ref *ref)
|
||||
static void pmem_pagemap_kill(struct dev_pagemap *pgmap)
|
||||
{
|
||||
struct request_queue *q;
|
||||
struct request_queue *q =
|
||||
container_of(pgmap->ref, struct request_queue, q_usage_counter);
|
||||
|
||||
q = container_of(ref, typeof(*q), q_usage_counter);
|
||||
blk_freeze_queue_start(q);
|
||||
}
|
||||
|
||||
|
@ -334,26 +334,16 @@ static void pmem_release_disk(void *__pmem)
|
|||
put_disk(pmem->disk);
|
||||
}
|
||||
|
||||
static void pmem_release_pgmap_ops(void *__pgmap)
|
||||
{
|
||||
dev_pagemap_put_ops();
|
||||
}
|
||||
|
||||
static void fsdax_pagefree(struct page *page, void *data)
|
||||
static void pmem_pagemap_page_free(struct page *page)
|
||||
{
|
||||
wake_up_var(&page->_refcount);
|
||||
}
|
||||
|
||||
static int setup_pagemap_fsdax(struct device *dev, struct dev_pagemap *pgmap)
|
||||
{
|
||||
dev_pagemap_get_ops();
|
||||
if (devm_add_action_or_reset(dev, pmem_release_pgmap_ops, pgmap))
|
||||
return -ENOMEM;
|
||||
pgmap->type = MEMORY_DEVICE_FS_DAX;
|
||||
pgmap->page_free = fsdax_pagefree;
|
||||
|
||||
return 0;
|
||||
}
|
||||
static const struct dev_pagemap_ops fsdax_pagemap_ops = {
|
||||
.page_free = pmem_pagemap_page_free,
|
||||
.kill = pmem_pagemap_kill,
|
||||
.cleanup = pmem_pagemap_cleanup,
|
||||
};
|
||||
|
||||
static int pmem_attach_disk(struct device *dev,
|
||||
struct nd_namespace_common *ndns)
|
||||
|
@ -409,11 +399,9 @@ static int pmem_attach_disk(struct device *dev,
|
|||
|
||||
pmem->pfn_flags = PFN_DEV;
|
||||
pmem->pgmap.ref = &q->q_usage_counter;
|
||||
pmem->pgmap.kill = pmem_freeze_queue;
|
||||
pmem->pgmap.cleanup = __pmem_release_queue;
|
||||
if (is_nd_pfn(dev)) {
|
||||
if (setup_pagemap_fsdax(dev, &pmem->pgmap))
|
||||
return -ENOMEM;
|
||||
pmem->pgmap.type = MEMORY_DEVICE_FS_DAX;
|
||||
pmem->pgmap.ops = &fsdax_pagemap_ops;
|
||||
addr = devm_memremap_pages(dev, &pmem->pgmap);
|
||||
pfn_sb = nd_pfn->pfn_sb;
|
||||
pmem->data_offset = le64_to_cpu(pfn_sb->dataoff);
|
||||
|
@ -424,15 +412,14 @@ static int pmem_attach_disk(struct device *dev,
|
|||
bb_res.start += pmem->data_offset;
|
||||
} else if (pmem_should_map_pages(dev)) {
|
||||
memcpy(&pmem->pgmap.res, &nsio->res, sizeof(pmem->pgmap.res));
|
||||
pmem->pgmap.altmap_valid = false;
|
||||
if (setup_pagemap_fsdax(dev, &pmem->pgmap))
|
||||
return -ENOMEM;
|
||||
pmem->pgmap.type = MEMORY_DEVICE_FS_DAX;
|
||||
pmem->pgmap.ops = &fsdax_pagemap_ops;
|
||||
addr = devm_memremap_pages(dev, &pmem->pgmap);
|
||||
pmem->pfn_flags |= PFN_MAP;
|
||||
memcpy(&bb_res, &pmem->pgmap.res, sizeof(bb_res));
|
||||
} else {
|
||||
if (devm_add_action_or_reset(dev, pmem_release_queue,
|
||||
&q->q_usage_counter))
|
||||
&pmem->pgmap))
|
||||
return -ENOMEM;
|
||||
addr = devm_memremap(dev, pmem->phys_addr,
|
||||
pmem->size, ARCH_MEMREMAP_PMEM);
|
||||
|
|
|
@ -25,12 +25,6 @@ struct pci_p2pdma {
|
|||
bool p2pmem_published;
|
||||
};
|
||||
|
||||
struct p2pdma_pagemap {
|
||||
struct dev_pagemap pgmap;
|
||||
struct percpu_ref ref;
|
||||
struct completion ref_done;
|
||||
};
|
||||
|
||||
static ssize_t size_show(struct device *dev, struct device_attribute *attr,
|
||||
char *buf)
|
||||
{
|
||||
|
@ -79,31 +73,6 @@ static const struct attribute_group p2pmem_group = {
|
|||
.name = "p2pmem",
|
||||
};
|
||||
|
||||
static struct p2pdma_pagemap *to_p2p_pgmap(struct percpu_ref *ref)
|
||||
{
|
||||
return container_of(ref, struct p2pdma_pagemap, ref);
|
||||
}
|
||||
|
||||
static void pci_p2pdma_percpu_release(struct percpu_ref *ref)
|
||||
{
|
||||
struct p2pdma_pagemap *p2p_pgmap = to_p2p_pgmap(ref);
|
||||
|
||||
complete(&p2p_pgmap->ref_done);
|
||||
}
|
||||
|
||||
static void pci_p2pdma_percpu_kill(struct percpu_ref *ref)
|
||||
{
|
||||
percpu_ref_kill(ref);
|
||||
}
|
||||
|
||||
static void pci_p2pdma_percpu_cleanup(struct percpu_ref *ref)
|
||||
{
|
||||
struct p2pdma_pagemap *p2p_pgmap = to_p2p_pgmap(ref);
|
||||
|
||||
wait_for_completion(&p2p_pgmap->ref_done);
|
||||
percpu_ref_exit(&p2p_pgmap->ref);
|
||||
}
|
||||
|
||||
static void pci_p2pdma_release(void *data)
|
||||
{
|
||||
struct pci_dev *pdev = data;
|
||||
|
@ -166,7 +135,6 @@ out:
|
|||
int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
|
||||
u64 offset)
|
||||
{
|
||||
struct p2pdma_pagemap *p2p_pgmap;
|
||||
struct dev_pagemap *pgmap;
|
||||
void *addr;
|
||||
int error;
|
||||
|
@ -189,27 +157,15 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
|
|||
return error;
|
||||
}
|
||||
|
||||
p2p_pgmap = devm_kzalloc(&pdev->dev, sizeof(*p2p_pgmap), GFP_KERNEL);
|
||||
if (!p2p_pgmap)
|
||||
pgmap = devm_kzalloc(&pdev->dev, sizeof(*pgmap), GFP_KERNEL);
|
||||
if (!pgmap)
|
||||
return -ENOMEM;
|
||||
|
||||
init_completion(&p2p_pgmap->ref_done);
|
||||
error = percpu_ref_init(&p2p_pgmap->ref,
|
||||
pci_p2pdma_percpu_release, 0, GFP_KERNEL);
|
||||
if (error)
|
||||
goto pgmap_free;
|
||||
|
||||
pgmap = &p2p_pgmap->pgmap;
|
||||
|
||||
pgmap->res.start = pci_resource_start(pdev, bar) + offset;
|
||||
pgmap->res.end = pgmap->res.start + size - 1;
|
||||
pgmap->res.flags = pci_resource_flags(pdev, bar);
|
||||
pgmap->ref = &p2p_pgmap->ref;
|
||||
pgmap->type = MEMORY_DEVICE_PCI_P2PDMA;
|
||||
pgmap->pci_p2pdma_bus_offset = pci_bus_address(pdev, bar) -
|
||||
pci_resource_start(pdev, bar);
|
||||
pgmap->kill = pci_p2pdma_percpu_kill;
|
||||
pgmap->cleanup = pci_p2pdma_percpu_cleanup;
|
||||
|
||||
addr = devm_memremap_pages(&pdev->dev, pgmap);
|
||||
if (IS_ERR(addr)) {
|
||||
|
@ -220,7 +176,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
|
|||
error = gen_pool_add_owner(pdev->p2pdma->pool, (unsigned long)addr,
|
||||
pci_bus_address(pdev, bar) + offset,
|
||||
resource_size(&pgmap->res), dev_to_node(&pdev->dev),
|
||||
&p2p_pgmap->ref);
|
||||
pgmap->ref);
|
||||
if (error)
|
||||
goto pages_free;
|
||||
|
||||
|
@ -232,7 +188,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
|
|||
pages_free:
|
||||
devm_memunmap_pages(&pdev->dev, pgmap);
|
||||
pgmap_free:
|
||||
devm_kfree(&pdev->dev, p2p_pgmap);
|
||||
devm_kfree(&pdev->dev, pgmap);
|
||||
return error;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(pci_p2pdma_add_resource);
|
||||
|
|
|
@ -1279,7 +1279,7 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
|
|||
if (pm->show_pfn)
|
||||
frame = pte_pfn(pte);
|
||||
flags |= PM_PRESENT;
|
||||
page = _vm_normal_page(vma, addr, pte, true);
|
||||
page = vm_normal_page(vma, addr, pte);
|
||||
if (pte_soft_dirty(pte))
|
||||
flags |= PM_SOFT_DIRTY;
|
||||
} else if (is_swap_pte(pte)) {
|
||||
|
|
|
@ -62,7 +62,7 @@
|
|||
#include <linux/kconfig.h>
|
||||
#include <asm/pgtable.h>
|
||||
|
||||
#if IS_ENABLED(CONFIG_HMM)
|
||||
#ifdef CONFIG_HMM_MIRROR
|
||||
|
||||
#include <linux/device.h>
|
||||
#include <linux/migrate.h>
|
||||
|
@ -324,9 +324,6 @@ static inline uint64_t hmm_pfn_from_pfn(const struct hmm_range *range,
|
|||
return hmm_device_entry_from_pfn(range, pfn);
|
||||
}
|
||||
|
||||
|
||||
|
||||
#if IS_ENABLED(CONFIG_HMM_MIRROR)
|
||||
/*
|
||||
* Mirroring: how to synchronize device page table with CPU page table.
|
||||
*
|
||||
|
@ -550,197 +547,4 @@ static inline void hmm_mm_init(struct mm_struct *mm)
|
|||
static inline void hmm_mm_init(struct mm_struct *mm) {}
|
||||
#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
|
||||
|
||||
#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
|
||||
struct hmm_devmem;
|
||||
|
||||
struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma,
|
||||
unsigned long addr);
|
||||
|
||||
/*
|
||||
* struct hmm_devmem_ops - callback for ZONE_DEVICE memory events
|
||||
*
|
||||
* @free: call when refcount on page reach 1 and thus is no longer use
|
||||
* @fault: call when there is a page fault to unaddressable memory
|
||||
*
|
||||
* Both callback happens from page_free() and page_fault() callback of struct
|
||||
* dev_pagemap respectively. See include/linux/memremap.h for more details on
|
||||
* those.
|
||||
*
|
||||
* The hmm_devmem_ops callback are just here to provide a coherent and
|
||||
* uniq API to device driver and device driver should not register their
|
||||
* own page_free() or page_fault() but rely on the hmm_devmem_ops call-
|
||||
* back.
|
||||
*/
|
||||
struct hmm_devmem_ops {
|
||||
/*
|
||||
* free() - free a device page
|
||||
* @devmem: device memory structure (see struct hmm_devmem)
|
||||
* @page: pointer to struct page being freed
|
||||
*
|
||||
* Call back occurs whenever a device page refcount reach 1 which
|
||||
* means that no one is holding any reference on the page anymore
|
||||
* (ZONE_DEVICE page have an elevated refcount of 1 as default so
|
||||
* that they are not release to the general page allocator).
|
||||
*
|
||||
* Note that callback has exclusive ownership of the page (as no
|
||||
* one is holding any reference).
|
||||
*/
|
||||
void (*free)(struct hmm_devmem *devmem, struct page *page);
|
||||
/*
|
||||
* fault() - CPU page fault or get user page (GUP)
|
||||
* @devmem: device memory structure (see struct hmm_devmem)
|
||||
* @vma: virtual memory area containing the virtual address
|
||||
* @addr: virtual address that faulted or for which there is a GUP
|
||||
* @page: pointer to struct page backing virtual address (unreliable)
|
||||
* @flags: FAULT_FLAG_* (see include/linux/mm.h)
|
||||
* @pmdp: page middle directory
|
||||
* Return: VM_FAULT_MINOR/MAJOR on success or one of VM_FAULT_ERROR
|
||||
* on error
|
||||
*
|
||||
* The callback occurs whenever there is a CPU page fault or GUP on a
|
||||
* virtual address. This means that the device driver must migrate the
|
||||
* page back to regular memory (CPU accessible).
|
||||
*
|
||||
* The device driver is free to migrate more than one page from the
|
||||
* fault() callback as an optimization. However if the device decides
|
||||
* to migrate more than one page it must always priotirize the faulting
|
||||
* address over the others.
|
||||
*
|
||||
* The struct page pointer is only given as a hint to allow quick
|
||||
* lookup of internal device driver data. A concurrent migration
|
||||
* might have already freed that page and the virtual address might
|
||||
* no longer be backed by it. So it should not be modified by the
|
||||
* callback.
|
||||
*
|
||||
* Note that mmap semaphore is held in read mode at least when this
|
||||
* callback occurs, hence the vma is valid upon callback entry.
|
||||
*/
|
||||
vm_fault_t (*fault)(struct hmm_devmem *devmem,
|
||||
struct vm_area_struct *vma,
|
||||
unsigned long addr,
|
||||
const struct page *page,
|
||||
unsigned int flags,
|
||||
pmd_t *pmdp);
|
||||
};
|
||||
|
||||
/*
|
||||
* struct hmm_devmem - track device memory
|
||||
*
|
||||
* @completion: completion object for device memory
|
||||
* @pfn_first: first pfn for this resource (set by hmm_devmem_add())
|
||||
* @pfn_last: last pfn for this resource (set by hmm_devmem_add())
|
||||
* @resource: IO resource reserved for this chunk of memory
|
||||
* @pagemap: device page map for that chunk
|
||||
* @device: device to bind resource to
|
||||
* @ops: memory operations callback
|
||||
* @ref: per CPU refcount
|
||||
* @page_fault: callback when CPU fault on an unaddressable device page
|
||||
*
|
||||
* This is a helper structure for device drivers that do not wish to implement
|
||||
* the gory details related to hotplugging new memoy and allocating struct
|
||||
* pages.
|
||||
*
|
||||
* Device drivers can directly use ZONE_DEVICE memory on their own if they
|
||||
* wish to do so.
|
||||
*
|
||||
* The page_fault() callback must migrate page back, from device memory to
|
||||
* system memory, so that the CPU can access it. This might fail for various
|
||||
* reasons (device issues, device have been unplugged, ...). When such error
|
||||
* conditions happen, the page_fault() callback must return VM_FAULT_SIGBUS and
|
||||
* set the CPU page table entry to "poisoned".
|
||||
*
|
||||
* Note that because memory cgroup charges are transferred to the device memory,
|
||||
* this should never fail due to memory restrictions. However, allocation
|
||||
* of a regular system page might still fail because we are out of memory. If
|
||||
* that happens, the page_fault() callback must return VM_FAULT_OOM.
|
||||
*
|
||||
* The page_fault() callback can also try to migrate back multiple pages in one
|
||||
* chunk, as an optimization. It must, however, prioritize the faulting address
|
||||
* over all the others.
|
||||
*/
|
||||
typedef vm_fault_t (*dev_page_fault_t)(struct vm_area_struct *vma,
|
||||
unsigned long addr,
|
||||
const struct page *page,
|
||||
unsigned int flags,
|
||||
pmd_t *pmdp);
|
||||
|
||||
struct hmm_devmem {
|
||||
struct completion completion;
|
||||
unsigned long pfn_first;
|
||||
unsigned long pfn_last;
|
||||
struct resource *resource;
|
||||
struct device *device;
|
||||
struct dev_pagemap pagemap;
|
||||
const struct hmm_devmem_ops *ops;
|
||||
struct percpu_ref ref;
|
||||
dev_page_fault_t page_fault;
|
||||
};
|
||||
|
||||
/*
|
||||
* To add (hotplug) device memory, HMM assumes that there is no real resource
|
||||
* that reserves a range in the physical address space (this is intended to be
|
||||
* use by unaddressable device memory). It will reserve a physical range big
|
||||
* enough and allocate struct page for it.
|
||||
*
|
||||
* The device driver can wrap the hmm_devmem struct inside a private device
|
||||
* driver struct.
|
||||
*/
|
||||
struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
|
||||
struct device *device,
|
||||
unsigned long size);
|
||||
struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
|
||||
struct device *device,
|
||||
struct resource *res);
|
||||
|
||||
/*
|
||||
* hmm_devmem_page_set_drvdata - set per-page driver data field
|
||||
*
|
||||
* @page: pointer to struct page
|
||||
* @data: driver data value to set
|
||||
*
|
||||
* Because page can not be on lru we have an unsigned long that driver can use
|
||||
* to store a per page field. This just a simple helper to do that.
|
||||
*/
|
||||
static inline void hmm_devmem_page_set_drvdata(struct page *page,
|
||||
unsigned long data)
|
||||
{
|
||||
page->hmm_data = data;
|
||||
}
|
||||
|
||||
/*
|
||||
* hmm_devmem_page_get_drvdata - get per page driver data field
|
||||
*
|
||||
* @page: pointer to struct page
|
||||
* Return: driver data value
|
||||
*/
|
||||
static inline unsigned long hmm_devmem_page_get_drvdata(const struct page *page)
|
||||
{
|
||||
return page->hmm_data;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* struct hmm_device - fake device to hang device memory onto
|
||||
*
|
||||
* @device: device struct
|
||||
* @minor: device minor number
|
||||
*/
|
||||
struct hmm_device {
|
||||
struct device device;
|
||||
unsigned int minor;
|
||||
};
|
||||
|
||||
/*
|
||||
* A device driver that wants to handle multiple devices memory through a
|
||||
* single fake device can use hmm_device to do so. This is purely a helper and
|
||||
* it is not strictly needed, in order to make use of any HMM functionality.
|
||||
*/
|
||||
struct hmm_device *hmm_device_new(void *drvdata);
|
||||
void hmm_device_put(struct hmm_device *hmm_device);
|
||||
#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
|
||||
#else /* IS_ENABLED(CONFIG_HMM) */
|
||||
static inline void hmm_mm_destroy(struct mm_struct *mm) {}
|
||||
static inline void hmm_mm_init(struct mm_struct *mm) {}
|
||||
#endif /* IS_ENABLED(CONFIG_HMM) */
|
||||
|
||||
#endif /* LINUX_HMM_H */
|
||||
|
|
|
@ -132,7 +132,6 @@ enum {
|
|||
IORES_DESC_PERSISTENT_MEMORY = 4,
|
||||
IORES_DESC_PERSISTENT_MEMORY_LEGACY = 5,
|
||||
IORES_DESC_DEVICE_PRIVATE_MEMORY = 6,
|
||||
IORES_DESC_DEVICE_PUBLIC_MEMORY = 7,
|
||||
};
|
||||
|
||||
/* helpers to define resources */
|
||||
|
@ -286,6 +285,8 @@ static inline bool resource_overlaps(struct resource *r1, struct resource *r2)
|
|||
return (r1->start <= r2->end && r1->end >= r2->start);
|
||||
}
|
||||
|
||||
struct resource *devm_request_free_mem_region(struct device *dev,
|
||||
struct resource *base, unsigned long size);
|
||||
|
||||
#endif /* __ASSEMBLY__ */
|
||||
#endif /* _LINUX_IOPORT_H */
|
||||
|
|
|
@ -37,13 +37,6 @@ struct vmem_altmap {
|
|||
* A more complete discussion of unaddressable memory may be found in
|
||||
* include/linux/hmm.h and Documentation/vm/hmm.rst.
|
||||
*
|
||||
* MEMORY_DEVICE_PUBLIC:
|
||||
* Device memory that is cache coherent from device and CPU point of view. This
|
||||
* is use on platform that have an advance system bus (like CAPI or CCIX). A
|
||||
* driver can hotplug the device memory using ZONE_DEVICE and with that memory
|
||||
* type. Any page of a process can be migrated to such memory. However no one
|
||||
* should be allow to pin such memory so that it can always be evicted.
|
||||
*
|
||||
* MEMORY_DEVICE_FS_DAX:
|
||||
* Host memory that has similar access semantics as System RAM i.e. DMA
|
||||
* coherent and supports page pinning. In support of coordinating page
|
||||
|
@ -52,54 +45,84 @@ struct vmem_altmap {
|
|||
* wakeup is used to coordinate physical address space management (ex:
|
||||
* fs truncate/hole punch) vs pinned pages (ex: device dma).
|
||||
*
|
||||
* MEMORY_DEVICE_DEVDAX:
|
||||
* Host memory that has similar access semantics as System RAM i.e. DMA
|
||||
* coherent and supports page pinning. In contrast to
|
||||
* MEMORY_DEVICE_FS_DAX, this memory is accessed via a device-dax
|
||||
* character device.
|
||||
*
|
||||
* MEMORY_DEVICE_PCI_P2PDMA:
|
||||
* Device memory residing in a PCI BAR intended for use with Peer-to-Peer
|
||||
* transactions.
|
||||
*/
|
||||
enum memory_type {
|
||||
/* 0 is reserved to catch uninitialized type fields */
|
||||
MEMORY_DEVICE_PRIVATE = 1,
|
||||
MEMORY_DEVICE_PUBLIC,
|
||||
MEMORY_DEVICE_FS_DAX,
|
||||
MEMORY_DEVICE_DEVDAX,
|
||||
MEMORY_DEVICE_PCI_P2PDMA,
|
||||
};
|
||||
|
||||
/*
|
||||
* Additional notes about MEMORY_DEVICE_PRIVATE may be found in
|
||||
* include/linux/hmm.h and Documentation/vm/hmm.rst. There is also a brief
|
||||
* explanation in include/linux/memory_hotplug.h.
|
||||
*
|
||||
* The page_free() callback is called once the page refcount reaches 1
|
||||
* (ZONE_DEVICE pages never reach 0 refcount unless there is a refcount bug.
|
||||
* This allows the device driver to implement its own memory management.)
|
||||
*/
|
||||
typedef void (*dev_page_free_t)(struct page *page, void *data);
|
||||
struct dev_pagemap_ops {
|
||||
/*
|
||||
* Called once the page refcount reaches 1. (ZONE_DEVICE pages never
|
||||
* reach 0 refcount unless there is a refcount bug. This allows the
|
||||
* device driver to implement its own memory management.)
|
||||
*/
|
||||
void (*page_free)(struct page *page);
|
||||
|
||||
/*
|
||||
* Transition the refcount in struct dev_pagemap to the dead state.
|
||||
*/
|
||||
void (*kill)(struct dev_pagemap *pgmap);
|
||||
|
||||
/*
|
||||
* Wait for refcount in struct dev_pagemap to be idle and reap it.
|
||||
*/
|
||||
void (*cleanup)(struct dev_pagemap *pgmap);
|
||||
|
||||
/*
|
||||
* Used for private (un-addressable) device memory only. Must migrate
|
||||
* the page back to a CPU accessible page.
|
||||
*/
|
||||
vm_fault_t (*migrate_to_ram)(struct vm_fault *vmf);
|
||||
};
|
||||
|
||||
#define PGMAP_ALTMAP_VALID (1 << 0)
|
||||
|
||||
/**
|
||||
* struct dev_pagemap - metadata for ZONE_DEVICE mappings
|
||||
* @page_free: free page callback when page refcount reaches 1
|
||||
* @altmap: pre-allocated/reserved memory for vmemmap allocations
|
||||
* @res: physical address range covered by @ref
|
||||
* @ref: reference count that pins the devm_memremap_pages() mapping
|
||||
* @kill: callback to transition @ref to the dead state
|
||||
* @cleanup: callback to wait for @ref to be idle and reap it
|
||||
* @internal_ref: internal reference if @ref is not provided by the caller
|
||||
* @done: completion for @internal_ref
|
||||
* @dev: host device of the mapping for debug
|
||||
* @data: private data pointer for page_free()
|
||||
* @type: memory type: see MEMORY_* in memory_hotplug.h
|
||||
* @flags: PGMAP_* flags to specify detailed behavior
|
||||
* @ops: method table
|
||||
*/
|
||||
struct dev_pagemap {
|
||||
dev_page_free_t page_free;
|
||||
struct vmem_altmap altmap;
|
||||
bool altmap_valid;
|
||||
struct resource res;
|
||||
struct percpu_ref *ref;
|
||||
void (*kill)(struct percpu_ref *ref);
|
||||
void (*cleanup)(struct percpu_ref *ref);
|
||||
struct percpu_ref internal_ref;
|
||||
struct completion done;
|
||||
struct device *dev;
|
||||
void *data;
|
||||
enum memory_type type;
|
||||
unsigned int flags;
|
||||
u64 pci_p2pdma_bus_offset;
|
||||
const struct dev_pagemap_ops *ops;
|
||||
};
|
||||
|
||||
static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap)
|
||||
{
|
||||
if (pgmap->flags & PGMAP_ALTMAP_VALID)
|
||||
return &pgmap->altmap;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_ZONE_DEVICE
|
||||
void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap);
|
||||
void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap);
|
||||
|
|
|
@ -932,8 +932,6 @@ static inline bool is_zone_device_page(const struct page *page)
|
|||
#endif
|
||||
|
||||
#ifdef CONFIG_DEV_PAGEMAP_OPS
|
||||
void dev_pagemap_get_ops(void);
|
||||
void dev_pagemap_put_ops(void);
|
||||
void __put_devmap_managed_page(struct page *page);
|
||||
DECLARE_STATIC_KEY_FALSE(devmap_managed_key);
|
||||
static inline bool put_devmap_managed_page(struct page *page)
|
||||
|
@ -944,7 +942,6 @@ static inline bool put_devmap_managed_page(struct page *page)
|
|||
return false;
|
||||
switch (page->pgmap->type) {
|
||||
case MEMORY_DEVICE_PRIVATE:
|
||||
case MEMORY_DEVICE_PUBLIC:
|
||||
case MEMORY_DEVICE_FS_DAX:
|
||||
__put_devmap_managed_page(page);
|
||||
return true;
|
||||
|
@ -960,12 +957,6 @@ static inline bool is_device_private_page(const struct page *page)
|
|||
page->pgmap->type == MEMORY_DEVICE_PRIVATE;
|
||||
}
|
||||
|
||||
static inline bool is_device_public_page(const struct page *page)
|
||||
{
|
||||
return is_zone_device_page(page) &&
|
||||
page->pgmap->type == MEMORY_DEVICE_PUBLIC;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_PCI_P2PDMA
|
||||
static inline bool is_pci_p2pdma_page(const struct page *page)
|
||||
{
|
||||
|
@ -980,14 +971,6 @@ static inline bool is_pci_p2pdma_page(const struct page *page)
|
|||
#endif /* CONFIG_PCI_P2PDMA */
|
||||
|
||||
#else /* CONFIG_DEV_PAGEMAP_OPS */
|
||||
static inline void dev_pagemap_get_ops(void)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void dev_pagemap_put_ops(void)
|
||||
{
|
||||
}
|
||||
|
||||
static inline bool put_devmap_managed_page(struct page *page)
|
||||
{
|
||||
return false;
|
||||
|
@ -998,11 +981,6 @@ static inline bool is_device_private_page(const struct page *page)
|
|||
return false;
|
||||
}
|
||||
|
||||
static inline bool is_device_public_page(const struct page *page)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline bool is_pci_p2pdma_page(const struct page *page)
|
||||
{
|
||||
return false;
|
||||
|
@ -1431,10 +1409,8 @@ struct zap_details {
|
|||
pgoff_t last_index; /* Highest page->index to unmap */
|
||||
};
|
||||
|
||||
struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
|
||||
pte_t pte, bool with_public_device);
|
||||
#define vm_normal_page(vma, addr, pte) _vm_normal_page(vma, addr, pte, false)
|
||||
|
||||
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
|
||||
pte_t pte);
|
||||
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
|
||||
pmd_t pmd);
|
||||
|
||||
|
|
|
@ -158,7 +158,7 @@ struct page {
|
|||
struct { /* ZONE_DEVICE pages */
|
||||
/** @pgmap: Points to the hosting device page map. */
|
||||
struct dev_pagemap *pgmap;
|
||||
unsigned long hmm_data;
|
||||
void *zone_device_data;
|
||||
unsigned long _zd_pad_1; /* uses mapping */
|
||||
};
|
||||
|
||||
|
@ -501,7 +501,7 @@ struct mm_struct {
|
|||
#endif
|
||||
struct work_struct async_put_work;
|
||||
|
||||
#if IS_ENABLED(CONFIG_HMM)
|
||||
#ifdef CONFIG_HMM_MIRROR
|
||||
/* HMM needs to track a few things per mm */
|
||||
struct hmm *hmm;
|
||||
#endif
|
||||
|
|
|
@ -129,12 +129,6 @@ static inline struct page *device_private_entry_to_page(swp_entry_t entry)
|
|||
{
|
||||
return pfn_to_page(swp_offset(entry));
|
||||
}
|
||||
|
||||
vm_fault_t device_private_entry_fault(struct vm_area_struct *vma,
|
||||
unsigned long addr,
|
||||
swp_entry_t entry,
|
||||
unsigned int flags,
|
||||
pmd_t *pmdp);
|
||||
#else /* CONFIG_DEVICE_PRIVATE */
|
||||
static inline swp_entry_t make_device_private_entry(struct page *page, bool write)
|
||||
{
|
||||
|
@ -164,15 +158,6 @@ static inline struct page *device_private_entry_to_page(swp_entry_t entry)
|
|||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static inline vm_fault_t device_private_entry_fault(struct vm_area_struct *vma,
|
||||
unsigned long addr,
|
||||
swp_entry_t entry,
|
||||
unsigned int flags,
|
||||
pmd_t *pmdp)
|
||||
{
|
||||
return VM_FAULT_SIGBUS;
|
||||
}
|
||||
#endif /* CONFIG_DEVICE_PRIVATE */
|
||||
|
||||
#ifdef CONFIG_MIGRATION
|
||||
|
|
|
@ -11,41 +11,39 @@
|
|||
#include <linux/types.h>
|
||||
#include <linux/wait_bit.h>
|
||||
#include <linux/xarray.h>
|
||||
#include <linux/hmm.h>
|
||||
|
||||
static DEFINE_XARRAY(pgmap_array);
|
||||
#define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1)
|
||||
#define SECTION_SIZE (1UL << PA_SECTION_SHIFT)
|
||||
|
||||
#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
|
||||
vm_fault_t device_private_entry_fault(struct vm_area_struct *vma,
|
||||
unsigned long addr,
|
||||
swp_entry_t entry,
|
||||
unsigned int flags,
|
||||
pmd_t *pmdp)
|
||||
#ifdef CONFIG_DEV_PAGEMAP_OPS
|
||||
DEFINE_STATIC_KEY_FALSE(devmap_managed_key);
|
||||
EXPORT_SYMBOL(devmap_managed_key);
|
||||
static atomic_t devmap_managed_enable;
|
||||
|
||||
static void devmap_managed_enable_put(void *data)
|
||||
{
|
||||
struct page *page = device_private_entry_to_page(entry);
|
||||
struct hmm_devmem *devmem;
|
||||
|
||||
devmem = container_of(page->pgmap, typeof(*devmem), pagemap);
|
||||
|
||||
/*
|
||||
* The page_fault() callback must migrate page back to system memory
|
||||
* so that CPU can access it. This might fail for various reasons
|
||||
* (device issue, device was unsafely unplugged, ...). When such
|
||||
* error conditions happen, the callback must return VM_FAULT_SIGBUS.
|
||||
*
|
||||
* Note that because memory cgroup charges are accounted to the device
|
||||
* memory, this should never fail because of memory restrictions (but
|
||||
* allocation of regular system page might still fail because we are
|
||||
* out of memory).
|
||||
*
|
||||
* There is a more in-depth description of what that callback can and
|
||||
* cannot do, in include/linux/memremap.h
|
||||
*/
|
||||
return devmem->page_fault(vma, addr, page, flags, pmdp);
|
||||
if (atomic_dec_and_test(&devmap_managed_enable))
|
||||
static_branch_disable(&devmap_managed_key);
|
||||
}
|
||||
#endif /* CONFIG_DEVICE_PRIVATE */
|
||||
|
||||
static int devmap_managed_enable_get(struct device *dev, struct dev_pagemap *pgmap)
|
||||
{
|
||||
if (!pgmap->ops || !pgmap->ops->page_free) {
|
||||
WARN(1, "Missing page_free method\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (atomic_inc_return(&devmap_managed_enable) == 1)
|
||||
static_branch_enable(&devmap_managed_key);
|
||||
return devm_add_action_or_reset(dev, devmap_managed_enable_put, NULL);
|
||||
}
|
||||
#else
|
||||
static int devmap_managed_enable_get(struct device *dev, struct dev_pagemap *pgmap)
|
||||
{
|
||||
return -EINVAL;
|
||||
}
|
||||
#endif /* CONFIG_DEV_PAGEMAP_OPS */
|
||||
|
||||
static void pgmap_array_delete(struct resource *res)
|
||||
{
|
||||
|
@ -56,14 +54,8 @@ static void pgmap_array_delete(struct resource *res)
|
|||
|
||||
static unsigned long pfn_first(struct dev_pagemap *pgmap)
|
||||
{
|
||||
const struct resource *res = &pgmap->res;
|
||||
struct vmem_altmap *altmap = &pgmap->altmap;
|
||||
unsigned long pfn;
|
||||
|
||||
pfn = res->start >> PAGE_SHIFT;
|
||||
if (pgmap->altmap_valid)
|
||||
pfn += vmem_altmap_offset(altmap);
|
||||
return pfn;
|
||||
return (pgmap->res.start >> PAGE_SHIFT) +
|
||||
vmem_altmap_offset(pgmap_altmap(pgmap));
|
||||
}
|
||||
|
||||
static unsigned long pfn_end(struct dev_pagemap *pgmap)
|
||||
|
@ -83,6 +75,24 @@ static unsigned long pfn_next(unsigned long pfn)
|
|||
#define for_each_device_pfn(pfn, map) \
|
||||
for (pfn = pfn_first(map); pfn < pfn_end(map); pfn = pfn_next(pfn))
|
||||
|
||||
static void dev_pagemap_kill(struct dev_pagemap *pgmap)
|
||||
{
|
||||
if (pgmap->ops && pgmap->ops->kill)
|
||||
pgmap->ops->kill(pgmap);
|
||||
else
|
||||
percpu_ref_kill(pgmap->ref);
|
||||
}
|
||||
|
||||
static void dev_pagemap_cleanup(struct dev_pagemap *pgmap)
|
||||
{
|
||||
if (pgmap->ops && pgmap->ops->cleanup) {
|
||||
pgmap->ops->cleanup(pgmap);
|
||||
} else {
|
||||
wait_for_completion(&pgmap->done);
|
||||
percpu_ref_exit(pgmap->ref);
|
||||
}
|
||||
}
|
||||
|
||||
static void devm_memremap_pages_release(void *data)
|
||||
{
|
||||
struct dev_pagemap *pgmap = data;
|
||||
|
@ -92,10 +102,10 @@ static void devm_memremap_pages_release(void *data)
|
|||
unsigned long pfn;
|
||||
int nid;
|
||||
|
||||
pgmap->kill(pgmap->ref);
|
||||
dev_pagemap_kill(pgmap);
|
||||
for_each_device_pfn(pfn, pgmap)
|
||||
put_page(pfn_to_page(pfn));
|
||||
pgmap->cleanup(pgmap->ref);
|
||||
dev_pagemap_cleanup(pgmap);
|
||||
|
||||
/* pages are dead and unused, undo the arch mapping */
|
||||
align_start = res->start & ~(SECTION_SIZE - 1);
|
||||
|
@ -111,7 +121,7 @@ static void devm_memremap_pages_release(void *data)
|
|||
align_size >> PAGE_SHIFT, NULL);
|
||||
} else {
|
||||
arch_remove_memory(nid, align_start, align_size,
|
||||
pgmap->altmap_valid ? &pgmap->altmap : NULL);
|
||||
pgmap_altmap(pgmap));
|
||||
kasan_remove_zero_shadow(__va(align_start), align_size);
|
||||
}
|
||||
mem_hotplug_done();
|
||||
|
@ -122,20 +132,29 @@ static void devm_memremap_pages_release(void *data)
|
|||
"%s: failed to free all reserved pages\n", __func__);
|
||||
}
|
||||
|
||||
static void dev_pagemap_percpu_release(struct percpu_ref *ref)
|
||||
{
|
||||
struct dev_pagemap *pgmap =
|
||||
container_of(ref, struct dev_pagemap, internal_ref);
|
||||
|
||||
complete(&pgmap->done);
|
||||
}
|
||||
|
||||
/**
|
||||
* devm_memremap_pages - remap and provide memmap backing for the given resource
|
||||
* @dev: hosting device for @res
|
||||
* @pgmap: pointer to a struct dev_pagemap
|
||||
*
|
||||
* Notes:
|
||||
* 1/ At a minimum the res, ref and type members of @pgmap must be initialized
|
||||
* 1/ At a minimum the res and type members of @pgmap must be initialized
|
||||
* by the caller before passing it to this function
|
||||
*
|
||||
* 2/ The altmap field may optionally be initialized, in which case altmap_valid
|
||||
* must be set to true
|
||||
* 2/ The altmap field may optionally be initialized, in which case
|
||||
* PGMAP_ALTMAP_VALID must be set in pgmap->flags.
|
||||
*
|
||||
* 3/ pgmap->ref must be 'live' on entry and will be killed and reaped
|
||||
* at devm_memremap_pages_release() time, or if this routine fails.
|
||||
* 3/ The ref field may optionally be provided, in which case pgmap->ref must be
|
||||
* 'live' on entry and will be killed and reaped at
|
||||
* devm_memremap_pages_release() time, or if this routine fails.
|
||||
*
|
||||
* 4/ res is expected to be a host memory range that could feasibly be
|
||||
* treated as a "System RAM" range, i.e. not a device mmio range, but
|
||||
|
@ -144,22 +163,66 @@ static void devm_memremap_pages_release(void *data)
|
|||
void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
|
||||
{
|
||||
resource_size_t align_start, align_size, align_end;
|
||||
struct vmem_altmap *altmap = pgmap->altmap_valid ?
|
||||
&pgmap->altmap : NULL;
|
||||
struct resource *res = &pgmap->res;
|
||||
struct dev_pagemap *conflict_pgmap;
|
||||
struct mhp_restrictions restrictions = {
|
||||
/*
|
||||
* We do not want any optional features only our own memmap
|
||||
*/
|
||||
.altmap = altmap,
|
||||
.altmap = pgmap_altmap(pgmap),
|
||||
};
|
||||
pgprot_t pgprot = PAGE_KERNEL;
|
||||
int error, nid, is_ram;
|
||||
bool need_devmap_managed = true;
|
||||
|
||||
if (!pgmap->ref || !pgmap->kill || !pgmap->cleanup) {
|
||||
WARN(1, "Missing reference count teardown definition\n");
|
||||
return ERR_PTR(-EINVAL);
|
||||
switch (pgmap->type) {
|
||||
case MEMORY_DEVICE_PRIVATE:
|
||||
if (!IS_ENABLED(CONFIG_DEVICE_PRIVATE)) {
|
||||
WARN(1, "Device private memory not supported\n");
|
||||
return ERR_PTR(-EINVAL);
|
||||
}
|
||||
if (!pgmap->ops || !pgmap->ops->migrate_to_ram) {
|
||||
WARN(1, "Missing migrate_to_ram method\n");
|
||||
return ERR_PTR(-EINVAL);
|
||||
}
|
||||
break;
|
||||
case MEMORY_DEVICE_FS_DAX:
|
||||
if (!IS_ENABLED(CONFIG_ZONE_DEVICE) ||
|
||||
IS_ENABLED(CONFIG_FS_DAX_LIMITED)) {
|
||||
WARN(1, "File system DAX not supported\n");
|
||||
return ERR_PTR(-EINVAL);
|
||||
}
|
||||
break;
|
||||
case MEMORY_DEVICE_DEVDAX:
|
||||
case MEMORY_DEVICE_PCI_P2PDMA:
|
||||
need_devmap_managed = false;
|
||||
break;
|
||||
default:
|
||||
WARN(1, "Invalid pgmap type %d\n", pgmap->type);
|
||||
break;
|
||||
}
|
||||
|
||||
if (!pgmap->ref) {
|
||||
if (pgmap->ops && (pgmap->ops->kill || pgmap->ops->cleanup))
|
||||
return ERR_PTR(-EINVAL);
|
||||
|
||||
init_completion(&pgmap->done);
|
||||
error = percpu_ref_init(&pgmap->internal_ref,
|
||||
dev_pagemap_percpu_release, 0, GFP_KERNEL);
|
||||
if (error)
|
||||
return ERR_PTR(error);
|
||||
pgmap->ref = &pgmap->internal_ref;
|
||||
} else {
|
||||
if (!pgmap->ops || !pgmap->ops->kill || !pgmap->ops->cleanup) {
|
||||
WARN(1, "Missing reference count teardown definition\n");
|
||||
return ERR_PTR(-EINVAL);
|
||||
}
|
||||
}
|
||||
|
||||
if (need_devmap_managed) {
|
||||
error = devmap_managed_enable_get(dev, pgmap);
|
||||
if (error)
|
||||
return ERR_PTR(error);
|
||||
}
|
||||
|
||||
align_start = res->start & ~(SECTION_SIZE - 1);
|
||||
|
@ -241,7 +304,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
|
|||
|
||||
zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE];
|
||||
move_pfn_range_to_zone(zone, align_start >> PAGE_SHIFT,
|
||||
align_size >> PAGE_SHIFT, altmap);
|
||||
align_size >> PAGE_SHIFT, pgmap_altmap(pgmap));
|
||||
}
|
||||
|
||||
mem_hotplug_done();
|
||||
|
@ -271,9 +334,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
|
|||
err_pfn_remap:
|
||||
pgmap_array_delete(res);
|
||||
err_array:
|
||||
pgmap->kill(pgmap->ref);
|
||||
pgmap->cleanup(pgmap->ref);
|
||||
|
||||
dev_pagemap_kill(pgmap);
|
||||
dev_pagemap_cleanup(pgmap);
|
||||
return ERR_PTR(error);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(devm_memremap_pages);
|
||||
|
@ -287,7 +349,9 @@ EXPORT_SYMBOL_GPL(devm_memunmap_pages);
|
|||
unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
|
||||
{
|
||||
/* number of pfns from base where pfn_to_page() is valid */
|
||||
return altmap->reserve + altmap->free;
|
||||
if (altmap)
|
||||
return altmap->reserve + altmap->free;
|
||||
return 0;
|
||||
}
|
||||
|
||||
void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns)
|
||||
|
@ -329,28 +393,6 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
|
|||
EXPORT_SYMBOL_GPL(get_dev_pagemap);
|
||||
|
||||
#ifdef CONFIG_DEV_PAGEMAP_OPS
|
||||
DEFINE_STATIC_KEY_FALSE(devmap_managed_key);
|
||||
EXPORT_SYMBOL(devmap_managed_key);
|
||||
static atomic_t devmap_enable;
|
||||
|
||||
/*
|
||||
* Toggle the static key for ->page_free() callbacks when dev_pagemap
|
||||
* pages go idle.
|
||||
*/
|
||||
void dev_pagemap_get_ops(void)
|
||||
{
|
||||
if (atomic_inc_return(&devmap_enable) == 1)
|
||||
static_branch_enable(&devmap_managed_key);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(dev_pagemap_get_ops);
|
||||
|
||||
void dev_pagemap_put_ops(void)
|
||||
{
|
||||
if (atomic_dec_and_test(&devmap_enable))
|
||||
static_branch_disable(&devmap_managed_key);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(dev_pagemap_put_ops);
|
||||
|
||||
void __put_devmap_managed_page(struct page *page)
|
||||
{
|
||||
int count = page_ref_dec_return(page);
|
||||
|
@ -366,7 +408,7 @@ void __put_devmap_managed_page(struct page *page)
|
|||
|
||||
mem_cgroup_uncharge(page);
|
||||
|
||||
page->pgmap->page_free(page, page->pgmap->data);
|
||||
page->pgmap->ops->page_free(page);
|
||||
} else if (!count)
|
||||
__put_page(page);
|
||||
}
|
||||
|
|
|
@ -1628,6 +1628,45 @@ void resource_list_free(struct list_head *head)
|
|||
}
|
||||
EXPORT_SYMBOL(resource_list_free);
|
||||
|
||||
#ifdef CONFIG_DEVICE_PRIVATE
|
||||
/**
|
||||
* devm_request_free_mem_region - find free region for device private memory
|
||||
*
|
||||
* @dev: device struct to bind the resource to
|
||||
* @size: size in bytes of the device memory to add
|
||||
* @base: resource tree to look in
|
||||
*
|
||||
* This function tries to find an empty range of physical address big enough to
|
||||
* contain the new resource, so that it can later be hotplugged as ZONE_DEVICE
|
||||
* memory, which in turn allocates struct pages.
|
||||
*/
|
||||
struct resource *devm_request_free_mem_region(struct device *dev,
|
||||
struct resource *base, unsigned long size)
|
||||
{
|
||||
resource_size_t end, addr;
|
||||
struct resource *res;
|
||||
|
||||
size = ALIGN(size, 1UL << PA_SECTION_SHIFT);
|
||||
end = min_t(unsigned long, base->end, (1UL << MAX_PHYSMEM_BITS) - 1);
|
||||
addr = end - size + 1UL;
|
||||
|
||||
for (; addr > size && addr >= base->start; addr -= size) {
|
||||
if (region_intersects(addr, size, 0, IORES_DESC_NONE) !=
|
||||
REGION_DISJOINT)
|
||||
continue;
|
||||
|
||||
res = devm_request_mem_region(dev, addr, size, dev_name(dev));
|
||||
if (!res)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
res->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY;
|
||||
return res;
|
||||
}
|
||||
|
||||
return ERR_PTR(-ERANGE);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(devm_request_free_mem_region);
|
||||
#endif /* CONFIG_DEVICE_PRIVATE */
|
||||
|
||||
static int __init strict_iomem(char *str)
|
||||
{
|
||||
if (strstr(str, "relaxed"))
|
||||
|
|
50
mm/Kconfig
50
mm/Kconfig
|
@ -669,47 +669,17 @@ config ZONE_DEVICE
|
|||
|
||||
If FS_DAX is enabled, then say Y.
|
||||
|
||||
config ARCH_HAS_HMM_MIRROR
|
||||
bool
|
||||
default y
|
||||
depends on (X86_64 || PPC64)
|
||||
depends on MMU && 64BIT
|
||||
|
||||
config ARCH_HAS_HMM_DEVICE
|
||||
bool
|
||||
default y
|
||||
depends on (X86_64 || PPC64)
|
||||
depends on MEMORY_HOTPLUG
|
||||
depends on MEMORY_HOTREMOVE
|
||||
depends on SPARSEMEM_VMEMMAP
|
||||
depends on ARCH_HAS_ZONE_DEVICE
|
||||
select XARRAY_MULTI
|
||||
|
||||
config ARCH_HAS_HMM
|
||||
bool
|
||||
default y
|
||||
depends on (X86_64 || PPC64)
|
||||
depends on ZONE_DEVICE
|
||||
depends on MMU && 64BIT
|
||||
depends on MEMORY_HOTPLUG
|
||||
depends on MEMORY_HOTREMOVE
|
||||
depends on SPARSEMEM_VMEMMAP
|
||||
|
||||
config MIGRATE_VMA_HELPER
|
||||
bool
|
||||
|
||||
config DEV_PAGEMAP_OPS
|
||||
bool
|
||||
|
||||
config HMM
|
||||
bool
|
||||
select MMU_NOTIFIER
|
||||
select MIGRATE_VMA_HELPER
|
||||
|
||||
config HMM_MIRROR
|
||||
bool "HMM mirror CPU page table into a device page table"
|
||||
depends on ARCH_HAS_HMM
|
||||
select HMM
|
||||
depends on (X86_64 || PPC64)
|
||||
depends on MMU && 64BIT
|
||||
select MMU_NOTIFIER
|
||||
help
|
||||
Select HMM_MIRROR if you want to mirror range of the CPU page table of a
|
||||
process into a device page table. Here, mirror means "keep synchronized".
|
||||
|
@ -719,8 +689,7 @@ config HMM_MIRROR
|
|||
|
||||
config DEVICE_PRIVATE
|
||||
bool "Unaddressable device memory (GPU memory, ...)"
|
||||
depends on ARCH_HAS_HMM
|
||||
select HMM
|
||||
depends on ZONE_DEVICE
|
||||
select DEV_PAGEMAP_OPS
|
||||
|
||||
help
|
||||
|
@ -728,17 +697,6 @@ config DEVICE_PRIVATE
|
|||
memory; i.e., memory that is only accessible from the device (or
|
||||
group of devices). You likely also want to select HMM_MIRROR.
|
||||
|
||||
config DEVICE_PUBLIC
|
||||
bool "Addressable device memory (like GPU memory)"
|
||||
depends on ARCH_HAS_HMM
|
||||
select HMM
|
||||
select DEV_PAGEMAP_OPS
|
||||
|
||||
help
|
||||
Allows creation of struct pages to represent addressable device
|
||||
memory; i.e., memory that is accessible from both the device and
|
||||
the CPU
|
||||
|
||||
config FRAME_VECTOR
|
||||
bool
|
||||
|
||||
|
|
|
@ -102,5 +102,5 @@ obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o
|
|||
obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o
|
||||
obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o
|
||||
obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o
|
||||
obj-$(CONFIG_HMM) += hmm.o
|
||||
obj-$(CONFIG_HMM_MIRROR) += hmm.o
|
||||
obj-$(CONFIG_MEMFD_CREATE) += memfd.o
|
||||
|
|
7
mm/gup.c
7
mm/gup.c
|
@ -605,13 +605,6 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
|
|||
if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte)))
|
||||
goto unmap;
|
||||
*page = pte_page(*pte);
|
||||
|
||||
/*
|
||||
* This should never happen (a device public page in the gate
|
||||
* area).
|
||||
*/
|
||||
if (is_device_public_page(*page))
|
||||
goto unmap;
|
||||
}
|
||||
if (unlikely(!try_get_page(*page))) {
|
||||
ret = -ENOMEM;
|
||||
|
|
284
mm/hmm.c
284
mm/hmm.c
|
@ -26,9 +26,6 @@
|
|||
#include <linux/mmu_notifier.h>
|
||||
#include <linux/memory_hotplug.h>
|
||||
|
||||
#define PA_SECTION_SIZE (1UL << PA_SECTION_SHIFT)
|
||||
|
||||
#if IS_ENABLED(CONFIG_HMM_MIRROR)
|
||||
static const struct mmu_notifier_ops hmm_mmu_notifier_ops;
|
||||
|
||||
/**
|
||||
|
@ -1287,284 +1284,3 @@ long hmm_range_dma_unmap(struct hmm_range *range,
|
|||
return cpages;
|
||||
}
|
||||
EXPORT_SYMBOL(hmm_range_dma_unmap);
|
||||
#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
|
||||
|
||||
|
||||
#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
|
||||
struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma,
|
||||
unsigned long addr)
|
||||
{
|
||||
struct page *page;
|
||||
|
||||
page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
|
||||
if (!page)
|
||||
return NULL;
|
||||
lock_page(page);
|
||||
return page;
|
||||
}
|
||||
EXPORT_SYMBOL(hmm_vma_alloc_locked_page);
|
||||
|
||||
|
||||
static void hmm_devmem_ref_release(struct percpu_ref *ref)
|
||||
{
|
||||
struct hmm_devmem *devmem;
|
||||
|
||||
devmem = container_of(ref, struct hmm_devmem, ref);
|
||||
complete(&devmem->completion);
|
||||
}
|
||||
|
||||
static void hmm_devmem_ref_exit(struct percpu_ref *ref)
|
||||
{
|
||||
struct hmm_devmem *devmem;
|
||||
|
||||
devmem = container_of(ref, struct hmm_devmem, ref);
|
||||
wait_for_completion(&devmem->completion);
|
||||
percpu_ref_exit(ref);
|
||||
}
|
||||
|
||||
static void hmm_devmem_ref_kill(struct percpu_ref *ref)
|
||||
{
|
||||
percpu_ref_kill(ref);
|
||||
}
|
||||
|
||||
static vm_fault_t hmm_devmem_fault(struct vm_area_struct *vma,
|
||||
unsigned long addr,
|
||||
const struct page *page,
|
||||
unsigned int flags,
|
||||
pmd_t *pmdp)
|
||||
{
|
||||
struct hmm_devmem *devmem = page->pgmap->data;
|
||||
|
||||
return devmem->ops->fault(devmem, vma, addr, page, flags, pmdp);
|
||||
}
|
||||
|
||||
static void hmm_devmem_free(struct page *page, void *data)
|
||||
{
|
||||
struct hmm_devmem *devmem = data;
|
||||
|
||||
page->mapping = NULL;
|
||||
|
||||
devmem->ops->free(devmem, page);
|
||||
}
|
||||
|
||||
/*
|
||||
* hmm_devmem_add() - hotplug ZONE_DEVICE memory for device memory
|
||||
*
|
||||
* @ops: memory event device driver callback (see struct hmm_devmem_ops)
|
||||
* @device: device struct to bind the resource to
|
||||
* @size: size in bytes of the device memory to add
|
||||
* Return: pointer to new hmm_devmem struct ERR_PTR otherwise
|
||||
*
|
||||
* This function first finds an empty range of physical address big enough to
|
||||
* contain the new resource, and then hotplugs it as ZONE_DEVICE memory, which
|
||||
* in turn allocates struct pages. It does not do anything beyond that; all
|
||||
* events affecting the memory will go through the various callbacks provided
|
||||
* by hmm_devmem_ops struct.
|
||||
*
|
||||
* Device driver should call this function during device initialization and
|
||||
* is then responsible of memory management. HMM only provides helpers.
|
||||
*/
|
||||
struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
|
||||
struct device *device,
|
||||
unsigned long size)
|
||||
{
|
||||
struct hmm_devmem *devmem;
|
||||
resource_size_t addr;
|
||||
void *result;
|
||||
int ret;
|
||||
|
||||
dev_pagemap_get_ops();
|
||||
|
||||
devmem = devm_kzalloc(device, sizeof(*devmem), GFP_KERNEL);
|
||||
if (!devmem)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
init_completion(&devmem->completion);
|
||||
devmem->pfn_first = -1UL;
|
||||
devmem->pfn_last = -1UL;
|
||||
devmem->resource = NULL;
|
||||
devmem->device = device;
|
||||
devmem->ops = ops;
|
||||
|
||||
ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
|
||||
0, GFP_KERNEL);
|
||||
if (ret)
|
||||
return ERR_PTR(ret);
|
||||
|
||||
size = ALIGN(size, PA_SECTION_SIZE);
|
||||
addr = min((unsigned long)iomem_resource.end,
|
||||
(1UL << MAX_PHYSMEM_BITS) - 1);
|
||||
addr = addr - size + 1UL;
|
||||
|
||||
/*
|
||||
* FIXME add a new helper to quickly walk resource tree and find free
|
||||
* range
|
||||
*
|
||||
* FIXME what about ioport_resource resource ?
|
||||
*/
|
||||
for (; addr > size && addr >= iomem_resource.start; addr -= size) {
|
||||
ret = region_intersects(addr, size, 0, IORES_DESC_NONE);
|
||||
if (ret != REGION_DISJOINT)
|
||||
continue;
|
||||
|
||||
devmem->resource = devm_request_mem_region(device, addr, size,
|
||||
dev_name(device));
|
||||
if (!devmem->resource)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
break;
|
||||
}
|
||||
if (!devmem->resource)
|
||||
return ERR_PTR(-ERANGE);
|
||||
|
||||
devmem->resource->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY;
|
||||
devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
|
||||
devmem->pfn_last = devmem->pfn_first +
|
||||
(resource_size(devmem->resource) >> PAGE_SHIFT);
|
||||
devmem->page_fault = hmm_devmem_fault;
|
||||
|
||||
devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
|
||||
devmem->pagemap.res = *devmem->resource;
|
||||
devmem->pagemap.page_free = hmm_devmem_free;
|
||||
devmem->pagemap.altmap_valid = false;
|
||||
devmem->pagemap.ref = &devmem->ref;
|
||||
devmem->pagemap.data = devmem;
|
||||
devmem->pagemap.kill = hmm_devmem_ref_kill;
|
||||
devmem->pagemap.cleanup = hmm_devmem_ref_exit;
|
||||
|
||||
result = devm_memremap_pages(devmem->device, &devmem->pagemap);
|
||||
if (IS_ERR(result))
|
||||
return result;
|
||||
return devmem;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(hmm_devmem_add);
|
||||
|
||||
struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
|
||||
struct device *device,
|
||||
struct resource *res)
|
||||
{
|
||||
struct hmm_devmem *devmem;
|
||||
void *result;
|
||||
int ret;
|
||||
|
||||
if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY)
|
||||
return ERR_PTR(-EINVAL);
|
||||
|
||||
dev_pagemap_get_ops();
|
||||
|
||||
devmem = devm_kzalloc(device, sizeof(*devmem), GFP_KERNEL);
|
||||
if (!devmem)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
init_completion(&devmem->completion);
|
||||
devmem->pfn_first = -1UL;
|
||||
devmem->pfn_last = -1UL;
|
||||
devmem->resource = res;
|
||||
devmem->device = device;
|
||||
devmem->ops = ops;
|
||||
|
||||
ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
|
||||
0, GFP_KERNEL);
|
||||
if (ret)
|
||||
return ERR_PTR(ret);
|
||||
|
||||
devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
|
||||
devmem->pfn_last = devmem->pfn_first +
|
||||
(resource_size(devmem->resource) >> PAGE_SHIFT);
|
||||
devmem->page_fault = hmm_devmem_fault;
|
||||
|
||||
devmem->pagemap.type = MEMORY_DEVICE_PUBLIC;
|
||||
devmem->pagemap.res = *devmem->resource;
|
||||
devmem->pagemap.page_free = hmm_devmem_free;
|
||||
devmem->pagemap.altmap_valid = false;
|
||||
devmem->pagemap.ref = &devmem->ref;
|
||||
devmem->pagemap.data = devmem;
|
||||
devmem->pagemap.kill = hmm_devmem_ref_kill;
|
||||
devmem->pagemap.cleanup = hmm_devmem_ref_exit;
|
||||
|
||||
result = devm_memremap_pages(devmem->device, &devmem->pagemap);
|
||||
if (IS_ERR(result))
|
||||
return result;
|
||||
return devmem;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(hmm_devmem_add_resource);
|
||||
|
||||
/*
|
||||
* A device driver that wants to handle multiple devices memory through a
|
||||
* single fake device can use hmm_device to do so. This is purely a helper
|
||||
* and it is not needed to make use of any HMM functionality.
|
||||
*/
|
||||
#define HMM_DEVICE_MAX 256
|
||||
|
||||
static DECLARE_BITMAP(hmm_device_mask, HMM_DEVICE_MAX);
|
||||
static DEFINE_SPINLOCK(hmm_device_lock);
|
||||
static struct class *hmm_device_class;
|
||||
static dev_t hmm_device_devt;
|
||||
|
||||
static void hmm_device_release(struct device *device)
|
||||
{
|
||||
struct hmm_device *hmm_device;
|
||||
|
||||
hmm_device = container_of(device, struct hmm_device, device);
|
||||
spin_lock(&hmm_device_lock);
|
||||
clear_bit(hmm_device->minor, hmm_device_mask);
|
||||
spin_unlock(&hmm_device_lock);
|
||||
|
||||
kfree(hmm_device);
|
||||
}
|
||||
|
||||
struct hmm_device *hmm_device_new(void *drvdata)
|
||||
{
|
||||
struct hmm_device *hmm_device;
|
||||
|
||||
hmm_device = kzalloc(sizeof(*hmm_device), GFP_KERNEL);
|
||||
if (!hmm_device)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
spin_lock(&hmm_device_lock);
|
||||
hmm_device->minor = find_first_zero_bit(hmm_device_mask, HMM_DEVICE_MAX);
|
||||
if (hmm_device->minor >= HMM_DEVICE_MAX) {
|
||||
spin_unlock(&hmm_device_lock);
|
||||
kfree(hmm_device);
|
||||
return ERR_PTR(-EBUSY);
|
||||
}
|
||||
set_bit(hmm_device->minor, hmm_device_mask);
|
||||
spin_unlock(&hmm_device_lock);
|
||||
|
||||
dev_set_name(&hmm_device->device, "hmm_device%d", hmm_device->minor);
|
||||
hmm_device->device.devt = MKDEV(MAJOR(hmm_device_devt),
|
||||
hmm_device->minor);
|
||||
hmm_device->device.release = hmm_device_release;
|
||||
dev_set_drvdata(&hmm_device->device, drvdata);
|
||||
hmm_device->device.class = hmm_device_class;
|
||||
device_initialize(&hmm_device->device);
|
||||
|
||||
return hmm_device;
|
||||
}
|
||||
EXPORT_SYMBOL(hmm_device_new);
|
||||
|
||||
void hmm_device_put(struct hmm_device *hmm_device)
|
||||
{
|
||||
put_device(&hmm_device->device);
|
||||
}
|
||||
EXPORT_SYMBOL(hmm_device_put);
|
||||
|
||||
/*
 * Initcall: reserves HMM_DEVICE_MAX char device numbers under the name
 * "hmm_device" and creates the "hmm_device" class. The device number region
 * is handed back if the class cannot be created.
 *
 * Returns 0 on success, otherwise the error from alloc_chrdev_region() or
 * class_create().
 */
static int __init hmm_init(void)
{
	int err;

	err = alloc_chrdev_region(&hmm_device_devt, 0, HMM_DEVICE_MAX,
				  "hmm_device");
	if (err)
		return err;

	hmm_device_class = class_create(THIS_MODULE, "hmm_device");
	if (!IS_ERR(hmm_device_class))
		return 0;

	/* Class creation failed: undo the region reservation. */
	unregister_chrdev_region(hmm_device_devt, HMM_DEVICE_MAX);
	return PTR_ERR(hmm_device_class);
}
|
||||
|
||||
device_initcall(hmm_init);
|
||||
#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
|
||||
|
|
|
@ -354,7 +354,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
|
|||
continue;
|
||||
}
|
||||
|
||||
page = _vm_normal_page(vma, addr, ptent, true);
|
||||
page = vm_normal_page(vma, addr, ptent);
|
||||
if (!page)
|
||||
continue;
|
||||
|
||||
|
|
|
@ -4793,7 +4793,7 @@ enum mc_target_type {
|
|||
static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
|
||||
unsigned long addr, pte_t ptent)
|
||||
{
|
||||
struct page *page = _vm_normal_page(vma, addr, ptent, true);
|
||||
struct page *page = vm_normal_page(vma, addr, ptent);
|
||||
|
||||
if (!page || !page_mapped(page))
|
||||
return NULL;
|
||||
|
@ -4994,8 +4994,8 @@ out:
|
|||
* 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
|
||||
* target for charge migration. if @target is not NULL, the entry is stored
|
||||
* in target->ent.
|
||||
* 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PUBLIC
|
||||
* or MEMORY_DEVICE_PRIVATE (so ZONE_DEVICE page and thus not on the lru).
|
||||
* 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE
|
||||
* (so ZONE_DEVICE page and thus not on the lru).
|
||||
* For now such a page is charged like a regular page would be as for all
|
||||
* intent and purposes it is just special memory taking the place of a
|
||||
* regular page.
|
||||
|
@ -5029,8 +5029,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
|
|||
*/
|
||||
if (page->mem_cgroup == mc.from) {
|
||||
ret = MC_TARGET_PAGE;
|
||||
if (is_device_private_page(page) ||
|
||||
is_device_public_page(page))
|
||||
if (is_device_private_page(page))
|
||||
ret = MC_TARGET_DEVICE;
|
||||
if (target)
|
||||
target->page = page;
|
||||
|
@ -5101,8 +5100,8 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
|
|||
if (ptl) {
|
||||
/*
|
||||
* Note there can not be MC_TARGET_DEVICE for now as we do not
|
||||
* support transparent huge page with MEMORY_DEVICE_PUBLIC or
|
||||
* MEMORY_DEVICE_PRIVATE but this might change.
|
||||
* support transparent huge page with MEMORY_DEVICE_PRIVATE but
|
||||
* this might change.
|
||||
*/
|
||||
if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
|
||||
mc.precharge += HPAGE_PMD_NR;
|
||||
|
|
|
@ -1177,16 +1177,12 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
|
|||
goto unlock;
|
||||
}
|
||||
|
||||
switch (pgmap->type) {
|
||||
case MEMORY_DEVICE_PRIVATE:
|
||||
case MEMORY_DEVICE_PUBLIC:
|
||||
if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
|
||||
/*
|
||||
* TODO: Handle HMM pages which may need coordination
|
||||
* with device-side memory.
|
||||
*/
|
||||
goto unlock;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
49
mm/memory.c
49
mm/memory.c
|
@ -571,8 +571,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
|
|||
* PFNMAP mappings in order to support COWable mappings.
|
||||
*
|
||||
*/
|
||||
struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
|
||||
pte_t pte, bool with_public_device)
|
||||
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
|
||||
pte_t pte)
|
||||
{
|
||||
unsigned long pfn = pte_pfn(pte);
|
||||
|
||||
|
@ -585,29 +585,6 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
|
|||
return NULL;
|
||||
if (is_zero_pfn(pfn))
|
||||
return NULL;
|
||||
|
||||
/*
|
||||
* Device public pages are special pages (they are ZONE_DEVICE
|
||||
* pages but different from persistent memory). They behave
|
||||
* allmost like normal pages. The difference is that they are
|
||||
* not on the lru and thus should never be involve with any-
|
||||
* thing that involve lru manipulation (mlock, numa balancing,
|
||||
* ...).
|
||||
*
|
||||
* This is why we still want to return NULL for such page from
|
||||
* vm_normal_page() so that we do not have to special case all
|
||||
* call site of vm_normal_page().
|
||||
*/
|
||||
if (likely(pfn <= highest_memmap_pfn)) {
|
||||
struct page *page = pfn_to_page(pfn);
|
||||
|
||||
if (is_device_public_page(page)) {
|
||||
if (with_public_device)
|
||||
return page;
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
if (pte_devmap(pte))
|
||||
return NULL;
|
||||
|
||||
|
@ -797,17 +774,6 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
|
|||
rss[mm_counter(page)]++;
|
||||
} else if (pte_devmap(pte)) {
|
||||
page = pte_page(pte);
|
||||
|
||||
/*
|
||||
* Cache coherent device memory behave like regular page and
|
||||
* not like persistent memory page. For more informations see
|
||||
* MEMORY_DEVICE_CACHE_COHERENT in memory_hotplug.h
|
||||
*/
|
||||
if (is_device_public_page(page)) {
|
||||
get_page(page);
|
||||
page_dup_rmap(page, false);
|
||||
rss[mm_counter(page)]++;
|
||||
}
|
||||
}
|
||||
|
||||
out_set_pte:
|
||||
|
@ -1063,7 +1029,7 @@ again:
|
|||
if (pte_present(ptent)) {
|
||||
struct page *page;
|
||||
|
||||
page = _vm_normal_page(vma, addr, ptent, true);
|
||||
page = vm_normal_page(vma, addr, ptent);
|
||||
if (unlikely(details) && page) {
|
||||
/*
|
||||
* unmap_shared_mapping_pages() wants to
|
||||
|
@ -2782,13 +2748,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
|
|||
migration_entry_wait(vma->vm_mm, vmf->pmd,
|
||||
vmf->address);
|
||||
} else if (is_device_private_entry(entry)) {
|
||||
/*
|
||||
* For un-addressable device memory we call the pgmap
|
||||
* fault handler callback. The callback must migrate
|
||||
* the page back to some CPU accessible page.
|
||||
*/
|
||||
ret = device_private_entry_fault(vma, vmf->address, entry,
|
||||
vmf->flags, vmf->pmd);
|
||||
vmf->page = device_private_entry_to_page(entry);
|
||||
ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
|
||||
} else if (is_hwpoison_entry(entry)) {
|
||||
ret = VM_FAULT_HWPOISON;
|
||||
} else {
|
||||
|
|
|
@ -557,10 +557,8 @@ void __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
|
|||
int sections_to_remove;
|
||||
|
||||
/* In the ZONE_DEVICE case device driver owns the memory region */
|
||||
if (is_dev_zone(zone)) {
|
||||
if (altmap)
|
||||
map_offset = vmem_altmap_offset(altmap);
|
||||
}
|
||||
if (is_dev_zone(zone))
|
||||
map_offset = vmem_altmap_offset(altmap);
|
||||
|
||||
clear_zone_contiguous(zone);
|
||||
|
||||
|
|
|
@ -2098,6 +2098,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
|
|||
out:
|
||||
return page;
|
||||
}
|
||||
EXPORT_SYMBOL(alloc_pages_vma);
|
||||
|
||||
/**
|
||||
* alloc_pages_current - Allocate pages.
|
||||
|
|
28
mm/migrate.c
28
mm/migrate.c
|
@ -246,8 +246,6 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
|
|||
if (is_device_private_page(new)) {
|
||||
entry = make_device_private_entry(new, pte_write(pte));
|
||||
pte = swp_entry_to_pte(entry);
|
||||
} else if (is_device_public_page(new)) {
|
||||
pte = pte_mkdevmap(pte);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -381,7 +379,6 @@ static int expected_page_refs(struct address_space *mapping, struct page *page)
|
|||
* ZONE_DEVICE pages.
|
||||
*/
|
||||
expected_count += is_device_private_page(page);
|
||||
expected_count += is_device_public_page(page);
|
||||
if (mapping)
|
||||
expected_count += hpage_nr_pages(page) + page_has_private(page);
|
||||
|
||||
|
@ -994,10 +991,7 @@ static int move_to_new_page(struct page *newpage, struct page *page,
|
|||
if (!PageMappingFlags(page))
|
||||
page->mapping = NULL;
|
||||
|
||||
if (unlikely(is_zone_device_page(newpage))) {
|
||||
if (is_device_public_page(newpage))
|
||||
flush_dcache_page(newpage);
|
||||
} else
|
||||
if (likely(!is_zone_device_page(newpage)))
|
||||
flush_dcache_page(newpage);
|
||||
|
||||
}
|
||||
|
@ -2265,7 +2259,7 @@ again:
|
|||
pfn = 0;
|
||||
goto next;
|
||||
}
|
||||
page = _vm_normal_page(migrate->vma, addr, pte, true);
|
||||
page = vm_normal_page(migrate->vma, addr, pte);
|
||||
mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
|
||||
mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
|
||||
}
|
||||
|
@ -2406,16 +2400,7 @@ static bool migrate_vma_check_page(struct page *page)
|
|||
* FIXME proper solution is to rework migration_entry_wait() so
|
||||
* it does not need to take a reference on page.
|
||||
*/
|
||||
if (is_device_private_page(page))
|
||||
return true;
|
||||
|
||||
/*
|
||||
* Only allow device public page to be migrated and account for
|
||||
* the extra reference count imply by ZONE_DEVICE pages.
|
||||
*/
|
||||
if (!is_device_public_page(page))
|
||||
return false;
|
||||
extra++;
|
||||
return is_device_private_page(page);
|
||||
}
|
||||
|
||||
/* For file back page */
|
||||
|
@ -2665,11 +2650,6 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
|
|||
|
||||
swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE);
|
||||
entry = swp_entry_to_pte(swp_entry);
|
||||
} else if (is_device_public_page(page)) {
|
||||
entry = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
|
||||
if (vma->vm_flags & VM_WRITE)
|
||||
entry = pte_mkwrite(pte_mkdirty(entry));
|
||||
entry = pte_mkdevmap(entry);
|
||||
}
|
||||
} else {
|
||||
entry = mk_pte(page, vma->vm_page_prot);
|
||||
|
@ -2789,7 +2769,7 @@ static void migrate_vma_pages(struct migrate_vma *migrate)
|
|||
migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
|
||||
continue;
|
||||
}
|
||||
} else if (!is_device_public_page(newpage)) {
|
||||
} else {
|
||||
/*
|
||||
* Other types of ZONE_DEVICE page are not
|
||||
* supported.
|
||||
|
|
|
@ -5853,6 +5853,7 @@ void __ref memmap_init_zone_device(struct zone *zone,
|
|||
{
|
||||
unsigned long pfn, end_pfn = start_pfn + size;
|
||||
struct pglist_data *pgdat = zone->zone_pgdat;
|
||||
struct vmem_altmap *altmap = pgmap_altmap(pgmap);
|
||||
unsigned long zone_idx = zone_idx(zone);
|
||||
unsigned long start = jiffies;
|
||||
int nid = pgdat->node_id;
|
||||
|
@ -5865,9 +5866,7 @@ void __ref memmap_init_zone_device(struct zone *zone,
|
|||
* of the pages reserved for the memmap, so we can just jump to
|
||||
* the end of that region and start processing the device pages.
|
||||
*/
|
||||
if (pgmap->altmap_valid) {
|
||||
struct vmem_altmap *altmap = &pgmap->altmap;
|
||||
|
||||
if (altmap) {
|
||||
start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
|
||||
size = end_pfn - start_pfn;
|
||||
}
|
||||
|
@ -5887,12 +5886,12 @@ void __ref memmap_init_zone_device(struct zone *zone,
|
|||
__SetPageReserved(page);
|
||||
|
||||
/*
|
||||
* ZONE_DEVICE pages union ->lru with a ->pgmap back
|
||||
* pointer and hmm_data. It is a bug if a ZONE_DEVICE
|
||||
* page is ever freed or placed on a driver-private list.
|
||||
* ZONE_DEVICE pages union ->lru with a ->pgmap back pointer
|
||||
* and zone_device_data. It is a bug if a ZONE_DEVICE page is
|
||||
* ever freed or placed on a driver-private list.
|
||||
*/
|
||||
page->pgmap = pgmap;
|
||||
page->hmm_data = 0;
|
||||
page->zone_device_data = NULL;
|
||||
|
||||
/*
|
||||
* Mark the block movable so that blocks are reserved for
|
||||
|
|
|
@ -100,25 +100,60 @@ static void nfit_test_kill(void *_pgmap)
|
|||
{
|
||||
struct dev_pagemap *pgmap = _pgmap;
|
||||
|
||||
WARN_ON(!pgmap || !pgmap->ref || !pgmap->kill || !pgmap->cleanup);
|
||||
pgmap->kill(pgmap->ref);
|
||||
pgmap->cleanup(pgmap->ref);
|
||||
WARN_ON(!pgmap || !pgmap->ref);
|
||||
|
||||
if (pgmap->ops && pgmap->ops->kill)
|
||||
pgmap->ops->kill(pgmap);
|
||||
else
|
||||
percpu_ref_kill(pgmap->ref);
|
||||
|
||||
if (pgmap->ops && pgmap->ops->cleanup) {
|
||||
pgmap->ops->cleanup(pgmap);
|
||||
} else {
|
||||
wait_for_completion(&pgmap->done);
|
||||
percpu_ref_exit(pgmap->ref);
|
||||
}
|
||||
}
|
||||
|
||||
static void dev_pagemap_percpu_release(struct percpu_ref *ref)
|
||||
{
|
||||
struct dev_pagemap *pgmap =
|
||||
container_of(ref, struct dev_pagemap, internal_ref);
|
||||
|
||||
complete(&pgmap->done);
|
||||
}
|
||||
|
||||
void *__wrap_devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
|
||||
{
|
||||
int error;
|
||||
resource_size_t offset = pgmap->res.start;
|
||||
struct nfit_test_resource *nfit_res = get_nfit_res(offset);
|
||||
|
||||
if (nfit_res) {
|
||||
int rc;
|
||||
if (!nfit_res)
|
||||
return devm_memremap_pages(dev, pgmap);
|
||||
|
||||
rc = devm_add_action_or_reset(dev, nfit_test_kill, pgmap);
|
||||
if (rc)
|
||||
return ERR_PTR(rc);
|
||||
return nfit_res->buf + offset - nfit_res->res.start;
|
||||
pgmap->dev = dev;
|
||||
if (!pgmap->ref) {
|
||||
if (pgmap->ops && (pgmap->ops->kill || pgmap->ops->cleanup))
|
||||
return ERR_PTR(-EINVAL);
|
||||
|
||||
init_completion(&pgmap->done);
|
||||
error = percpu_ref_init(&pgmap->internal_ref,
|
||||
dev_pagemap_percpu_release, 0, GFP_KERNEL);
|
||||
if (error)
|
||||
return ERR_PTR(error);
|
||||
pgmap->ref = &pgmap->internal_ref;
|
||||
} else {
|
||||
if (!pgmap->ops || !pgmap->ops->kill || !pgmap->ops->cleanup) {
|
||||
WARN(1, "Missing reference count teardown definition\n");
|
||||
return ERR_PTR(-EINVAL);
|
||||
}
|
||||
}
|
||||
return devm_memremap_pages(dev, pgmap);
|
||||
|
||||
error = devm_add_action_or_reset(dev, nfit_test_kill, pgmap);
|
||||
if (error)
|
||||
return ERR_PTR(error);
|
||||
return nfit_res->buf + offset - nfit_res->res.start;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__wrap_devm_memremap_pages);
|
||||
|
||||
|
|
Loading…
Reference in New Issue