x86/NUMA: Provide a range-to-target_node lookup facility
The DEV_DAX_KMEM facility is a generic mechanism to allow device-dax instances, fronting performance-differentiated-memory like pmem, to be added to the System RAM pool. The NUMA node for that hot-added memory is derived from the device-dax instance's 'target_node' attribute. Recall that the 'target_node' is the ACPI-PXM-to-node translation for memory when it comes online whereas the 'numa_node' attribute of the device represents the closest online CPU node. Presently, useful target_node information from the ACPI SRAT is discarded with the expectation that "Reserved" memory will never be onlined. Now that DEV_DAX_KMEM violates that assumption, there is a need to retain the translation. Move, rather than discard, numa_memblk data to a secondary array that phys_to_target_node() may consider at a later point in time. Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: Andy Lutomirski <luto@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Borislav Petkov <bp@alien8.de> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: <x86@kernel.org> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: David Hildenbrand <david@redhat.com> Cc: Michal Hocko <mhocko@suse.com> Reported-by: kbuild test robot <lkp@intel.com> Reviewed-by: Ingo Molnar <mingo@kernel.org> Signed-off-by: Dan Williams <dan.j.williams@intel.com> Reviewed-by: Thomas Gleixner <tglx@linutronix.de> Link: https://lore.kernel.org/r/158188326978.894464.217282995221175417.stgit@dwillia2-desk3.amr.corp.intel.com
This commit is contained in:
parent
1e5d8e1e47
commit
5d30f92e76
|
@ -26,6 +26,7 @@ struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
|
|||
EXPORT_SYMBOL(node_data);
|
||||
|
||||
static struct numa_meminfo numa_meminfo __initdata_or_meminfo;
|
||||
static struct numa_meminfo numa_reserved_meminfo __initdata_or_meminfo;
|
||||
|
||||
static int numa_distance_cnt;
|
||||
static u8 *numa_distance;
|
||||
|
@ -164,6 +165,19 @@ void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
|
|||
(mi->nr_blks - idx) * sizeof(mi->blk[0]));
|
||||
}
|
||||
|
||||
/**
|
||||
* numa_move_tail_memblk - Move a numa_memblk from one numa_meminfo to another
|
||||
* @dst: numa_meminfo to append block to
|
||||
* @idx: Index of memblk to remove
|
||||
* @src: numa_meminfo to remove memblk from
|
||||
*/
|
||||
static void __init numa_move_tail_memblk(struct numa_meminfo *dst, int idx,
|
||||
struct numa_meminfo *src)
|
||||
{
|
||||
/*
 * Append a copy of src->blk[idx] at the tail of dst and bump dst's count.
 * NOTE(review): no capacity check on dst->blk is visible here - confirm
 * that callers cannot push dst past the size of its blk[] array.
 */
dst->blk[dst->nr_blks++] = src->blk[idx];
|
||||
/*
 * Drop the entry from src only after it has been copied; presumably
 * numa_remove_memblk_from() overwrites slot idx - verify against its body.
 */
numa_remove_memblk_from(idx, src);
|
||||
}
|
||||
|
||||
/**
|
||||
* numa_add_memblk - Add one numa_memblk to numa_meminfo
|
||||
* @nid: NUMA node ID of the new memblk
|
||||
|
@ -233,14 +247,19 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
|
|||
for (i = 0; i < mi->nr_blks; i++) {
|
||||
struct numa_memblk *bi = &mi->blk[i];
|
||||
|
||||
/* make sure all blocks are inside the limits */
|
||||
/* move / save reserved memory ranges */
|
||||
if (!memblock_overlaps_region(&memblock.memory,
|
||||
bi->start, bi->end - bi->start)) {
|
||||
numa_move_tail_memblk(&numa_reserved_meminfo, i--, mi);
|
||||
continue;
|
||||
}
|
||||
|
||||
/* make sure all non-reserved blocks are inside the limits */
|
||||
bi->start = max(bi->start, low);
|
||||
bi->end = min(bi->end, high);
|
||||
|
||||
/* and there's no empty or non-exist block */
|
||||
if (bi->start >= bi->end ||
|
||||
!memblock_overlaps_region(&memblock.memory,
|
||||
bi->start, bi->end - bi->start))
|
||||
/* and there's no empty block */
|
||||
if (bi->start >= bi->end)
|
||||
numa_remove_memblk_from(i--, mi);
|
||||
}
|
||||
|
||||
|
@ -877,16 +896,38 @@ EXPORT_SYMBOL(cpumask_of_node);
|
|||
|
||||
#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
|
||||
|
||||
#ifdef CONFIG_MEMORY_HOTPLUG
|
||||
int memory_add_physaddr_to_nid(u64 start)
|
||||
#ifdef CONFIG_NUMA_KEEP_MEMINFO
|
||||
static int meminfo_to_nid(struct numa_meminfo *mi, u64 start)
|
||||
{
|
||||
struct numa_meminfo *mi = &numa_meminfo;
|
||||
int nid = mi->blk[0].nid;
|
||||
int i;
|
||||
|
||||
for (i = 0; i < mi->nr_blks; i++)
|
||||
if (mi->blk[i].start <= start && mi->blk[i].end > start)
|
||||
nid = mi->blk[i].nid;
|
||||
return mi->blk[i].nid;
|
||||
return NUMA_NO_NODE;
|
||||
}
|
||||
|
||||
/*
 * phys_to_target_node - look up the node id for a physical address
 * @start: physical address to look up
 *
 * Consults numa_meminfo first; if that lookup yields NUMA_NO_NODE, the
 * result of the same lookup against numa_reserved_meminfo is returned
 * instead (which may itself be whatever meminfo_to_nid() reports on a miss).
 */
int phys_to_target_node(phys_addr_t start)
|
||||
{
|
||||
int nid = meminfo_to_nid(&numa_meminfo, start);
|
||||
|
||||
/*
|
||||
* Prefer online nodes, but if reserved memory might be
|
||||
* hot-added continue the search with reserved ranges.
|
||||
*/
|
||||
if (nid != NUMA_NO_NODE)
|
||||
return nid;
|
||||
|
||||
return meminfo_to_nid(&numa_reserved_meminfo, start);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(phys_to_target_node);
|
||||
|
||||
/*
 * memory_add_physaddr_to_nid - pick a node id for a physical address
 * @start: physical address to look up
 *
 * Looks @start up in numa_meminfo only (the reserved table is not
 * consulted here). When meminfo_to_nid() reports NUMA_NO_NODE, the nid of
 * the first numa_meminfo block is returned as a fallback.
 *
 * NOTE(review): the fallback reads numa_meminfo.blk[0] without checking
 * nr_blks - confirm the table is never empty when this is called.
 */
int memory_add_physaddr_to_nid(u64 start)
|
||||
{
|
||||
int nid = meminfo_to_nid(&numa_meminfo, start);
|
||||
|
||||
if (nid == NUMA_NO_NODE)
|
||||
nid = numa_meminfo.blk[0].nid;
|
||||
return nid;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _LINUX_NUMA_H
|
||||
#define _LINUX_NUMA_H
|
||||
|
||||
#include <linux/types.h>
|
||||
|
||||
#ifdef CONFIG_NODES_SHIFT
|
||||
#define NODES_SHIFT CONFIG_NODES_SHIFT
|
||||
|
@ -21,12 +21,24 @@
|
|||
#endif
|
||||
|
||||
#ifdef CONFIG_NUMA
|
||||
/* Generic implementation available */
|
||||
int numa_map_to_online_node(int node);
|
||||
|
||||
/*
|
||||
* Optional architecture specific implementation, users need a "depends
|
||||
* on $ARCH"
|
||||
*/
|
||||
int phys_to_target_node(phys_addr_t addr);
|
||||
#else
|
||||
/* Stub used when CONFIG_NUMA is not defined: always returns NUMA_NO_NODE. */
static inline int numa_map_to_online_node(int node)
|
||||
{
|
||||
return NUMA_NO_NODE;
|
||||
}
|
||||
|
||||
/* Stub used when CONFIG_NUMA is not defined: @addr is ignored, NUMA_NO_NODE is returned. */
static inline int phys_to_target_node(phys_addr_t addr)
|
||||
{
|
||||
return NUMA_NO_NODE;
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* _LINUX_NUMA_H */
|
||||
|
|
Loading…
Reference in New Issue