Merge branch 'akpm' (patches from Andrew)
Merge more updates from Andrew Morton:
 "The remainder of the main mm/ queue.

  143 patches.

  Subsystems affected by this patch series (all mm): pagecache, hugetlb,
  userfaultfd, vmscan, compaction, migration, cma, ksm, vmstat, mmap,
  kconfig, util, memory-hotplug, zswap, zsmalloc, highmem, cleanups, and
  kfence"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (143 commits)
  kfence: use power-efficient work queue to run delayed work
  kfence: maximize allocation wait timeout duration
  kfence: await for allocation using wait_event
  kfence: zero guard page after out-of-bounds access
  mm/process_vm_access.c: remove duplicate include
  mm/mempool: minor coding style tweaks
  mm/highmem.c: fix coding style issue
  btrfs: use memzero_page() instead of open coded kmap pattern
  iov_iter: lift memzero_page() to highmem.h
  mm/zsmalloc: use BUG_ON instead of if condition followed by BUG.
  mm/zswap.c: switch from strlcpy to strscpy
  arm64/Kconfig: introduce ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
  x86/Kconfig: introduce ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
  mm,memory_hotplug: add kernel boot option to enable memmap_on_memory
  acpi,memhotplug: enable MHP_MEMMAP_ON_MEMORY when supported
  mm,memory_hotplug: allocate memmap from the added memory range
  mm,memory_hotplug: factor out adjusting present pages into adjust_present_page_count()
  mm,memory_hotplug: relax fully spanned sections check
  drivers/base/memory: introduce memory_block_{online,offline}
  mm/memory_hotplug: remove broken locking of zone PCP structures during hot remove
  ...
This commit is contained in: commit 8404c9fbc8
@@ -0,0 +1,25 @@
What:		/sys/kernel/mm/cma/
Date:		Feb 2021
Contact:	Minchan Kim <minchan@kernel.org>
Description:
		/sys/kernel/mm/cma/ contains a subdirectory for each CMA
		heap name (also sometimes called CMA areas).

		Each CMA heap subdirectory (that is, each
		/sys/kernel/mm/cma/<cma-heap-name> directory) contains the
		following items:

			alloc_pages_success
			alloc_pages_fail

What:		/sys/kernel/mm/cma/<cma-heap-name>/alloc_pages_success
Date:		Feb 2021
Contact:	Minchan Kim <minchan@kernel.org>
Description:
		the number of pages CMA API succeeded to allocate

What:		/sys/kernel/mm/cma/<cma-heap-name>/alloc_pages_fail
Date:		Feb 2021
Contact:	Minchan Kim <minchan@kernel.org>
Description:
		the number of pages CMA API failed to allocate
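Aside (not part of the diff): a minimal userspace sketch of how the two counters documented above might be read. The heap name "reserved" and the error handling are illustrative assumptions; real heap names depend on how the system's CMA areas were configured:

/* Read the per-heap CMA allocation counters exposed in sysfs. */
#include <stdio.h>

static long read_counter(const char *heap, const char *name)
{
	char path[256];
	long val = -1;
	FILE *f;

	snprintf(path, sizeof(path), "/sys/kernel/mm/cma/%s/%s", heap, name);
	f = fopen(path, "r");
	if (!f)
		return -1;
	if (fscanf(f, "%ld", &val) != 1)
		val = -1;
	fclose(f);
	return val;
}

int main(void)
{
	/* "reserved" is a placeholder heap name, not guaranteed to exist. */
	printf("alloc_pages_success: %ld\n",
	       read_counter("reserved", "alloc_pages_success"));
	printf("alloc_pages_fail:    %ld\n",
	       read_counter("reserved", "alloc_pages_fail"));
	return 0;
}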
@@ -2804,6 +2804,23 @@
			seconds. Use this parameter to check at some
			other rate. 0 disables periodic checking.

	memory_hotplug.memmap_on_memory
			[KNL,X86,ARM] Boolean flag to enable this feature.
			Format: {on | off (default)}
			When enabled, runtime hotplugged memory will
			allocate its internal metadata (struct pages)
			from the hotadded memory which will allow to
			hotadd a lot of memory without requiring
			additional memory to do so.
			This feature is disabled by default because it
			has some implication on large (e.g. GB)
			allocations in some configurations (e.g. small
			memory blocks).
			The state of the flag can be read in
			/sys/module/memory_hotplug/parameters/memmap_on_memory.
			Note that even when enabled, there are a few cases where
			the feature is not effective.

	memtest=	[KNL,X86,ARM,PPC] Enable memtest
			Format: <integer>
			default : 0 <disable>
@@ -357,6 +357,15 @@ creates ZONE_MOVABLE as following.
Unfortunately, there is no information to show which memory block belongs
to ZONE_MOVABLE. This is TBD.

.. note::
   Techniques that rely on long-term pinnings of memory (especially, RDMA and
   vfio) are fundamentally problematic with ZONE_MOVABLE and, therefore, memory
   hot remove. Pinned pages cannot reside on ZONE_MOVABLE, to guarantee that
   memory can still get hot removed - be aware that pinning can fail even if
   there is plenty of free memory in ZONE_MOVABLE. In addition, using
   ZONE_MOVABLE might make page pinning more expensive, because pages have to be
   migrated off that zone first.

.. _memory_hotplug_how_to_offline_memory:

How to offline memory
@@ -63,36 +63,36 @@ the generic ioctl available.
The ``uffdio_api.features`` bitmask returned by the ``UFFDIO_API`` ioctl
defines what memory types are supported by the ``userfaultfd`` and what
events, except page fault notifications, may be generated.
events, except page fault notifications, may be generated:

If the kernel supports registering ``userfaultfd`` ranges on hugetlbfs
virtual memory areas, ``UFFD_FEATURE_MISSING_HUGETLBFS`` will be set in
``uffdio_api.features``. Similarly, ``UFFD_FEATURE_MISSING_SHMEM`` will be
set if the kernel supports registering ``userfaultfd`` ranges on shared
memory (covering all shmem APIs, i.e. tmpfs, ``IPCSHM``, ``/dev/zero``,
``MAP_SHARED``, ``memfd_create``, etc).
- The ``UFFD_FEATURE_EVENT_*`` flags indicate that various other events
  other than page faults are supported. These events are described in more
  detail below in the `Non-cooperative userfaultfd`_ section.

The userland application that wants to use ``userfaultfd`` with hugetlbfs
or shared memory need to set the corresponding flag in
``uffdio_api.features`` to enable those features.
- ``UFFD_FEATURE_MISSING_HUGETLBFS`` and ``UFFD_FEATURE_MISSING_SHMEM``
  indicate that the kernel supports ``UFFDIO_REGISTER_MODE_MISSING``
  registrations for hugetlbfs and shared memory (covering all shmem APIs,
  i.e. tmpfs, ``IPCSHM``, ``/dev/zero``, ``MAP_SHARED``, ``memfd_create``,
  etc) virtual memory areas, respectively.

If the userland desires to receive notifications for events other than
page faults, it has to verify that ``uffdio_api.features`` has appropriate
``UFFD_FEATURE_EVENT_*`` bits set. These events are described in more
detail below in `Non-cooperative userfaultfd`_ section.
- ``UFFD_FEATURE_MINOR_HUGETLBFS`` indicates that the kernel supports
  ``UFFDIO_REGISTER_MODE_MINOR`` registration for hugetlbfs virtual memory
  areas.

Once the ``userfaultfd`` has been enabled the ``UFFDIO_REGISTER`` ioctl should
be invoked (if present in the returned ``uffdio_api.ioctls`` bitmask) to
register a memory range in the ``userfaultfd`` by setting the
The userland application should set the feature flags it intends to use
when invoking the ``UFFDIO_API`` ioctl, to request that those features be
enabled if supported.

Once the ``userfaultfd`` API has been enabled the ``UFFDIO_REGISTER``
ioctl should be invoked (if present in the returned ``uffdio_api.ioctls``
bitmask) to register a memory range in the ``userfaultfd`` by setting the
uffdio_register structure accordingly. The ``uffdio_register.mode``
bitmask will specify to the kernel which kind of faults to track for
the range (``UFFDIO_REGISTER_MODE_MISSING`` would track missing
pages). The ``UFFDIO_REGISTER`` ioctl will return the
the range. The ``UFFDIO_REGISTER`` ioctl will return the
``uffdio_register.ioctls`` bitmask of ioctls that are suitable to resolve
userfaults on the range registered. Not all ioctls will necessarily be
supported for all memory types depending on the underlying virtual
memory backend (anonymous memory vs tmpfs vs real filebacked
mappings).
supported for all memory types (e.g. anonymous memory vs. shmem vs.
hugetlbfs), or all types of intercepted faults.

Userland can use the ``uffdio_register.ioctls`` to manage the virtual
address space in the background (to add or potentially also remove
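Aside (not part of the diff): a rough C sketch of the ``UFFDIO_API``/``UFFDIO_REGISTER`` handshake the updated text describes. This is a simplified illustration rather than code from the series; the empty feature set, the missing-mode-only registration and the error handling are assumptions:

/* Hypothetical illustration of the UFFDIO_API / UFFDIO_REGISTER handshake. */
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

static int uffd_open_and_register(void *area, size_t len)
{
	struct uffdio_api api = { .api = UFFD_API, .features = 0 };
	struct uffdio_register reg;
	int uffd;

	uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
	if (uffd < 0)
		return -1;

	/* Request only the features we intend to use (none in this sketch). */
	if (ioctl(uffd, UFFDIO_API, &api)) {
		close(uffd);
		return -1;
	}

	memset(&reg, 0, sizeof(reg));
	reg.range.start = (unsigned long)area;
	reg.range.len = len;
	reg.mode = UFFDIO_REGISTER_MODE_MISSING;	/* track missing pages */
	if (ioctl(uffd, UFFDIO_REGISTER, &reg)) {
		close(uffd);
		return -1;
	}

	/* reg.ioctls now says which resolution ioctls work on this range. */
	if (!(reg.ioctls & (1ULL << _UFFDIO_COPY)))
		fprintf(stderr, "UFFDIO_COPY not supported on this range\n");

	return uffd;
}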
@@ -100,21 +100,46 @@ memory from the ``userfaultfd`` registered range). This means a userfault
could be triggering just before userland maps in the background the
user-faulted page.

The primary ioctl to resolve userfaults is ``UFFDIO_COPY``. That
atomically copies a page into the userfault registered range and wakes
up the blocked userfaults
(unless ``uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE`` is set).
Other ioctl works similarly to ``UFFDIO_COPY``. They're atomic as in
guaranteeing that nothing can see an half copied page since it'll
keep userfaulting until the copy has finished.
Resolving Userfaults
--------------------

There are three basic ways to resolve userfaults:

- ``UFFDIO_COPY`` atomically copies some existing page contents from
  userspace.

- ``UFFDIO_ZEROPAGE`` atomically zeros the new page.

- ``UFFDIO_CONTINUE`` maps an existing, previously-populated page.

These operations are atomic in the sense that they guarantee nothing can
see a half-populated page, since readers will keep userfaulting until the
operation has finished.

By default, these wake up userfaults blocked on the range in question.
They support a ``UFFDIO_*_MODE_DONTWAKE`` ``mode`` flag, which indicates
that waking will be done separately at some later time.

Which ioctl to choose depends on the kind of page fault, and what we'd
like to do to resolve it:

- For ``UFFDIO_REGISTER_MODE_MISSING`` faults, the fault needs to be
  resolved by either providing a new page (``UFFDIO_COPY``), or mapping
  the zero page (``UFFDIO_ZEROPAGE``). By default, the kernel would map
  the zero page for a missing fault. With userfaultfd, userspace can
  decide what content to provide before the faulting thread continues.

- For ``UFFDIO_REGISTER_MODE_MINOR`` faults, there is an existing page (in
  the page cache). Userspace has the option of modifying the page's
  contents before resolving the fault. Once the contents are correct
  (modified or not), userspace asks the kernel to map the page and let the
  faulting thread continue with ``UFFDIO_CONTINUE``.

Notes:

- If you requested ``UFFDIO_REGISTER_MODE_MISSING`` when registering then
  you must provide some kind of page in your thread after reading from
  the uffd. You must provide either ``UFFDIO_COPY`` or ``UFFDIO_ZEROPAGE``.
  The normal behavior of the OS automatically providing a zero page on
  an anonymous mmaping is not in place.
- You can tell which kind of fault occurred by examining
  ``pagefault.flags`` within the ``uffd_msg``, checking for the
  ``UFFD_PAGEFAULT_FLAG_*`` flags.

- None of the page-delivering ioctls default to the range that you
  registered with. You must fill in all fields for the appropriate

@@ -122,9 +147,9 @@ Notes:

- You get the address of the access that triggered the missing page
  event out of a struct uffd_msg that you read in the thread from the
  uffd. You can supply as many pages as you want with ``UFFDIO_COPY`` or
  ``UFFDIO_ZEROPAGE``. Keep in mind that unless you used DONTWAKE then
  the first of any of those IOCTLs wakes up the faulting thread.
  uffd. You can supply as many pages as you want with these IOCTLs.
  Keep in mind that unless you used DONTWAKE then the first of any of
  those IOCTLs wakes up the faulting thread.

- Be sure to test for all errors including
  (``pollfd[0].revents & POLLERR``). This can happen, e.g. when ranges
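Aside (not part of the diff): a hedged sketch of the fault-resolution loop described above, resolving a ``UFFDIO_REGISTER_MODE_MISSING`` fault with ``UFFDIO_COPY``. The page-size handling and the single prepared source page are simplifying assumptions, not code from this series:

/*
 * Hypothetical fault-handling loop: read one uffd_msg and resolve a
 * missing fault by copying prepared contents with UFFDIO_COPY.
 */
#include <linux/userfaultfd.h>
#include <poll.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

static int handle_one_fault(int uffd, void *src_page, unsigned long page_size)
{
	struct pollfd pfd = { .fd = uffd, .events = POLLIN };
	struct uffdio_copy copy;
	struct uffd_msg msg;

	if (poll(&pfd, 1, -1) < 0 || (pfd.revents & POLLERR))
		return -1;
	if (read(uffd, &msg, sizeof(msg)) != sizeof(msg))
		return -1;
	if (msg.event != UFFD_EVENT_PAGEFAULT)
		return 0;	/* other events are ignored in this sketch */

	/* Copy prepared contents over the faulting page and wake the thread. */
	memset(&copy, 0, sizeof(copy));
	copy.dst = msg.arg.pagefault.address & ~((__u64)page_size - 1);
	copy.src = (unsigned long)src_page;
	copy.len = page_size;
	copy.mode = 0;	/* no UFFDIO_COPY_MODE_DONTWAKE: wake immediately */
	return ioctl(uffd, UFFDIO_COPY, &copy);
}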
@ -6,6 +6,7 @@
|
|||
config ARC
|
||||
def_bool y
|
||||
select ARC_TIMERS
|
||||
select ARCH_HAS_CACHE_LINE_SIZE
|
||||
select ARCH_HAS_DEBUG_VM_PGTABLE
|
||||
select ARCH_HAS_DMA_PREP_COHERENT
|
||||
select ARCH_HAS_PTE_SPECIAL
|
||||
|
@ -28,6 +29,7 @@ config ARC
|
|||
select GENERIC_SMP_IDLE_THREAD
|
||||
select HAVE_ARCH_KGDB
|
||||
select HAVE_ARCH_TRACEHOOK
|
||||
select HAVE_ARCH_TRANSPARENT_HUGEPAGE if ARC_MMU_V4
|
||||
select HAVE_DEBUG_STACKOVERFLOW
|
||||
select HAVE_DEBUG_KMEMLEAK
|
||||
select HAVE_FUTEX_CMPXCHG if FUTEX
|
||||
|
@ -48,9 +50,6 @@ config ARC
|
|||
select HAVE_ARCH_JUMP_LABEL if ISA_ARCV2 && !CPU_ENDIAN_BE32
|
||||
select SET_FS
|
||||
|
||||
config ARCH_HAS_CACHE_LINE_SIZE
|
||||
def_bool y
|
||||
|
||||
config TRACE_IRQFLAGS_SUPPORT
|
||||
def_bool y
|
||||
|
||||
|
@ -86,10 +85,6 @@ config STACKTRACE_SUPPORT
|
|||
def_bool y
|
||||
select STACKTRACE
|
||||
|
||||
config HAVE_ARCH_TRANSPARENT_HUGEPAGE
|
||||
def_bool y
|
||||
depends on ARC_MMU_V4
|
||||
|
||||
menu "ARC Architecture Configuration"
|
||||
|
||||
menu "ARC Platform/SoC/Board"
|
||||
|
|
|
@ -31,6 +31,7 @@ config ARM
|
|||
select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX
|
||||
select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT if CPU_V7
|
||||
select ARCH_SUPPORTS_ATOMIC_RMW
|
||||
select ARCH_SUPPORTS_HUGETLBFS if ARM_LPAE
|
||||
select ARCH_USE_BUILTIN_BSWAP
|
||||
select ARCH_USE_CMPXCHG_LOCKREF
|
||||
select ARCH_USE_MEMTEST
|
||||
|
@ -77,6 +78,7 @@ config ARM
|
|||
select HAVE_ARCH_SECCOMP_FILTER if AEABI && !OABI_COMPAT
|
||||
select HAVE_ARCH_THREAD_STRUCT_WHITELIST
|
||||
select HAVE_ARCH_TRACEHOOK
|
||||
select HAVE_ARCH_TRANSPARENT_HUGEPAGE if ARM_LPAE
|
||||
select HAVE_ARM_SMCCC if CPU_V7
|
||||
select HAVE_EBPF_JIT if !CPU_ENDIAN_BE32
|
||||
select HAVE_CONTEXT_TRACKING
|
||||
|
@ -1511,14 +1513,6 @@ config HW_PERF_EVENTS
|
|||
def_bool y
|
||||
depends on ARM_PMU
|
||||
|
||||
config SYS_SUPPORTS_HUGETLBFS
|
||||
def_bool y
|
||||
depends on ARM_LPAE
|
||||
|
||||
config HAVE_ARCH_TRANSPARENT_HUGEPAGE
|
||||
def_bool y
|
||||
depends on ARM_LPAE
|
||||
|
||||
config ARCH_WANT_GENERAL_HUGETLB
|
||||
def_bool y
|
||||
|
||||
|
|
|
@ -11,6 +11,12 @@ config ARM64
|
|||
select ACPI_PPTT if ACPI
|
||||
select ARCH_HAS_DEBUG_WX
|
||||
select ARCH_BINFMT_ELF_STATE
|
||||
select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION
|
||||
select ARCH_ENABLE_MEMORY_HOTPLUG
|
||||
select ARCH_ENABLE_MEMORY_HOTREMOVE
|
||||
select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2
|
||||
select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE
|
||||
select ARCH_HAS_CACHE_LINE_SIZE
|
||||
select ARCH_HAS_DEBUG_VIRTUAL
|
||||
select ARCH_HAS_DEBUG_VM_PGTABLE
|
||||
select ARCH_HAS_DMA_PREP_COHERENT
|
||||
|
@ -72,6 +78,7 @@ config ARM64
|
|||
select ARCH_USE_QUEUED_SPINLOCKS
|
||||
select ARCH_USE_SYM_ANNOTATIONS
|
||||
select ARCH_SUPPORTS_DEBUG_PAGEALLOC
|
||||
select ARCH_SUPPORTS_HUGETLBFS
|
||||
select ARCH_SUPPORTS_MEMORY_FAILURE
|
||||
select ARCH_SUPPORTS_SHADOW_CALL_STACK if CC_HAVE_SHADOW_CALL_STACK
|
||||
select ARCH_SUPPORTS_LTO_CLANG if CPU_LITTLE_ENDIAN
|
||||
|
@ -213,6 +220,7 @@ config ARM64
|
|||
select SWIOTLB
|
||||
select SYSCTL_EXCEPTION_TRACE
|
||||
select THREAD_INFO_IN_TASK
|
||||
select HAVE_ARCH_USERFAULTFD_MINOR if USERFAULTFD
|
||||
help
|
||||
ARM 64-bit (AArch64) Linux support.
|
||||
|
||||
|
@ -308,10 +316,7 @@ config ZONE_DMA32
|
|||
bool "Support DMA32 zone" if EXPERT
|
||||
default y
|
||||
|
||||
config ARCH_ENABLE_MEMORY_HOTPLUG
|
||||
def_bool y
|
||||
|
||||
config ARCH_ENABLE_MEMORY_HOTREMOVE
|
||||
config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
|
||||
def_bool y
|
||||
|
||||
config SMP
|
||||
|
@ -1070,18 +1075,9 @@ config HW_PERF_EVENTS
|
|||
def_bool y
|
||||
depends on ARM_PMU
|
||||
|
||||
config SYS_SUPPORTS_HUGETLBFS
|
||||
def_bool y
|
||||
|
||||
config ARCH_HAS_CACHE_LINE_SIZE
|
||||
def_bool y
|
||||
|
||||
config ARCH_HAS_FILTER_PGPROT
|
||||
def_bool y
|
||||
|
||||
config ARCH_ENABLE_SPLIT_PMD_PTLOCK
|
||||
def_bool y if PGTABLE_LEVELS > 2
|
||||
|
||||
# Supported by clang >= 7.0
|
||||
config CC_HAVE_SHADOW_CALL_STACK
|
||||
def_bool $(cc-option, -fsanitize=shadow-call-stack -ffixed-x18)
|
||||
|
@ -1923,14 +1919,6 @@ config SYSVIPC_COMPAT
|
|||
def_bool y
|
||||
depends on COMPAT && SYSVIPC
|
||||
|
||||
config ARCH_ENABLE_HUGEPAGE_MIGRATION
|
||||
def_bool y
|
||||
depends on HUGETLB_PAGE && MIGRATION
|
||||
|
||||
config ARCH_ENABLE_THP_MIGRATION
|
||||
def_bool y
|
||||
depends on TRANSPARENT_HUGEPAGE
|
||||
|
||||
menu "Power management options"
|
||||
|
||||
source "kernel/power/Kconfig"
|
||||
|
|
|
@ -252,7 +252,7 @@ void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr,
|
|||
set_pte(ptep, pte);
|
||||
}
|
||||
|
||||
pte_t *huge_pte_alloc(struct mm_struct *mm,
|
||||
pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
unsigned long addr, unsigned long sz)
|
||||
{
|
||||
pgd_t *pgdp;
|
||||
|
@ -284,9 +284,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
|
|||
*/
|
||||
ptep = pte_alloc_map(mm, pmdp, addr);
|
||||
} else if (sz == PMD_SIZE) {
|
||||
if (IS_ENABLED(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) &&
|
||||
pud_none(READ_ONCE(*pudp)))
|
||||
ptep = huge_pmd_share(mm, addr, pudp);
|
||||
if (want_pmd_share(vma, addr) && pud_none(READ_ONCE(*pudp)))
|
||||
ptep = huge_pmd_share(mm, vma, addr, pudp);
|
||||
else
|
||||
ptep = (pte_t *)pmd_alloc(mm, pudp, addr);
|
||||
} else if (sz == (CONT_PMD_SIZE)) {
|
||||
|
|
|
@ -13,6 +13,8 @@ config IA64
|
|||
select ARCH_MIGHT_HAVE_PC_SERIO
|
||||
select ACPI
|
||||
select ACPI_NUMA if NUMA
|
||||
select ARCH_ENABLE_MEMORY_HOTPLUG
|
||||
select ARCH_ENABLE_MEMORY_HOTREMOVE
|
||||
select ARCH_SUPPORTS_ACPI
|
||||
select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI
|
||||
select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
|
||||
|
@ -32,6 +34,7 @@ config IA64
|
|||
select TTY
|
||||
select HAVE_ARCH_TRACEHOOK
|
||||
select HAVE_VIRT_CPU_ACCOUNTING
|
||||
select HUGETLB_PAGE_SIZE_VARIABLE if HUGETLB_PAGE
|
||||
select VIRT_TO_BUS
|
||||
select GENERIC_IRQ_PROBE
|
||||
select GENERIC_PENDING_IRQ if SMP
|
||||
|
@ -82,11 +85,6 @@ config STACKTRACE_SUPPORT
|
|||
config GENERIC_LOCKBREAK
|
||||
def_bool n
|
||||
|
||||
config HUGETLB_PAGE_SIZE_VARIABLE
|
||||
bool
|
||||
depends on HUGETLB_PAGE
|
||||
default y
|
||||
|
||||
config GENERIC_CALIBRATE_DELAY
|
||||
bool
|
||||
default y
|
||||
|
@ -250,12 +248,6 @@ config HOTPLUG_CPU
|
|||
can be controlled through /sys/devices/system/cpu/cpu#.
|
||||
Say N if you want to disable CPU hotplug.
|
||||
|
||||
config ARCH_ENABLE_MEMORY_HOTPLUG
|
||||
def_bool y
|
||||
|
||||
config ARCH_ENABLE_MEMORY_HOTREMOVE
|
||||
def_bool y
|
||||
|
||||
config SCHED_SMT
|
||||
bool "SMT scheduler support"
|
||||
depends on SMP
|
||||
|
|
|
@ -25,7 +25,8 @@ unsigned int hpage_shift = HPAGE_SHIFT_DEFAULT;
|
|||
EXPORT_SYMBOL(hpage_shift);
|
||||
|
||||
pte_t *
|
||||
huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
|
||||
huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
unsigned long addr, unsigned long sz)
|
||||
{
|
||||
unsigned long taddr = htlbpage_to_page(addr);
|
||||
pgd_t *pgd;
|
||||
|
|
|
@ -19,6 +19,7 @@ config MIPS
|
|||
select ARCH_USE_MEMTEST
|
||||
select ARCH_USE_QUEUED_RWLOCKS
|
||||
select ARCH_USE_QUEUED_SPINLOCKS
|
||||
select ARCH_SUPPORTS_HUGETLBFS if CPU_SUPPORTS_HUGEPAGES
|
||||
select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
|
||||
select ARCH_WANT_IPC_PARSE_VERSION
|
||||
select ARCH_WANT_LD_ORPHAN_WARN
|
||||
|
@ -1287,11 +1288,6 @@ config SYS_SUPPORTS_BIG_ENDIAN
|
|||
config SYS_SUPPORTS_LITTLE_ENDIAN
|
||||
bool
|
||||
|
||||
config SYS_SUPPORTS_HUGETLBFS
|
||||
bool
|
||||
depends on CPU_SUPPORTS_HUGEPAGES
|
||||
default y
|
||||
|
||||
config MIPS_HUGE_TLB_SUPPORT
|
||||
def_bool HUGETLB_PAGE || TRANSPARENT_HUGEPAGE
|
||||
|
||||
|
|
|
@ -21,8 +21,8 @@
|
|||
#include <asm/tlb.h>
|
||||
#include <asm/tlbflush.h>
|
||||
|
||||
pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr,
|
||||
unsigned long sz)
|
||||
pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
unsigned long addr, unsigned long sz)
|
||||
{
|
||||
pgd_t *pgd;
|
||||
p4d_t *p4d;
|
||||
|
|
|
@ -12,6 +12,7 @@ config PARISC
|
|||
select ARCH_HAS_STRICT_KERNEL_RWX
|
||||
select ARCH_HAS_UBSAN_SANITIZE_ALL
|
||||
select ARCH_NO_SG_CHAIN
|
||||
select ARCH_SUPPORTS_HUGETLBFS if PA20
|
||||
select ARCH_SUPPORTS_MEMORY_FAILURE
|
||||
select DMA_OPS
|
||||
select RTC_CLASS
|
||||
|
@ -138,10 +139,6 @@ config PGTABLE_LEVELS
|
|||
default 3 if 64BIT && PARISC_PAGE_SIZE_4KB
|
||||
default 2
|
||||
|
||||
config SYS_SUPPORTS_HUGETLBFS
|
||||
def_bool y if PA20
|
||||
|
||||
|
||||
menu "Processor type and features"
|
||||
|
||||
choice
|
||||
|
|
|
@ -44,7 +44,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
|
|||
}
|
||||
|
||||
|
||||
pte_t *huge_pte_alloc(struct mm_struct *mm,
|
||||
pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
unsigned long addr, unsigned long sz)
|
||||
{
|
||||
pgd_t *pgd;
|
||||
|
|
|
@ -118,6 +118,8 @@ config PPC
|
|||
# Please keep this list sorted alphabetically.
|
||||
#
|
||||
select ARCH_32BIT_OFF_T if PPC32
|
||||
select ARCH_ENABLE_MEMORY_HOTPLUG
|
||||
select ARCH_ENABLE_MEMORY_HOTREMOVE
|
||||
select ARCH_HAS_DEBUG_VIRTUAL
|
||||
select ARCH_HAS_DEBUG_VM_PGTABLE
|
||||
select ARCH_HAS_DEVMEM_IS_ALLOWED
|
||||
|
@ -236,6 +238,7 @@ config PPC
|
|||
select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH
|
||||
select HAVE_PERF_REGS
|
||||
select HAVE_PERF_USER_STACK_DUMP
|
||||
select HUGETLB_PAGE_SIZE_VARIABLE if PPC_BOOK3S_64 && HUGETLB_PAGE
|
||||
select MMU_GATHER_RCU_TABLE_FREE
|
||||
select MMU_GATHER_PAGE_SIZE
|
||||
select HAVE_REGS_AND_STACK_ACCESS_API
|
||||
|
@ -420,11 +423,6 @@ config HIGHMEM
|
|||
|
||||
source "kernel/Kconfig.hz"
|
||||
|
||||
config HUGETLB_PAGE_SIZE_VARIABLE
|
||||
bool
|
||||
depends on HUGETLB_PAGE && PPC_BOOK3S_64
|
||||
default y
|
||||
|
||||
config MATH_EMULATION
|
||||
bool "Math emulation"
|
||||
depends on 4xx || PPC_8xx || PPC_MPC832x || BOOKE
|
||||
|
@ -520,12 +518,6 @@ config ARCH_CPU_PROBE_RELEASE
|
|||
def_bool y
|
||||
depends on HOTPLUG_CPU
|
||||
|
||||
config ARCH_ENABLE_MEMORY_HOTPLUG
|
||||
def_bool y
|
||||
|
||||
config ARCH_ENABLE_MEMORY_HOTREMOVE
|
||||
def_bool y
|
||||
|
||||
config PPC64_SUPPORTS_MEMORY_FAILURE
|
||||
bool "Add support for memory hwpoison"
|
||||
depends on PPC_BOOK3S_64
|
||||
|
@ -705,9 +697,6 @@ config ARCH_SPARSEMEM_DEFAULT
|
|||
def_bool y
|
||||
depends on PPC_BOOK3S_64
|
||||
|
||||
config SYS_SUPPORTS_HUGETLBFS
|
||||
bool
|
||||
|
||||
config ILLEGAL_POINTER_VALUE
|
||||
hex
|
||||
# This is roughly half way between the top of user space and the bottom
|
||||
|
|
|
@ -106,7 +106,8 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
|
|||
* At this point we do the placement change only for BOOK3S 64. This would
|
||||
* possibly work on other subarchs.
|
||||
*/
|
||||
pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
|
||||
pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
unsigned long addr, unsigned long sz)
|
||||
{
|
||||
pgd_t *pg;
|
||||
p4d_t *p4;
|
||||
|
|
|
@ -40,8 +40,8 @@ config PPC_85xx
|
|||
|
||||
config PPC_8xx
|
||||
bool "Freescale 8xx"
|
||||
select ARCH_SUPPORTS_HUGETLBFS
|
||||
select FSL_SOC
|
||||
select SYS_SUPPORTS_HUGETLBFS
|
||||
select PPC_HAVE_KUEP
|
||||
select PPC_HAVE_KUAP
|
||||
select HAVE_ARCH_VMAP_STACK
|
||||
|
@ -95,9 +95,11 @@ config PPC_BOOK3S_64
|
|||
bool "Server processors"
|
||||
select PPC_FPU
|
||||
select PPC_HAVE_PMU_SUPPORT
|
||||
select SYS_SUPPORTS_HUGETLBFS
|
||||
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
|
||||
select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION
|
||||
select ARCH_ENABLE_PMD_SPLIT_PTLOCK
|
||||
select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE
|
||||
select ARCH_SUPPORTS_HUGETLBFS
|
||||
select ARCH_SUPPORTS_NUMA_BALANCING
|
||||
select IRQ_WORK
|
||||
select PPC_MM_SLICES
|
||||
|
@ -280,9 +282,9 @@ config FSL_BOOKE
|
|||
# this is for common code between PPC32 & PPC64 FSL BOOKE
|
||||
config PPC_FSL_BOOK3E
|
||||
bool
|
||||
select ARCH_SUPPORTS_HUGETLBFS if PHYS_64BIT || PPC64
|
||||
select FSL_EMB_PERFMON
|
||||
select PPC_SMP_MUXED_IPI
|
||||
select SYS_SUPPORTS_HUGETLBFS if PHYS_64BIT || PPC64
|
||||
select PPC_DOORBELL
|
||||
default y if FSL_BOOKE
|
||||
|
||||
|
@ -358,10 +360,6 @@ config SPE
|
|||
|
||||
If in doubt, say Y here.
|
||||
|
||||
config ARCH_ENABLE_SPLIT_PMD_PTLOCK
|
||||
def_bool y
|
||||
depends on PPC_BOOK3S_64
|
||||
|
||||
config PPC_RADIX_MMU
|
||||
bool "Radix MMU Support"
|
||||
depends on PPC_BOOK3S_64
|
||||
|
@ -421,10 +419,6 @@ config PPC_PKEY
|
|||
depends on PPC_BOOK3S_64
|
||||
depends on PPC_MEM_KEYS || PPC_KUAP || PPC_KUEP
|
||||
|
||||
config ARCH_ENABLE_HUGEPAGE_MIGRATION
|
||||
def_bool y
|
||||
depends on PPC_BOOK3S_64 && HUGETLB_PAGE && MIGRATION
|
||||
|
||||
|
||||
config PPC_MMU_NOHASH
|
||||
def_bool y
|
||||
|
|
|
@ -30,6 +30,7 @@ config RISCV
|
|||
select ARCH_HAS_STRICT_KERNEL_RWX if MMU
|
||||
select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX
|
||||
select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT
|
||||
select ARCH_SUPPORTS_HUGETLBFS if MMU
|
||||
select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
|
||||
select ARCH_WANT_FRAME_POINTERS
|
||||
select ARCH_WANT_HUGE_PMD_SHARE if 64BIT
|
||||
|
@ -166,10 +167,6 @@ config ARCH_WANT_GENERAL_HUGETLB
|
|||
config ARCH_SUPPORTS_UPROBES
|
||||
def_bool y
|
||||
|
||||
config SYS_SUPPORTS_HUGETLBFS
|
||||
depends on MMU
|
||||
def_bool y
|
||||
|
||||
config STACKTRACE_SUPPORT
|
||||
def_bool y
|
||||
|
||||
|
|
|
@ -60,6 +60,9 @@ config S390
|
|||
imply IMA_SECURE_AND_OR_TRUSTED_BOOT
|
||||
select ARCH_32BIT_USTAT_F_TINODE
|
||||
select ARCH_BINFMT_ELF_STATE
|
||||
select ARCH_ENABLE_MEMORY_HOTPLUG if SPARSEMEM
|
||||
select ARCH_ENABLE_MEMORY_HOTREMOVE
|
||||
select ARCH_ENABLE_SPLIT_PMD_PTLOCK
|
||||
select ARCH_HAS_DEBUG_VM_PGTABLE
|
||||
select ARCH_HAS_DEBUG_WX
|
||||
select ARCH_HAS_DEVMEM_IS_ALLOWED
|
||||
|
@ -626,15 +629,6 @@ config ARCH_SPARSEMEM_ENABLE
|
|||
config ARCH_SPARSEMEM_DEFAULT
|
||||
def_bool y
|
||||
|
||||
config ARCH_ENABLE_MEMORY_HOTPLUG
|
||||
def_bool y if SPARSEMEM
|
||||
|
||||
config ARCH_ENABLE_MEMORY_HOTREMOVE
|
||||
def_bool y
|
||||
|
||||
config ARCH_ENABLE_SPLIT_PMD_PTLOCK
|
||||
def_bool y
|
||||
|
||||
config MAX_PHYSMEM_BITS
|
||||
int "Maximum size of supported physical memory in bits (42-53)"
|
||||
range 42 53
|
||||
|
|
|
@ -189,7 +189,7 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
|
|||
return pte;
|
||||
}
|
||||
|
||||
pte_t *huge_pte_alloc(struct mm_struct *mm,
|
||||
pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
unsigned long addr, unsigned long sz)
|
||||
{
|
||||
pgd_t *pgdp;
|
||||
|
|
|
@ -2,6 +2,8 @@
|
|||
config SUPERH
|
||||
def_bool y
|
||||
select ARCH_32BIT_OFF_T
|
||||
select ARCH_ENABLE_MEMORY_HOTPLUG if SPARSEMEM && MMU
|
||||
select ARCH_ENABLE_MEMORY_HOTREMOVE if SPARSEMEM && MMU
|
||||
select ARCH_HAVE_CUSTOM_GPIO_H
|
||||
select ARCH_HAVE_NMI_SAFE_CMPXCHG if (GUSA_RB || CPU_SH4A)
|
||||
select ARCH_HAS_BINFMT_FLAT if !MMU
|
||||
|
@ -101,9 +103,6 @@ config SYS_SUPPORTS_APM_EMULATION
|
|||
bool
|
||||
select ARCH_SUSPEND_POSSIBLE
|
||||
|
||||
config SYS_SUPPORTS_HUGETLBFS
|
||||
bool
|
||||
|
||||
config SYS_SUPPORTS_SMP
|
||||
bool
|
||||
|
||||
|
@ -175,12 +174,12 @@ config CPU_SH3
|
|||
|
||||
config CPU_SH4
|
||||
bool
|
||||
select ARCH_SUPPORTS_HUGETLBFS if MMU
|
||||
select CPU_HAS_INTEVT
|
||||
select CPU_HAS_SR_RB
|
||||
select CPU_HAS_FPU if !CPU_SH4AL_DSP
|
||||
select SH_INTC
|
||||
select SYS_SUPPORTS_SH_TMU
|
||||
select SYS_SUPPORTS_HUGETLBFS if MMU
|
||||
|
||||
config CPU_SH4A
|
||||
bool
|
||||
|
|
|
@ -136,14 +136,6 @@ config ARCH_SPARSEMEM_DEFAULT
|
|||
config ARCH_SELECT_MEMORY_MODEL
|
||||
def_bool y
|
||||
|
||||
config ARCH_ENABLE_MEMORY_HOTPLUG
|
||||
def_bool y
|
||||
depends on SPARSEMEM && MMU
|
||||
|
||||
config ARCH_ENABLE_MEMORY_HOTREMOVE
|
||||
def_bool y
|
||||
depends on SPARSEMEM && MMU
|
||||
|
||||
config ARCH_MEMORY_PROBE
|
||||
def_bool y
|
||||
depends on MEMORY_HOTPLUG
|
||||
|
|
|
@ -21,7 +21,7 @@
|
|||
#include <asm/tlbflush.h>
|
||||
#include <asm/cacheflush.h>
|
||||
|
||||
pte_t *huge_pte_alloc(struct mm_struct *mm,
|
||||
pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
unsigned long addr, unsigned long sz)
|
||||
{
|
||||
pgd_t *pgd;
|
||||
|
|
|
@ -279,7 +279,7 @@ unsigned long pud_leaf_size(pud_t pud) { return 1UL << tte_to_shift(*(pte_t *)&p
|
|||
unsigned long pmd_leaf_size(pmd_t pmd) { return 1UL << tte_to_shift(*(pte_t *)&pmd); }
|
||||
unsigned long pte_leaf_size(pte_t pte) { return 1UL << tte_to_shift(pte); }
|
||||
|
||||
pte_t *huge_pte_alloc(struct mm_struct *mm,
|
||||
pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
unsigned long addr, unsigned long sz)
|
||||
{
|
||||
pgd_t *pgd;
|
||||
|
|
|
@ -60,7 +60,13 @@ config X86
|
|||
select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI
|
||||
select ARCH_32BIT_OFF_T if X86_32
|
||||
select ARCH_CLOCKSOURCE_INIT
|
||||
select ARCH_ENABLE_HUGEPAGE_MIGRATION if X86_64 && HUGETLB_PAGE && MIGRATION
|
||||
select ARCH_ENABLE_MEMORY_HOTPLUG if X86_64 || (X86_32 && HIGHMEM)
|
||||
select ARCH_ENABLE_MEMORY_HOTREMOVE if MEMORY_HOTPLUG
|
||||
select ARCH_ENABLE_SPLIT_PMD_PTLOCK if X86_64 || X86_PAE
|
||||
select ARCH_ENABLE_THP_MIGRATION if X86_64 && TRANSPARENT_HUGEPAGE
|
||||
select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI
|
||||
select ARCH_HAS_CACHE_LINE_SIZE
|
||||
select ARCH_HAS_DEBUG_VIRTUAL
|
||||
select ARCH_HAS_DEBUG_VM_PGTABLE if !X86_PAE
|
||||
select ARCH_HAS_DEVMEM_IS_ALLOWED
|
||||
|
@ -165,6 +171,7 @@ config X86
|
|||
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
|
||||
select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD if X86_64
|
||||
select HAVE_ARCH_USERFAULTFD_WP if X86_64 && USERFAULTFD
|
||||
select HAVE_ARCH_USERFAULTFD_MINOR if X86_64 && USERFAULTFD
|
||||
select HAVE_ARCH_VMAP_STACK if X86_64
|
||||
select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
|
||||
select HAVE_ARCH_WITHIN_STACK_FRAMES
|
||||
|
@ -315,9 +322,6 @@ config GENERIC_CALIBRATE_DELAY
|
|||
config ARCH_HAS_CPU_RELAX
|
||||
def_bool y
|
||||
|
||||
config ARCH_HAS_CACHE_LINE_SIZE
|
||||
def_bool y
|
||||
|
||||
config ARCH_HAS_FILTER_PGPROT
|
||||
def_bool y
|
||||
|
||||
|
@ -2428,30 +2432,13 @@ config ARCH_HAS_ADD_PAGES
|
|||
def_bool y
|
||||
depends on X86_64 && ARCH_ENABLE_MEMORY_HOTPLUG
|
||||
|
||||
config ARCH_ENABLE_MEMORY_HOTPLUG
|
||||
config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
|
||||
def_bool y
|
||||
depends on X86_64 || (X86_32 && HIGHMEM)
|
||||
|
||||
config ARCH_ENABLE_MEMORY_HOTREMOVE
|
||||
def_bool y
|
||||
depends on MEMORY_HOTPLUG
|
||||
|
||||
config USE_PERCPU_NUMA_NODE_ID
|
||||
def_bool y
|
||||
depends on NUMA
|
||||
|
||||
config ARCH_ENABLE_SPLIT_PMD_PTLOCK
|
||||
def_bool y
|
||||
depends on X86_64 || X86_PAE
|
||||
|
||||
config ARCH_ENABLE_HUGEPAGE_MIGRATION
|
||||
def_bool y
|
||||
depends on X86_64 && HUGETLB_PAGE && MIGRATION
|
||||
|
||||
config ARCH_ENABLE_THP_MIGRATION
|
||||
def_bool y
|
||||
depends on X86_64 && TRANSPARENT_HUGEPAGE
|
||||
|
||||
menu "Power management and ACPI options"
|
||||
|
||||
config ARCH_HIBERNATION_HEADER
|
||||
|
|
|
@ -16,6 +16,8 @@
|
|||
#include <linux/pci.h>
|
||||
#include <linux/vmalloc.h>
|
||||
#include <linux/libnvdimm.h>
|
||||
#include <linux/vmstat.h>
|
||||
#include <linux/kernel.h>
|
||||
|
||||
#include <asm/e820/api.h>
|
||||
#include <asm/processor.h>
|
||||
|
@ -91,6 +93,12 @@ static void split_page_count(int level)
|
|||
return;
|
||||
|
||||
direct_pages_count[level]--;
|
||||
if (system_state == SYSTEM_RUNNING) {
|
||||
if (level == PG_LEVEL_2M)
|
||||
count_vm_event(DIRECT_MAP_LEVEL2_SPLIT);
|
||||
else if (level == PG_LEVEL_1G)
|
||||
count_vm_event(DIRECT_MAP_LEVEL3_SPLIT);
|
||||
}
|
||||
direct_pages_count[level - 1] += PTRS_PER_PTE;
|
||||
}
|
||||
|
||||
|
|
|
@@ -171,6 +171,7 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
	acpi_handle handle = mem_device->device->handle;
	int result, num_enabled = 0;
	struct acpi_memory_info *info;
	mhp_t mhp_flags = MHP_NONE;
	int node;

	node = acpi_get_node(handle);

@@ -194,8 +195,10 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
		if (node < 0)
			node = memory_add_physaddr_to_nid(info->start_addr);

		if (mhp_supports_memmap_on_memory(info->length))
			mhp_flags |= MHP_MEMMAP_ON_MEMORY;
		result = __add_memory(node, info->start_addr, info->length,
				      MHP_NONE);
				      mhp_flags);

		/*
		 * If the memory block has been used by the kernel, add_memory()
@ -169,30 +169,98 @@ int memory_notify(unsigned long val, void *v)
|
|||
return blocking_notifier_call_chain(&memory_chain, val, v);
|
||||
}
|
||||
|
||||
static int memory_block_online(struct memory_block *mem)
|
||||
{
|
||||
unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
|
||||
unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
|
||||
unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
|
||||
struct zone *zone;
|
||||
int ret;
|
||||
|
||||
zone = zone_for_pfn_range(mem->online_type, mem->nid, start_pfn, nr_pages);
|
||||
|
||||
/*
|
||||
* Although vmemmap pages have a different lifecycle than the pages
|
||||
* they describe (they remain until the memory is unplugged), doing
|
||||
* their initialization and accounting at memory onlining/offlining
|
||||
* stage helps to keep accounting easier to follow - e.g vmemmaps
|
||||
* belong to the same zone as the memory they backed.
|
||||
*/
|
||||
if (nr_vmemmap_pages) {
|
||||
ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, zone);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
ret = online_pages(start_pfn + nr_vmemmap_pages,
|
||||
nr_pages - nr_vmemmap_pages, zone);
|
||||
if (ret) {
|
||||
if (nr_vmemmap_pages)
|
||||
mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Account once onlining succeeded. If the zone was unpopulated, it is
|
||||
* now already properly populated.
|
||||
*/
|
||||
if (nr_vmemmap_pages)
|
||||
adjust_present_page_count(zone, nr_vmemmap_pages);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int memory_block_offline(struct memory_block *mem)
|
||||
{
|
||||
unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
|
||||
unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
|
||||
unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
|
||||
struct zone *zone;
|
||||
int ret;
|
||||
|
||||
zone = page_zone(pfn_to_page(start_pfn));
|
||||
|
||||
/*
|
||||
* Unaccount before offlining, such that unpopulated zone and kthreads
|
||||
* can properly be torn down in offline_pages().
|
||||
*/
|
||||
if (nr_vmemmap_pages)
|
||||
adjust_present_page_count(zone, -nr_vmemmap_pages);
|
||||
|
||||
ret = offline_pages(start_pfn + nr_vmemmap_pages,
|
||||
nr_pages - nr_vmemmap_pages);
|
||||
if (ret) {
|
||||
/* offline_pages() failed. Account back. */
|
||||
if (nr_vmemmap_pages)
|
||||
adjust_present_page_count(zone, nr_vmemmap_pages);
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (nr_vmemmap_pages)
|
||||
mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
|
||||
* OK to have direct references to sparsemem variables in here.
|
||||
*/
|
||||
static int
|
||||
memory_block_action(unsigned long start_section_nr, unsigned long action,
|
||||
int online_type, int nid)
|
||||
memory_block_action(struct memory_block *mem, unsigned long action)
|
||||
{
|
||||
unsigned long start_pfn;
|
||||
unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
|
||||
int ret;
|
||||
|
||||
start_pfn = section_nr_to_pfn(start_section_nr);
|
||||
|
||||
switch (action) {
|
||||
case MEM_ONLINE:
|
||||
ret = online_pages(start_pfn, nr_pages, online_type, nid);
|
||||
ret = memory_block_online(mem);
|
||||
break;
|
||||
case MEM_OFFLINE:
|
||||
ret = offline_pages(start_pfn, nr_pages);
|
||||
ret = memory_block_offline(mem);
|
||||
break;
|
||||
default:
|
||||
WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
|
||||
"%ld\n", __func__, start_section_nr, action, action);
|
||||
"%ld\n", __func__, mem->start_section_nr, action, action);
|
||||
ret = -EINVAL;
|
||||
}
|
||||
|
||||
|
@ -210,9 +278,7 @@ static int memory_block_change_state(struct memory_block *mem,
|
|||
if (to_state == MEM_OFFLINE)
|
||||
mem->state = MEM_GOING_OFFLINE;
|
||||
|
||||
ret = memory_block_action(mem->start_section_nr, to_state,
|
||||
mem->online_type, mem->nid);
|
||||
|
||||
ret = memory_block_action(mem, to_state);
|
||||
mem->state = ret ? from_state_req : to_state;
|
||||
|
||||
return ret;
|
||||
|
@ -567,7 +633,8 @@ int register_memory(struct memory_block *memory)
|
|||
return ret;
|
||||
}
|
||||
|
||||
static int init_memory_block(unsigned long block_id, unsigned long state)
|
||||
static int init_memory_block(unsigned long block_id, unsigned long state,
|
||||
unsigned long nr_vmemmap_pages)
|
||||
{
|
||||
struct memory_block *mem;
|
||||
int ret = 0;
|
||||
|
@ -584,6 +651,7 @@ static int init_memory_block(unsigned long block_id, unsigned long state)
|
|||
mem->start_section_nr = block_id * sections_per_block;
|
||||
mem->state = state;
|
||||
mem->nid = NUMA_NO_NODE;
|
||||
mem->nr_vmemmap_pages = nr_vmemmap_pages;
|
||||
|
||||
ret = register_memory(mem);
|
||||
|
||||
|
@ -603,7 +671,7 @@ static int add_memory_block(unsigned long base_section_nr)
|
|||
if (section_count == 0)
|
||||
return 0;
|
||||
return init_memory_block(memory_block_id(base_section_nr),
|
||||
MEM_ONLINE);
|
||||
MEM_ONLINE, 0);
|
||||
}
|
||||
|
||||
static void unregister_memory(struct memory_block *memory)
|
||||
|
@ -625,7 +693,8 @@ static void unregister_memory(struct memory_block *memory)
|
|||
*
|
||||
* Called under device_hotplug_lock.
|
||||
*/
|
||||
int create_memory_block_devices(unsigned long start, unsigned long size)
|
||||
int create_memory_block_devices(unsigned long start, unsigned long size,
|
||||
unsigned long vmemmap_pages)
|
||||
{
|
||||
const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
|
||||
unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
|
||||
|
@ -638,7 +707,7 @@ int create_memory_block_devices(unsigned long start, unsigned long size)
|
|||
return -EINVAL;
|
||||
|
||||
for (block_id = start_block_id; block_id != end_block_id; block_id++) {
|
||||
ret = init_memory_block(block_id, MEM_OFFLINE);
|
||||
ret = init_memory_block(block_id, MEM_OFFLINE, vmemmap_pages);
|
||||
if (ret)
|
||||
break;
|
||||
}
|
||||
|
|
|
@ -223,10 +223,13 @@ config TMPFS_INODE64
|
|||
|
||||
If unsure, say N.
|
||||
|
||||
config ARCH_SUPPORTS_HUGETLBFS
|
||||
def_bool n
|
||||
|
||||
config HUGETLBFS
|
||||
bool "HugeTLB file system support"
|
||||
depends on X86 || IA64 || SPARC64 || (S390 && 64BIT) || \
|
||||
SYS_SUPPORTS_HUGETLBFS || BROKEN
|
||||
ARCH_SUPPORTS_HUGETLBFS || BROKEN
|
||||
help
|
||||
hugetlbfs is a filesystem backing for HugeTLB pages, based on
|
||||
ramfs. For architectures that support it, say Y here and read
|
||||
|
|
|
@ -79,7 +79,7 @@ static void kill_bdev(struct block_device *bdev)
|
|||
{
|
||||
struct address_space *mapping = bdev->bd_inode->i_mapping;
|
||||
|
||||
if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
|
||||
if (mapping_empty(mapping))
|
||||
return;
|
||||
|
||||
invalidate_bh_lrus();
|
||||
|
|
|
@ -591,16 +591,13 @@ static noinline int add_ra_bio_pages(struct inode *inode,
|
|||
free_extent_map(em);
|
||||
|
||||
if (page->index == end_index) {
|
||||
char *userpage;
|
||||
size_t zero_offset = offset_in_page(isize);
|
||||
|
||||
if (zero_offset) {
|
||||
int zeros;
|
||||
zeros = PAGE_SIZE - zero_offset;
|
||||
userpage = kmap_atomic(page);
|
||||
memset(userpage + zero_offset, 0, zeros);
|
||||
memzero_page(page, zero_offset, zeros);
|
||||
flush_dcache_page(page);
|
||||
kunmap_atomic(userpage);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -3421,15 +3421,12 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
|
|||
}
|
||||
|
||||
if (page->index == last_byte >> PAGE_SHIFT) {
|
||||
char *userpage;
|
||||
size_t zero_offset = offset_in_page(last_byte);
|
||||
|
||||
if (zero_offset) {
|
||||
iosize = PAGE_SIZE - zero_offset;
|
||||
userpage = kmap_atomic(page);
|
||||
memset(userpage + zero_offset, 0, iosize);
|
||||
memzero_page(page, zero_offset, iosize);
|
||||
flush_dcache_page(page);
|
||||
kunmap_atomic(userpage);
|
||||
}
|
||||
}
|
||||
begin_page_read(fs_info, page);
|
||||
|
@ -3438,14 +3435,11 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
|
|||
u64 disk_bytenr;
|
||||
|
||||
if (cur >= last_byte) {
|
||||
char *userpage;
|
||||
struct extent_state *cached = NULL;
|
||||
|
||||
iosize = PAGE_SIZE - pg_offset;
|
||||
userpage = kmap_atomic(page);
|
||||
memset(userpage + pg_offset, 0, iosize);
|
||||
memzero_page(page, pg_offset, iosize);
|
||||
flush_dcache_page(page);
|
||||
kunmap_atomic(userpage);
|
||||
set_extent_uptodate(tree, cur, cur + iosize - 1,
|
||||
&cached, GFP_NOFS);
|
||||
unlock_extent_cached(tree, cur,
|
||||
|
@ -3528,13 +3522,10 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
|
|||
|
||||
/* we've found a hole, just zero and go on */
|
||||
if (block_start == EXTENT_MAP_HOLE) {
|
||||
char *userpage;
|
||||
struct extent_state *cached = NULL;
|
||||
|
||||
userpage = kmap_atomic(page);
|
||||
memset(userpage + pg_offset, 0, iosize);
|
||||
memzero_page(page, pg_offset, iosize);
|
||||
flush_dcache_page(page);
|
||||
kunmap_atomic(userpage);
|
||||
|
||||
set_extent_uptodate(tree, cur, cur + iosize - 1,
|
||||
&cached, GFP_NOFS);
|
||||
|
@ -3845,12 +3836,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
|
|||
}
|
||||
|
||||
if (page->index == end_index) {
|
||||
char *userpage;
|
||||
|
||||
userpage = kmap_atomic(page);
|
||||
memset(userpage + pg_offset, 0,
|
||||
PAGE_SIZE - pg_offset);
|
||||
kunmap_atomic(userpage);
|
||||
memzero_page(page, pg_offset, PAGE_SIZE - pg_offset);
|
||||
flush_dcache_page(page);
|
||||
}
|
||||
|
||||
|
|
|
@ -646,17 +646,12 @@ again:
|
|||
if (!ret) {
|
||||
unsigned long offset = offset_in_page(total_compressed);
|
||||
struct page *page = pages[nr_pages - 1];
|
||||
char *kaddr;
|
||||
|
||||
/* zero the tail end of the last page, we might be
|
||||
* sending it down to disk
|
||||
*/
|
||||
if (offset) {
|
||||
kaddr = kmap_atomic(page);
|
||||
memset(kaddr + offset, 0,
|
||||
PAGE_SIZE - offset);
|
||||
kunmap_atomic(kaddr);
|
||||
}
|
||||
if (offset)
|
||||
memzero_page(page, offset, PAGE_SIZE - offset);
|
||||
will_compress = 1;
|
||||
}
|
||||
}
|
||||
|
@ -4833,7 +4828,6 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
|
|||
struct btrfs_ordered_extent *ordered;
|
||||
struct extent_state *cached_state = NULL;
|
||||
struct extent_changeset *data_reserved = NULL;
|
||||
char *kaddr;
|
||||
bool only_release_metadata = false;
|
||||
u32 blocksize = fs_info->sectorsize;
|
||||
pgoff_t index = from >> PAGE_SHIFT;
|
||||
|
@ -4925,15 +4919,13 @@ again:
|
|||
if (offset != blocksize) {
|
||||
if (!len)
|
||||
len = blocksize - offset;
|
||||
kaddr = kmap(page);
|
||||
if (front)
|
||||
memset(kaddr + (block_start - page_offset(page)),
|
||||
0, offset);
|
||||
memzero_page(page, (block_start - page_offset(page)),
|
||||
offset);
|
||||
else
|
||||
memset(kaddr + (block_start - page_offset(page)) + offset,
|
||||
0, len);
|
||||
memzero_page(page, (block_start - page_offset(page)) + offset,
|
||||
len);
|
||||
flush_dcache_page(page);
|
||||
kunmap(page);
|
||||
}
|
||||
ClearPageChecked(page);
|
||||
set_page_dirty(page);
|
||||
|
@ -6832,11 +6824,9 @@ static noinline int uncompress_inline(struct btrfs_path *path,
|
|||
* cover that region here.
|
||||
*/
|
||||
|
||||
if (max_size + pg_offset < PAGE_SIZE) {
|
||||
char *map = kmap(page);
|
||||
memset(map + pg_offset + max_size, 0, PAGE_SIZE - max_size - pg_offset);
|
||||
kunmap(page);
|
||||
}
|
||||
if (max_size + pg_offset < PAGE_SIZE)
|
||||
memzero_page(page, pg_offset + max_size,
|
||||
PAGE_SIZE - max_size - pg_offset);
|
||||
kfree(tmp);
|
||||
return ret;
|
||||
}
|
||||
|
@ -8506,7 +8496,6 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
|
|||
struct btrfs_ordered_extent *ordered;
|
||||
struct extent_state *cached_state = NULL;
|
||||
struct extent_changeset *data_reserved = NULL;
|
||||
char *kaddr;
|
||||
unsigned long zero_start;
|
||||
loff_t size;
|
||||
vm_fault_t ret;
|
||||
|
@ -8620,10 +8609,8 @@ again:
|
|||
zero_start = PAGE_SIZE;
|
||||
|
||||
if (zero_start != PAGE_SIZE) {
|
||||
kaddr = kmap(page);
|
||||
memset(kaddr + zero_start, 0, PAGE_SIZE - zero_start);
|
||||
memzero_page(page, zero_start, PAGE_SIZE - zero_start);
|
||||
flush_dcache_page(page);
|
||||
kunmap(page);
|
||||
}
|
||||
ClearPageChecked(page);
|
||||
set_page_dirty(page);
|
||||
|
|
|
@ -129,12 +129,8 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
|
|||
* So what's in the range [500, 4095] corresponds to zeroes.
|
||||
*/
|
||||
if (datal < block_size) {
|
||||
char *map;
|
||||
|
||||
map = kmap(page);
|
||||
memset(map + datal, 0, block_size - datal);
|
||||
memzero_page(page, datal, block_size - datal);
|
||||
flush_dcache_page(page);
|
||||
kunmap(page);
|
||||
}
|
||||
|
||||
SetPageUptodate(page);
|
||||
|
|
|
@ -375,7 +375,6 @@ int zlib_decompress(struct list_head *ws, unsigned char *data_in,
|
|||
unsigned long bytes_left;
|
||||
unsigned long total_out = 0;
|
||||
unsigned long pg_offset = 0;
|
||||
char *kaddr;
|
||||
|
||||
destlen = min_t(unsigned long, destlen, PAGE_SIZE);
|
||||
bytes_left = destlen;
|
||||
|
@ -455,9 +454,7 @@ next:
|
|||
* end of the inline extent (destlen) to the end of the page
|
||||
*/
|
||||
if (pg_offset < destlen) {
|
||||
kaddr = kmap_atomic(dest_page);
|
||||
memset(kaddr + pg_offset, 0, destlen - pg_offset);
|
||||
kunmap_atomic(kaddr);
|
||||
memzero_page(dest_page, pg_offset, destlen - pg_offset);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
|
|
@ -631,7 +631,6 @@ int zstd_decompress(struct list_head *ws, unsigned char *data_in,
|
|||
size_t ret2;
|
||||
unsigned long total_out = 0;
|
||||
unsigned long pg_offset = 0;
|
||||
char *kaddr;
|
||||
|
||||
stream = ZSTD_initDStream(
|
||||
ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size);
|
||||
|
@ -696,9 +695,7 @@ int zstd_decompress(struct list_head *ws, unsigned char *data_in,
|
|||
ret = 0;
|
||||
finish:
|
||||
if (pg_offset < destlen) {
|
||||
kaddr = kmap_atomic(dest_page);
|
||||
memset(kaddr + pg_offset, 0, destlen - pg_offset);
|
||||
kunmap_atomic(kaddr);
|
||||
memzero_page(dest_page, pg_offset, destlen - pg_offset);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
|
fs/buffer.c
@ -1260,6 +1260,15 @@ static void bh_lru_install(struct buffer_head *bh)
|
|||
int i;
|
||||
|
||||
check_irqs_on();
|
||||
/*
|
||||
* the refcount of buffer_head in bh_lru prevents dropping the
|
||||
* attached page(i.e., try_to_free_buffers) so it could cause
|
||||
* failing page migration.
|
||||
* Skip putting upcoming bh into bh_lru until migration is done.
|
||||
*/
|
||||
if (lru_cache_disabled())
|
||||
return;
|
||||
|
||||
bh_lru_lock();
|
||||
|
||||
b = this_cpu_ptr(&bh_lrus);
|
||||
|
@ -1400,6 +1409,15 @@ __bread_gfp(struct block_device *bdev, sector_t block,
|
|||
}
|
||||
EXPORT_SYMBOL(__bread_gfp);
|
||||
|
||||
static void __invalidate_bh_lrus(struct bh_lru *b)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < BH_LRU_SIZE; i++) {
|
||||
brelse(b->bhs[i]);
|
||||
b->bhs[i] = NULL;
|
||||
}
|
||||
}
|
||||
/*
|
||||
* invalidate_bh_lrus() is called rarely - but not only at unmount.
|
||||
* This doesn't race because it runs in each cpu either in irq
|
||||
|
@ -1408,16 +1426,12 @@ EXPORT_SYMBOL(__bread_gfp);
|
|||
static void invalidate_bh_lru(void *arg)
|
||||
{
|
||||
struct bh_lru *b = &get_cpu_var(bh_lrus);
|
||||
int i;
|
||||
|
||||
for (i = 0; i < BH_LRU_SIZE; i++) {
|
||||
brelse(b->bhs[i]);
|
||||
b->bhs[i] = NULL;
|
||||
}
|
||||
__invalidate_bh_lrus(b);
|
||||
put_cpu_var(bh_lrus);
|
||||
}
|
||||
|
||||
static bool has_bh_in_lru(int cpu, void *dummy)
|
||||
bool has_bh_in_lru(int cpu, void *dummy)
|
||||
{
|
||||
struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu);
|
||||
int i;
|
||||
|
@ -1436,6 +1450,16 @@ void invalidate_bh_lrus(void)
|
|||
}
|
||||
EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
|
||||
|
||||
void invalidate_bh_lrus_cpu(int cpu)
|
||||
{
|
||||
struct bh_lru *b;
|
||||
|
||||
bh_lru_lock();
|
||||
b = per_cpu_ptr(&bh_lrus, cpu);
|
||||
__invalidate_bh_lrus(b);
|
||||
bh_lru_unlock();
|
||||
}
|
||||
|
||||
void set_bh_page(struct buffer_head *bh,
|
||||
struct page *page, unsigned long offset)
|
||||
{
|
||||
|
|
fs/dax.c
@ -525,7 +525,7 @@ retry:
|
|||
dax_disassociate_entry(entry, mapping, false);
|
||||
xas_store(xas, NULL); /* undo the PMD join */
|
||||
dax_wake_entry(xas, entry, true);
|
||||
mapping->nrexceptional--;
|
||||
mapping->nrpages -= PG_PMD_NR;
|
||||
entry = NULL;
|
||||
xas_set(xas, index);
|
||||
}
|
||||
|
@ -541,7 +541,7 @@ retry:
|
|||
dax_lock_entry(xas, entry);
|
||||
if (xas_error(xas))
|
||||
goto out_unlock;
|
||||
mapping->nrexceptional++;
|
||||
mapping->nrpages += 1UL << order;
|
||||
}
|
||||
|
||||
out_unlock:
|
||||
|
@ -661,7 +661,7 @@ static int __dax_invalidate_entry(struct address_space *mapping,
|
|||
goto out;
|
||||
dax_disassociate_entry(entry, mapping, trunc);
|
||||
xas_store(&xas, NULL);
|
||||
mapping->nrexceptional--;
|
||||
mapping->nrpages -= 1UL << dax_entry_order(entry);
|
||||
ret = 1;
|
||||
out:
|
||||
put_unlocked_entry(&xas, entry);
|
||||
|
@ -965,7 +965,7 @@ int dax_writeback_mapping_range(struct address_space *mapping,
|
|||
if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
|
||||
return -EIO;
|
||||
|
||||
if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
|
||||
if (mapping_empty(mapping) || wbc->sync_mode != WB_SYNC_ALL)
|
||||
return 0;
|
||||
|
||||
trace_dax_writeback_range(inode, xas.xa_index, end_index);
|
||||
|
|
|
@ -273,8 +273,7 @@ static void __gfs2_glock_put(struct gfs2_glock *gl)
|
|||
if (mapping) {
|
||||
truncate_inode_pages_final(mapping);
|
||||
if (!gfs2_withdrawn(sdp))
|
||||
GLOCK_BUG_ON(gl, mapping->nrpages ||
|
||||
mapping->nrexceptional);
|
||||
GLOCK_BUG_ON(gl, !mapping_empty(mapping));
|
||||
}
|
||||
trace_gfs2_glock_put(gl);
|
||||
sdp->sd_lockstruct.ls_ops->lm_put_lock(gl);
|
||||
|
|
|
@ -463,14 +463,11 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
|
|||
struct address_space *mapping = &inode->i_data;
|
||||
const pgoff_t start = lstart >> huge_page_shift(h);
|
||||
const pgoff_t end = lend >> huge_page_shift(h);
|
||||
struct vm_area_struct pseudo_vma;
|
||||
struct pagevec pvec;
|
||||
pgoff_t next, index;
|
||||
int i, freed = 0;
|
||||
bool truncate_op = (lend == LLONG_MAX);
|
||||
|
||||
vma_init(&pseudo_vma, current->mm);
|
||||
pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
|
||||
pagevec_init(&pvec);
|
||||
next = start;
|
||||
while (next < end) {
|
||||
|
@ -482,10 +479,9 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
|
|||
|
||||
for (i = 0; i < pagevec_count(&pvec); ++i) {
|
||||
struct page *page = pvec.pages[i];
|
||||
u32 hash;
|
||||
u32 hash = 0;
|
||||
|
||||
index = page->index;
|
||||
hash = hugetlb_fault_mutex_hash(mapping, index);
|
||||
if (!truncate_op) {
|
||||
/*
|
||||
* Only need to hold the fault mutex in the
|
||||
|
@ -493,6 +489,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
|
|||
* page faults. Races are not possible in the
|
||||
* case of truncation.
|
||||
*/
|
||||
hash = hugetlb_fault_mutex_hash(mapping, index);
|
||||
mutex_lock(&hugetlb_fault_mutex_table[hash]);
|
||||
}
|
||||
|
||||
|
@ -1435,7 +1432,7 @@ static int get_hstate_idx(int page_size_log)
|
|||
|
||||
if (!h)
|
||||
return -1;
|
||||
return h - hstates;
|
||||
return hstate_index(h);
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
|
@ -529,7 +529,14 @@ void clear_inode(struct inode *inode)
|
|||
*/
|
||||
xa_lock_irq(&inode->i_data.i_pages);
|
||||
BUG_ON(inode->i_data.nrpages);
|
||||
BUG_ON(inode->i_data.nrexceptional);
|
||||
/*
|
||||
* Almost always, mapping_empty(&inode->i_data) here; but there are
|
||||
* two known and long-standing ways in which nodes may get left behind
|
||||
* (when deep radix-tree node allocation failed partway; or when THP
|
||||
* collapse_file() failed). Until those two known cases are cleaned up,
|
||||
* or a cleanup function is called here, do not BUG_ON(!mapping_empty),
|
||||
* nor even WARN_ON(!mapping_empty).
|
||||
*/
|
||||
xa_unlock_irq(&inode->i_data.i_pages);
|
||||
BUG_ON(!list_empty(&inode->i_data.private_list));
|
||||
BUG_ON(!(inode->i_state & I_FREEING));
|
||||
|
|
|
@ -661,6 +661,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
|
|||
[ilog2(VM_PKEY_BIT4)] = "",
|
||||
#endif
|
||||
#endif /* CONFIG_ARCH_HAS_PKEYS */
|
||||
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
|
||||
[ilog2(VM_UFFD_MINOR)] = "ui",
|
||||
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
|
||||
};
|
||||
size_t i;
|
||||
|
||||
|
|
fs/userfaultfd.c
@ -15,6 +15,7 @@
|
|||
#include <linux/sched/signal.h>
|
||||
#include <linux/sched/mm.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/mmu_notifier.h>
|
||||
#include <linux/poll.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/seq_file.h>
|
||||
|
@ -196,24 +197,21 @@ static inline struct uffd_msg userfault_msg(unsigned long address,
|
|||
msg_init(&msg);
|
||||
msg.event = UFFD_EVENT_PAGEFAULT;
|
||||
msg.arg.pagefault.address = address;
|
||||
/*
|
||||
* These flags indicate why the userfault occurred:
|
||||
* - UFFD_PAGEFAULT_FLAG_WP indicates a write protect fault.
|
||||
* - UFFD_PAGEFAULT_FLAG_MINOR indicates a minor fault.
|
||||
* - Neither of these flags being set indicates a MISSING fault.
|
||||
*
|
||||
* Separately, UFFD_PAGEFAULT_FLAG_WRITE indicates it was a write
|
||||
* fault. Otherwise, it was a read fault.
|
||||
*/
|
||||
if (flags & FAULT_FLAG_WRITE)
|
||||
/*
|
||||
* If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the
|
||||
* uffdio_api.features and UFFD_PAGEFAULT_FLAG_WRITE
|
||||
* was not set in a UFFD_EVENT_PAGEFAULT, it means it
|
||||
* was a read fault, otherwise if set it means it's
|
||||
* a write fault.
|
||||
*/
|
||||
msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE;
|
||||
if (reason & VM_UFFD_WP)
|
||||
/*
|
||||
* If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the
|
||||
* uffdio_api.features and UFFD_PAGEFAULT_FLAG_WP was
|
||||
* not set in a UFFD_EVENT_PAGEFAULT, it means it was
|
||||
* a missing fault, otherwise if set it means it's a
|
||||
* write protect fault.
|
||||
*/
|
||||
msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
|
||||
if (reason & VM_UFFD_MINOR)
|
||||
msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_MINOR;
|
||||
if (features & UFFD_FEATURE_THREAD_ID)
|
||||
msg.arg.pagefault.feat.ptid = task_pid_vnr(current);
|
||||
return msg;
|
||||
|
@ -400,8 +398,10 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
|
|||
|
||||
BUG_ON(ctx->mm != mm);
|
||||
|
||||
VM_BUG_ON(reason & ~(VM_UFFD_MISSING|VM_UFFD_WP));
|
||||
VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP));
|
||||
/* Any unrecognized flag is a bug. */
|
||||
VM_BUG_ON(reason & ~__VM_UFFD_FLAGS);
|
||||
/* 0 or > 1 flags set is a bug; we expect exactly 1. */
|
||||
VM_BUG_ON(!reason || (reason & (reason - 1)));
|
||||
|
||||
if (ctx->features & UFFD_FEATURE_SIGBUS)
|
||||
goto out;
|
||||
|
@ -611,7 +611,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
|
|||
for (vma = mm->mmap; vma; vma = vma->vm_next)
|
||||
if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
|
||||
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
|
||||
vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING);
|
||||
vma->vm_flags &= ~__VM_UFFD_FLAGS;
|
||||
}
|
||||
mmap_write_unlock(mm);
|
||||
|
||||
|
@ -643,7 +643,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
|
|||
octx = vma->vm_userfaultfd_ctx.ctx;
|
||||
if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
|
||||
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
|
||||
vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING);
|
||||
vma->vm_flags &= ~__VM_UFFD_FLAGS;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -725,7 +725,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
|
|||
} else {
|
||||
/* Drop uffd context if remap feature not enabled */
|
||||
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
|
||||
vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING);
|
||||
vma->vm_flags &= ~__VM_UFFD_FLAGS;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -866,12 +866,12 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
|
|||
for (vma = mm->mmap; vma; vma = vma->vm_next) {
|
||||
cond_resched();
|
||||
BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
|
||||
!!(vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
|
||||
!!(vma->vm_flags & __VM_UFFD_FLAGS));
|
||||
if (vma->vm_userfaultfd_ctx.ctx != ctx) {
|
||||
prev = vma;
|
||||
continue;
|
||||
}
|
||||
new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
|
||||
new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
|
||||
prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end,
|
||||
new_flags, vma->anon_vma,
|
||||
vma->vm_file, vma->vm_pgoff,
|
||||
|
@ -1261,9 +1261,19 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma,
|
|||
unsigned long vm_flags)
|
||||
{
|
||||
/* FIXME: add WP support to hugetlbfs and shmem */
|
||||
return vma_is_anonymous(vma) ||
|
||||
((is_vm_hugetlb_page(vma) || vma_is_shmem(vma)) &&
|
||||
!(vm_flags & VM_UFFD_WP));
|
||||
if (vm_flags & VM_UFFD_WP) {
|
||||
if (is_vm_hugetlb_page(vma) || vma_is_shmem(vma))
|
||||
return false;
|
||||
}
|
||||
|
||||
if (vm_flags & VM_UFFD_MINOR) {
|
||||
/* FIXME: Add minor fault interception for shmem. */
|
||||
if (!is_vm_hugetlb_page(vma))
|
||||
return false;
|
||||
}
|
||||
|
||||
return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) ||
|
||||
vma_is_shmem(vma);
|
||||
}
|
||||
|
||||
static int userfaultfd_register(struct userfaultfd_ctx *ctx,
|
||||
|
@ -1289,14 +1299,19 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
|
|||
ret = -EINVAL;
|
||||
if (!uffdio_register.mode)
|
||||
goto out;
|
||||
if (uffdio_register.mode & ~(UFFDIO_REGISTER_MODE_MISSING|
|
||||
UFFDIO_REGISTER_MODE_WP))
|
||||
if (uffdio_register.mode & ~UFFD_API_REGISTER_MODES)
|
||||
goto out;
|
||||
vm_flags = 0;
|
||||
if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
|
||||
vm_flags |= VM_UFFD_MISSING;
|
||||
if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP)
|
||||
vm_flags |= VM_UFFD_WP;
|
||||
if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) {
|
||||
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
|
||||
goto out;
|
||||
#endif
|
||||
vm_flags |= VM_UFFD_MINOR;
|
||||
}
|
||||
|
||||
ret = validate_range(mm, &uffdio_register.range.start,
|
||||
uffdio_register.range.len);
|
||||
|
@ -1340,7 +1355,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
|
|||
cond_resched();
|
||||
|
||||
BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
|
||||
!!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
|
||||
!!(cur->vm_flags & __VM_UFFD_FLAGS));
|
||||
|
||||
/* check not compatible vmas */
|
||||
ret = -EINVAL;
|
||||
|
@ -1420,8 +1435,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
|
|||
start = vma->vm_start;
|
||||
vma_end = min(end, vma->vm_end);
|
||||
|
||||
new_flags = (vma->vm_flags &
|
||||
~(VM_UFFD_MISSING|VM_UFFD_WP)) | vm_flags;
|
||||
new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
|
||||
prev = vma_merge(mm, prev, start, vma_end, new_flags,
|
||||
vma->anon_vma, vma->vm_file, vma->vm_pgoff,
|
||||
vma_policy(vma),
|
||||
|
@ -1449,6 +1463,9 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
|
|||
vma->vm_flags = new_flags;
|
||||
vma->vm_userfaultfd_ctx.ctx = ctx;
|
||||
|
||||
if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
|
||||
hugetlb_unshare_all_pmds(vma);
|
||||
|
||||
skip:
|
||||
prev = vma;
|
||||
start = vma->vm_end;
|
||||
|
@ -1470,6 +1487,10 @@ out_unlock:
|
|||
if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_WP))
|
||||
ioctls_out &= ~((__u64)1 << _UFFDIO_WRITEPROTECT);
|
||||
|
||||
/* CONTINUE ioctl is only supported for MINOR ranges. */
|
||||
if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR))
|
||||
ioctls_out &= ~((__u64)1 << _UFFDIO_CONTINUE);
|
||||
|
||||
/*
|
||||
* Now that we scanned all vmas we can already tell
|
||||
* userland which ioctls methods are guaranteed to
|
||||
|
@ -1540,7 +1561,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
|
|||
cond_resched();
|
||||
|
||||
BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
|
||||
!!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
|
||||
!!(cur->vm_flags & __VM_UFFD_FLAGS));
|
||||
|
||||
/*
|
||||
* Check not compatible vmas, not strictly required
|
||||
|
@ -1591,7 +1612,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
|
|||
wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
|
||||
}
|
||||
|
||||
new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
|
||||
new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
|
||||
prev = vma_merge(mm, prev, start, vma_end, new_flags,
|
||||
vma->anon_vma, vma->vm_file, vma->vm_pgoff,
|
||||
vma_policy(vma),
|
||||
|
@ -1823,6 +1844,66 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
|
|||
return ret;
|
||||
}
|
||||
|
||||
static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
|
||||
{
|
||||
__s64 ret;
|
||||
struct uffdio_continue uffdio_continue;
|
||||
struct uffdio_continue __user *user_uffdio_continue;
|
||||
struct userfaultfd_wake_range range;
|
||||
|
||||
user_uffdio_continue = (struct uffdio_continue __user *)arg;
|
||||
|
||||
ret = -EAGAIN;
|
||||
if (READ_ONCE(ctx->mmap_changing))
|
||||
goto out;
|
||||
|
||||
ret = -EFAULT;
|
||||
if (copy_from_user(&uffdio_continue, user_uffdio_continue,
|
||||
/* don't copy the output fields */
|
||||
sizeof(uffdio_continue) - (sizeof(__s64))))
|
||||
goto out;
|
||||
|
||||
ret = validate_range(ctx->mm, &uffdio_continue.range.start,
|
||||
uffdio_continue.range.len);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
ret = -EINVAL;
|
||||
/* double check for wraparound just in case. */
|
||||
if (uffdio_continue.range.start + uffdio_continue.range.len <=
|
||||
uffdio_continue.range.start) {
|
||||
goto out;
|
||||
}
|
||||
if (uffdio_continue.mode & ~UFFDIO_CONTINUE_MODE_DONTWAKE)
|
||||
goto out;
|
||||
|
||||
if (mmget_not_zero(ctx->mm)) {
|
||||
ret = mcopy_continue(ctx->mm, uffdio_continue.range.start,
|
||||
uffdio_continue.range.len,
|
||||
&ctx->mmap_changing);
|
||||
mmput(ctx->mm);
|
||||
} else {
|
||||
return -ESRCH;
|
||||
}
|
||||
|
||||
if (unlikely(put_user(ret, &user_uffdio_continue->mapped)))
|
||||
return -EFAULT;
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
/* len == 0 would wake all */
|
||||
BUG_ON(!ret);
|
||||
range.len = ret;
|
||||
if (!(uffdio_continue.mode & UFFDIO_CONTINUE_MODE_DONTWAKE)) {
|
||||
range.start = uffdio_continue.range.start;
|
||||
wake_userfault(ctx, &range);
|
||||
}
|
||||
ret = range.len == uffdio_continue.range.len ? 0 : -EAGAIN;
|
||||
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline unsigned int uffd_ctx_features(__u64 user_features)
|
||||
{
|
||||
/*
|
||||
|
@ -1859,6 +1940,9 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
|
|||
goto err_out;
|
||||
/* report all available features and ioctls to userland */
|
||||
uffdio_api.features = UFFD_API_FEATURES;
|
||||
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
|
||||
uffdio_api.features &= ~UFFD_FEATURE_MINOR_HUGETLBFS;
|
||||
#endif
|
||||
uffdio_api.ioctls = UFFD_API_IOCTLS;
|
||||
ret = -EFAULT;
|
||||
if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
|
||||
|
@ -1907,6 +1991,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
|
|||
case UFFDIO_WRITEPROTECT:
|
||||
ret = userfaultfd_writeprotect(ctx, arg);
|
||||
break;
|
||||
case UFFDIO_CONTINUE:
|
||||
ret = userfaultfd_continue(ctx, arg);
|
||||
break;
|
||||
}
|
||||
return ret;
|
||||
}
@@ -194,6 +194,8 @@ void __breadahead_gfp(struct block_device *, sector_t block, unsigned int size,
|
|||
struct buffer_head *__bread_gfp(struct block_device *,
|
||||
sector_t block, unsigned size, gfp_t gfp);
|
||||
void invalidate_bh_lrus(void);
|
||||
void invalidate_bh_lrus_cpu(int cpu);
|
||||
bool has_bh_in_lru(int cpu, void *dummy);
|
||||
struct buffer_head *alloc_buffer_head(gfp_t gfp_flags);
|
||||
void free_buffer_head(struct buffer_head * bh);
|
||||
void unlock_buffer(struct buffer_head *bh);
|
||||
|
@ -406,6 +408,8 @@ static inline int inode_has_buffers(struct inode *inode) { return 0; }
|
|||
static inline void invalidate_inode_buffers(struct inode *inode) {}
|
||||
static inline int remove_inode_buffers(struct inode *inode) { return 1; }
|
||||
static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; }
|
||||
static inline void invalidate_bh_lrus_cpu(int cpu) {}
|
||||
static inline bool has_bh_in_lru(int cpu, void *dummy) { return 0; }
|
||||
#define buffer_heads_over_limit 0
|
||||
|
||||
#endif /* CONFIG_BLOCK */
@@ -44,9 +44,9 @@ extern int cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
|
|||
unsigned int order_per_bit,
|
||||
const char *name,
|
||||
struct cma **res_cma);
|
||||
extern struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
|
||||
extern struct page *cma_alloc(struct cma *cma, unsigned long count, unsigned int align,
|
||||
bool no_warn);
|
||||
extern bool cma_release(struct cma *cma, const struct page *pages, unsigned int count);
|
||||
extern bool cma_release(struct cma *cma, const struct page *pages, unsigned long count);
|
||||
|
||||
extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data);
|
||||
#endif
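For reference, a minimal caller sketch against the updated prototypes above (count is now unsigned long rather than size_t / unsigned int). This is not code from the series; "my_cma" is assumed to have been reserved earlier, for example with cma_declare_contiguous():

#include <linux/cma.h>
#include <linux/errno.h>
#include <linux/mm.h>

/* Hedged sketch: allocate and free nr_pages physically contiguous pages
 * from a previously reserved CMA area. align == 0 means no alignment
 * beyond a single page (2^0 pages). */
static int cma_demo(struct cma *my_cma, unsigned long nr_pages)
{
	struct page *page;

	page = cma_alloc(my_cma, nr_pages, 0, false);
	if (!page)
		return -ENOMEM;

	/* ... use the contiguous range starting at "page" ... */

	if (!cma_release(my_cma, page, nr_pages))
		return -EINVAL;	/* pages did not belong to this area */
	return 0;
}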
@@ -81,7 +81,6 @@ static inline unsigned long compact_gap(unsigned int order)
|
|||
}
|
||||
|
||||
#ifdef CONFIG_COMPACTION
|
||||
extern int sysctl_compact_memory;
|
||||
extern unsigned int sysctl_compaction_proactiveness;
|
||||
extern int sysctl_compaction_handler(struct ctl_table *table, int write,
|
||||
void *buffer, size_t *length, loff_t *ppos);
|
||||
|
|
|
@ -442,7 +442,6 @@ int pagecache_write_end(struct file *, struct address_space *mapping,
|
|||
* @i_mmap: Tree of private and shared mappings.
|
||||
* @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable.
|
||||
* @nrpages: Number of page entries, protected by the i_pages lock.
|
||||
* @nrexceptional: Shadow or DAX entries, protected by the i_pages lock.
|
||||
* @writeback_index: Writeback starts here.
|
||||
* @a_ops: Methods.
|
||||
* @flags: Error bits and flags (AS_*).
|
||||
|
@ -463,7 +462,6 @@ struct address_space {
|
|||
struct rb_root_cached i_mmap;
|
||||
struct rw_semaphore i_mmap_rwsem;
|
||||
unsigned long nrpages;
|
||||
unsigned long nrexceptional;
|
||||
pgoff_t writeback_index;
|
||||
const struct address_space_operations *a_ops;
|
||||
unsigned long flags;
|
||||
|
|
|
@ -657,7 +657,7 @@ extern int alloc_contig_range(unsigned long start, unsigned long end,
|
|||
extern struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
|
||||
int nid, nodemask_t *nodemask);
|
||||
#endif
|
||||
void free_contig_range(unsigned long pfn, unsigned int nr_pages);
|
||||
void free_contig_range(unsigned long pfn, unsigned long nr_pages);
|
||||
|
||||
#ifdef CONFIG_CMA
|
||||
/* CMA stuff */
|
||||
|
|
|
@ -332,4 +332,11 @@ static inline void memcpy_to_page(struct page *page, size_t offset,
|
|||
kunmap_local(to);
|
||||
}
|
||||
|
||||
static inline void memzero_page(struct page *page, size_t offset, size_t len)
|
||||
{
|
||||
char *addr = kmap_atomic(page);
|
||||
memset(addr + offset, 0, len);
|
||||
kunmap_atomic(addr);
|
||||
}
|
||||
|
||||
#endif /* _LINUX_HIGHMEM_H */
|
||||
|
|
|
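A hedged usage sketch for the memzero_page() helper added above (lifted out of iov_iter.c so callers can drop the open-coded kmap pattern); zero_fill_tail() is a made-up name, not part of this series:

#include <linux/highmem.h>

/* Zero a page from byte "from" to the end without open-coding
 * kmap_atomic()/memset()/kunmap_atomic(). */
static void zero_fill_tail(struct page *page, size_t from)
{
	if (from < PAGE_SIZE)
		memzero_page(page, from, PAGE_SIZE - from);
}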
@ -87,9 +87,6 @@ enum transparent_hugepage_flag {
|
|||
TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
|
||||
TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG,
|
||||
TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG,
|
||||
#ifdef CONFIG_DEBUG_VM
|
||||
TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG,
|
||||
#endif
|
||||
};
|
||||
|
||||
struct kobject;
|
||||
|
|
|
@ -11,6 +11,7 @@
|
|||
#include <linux/kref.h>
|
||||
#include <linux/pgtable.h>
|
||||
#include <linux/gfp.h>
|
||||
#include <linux/userfaultfd_k.h>
|
||||
|
||||
struct ctl_table;
|
||||
struct user_struct;
|
||||
|
@ -134,11 +135,14 @@ void hugetlb_show_meminfo(void);
|
|||
unsigned long hugetlb_total_pages(void);
|
||||
vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
unsigned long address, unsigned int flags);
|
||||
#ifdef CONFIG_USERFAULTFD
|
||||
int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte,
|
||||
struct vm_area_struct *dst_vma,
|
||||
unsigned long dst_addr,
|
||||
unsigned long src_addr,
|
||||
enum mcopy_atomic_mode mode,
|
||||
struct page **pagep);
|
||||
#endif /* CONFIG_USERFAULTFD */
|
||||
bool hugetlb_reserve_pages(struct inode *inode, long from, long to,
|
||||
struct vm_area_struct *vma,
|
||||
vm_flags_t vm_flags);
|
||||
|
@ -152,7 +156,8 @@ void hugetlb_fix_reserve_counts(struct inode *inode);
|
|||
extern struct mutex *hugetlb_fault_mutex_table;
|
||||
u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx);
|
||||
|
||||
pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud);
|
||||
pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
unsigned long addr, pud_t *pud);
|
||||
|
||||
struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage);
|
||||
|
||||
|
@ -161,7 +166,7 @@ extern struct list_head huge_boot_pages;
|
|||
|
||||
/* arch callbacks */
|
||||
|
||||
pte_t *huge_pte_alloc(struct mm_struct *mm,
|
||||
pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
unsigned long addr, unsigned long sz);
|
||||
pte_t *huge_pte_offset(struct mm_struct *mm,
|
||||
unsigned long addr, unsigned long sz);
|
||||
|
@ -187,6 +192,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
|
|||
unsigned long address, unsigned long end, pgprot_t newprot);
|
||||
|
||||
bool is_hugetlb_entry_migration(pte_t pte);
|
||||
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
|
||||
|
||||
#else /* !CONFIG_HUGETLB_PAGE */
|
||||
|
||||
|
@ -308,16 +314,19 @@ static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
|
|||
BUG();
|
||||
}
|
||||
|
||||
#ifdef CONFIG_USERFAULTFD
|
||||
static inline int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
|
||||
pte_t *dst_pte,
|
||||
struct vm_area_struct *dst_vma,
|
||||
unsigned long dst_addr,
|
||||
unsigned long src_addr,
|
||||
enum mcopy_atomic_mode mode,
|
||||
struct page **pagep)
|
||||
{
|
||||
BUG();
|
||||
return 0;
|
||||
}
|
||||
#endif /* CONFIG_USERFAULTFD */
|
||||
|
||||
static inline pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr,
|
||||
unsigned long sz)
|
||||
|
@ -368,6 +377,8 @@ static inline vm_fault_t hugetlb_fault(struct mm_struct *mm,
|
|||
return 0;
|
||||
}
|
||||
|
||||
static inline void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { }
|
||||
|
||||
#endif /* !CONFIG_HUGETLB_PAGE */
|
||||
/*
|
||||
* hugepages at page global directory. If arch support
|
||||
|
@ -555,6 +566,7 @@ HPAGEFLAG(Freed, freed)
|
|||
#define HSTATE_NAME_LEN 32
|
||||
/* Defines one hugetlb page size */
|
||||
struct hstate {
|
||||
struct mutex resize_lock;
|
||||
int next_nid_to_alloc;
|
||||
int next_nid_to_free;
|
||||
unsigned int order;
|
||||
|
@ -583,6 +595,7 @@ struct huge_bootmem_page {
|
|||
struct hstate *hstate;
|
||||
};
|
||||
|
||||
int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list);
|
||||
struct page *alloc_huge_page(struct vm_area_struct *vma,
|
||||
unsigned long addr, int avoid_reserve);
|
||||
struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
|
||||
|
@ -865,6 +878,12 @@ static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
|
|||
#else /* CONFIG_HUGETLB_PAGE */
|
||||
struct hstate {};
|
||||
|
||||
static inline int isolate_or_dissolve_huge_page(struct page *page,
|
||||
struct list_head *list)
|
||||
{
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
static inline struct page *alloc_huge_page(struct vm_area_struct *vma,
|
||||
unsigned long addr,
|
||||
int avoid_reserve)
|
||||
|
@ -1039,4 +1058,14 @@ static inline __init void hugetlb_cma_check(void)
|
|||
}
|
||||
#endif
|
||||
|
||||
bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr);
|
||||
|
||||
#ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE
|
||||
/*
|
||||
* ARCHes with special requirements for evicting HUGETLB backing TLB entries can
|
||||
* implement this.
|
||||
*/
|
||||
#define flush_hugetlb_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end)
|
||||
#endif
|
||||
|
||||
#endif /* _LINUX_HUGETLB_H */
|
||||
|
|
|
@ -114,12 +114,13 @@ struct batched_lruvec_stat {
|
|||
};
|
||||
|
||||
/*
|
||||
* Bitmap of shrinker::id corresponding to memcg-aware shrinkers,
|
||||
* which have elements charged to this memcg.
|
||||
* Bitmap and deferred work of shrinker::id corresponding to memcg-aware
|
||||
* shrinkers, which have elements charged to this memcg.
|
||||
*/
|
||||
struct memcg_shrinker_map {
|
||||
struct shrinker_info {
|
||||
struct rcu_head rcu;
|
||||
unsigned long map[];
|
||||
atomic_long_t *nr_deferred;
|
||||
unsigned long *map;
|
||||
};
|
||||
|
||||
/*
|
||||
|
@ -145,7 +146,7 @@ struct mem_cgroup_per_node {
|
|||
|
||||
struct mem_cgroup_reclaim_iter iter;
|
||||
|
||||
struct memcg_shrinker_map __rcu *shrinker_map;
|
||||
struct shrinker_info __rcu *shrinker_info;
|
||||
|
||||
struct rb_node tree_node; /* RB tree node */
|
||||
unsigned long usage_in_excess;/* Set to the value by which */
|
||||
|
@ -1610,10 +1611,10 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
|
|||
return false;
|
||||
}
|
||||
|
||||
extern int memcg_expand_shrinker_maps(int new_id);
|
||||
|
||||
extern void memcg_set_shrinker_bit(struct mem_cgroup *memcg,
|
||||
int nid, int shrinker_id);
|
||||
int alloc_shrinker_info(struct mem_cgroup *memcg);
|
||||
void free_shrinker_info(struct mem_cgroup *memcg);
|
||||
void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id);
|
||||
void reparent_shrinker_deferred(struct mem_cgroup *memcg);
|
||||
#else
|
||||
#define mem_cgroup_sockets_enabled 0
|
||||
static inline void mem_cgroup_sk_alloc(struct sock *sk) { };
|
||||
|
@ -1623,8 +1624,8 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
|
|||
return false;
|
||||
}
|
||||
|
||||
static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg,
|
||||
int nid, int shrinker_id)
|
||||
static inline void set_shrinker_bit(struct mem_cgroup *memcg,
|
||||
int nid, int shrinker_id)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
|
|
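A hedged sketch of how the renamed set_shrinker_bit() is intended to be used by a memcg-aware cache (it mirrors what list_lru does internally; "my_cache" and my_cache_note_object() are made-up names, and the shrinker.id field assumes CONFIG_MEMCG and a registered shrinker):

#include <linux/memcontrol.h>
#include <linux/shrinker.h>

struct my_cache {
	struct shrinker shrinker;	/* registered with SHRINKER_MEMCG_AWARE */
};

/* After queueing a reclaimable object for (memcg, nid), mark this shrinker
 * in the per-memcg shrinker_info bitmap so shrink_slab() only visits
 * shrinkers that actually have work for that memcg. */
static void my_cache_note_object(struct my_cache *cache,
				 struct mem_cgroup *memcg, int nid)
{
	if (memcg)
		set_shrinker_bit(memcg, nid, cache->shrinker.id);
}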
@ -29,6 +29,11 @@ struct memory_block {
|
|||
int online_type; /* for passing data to online routine */
|
||||
int nid; /* NID for this memory block */
|
||||
struct device dev;
|
||||
/*
|
||||
* Number of vmemmap pages. These pages
|
||||
* lay at the beginning of the memory block.
|
||||
*/
|
||||
unsigned long nr_vmemmap_pages;
|
||||
};
|
||||
|
||||
int arch_get_memory_phys_device(unsigned long start_pfn);
|
||||
|
@ -80,7 +85,8 @@ static inline int memory_notify(unsigned long val, void *v)
|
|||
#else
|
||||
extern int register_memory_notifier(struct notifier_block *nb);
|
||||
extern void unregister_memory_notifier(struct notifier_block *nb);
|
||||
int create_memory_block_devices(unsigned long start, unsigned long size);
|
||||
int create_memory_block_devices(unsigned long start, unsigned long size,
|
||||
unsigned long vmemmap_pages);
|
||||
void remove_memory_block_devices(unsigned long start, unsigned long size);
|
||||
extern void memory_dev_init(void);
|
||||
extern int memory_notify(unsigned long val, void *v);
|
||||
|
|
|
@ -55,6 +55,14 @@ typedef int __bitwise mhp_t;
|
|||
*/
|
||||
#define MHP_MERGE_RESOURCE ((__force mhp_t)BIT(0))
|
||||
|
||||
/*
|
||||
* We want memmap (struct page array) to be self contained.
|
||||
* To do so, we will use the beginning of the hot-added range to build
|
||||
* the page tables for the memmap array that describes the entire range.
|
||||
* Only selected architectures support it with SPARSE_VMEMMAP.
|
||||
*/
|
||||
#define MHP_MEMMAP_ON_MEMORY ((__force mhp_t)BIT(1))
|
||||
|
||||
/*
|
||||
* Extended parameters for memory hotplug:
|
||||
* altmap: alternative allocator for memmap array (optional)
|
||||
|
@ -99,9 +107,13 @@ static inline void zone_seqlock_init(struct zone *zone)
|
|||
extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages);
|
||||
extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages);
|
||||
extern int add_one_highpage(struct page *page, int pfn, int bad_ppro);
|
||||
extern void adjust_present_page_count(struct zone *zone, long nr_pages);
|
||||
/* VM interface that may be used by firmware interface */
|
||||
extern int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
|
||||
struct zone *zone);
|
||||
extern void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages);
|
||||
extern int online_pages(unsigned long pfn, unsigned long nr_pages,
|
||||
int online_type, int nid);
|
||||
struct zone *zone);
|
||||
extern struct zone *test_pages_in_a_zone(unsigned long start_pfn,
|
||||
unsigned long end_pfn);
|
||||
extern void __offline_isolated_pages(unsigned long start_pfn,
|
||||
|
@ -359,6 +371,7 @@ extern struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_
|
|||
extern int arch_create_linear_mapping(int nid, u64 start, u64 size,
|
||||
struct mhp_params *params);
|
||||
void arch_remove_linear_mapping(u64 start, u64 size);
|
||||
extern bool mhp_supports_memmap_on_memory(unsigned long size);
|
||||
#endif /* CONFIG_MEMORY_HOTPLUG */
|
||||
|
||||
#endif /* __LINUX_MEMORY_HOTPLUG_H */
|
||||
|
|
|
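A hedged sketch, modelled on (but not copied from) the acpi_memhotplug change in this series, of how a hotplug driver opts in to the self-hosted memmap using the flag and helper declared above:

#include <linux/memory_hotplug.h>

/* Place the range's struct pages inside the hot-added range itself when
 * the block-size/alignment rules checked by mhp_supports_memmap_on_memory()
 * allow it; otherwise fall back to a plain hot-add. */
static int hotplug_range(int nid, u64 start, u64 size)
{
	mhp_t mhp_flags = MHP_NONE;

	if (mhp_supports_memmap_on_memory(size))
		mhp_flags |= MHP_MEMMAP_ON_MEMORY;

	return __add_memory(nid, start, size, mhp_flags);
}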
@ -17,7 +17,7 @@ struct device;
|
|||
* @alloc: track pages consumed, private to vmemmap_populate()
|
||||
*/
|
||||
struct vmem_altmap {
|
||||
const unsigned long base_pfn;
|
||||
unsigned long base_pfn;
|
||||
const unsigned long end_pfn;
|
||||
const unsigned long reserve;
|
||||
unsigned long free;
|
||||
|
|
|
@ -27,6 +27,7 @@ enum migrate_reason {
|
|||
MR_MEMPOLICY_MBIND,
|
||||
MR_NUMA_MISPLACED,
|
||||
MR_CONTIG_RANGE,
|
||||
MR_LONGTERM_PIN,
|
||||
MR_TYPES
|
||||
};
|
||||
|
||||
|
@ -43,10 +44,7 @@ extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free,
|
|||
unsigned long private, enum migrate_mode mode, int reason);
|
||||
extern struct page *alloc_migration_target(struct page *page, unsigned long private);
|
||||
extern int isolate_movable_page(struct page *page, isolate_mode_t mode);
|
||||
extern void putback_movable_page(struct page *page);
|
||||
|
||||
extern void migrate_prep(void);
|
||||
extern void migrate_prep_local(void);
|
||||
extern void migrate_page_states(struct page *newpage, struct page *page);
|
||||
extern void migrate_page_copy(struct page *newpage, struct page *page);
|
||||
extern int migrate_huge_page_move_mapping(struct address_space *mapping,
|
||||
|
@ -66,9 +64,6 @@ static inline struct page *alloc_migration_target(struct page *page,
|
|||
static inline int isolate_movable_page(struct page *page, isolate_mode_t mode)
|
||||
{ return -EBUSY; }
|
||||
|
||||
static inline int migrate_prep(void) { return -ENOSYS; }
|
||||
static inline int migrate_prep_local(void) { return -ENOSYS; }
|
||||
|
||||
static inline void migrate_page_states(struct page *newpage, struct page *page)
|
||||
{
|
||||
}
|
||||
|
|
|
@ -372,6 +372,13 @@ extern unsigned int kobjsize(const void *objp);
|
|||
# define VM_GROWSUP VM_NONE
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
|
||||
# define VM_UFFD_MINOR_BIT 37
|
||||
# define VM_UFFD_MINOR BIT(VM_UFFD_MINOR_BIT) /* UFFD minor faults */
|
||||
#else /* !CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
|
||||
# define VM_UFFD_MINOR VM_NONE
|
||||
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
|
||||
|
||||
/* Bits set in the VMA until the stack is in its final location */
|
||||
#define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ)
|
||||
|
||||
|
@ -1134,6 +1141,11 @@ static inline bool is_zone_device_page(const struct page *page)
|
|||
}
|
||||
#endif
|
||||
|
||||
static inline bool is_zone_movable_page(const struct page *page)
|
||||
{
|
||||
return page_zonenum(page) == ZONE_MOVABLE;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_DEV_PAGEMAP_OPS
|
||||
void free_devmap_managed_page(struct page *page);
|
||||
DECLARE_STATIC_KEY_FALSE(devmap_managed_key);
|
||||
|
@ -1543,6 +1555,20 @@ static inline unsigned long page_to_section(const struct page *page)
|
|||
}
|
||||
#endif
|
||||
|
||||
/* MIGRATE_CMA and ZONE_MOVABLE do not allow pin pages */
|
||||
#ifdef CONFIG_MIGRATION
|
||||
static inline bool is_pinnable_page(struct page *page)
|
||||
{
|
||||
return !(is_zone_movable_page(page) || is_migrate_cma_page(page)) ||
|
||||
is_zero_pfn(page_to_pfn(page));
|
||||
}
|
||||
#else
|
||||
static inline bool is_pinnable_page(struct page *page)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline void set_page_zone(struct page *page, enum zone_type zone)
|
||||
{
|
||||
page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT);
|
||||
|
|
|
@ -407,8 +407,13 @@ enum zone_type {
|
|||
* to increase the number of THP/huge pages. Notable special cases are:
|
||||
*
|
||||
* 1. Pinned pages: (long-term) pinning of movable pages might
|
||||
* essentially turn such pages unmovable. Memory offlining might
|
||||
* retry a long time.
|
||||
* essentially turn such pages unmovable. Therefore, we do not allow
|
||||
* pinning long-term pages in ZONE_MOVABLE. When pages are pinned and
|
||||
* faulted, they come from the right zone right away. However, it is
|
||||
* still possible that address space already has pages in
|
||||
* ZONE_MOVABLE at the time when pages are pinned (i.e. user has
|
||||
* touches that memory before pinning). In such case we migrate them
|
||||
* to a different zone. When migration fails - pinning fails.
|
||||
* 2. memblock allocations: kernelcore/movablecore setups might create
|
||||
* situations where ZONE_MOVABLE contains unmovable allocations
|
||||
* after boot. Memory offlining and allocations fail early.
|
||||
|
@ -427,6 +432,15 @@ enum zone_type {
|
|||
* techniques might use alloc_contig_range() to hide previously
|
||||
* exposed pages from the buddy again (e.g., to implement some sort
|
||||
* of memory unplug in virtio-mem).
|
||||
* 6. ZERO_PAGE(0), kernelcore/movablecore setups might create
|
||||
* situations where ZERO_PAGE(0) which is allocated differently
|
||||
* on different platforms may end up in a movable zone. ZERO_PAGE(0)
|
||||
* cannot be migrated.
|
||||
* 7. Memory-hotplug: when using memmap_on_memory and onlining the
|
||||
* memory to the MOVABLE zone, the vmemmap pages are also placed in
|
||||
* such zone. Such pages cannot be really moved around as they are
|
||||
* self-stored in the range, but they are treated as movable when
|
||||
* the range they describe is about to be offlined.
|
||||
*
|
||||
* In general, no unmovable allocations that degrade memory offlining
|
||||
* should end up in ZONE_MOVABLE. Allocators (like alloc_contig_range())
|
||||
|
@ -1383,10 +1397,8 @@ static inline int online_section_nr(unsigned long nr)
|
|||
|
||||
#ifdef CONFIG_MEMORY_HOTPLUG
|
||||
void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn);
|
||||
#ifdef CONFIG_MEMORY_HOTREMOVE
|
||||
void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
static inline struct mem_section *__pfn_to_section(unsigned long pfn)
|
||||
{
|
||||
|
|
|
@ -18,6 +18,11 @@
|
|||
|
||||
struct pagevec;
|
||||
|
||||
static inline bool mapping_empty(struct address_space *mapping)
|
||||
{
|
||||
return xa_empty(&mapping->i_pages);
|
||||
}
|
||||
|
||||
/*
|
||||
* Bits in mapping->flags.
|
||||
*/
|
||||
|
|
|
@ -1111,6 +1111,7 @@ extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
|
|||
extern void untrack_pfn_moved(struct vm_area_struct *vma);
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_MMU
|
||||
#ifdef __HAVE_COLOR_ZERO_PAGE
|
||||
static inline int is_zero_pfn(unsigned long pfn)
|
||||
{
|
||||
|
@ -1134,6 +1135,17 @@ static inline unsigned long my_zero_pfn(unsigned long addr)
|
|||
return zero_pfn;
|
||||
}
|
||||
#endif
|
||||
#else
|
||||
static inline int is_zero_pfn(unsigned long pfn)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline unsigned long my_zero_pfn(unsigned long addr)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
#endif /* CONFIG_MMU */
|
||||
|
||||
#ifdef CONFIG_MMU
|
||||
|
||||
|
|
|
@ -1583,7 +1583,7 @@ extern struct pid *cad_pid;
|
|||
#define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
|
||||
#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */
|
||||
#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
|
||||
#define PF_MEMALLOC_NOCMA 0x10000000 /* All allocation request will have _GFP_MOVABLE cleared */
|
||||
#define PF_MEMALLOC_PIN 0x10000000 /* Allocation context constrained to zones which allow long term pinning. */
|
||||
#define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */
|
||||
#define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */
|
||||
|
||||
|
|
|
@ -151,12 +151,13 @@ static inline bool in_vfork(struct task_struct *tsk)
|
|||
* Applies per-task gfp context to the given allocation flags.
|
||||
* PF_MEMALLOC_NOIO implies GFP_NOIO
|
||||
* PF_MEMALLOC_NOFS implies GFP_NOFS
|
||||
* PF_MEMALLOC_PIN implies !GFP_MOVABLE
|
||||
*/
|
||||
static inline gfp_t current_gfp_context(gfp_t flags)
|
||||
{
|
||||
unsigned int pflags = READ_ONCE(current->flags);
|
||||
|
||||
if (unlikely(pflags & (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS))) {
|
||||
if (unlikely(pflags & (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS | PF_MEMALLOC_PIN))) {
|
||||
/*
|
||||
* NOIO implies both NOIO and NOFS and it is a weaker context
|
||||
* so always make sure it makes precedence
|
||||
|
@ -165,6 +166,9 @@ static inline gfp_t current_gfp_context(gfp_t flags)
|
|||
flags &= ~(__GFP_IO | __GFP_FS);
|
||||
else if (pflags & PF_MEMALLOC_NOFS)
|
||||
flags &= ~__GFP_FS;
|
||||
|
||||
if (pflags & PF_MEMALLOC_PIN)
|
||||
flags &= ~__GFP_MOVABLE;
|
||||
}
|
||||
return flags;
|
||||
}
|
||||
|
@ -271,29 +275,18 @@ static inline void memalloc_noreclaim_restore(unsigned int flags)
|
|||
current->flags = (current->flags & ~PF_MEMALLOC) | flags;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_CMA
|
||||
static inline unsigned int memalloc_nocma_save(void)
|
||||
static inline unsigned int memalloc_pin_save(void)
|
||||
{
|
||||
unsigned int flags = current->flags & PF_MEMALLOC_NOCMA;
|
||||
unsigned int flags = current->flags & PF_MEMALLOC_PIN;
|
||||
|
||||
current->flags |= PF_MEMALLOC_NOCMA;
|
||||
current->flags |= PF_MEMALLOC_PIN;
|
||||
return flags;
|
||||
}
|
||||
|
||||
static inline void memalloc_nocma_restore(unsigned int flags)
|
||||
static inline void memalloc_pin_restore(unsigned int flags)
|
||||
{
|
||||
current->flags = (current->flags & ~PF_MEMALLOC_NOCMA) | flags;
|
||||
current->flags = (current->flags & ~PF_MEMALLOC_PIN) | flags;
|
||||
}
|
||||
#else
|
||||
static inline unsigned int memalloc_nocma_save(void)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline void memalloc_nocma_restore(unsigned int flags)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_MEMCG
|
||||
DECLARE_PER_CPU(struct mem_cgroup *, int_active_memcg);
|
||||
|
|
|
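A hedged sketch of the bracketing pattern the renamed helpers above exist for; it is roughly what the FOLL_LONGTERM path in gup.c does internally and is not code from this series:

#include <linux/mm.h>
#include <linux/sched/mm.h>

/* While the pin scope is active, current_gfp_context() strips
 * __GFP_MOVABLE from nested allocations, so pages faulted in here land in
 * zones that permit long-term pinning. */
static int pin_range_longterm(unsigned long start, int nr_pages,
			      struct page **pages)
{
	unsigned int flags = memalloc_pin_save();
	int ret;

	ret = pin_user_pages_fast(start, nr_pages,
				  FOLL_WRITE | FOLL_LONGTERM, pages);

	memalloc_pin_restore(flags);
	return ret;
}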
@ -79,13 +79,14 @@ struct shrinker {
|
|||
#define DEFAULT_SEEKS 2 /* A good number if you don't know better. */
|
||||
|
||||
/* Flags */
|
||||
#define SHRINKER_NUMA_AWARE (1 << 0)
|
||||
#define SHRINKER_MEMCG_AWARE (1 << 1)
|
||||
#define SHRINKER_REGISTERED (1 << 0)
|
||||
#define SHRINKER_NUMA_AWARE (1 << 1)
|
||||
#define SHRINKER_MEMCG_AWARE (1 << 2)
|
||||
/*
|
||||
* It just makes sense when the shrinker is also MEMCG_AWARE for now,
|
||||
* non-MEMCG_AWARE shrinker should not have this flag set.
|
||||
*/
|
||||
#define SHRINKER_NONSLAB (1 << 2)
|
||||
#define SHRINKER_NONSLAB (1 << 3)
|
||||
|
||||
extern int prealloc_shrinker(struct shrinker *shrinker);
|
||||
extern void register_shrinker_prepared(struct shrinker *shrinker);
|
||||
|
|
|
@ -12,6 +12,7 @@
|
|||
#include <linux/fs.h>
|
||||
#include <linux/atomic.h>
|
||||
#include <linux/page-flags.h>
|
||||
#include <uapi/linux/mempolicy.h>
|
||||
#include <asm/page.h>
|
||||
|
||||
struct notifier_block;
|
||||
|
@ -339,6 +340,20 @@ extern void lru_note_cost(struct lruvec *lruvec, bool file,
|
|||
extern void lru_note_cost_page(struct page *);
|
||||
extern void lru_cache_add(struct page *);
|
||||
extern void mark_page_accessed(struct page *);
|
||||
|
||||
extern atomic_t lru_disable_count;
|
||||
|
||||
static inline bool lru_cache_disabled(void)
|
||||
{
|
||||
return atomic_read(&lru_disable_count);
|
||||
}
|
||||
|
||||
static inline void lru_cache_enable(void)
|
||||
{
|
||||
atomic_dec(&lru_disable_count);
|
||||
}
|
||||
|
||||
extern void lru_cache_disable(void);
|
||||
extern void lru_add_drain(void);
|
||||
extern void lru_add_drain_cpu(int cpu);
|
||||
extern void lru_add_drain_cpu_zone(struct zone *zone);
|
||||
|
@ -378,6 +393,12 @@ extern int sysctl_min_slab_ratio;
|
|||
#define node_reclaim_mode 0
|
||||
#endif
|
||||
|
||||
static inline bool node_reclaim_enabled(void)
|
||||
{
|
||||
/* Is any node_reclaim_mode bit set? */
|
||||
return node_reclaim_mode & (RECLAIM_ZONE|RECLAIM_WRITE|RECLAIM_UNMAP);
|
||||
}
|
||||
|
||||
extern void check_move_unevictable_pages(struct pagevec *pvec);
|
||||
|
||||
extern int kswapd_run(int nid);
|
||||
|
|
|
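A hedged sketch of the pairing the new lru_cache_disable()/lru_cache_enable() interface above is designed for; the callback and its name are placeholders, not kernel API:

#include <linux/swap.h>

/* Keep the per-CPU LRU pagevecs drained for the whole isolation pass so
 * pages sitting in a pagevec cannot evade isolation. */
static void with_lru_caches_disabled(void (*isolate_fn)(void *arg), void *arg)
{
	lru_cache_disable();
	isolate_fn(arg);	/* e.g. walk a PFN range and isolate each page */
	lru_cache_enable();
}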
@ -17,6 +17,9 @@
|
|||
#include <linux/mm.h>
|
||||
#include <asm-generic/pgtable_uffd.h>
|
||||
|
||||
/* The set of all possible UFFD-related VM flags. */
|
||||
#define __VM_UFFD_FLAGS (VM_UFFD_MISSING | VM_UFFD_WP | VM_UFFD_MINOR)
|
||||
|
||||
/*
|
||||
* CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining
|
||||
* new flags, since they might collide with O_* ones. We want
|
||||
|
@ -34,6 +37,22 @@ extern int sysctl_unprivileged_userfaultfd;
|
|||
|
||||
extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason);
|
||||
|
||||
/*
|
||||
* The mode of operation for __mcopy_atomic and its helpers.
|
||||
*
|
||||
* This is almost an implementation detail (mcopy_atomic below doesn't take this
|
||||
* as a parameter), but it's exposed here because memory-kind-specific
|
||||
* implementations (e.g. hugetlbfs) need to know the mode of operation.
|
||||
*/
|
||||
enum mcopy_atomic_mode {
|
||||
/* A normal copy_from_user into the destination range. */
|
||||
MCOPY_ATOMIC_NORMAL,
|
||||
/* Don't copy; map the destination range to the zero page. */
|
||||
MCOPY_ATOMIC_ZEROPAGE,
|
||||
/* Just install pte(s) with the existing page(s) in the page cache. */
|
||||
MCOPY_ATOMIC_CONTINUE,
|
||||
};
|
||||
|
||||
extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
|
||||
unsigned long src_start, unsigned long len,
|
||||
bool *mmap_changing, __u64 mode);
|
||||
|
@ -41,6 +60,8 @@ extern ssize_t mfill_zeropage(struct mm_struct *dst_mm,
|
|||
unsigned long dst_start,
|
||||
unsigned long len,
|
||||
bool *mmap_changing);
|
||||
extern ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long dst_start,
|
||||
unsigned long len, bool *mmap_changing);
|
||||
extern int mwriteprotect_range(struct mm_struct *dst_mm,
|
||||
unsigned long start, unsigned long len,
|
||||
bool enable_wp, bool *mmap_changing);
|
||||
|
@ -52,6 +73,22 @@ static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
|
|||
return vma->vm_userfaultfd_ctx.ctx == vm_ctx.ctx;
|
||||
}
|
||||
|
||||
/*
|
||||
* Never enable huge pmd sharing on some uffd registered vmas:
|
||||
*
|
||||
* - VM_UFFD_WP VMAs, because write protect information is per pgtable entry.
|
||||
*
|
||||
* - VM_UFFD_MINOR VMAs, because otherwise we would never get minor faults for
|
||||
* VMAs which share huge pmds. (If you have two mappings to the same
|
||||
* underlying pages, and fault in the non-UFFD-registered one with a write,
|
||||
* with huge pmd sharing this would *also* setup the second UFFD-registered
|
||||
* mapping, and we'd not get minor faults.)
|
||||
*/
|
||||
static inline bool uffd_disable_huge_pmd_share(struct vm_area_struct *vma)
|
||||
{
|
||||
return vma->vm_flags & (VM_UFFD_WP | VM_UFFD_MINOR);
|
||||
}
|
||||
|
||||
static inline bool userfaultfd_missing(struct vm_area_struct *vma)
|
||||
{
|
||||
return vma->vm_flags & VM_UFFD_MISSING;
|
||||
|
@ -62,6 +99,11 @@ static inline bool userfaultfd_wp(struct vm_area_struct *vma)
|
|||
return vma->vm_flags & VM_UFFD_WP;
|
||||
}
|
||||
|
||||
static inline bool userfaultfd_minor(struct vm_area_struct *vma)
|
||||
{
|
||||
return vma->vm_flags & VM_UFFD_MINOR;
|
||||
}
|
||||
|
||||
static inline bool userfaultfd_pte_wp(struct vm_area_struct *vma,
|
||||
pte_t pte)
|
||||
{
|
||||
|
@ -76,7 +118,7 @@ static inline bool userfaultfd_huge_pmd_wp(struct vm_area_struct *vma,
|
|||
|
||||
static inline bool userfaultfd_armed(struct vm_area_struct *vma)
|
||||
{
|
||||
return vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP);
|
||||
return vma->vm_flags & __VM_UFFD_FLAGS;
|
||||
}
|
||||
|
||||
extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *);
|
||||
|
@ -123,6 +165,11 @@ static inline bool userfaultfd_wp(struct vm_area_struct *vma)
|
|||
return false;
|
||||
}
|
||||
|
||||
static inline bool userfaultfd_minor(struct vm_area_struct *vma)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline bool userfaultfd_pte_wp(struct vm_area_struct *vma,
|
||||
pte_t pte)
|
||||
{
|
||||
|
|
|
@ -70,6 +70,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
|
|||
#endif
|
||||
#ifdef CONFIG_HUGETLB_PAGE
|
||||
HTLB_BUDDY_PGALLOC, HTLB_BUDDY_PGALLOC_FAIL,
|
||||
#endif
|
||||
#ifdef CONFIG_CMA
|
||||
CMA_ALLOC_SUCCESS,
|
||||
CMA_ALLOC_FAIL,
|
||||
#endif
|
||||
UNEVICTABLE_PGCULLED, /* culled to noreclaim list */
|
||||
UNEVICTABLE_PGSCANNED, /* scanned for reclaimability */
|
||||
|
@ -120,6 +124,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
|
|||
#ifdef CONFIG_SWAP
|
||||
SWAP_RA,
|
||||
SWAP_RA_HIT,
|
||||
#endif
|
||||
#ifdef CONFIG_X86
|
||||
DIRECT_MAP_LEVEL2_SPLIT,
|
||||
DIRECT_MAP_LEVEL3_SPLIT,
|
||||
#endif
|
||||
NR_VM_EVENT_ITEMS
|
||||
};
|
||||
|
|
|
@ -8,28 +8,31 @@
|
|||
#include <linux/types.h>
|
||||
#include <linux/tracepoint.h>
|
||||
|
||||
TRACE_EVENT(cma_alloc,
|
||||
DECLARE_EVENT_CLASS(cma_alloc_class,
|
||||
|
||||
TP_PROTO(unsigned long pfn, const struct page *page,
|
||||
unsigned int count, unsigned int align),
|
||||
TP_PROTO(const char *name, unsigned long pfn, const struct page *page,
|
||||
unsigned long count, unsigned int align),
|
||||
|
||||
TP_ARGS(pfn, page, count, align),
|
||||
TP_ARGS(name, pfn, page, count, align),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__string(name, name)
|
||||
__field(unsigned long, pfn)
|
||||
__field(const struct page *, page)
|
||||
__field(unsigned int, count)
|
||||
__field(unsigned long, count)
|
||||
__field(unsigned int, align)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__assign_str(name, name);
|
||||
__entry->pfn = pfn;
|
||||
__entry->page = page;
|
||||
__entry->count = count;
|
||||
__entry->align = align;
|
||||
),
|
||||
|
||||
TP_printk("pfn=%lx page=%p count=%u align=%u",
|
||||
TP_printk("name=%s pfn=%lx page=%p count=%lu align=%u",
|
||||
__get_str(name),
|
||||
__entry->pfn,
|
||||
__entry->page,
|
||||
__entry->count,
|
||||
|
@ -38,29 +41,72 @@ TRACE_EVENT(cma_alloc,
|
|||
|
||||
TRACE_EVENT(cma_release,
|
||||
|
||||
TP_PROTO(unsigned long pfn, const struct page *page,
|
||||
unsigned int count),
|
||||
TP_PROTO(const char *name, unsigned long pfn, const struct page *page,
|
||||
unsigned long count),
|
||||
|
||||
TP_ARGS(pfn, page, count),
|
||||
TP_ARGS(name, pfn, page, count),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__string(name, name)
|
||||
__field(unsigned long, pfn)
|
||||
__field(const struct page *, page)
|
||||
__field(unsigned int, count)
|
||||
__field(unsigned long, count)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__assign_str(name, name);
|
||||
__entry->pfn = pfn;
|
||||
__entry->page = page;
|
||||
__entry->count = count;
|
||||
),
|
||||
|
||||
TP_printk("pfn=%lx page=%p count=%u",
|
||||
TP_printk("name=%s pfn=%lx page=%p count=%lu",
|
||||
__get_str(name),
|
||||
__entry->pfn,
|
||||
__entry->page,
|
||||
__entry->count)
|
||||
);
|
||||
|
||||
TRACE_EVENT(cma_alloc_start,
|
||||
|
||||
TP_PROTO(const char *name, unsigned long count, unsigned int align),
|
||||
|
||||
TP_ARGS(name, count, align),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__string(name, name)
|
||||
__field(unsigned long, count)
|
||||
__field(unsigned int, align)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__assign_str(name, name);
|
||||
__entry->count = count;
|
||||
__entry->align = align;
|
||||
),
|
||||
|
||||
TP_printk("name=%s count=%lu align=%u",
|
||||
__get_str(name),
|
||||
__entry->count,
|
||||
__entry->align)
|
||||
);
|
||||
|
||||
DEFINE_EVENT(cma_alloc_class, cma_alloc_finish,
|
||||
|
||||
TP_PROTO(const char *name, unsigned long pfn, const struct page *page,
|
||||
unsigned long count, unsigned int align),
|
||||
|
||||
TP_ARGS(name, pfn, page, count, align)
|
||||
);
|
||||
|
||||
DEFINE_EVENT(cma_alloc_class, cma_alloc_busy_retry,
|
||||
|
||||
TP_PROTO(const char *name, unsigned long pfn, const struct page *page,
|
||||
unsigned long count, unsigned int align),
|
||||
|
||||
TP_ARGS(name, pfn, page, count, align)
|
||||
);
|
||||
|
||||
#endif /* _TRACE_CMA_H */
|
||||
|
||||
/* This part must be outside protection */
|
||||
|
|
|
@ -20,7 +20,8 @@
|
|||
EM( MR_SYSCALL, "syscall_or_cpuset") \
|
||||
EM( MR_MEMPOLICY_MBIND, "mempolicy_mbind") \
|
||||
EM( MR_NUMA_MISPLACED, "numa_misplaced") \
|
||||
EMe(MR_CONTIG_RANGE, "contig_range")
|
||||
EM( MR_CONTIG_RANGE, "contig_range") \
|
||||
EMe(MR_LONGTERM_PIN, "longterm_pin")
|
||||
|
||||
/*
|
||||
* First define the enums in the above macros to be exported to userspace
|
||||
|
@ -81,6 +82,28 @@ TRACE_EVENT(mm_migrate_pages,
|
|||
__print_symbolic(__entry->mode, MIGRATE_MODE),
|
||||
__print_symbolic(__entry->reason, MIGRATE_REASON))
|
||||
);
|
||||
|
||||
TRACE_EVENT(mm_migrate_pages_start,
|
||||
|
||||
TP_PROTO(enum migrate_mode mode, int reason),
|
||||
|
||||
TP_ARGS(mode, reason),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__field(enum migrate_mode, mode)
|
||||
__field(int, reason)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->mode = mode;
|
||||
__entry->reason = reason;
|
||||
),
|
||||
|
||||
TP_printk("mode=%s reason=%s",
|
||||
__print_symbolic(__entry->mode, MIGRATE_MODE),
|
||||
__print_symbolic(__entry->reason, MIGRATE_REASON))
|
||||
);
|
||||
|
||||
#endif /* _TRACE_MIGRATE_H */
|
||||
|
||||
/* This part must be outside protection */
|
||||
|
|
|
@ -137,6 +137,12 @@ IF_HAVE_PG_ARCH_2(PG_arch_2, "arch_2" )
|
|||
#define IF_HAVE_VM_SOFTDIRTY(flag,name)
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
|
||||
# define IF_HAVE_UFFD_MINOR(flag, name) {flag, name},
|
||||
#else
|
||||
# define IF_HAVE_UFFD_MINOR(flag, name)
|
||||
#endif
|
||||
|
||||
#define __def_vmaflag_names \
|
||||
{VM_READ, "read" }, \
|
||||
{VM_WRITE, "write" }, \
|
||||
|
@ -148,6 +154,7 @@ IF_HAVE_PG_ARCH_2(PG_arch_2, "arch_2" )
|
|||
{VM_MAYSHARE, "mayshare" }, \
|
||||
{VM_GROWSDOWN, "growsdown" }, \
|
||||
{VM_UFFD_MISSING, "uffd_missing" }, \
|
||||
IF_HAVE_UFFD_MINOR(VM_UFFD_MINOR, "uffd_minor" ) \
|
||||
{VM_PFNMAP, "pfnmap" }, \
|
||||
{VM_DENYWRITE, "denywrite" }, \
|
||||
{VM_UFFD_WP, "uffd_wp" }, \
|
||||
|
|
|
@ -64,5 +64,12 @@ enum {
|
|||
#define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */
|
||||
#define MPOL_F_MORON (1 << 4) /* Migrate On protnone Reference On Node */
|
||||
|
||||
/*
|
||||
* These bit locations are exposed in the vm.zone_reclaim_mode sysctl
|
||||
* ABI. New bits are OK, but existing bits can never change.
|
||||
*/
|
||||
#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */
|
||||
#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */
|
||||
#define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */
|
||||
|
||||
#endif /* _UAPI_LINUX_MEMPOLICY_H */
|
||||
|
|
|
@ -19,15 +19,19 @@
|
|||
* means the userland is reading).
|
||||
*/
|
||||
#define UFFD_API ((__u64)0xAA)
|
||||
#define UFFD_API_REGISTER_MODES (UFFDIO_REGISTER_MODE_MISSING | \
|
||||
UFFDIO_REGISTER_MODE_WP | \
|
||||
UFFDIO_REGISTER_MODE_MINOR)
|
||||
#define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \
|
||||
UFFD_FEATURE_EVENT_FORK | \
|
||||
UFFD_FEATURE_EVENT_REMAP | \
|
||||
UFFD_FEATURE_EVENT_REMOVE | \
|
||||
UFFD_FEATURE_EVENT_REMOVE | \
|
||||
UFFD_FEATURE_EVENT_UNMAP | \
|
||||
UFFD_FEATURE_MISSING_HUGETLBFS | \
|
||||
UFFD_FEATURE_MISSING_SHMEM | \
|
||||
UFFD_FEATURE_SIGBUS | \
|
||||
UFFD_FEATURE_THREAD_ID)
|
||||
UFFD_FEATURE_THREAD_ID | \
|
||||
UFFD_FEATURE_MINOR_HUGETLBFS)
|
||||
#define UFFD_API_IOCTLS \
|
||||
((__u64)1 << _UFFDIO_REGISTER | \
|
||||
(__u64)1 << _UFFDIO_UNREGISTER | \
|
||||
|
@ -36,10 +40,12 @@
|
|||
((__u64)1 << _UFFDIO_WAKE | \
|
||||
(__u64)1 << _UFFDIO_COPY | \
|
||||
(__u64)1 << _UFFDIO_ZEROPAGE | \
|
||||
(__u64)1 << _UFFDIO_WRITEPROTECT)
|
||||
(__u64)1 << _UFFDIO_WRITEPROTECT | \
|
||||
(__u64)1 << _UFFDIO_CONTINUE)
|
||||
#define UFFD_API_RANGE_IOCTLS_BASIC \
|
||||
((__u64)1 << _UFFDIO_WAKE | \
|
||||
(__u64)1 << _UFFDIO_COPY)
|
||||
(__u64)1 << _UFFDIO_COPY | \
|
||||
(__u64)1 << _UFFDIO_CONTINUE)
|
||||
|
||||
/*
|
||||
* Valid ioctl command number range with this API is from 0x00 to
|
||||
|
@ -55,6 +61,7 @@
|
|||
#define _UFFDIO_COPY (0x03)
|
||||
#define _UFFDIO_ZEROPAGE (0x04)
|
||||
#define _UFFDIO_WRITEPROTECT (0x06)
|
||||
#define _UFFDIO_CONTINUE (0x07)
|
||||
#define _UFFDIO_API (0x3F)
|
||||
|
||||
/* userfaultfd ioctl ids */
|
||||
|
@ -73,6 +80,8 @@
|
|||
struct uffdio_zeropage)
|
||||
#define UFFDIO_WRITEPROTECT _IOWR(UFFDIO, _UFFDIO_WRITEPROTECT, \
|
||||
struct uffdio_writeprotect)
|
||||
#define UFFDIO_CONTINUE _IOR(UFFDIO, _UFFDIO_CONTINUE, \
|
||||
struct uffdio_continue)
|
||||
|
||||
/* read() structure */
|
||||
struct uffd_msg {
|
||||
|
@ -127,6 +136,7 @@ struct uffd_msg {
|
|||
/* flags for UFFD_EVENT_PAGEFAULT */
|
||||
#define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */
|
||||
#define UFFD_PAGEFAULT_FLAG_WP (1<<1) /* If reason is VM_UFFD_WP */
|
||||
#define UFFD_PAGEFAULT_FLAG_MINOR (1<<2) /* If reason is VM_UFFD_MINOR */
|
||||
|
||||
struct uffdio_api {
|
||||
/* userland asks for an API number and the features to enable */
|
||||
|
@ -171,6 +181,10 @@ struct uffdio_api {
|
|||
*
|
||||
* UFFD_FEATURE_THREAD_ID pid of the page faulted task_struct will
|
||||
* be returned, if feature is not requested 0 will be returned.
|
||||
*
|
||||
* UFFD_FEATURE_MINOR_HUGETLBFS indicates that minor faults
|
||||
* can be intercepted (via REGISTER_MODE_MINOR) for
|
||||
* hugetlbfs-backed pages.
|
||||
*/
|
||||
#define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0)
|
||||
#define UFFD_FEATURE_EVENT_FORK (1<<1)
|
||||
|
@ -181,6 +195,7 @@ struct uffdio_api {
|
|||
#define UFFD_FEATURE_EVENT_UNMAP (1<<6)
|
||||
#define UFFD_FEATURE_SIGBUS (1<<7)
|
||||
#define UFFD_FEATURE_THREAD_ID (1<<8)
|
||||
#define UFFD_FEATURE_MINOR_HUGETLBFS (1<<9)
|
||||
__u64 features;
|
||||
|
||||
__u64 ioctls;
|
||||
|
@ -195,6 +210,7 @@ struct uffdio_register {
|
|||
struct uffdio_range range;
|
||||
#define UFFDIO_REGISTER_MODE_MISSING ((__u64)1<<0)
|
||||
#define UFFDIO_REGISTER_MODE_WP ((__u64)1<<1)
|
||||
#define UFFDIO_REGISTER_MODE_MINOR ((__u64)1<<2)
|
||||
__u64 mode;
|
||||
|
||||
/*
|
||||
|
@ -257,6 +273,18 @@ struct uffdio_writeprotect {
|
|||
__u64 mode;
|
||||
};
|
||||
|
||||
struct uffdio_continue {
|
||||
struct uffdio_range range;
|
||||
#define UFFDIO_CONTINUE_MODE_DONTWAKE ((__u64)1<<0)
|
||||
__u64 mode;
|
||||
|
||||
/*
|
||||
* Fields below here are written by the ioctl and must be at the end:
|
||||
* the copy_from_user will not read past here.
|
||||
*/
|
||||
__s64 mapped;
|
||||
};
|
||||
|
||||
/*
|
||||
* Flags for the userfaultfd(2) system call itself.
|
||||
*/
|
||||
|
|
|
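A hedged userspace sketch (not part of this diff) tying the new uAPI pieces together: register a hugetlbfs-backed mapping for minor faults, then resolve a fault with UFFDIO_CONTINUE once the backing page cache has been populated. Error handling and the userfaultfd()/UFFDIO_API setup are omitted; "uffd" is assumed to have negotiated UFFD_FEATURE_MINOR_HUGETLBFS:

#include <linux/userfaultfd.h>
#include <sys/ioctl.h>
#include <stddef.h>

static int register_minor(int uffd, void *addr, size_t len)
{
	struct uffdio_register reg = {
		.range = { .start = (unsigned long)addr, .len = len },
		.mode  = UFFDIO_REGISTER_MODE_MINOR,
	};

	return ioctl(uffd, UFFDIO_REGISTER, &reg);
}

/* Call after reading a UFFD_EVENT_PAGEFAULT whose flags include
 * UFFD_PAGEFAULT_FLAG_MINOR and after making the page cache for the
 * faulting (huge)page up to date. */
static int resolve_minor_fault(int uffd, void *fault_addr, size_t pagesize)
{
	struct uffdio_continue cont = {
		.range = { .start = (unsigned long)fault_addr, .len = pagesize },
		.mode  = 0,	/* wake the faulting thread */
	};

	return ioctl(uffd, UFFDIO_CONTINUE, &cont);
}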
@ -1644,6 +1644,11 @@ config HAVE_ARCH_USERFAULTFD_WP
|
|||
help
|
||||
Arch has userfaultfd write protection support
|
||||
|
||||
config HAVE_ARCH_USERFAULTFD_MINOR
|
||||
bool
|
||||
help
|
||||
Arch has userfaultfd minor fault support
|
||||
|
||||
config MEMBARRIER
|
||||
bool "Enable membarrier() system call" if EXPERT
|
||||
default y
|
||||
|
|
|
@ -2830,7 +2830,7 @@ static struct ctl_table vm_table[] = {
|
|||
#ifdef CONFIG_COMPACTION
|
||||
{
|
||||
.procname = "compact_memory",
|
||||
.data = &sysctl_compact_memory,
|
||||
.data = NULL,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0200,
|
||||
.proc_handler = sysctl_compaction_handler,
|
||||
|
|
|
@ -7,6 +7,7 @@ menuconfig KFENCE
|
|||
bool "KFENCE: low-overhead sampling-based memory safety error detector"
|
||||
depends on HAVE_ARCH_KFENCE && (SLAB || SLUB)
|
||||
select STACKTRACE
|
||||
select IRQ_WORK
|
||||
help
|
||||
KFENCE is a low-overhead sampling-based detector of heap out-of-bounds
|
||||
access, use-after-free, and invalid-free errors. KFENCE is designed
|
||||
|
|
|
@ -5,6 +5,7 @@
|
|||
#include <linux/fault-inject-usercopy.h>
|
||||
#include <linux/uio.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/vmalloc.h>
|
||||
#include <linux/splice.h>
|
||||
|
@ -507,13 +508,6 @@ void iov_iter_init(struct iov_iter *i, unsigned int direction,
|
|||
}
|
||||
EXPORT_SYMBOL(iov_iter_init);
|
||||
|
||||
static void memzero_page(struct page *page, size_t offset, size_t len)
|
||||
{
|
||||
char *addr = kmap_atomic(page);
|
||||
memset(addr + offset, 0, len);
|
||||
kunmap_atomic(addr);
|
||||
}
|
||||
|
||||
static inline bool allocated(struct pipe_buffer *buf)
|
||||
{
|
||||
return buf->ops == &default_pipe_buf_ops;
28
mm/Kconfig
|
@@ -148,6 +148,9 @@ config MEMORY_ISOLATION
|
|||
config HAVE_BOOTMEM_INFO_NODE
|
||||
def_bool n
|
||||
|
||||
config ARCH_ENABLE_MEMORY_HOTPLUG
|
||||
bool
|
||||
|
||||
# eventually, we can have this option just 'select SPARSEMEM'
|
||||
config MEMORY_HOTPLUG
|
||||
bool "Allow for memory hot-add"
|
||||
|
@ -176,12 +179,20 @@ config MEMORY_HOTPLUG_DEFAULT_ONLINE
|
|||
Say N here if you want the default policy to keep all hot-plugged
|
||||
memory blocks in 'offline' state.
|
||||
|
||||
config ARCH_ENABLE_MEMORY_HOTREMOVE
|
||||
bool
|
||||
|
||||
config MEMORY_HOTREMOVE
|
||||
bool "Allow for memory hot remove"
|
||||
select HAVE_BOOTMEM_INFO_NODE if (X86_64 || PPC64)
|
||||
depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE
|
||||
depends on MIGRATION
|
||||
|
||||
config MHP_MEMMAP_ON_MEMORY
|
||||
def_bool y
|
||||
depends on MEMORY_HOTPLUG && SPARSEMEM_VMEMMAP
|
||||
depends on ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
|
||||
|
||||
# Heavily threaded applications may benefit from splitting the mm-wide
|
||||
# page_table_lock, so that faults on different parts of the user address
|
||||
# space can be handled with less contention: split it at this NR_CPUS.
|
||||
|
@ -273,6 +284,13 @@ config ARCH_ENABLE_HUGEPAGE_MIGRATION
|
|||
config ARCH_ENABLE_THP_MIGRATION
|
||||
bool
|
||||
|
||||
config HUGETLB_PAGE_SIZE_VARIABLE
|
||||
def_bool n
|
||||
help
|
||||
Allows the pageblock_order value to be dynamic instead of just standard
|
||||
HUGETLB_PAGE_ORDER when there are multiple HugeTLB page sizes available
|
||||
on a platform.
|
||||
|
||||
config CONTIG_ALLOC
|
||||
def_bool (MEMORY_ISOLATION && COMPACTION) || CMA
|
||||
|
||||
|
@ -511,6 +529,13 @@ config CMA_DEBUGFS
|
|||
help
|
||||
Turns on the DebugFS interface for CMA.
|
||||
|
||||
config CMA_SYSFS
|
||||
bool "CMA information through sysfs interface"
|
||||
depends on CMA && SYSFS
|
||||
help
|
||||
This option exposes some sysfs attributes to get information
|
||||
from CMA.
|
||||
|
||||
config CMA_AREAS
|
||||
int "Maximum count of the CMA areas"
|
||||
depends on CMA
|
||||
|
@ -758,6 +783,9 @@ config IDLE_PAGE_TRACKING
|
|||
See Documentation/admin-guide/mm/idle_page_tracking.rst for
|
||||
more details.
|
||||
|
||||
config ARCH_HAS_CACHE_LINE_SIZE
|
||||
bool
|
||||
|
||||
config ARCH_HAS_PTE_DEVMAP
|
||||
bool
|
||||
|
||||
|
|
|
@ -58,9 +58,13 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
|
|||
page-alloc-y := page_alloc.o
|
||||
page-alloc-$(CONFIG_SHUFFLE_PAGE_ALLOCATOR) += shuffle.o
|
||||
|
||||
# Give 'memory_hotplug' its own module-parameter namespace
|
||||
memory-hotplug-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
|
||||
|
||||
obj-y += page-alloc.o
|
||||
obj-y += init-mm.o
|
||||
obj-y += memblock.o
|
||||
obj-y += $(memory-hotplug-y)
|
||||
|
||||
ifdef CONFIG_MMU
|
||||
obj-$(CONFIG_ADVISE_SYSCALLS) += madvise.o
|
||||
|
@ -83,7 +87,6 @@ obj-$(CONFIG_SLUB) += slub.o
|
|||
obj-$(CONFIG_KASAN) += kasan/
|
||||
obj-$(CONFIG_KFENCE) += kfence/
|
||||
obj-$(CONFIG_FAILSLAB) += failslab.o
|
||||
obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
|
||||
obj-$(CONFIG_MEMTEST) += memtest.o
|
||||
obj-$(CONFIG_MIGRATION) += migrate.o
|
||||
obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o
|
||||
|
@ -109,6 +112,7 @@ obj-$(CONFIG_CMA) += cma.o
|
|||
obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
|
||||
obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
|
||||
obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
|
||||
obj-$(CONFIG_CMA_SYSFS) += cma_sysfs.o
|
||||
obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
|
||||
obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o
|
||||
obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o
62
mm/cma.c
|
@@ -24,7 +24,6 @@
|
|||
#include <linux/memblock.h>
|
||||
#include <linux/err.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/mutex.h>
|
||||
#include <linux/sizes.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/log2.h>
|
||||
|
@ -80,16 +79,17 @@ static unsigned long cma_bitmap_pages_to_bits(const struct cma *cma,
|
|||
}
|
||||
|
||||
static void cma_clear_bitmap(struct cma *cma, unsigned long pfn,
|
||||
unsigned int count)
|
||||
unsigned long count)
|
||||
{
|
||||
unsigned long bitmap_no, bitmap_count;
|
||||
unsigned long flags;
|
||||
|
||||
bitmap_no = (pfn - cma->base_pfn) >> cma->order_per_bit;
|
||||
bitmap_count = cma_bitmap_pages_to_bits(cma, count);
|
||||
|
||||
mutex_lock(&cma->lock);
|
||||
spin_lock_irqsave(&cma->lock, flags);
|
||||
bitmap_clear(cma->bitmap, bitmap_no, bitmap_count);
|
||||
mutex_unlock(&cma->lock);
|
||||
spin_unlock_irqrestore(&cma->lock, flags);
|
||||
}
|
||||
|
||||
static void __init cma_activate_area(struct cma *cma)
|
||||
|
@ -118,7 +118,7 @@ static void __init cma_activate_area(struct cma *cma)
|
|||
pfn += pageblock_nr_pages)
|
||||
init_cma_reserved_pageblock(pfn_to_page(pfn));
|
||||
|
||||
mutex_init(&cma->lock);
|
||||
spin_lock_init(&cma->lock);
|
||||
|
||||
#ifdef CONFIG_CMA_DEBUGFS
|
||||
INIT_HLIST_HEAD(&cma->mem_head);
|
||||
|
@ -392,7 +392,7 @@ static void cma_debug_show_areas(struct cma *cma)
|
|||
unsigned long nr_part, nr_total = 0;
|
||||
unsigned long nbits = cma_bitmap_maxno(cma);
|
||||
|
||||
mutex_lock(&cma->lock);
|
||||
spin_lock_irq(&cma->lock);
|
||||
pr_info("number of available pages: ");
|
||||
for (;;) {
|
||||
next_zero_bit = find_next_zero_bit(cma->bitmap, nbits, start);
|
||||
|
@ -407,7 +407,7 @@ static void cma_debug_show_areas(struct cma *cma)
|
|||
start = next_zero_bit + nr_zero;
|
||||
}
|
||||
pr_cont("=> %lu free of %lu total pages\n", nr_total, cma->count);
|
||||
mutex_unlock(&cma->lock);
|
||||
spin_unlock_irq(&cma->lock);
|
||||
}
|
||||
#else
|
||||
static inline void cma_debug_show_areas(struct cma *cma) { }
|
||||
|
@ -423,25 +423,27 @@ static inline void cma_debug_show_areas(struct cma *cma) { }
|
|||
* This function allocates part of contiguous memory on specific
|
||||
* contiguous memory area.
|
||||
*/
|
||||
struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
|
||||
bool no_warn)
|
||||
struct page *cma_alloc(struct cma *cma, unsigned long count,
|
||||
unsigned int align, bool no_warn)
|
||||
{
|
||||
unsigned long mask, offset;
|
||||
unsigned long pfn = -1;
|
||||
unsigned long start = 0;
|
||||
unsigned long bitmap_maxno, bitmap_no, bitmap_count;
|
||||
size_t i;
|
||||
unsigned long i;
|
||||
struct page *page = NULL;
|
||||
int ret = -ENOMEM;
|
||||
|
||||
if (!cma || !cma->count || !cma->bitmap)
|
||||
return NULL;
|
||||
goto out;
|
||||
|
||||
pr_debug("%s(cma %p, count %zu, align %d)\n", __func__, (void *)cma,
|
||||
pr_debug("%s(cma %p, count %lu, align %d)\n", __func__, (void *)cma,
|
||||
count, align);
|
||||
|
||||
if (!count)
|
||||
return NULL;
|
||||
goto out;
|
||||
|
||||
trace_cma_alloc_start(cma->name, count, align);
|
||||
|
||||
mask = cma_bitmap_aligned_mask(cma, align);
|
||||
offset = cma_bitmap_aligned_offset(cma, align);
|
||||
|
@ -449,15 +451,15 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
|
|||
bitmap_count = cma_bitmap_pages_to_bits(cma, count);
|
||||
|
||||
if (bitmap_count > bitmap_maxno)
|
||||
return NULL;
|
||||
goto out;
|
||||
|
||||
for (;;) {
|
||||
mutex_lock(&cma->lock);
|
||||
spin_lock_irq(&cma->lock);
|
||||
bitmap_no = bitmap_find_next_zero_area_off(cma->bitmap,
|
||||
bitmap_maxno, start, bitmap_count, mask,
|
||||
offset);
|
||||
if (bitmap_no >= bitmap_maxno) {
|
||||
mutex_unlock(&cma->lock);
|
||||
spin_unlock_irq(&cma->lock);
|
||||
break;
|
||||
}
|
||||
bitmap_set(cma->bitmap, bitmap_no, bitmap_count);
|
||||
|
@ -466,7 +468,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
|
|||
* our exclusive use. If the migration fails we will take the
|
||||
* lock again and unmark it.
|
||||
*/
|
||||
mutex_unlock(&cma->lock);
|
||||
spin_unlock_irq(&cma->lock);
|
||||
|
||||
pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit);
|
||||
ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA,
|
||||
|
@ -483,11 +485,14 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
|
|||
|
||||
pr_debug("%s(): memory range at %p is busy, retrying\n",
|
||||
__func__, pfn_to_page(pfn));
|
||||
|
||||
trace_cma_alloc_busy_retry(cma->name, pfn, pfn_to_page(pfn),
|
||||
count, align);
|
||||
/* try again with a bit different memory target */
|
||||
start = bitmap_no + mask + 1;
|
||||
}
|
||||
|
||||
trace_cma_alloc(pfn, page, count, align);
|
||||
trace_cma_alloc_finish(cma->name, pfn, page, count, align);
|
||||
|
||||
/*
|
||||
* CMA can allocate multiple page blocks, which results in different
|
||||
|
@ -500,12 +505,22 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
|
|||
}
|
||||
|
||||
if (ret && !no_warn) {
|
||||
pr_err("%s: %s: alloc failed, req-size: %zu pages, ret: %d\n",
|
||||
__func__, cma->name, count, ret);
|
||||
pr_err_ratelimited("%s: %s: alloc failed, req-size: %lu pages, ret: %d\n",
|
||||
__func__, cma->name, count, ret);
|
||||
cma_debug_show_areas(cma);
|
||||
}
|
||||
|
||||
pr_debug("%s(): returned %p\n", __func__, page);
|
||||
out:
|
||||
if (page) {
|
||||
count_vm_event(CMA_ALLOC_SUCCESS);
|
||||
cma_sysfs_account_success_pages(cma, count);
|
||||
} else {
|
||||
count_vm_event(CMA_ALLOC_FAIL);
|
||||
if (cma)
|
||||
cma_sysfs_account_fail_pages(cma, count);
|
||||
}
|
||||
|
||||
return page;
|
||||
}
|
||||
|
||||
|
@ -519,14 +534,15 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
|
|||
* It returns false when provided pages do not belong to contiguous area and
|
||||
* true otherwise.
|
||||
*/
|
||||
bool cma_release(struct cma *cma, const struct page *pages, unsigned int count)
|
||||
bool cma_release(struct cma *cma, const struct page *pages,
|
||||
unsigned long count)
|
||||
{
|
||||
unsigned long pfn;
|
||||
|
||||
if (!cma || !pages)
|
||||
return false;
|
||||
|
||||
pr_debug("%s(page %p, count %u)\n", __func__, (void *)pages, count);
|
||||
pr_debug("%s(page %p, count %lu)\n", __func__, (void *)pages, count);
|
||||
|
||||
pfn = page_to_pfn(pages);
|
||||
|
||||
|
@ -537,7 +553,7 @@ bool cma_release(struct cma *cma, const struct page *pages, unsigned int count)
|
|||
|
||||
free_contig_range(pfn, count);
|
||||
cma_clear_bitmap(cma, pfn, count);
|
||||
trace_cma_release(pfn, pages, count);
|
||||
trace_cma_release(cma->name, pfn, pages, count);
|
||||
|
||||
return true;
|
||||
}
|
||||
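For context, a minimal sketch of how a kernel caller might use the widened cma_alloc()/cma_release() signatures shown above (page counts are now unsigned long). The helper names are invented for illustration, and obtaining the struct cma pointer (e.g. from cma_declare_contiguous() or a device's CMA area) is outside this sketch and not part of this series.

#include <linux/cma.h>
#include <linux/mm.h>

/* Illustrative only: "area" comes from elsewhere (not defined here). */
static struct page *demo_grab_buffer(struct cma *area, unsigned long nr_pages)
{
	/* order-0 alignment, warn on failure (no_warn = false) */
	struct page *page = cma_alloc(area, nr_pages, 0, false);

	if (!page)
		return NULL;
	/* ... use the contiguous pages ... */
	return page;
}

static void demo_put_buffer(struct cma *area, struct page *page,
			    unsigned long nr_pages)
{
	cma_release(area, page, nr_pages);
}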
mm/cma.h (25 changed lines)
@ -3,19 +3,33 @@
|
|||
#define __MM_CMA_H__
|
||||
|
||||
#include <linux/debugfs.h>
|
||||
#include <linux/kobject.h>
|
||||
|
||||
struct cma_kobject {
|
||||
struct kobject kobj;
|
||||
struct cma *cma;
|
||||
};
|
||||
|
||||
struct cma {
|
||||
unsigned long base_pfn;
|
||||
unsigned long count;
|
||||
unsigned long *bitmap;
|
||||
unsigned int order_per_bit; /* Order of pages represented by one bit */
|
||||
struct mutex lock;
|
||||
spinlock_t lock;
|
||||
#ifdef CONFIG_CMA_DEBUGFS
|
||||
struct hlist_head mem_head;
|
||||
spinlock_t mem_head_lock;
|
||||
struct debugfs_u32_array dfs_bitmap;
|
||||
#endif
|
||||
char name[CMA_MAX_NAME];
|
||||
#ifdef CONFIG_CMA_SYSFS
|
||||
/* the number of CMA page successful allocations */
|
||||
atomic64_t nr_pages_succeeded;
|
||||
/* the number of CMA page allocation failures */
|
||||
atomic64_t nr_pages_failed;
|
||||
/* kobject requires dynamic object */
|
||||
struct cma_kobject *cma_kobj;
|
||||
#endif
|
||||
};
|
||||
|
||||
extern struct cma cma_areas[MAX_CMA_AREAS];
|
||||
|
@ -26,4 +40,13 @@ static inline unsigned long cma_bitmap_maxno(struct cma *cma)
|
|||
return cma->count >> cma->order_per_bit;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_CMA_SYSFS
|
||||
void cma_sysfs_account_success_pages(struct cma *cma, unsigned long nr_pages);
|
||||
void cma_sysfs_account_fail_pages(struct cma *cma, unsigned long nr_pages);
|
||||
#else
|
||||
static inline void cma_sysfs_account_success_pages(struct cma *cma,
|
||||
unsigned long nr_pages) {};
|
||||
static inline void cma_sysfs_account_fail_pages(struct cma *cma,
|
||||
unsigned long nr_pages) {};
|
||||
#endif
|
||||
#endif
|
||||
|
|
|
@@ -36,10 +36,10 @@ static int cma_used_get(void *data, u64 *val)
 	struct cma *cma = data;
 	unsigned long used;
 
-	mutex_lock(&cma->lock);
+	spin_lock_irq(&cma->lock);
 	/* pages counter is smaller than sizeof(int) */
 	used = bitmap_weight(cma->bitmap, (int)cma_bitmap_maxno(cma));
-	mutex_unlock(&cma->lock);
+	spin_unlock_irq(&cma->lock);
 	*val = (u64)used << cma->order_per_bit;
 
 	return 0;
@@ -53,7 +53,7 @@ static int cma_maxchunk_get(void *data, u64 *val)
 	unsigned long start, end = 0;
 	unsigned long bitmap_maxno = cma_bitmap_maxno(cma);
 
-	mutex_lock(&cma->lock);
+	spin_lock_irq(&cma->lock);
 	for (;;) {
 		start = find_next_zero_bit(cma->bitmap, bitmap_maxno, end);
 		if (start >= bitmap_maxno)
@@ -61,7 +61,7 @@ static int cma_maxchunk_get(void *data, u64 *val)
 		end = find_next_bit(cma->bitmap, bitmap_maxno, start);
 		maxchunk = max(end - start, maxchunk);
 	}
-	mutex_unlock(&cma->lock);
+	spin_unlock_irq(&cma->lock);
 	*val = (u64)maxchunk << cma->order_per_bit;
 
 	return 0;
|
||||
|
|
|
@ -0,0 +1,112 @@
|
|||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* CMA SysFS Interface
|
||||
*
|
||||
* Copyright (c) 2021 Minchan Kim <minchan@kernel.org>
|
||||
*/
|
||||
|
||||
#include <linux/cma.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/slab.h>
|
||||
|
||||
#include "cma.h"
|
||||
|
||||
#define CMA_ATTR_RO(_name) \
|
||||
static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
|
||||
|
||||
void cma_sysfs_account_success_pages(struct cma *cma, unsigned long nr_pages)
|
||||
{
|
||||
atomic64_add(nr_pages, &cma->nr_pages_succeeded);
|
||||
}
|
||||
|
||||
void cma_sysfs_account_fail_pages(struct cma *cma, unsigned long nr_pages)
|
||||
{
|
||||
atomic64_add(nr_pages, &cma->nr_pages_failed);
|
||||
}
|
||||
|
||||
static inline struct cma *cma_from_kobj(struct kobject *kobj)
|
||||
{
|
||||
return container_of(kobj, struct cma_kobject, kobj)->cma;
|
||||
}
|
||||
|
||||
static ssize_t alloc_pages_success_show(struct kobject *kobj,
|
||||
struct kobj_attribute *attr, char *buf)
|
||||
{
|
||||
struct cma *cma = cma_from_kobj(kobj);
|
||||
|
||||
return sysfs_emit(buf, "%llu\n",
|
||||
atomic64_read(&cma->nr_pages_succeeded));
|
||||
}
|
||||
CMA_ATTR_RO(alloc_pages_success);
|
||||
|
||||
static ssize_t alloc_pages_fail_show(struct kobject *kobj,
|
||||
struct kobj_attribute *attr, char *buf)
|
||||
{
|
||||
struct cma *cma = cma_from_kobj(kobj);
|
||||
|
||||
return sysfs_emit(buf, "%llu\n", atomic64_read(&cma->nr_pages_failed));
|
||||
}
|
||||
CMA_ATTR_RO(alloc_pages_fail);
|
||||
|
||||
static void cma_kobj_release(struct kobject *kobj)
|
||||
{
|
||||
struct cma *cma = cma_from_kobj(kobj);
|
||||
struct cma_kobject *cma_kobj = cma->cma_kobj;
|
||||
|
||||
kfree(cma_kobj);
|
||||
cma->cma_kobj = NULL;
|
||||
}
|
||||
|
||||
static struct attribute *cma_attrs[] = {
|
||||
&alloc_pages_success_attr.attr,
|
||||
&alloc_pages_fail_attr.attr,
|
||||
NULL,
|
||||
};
|
||||
ATTRIBUTE_GROUPS(cma);
|
||||
|
||||
static struct kobj_type cma_ktype = {
|
||||
.release = cma_kobj_release,
|
||||
.sysfs_ops = &kobj_sysfs_ops,
|
||||
.default_groups = cma_groups,
|
||||
};
|
||||
|
||||
static int __init cma_sysfs_init(void)
|
||||
{
|
||||
struct kobject *cma_kobj_root;
|
||||
struct cma_kobject *cma_kobj;
|
||||
struct cma *cma;
|
||||
int i, err;
|
||||
|
||||
cma_kobj_root = kobject_create_and_add("cma", mm_kobj);
|
||||
if (!cma_kobj_root)
|
||||
return -ENOMEM;
|
||||
|
||||
for (i = 0; i < cma_area_count; i++) {
|
||||
cma_kobj = kzalloc(sizeof(*cma_kobj), GFP_KERNEL);
|
||||
if (!cma_kobj) {
|
||||
err = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
cma = &cma_areas[i];
|
||||
cma->cma_kobj = cma_kobj;
|
||||
cma_kobj->cma = cma;
|
||||
err = kobject_init_and_add(&cma_kobj->kobj, &cma_ktype,
|
||||
cma_kobj_root, "%s", cma->name);
|
||||
if (err) {
|
||||
kobject_put(&cma_kobj->kobj);
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
out:
|
||||
while (--i >= 0) {
|
||||
cma = &cma_areas[i];
|
||||
kobject_put(&cma->cma_kobj->kobj);
|
||||
}
|
||||
kobject_put(cma_kobj_root);
|
||||
|
||||
return err;
|
||||
}
|
||||
subsys_initcall(cma_sysfs_init);
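As a usage illustration, a small userspace sketch that reads the per-area counters created by the code above. The area name ("reserved") is a placeholder; real directory names under /sys/kernel/mm/cma/ depend on the configured CMA areas.

#include <stdio.h>

/* Placeholder CMA area name; list /sys/kernel/mm/cma/ for real ones. */
#define CMA_DIR "/sys/kernel/mm/cma/reserved/"

static unsigned long long read_counter(const char *path)
{
	unsigned long long val = 0;
	FILE *f = fopen(path, "r");

	if (!f)
		return 0;
	if (fscanf(f, "%llu", &val) != 1)
		val = 0;
	fclose(f);
	return val;
}

int main(void)
{
	printf("alloc_pages_success: %llu\n",
	       read_counter(CMA_DIR "alloc_pages_success"));
	printf("alloc_pages_fail:    %llu\n",
	       read_counter(CMA_DIR "alloc_pages_fail"));
	return 0;
}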
mm/compaction.c (107 changed lines)
@ -787,15 +787,14 @@ static bool too_many_isolated(pg_data_t *pgdat)
|
|||
*
|
||||
* Isolate all pages that can be migrated from the range specified by
|
||||
* [low_pfn, end_pfn). The range is expected to be within same pageblock.
|
||||
* Returns zero if there is a fatal signal pending, otherwise PFN of the
|
||||
* first page that was not scanned (which may be both less, equal to or more
|
||||
* than end_pfn).
|
||||
* Returns errno, like -EAGAIN or -EINTR in case e.g signal pending or congestion,
|
||||
* -ENOMEM in case we could not allocate a page, or 0.
|
||||
* cc->migrate_pfn will contain the next pfn to scan.
|
||||
*
|
||||
* The pages are isolated on cc->migratepages list (not required to be empty),
|
||||
* and cc->nr_migratepages is updated accordingly. The cc->migrate_pfn field
|
||||
* is neither read nor updated.
|
||||
* and cc->nr_migratepages is updated accordingly.
|
||||
*/
|
||||
static unsigned long
|
||||
static int
|
||||
isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
|
||||
unsigned long end_pfn, isolate_mode_t isolate_mode)
|
||||
{
|
||||
|
@ -809,6 +808,9 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
|
|||
bool skip_on_failure = false;
|
||||
unsigned long next_skip_pfn = 0;
|
||||
bool skip_updated = false;
|
||||
int ret = 0;
|
||||
|
||||
cc->migrate_pfn = low_pfn;
|
||||
|
||||
/*
|
||||
* Ensure that there are not too many pages isolated from the LRU
|
||||
|
@ -818,16 +820,16 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
|
|||
while (unlikely(too_many_isolated(pgdat))) {
|
||||
/* stop isolation if there are still pages not migrated */
|
||||
if (cc->nr_migratepages)
|
||||
return 0;
|
||||
return -EAGAIN;
|
||||
|
||||
/* async migration should just abort */
|
||||
if (cc->mode == MIGRATE_ASYNC)
|
||||
return 0;
|
||||
return -EAGAIN;
|
||||
|
||||
congestion_wait(BLK_RW_ASYNC, HZ/10);
|
||||
|
||||
if (fatal_signal_pending(current))
|
||||
return 0;
|
||||
return -EINTR;
|
||||
}
|
||||
|
||||
cond_resched();
|
||||
|
@ -875,8 +877,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
|
|||
|
||||
if (fatal_signal_pending(current)) {
|
||||
cc->contended = true;
|
||||
ret = -EINTR;
|
||||
|
||||
low_pfn = 0;
|
||||
goto fatal_pending;
|
||||
}
|
||||
|
||||
|
@ -904,6 +906,38 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
|
|||
valid_page = page;
|
||||
}
|
||||
|
||||
if (PageHuge(page) && cc->alloc_contig) {
|
||||
ret = isolate_or_dissolve_huge_page(page, &cc->migratepages);
|
||||
|
||||
/*
|
||||
* Fail isolation in case isolate_or_dissolve_huge_page()
|
||||
* reports an error. In case of -ENOMEM, abort right away.
|
||||
*/
|
||||
if (ret < 0) {
|
||||
/* Do not report -EBUSY down the chain */
|
||||
if (ret == -EBUSY)
|
||||
ret = 0;
|
||||
low_pfn += (1UL << compound_order(page)) - 1;
|
||||
goto isolate_fail;
|
||||
}
|
||||
|
||||
if (PageHuge(page)) {
|
||||
/*
|
||||
* Hugepage was successfully isolated and placed
|
||||
* on the cc->migratepages list.
|
||||
*/
|
||||
low_pfn += compound_nr(page) - 1;
|
||||
goto isolate_success_no_list;
|
||||
}
|
||||
|
||||
/*
|
||||
* Ok, the hugepage was dissolved. Now these pages are
|
||||
* Buddy and cannot be re-allocated because they are
|
||||
* isolated. Fall-through as the check below handles
|
||||
* Buddy pages.
|
||||
*/
|
||||
}
|
||||
|
||||
/*
|
||||
* Skip if free. We read page order here without zone lock
|
||||
* which is generally unsafe, but the race window is small and
|
||||
|
@ -1037,6 +1071,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
|
|||
|
||||
isolate_success:
|
||||
list_add(&page->lru, &cc->migratepages);
|
||||
isolate_success_no_list:
|
||||
cc->nr_migratepages += compound_nr(page);
|
||||
nr_isolated += compound_nr(page);
|
||||
|
||||
|
@ -1063,7 +1098,7 @@ isolate_fail_put:
|
|||
put_page(page);
|
||||
|
||||
isolate_fail:
|
||||
if (!skip_on_failure)
|
||||
if (!skip_on_failure && ret != -ENOMEM)
|
||||
continue;
|
||||
|
||||
/*
|
||||
|
@ -1089,6 +1124,9 @@ isolate_fail:
|
|||
*/
|
||||
next_skip_pfn += 1UL << cc->order;
|
||||
}
|
||||
|
||||
if (ret == -ENOMEM)
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -1130,7 +1168,9 @@ fatal_pending:
|
|||
if (nr_isolated)
|
||||
count_compact_events(COMPACTISOLATED, nr_isolated);
|
||||
|
||||
return low_pfn;
|
||||
cc->migrate_pfn = low_pfn;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -1139,15 +1179,15 @@ fatal_pending:
|
|||
* @start_pfn: The first PFN to start isolating.
|
||||
* @end_pfn: The one-past-last PFN.
|
||||
*
|
||||
* Returns zero if isolation fails fatally due to e.g. pending signal.
|
||||
* Otherwise, function returns one-past-the-last PFN of isolated page
|
||||
* (which may be greater than end_pfn if end fell in a middle of a THP page).
|
||||
* Returns -EAGAIN when contented, -EINTR in case of a signal pending, -ENOMEM
|
||||
* in case we could not allocate a page, or 0.
|
||||
*/
|
||||
unsigned long
|
||||
int
|
||||
isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
|
||||
unsigned long end_pfn)
|
||||
{
|
||||
unsigned long pfn, block_start_pfn, block_end_pfn;
|
||||
int ret = 0;
|
||||
|
||||
/* Scan block by block. First and last block may be incomplete */
|
||||
pfn = start_pfn;
|
||||
|
@ -1166,17 +1206,17 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
|
|||
block_end_pfn, cc->zone))
|
||||
continue;
|
||||
|
||||
pfn = isolate_migratepages_block(cc, pfn, block_end_pfn,
|
||||
ISOLATE_UNEVICTABLE);
|
||||
ret = isolate_migratepages_block(cc, pfn, block_end_pfn,
|
||||
ISOLATE_UNEVICTABLE);
|
||||
|
||||
if (!pfn)
|
||||
if (ret)
|
||||
break;
|
||||
|
||||
if (cc->nr_migratepages >= COMPACT_CLUSTER_MAX)
|
||||
break;
|
||||
}
|
||||
|
||||
return pfn;
|
||||
return ret;
|
||||
}
|
||||
|
||||
#endif /* CONFIG_COMPACTION || CONFIG_CMA */
|
||||
|
@ -1847,7 +1887,7 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
|
|||
*/
|
||||
for (; block_end_pfn <= cc->free_pfn;
|
||||
fast_find_block = false,
|
||||
low_pfn = block_end_pfn,
|
||||
cc->migrate_pfn = low_pfn = block_end_pfn,
|
||||
block_start_pfn = block_end_pfn,
|
||||
block_end_pfn += pageblock_nr_pages) {
|
||||
|
||||
|
@ -1889,10 +1929,8 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
|
|||
}
|
||||
|
||||
/* Perform the isolation */
|
||||
low_pfn = isolate_migratepages_block(cc, low_pfn,
|
||||
block_end_pfn, isolate_mode);
|
||||
|
||||
if (!low_pfn)
|
||||
if (isolate_migratepages_block(cc, low_pfn, block_end_pfn,
|
||||
isolate_mode))
|
||||
return ISOLATE_ABORT;
|
||||
|
||||
/*
|
||||
|
@ -1903,9 +1941,6 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
|
|||
break;
|
||||
}
|
||||
|
||||
/* Record where migration scanner will be restarted. */
|
||||
cc->migrate_pfn = low_pfn;
|
||||
|
||||
return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
|
||||
}
|
||||
|
||||
|
@ -2319,7 +2354,8 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
|
|||
trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
|
||||
cc->free_pfn, end_pfn, sync);
|
||||
|
||||
migrate_prep_local();
|
||||
/* lru_add_drain_all could be expensive with involving other CPUs */
|
||||
lru_add_drain();
|
||||
|
||||
while ((ret = compact_finished(cc)) == COMPACT_CONTINUE) {
|
||||
int err;
|
||||
|
@ -2494,6 +2530,14 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
|
|||
*/
|
||||
WRITE_ONCE(current->capture_control, NULL);
|
||||
*capture = READ_ONCE(capc.page);
|
||||
/*
|
||||
* Technically, it is also possible that compaction is skipped but
|
||||
* the page is still captured out of luck(IRQ came and freed the page).
|
||||
* Returning COMPACT_SUCCESS in such cases helps in properly accounting
|
||||
* the COMPACT[STALL|FAIL] when compaction is skipped.
|
||||
*/
|
||||
if (*capture)
|
||||
ret = COMPACT_SUCCESS;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
@ -2657,9 +2701,6 @@ static void compact_nodes(void)
|
|||
compact_node(nid);
|
||||
}
|
||||
|
||||
/* The written value is actually unused, all memory is compacted */
|
||||
int sysctl_compact_memory;
|
||||
|
||||
/*
|
||||
* Tunable for proactive compaction. It determines how
|
||||
* aggressively the kernel should compact memory in the
|
||||
|
@ -2844,7 +2885,7 @@ void wakeup_kcompactd(pg_data_t *pgdat, int order, int highest_zoneidx)
|
|||
*/
|
||||
static int kcompactd(void *p)
|
||||
{
|
||||
pg_data_t *pgdat = (pg_data_t*)p;
|
||||
pg_data_t *pgdat = (pg_data_t *)p;
|
||||
struct task_struct *tsk = current;
|
||||
unsigned int proactive_defer = 0;
|
||||
|
||||
|
mm/filemap.c (24 changed lines)
@ -142,17 +142,6 @@ static void page_cache_delete(struct address_space *mapping,
|
|||
|
||||
page->mapping = NULL;
|
||||
/* Leave page->index set: truncation lookup relies upon it */
|
||||
|
||||
if (shadow) {
|
||||
mapping->nrexceptional += nr;
|
||||
/*
|
||||
* Make sure the nrexceptional update is committed before
|
||||
* the nrpages update so that final truncate racing
|
||||
* with reclaim does not see both counters 0 at the
|
||||
* same time and miss a shadow entry.
|
||||
*/
|
||||
smp_wmb();
|
||||
}
|
||||
mapping->nrpages -= nr;
|
||||
}
|
||||
|
||||
|
@ -629,9 +618,6 @@ EXPORT_SYMBOL(filemap_fdatawait_keep_errors);
|
|||
/* Returns true if writeback might be needed or already in progress. */
|
||||
static bool mapping_needs_writeback(struct address_space *mapping)
|
||||
{
|
||||
if (dax_mapping(mapping))
|
||||
return mapping->nrexceptional;
|
||||
|
||||
return mapping->nrpages;
|
||||
}
|
||||
|
||||
|
@ -925,8 +911,6 @@ noinline int __add_to_page_cache_locked(struct page *page,
|
|||
if (xas_error(&xas))
|
||||
goto unlock;
|
||||
|
||||
if (old)
|
||||
mapping->nrexceptional--;
|
||||
mapping->nrpages++;
|
||||
|
||||
/* hugetlb pages do not participate in page cache accounting */
|
||||
|
@ -3283,7 +3267,7 @@ const struct vm_operations_struct generic_file_vm_ops = {
|
|||
|
||||
/* This is used for a general mmap of a disk file */
|
||||
|
||||
int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
|
||||
int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
|
||||
{
|
||||
struct address_space *mapping = file->f_mapping;
|
||||
|
||||
|
@ -3308,11 +3292,11 @@ vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
|
|||
{
|
||||
return VM_FAULT_SIGBUS;
|
||||
}
|
||||
int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
|
||||
int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
|
||||
{
|
||||
return -ENOSYS;
|
||||
}
|
||||
int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma)
|
||||
int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
|
||||
{
|
||||
return -ENOSYS;
|
||||
}
|
||||
|
@ -3740,7 +3724,7 @@ EXPORT_SYMBOL(generic_perform_write);
|
|||
ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
|
||||
{
|
||||
struct file *file = iocb->ki_filp;
|
||||
struct address_space * mapping = file->f_mapping;
|
||||
struct address_space *mapping = file->f_mapping;
|
||||
struct inode *inode = mapping->host;
|
||||
ssize_t written = 0;
|
||||
ssize_t err;
|
||||
|
|
|
@@ -60,16 +60,20 @@ static u64 frontswap_succ_stores;
 static u64 frontswap_failed_stores;
 static u64 frontswap_invalidates;
 
-static inline void inc_frontswap_loads(void) {
+static inline void inc_frontswap_loads(void)
+{
 	data_race(frontswap_loads++);
 }
-static inline void inc_frontswap_succ_stores(void) {
+static inline void inc_frontswap_succ_stores(void)
+{
 	data_race(frontswap_succ_stores++);
 }
-static inline void inc_frontswap_failed_stores(void) {
+static inline void inc_frontswap_failed_stores(void)
+{
 	data_race(frontswap_failed_stores++);
 }
-static inline void inc_frontswap_invalidates(void) {
+static inline void inc_frontswap_invalidates(void)
+{
 	data_race(frontswap_invalidates++);
 }
 #else
mm/gup.c (174 changed lines)
@ -87,11 +87,12 @@ __maybe_unused struct page *try_grab_compound_head(struct page *page,
|
|||
int orig_refs = refs;
|
||||
|
||||
/*
|
||||
* Can't do FOLL_LONGTERM + FOLL_PIN with CMA in the gup fast
|
||||
* path, so fail and let the caller fall back to the slow path.
|
||||
* Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a
|
||||
* right zone, so fail and let the caller fall back to the slow
|
||||
* path.
|
||||
*/
|
||||
if (unlikely(flags & FOLL_LONGTERM) &&
|
||||
is_migrate_cma_page(page))
|
||||
if (unlikely((flags & FOLL_LONGTERM) &&
|
||||
!is_pinnable_page(page)))
|
||||
return NULL;
|
||||
|
||||
/*
|
||||
|
@ -1527,7 +1528,7 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
|
|||
{
|
||||
struct vm_area_struct *vma;
|
||||
unsigned long vm_flags;
|
||||
int i;
|
||||
long i;
|
||||
|
||||
/* calculate required read or write permissions.
|
||||
* If FOLL_FORCE is set, we only require the "MAY" flags.
|
||||
|
@ -1600,112 +1601,92 @@ struct page *get_dump_page(unsigned long addr)
|
|||
}
|
||||
#endif /* CONFIG_ELF_CORE */
|
||||
|
||||
#ifdef CONFIG_CMA
|
||||
static long check_and_migrate_cma_pages(struct mm_struct *mm,
|
||||
unsigned long start,
|
||||
unsigned long nr_pages,
|
||||
struct page **pages,
|
||||
struct vm_area_struct **vmas,
|
||||
unsigned int gup_flags)
|
||||
#ifdef CONFIG_MIGRATION
|
||||
/*
|
||||
* Check whether all pages are pinnable, if so return number of pages. If some
|
||||
* pages are not pinnable, migrate them, and unpin all pages. Return zero if
|
||||
* pages were migrated, or if some pages were not successfully isolated.
|
||||
* Return negative error if migration fails.
|
||||
*/
|
||||
static long check_and_migrate_movable_pages(unsigned long nr_pages,
|
||||
struct page **pages,
|
||||
unsigned int gup_flags)
|
||||
{
|
||||
unsigned long i;
|
||||
unsigned long step;
|
||||
unsigned long isolation_error_count = 0;
|
||||
bool drain_allow = true;
|
||||
bool migrate_allow = true;
|
||||
LIST_HEAD(cma_page_list);
|
||||
long ret = nr_pages;
|
||||
LIST_HEAD(movable_page_list);
|
||||
long ret = 0;
|
||||
struct page *prev_head = NULL;
|
||||
struct page *head;
|
||||
struct migration_target_control mtc = {
|
||||
.nid = NUMA_NO_NODE,
|
||||
.gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_NOWARN,
|
||||
.gfp_mask = GFP_USER | __GFP_NOWARN,
|
||||
};
|
||||
|
||||
check_again:
|
||||
for (i = 0; i < nr_pages;) {
|
||||
|
||||
struct page *head = compound_head(pages[i]);
|
||||
|
||||
for (i = 0; i < nr_pages; i++) {
|
||||
head = compound_head(pages[i]);
|
||||
if (head == prev_head)
|
||||
continue;
|
||||
prev_head = head;
|
||||
/*
|
||||
* gup may start from a tail page. Advance step by the left
|
||||
* part.
|
||||
* If we get a movable page, since we are going to be pinning
|
||||
* these entries, try to move them out if possible.
|
||||
*/
|
||||
step = compound_nr(head) - (pages[i] - head);
|
||||
/*
|
||||
* If we get a page from the CMA zone, since we are going to
|
||||
* be pinning these entries, we might as well move them out
|
||||
* of the CMA zone if possible.
|
||||
*/
|
||||
if (is_migrate_cma_page(head)) {
|
||||
if (PageHuge(head))
|
||||
isolate_huge_page(head, &cma_page_list);
|
||||
else {
|
||||
if (!is_pinnable_page(head)) {
|
||||
if (PageHuge(head)) {
|
||||
if (!isolate_huge_page(head, &movable_page_list))
|
||||
isolation_error_count++;
|
||||
} else {
|
||||
if (!PageLRU(head) && drain_allow) {
|
||||
lru_add_drain_all();
|
||||
drain_allow = false;
|
||||
}
|
||||
|
||||
if (!isolate_lru_page(head)) {
|
||||
list_add_tail(&head->lru, &cma_page_list);
|
||||
mod_node_page_state(page_pgdat(head),
|
||||
NR_ISOLATED_ANON +
|
||||
page_is_file_lru(head),
|
||||
thp_nr_pages(head));
|
||||
if (isolate_lru_page(head)) {
|
||||
isolation_error_count++;
|
||||
continue;
|
||||
}
|
||||
list_add_tail(&head->lru, &movable_page_list);
|
||||
mod_node_page_state(page_pgdat(head),
|
||||
NR_ISOLATED_ANON +
|
||||
page_is_file_lru(head),
|
||||
thp_nr_pages(head));
|
||||
}
|
||||
}
|
||||
|
||||
i += step;
|
||||
}
|
||||
|
||||
if (!list_empty(&cma_page_list)) {
|
||||
/*
|
||||
* drop the above get_user_pages reference.
|
||||
*/
|
||||
if (gup_flags & FOLL_PIN)
|
||||
unpin_user_pages(pages, nr_pages);
|
||||
else
|
||||
for (i = 0; i < nr_pages; i++)
|
||||
put_page(pages[i]);
|
||||
/*
|
||||
* If list is empty, and no isolation errors, means that all pages are
|
||||
* in the correct zone.
|
||||
*/
|
||||
if (list_empty(&movable_page_list) && !isolation_error_count)
|
||||
return nr_pages;
|
||||
|
||||
if (migrate_pages(&cma_page_list, alloc_migration_target, NULL,
|
||||
(unsigned long)&mtc, MIGRATE_SYNC, MR_CONTIG_RANGE)) {
|
||||
/*
|
||||
* some of the pages failed migration. Do get_user_pages
|
||||
* without migration.
|
||||
*/
|
||||
migrate_allow = false;
|
||||
|
||||
if (!list_empty(&cma_page_list))
|
||||
putback_movable_pages(&cma_page_list);
|
||||
}
|
||||
/*
|
||||
* We did migrate all the pages, Try to get the page references
|
||||
* again migrating any new CMA pages which we failed to isolate
|
||||
* earlier.
|
||||
*/
|
||||
ret = __get_user_pages_locked(mm, start, nr_pages,
|
||||
pages, vmas, NULL,
|
||||
gup_flags);
|
||||
|
||||
if ((ret > 0) && migrate_allow) {
|
||||
nr_pages = ret;
|
||||
drain_allow = true;
|
||||
goto check_again;
|
||||
}
|
||||
if (gup_flags & FOLL_PIN) {
|
||||
unpin_user_pages(pages, nr_pages);
|
||||
} else {
|
||||
for (i = 0; i < nr_pages; i++)
|
||||
put_page(pages[i]);
|
||||
}
|
||||
if (!list_empty(&movable_page_list)) {
|
||||
ret = migrate_pages(&movable_page_list, alloc_migration_target,
|
||||
NULL, (unsigned long)&mtc, MIGRATE_SYNC,
|
||||
MR_LONGTERM_PIN);
|
||||
if (ret && !list_empty(&movable_page_list))
|
||||
putback_movable_pages(&movable_page_list);
|
||||
}
|
||||
|
||||
return ret;
|
||||
return ret > 0 ? -ENOMEM : ret;
|
||||
}
|
||||
#else
|
||||
static long check_and_migrate_cma_pages(struct mm_struct *mm,
|
||||
unsigned long start,
|
||||
unsigned long nr_pages,
|
||||
struct page **pages,
|
||||
struct vm_area_struct **vmas,
|
||||
unsigned int gup_flags)
|
||||
static long check_and_migrate_movable_pages(unsigned long nr_pages,
|
||||
struct page **pages,
|
||||
unsigned int gup_flags)
|
||||
{
|
||||
return nr_pages;
|
||||
}
|
||||
#endif /* CONFIG_CMA */
|
||||
#endif /* CONFIG_MIGRATION */
|
||||
|
||||
/*
|
||||
* __gup_longterm_locked() is a wrapper for __get_user_pages_locked which
|
||||
|
@ -1718,21 +1699,22 @@ static long __gup_longterm_locked(struct mm_struct *mm,
|
|||
struct vm_area_struct **vmas,
|
||||
unsigned int gup_flags)
|
||||
{
|
||||
unsigned long flags = 0;
|
||||
unsigned int flags;
|
||||
long rc;
|
||||
|
||||
if (gup_flags & FOLL_LONGTERM)
|
||||
flags = memalloc_nocma_save();
|
||||
if (!(gup_flags & FOLL_LONGTERM))
|
||||
return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
|
||||
NULL, gup_flags);
|
||||
flags = memalloc_pin_save();
|
||||
do {
|
||||
rc = __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
|
||||
NULL, gup_flags);
|
||||
if (rc <= 0)
|
||||
break;
|
||||
rc = check_and_migrate_movable_pages(rc, pages, gup_flags);
|
||||
} while (!rc);
|
||||
memalloc_pin_restore(flags);
|
||||
|
||||
rc = __get_user_pages_locked(mm, start, nr_pages, pages, vmas, NULL,
|
||||
gup_flags);
|
||||
|
||||
if (gup_flags & FOLL_LONGTERM) {
|
||||
if (rc > 0)
|
||||
rc = check_and_migrate_cma_pages(mm, start, rc, pages,
|
||||
vmas, gup_flags);
|
||||
memalloc_nocma_restore(flags);
|
||||
}
|
||||
return rc;
|
||||
}
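For context, a minimal driver-style sketch (not part of this series) of a long-term pin that ends up going through the path above: pin_user_pages() with FOLL_LONGTERM, which now migrates non-pinnable pages via check_and_migrate_movable_pages(). The helper name and the exact locking placement are illustrative assumptions.

#include <linux/mm.h>
#include <linux/sched/mm.h>

/* Illustrative helper: long-term pin @nr_pages user pages at @uaddr. */
static long demo_longterm_pin(unsigned long uaddr, unsigned long nr_pages,
			      struct page **pages)
{
	long pinned;

	mmap_read_lock(current->mm);
	pinned = pin_user_pages(uaddr, nr_pages,
				FOLL_WRITE | FOLL_LONGTERM, pages, NULL);
	mmap_read_unlock(current->mm);

	if (pinned < 0)
		return pinned;
	if (pinned != nr_pages) {
		/* Partial pin: release what was pinned and report failure. */
		unpin_user_pages(pages, pinned);
		return -EFAULT;
	}
	return pinned;
}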
|
||||
|
||||
|
|
|
@ -52,6 +52,12 @@ static void verify_dma_pinned(unsigned int cmd, struct page **pages,
|
|||
|
||||
dump_page(page, "gup_test failure");
|
||||
break;
|
||||
} else if (cmd == PIN_LONGTERM_BENCHMARK &&
|
||||
WARN(!is_pinnable_page(page),
|
||||
"pages[%lu] is NOT pinnable but pinned\n",
|
||||
i)) {
|
||||
dump_page(page, "gup_test failure");
|
||||
break;
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
@ -94,7 +100,7 @@ static int __gup_test_ioctl(unsigned int cmd,
|
|||
{
|
||||
ktime_t start_time, end_time;
|
||||
unsigned long i, nr_pages, addr, next;
|
||||
int nr;
|
||||
long nr;
|
||||
struct page **pages;
|
||||
int ret = 0;
|
||||
bool needs_mmap_lock =
|
||||
|
@ -126,37 +132,34 @@ static int __gup_test_ioctl(unsigned int cmd,
|
|||
nr = (next - addr) / PAGE_SIZE;
|
||||
}
|
||||
|
||||
/* Filter out most gup flags: only allow a tiny subset here: */
|
||||
gup->flags &= FOLL_WRITE;
|
||||
|
||||
switch (cmd) {
|
||||
case GUP_FAST_BENCHMARK:
|
||||
nr = get_user_pages_fast(addr, nr, gup->flags,
|
||||
nr = get_user_pages_fast(addr, nr, gup->gup_flags,
|
||||
pages + i);
|
||||
break;
|
||||
case GUP_BASIC_TEST:
|
||||
nr = get_user_pages(addr, nr, gup->flags, pages + i,
|
||||
nr = get_user_pages(addr, nr, gup->gup_flags, pages + i,
|
||||
NULL);
|
||||
break;
|
||||
case PIN_FAST_BENCHMARK:
|
||||
nr = pin_user_pages_fast(addr, nr, gup->flags,
|
||||
nr = pin_user_pages_fast(addr, nr, gup->gup_flags,
|
||||
pages + i);
|
||||
break;
|
||||
case PIN_BASIC_TEST:
|
||||
nr = pin_user_pages(addr, nr, gup->flags, pages + i,
|
||||
nr = pin_user_pages(addr, nr, gup->gup_flags, pages + i,
|
||||
NULL);
|
||||
break;
|
||||
case PIN_LONGTERM_BENCHMARK:
|
||||
nr = pin_user_pages(addr, nr,
|
||||
gup->flags | FOLL_LONGTERM,
|
||||
gup->gup_flags | FOLL_LONGTERM,
|
||||
pages + i, NULL);
|
||||
break;
|
||||
case DUMP_USER_PAGES_TEST:
|
||||
if (gup->flags & GUP_TEST_FLAG_DUMP_PAGES_USE_PIN)
|
||||
nr = pin_user_pages(addr, nr, gup->flags,
|
||||
if (gup->test_flags & GUP_TEST_FLAG_DUMP_PAGES_USE_PIN)
|
||||
nr = pin_user_pages(addr, nr, gup->gup_flags,
|
||||
pages + i, NULL);
|
||||
else
|
||||
nr = get_user_pages(addr, nr, gup->flags,
|
||||
nr = get_user_pages(addr, nr, gup->gup_flags,
|
||||
pages + i, NULL);
|
||||
break;
|
||||
default:
|
||||
|
@ -187,7 +190,7 @@ static int __gup_test_ioctl(unsigned int cmd,
|
|||
|
||||
start_time = ktime_get();
|
||||
|
||||
put_back_pages(cmd, pages, nr_pages, gup->flags);
|
||||
put_back_pages(cmd, pages, nr_pages, gup->test_flags);
|
||||
|
||||
end_time = ktime_get();
|
||||
gup->put_delta_usec = ktime_us_delta(end_time, start_time);
|
||||
|
|
|
@ -21,7 +21,8 @@ struct gup_test {
|
|||
__u64 addr;
|
||||
__u64 size;
|
||||
__u32 nr_pages_per_call;
|
||||
__u32 flags;
|
||||
__u32 gup_flags;
|
||||
__u32 test_flags;
|
||||
/*
|
||||
* Each non-zero entry is the number of the page (1-based: first page is
|
||||
* page 1, so that zero entries mean "do nothing") from the .addr base.
|
||||
|
mm/highmem.c (11 changed lines)
@ -104,7 +104,7 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color)
|
|||
atomic_long_t _totalhigh_pages __read_mostly;
|
||||
EXPORT_SYMBOL(_totalhigh_pages);
|
||||
|
||||
unsigned int __nr_free_highpages (void)
|
||||
unsigned int __nr_free_highpages(void)
|
||||
{
|
||||
struct zone *zone;
|
||||
unsigned int pages = 0;
|
||||
|
@ -120,7 +120,7 @@ unsigned int __nr_free_highpages (void)
|
|||
static int pkmap_count[LAST_PKMAP];
|
||||
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock);
|
||||
|
||||
pte_t * pkmap_page_table;
|
||||
pte_t *pkmap_page_table;
|
||||
|
||||
/*
|
||||
* Most architectures have no use for kmap_high_get(), so let's abstract
|
||||
|
@ -147,6 +147,7 @@ struct page *__kmap_to_page(void *vaddr)
|
|||
|
||||
if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) {
|
||||
int i = PKMAP_NR(addr);
|
||||
|
||||
return pte_page(pkmap_page_table[i]);
|
||||
}
|
||||
|
||||
|
@ -278,9 +279,8 @@ void *kmap_high(struct page *page)
|
|||
pkmap_count[PKMAP_NR(vaddr)]++;
|
||||
BUG_ON(pkmap_count[PKMAP_NR(vaddr)] < 2);
|
||||
unlock_kmap();
|
||||
return (void*) vaddr;
|
||||
return (void *) vaddr;
|
||||
}
|
||||
|
||||
EXPORT_SYMBOL(kmap_high);
|
||||
|
||||
#ifdef ARCH_NEEDS_KMAP_HIGH_GET
|
||||
|
@ -305,7 +305,7 @@ void *kmap_high_get(struct page *page)
|
|||
pkmap_count[PKMAP_NR(vaddr)]++;
|
||||
}
|
||||
unlock_kmap_any(flags);
|
||||
return (void*) vaddr;
|
||||
return (void *) vaddr;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -737,7 +737,6 @@ done:
|
|||
spin_unlock_irqrestore(&pas->lock, flags);
|
||||
return ret;
|
||||
}
|
||||
|
||||
EXPORT_SYMBOL(page_address);
|
||||
|
||||
/**
|
||||
|
mm/huge_memory.c (328 changed lines)
@ -7,6 +7,7 @@
|
|||
|
||||
#include <linux/mm.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/sched/mm.h>
|
||||
#include <linux/sched/coredump.h>
|
||||
#include <linux/sched/numa_balancing.h>
|
||||
#include <linux/highmem.h>
|
||||
|
@ -77,18 +78,18 @@ bool transparent_hugepage_enabled(struct vm_area_struct *vma)
|
|||
return false;
|
||||
}
|
||||
|
||||
static struct page *get_huge_zero_page(void)
|
||||
static bool get_huge_zero_page(void)
|
||||
{
|
||||
struct page *zero_page;
|
||||
retry:
|
||||
if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
|
||||
return READ_ONCE(huge_zero_page);
|
||||
return true;
|
||||
|
||||
zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
|
||||
HPAGE_PMD_ORDER);
|
||||
if (!zero_page) {
|
||||
count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
|
||||
return NULL;
|
||||
return false;
|
||||
}
|
||||
count_vm_event(THP_ZERO_PAGE_ALLOC);
|
||||
preempt_disable();
|
||||
|
@ -101,7 +102,7 @@ retry:
|
|||
/* We take additional reference here. It will be put back by shrinker */
|
||||
atomic_set(&huge_zero_refcount, 2);
|
||||
preempt_enable();
|
||||
return READ_ONCE(huge_zero_page);
|
||||
return true;
|
||||
}
|
||||
|
||||
static void put_huge_zero_page(void)
|
||||
|
@ -624,14 +625,12 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
|
|||
|
||||
/* Deliver the page fault to userland */
|
||||
if (userfaultfd_missing(vma)) {
|
||||
vm_fault_t ret2;
|
||||
|
||||
spin_unlock(vmf->ptl);
|
||||
put_page(page);
|
||||
pte_free(vma->vm_mm, pgtable);
|
||||
ret2 = handle_userfault(vmf, VM_UFFD_MISSING);
|
||||
VM_BUG_ON(ret2 & VM_FAULT_FALLBACK);
|
||||
return ret2;
|
||||
ret = handle_userfault(vmf, VM_UFFD_MISSING);
|
||||
VM_BUG_ON(ret & VM_FAULT_FALLBACK);
|
||||
return ret;
|
||||
}
|
||||
|
||||
entry = mk_huge_pmd(page, vma->vm_page_prot);
|
||||
|
@ -1293,7 +1292,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
|
|||
}
|
||||
|
||||
page = pmd_page(orig_pmd);
|
||||
VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
|
||||
VM_BUG_ON_PAGE(!PageHead(page), page);
|
||||
|
||||
/* Lock page for reuse_swap_page() */
|
||||
if (!trylock_page(page)) {
|
||||
|
@ -1464,12 +1463,6 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
|
|||
*/
|
||||
page_locked = trylock_page(page);
|
||||
target_nid = mpol_misplaced(page, vma, haddr);
|
||||
if (target_nid == NUMA_NO_NODE) {
|
||||
/* If the page was locked, there are no parallel migrations */
|
||||
if (page_locked)
|
||||
goto clear_pmdnuma;
|
||||
}
|
||||
|
||||
/* Migration could have started since the pmd_trans_migrating check */
|
||||
if (!page_locked) {
|
||||
page_nid = NUMA_NO_NODE;
|
||||
|
@ -1478,6 +1471,11 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
|
|||
spin_unlock(vmf->ptl);
|
||||
put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
|
||||
goto out;
|
||||
} else if (target_nid == NUMA_NO_NODE) {
|
||||
/* There are no parallel migrations and page is in the right
|
||||
* node. Clear the numa hinting info in this pmd.
|
||||
*/
|
||||
goto clear_pmdnuma;
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -1696,7 +1694,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
|
|||
|
||||
VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
|
||||
entry = pmd_to_swp_entry(orig_pmd);
|
||||
page = pfn_to_page(swp_offset(entry));
|
||||
page = migration_entry_to_page(entry);
|
||||
flush_needed = 0;
|
||||
} else
|
||||
WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
|
||||
|
@ -2104,7 +2102,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
|
|||
swp_entry_t entry;
|
||||
|
||||
entry = pmd_to_swp_entry(old_pmd);
|
||||
page = pfn_to_page(swp_offset(entry));
|
||||
page = migration_entry_to_page(entry);
|
||||
write = is_write_migration_entry(entry);
|
||||
young = false;
|
||||
soft_dirty = pmd_swp_soft_dirty(old_pmd);
|
||||
|
@ -2303,44 +2301,38 @@ void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
|
|||
__split_huge_pmd(vma, pmd, address, freeze, page);
|
||||
}
|
||||
|
||||
static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address)
|
||||
{
|
||||
/*
|
||||
* If the new address isn't hpage aligned and it could previously
|
||||
* contain an hugepage: check if we need to split an huge pmd.
|
||||
*/
|
||||
if (!IS_ALIGNED(address, HPAGE_PMD_SIZE) &&
|
||||
range_in_vma(vma, ALIGN_DOWN(address, HPAGE_PMD_SIZE),
|
||||
ALIGN(address, HPAGE_PMD_SIZE)))
|
||||
split_huge_pmd_address(vma, address, false, NULL);
|
||||
}
|
||||
|
||||
void vma_adjust_trans_huge(struct vm_area_struct *vma,
|
||||
unsigned long start,
|
||||
unsigned long end,
|
||||
long adjust_next)
|
||||
{
|
||||
/*
|
||||
* If the new start address isn't hpage aligned and it could
|
||||
* previously contain an hugepage: check if we need to split
|
||||
* an huge pmd.
|
||||
*/
|
||||
if (start & ~HPAGE_PMD_MASK &&
|
||||
(start & HPAGE_PMD_MASK) >= vma->vm_start &&
|
||||
(start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
|
||||
split_huge_pmd_address(vma, start, false, NULL);
|
||||
/* Check if we need to split start first. */
|
||||
split_huge_pmd_if_needed(vma, start);
|
||||
|
||||
/* Check if we need to split end next. */
|
||||
split_huge_pmd_if_needed(vma, end);
|
||||
|
||||
/*
|
||||
* If the new end address isn't hpage aligned and it could
|
||||
* previously contain an hugepage: check if we need to split
|
||||
* an huge pmd.
|
||||
*/
|
||||
if (end & ~HPAGE_PMD_MASK &&
|
||||
(end & HPAGE_PMD_MASK) >= vma->vm_start &&
|
||||
(end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
|
||||
split_huge_pmd_address(vma, end, false, NULL);
|
||||
|
||||
/*
|
||||
* If we're also updating the vma->vm_next->vm_start, if the new
|
||||
* vm_next->vm_start isn't hpage aligned and it could previously
|
||||
* contain an hugepage: check if we need to split an huge pmd.
|
||||
* If we're also updating the vma->vm_next->vm_start,
|
||||
* check if we need to split it.
|
||||
*/
|
||||
if (adjust_next > 0) {
|
||||
struct vm_area_struct *next = vma->vm_next;
|
||||
unsigned long nstart = next->vm_start;
|
||||
nstart += adjust_next;
|
||||
if (nstart & ~HPAGE_PMD_MASK &&
|
||||
(nstart & HPAGE_PMD_MASK) >= next->vm_start &&
|
||||
(nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
|
||||
split_huge_pmd_address(next, nstart, false, NULL);
|
||||
split_huge_pmd_if_needed(next, nstart);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2838,8 +2830,8 @@ void deferred_split_huge_page(struct page *page)
|
|||
ds_queue->split_queue_len++;
|
||||
#ifdef CONFIG_MEMCG
|
||||
if (memcg)
|
||||
memcg_set_shrinker_bit(memcg, page_to_nid(page),
|
||||
deferred_split_shrinker.id);
|
||||
set_shrinker_bit(memcg, page_to_nid(page),
|
||||
deferred_split_shrinker.id);
|
||||
#endif
|
||||
}
|
||||
spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
|
||||
|
@ -2924,16 +2916,14 @@ static struct shrinker deferred_split_shrinker = {
|
|||
};
|
||||
|
||||
#ifdef CONFIG_DEBUG_FS
|
||||
static int split_huge_pages_set(void *data, u64 val)
|
||||
static void split_huge_pages_all(void)
|
||||
{
|
||||
struct zone *zone;
|
||||
struct page *page;
|
||||
unsigned long pfn, max_zone_pfn;
|
||||
unsigned long total = 0, split = 0;
|
||||
|
||||
if (val != 1)
|
||||
return -EINVAL;
|
||||
|
||||
pr_debug("Split all THPs\n");
|
||||
for_each_populated_zone(zone) {
|
||||
max_zone_pfn = zone_end_pfn(zone);
|
||||
for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
|
||||
|
@ -2957,15 +2947,243 @@ static int split_huge_pages_set(void *data, u64 val)
|
|||
unlock_page(page);
|
||||
next:
|
||||
put_page(page);
|
||||
cond_resched();
|
||||
}
|
||||
}
|
||||
|
||||
pr_info("%lu of %lu THP split\n", split, total);
|
||||
|
||||
return 0;
|
||||
pr_debug("%lu of %lu THP split\n", split, total);
|
||||
}
|
||||
DEFINE_DEBUGFS_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set,
|
||||
"%llu\n");
|
||||
|
||||
static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma)
|
||||
{
|
||||
return vma_is_special_huge(vma) || (vma->vm_flags & VM_IO) ||
|
||||
is_vm_hugetlb_page(vma);
|
||||
}
|
||||
|
||||
static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
|
||||
unsigned long vaddr_end)
|
||||
{
|
||||
int ret = 0;
|
||||
struct task_struct *task;
|
||||
struct mm_struct *mm;
|
||||
unsigned long total = 0, split = 0;
|
||||
unsigned long addr;
|
||||
|
||||
vaddr_start &= PAGE_MASK;
|
||||
vaddr_end &= PAGE_MASK;
|
||||
|
||||
/* Find the task_struct from pid */
|
||||
rcu_read_lock();
|
||||
task = find_task_by_vpid(pid);
|
||||
if (!task) {
|
||||
rcu_read_unlock();
|
||||
ret = -ESRCH;
|
||||
goto out;
|
||||
}
|
||||
get_task_struct(task);
|
||||
rcu_read_unlock();
|
||||
|
||||
/* Find the mm_struct */
|
||||
mm = get_task_mm(task);
|
||||
put_task_struct(task);
|
||||
|
||||
if (!mm) {
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n",
|
||||
pid, vaddr_start, vaddr_end);
|
||||
|
||||
mmap_read_lock(mm);
|
||||
/*
|
||||
* always increase addr by PAGE_SIZE, since we could have a PTE page
|
||||
* table filled with PTE-mapped THPs, each of which is distinct.
|
||||
*/
|
||||
for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
|
||||
struct vm_area_struct *vma = find_vma(mm, addr);
|
||||
unsigned int follflags;
|
||||
struct page *page;
|
||||
|
||||
if (!vma || addr < vma->vm_start)
|
||||
break;
|
||||
|
||||
/* skip special VMA and hugetlb VMA */
|
||||
if (vma_not_suitable_for_thp_split(vma)) {
|
||||
addr = vma->vm_end;
|
||||
continue;
|
||||
}
|
||||
|
||||
/* FOLL_DUMP to ignore special (like zero) pages */
|
||||
follflags = FOLL_GET | FOLL_DUMP;
|
||||
page = follow_page(vma, addr, follflags);
|
||||
|
||||
if (IS_ERR(page))
|
||||
continue;
|
||||
if (!page)
|
||||
continue;
|
||||
|
||||
if (!is_transparent_hugepage(page))
|
||||
goto next;
|
||||
|
||||
total++;
|
||||
if (!can_split_huge_page(compound_head(page), NULL))
|
||||
goto next;
|
||||
|
||||
if (!trylock_page(page))
|
||||
goto next;
|
||||
|
||||
if (!split_huge_page(page))
|
||||
split++;
|
||||
|
||||
unlock_page(page);
|
||||
next:
|
||||
put_page(page);
|
||||
cond_resched();
|
||||
}
|
||||
mmap_read_unlock(mm);
|
||||
mmput(mm);
|
||||
|
||||
pr_debug("%lu of %lu THP split\n", split, total);
|
||||
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
|
||||
pgoff_t off_end)
|
||||
{
|
||||
struct filename *file;
|
||||
struct file *candidate;
|
||||
struct address_space *mapping;
|
||||
int ret = -EINVAL;
|
||||
pgoff_t index;
|
||||
int nr_pages = 1;
|
||||
unsigned long total = 0, split = 0;
|
||||
|
||||
file = getname_kernel(file_path);
|
||||
if (IS_ERR(file))
|
||||
return ret;
|
||||
|
||||
candidate = file_open_name(file, O_RDONLY, 0);
|
||||
if (IS_ERR(candidate))
|
||||
goto out;
|
||||
|
||||
pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx]\n",
|
||||
file_path, off_start, off_end);
|
||||
|
||||
mapping = candidate->f_mapping;
|
||||
|
||||
for (index = off_start; index < off_end; index += nr_pages) {
|
||||
struct page *fpage = pagecache_get_page(mapping, index,
|
||||
FGP_ENTRY | FGP_HEAD, 0);
|
||||
|
||||
nr_pages = 1;
|
||||
if (xa_is_value(fpage) || !fpage)
|
||||
continue;
|
||||
|
||||
if (!is_transparent_hugepage(fpage))
|
||||
goto next;
|
||||
|
||||
total++;
|
||||
nr_pages = thp_nr_pages(fpage);
|
||||
|
||||
if (!trylock_page(fpage))
|
||||
goto next;
|
||||
|
||||
if (!split_huge_page(fpage))
|
||||
split++;
|
||||
|
||||
unlock_page(fpage);
|
||||
next:
|
||||
put_page(fpage);
|
||||
cond_resched();
|
||||
}
|
||||
|
||||
filp_close(candidate, NULL);
|
||||
ret = 0;
|
||||
|
||||
pr_debug("%lu of %lu file-backed THP split\n", split, total);
|
||||
out:
|
||||
putname(file);
|
||||
return ret;
|
||||
}
|
||||
|
||||
#define MAX_INPUT_BUF_SZ 255
|
||||
|
||||
static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
|
||||
size_t count, loff_t *ppops)
|
||||
{
|
||||
static DEFINE_MUTEX(split_debug_mutex);
|
||||
ssize_t ret;
|
||||
/* hold pid, start_vaddr, end_vaddr or file_path, off_start, off_end */
|
||||
char input_buf[MAX_INPUT_BUF_SZ];
|
||||
int pid;
|
||||
unsigned long vaddr_start, vaddr_end;
|
||||
|
||||
ret = mutex_lock_interruptible(&split_debug_mutex);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = -EFAULT;
|
||||
|
||||
memset(input_buf, 0, MAX_INPUT_BUF_SZ);
|
||||
if (copy_from_user(input_buf, buf, min_t(size_t, count, MAX_INPUT_BUF_SZ)))
|
||||
goto out;
|
||||
|
||||
input_buf[MAX_INPUT_BUF_SZ - 1] = '\0';
|
||||
|
||||
if (input_buf[0] == '/') {
|
||||
char *tok;
|
||||
char *buf = input_buf;
|
||||
char file_path[MAX_INPUT_BUF_SZ];
|
||||
pgoff_t off_start = 0, off_end = 0;
|
||||
size_t input_len = strlen(input_buf);
|
||||
|
||||
tok = strsep(&buf, ",");
|
||||
if (tok) {
|
||||
strncpy(file_path, tok, MAX_INPUT_BUF_SZ);
|
||||
} else {
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = sscanf(buf, "0x%lx,0x%lx", &off_start, &off_end);
|
||||
if (ret != 2) {
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
ret = split_huge_pages_in_file(file_path, off_start, off_end);
|
||||
if (!ret)
|
||||
ret = input_len;
|
||||
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = sscanf(input_buf, "%d,0x%lx,0x%lx", &pid, &vaddr_start, &vaddr_end);
|
||||
if (ret == 1 && pid == 1) {
|
||||
split_huge_pages_all();
|
||||
ret = strlen(input_buf);
|
||||
goto out;
|
||||
} else if (ret != 3) {
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end);
|
||||
if (!ret)
|
||||
ret = strlen(input_buf);
|
||||
out:
|
||||
mutex_unlock(&split_debug_mutex);
|
||||
return ret;
|
||||
|
||||
}
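For illustration, a userspace sketch that drives the write interface above. The debugfs location (/sys/kernel/debug/split_huge_pages) is an assumption on my part, since the file creation is not shown here; the "<pid>,0x<start>,0x<end>" format follows the sscanf() in split_huge_pages_write().

#include <stdio.h>

/* Assumed debugfs path of the file backed by split_huge_pages_fops. */
#define SPLIT_DEBUGFS "/sys/kernel/debug/split_huge_pages"

/* Ask the kernel to split THPs mapped by @pid in [start, end). */
static int request_thp_split(int pid, unsigned long start, unsigned long end)
{
	FILE *f = fopen(SPLIT_DEBUGFS, "w");
	int ret;

	if (!f)
		return -1;
	/* Matches the "%d,0x%lx,0x%lx" format parsed on the kernel side. */
	ret = fprintf(f, "%d,0x%lx,0x%lx", pid, start, end);
	fclose(f);
	return ret < 0 ? -1 : 0;
}

int main(void)
{
	/* Example values only. */
	return request_thp_split(1234, 0x700000000000UL, 0x700000200000UL);
}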
|
||||
|
||||
static const struct file_operations split_huge_pages_fops = {
|
||||
.owner = THIS_MODULE,
|
||||
.write = split_huge_pages_write,
|
||||
.llseek = no_llseek,
|
||||
};
|
||||
|
||||
static int __init split_huge_pages_debugfs(void)
|
||||
{
|
||||
|
mm/hugetlb.c (777 changed lines; diff suppressed because it is too large)
|
@@ -204,11 +204,11 @@ static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css)
 	do {
 		idx = 0;
 		for_each_hstate(h) {
-			spin_lock(&hugetlb_lock);
+			spin_lock_irq(&hugetlb_lock);
 			list_for_each_entry(page, &h->hugepage_activelist, lru)
 				hugetlb_cgroup_move_parent(idx, h_cg, page);
 
-			spin_unlock(&hugetlb_lock);
+			spin_unlock_irq(&hugetlb_lock);
 			idx++;
 		}
 		cond_resched();
@@ -784,8 +784,7 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
 	if (hugetlb_cgroup_disabled())
 		return;
 
-	VM_BUG_ON_PAGE(!PageHuge(oldhpage), oldhpage);
-	spin_lock(&hugetlb_lock);
+	spin_lock_irq(&hugetlb_lock);
 	h_cg = hugetlb_cgroup_from_page(oldhpage);
 	h_cg_rsvd = hugetlb_cgroup_from_page_rsvd(oldhpage);
 	set_hugetlb_cgroup(oldhpage, NULL);
@@ -795,7 +794,7 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
 	set_hugetlb_cgroup(newhpage, h_cg);
 	set_hugetlb_cgroup_rsvd(newhpage, h_cg_rsvd);
 	list_move(&newhpage->lru, &h->hugepage_activelist);
-	spin_unlock(&hugetlb_lock);
+	spin_unlock_irq(&hugetlb_lock);
 	return;
 }
|
|
@@ -244,7 +244,13 @@ struct compact_control {
 	unsigned int nr_freepages;	/* Number of isolated free pages */
 	unsigned int nr_migratepages;	/* Number of pages to migrate */
 	unsigned long free_pfn;		/* isolate_freepages search base */
-	unsigned long migrate_pfn;	/* isolate_migratepages search base */
+	/*
+	 * Acts as an in/out parameter to page isolation for migration.
+	 * isolate_migratepages uses it as a search base.
+	 * isolate_migratepages_block will update the value to the next pfn
+	 * after the last isolated one.
+	 */
+	unsigned long migrate_pfn;
 	unsigned long fast_start_pfn;	/* a pfn to start linear scan from */
 	struct zone *zone;
 	unsigned long total_migrate_scanned;
@@ -280,7 +286,7 @@ struct capture_control {
 unsigned long
 isolate_freepages_range(struct compact_control *cc,
			unsigned long start_pfn, unsigned long end_pfn);
-unsigned long
+int
 isolate_migratepages_range(struct compact_control *cc,
			   unsigned long low_pfn, unsigned long end_pfn);
 int find_suitable_fallback(struct free_area *area, unsigned int order,
|
||||
|
|
|
@ -10,6 +10,7 @@
|
|||
#include <linux/atomic.h>
|
||||
#include <linux/bug.h>
|
||||
#include <linux/debugfs.h>
|
||||
#include <linux/irq_work.h>
|
||||
#include <linux/kcsan-checks.h>
|
||||
#include <linux/kfence.h>
|
||||
#include <linux/kmemleak.h>
|
||||
|
@ -19,6 +20,7 @@
|
|||
#include <linux/moduleparam.h>
|
||||
#include <linux/random.h>
|
||||
#include <linux/rcupdate.h>
|
||||
#include <linux/sched/sysctl.h>
|
||||
#include <linux/seq_file.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/spinlock.h>
|
||||
|
@ -372,6 +374,7 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z
|
|||
|
||||
/* Restore page protection if there was an OOB access. */
|
||||
if (meta->unprotected_page) {
|
||||
memzero_explicit((void *)ALIGN_DOWN(meta->unprotected_page, PAGE_SIZE), PAGE_SIZE);
|
||||
kfence_protect(meta->unprotected_page);
|
||||
meta->unprotected_page = 0;
|
||||
}
|
||||
|
@ -586,6 +589,17 @@ late_initcall(kfence_debugfs_init);
|
|||
|
||||
/* === Allocation Gate Timer ================================================ */
|
||||
|
||||
#ifdef CONFIG_KFENCE_STATIC_KEYS
|
||||
/* Wait queue to wake up allocation-gate timer task. */
|
||||
static DECLARE_WAIT_QUEUE_HEAD(allocation_wait);
|
||||
|
||||
static void wake_up_kfence_timer(struct irq_work *work)
|
||||
{
|
||||
wake_up(&allocation_wait);
|
||||
}
|
||||
static DEFINE_IRQ_WORK(wake_up_kfence_timer_work, wake_up_kfence_timer);
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Set up delayed work, which will enable and disable the static key. We need to
|
||||
* use a work queue (rather than a simple timer), since enabling and disabling a
|
||||
|
@ -603,29 +617,27 @@ static void toggle_allocation_gate(struct work_struct *work)
|
|||
if (!READ_ONCE(kfence_enabled))
|
||||
return;
|
||||
|
||||
/* Enable static key, and await allocation to happen. */
|
||||
atomic_set(&kfence_allocation_gate, 0);
|
||||
#ifdef CONFIG_KFENCE_STATIC_KEYS
|
||||
/* Enable static key, and await allocation to happen. */
|
||||
static_branch_enable(&kfence_allocation_key);
|
||||
/*
|
||||
* Await an allocation. Timeout after 1 second, in case the kernel stops
|
||||
* doing allocations, to avoid stalling this worker task for too long.
|
||||
*/
|
||||
{
|
||||
unsigned long end_wait = jiffies + HZ;
|
||||
|
||||
do {
|
||||
set_current_state(TASK_UNINTERRUPTIBLE);
|
||||
if (atomic_read(&kfence_allocation_gate) != 0)
|
||||
break;
|
||||
schedule_timeout(1);
|
||||
} while (time_before(jiffies, end_wait));
|
||||
__set_current_state(TASK_RUNNING);
|
||||
if (sysctl_hung_task_timeout_secs) {
|
||||
/*
|
||||
* During low activity with no allocations we might wait a
|
||||
* while; let's avoid the hung task warning.
|
||||
*/
|
||||
wait_event_timeout(allocation_wait, atomic_read(&kfence_allocation_gate),
|
||||
sysctl_hung_task_timeout_secs * HZ / 2);
|
||||
} else {
|
||||
wait_event(allocation_wait, atomic_read(&kfence_allocation_gate));
|
||||
}
|
||||
|
||||
/* Disable static key and reset timer. */
|
||||
static_branch_disable(&kfence_allocation_key);
|
||||
#endif
|
||||
schedule_delayed_work(&kfence_timer, msecs_to_jiffies(kfence_sample_interval));
|
||||
queue_delayed_work(system_power_efficient_wq, &kfence_timer,
|
||||
msecs_to_jiffies(kfence_sample_interval));
|
||||
}
|
||||
static DECLARE_DELAYED_WORK(kfence_timer, toggle_allocation_gate);
|
||||
|
||||
|
@ -654,7 +666,7 @@ void __init kfence_init(void)
|
|||
}
|
||||
|
||||
WRITE_ONCE(kfence_enabled, true);
|
||||
schedule_delayed_work(&kfence_timer, 0);
|
||||
queue_delayed_work(system_power_efficient_wq, &kfence_timer, 0);
|
||||
pr_info("initialized - using %lu bytes for %d objects at 0x%p-0x%p\n", KFENCE_POOL_SIZE,
|
||||
CONFIG_KFENCE_NUM_OBJECTS, (void *)__kfence_pool,
|
||||
(void *)(__kfence_pool + KFENCE_POOL_SIZE));
|
||||
|
@ -728,6 +740,19 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
|
|||
*/
|
||||
if (atomic_read(&kfence_allocation_gate) || atomic_inc_return(&kfence_allocation_gate) > 1)
|
||||
return NULL;
|
||||
#ifdef CONFIG_KFENCE_STATIC_KEYS
|
||||
/*
|
||||
* waitqueue_active() is fully ordered after the update of
|
||||
* kfence_allocation_gate per atomic_inc_return().
|
||||
*/
|
||||
if (waitqueue_active(&allocation_wait)) {
|
||||
/*
|
||||
* Calling wake_up() here may deadlock when allocations happen
|
||||
* from within timer code. Use an irq_work to defer it.
|
||||
*/
|
||||
irq_work_queue(&wake_up_kfence_timer_work);
|
||||
}
|
||||
#endif
|
||||
|
||||
if (!READ_ONCE(kfence_enabled))
|
||||
return NULL;
|
||||
|
|
|
@ -481,7 +481,7 @@ int __khugepaged_enter(struct mm_struct *mm)
|
|||
return -ENOMEM;
|
||||
|
||||
/* __khugepaged_exit() must not run from under us */
|
||||
VM_BUG_ON_MM(atomic_read(&mm->mm_users) == 0, mm);
|
||||
VM_BUG_ON_MM(khugepaged_test_exit(mm), mm);
|
||||
if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
|
||||
free_mm_slot(mm_slot);
|
||||
return 0;
|
||||
|
@ -716,17 +716,17 @@ next:
|
|||
if (pte_write(pteval))
|
||||
writable = true;
|
||||
}
|
||||
if (likely(writable)) {
|
||||
if (likely(referenced)) {
|
||||
result = SCAN_SUCCEED;
|
||||
trace_mm_collapse_huge_page_isolate(page, none_or_zero,
|
||||
referenced, writable, result);
|
||||
return 1;
|
||||
}
|
||||
} else {
|
||||
result = SCAN_PAGE_RO;
|
||||
}
|
||||
|
||||
if (unlikely(!writable)) {
|
||||
result = SCAN_PAGE_RO;
|
||||
} else if (unlikely(!referenced)) {
|
||||
result = SCAN_LACK_REFERENCED_PAGE;
|
||||
} else {
|
||||
result = SCAN_SUCCEED;
|
||||
trace_mm_collapse_huge_page_isolate(page, none_or_zero,
|
||||
referenced, writable, result);
|
||||
return 1;
|
||||
}
|
||||
out:
|
||||
release_pte_pages(pte, _pte, compound_pagelist);
|
||||
trace_mm_collapse_huge_page_isolate(page, none_or_zero,
|
||||
|
@ -809,7 +809,7 @@ static bool khugepaged_scan_abort(int nid)
|
|||
* If node_reclaim_mode is disabled, then no extra effort is made to
|
||||
* allocate memory locally.
|
||||
*/
|
||||
if (!node_reclaim_mode)
|
||||
if (!node_reclaim_enabled())
|
||||
return false;
|
||||
|
||||
/* If there is a count for this node already, it must be acceptable */
|
||||
|
@ -1128,10 +1128,10 @@ static void collapse_huge_page(struct mm_struct *mm,
|
|||
mmap_write_lock(mm);
|
||||
result = hugepage_vma_revalidate(mm, address, &vma);
|
||||
if (result)
|
||||
goto out;
|
||||
goto out_up_write;
|
||||
/* check if the pmd is still valid */
|
||||
if (mm_find_pmd(mm, address) != pmd)
|
||||
goto out;
|
||||
goto out_up_write;
|
||||
|
||||
anon_vma_lock_write(vma->anon_vma);
|
||||
|
||||
|
@ -1171,7 +1171,7 @@ static void collapse_huge_page(struct mm_struct *mm,
|
|||
spin_unlock(pmd_ptl);
|
||||
anon_vma_unlock_write(vma->anon_vma);
|
||||
result = SCAN_FAIL;
|
||||
goto out;
|
||||
goto out_up_write;
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -1183,19 +1183,18 @@ static void collapse_huge_page(struct mm_struct *mm,
|
|||
__collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl,
|
||||
&compound_pagelist);
|
||||
pte_unmap(pte);
|
||||
/*
|
||||
* spin_lock() below is not the equivalent of smp_wmb(), but
|
||||
* the smp_wmb() inside __SetPageUptodate() can be reused to
|
||||
* avoid the copy_huge_page writes to become visible after
|
||||
* the set_pmd_at() write.
|
||||
*/
|
||||
__SetPageUptodate(new_page);
|
||||
pgtable = pmd_pgtable(_pmd);
|
||||
|
||||
_pmd = mk_huge_pmd(new_page, vma->vm_page_prot);
|
||||
_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
|
||||
|
||||
/*
|
||||
* spin_lock() below is not the equivalent of smp_wmb(), so
|
||||
* this is needed to avoid the copy_huge_page writes to become
|
||||
* visible after the set_pmd_at() write.
|
||||
*/
|
||||
smp_wmb();
|
||||
|
||||
spin_lock(pmd_ptl);
|
||||
BUG_ON(!pmd_none(*pmd));
|
||||
page_add_new_anon_rmap(new_page, vma, address, true);
|
||||
|
@ -1216,8 +1215,6 @@ out_nolock:
|
|||
mem_cgroup_uncharge(*hpage);
|
||||
trace_mm_collapse_huge_page(mm, isolated, result);
|
||||
return;
|
||||
out:
|
||||
goto out_up_write;
|
||||
}
|
||||
|
||||
static int khugepaged_scan_pmd(struct mm_struct *mm,
|
||||
|
@ -1274,10 +1271,6 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
|
|||
goto out_unmap;
|
||||
}
|
||||
}
|
||||
if (!pte_present(pteval)) {
|
||||
result = SCAN_PTE_NON_PRESENT;
|
||||
goto out_unmap;
|
||||
}
|
||||
if (pte_uffd_wp(pteval)) {
|
||||
/*
|
||||
* Don't collapse the page if any of the small
|
||||
|
@ -1447,7 +1440,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
|
|||
int i;
|
||||
|
||||
if (!vma || !vma->vm_file ||
|
||||
vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE)
|
||||
!range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE))
|
||||
return;
|
||||
|
||||
/*
|
||||
|
@ -1533,16 +1526,16 @@ abort:
|
|||
goto drop_hpage;
|
||||
}
|
||||
|
||||
static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
|
||||
static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
|
||||
{
|
||||
struct mm_struct *mm = mm_slot->mm;
|
||||
int i;
|
||||
|
||||
if (likely(mm_slot->nr_pte_mapped_thp == 0))
|
||||
return 0;
|
||||
return;
|
||||
|
||||
if (!mmap_write_trylock(mm))
|
||||
return -EBUSY;
|
||||
return;
|
||||
|
||||
if (unlikely(khugepaged_test_exit(mm)))
|
||||
goto out;
|
||||
|
@ -1553,7 +1546,6 @@ static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
|
|||
out:
|
||||
mm_slot->nr_pte_mapped_thp = 0;
|
||||
mmap_write_unlock(mm);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
|
||||
|
@ -2057,9 +2049,8 @@ static void khugepaged_scan_file(struct mm_struct *mm,
|
|||
BUILD_BUG();
|
||||
}
|
||||
|
||||
static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
|
||||
static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -2205,11 +2196,9 @@ static void khugepaged_do_scan(void)
|
|||
{
|
||||
struct page *hpage = NULL;
|
||||
unsigned int progress = 0, pass_through_head = 0;
|
||||
unsigned int pages = khugepaged_pages_to_scan;
|
||||
unsigned int pages = READ_ONCE(khugepaged_pages_to_scan);
|
||||
bool wait = true;
|
||||
|
||||
barrier(); /* write khugepaged_pages_to_scan to local stack */
|
||||
|
||||
lru_add_drain_all();
|
||||
|
||||
while (progress < pages) {
|
||||
|
|
mm/ksm.c
|
@ -215,8 +215,6 @@ struct rmap_item {
|
|||
#define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */
|
||||
#define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */
|
||||
#define STABLE_FLAG 0x200 /* is listed from the stable tree */
|
||||
#define KSM_FLAG_MASK (SEQNR_MASK|UNSTABLE_FLAG|STABLE_FLAG)
|
||||
/* to mask all the flags */
|
||||
|
||||
/* The stable and unstable tree heads */
|
||||
static struct rb_root one_stable_tree[1] = { RB_ROOT };
|
||||
|
@ -778,12 +776,11 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
|
|||
struct page *page;
|
||||
|
||||
stable_node = rmap_item->head;
|
||||
page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
|
||||
page = get_ksm_page(stable_node, GET_KSM_PAGE_NOLOCK);
|
||||
if (!page)
|
||||
goto out;
|
||||
|
||||
hlist_del(&rmap_item->hlist);
|
||||
unlock_page(page);
|
||||
put_page(page);
|
||||
|
||||
if (!hlist_empty(&stable_node->hlist))
|
||||
|
@ -794,6 +791,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
|
|||
stable_node->rmap_hlist_len--;
|
||||
|
||||
put_anon_vma(rmap_item->anon_vma);
|
||||
rmap_item->head = NULL;
|
||||
rmap_item->address &= PAGE_MASK;
|
||||
|
||||
} else if (rmap_item->address & UNSTABLE_FLAG) {
|
||||
|
@ -817,8 +815,7 @@ out:
|
|||
cond_resched(); /* we're called from many long loops */
|
||||
}
|
||||
|
||||
static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
|
||||
struct rmap_item **rmap_list)
|
||||
static void remove_trailing_rmap_items(struct rmap_item **rmap_list)
|
||||
{
|
||||
while (*rmap_list) {
|
||||
struct rmap_item *rmap_item = *rmap_list;
|
||||
|
@ -989,7 +986,7 @@ static int unmerge_and_remove_all_rmap_items(void)
|
|||
goto error;
|
||||
}
|
||||
|
||||
remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list);
|
||||
remove_trailing_rmap_items(&mm_slot->rmap_list);
|
||||
mmap_read_unlock(mm);
|
||||
|
||||
spin_lock(&ksm_mmlist_lock);
|
||||
|
@ -1771,7 +1768,6 @@ chain_append:
|
|||
* stable_node_dup is the dup to replace.
|
||||
*/
|
||||
if (stable_node_dup == stable_node) {
|
||||
VM_BUG_ON(is_stable_node_chain(stable_node_dup));
|
||||
VM_BUG_ON(is_stable_node_dup(stable_node_dup));
|
||||
/* chain is missing so create it */
|
||||
stable_node = alloc_stable_node_chain(stable_node_dup,
|
||||
|
@ -1785,7 +1781,6 @@ chain_append:
|
|||
* of the current nid for this page
|
||||
* content.
|
||||
*/
|
||||
VM_BUG_ON(!is_stable_node_chain(stable_node));
|
||||
VM_BUG_ON(!is_stable_node_dup(stable_node_dup));
|
||||
VM_BUG_ON(page_node->head != &migrate_nodes);
|
||||
list_del(&page_node->list);
|
||||
|
@ -2337,7 +2332,7 @@ next_mm:
|
|||
* Nuke all the rmap_items that are above this current rmap:
|
||||
* because there were no VM_MERGEABLE vmas with such addresses.
|
||||
*/
|
||||
remove_trailing_rmap_items(slot, ksm_scan.rmap_list);
|
||||
remove_trailing_rmap_items(ksm_scan.rmap_list);
|
||||
|
||||
spin_lock(&ksm_mmlist_lock);
|
||||
ksm_scan.mm_slot = list_entry(slot->mm_list.next,
|
||||
|
@ -2634,7 +2629,7 @@ again:
|
|||
vma = vmac->vma;
|
||||
|
||||
/* Ignore the stable/unstable/sqnr flags */
|
||||
addr = rmap_item->address & ~KSM_FLAG_MASK;
|
||||
addr = rmap_item->address & PAGE_MASK;
|
||||
|
||||
if (addr < vma->vm_start || addr >= vma->vm_end)
|
||||
continue;
|
||||
|
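The last ksm.c hunk masks rmap_item->address with PAGE_MASK instead of ~KSM_FLAG_MASK: the stable/unstable/seqnr bits all live below PAGE_SHIFT of a page-aligned address, so the plain page mask already strips every flag, and the separate flag mask can go. A standalone sketch of that encoding; the flag constants are copied from the hunk above, the address value and 4 KiB page size are assumptions for illustration.

/* Illustrative only: a page-aligned VA with per-item flags packed into
 * the low 12 bits; masking with PAGE_MASK recovers the address and drops
 * every flag/seqnr bit at once. Assumes 4 KiB pages. */
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_MASK	(~(PAGE_SIZE - 1))

#define SEQNR_MASK	0x0ff	/* low bits of unstable tree seqnr */
#define UNSTABLE_FLAG	0x100
#define STABLE_FLAG	0x200

int main(void)
{
	unsigned long va = 0x40042000UL;		/* page aligned */
	unsigned long item = va | STABLE_FLAG | (3 & SEQNR_MASK);

	printf("stored:    %#lx\n", item);
	printf("recovered: %#lx\n", item & PAGE_MASK);	/* == va */
	return 0;
}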
|
|
@ -125,8 +125,8 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item)
|
|||
list_add_tail(item, &l->list);
|
||||
/* Set shrinker bit if the first element was added */
|
||||
if (!l->nr_items++)
|
||||
memcg_set_shrinker_bit(memcg, nid,
|
||||
lru_shrinker_id(lru));
|
||||
set_shrinker_bit(memcg, nid,
|
||||
lru_shrinker_id(lru));
|
||||
nlru->nr_items++;
|
||||
spin_unlock(&nlru->lock);
|
||||
return true;
|
||||
|
@ -540,7 +540,7 @@ static void memcg_drain_list_lru_node(struct list_lru *lru, int nid,
|
|||
|
||||
if (src->nr_items) {
|
||||
dst->nr_items += src->nr_items;
|
||||
memcg_set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru));
|
||||
set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru));
|
||||
src->nr_items = 0;
|
||||
}
|
||||
|
||||
|
|
mm/memcontrol.c
|
@ -400,130 +400,6 @@ DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
|
|||
EXPORT_SYMBOL(memcg_kmem_enabled_key);
|
||||
#endif
|
||||
|
||||
static int memcg_shrinker_map_size;
|
||||
static DEFINE_MUTEX(memcg_shrinker_map_mutex);
|
||||
|
||||
static void memcg_free_shrinker_map_rcu(struct rcu_head *head)
|
||||
{
|
||||
kvfree(container_of(head, struct memcg_shrinker_map, rcu));
|
||||
}
|
||||
|
||||
static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg,
|
||||
int size, int old_size)
|
||||
{
|
||||
struct memcg_shrinker_map *new, *old;
|
||||
struct mem_cgroup_per_node *pn;
|
||||
int nid;
|
||||
|
||||
lockdep_assert_held(&memcg_shrinker_map_mutex);
|
||||
|
||||
for_each_node(nid) {
|
||||
pn = memcg->nodeinfo[nid];
|
||||
old = rcu_dereference_protected(pn->shrinker_map, true);
|
||||
/* Not yet online memcg */
|
||||
if (!old)
|
||||
return 0;
|
||||
|
||||
new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid);
|
||||
if (!new)
|
||||
return -ENOMEM;
|
||||
|
||||
/* Set all old bits, clear all new bits */
|
||||
memset(new->map, (int)0xff, old_size);
|
||||
memset((void *)new->map + old_size, 0, size - old_size);
|
||||
|
||||
rcu_assign_pointer(pn->shrinker_map, new);
|
||||
call_rcu(&old->rcu, memcg_free_shrinker_map_rcu);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void memcg_free_shrinker_maps(struct mem_cgroup *memcg)
|
||||
{
|
||||
struct mem_cgroup_per_node *pn;
|
||||
struct memcg_shrinker_map *map;
|
||||
int nid;
|
||||
|
||||
if (mem_cgroup_is_root(memcg))
|
||||
return;
|
||||
|
||||
for_each_node(nid) {
|
||||
pn = memcg->nodeinfo[nid];
|
||||
map = rcu_dereference_protected(pn->shrinker_map, true);
|
||||
kvfree(map);
|
||||
rcu_assign_pointer(pn->shrinker_map, NULL);
|
||||
}
|
||||
}
|
||||
|
||||
static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
|
||||
{
|
||||
struct memcg_shrinker_map *map;
|
||||
int nid, size, ret = 0;
|
||||
|
||||
if (mem_cgroup_is_root(memcg))
|
||||
return 0;
|
||||
|
||||
mutex_lock(&memcg_shrinker_map_mutex);
|
||||
size = memcg_shrinker_map_size;
|
||||
for_each_node(nid) {
|
||||
map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid);
|
||||
if (!map) {
|
||||
memcg_free_shrinker_maps(memcg);
|
||||
ret = -ENOMEM;
|
||||
break;
|
||||
}
|
||||
rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map);
|
||||
}
|
||||
mutex_unlock(&memcg_shrinker_map_mutex);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int memcg_expand_shrinker_maps(int new_id)
|
||||
{
|
||||
int size, old_size, ret = 0;
|
||||
struct mem_cgroup *memcg;
|
||||
|
||||
size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long);
|
||||
old_size = memcg_shrinker_map_size;
|
||||
if (size <= old_size)
|
||||
return 0;
|
||||
|
||||
mutex_lock(&memcg_shrinker_map_mutex);
|
||||
if (!root_mem_cgroup)
|
||||
goto unlock;
|
||||
|
||||
for_each_mem_cgroup(memcg) {
|
||||
if (mem_cgroup_is_root(memcg))
|
||||
continue;
|
||||
ret = memcg_expand_one_shrinker_map(memcg, size, old_size);
|
||||
if (ret) {
|
||||
mem_cgroup_iter_break(NULL, memcg);
|
||||
goto unlock;
|
||||
}
|
||||
}
|
||||
unlock:
|
||||
if (!ret)
|
||||
memcg_shrinker_map_size = size;
|
||||
mutex_unlock(&memcg_shrinker_map_mutex);
|
||||
return ret;
|
||||
}
|
||||
|
||||
void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
|
||||
{
|
||||
if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
|
||||
struct memcg_shrinker_map *map;
|
||||
|
||||
rcu_read_lock();
|
||||
map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map);
|
||||
/* Pairs with smp mb in shrink_slab() */
|
||||
smp_mb__before_atomic();
|
||||
set_bit(shrinker_id, map->map);
|
||||
rcu_read_unlock();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* mem_cgroup_css_from_page - css of the memcg associated with a page
|
||||
* @page: page of interest
|
||||
|
@ -5242,11 +5118,11 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
|
|||
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
|
||||
|
||||
/*
|
||||
* A memcg must be visible for memcg_expand_shrinker_maps()
|
||||
* A memcg must be visible for expand_shrinker_info()
|
||||
* by the time the maps are allocated. So, we allocate maps
|
||||
* here, when for_each_mem_cgroup() can't skip it.
|
||||
*/
|
||||
if (memcg_alloc_shrinker_maps(memcg)) {
|
||||
if (alloc_shrinker_info(memcg)) {
|
||||
mem_cgroup_id_remove(memcg);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
@ -5278,6 +5154,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
|
|||
page_counter_set_low(&memcg->memory, 0);
|
||||
|
||||
memcg_offline_kmem(memcg);
|
||||
reparent_shrinker_deferred(memcg);
|
||||
wb_memcg_offline(memcg);
|
||||
|
||||
drain_all_stock(memcg);
|
||||
|
@ -5310,7 +5187,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
|
|||
vmpressure_cleanup(&memcg->vmpressure);
|
||||
cancel_work_sync(&memcg->high_work);
|
||||
mem_cgroup_remove_from_trees(memcg);
|
||||
memcg_free_shrinker_maps(memcg);
|
||||
free_shrinker_info(memcg);
|
||||
memcg_free_kmem(memcg);
|
||||
mem_cgroup_free(memcg);
|
||||
}
|
||||
|
|
|
@ -42,6 +42,16 @@
|
|||
#include "internal.h"
|
||||
#include "shuffle.h"
|
||||
|
||||
|
||||
/*
|
||||
* memory_hotplug.memmap_on_memory parameter
|
||||
*/
|
||||
static bool memmap_on_memory __ro_after_init;
|
||||
#ifdef CONFIG_MHP_MEMMAP_ON_MEMORY
|
||||
module_param(memmap_on_memory, bool, 0444);
|
||||
MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug");
|
||||
#endif
|
||||
|
||||
/*
|
||||
* online_page_callback contains pointer to current page onlining function.
|
||||
* Initially it is generic_online_page(). If it is required it could be
|
||||
|
@ -648,9 +658,16 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
|
|||
* decide to not expose all pages to the buddy (e.g., expose them
|
||||
* later). We account all pages as being online and belonging to this
|
||||
* zone ("present").
|
||||
* When using memmap_on_memory, the range might not be aligned to
|
||||
* MAX_ORDER_NR_PAGES - 1, but pageblock aligned. __ffs() will detect
|
||||
* this and the first chunk to online will be pageblock_nr_pages.
|
||||
*/
|
||||
for (pfn = start_pfn; pfn < end_pfn; pfn += MAX_ORDER_NR_PAGES)
|
||||
(*online_page_callback)(pfn_to_page(pfn), MAX_ORDER - 1);
|
||||
for (pfn = start_pfn; pfn < end_pfn;) {
|
||||
int order = min(MAX_ORDER - 1UL, __ffs(pfn));
|
||||
|
||||
(*online_page_callback)(pfn_to_page(pfn), order);
|
||||
pfn += (1UL << order);
|
||||
}
|
||||
|
||||
/* mark all involved sections as online */
|
||||
online_mem_sections(start_pfn, end_pfn);
|
||||
|
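The reworked loop above onlines the range in power-of-two chunks whose order is bounded by both MAX_ORDER - 1 and the alignment of the current pfn, so a pageblock-aligned (but not MAX_ORDER-aligned) start simply produces a smaller first chunk. A standalone sketch of that arithmetic, with __ffs() modelled by __builtin_ctzl() and made-up pfn values:

/* Assumed values: MAX_ORDER - 1 == 10, pageblock order == 9. */
#include <stdio.h>

#define MAX_ORDER_M1	10UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	unsigned long start_pfn = 0x80200;	/* pageblock, not MAX_ORDER, aligned */
	unsigned long end_pfn = 0x80200 + 0x8000;

	for (unsigned long pfn = start_pfn; pfn < end_pfn; ) {
		unsigned long order = min_ul(MAX_ORDER_M1,
					     (unsigned long)__builtin_ctzl(pfn));

		printf("online pfn %#lx as order-%lu chunk (%lu pages)\n",
		       pfn, order, 1UL << order);
		pfn += 1UL << order;
	}
	return 0;
}

Run as-is it prints one order-9 chunk followed by order-10 chunks, matching the comment's claim that the first chunk to online is pageblock_nr_pages.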
@ -817,7 +834,7 @@ static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn
|
|||
return movable_node_enabled ? movable_zone : kernel_zone;
|
||||
}
|
||||
|
||||
struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
|
||||
struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
|
||||
unsigned long nr_pages)
|
||||
{
|
||||
if (online_type == MMOP_ONLINE_KERNEL)
|
||||
|
@ -829,24 +846,86 @@ struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
|
|||
return default_zone_for_pfn(nid, start_pfn, nr_pages);
|
||||
}
|
||||
|
||||
int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
|
||||
int online_type, int nid)
|
||||
/*
|
||||
* This function should only be called by memory_block_{online,offline},
|
||||
* and {online,offline}_pages.
|
||||
*/
|
||||
void adjust_present_page_count(struct zone *zone, long nr_pages)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
zone->present_pages += nr_pages;
|
||||
pgdat_resize_lock(zone->zone_pgdat, &flags);
|
||||
zone->zone_pgdat->node_present_pages += nr_pages;
|
||||
pgdat_resize_unlock(zone->zone_pgdat, &flags);
|
||||
}
|
||||
|
||||
int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
|
||||
struct zone *zone)
|
||||
{
|
||||
unsigned long end_pfn = pfn + nr_pages;
|
||||
int ret;
|
||||
|
||||
ret = kasan_add_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages));
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE);
|
||||
|
||||
/*
|
||||
* It might be that the vmemmap_pages fully span sections. If that is
|
||||
* the case, mark those sections online here as otherwise they will be
|
||||
* left offline.
|
||||
*/
|
||||
if (nr_pages >= PAGES_PER_SECTION)
|
||||
online_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION));
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages)
|
||||
{
|
||||
unsigned long end_pfn = pfn + nr_pages;
|
||||
|
||||
/*
|
||||
* It might be that the vmemmap_pages fully span sections. If that is
|
||||
* the case, mark those sections offline here as otherwise they will be
|
||||
* left online.
|
||||
*/
|
||||
if (nr_pages >= PAGES_PER_SECTION)
|
||||
offline_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION));
|
||||
|
||||
/*
|
||||
* The pages associated with this vmemmap have been offlined, so
|
||||
* we can reset its state here.
|
||||
*/
|
||||
remove_pfn_range_from_zone(page_zone(pfn_to_page(pfn)), pfn, nr_pages);
|
||||
kasan_remove_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages));
|
||||
}
|
||||
|
||||
int __ref online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *zone)
|
||||
{
|
||||
unsigned long flags;
|
||||
struct zone *zone;
|
||||
int need_zonelists_rebuild = 0;
|
||||
const int nid = zone_to_nid(zone);
|
||||
int ret;
|
||||
struct memory_notify arg;
|
||||
|
||||
/* We can only online full sections (e.g., SECTION_IS_ONLINE) */
|
||||
/*
|
||||
* {on,off}lining is constrained to full memory sections (or more
|
||||
* precisely to memory blocks from the user space POV).
|
||||
* memmap_on_memory is an exception because it reserves initial part
|
||||
* of the physical memory space for vmemmaps. That space is pageblock
|
||||
* aligned.
|
||||
*/
|
||||
if (WARN_ON_ONCE(!nr_pages ||
|
||||
!IS_ALIGNED(pfn | nr_pages, PAGES_PER_SECTION)))
|
||||
!IS_ALIGNED(pfn, pageblock_nr_pages) ||
|
||||
!IS_ALIGNED(pfn + nr_pages, PAGES_PER_SECTION)))
|
||||
return -EINVAL;
|
||||
|
||||
mem_hotplug_begin();
|
||||
|
||||
/* associate pfn range with the zone */
|
||||
zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages);
|
||||
move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE);
|
||||
|
||||
arg.start_pfn = pfn;
|
||||
|
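The relaxed WARN_ON_ONCE() check in the hunk above now only requires the start pfn to be pageblock aligned (to allow for the vmemmap carved out by memmap_on_memory), while pfn + nr_pages must still close a full section. A tiny sketch of those two tests with assumed x86-64 values (pageblock_nr_pages = 512, PAGES_PER_SECTION = 32768); the pfn values are made up.

#include <stdbool.h>
#include <stdio.h>

#define IS_ALIGNED(x, a)	(((x) & ((a) - 1)) == 0)

int main(void)
{
	unsigned long pageblock_nr_pages = 512;
	unsigned long pages_per_section = 32768;
	unsigned long pfn = 0x808000 + 512;	/* vmemmap took the first pageblock */
	unsigned long nr_pages = 32768 - 512;

	bool ok = nr_pages &&
		  IS_ALIGNED(pfn, pageblock_nr_pages) &&
		  IS_ALIGNED(pfn + nr_pages, pages_per_section);

	printf("online_pages() would %s this range\n", ok ? "accept" : "reject");
	return 0;
}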
@ -877,11 +956,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
|
|||
}
|
||||
|
||||
online_pages_range(pfn, nr_pages);
|
||||
zone->present_pages += nr_pages;
|
||||
|
||||
pgdat_resize_lock(zone->zone_pgdat, &flags);
|
||||
zone->zone_pgdat->node_present_pages += nr_pages;
|
||||
pgdat_resize_unlock(zone->zone_pgdat, &flags);
|
||||
adjust_present_page_count(zone, nr_pages);
|
||||
|
||||
node_states_set_node(nid, &arg);
|
||||
if (need_zonelists_rebuild)
|
||||
|
@ -1064,6 +1139,45 @@ static int online_memory_block(struct memory_block *mem, void *arg)
|
|||
return device_online(&mem->dev);
|
||||
}
|
||||
|
||||
bool mhp_supports_memmap_on_memory(unsigned long size)
|
||||
{
|
||||
unsigned long nr_vmemmap_pages = size / PAGE_SIZE;
|
||||
unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page);
|
||||
unsigned long remaining_size = size - vmemmap_size;
|
||||
|
||||
/*
|
||||
* Besides having arch support and the feature enabled at runtime, we
|
||||
* need a few more assumptions to hold true:
|
||||
*
|
||||
* a) We span a single memory block: memory onlining/offlining happens
|
||||
* in memory block granularity. We don't want the vmemmap of online
|
||||
* memory blocks to reside on offline memory blocks. In the future,
|
||||
* we might want to support variable-sized memory blocks to make the
|
||||
* feature more versatile.
|
||||
*
|
||||
* b) The vmemmap pages span complete PMDs: We don't want vmemmap code
|
||||
* to populate memory from the altmap for unrelated parts (i.e.,
|
||||
* other memory blocks)
|
||||
*
|
||||
* c) The vmemmap pages (and thereby the pages that will be exposed to
|
||||
* the buddy) have to cover full pageblocks: memory onlining/offlining
|
||||
* code requires applicable ranges to be page-aligned, for example, to
|
||||
* set the migratetypes properly.
|
||||
*
|
||||
* TODO: Although we have a check here to make sure that vmemmap pages
|
||||
* fully populate a PMD, it is not the right place to check for
|
||||
* this. A much better solution involves improving vmemmap code
|
||||
* to fallback to base pages when trying to populate vmemmap using
|
||||
* altmap as an alternative source of memory, and we do not exactly
|
||||
* populate a single PMD.
|
||||
*/
|
||||
return memmap_on_memory &&
|
||||
IS_ENABLED(CONFIG_MHP_MEMMAP_ON_MEMORY) &&
|
||||
size == memory_block_size_bytes() &&
|
||||
IS_ALIGNED(vmemmap_size, PMD_SIZE) &&
|
||||
IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT));
|
||||
}
|
||||
|
||||
/*
|
||||
* NOTE: The caller must call lock_device_hotplug() to serialize hotplug
|
||||
* and online/offline operations (triggered e.g. by sysfs).
|
||||
|
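To see why the alignment checks at the end of mhp_supports_memmap_on_memory() are normally satisfied, here is the arithmetic for one assumed configuration (128 MiB memory blocks, 4 KiB pages, a 64-byte struct page, 2 MiB PMDs and pageblocks); none of these values come from the patch itself.

#include <stdio.h>

int main(void)
{
	unsigned long long block = 128ULL << 20;	/* memory_block_size_bytes() */
	unsigned long long page_size = 4096;
	unsigned long long struct_page = 64;
	unsigned long long pmd_size = 2ULL << 20;
	unsigned long long pageblock_bytes = 2ULL << 20;

	unsigned long long vmemmap = (block / page_size) * struct_page;
	unsigned long long remaining = block - vmemmap;

	printf("vmemmap size:   %llu KiB (PMD aligned: %s)\n",
	       vmemmap >> 10, (vmemmap % pmd_size == 0) ? "yes" : "no");
	printf("remaining size: %llu MiB (pageblock aligned: %s)\n",
	       remaining >> 20, (remaining % pageblock_bytes == 0) ? "yes" : "no");
	return 0;
}

With these assumed numbers the vmemmap comes out at exactly 2 MiB, so both IS_ALIGNED() conditions in the hunk above hold and the feature can be used for whole memory blocks.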
@ -1073,6 +1187,7 @@ static int online_memory_block(struct memory_block *mem, void *arg)
|
|||
int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
|
||||
{
|
||||
struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
|
||||
struct vmem_altmap mhp_altmap = {};
|
||||
u64 start, size;
|
||||
bool new_node = false;
|
||||
int ret;
|
||||
|
@ -1099,13 +1214,26 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
|
|||
goto error;
|
||||
new_node = ret;
|
||||
|
||||
/*
|
||||
* Self hosted memmap array
|
||||
*/
|
||||
if (mhp_flags & MHP_MEMMAP_ON_MEMORY) {
|
||||
if (!mhp_supports_memmap_on_memory(size)) {
|
||||
ret = -EINVAL;
|
||||
goto error;
|
||||
}
|
||||
mhp_altmap.free = PHYS_PFN(size);
|
||||
mhp_altmap.base_pfn = PHYS_PFN(start);
|
||||
params.altmap = &mhp_altmap;
|
||||
}
|
||||
|
||||
/* call arch's memory hotadd */
|
||||
ret = arch_add_memory(nid, start, size, ¶ms);
|
||||
if (ret < 0)
|
||||
goto error;
|
||||
|
||||
/* create memory block devices after memory was added */
|
||||
ret = create_memory_block_devices(start, size);
|
||||
ret = create_memory_block_devices(start, size, mhp_altmap.alloc);
|
||||
if (ret) {
|
||||
arch_remove_memory(nid, start, size, NULL);
|
||||
goto error;
|
||||
|
@ -1573,9 +1701,16 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
|
|||
int ret, node;
|
||||
char *reason;
|
||||
|
||||
/* We can only offline full sections (e.g., SECTION_IS_ONLINE) */
|
||||
/*
|
||||
* {on,off}lining is constrained to full memory sections (or more
|
||||
* precisely to memory blocks from the user space POV).
|
||||
* memmap_on_memory is an exception because it reserves initial part
|
||||
* of the physical memory space for vmemmaps. That space is pageblock
|
||||
* aligned.
|
||||
*/
|
||||
if (WARN_ON_ONCE(!nr_pages ||
|
||||
!IS_ALIGNED(start_pfn | nr_pages, PAGES_PER_SECTION)))
|
||||
!IS_ALIGNED(start_pfn, pageblock_nr_pages) ||
|
||||
!IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION)))
|
||||
return -EINVAL;
|
||||
|
||||
mem_hotplug_begin();
|
||||
|
@ -1611,6 +1746,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
|
|||
* in a way that pages from isolated pageblock are left on pcplists.
|
||||
*/
|
||||
zone_pcp_disable(zone);
|
||||
lru_cache_disable();
|
||||
|
||||
/* set above range as isolated */
|
||||
ret = start_isolate_page_range(start_pfn, end_pfn,
|
||||
|
@ -1642,7 +1778,6 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
|
|||
}
|
||||
|
||||
cond_resched();
|
||||
lru_add_drain_all();
|
||||
|
||||
ret = scan_movable_pages(pfn, end_pfn, &pfn);
|
||||
if (!ret) {
|
||||
|
@ -1687,15 +1822,12 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
|
|||
zone->nr_isolate_pageblock -= nr_pages / pageblock_nr_pages;
|
||||
spin_unlock_irqrestore(&zone->lock, flags);
|
||||
|
||||
lru_cache_enable();
|
||||
zone_pcp_enable(zone);
|
||||
|
||||
/* removal success */
|
||||
adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages);
|
||||
zone->present_pages -= nr_pages;
|
||||
|
||||
pgdat_resize_lock(zone->zone_pgdat, &flags);
|
||||
zone->zone_pgdat->node_present_pages -= nr_pages;
|
||||
pgdat_resize_unlock(zone->zone_pgdat, &flags);
|
||||
adjust_present_page_count(zone, -nr_pages);
|
||||
|
||||
init_per_zone_wmark_min();
|
||||
|
||||
|
@ -1750,6 +1882,14 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
|
|||
return 0;
|
||||
}
|
||||
|
||||
static int get_nr_vmemmap_pages_cb(struct memory_block *mem, void *arg)
|
||||
{
|
||||
/*
|
||||
* If not set, continue with the next block.
|
||||
*/
|
||||
return mem->nr_vmemmap_pages;
|
||||
}
|
||||
|
||||
static int check_cpu_on_node(pg_data_t *pgdat)
|
||||
{
|
||||
int cpu;
|
||||
|
@ -1824,6 +1964,9 @@ EXPORT_SYMBOL(try_offline_node);
|
|||
static int __ref try_remove_memory(int nid, u64 start, u64 size)
|
||||
{
|
||||
int rc = 0;
|
||||
struct vmem_altmap mhp_altmap = {};
|
||||
struct vmem_altmap *altmap = NULL;
|
||||
unsigned long nr_vmemmap_pages;
|
||||
|
||||
BUG_ON(check_hotplug_memory_range(start, size));
|
||||
|
||||
|
@ -1836,6 +1979,31 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
|
|||
if (rc)
|
||||
return rc;
|
||||
|
||||
/*
|
||||
* We only support removing memory added with MHP_MEMMAP_ON_MEMORY in
|
||||
* the same granularity it was added - a single memory block.
|
||||
*/
|
||||
if (memmap_on_memory) {
|
||||
nr_vmemmap_pages = walk_memory_blocks(start, size, NULL,
|
||||
get_nr_vmemmap_pages_cb);
|
||||
if (nr_vmemmap_pages) {
|
||||
if (size != memory_block_size_bytes()) {
|
||||
pr_warn("Refuse to remove %#llx - %#llx,"
|
||||
"wrong granularity\n",
|
||||
start, start + size);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Let remove_pmd_table->free_hugepage_table do the
|
||||
* right thing if we used vmem_altmap when hot-adding
|
||||
* the range.
|
||||
*/
|
||||
mhp_altmap.alloc = nr_vmemmap_pages;
|
||||
altmap = &mhp_altmap;
|
||||
}
|
||||
}
|
||||
|
||||
/* remove memmap entry */
|
||||
firmware_map_remove(start, start + size, "System RAM");
|
||||
|
||||
|
@ -1847,7 +2015,7 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
|
|||
|
||||
mem_hotplug_begin();
|
||||
|
||||
arch_remove_memory(nid, start, size, NULL);
|
||||
arch_remove_memory(nid, start, size, altmap);
|
||||
|
||||
if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
|
||||
memblock_free(start, size);
|
||||
|
|
|
@ -330,7 +330,7 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
|
|||
else if (pol->flags & MPOL_F_RELATIVE_NODES)
|
||||
mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
|
||||
else {
|
||||
nodes_remap(tmp, pol->v.nodes,pol->w.cpuset_mems_allowed,
|
||||
nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
|
||||
*nodes);
|
||||
pol->w.cpuset_mems_allowed = *nodes;
|
||||
}
|
||||
|
@ -1124,7 +1124,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
|
|||
int err = 0;
|
||||
nodemask_t tmp;
|
||||
|
||||
migrate_prep();
|
||||
lru_cache_disable();
|
||||
|
||||
mmap_read_lock(mm);
|
||||
|
||||
|
@ -1161,7 +1161,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
|
|||
|
||||
tmp = *from;
|
||||
while (!nodes_empty(tmp)) {
|
||||
int s,d;
|
||||
int s, d;
|
||||
int source = NUMA_NO_NODE;
|
||||
int dest = 0;
|
||||
|
||||
|
@ -1208,6 +1208,8 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
|
|||
break;
|
||||
}
|
||||
mmap_read_unlock(mm);
|
||||
|
||||
lru_cache_enable();
|
||||
if (err < 0)
|
||||
return err;
|
||||
return busy;
|
||||
|
@ -1323,7 +1325,7 @@ static long do_mbind(unsigned long start, unsigned long len,
|
|||
|
||||
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
|
||||
|
||||
migrate_prep();
|
||||
lru_cache_disable();
|
||||
}
|
||||
{
|
||||
NODEMASK_SCRATCH(scratch);
|
||||
|
@ -1371,6 +1373,8 @@ up_out:
|
|||
mmap_write_unlock(mm);
|
||||
mpol_out:
|
||||
mpol_put(new);
|
||||
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
|
||||
lru_cache_enable();
|
||||
return err;
|
||||
}
|
||||
|
||||
|
|
|
@ -251,7 +251,7 @@ EXPORT_SYMBOL(mempool_init);
|
|||
mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
|
||||
mempool_free_t *free_fn, void *pool_data)
|
||||
{
|
||||
return mempool_create_node(min_nr,alloc_fn,free_fn, pool_data,
|
||||
return mempool_create_node(min_nr, alloc_fn, free_fn, pool_data,
|
||||
GFP_KERNEL, NUMA_NO_NODE);
|
||||
}
|
||||
EXPORT_SYMBOL(mempool_create);
|
||||
|
|