Merge branch 'kasong/tk5/0001/emm' into 'master' (merge request !45)
EKS and EMM Support. The first 4 commits and the last 9 commits are for EKS; the rest are for the EMM modular interface and core update. Stress tested with MySQL + UMRD, except for the last 9 commits.
commit 4e5f174036
@@ -829,6 +829,14 @@ config KVM_GUEST
	  underlying device model, the host provides the guest with
	  timing infrastructure such as time of day, and system time

config KVM_FORCE_PVCLOCK
	bool "Force using pvclock"
	depends on KVM_GUEST
	default n
	help
	  Use pvclock even if host tell us not to, don't select this unless you
	  know what you are doing.

config ARCH_CPUIDLE_HALTPOLL
	def_bool n
	prompt "Disable host haltpoll when loading haltpoll driver"

@@ -23,6 +23,13 @@ CONFIG_LOG_BUF_SHIFT=19
CONFIG_NUMA_BALANCING=y
# CONFIG_NUMA_BALANCING_DEFAULT_ENABLED is not set
CONFIG_MEMCG=y
CONFIG_ENHANCED_MM=y
CONFIG_EMM_FORCE_SWAPPINESS=y
CONFIG_EMM_RAMDISK_SWAP=y
CONFIG_EMM_WORKINGSET_TRACKING=y
CONFIG_EMM_MEMCG=y
CONFIG_EMM_RECLAIM=y
CONFIG_EMM_ZRAM_CONF=y
CONFIG_BLK_CGROUP=y
CONFIG_CFS_BANDWIDTH=y
CONFIG_RT_GROUP_SCHED=y
@@ -57,7 +64,7 @@ CONFIG_PVH=y
CONFIG_PARAVIRT_TIME_ACCOUNTING=y
CONFIG_JAILHOUSE_GUEST=y
CONFIG_GART_IOMMU=y
CONFIG_MAXSMP=y
CONFIG_NR_CPUS=8192
CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y
CONFIG_X86_MCELOG_LEGACY=y
CONFIG_X86_MCE_INJECT=m
@@ -66,6 +73,7 @@ CONFIG_X86_MSR=y
CONFIG_X86_CPUID=y
CONFIG_AMD_MEM_ENCRYPT=y
CONFIG_NUMA=y
CONFIG_NODES_SHIFT=8
CONFIG_ARCH_MEMORY_PROBE=y
CONFIG_X86_CHECK_BIOS_CORRUPTION=y
# CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK is not set
@@ -1347,6 +1355,7 @@ CONFIG_CRYPTO_DEV_VIRTIO=m
CONFIG_CORDIC=m
CONFIG_CRC7=m
CONFIG_LIBCRC32C=y
CONFIG_CPUMASK_OFFSTACK=y
CONFIG_PRINTK_TIME=y
CONFIG_BOOT_PRINTK_DELAY=y
CONFIG_DYNAMIC_DEBUG=y
@@ -1358,6 +1367,7 @@ CONFIG_STACK_VALIDATION=y
CONFIG_MAGIC_SYSRQ=y
CONFIG_SCHED_STACK_END_CHECK=y
CONFIG_DEBUG_MEMORY_INIT=y
CONFIG_DEBUG_PER_CPU_MAPS=y
CONFIG_DEBUG_SHIRQ=y
CONFIG_PANIC_ON_OOPS=y
CONFIG_HARDLOCKUP_DETECTOR=y

@@ -225,7 +225,8 @@ static u64 vread_pvclock(void)
	do {
		version = pvclock_read_begin(pvti);

		if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT)))
		if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT)) &&
		    !IS_ENABLED(CONFIG_KVM_FORCE_PVCLOCK))
			return U64_MAX;

		ret = __pvclock_read_cycles(pvti, rdtsc_ordered());

@@ -250,8 +250,13 @@ static int __init kvm_setup_vsyscall_timeinfo(void)
	u8 flags;

	flags = pvclock_read_flags(&hv_clock_boot[0].pvti);
	if (!(flags & PVCLOCK_TSC_STABLE_BIT))
	if (!(flags & PVCLOCK_TSC_STABLE_BIT)) {
		if (IS_ENABLED(CONFIG_KVM_FORCE_PVCLOCK)) {
			pr_info("Forcing vclock_mode = VCLOCK_PVCLOCK\n");
			kvm_clock.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK;
		}
		return 0;
	}

	kvm_clock.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK;
}

block/bdev.c (77 changes)
@@ -287,6 +287,83 @@ out:
}
EXPORT_SYMBOL(thaw_bdev);

/**
 * bdev_swapin_folio() - Start reading a folio from a block device
 * @bdev: The device to read the folio from
 * @sector: The offset on the device to read the folio to (need not be aligned)
 * @folio: The folio to read
 *
 * On entry, the folio should be locked. It will be unlocked when the folio
 * has been read. If the block driver implements swap_folio synchronously,
 * that will be true on exit from this function, but it need not be.
 *
 * Errors returned by this function are usually "soft", eg out of memory, or
 * queue full; callers should try a different route to read this folio rather
 * than propagate an error back up the stack.
 *
 * Return: negative errno if an error occurs, 0 if submission was successful.
 */
int bdev_swapin_folio(struct block_device *bdev, sector_t sector,
		struct folio *folio)
{
	const struct block_device_operations *ops = bdev->bd_disk->fops;
	int result;

	if (!ops->swap_folio || bdev_get_integrity(bdev))
		return -EOPNOTSUPP;

	result = blk_queue_enter(bdev_get_queue(bdev), 0);
	if (result)
		return -EOPNOTSUPP;
	result = ops->swap_folio(bdev, sector + get_start_sect(bdev), folio,
			REQ_OP_READ);
	blk_queue_exit(bdev_get_queue(bdev));
	return result;
}

/**
 * bdev_swapout_folio() - Start writing a folio to a block device
 * @bdev: The device to write the folio to
 * @sector: The offset on the device to write the folio to (need not be aligned)
 * @folio: The folio to write
 * @wbc: The writeback_control for the write
 *
 * On entry, the folio should be locked and not currently under writeback.
 * On exit, if the write started successfully, the folio will be unlocked and
 * under writeback. If the write failed already (eg the driver failed to
 * queue the folio to the device), the folio will still be locked. If the
 * caller is a ->writefolio implementation, it will need to unlock the folio.
 *
 * Errors returned by this function are usually "soft", eg out of memory, or
 * queue full; callers should try a different route to write this folio rather
 * than propagate an error back up the stack.
 *
 * Return: negative errno if an error occurs, 0 if submission was successful.
 */
int bdev_swapout_folio(struct block_device *bdev, sector_t sector,
		struct folio *folio, struct writeback_control *wbc)
{
	int result;
	const struct block_device_operations *ops = bdev->bd_disk->fops;

	if (!ops->swap_folio || bdev_get_integrity(bdev))
		return -EOPNOTSUPP;
	result = blk_queue_enter(bdev_get_queue(bdev), 0);
	if (result)
		return -EOPNOTSUPP;

	folio_start_writeback(folio);
	result = ops->swap_folio(bdev, sector + get_start_sect(bdev), folio,
			REQ_OP_WRITE);
	if (result) {
		folio_end_writeback(folio);
	} else {
		folio_unlock(folio);
	}
	blk_queue_exit(bdev_get_queue(bdev));
	return result;
}

/*
 * pseudo-fs
 */

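Editor's note: the kernel-doc above treats any error from bdev_swapin_folio()/bdev_swapout_folio() as "soft", so a caller is expected to fall back to its normal bio-based path. A minimal call-side sketch under that assumption follows; the caller and its submit_swap_bio_write() fallback are hypothetical and not part of this diff.

#include <linux/blkdev.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>

void submit_swap_bio_write(struct folio *folio, struct writeback_control *wbc); /* assumed fallback, not a real API */

/* Hypothetical caller: try the device's swap_folio hook first and fall
 * back to the regular submission path on any "soft" failure.
 */
static void example_swap_writeout(struct block_device *bdev, sector_t sector,
				  struct folio *folio,
				  struct writeback_control *wbc)
{
	if (bdev_swapout_folio(bdev, sector, folio, wbc) == 0)
		return;	/* folio is now unlocked and under writeback */

	/*
	 * Per the comment above, the folio is still locked and not under
	 * writeback at this point, so the ordinary bio-based path can
	 * take over unchanged.
	 */
	submit_swap_bio_write(folio, wbc);
}
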
@@ -103,6 +103,10 @@ endif
# All params enabled by default (except kABI check, see below), ENABLED overrides DEFAULT_DISABLE.
DISABLED=$(DEFAULT_DISABLED)
ENABLED=$(DEFAULT_ENABLED)
# Automatically disable non core package for non standard build
ifneq ($(CONFIG),generic-release)
DISABLED=ofed bpftool perf tools
endif

## A few shortcut for commonly used params:
# Disable KABI check by default

@ -0,0 +1,985 @@
|
|||
# CONFIG_LOCALVERSION_AUTO is not set
|
||||
CONFIG_KERNEL_LZ4=y
|
||||
CONFIG_DEFAULT_HOSTNAME="eks-tkex"
|
||||
CONFIG_SYSVIPC=y
|
||||
CONFIG_POSIX_MQUEUE=y
|
||||
CONFIG_NO_HZ=y
|
||||
CONFIG_HIGH_RES_TIMERS=y
|
||||
CONFIG_BPF_SYSCALL=y
|
||||
CONFIG_BPF_JIT=y
|
||||
CONFIG_BPF_JIT_ALWAYS_ON=y
|
||||
CONFIG_PREEMPT_VOLUNTARY=y
|
||||
CONFIG_IRQ_TIME_ACCOUNTING=y
|
||||
CONFIG_BSD_PROCESS_ACCT=y
|
||||
CONFIG_BSD_PROCESS_ACCT_V3=y
|
||||
CONFIG_PSI=y
|
||||
CONFIG_IKCONFIG=y
|
||||
CONFIG_IKCONFIG_PROC=y
|
||||
CONFIG_IKHEADERS=y
|
||||
CONFIG_LOG_BUF_SHIFT=19
|
||||
CONFIG_NUMA_BALANCING=y
|
||||
# CONFIG_NUMA_BALANCING_DEFAULT_ENABLED is not set
|
||||
CONFIG_MEMCG=y
|
||||
CONFIG_BLK_CGROUP=y
|
||||
CONFIG_CFS_BANDWIDTH=y
|
||||
CONFIG_RT_GROUP_SCHED=y
|
||||
CONFIG_CGROUP_PIDS=y
|
||||
CONFIG_CGROUP_FREEZER=y
|
||||
CONFIG_CGROUP_HUGETLB=y
|
||||
CONFIG_CPUSETS=y
|
||||
CONFIG_CGROUP_DEVICE=y
|
||||
CONFIG_CGROUP_CPUACCT=y
|
||||
CONFIG_CGROUP_PERF=y
|
||||
CONFIG_CGROUP_BPF=y
|
||||
CONFIG_CGROUP_MISC=y
|
||||
CONFIG_NAMESPACES=y
|
||||
CONFIG_USER_NS=y
|
||||
CONFIG_CHECKPOINT_RESTORE=y
|
||||
CONFIG_SCHED_AUTOGROUP=y
|
||||
CONFIG_BLK_DEV_INITRD=y
|
||||
CONFIG_EXPERT=y
|
||||
CONFIG_PROFILING=y
|
||||
CONFIG_KEXEC=y
|
||||
CONFIG_KEXEC_FILE=y
|
||||
CONFIG_CRASH_DUMP=y
|
||||
CONFIG_SMP=y
|
||||
CONFIG_X86_X2APIC=y
|
||||
CONFIG_X86_CPU_RESCTRL=y
|
||||
CONFIG_X86_AMD_PLATFORM_DEVICE=y
|
||||
CONFIG_HYPERVISOR_GUEST=y
|
||||
CONFIG_PARAVIRT=y
|
||||
CONFIG_PARAVIRT_SPINLOCKS=y
|
||||
CONFIG_KVM_FORCE_PVCLOCK=y
|
||||
CONFIG_PARAVIRT_TIME_ACCOUNTING=y
|
||||
CONFIG_MAXSMP=y
|
||||
CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y
|
||||
CONFIG_X86_MCELOG_LEGACY=y
|
||||
CONFIG_X86_MCE_INJECT=m
|
||||
CONFIG_PERF_EVENTS_AMD_POWER=y
|
||||
CONFIG_X86_MSR=y
|
||||
CONFIG_X86_CPUID=y
|
||||
CONFIG_NUMA=y
|
||||
# CONFIG_MTRR is not set
|
||||
CONFIG_EFI=y
|
||||
CONFIG_EFI_STUB=y
|
||||
# CONFIG_RANDOMIZE_BASE is not set
|
||||
CONFIG_PHYSICAL_ALIGN=0x1000000
|
||||
CONFIG_COMPAT_VDSO=y
|
||||
CONFIG_LIVEPATCH=y
|
||||
# CONFIG_RETPOLINE is not set
|
||||
# CONFIG_ACPI_SPCR_TABLE is not set
|
||||
# CONFIG_ACPI_REV_OVERRIDE_POSSIBLE is not set
|
||||
CONFIG_ACPI_EC_DEBUGFS=m
|
||||
# CONFIG_ACPI_AC is not set
|
||||
# CONFIG_ACPI_BATTERY is not set
|
||||
CONFIG_ACPI_BUTTON=m
|
||||
# CONFIG_ACPI_FAN is not set
|
||||
CONFIG_ACPI_IPMI=m
|
||||
CONFIG_ACPI_PROCESSOR_AGGREGATOR=m
|
||||
# CONFIG_ACPI_THERMAL is not set
|
||||
# CONFIG_ACPI_TABLE_UPGRADE is not set
|
||||
CONFIG_ACPI_PCI_SLOT=y
|
||||
CONFIG_ACPI_SBS=m
|
||||
CONFIG_ACPI_CUSTOM_METHOD=m
|
||||
CONFIG_ACPI_APEI=y
|
||||
CONFIG_ACPI_APEI_GHES=y
|
||||
CONFIG_ACPI_APEI_MEMORY_FAILURE=y
|
||||
CONFIG_ACPI_APEI_EINJ=m
|
||||
CONFIG_ACPI_APEI_ERST_DEBUG=m
|
||||
CONFIG_CPU_FREQ_GOV_POWERSAVE=y
|
||||
CONFIG_CPU_FREQ_GOV_USERSPACE=y
|
||||
CONFIG_CPU_FREQ_GOV_ONDEMAND=y
|
||||
CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y
|
||||
CONFIG_X86_PCC_CPUFREQ=m
|
||||
CONFIG_X86_ACPI_CPUFREQ=m
|
||||
# CONFIG_X86_ACPI_CPUFREQ_CPB is not set
|
||||
CONFIG_X86_POWERNOW_K8=m
|
||||
CONFIG_X86_AMD_FREQ_SENSITIVITY=m
|
||||
CONFIG_CPU_IDLE_GOV_LADDER=y
|
||||
CONFIG_INTEL_IDLE=y
|
||||
CONFIG_IA32_EMULATION=y
|
||||
CONFIG_KVM=m
|
||||
CONFIG_KVM_INTEL=m
|
||||
CONFIG_KVM_AMD=m
|
||||
# CONFIG_KVM_AMD_SEV is not set
|
||||
CONFIG_KPROBES=y
|
||||
CONFIG_JUMP_LABEL=y
|
||||
CONFIG_MODULES=y
|
||||
CONFIG_MODULE_UNLOAD=y
|
||||
CONFIG_MODVERSIONS=y
|
||||
CONFIG_BLK_DEV_THROTTLING=y
|
||||
CONFIG_BLK_WBT=y
|
||||
CONFIG_BLK_CGROUP_IOLATENCY=y
|
||||
CONFIG_BLK_CGROUP_IOCOST=y
|
||||
CONFIG_PARTITION_ADVANCED=y
|
||||
CONFIG_LDM_PARTITION=y
|
||||
CONFIG_IOSCHED_BFQ=y
|
||||
CONFIG_BINFMT_MISC=m
|
||||
CONFIG_ZSMALLOC_STAT=y
|
||||
# CONFIG_SLAB_MERGE_DEFAULT is not set
|
||||
# CONFIG_COMPAT_BRK is not set
|
||||
CONFIG_KSM=y
|
||||
CONFIG_MEMORY_FAILURE=y
|
||||
CONFIG_HWPOISON_INJECT=m
|
||||
CONFIG_TRANSPARENT_HUGEPAGE=y
|
||||
CONFIG_TRANSPARENT_HUGEPAGE_MADVISE=y
|
||||
CONFIG_READ_ONLY_THP_FOR_FS=y
|
||||
CONFIG_USERFAULTFD=y
|
||||
CONFIG_LRU_GEN=y
|
||||
CONFIG_DAMON=y
|
||||
CONFIG_DAMON_PADDR=y
|
||||
CONFIG_ENHANCED_MM=y
|
||||
CONFIG_EMM_FORCE_SWAPPINESS=y
|
||||
CONFIG_EMM_RAMDISK_SWAP=y
|
||||
CONFIG_EMM_WORKINGSET_TRACKING=y
|
||||
CONFIG_EMM_ZRAM_CONF=y
|
||||
CONFIG_TEXT_UNEVICTABLE=y
|
||||
CONFIG_NET=y
|
||||
CONFIG_PACKET=y
|
||||
CONFIG_PACKET_DIAG=m
|
||||
CONFIG_UNIX=y
|
||||
CONFIG_UNIX_DIAG=m
|
||||
CONFIG_TLS=m
|
||||
CONFIG_TLS_DEVICE=y
|
||||
CONFIG_XFRM_USER=y
|
||||
CONFIG_XFRM_SUB_POLICY=y
|
||||
CONFIG_XFRM_STATISTICS=y
|
||||
CONFIG_NET_KEY=m
|
||||
CONFIG_NET_KEY_MIGRATE=y
|
||||
CONFIG_XDP_SOCKETS=y
|
||||
CONFIG_XDP_SOCKETS_DIAG=m
|
||||
CONFIG_INET=y
|
||||
CONFIG_IP_MULTICAST=y
|
||||
CONFIG_IP_ADVANCED_ROUTER=y
|
||||
CONFIG_IP_FIB_TRIE_STATS=y
|
||||
CONFIG_IP_MULTIPLE_TABLES=y
|
||||
CONFIG_IP_ROUTE_MULTIPATH=y
|
||||
CONFIG_IP_ROUTE_VERBOSE=y
|
||||
CONFIG_NET_IPIP=m
|
||||
CONFIG_NET_IPGRE_DEMUX=m
|
||||
CONFIG_NET_IPGRE=m
|
||||
CONFIG_NET_IPGRE_BROADCAST=y
|
||||
CONFIG_IP_MROUTE=y
|
||||
CONFIG_IP_MROUTE_MULTIPLE_TABLES=y
|
||||
CONFIG_IP_PIMSM_V1=y
|
||||
CONFIG_IP_PIMSM_V2=y
|
||||
CONFIG_NET_IPVTI=m
|
||||
CONFIG_INET_AH=m
|
||||
CONFIG_INET_ESP=m
|
||||
CONFIG_INET_ESP_OFFLOAD=m
|
||||
CONFIG_INET_IPCOMP=m
|
||||
CONFIG_INET_DIAG=m
|
||||
CONFIG_INET_UDP_DIAG=m
|
||||
CONFIG_INET_RAW_DIAG=m
|
||||
CONFIG_TCP_CONG_ADVANCED=y
|
||||
CONFIG_TCP_CONG_HSTCP=m
|
||||
CONFIG_TCP_CONG_HYBLA=m
|
||||
CONFIG_TCP_CONG_NV=m
|
||||
CONFIG_TCP_CONG_SCALABLE=m
|
||||
CONFIG_TCP_CONG_LP=m
|
||||
CONFIG_TCP_CONG_VENO=m
|
||||
CONFIG_TCP_CONG_YEAH=m
|
||||
CONFIG_TCP_CONG_ILLINOIS=m
|
||||
CONFIG_TCP_CONG_DCTCP=m
|
||||
CONFIG_TCP_CONG_CDG=m
|
||||
CONFIG_TCP_CONG_BBR=m
|
||||
CONFIG_TCP_MD5SIG=y
|
||||
CONFIG_IPV6_ROUTER_PREF=y
|
||||
CONFIG_IPV6_ROUTE_INFO=y
|
||||
CONFIG_IPV6_OPTIMISTIC_DAD=y
|
||||
CONFIG_INET6_AH=m
|
||||
CONFIG_INET6_ESP=m
|
||||
CONFIG_INET6_ESP_OFFLOAD=m
|
||||
CONFIG_INET6_IPCOMP=m
|
||||
CONFIG_IPV6_MIP6=m
|
||||
CONFIG_IPV6_SIT=m
|
||||
CONFIG_IPV6_SIT_6RD=y
|
||||
CONFIG_IPV6_GRE=m
|
||||
CONFIG_IPV6_MULTIPLE_TABLES=y
|
||||
CONFIG_IPV6_MROUTE=y
|
||||
CONFIG_IPV6_MROUTE_MULTIPLE_TABLES=y
|
||||
CONFIG_IPV6_PIMSM_V2=y
|
||||
CONFIG_NETWORK_SECMARK=y
|
||||
CONFIG_NETWORK_PHY_TIMESTAMPING=y
|
||||
CONFIG_NETFILTER=y
|
||||
CONFIG_BRIDGE_NETFILTER=y
|
||||
CONFIG_NF_CONNTRACK=m
|
||||
CONFIG_NF_CONNTRACK_SECMARK=y
|
||||
CONFIG_NF_CONNTRACK_ZONES=y
|
||||
CONFIG_NF_CONNTRACK_PROCFS=y
|
||||
CONFIG_NF_CONNTRACK_EVENTS=y
|
||||
CONFIG_NF_CONNTRACK_TIMEOUT=y
|
||||
CONFIG_NF_CONNTRACK_TIMESTAMP=y
|
||||
CONFIG_NF_CONNTRACK_AMANDA=m
|
||||
CONFIG_NF_CONNTRACK_FTP=m
|
||||
CONFIG_NF_CONNTRACK_H323=m
|
||||
CONFIG_NF_CONNTRACK_IRC=m
|
||||
CONFIG_NF_CONNTRACK_NETBIOS_NS=m
|
||||
CONFIG_NF_CONNTRACK_SNMP=m
|
||||
CONFIG_NF_CONNTRACK_PPTP=m
|
||||
CONFIG_NF_CONNTRACK_SANE=m
|
||||
CONFIG_NF_CONNTRACK_SIP=m
|
||||
CONFIG_NF_CONNTRACK_TFTP=m
|
||||
CONFIG_NF_CT_NETLINK=m
|
||||
CONFIG_NF_CT_NETLINK_TIMEOUT=m
|
||||
CONFIG_NF_TABLES=m
|
||||
CONFIG_NF_TABLES_INET=y
|
||||
CONFIG_NF_TABLES_NETDEV=y
|
||||
CONFIG_NFT_NUMGEN=m
|
||||
CONFIG_NFT_CT=m
|
||||
CONFIG_NFT_FLOW_OFFLOAD=m
|
||||
CONFIG_NFT_CONNLIMIT=m
|
||||
CONFIG_NFT_LOG=m
|
||||
CONFIG_NFT_LIMIT=m
|
||||
CONFIG_NFT_MASQ=m
|
||||
CONFIG_NFT_REDIR=m
|
||||
CONFIG_NFT_NAT=m
|
||||
CONFIG_NFT_TUNNEL=m
|
||||
CONFIG_NFT_QUEUE=m
|
||||
CONFIG_NFT_QUOTA=m
|
||||
CONFIG_NFT_REJECT=m
|
||||
CONFIG_NFT_COMPAT=m
|
||||
CONFIG_NFT_HASH=m
|
||||
CONFIG_NFT_FIB_INET=m
|
||||
CONFIG_NFT_XFRM=m
|
||||
CONFIG_NFT_SOCKET=m
|
||||
CONFIG_NFT_OSF=m
|
||||
CONFIG_NFT_TPROXY=m
|
||||
CONFIG_NFT_SYNPROXY=m
|
||||
CONFIG_NFT_DUP_NETDEV=m
|
||||
CONFIG_NFT_FWD_NETDEV=m
|
||||
CONFIG_NFT_FIB_NETDEV=m
|
||||
CONFIG_NF_FLOW_TABLE_INET=m
|
||||
CONFIG_NF_FLOW_TABLE=m
|
||||
CONFIG_NETFILTER_XTABLES=y
|
||||
CONFIG_NETFILTER_XT_SET=m
|
||||
CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m
|
||||
CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m
|
||||
CONFIG_NETFILTER_XT_TARGET_CONNMARK=m
|
||||
CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=m
|
||||
CONFIG_NETFILTER_XT_TARGET_DSCP=m
|
||||
CONFIG_NETFILTER_XT_TARGET_HMARK=m
|
||||
CONFIG_NETFILTER_XT_TARGET_IDLETIMER=m
|
||||
CONFIG_NETFILTER_XT_TARGET_LOG=m
|
||||
CONFIG_NETFILTER_XT_TARGET_MARK=m
|
||||
CONFIG_NETFILTER_XT_TARGET_NETMAP=m
|
||||
CONFIG_NETFILTER_XT_TARGET_NFLOG=m
|
||||
CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m
|
||||
CONFIG_NETFILTER_XT_TARGET_NOTRACK=m
|
||||
CONFIG_NETFILTER_XT_TARGET_TEE=m
|
||||
CONFIG_NETFILTER_XT_TARGET_TPROXY=m
|
||||
CONFIG_NETFILTER_XT_TARGET_TRACE=m
|
||||
CONFIG_NETFILTER_XT_TARGET_SECMARK=m
|
||||
CONFIG_NETFILTER_XT_TARGET_TCPMSS=m
|
||||
CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m
|
||||
CONFIG_NETFILTER_XT_MATCH_ADDRTYPE=m
|
||||
CONFIG_NETFILTER_XT_MATCH_BPF=m
|
||||
CONFIG_NETFILTER_XT_MATCH_CGROUP=m
|
||||
CONFIG_NETFILTER_XT_MATCH_CLUSTER=m
|
||||
CONFIG_NETFILTER_XT_MATCH_COMMENT=m
|
||||
CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m
|
||||
CONFIG_NETFILTER_XT_MATCH_CONNLABEL=m
|
||||
CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m
|
||||
CONFIG_NETFILTER_XT_MATCH_CONNMARK=m
|
||||
CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m
|
||||
CONFIG_NETFILTER_XT_MATCH_CPU=m
|
||||
CONFIG_NETFILTER_XT_MATCH_DEVGROUP=m
|
||||
CONFIG_NETFILTER_XT_MATCH_DSCP=m
|
||||
CONFIG_NETFILTER_XT_MATCH_ESP=m
|
||||
CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m
|
||||
CONFIG_NETFILTER_XT_MATCH_HELPER=m
|
||||
CONFIG_NETFILTER_XT_MATCH_IPRANGE=m
|
||||
CONFIG_NETFILTER_XT_MATCH_IPVS=m
|
||||
CONFIG_NETFILTER_XT_MATCH_LENGTH=m
|
||||
CONFIG_NETFILTER_XT_MATCH_LIMIT=m
|
||||
CONFIG_NETFILTER_XT_MATCH_MAC=m
|
||||
CONFIG_NETFILTER_XT_MATCH_MARK=m
|
||||
CONFIG_NETFILTER_XT_MATCH_MULTIPORT=m
|
||||
CONFIG_NETFILTER_XT_MATCH_NFACCT=m
|
||||
CONFIG_NETFILTER_XT_MATCH_OSF=m
|
||||
CONFIG_NETFILTER_XT_MATCH_OWNER=m
|
||||
CONFIG_NETFILTER_XT_MATCH_POLICY=m
|
||||
CONFIG_NETFILTER_XT_MATCH_PHYSDEV=m
|
||||
CONFIG_NETFILTER_XT_MATCH_PKTTYPE=m
|
||||
CONFIG_NETFILTER_XT_MATCH_QUOTA=m
|
||||
CONFIG_NETFILTER_XT_MATCH_RATEEST=m
|
||||
CONFIG_NETFILTER_XT_MATCH_REALM=m
|
||||
CONFIG_NETFILTER_XT_MATCH_RECENT=m
|
||||
CONFIG_NETFILTER_XT_MATCH_SOCKET=m
|
||||
CONFIG_NETFILTER_XT_MATCH_STATE=m
|
||||
CONFIG_NETFILTER_XT_MATCH_STATISTIC=m
|
||||
CONFIG_NETFILTER_XT_MATCH_STRING=m
|
||||
CONFIG_NETFILTER_XT_MATCH_TCPMSS=m
|
||||
CONFIG_NETFILTER_XT_MATCH_TIME=m
|
||||
CONFIG_NETFILTER_XT_MATCH_U32=m
|
||||
CONFIG_IP_SET=m
|
||||
CONFIG_IP_SET_BITMAP_IP=m
|
||||
CONFIG_IP_SET_BITMAP_IPMAC=m
|
||||
CONFIG_IP_SET_BITMAP_PORT=m
|
||||
CONFIG_IP_SET_HASH_IP=m
|
||||
CONFIG_IP_SET_HASH_IPMARK=m
|
||||
CONFIG_IP_SET_HASH_IPPORT=m
|
||||
CONFIG_IP_SET_HASH_IPPORTIP=m
|
||||
CONFIG_IP_SET_HASH_IPPORTNET=m
|
||||
CONFIG_IP_SET_HASH_NET=m
|
||||
CONFIG_IP_SET_HASH_NETPORT=m
|
||||
CONFIG_IP_SET_HASH_NETIFACE=m
|
||||
CONFIG_IP_SET_LIST_SET=m
|
||||
CONFIG_IP_VS=m
|
||||
CONFIG_IP_VS_IPV6=y
|
||||
CONFIG_IP_VS_PROTO_TCP=y
|
||||
CONFIG_IP_VS_PROTO_UDP=y
|
||||
CONFIG_IP_VS_PROTO_ESP=y
|
||||
CONFIG_IP_VS_PROTO_AH=y
|
||||
CONFIG_IP_VS_PROTO_SCTP=y
|
||||
CONFIG_IP_VS_RR=m
|
||||
CONFIG_IP_VS_WRR=m
|
||||
CONFIG_IP_VS_LC=m
|
||||
CONFIG_IP_VS_WLC=m
|
||||
CONFIG_IP_VS_FO=m
|
||||
CONFIG_IP_VS_OVF=m
|
||||
CONFIG_IP_VS_LBLC=m
|
||||
CONFIG_IP_VS_LBLCR=m
|
||||
CONFIG_IP_VS_DH=m
|
||||
CONFIG_IP_VS_SH=m
|
||||
CONFIG_IP_VS_MH=m
|
||||
CONFIG_IP_VS_SED=m
|
||||
CONFIG_IP_VS_NQ=m
|
||||
CONFIG_IP_VS_SH_TAB_BITS=10
|
||||
CONFIG_IP_VS_FTP=m
|
||||
CONFIG_IP_VS_PE_SIP=m
|
||||
CONFIG_NFT_DUP_IPV4=m
|
||||
CONFIG_NFT_FIB_IPV4=m
|
||||
CONFIG_NF_TABLES_ARP=y
|
||||
CONFIG_NF_LOG_ARP=m
|
||||
CONFIG_IP_NF_IPTABLES=m
|
||||
CONFIG_IP_NF_MATCH_AH=m
|
||||
CONFIG_IP_NF_MATCH_ECN=m
|
||||
CONFIG_IP_NF_MATCH_RPFILTER=m
|
||||
CONFIG_IP_NF_MATCH_TTL=m
|
||||
CONFIG_IP_NF_FILTER=m
|
||||
CONFIG_IP_NF_TARGET_REJECT=m
|
||||
CONFIG_IP_NF_NAT=m
|
||||
CONFIG_IP_NF_TARGET_MASQUERADE=m
|
||||
CONFIG_IP_NF_TARGET_REDIRECT=m
|
||||
CONFIG_IP_NF_MANGLE=m
|
||||
CONFIG_IP_NF_TARGET_ECN=m
|
||||
CONFIG_IP_NF_TARGET_TTL=m
|
||||
CONFIG_IP_NF_RAW=m
|
||||
CONFIG_IP_NF_ARPTABLES=m
|
||||
CONFIG_IP_NF_ARPFILTER=m
|
||||
CONFIG_IP_NF_ARP_MANGLE=m
|
||||
CONFIG_NFT_DUP_IPV6=m
|
||||
CONFIG_NFT_FIB_IPV6=m
|
||||
CONFIG_NF_TABLES_BRIDGE=m
|
||||
CONFIG_NFT_BRIDGE_META=m
|
||||
CONFIG_NFT_BRIDGE_REJECT=m
|
||||
CONFIG_BRIDGE_NF_EBTABLES=m
|
||||
CONFIG_BRIDGE_EBT_BROUTE=m
|
||||
CONFIG_BRIDGE_EBT_T_FILTER=m
|
||||
CONFIG_BRIDGE_EBT_T_NAT=m
|
||||
CONFIG_BRIDGE_EBT_802_3=m
|
||||
CONFIG_BRIDGE_EBT_AMONG=m
|
||||
CONFIG_BRIDGE_EBT_ARP=m
|
||||
CONFIG_BRIDGE_EBT_IP=m
|
||||
CONFIG_BRIDGE_EBT_IP6=m
|
||||
CONFIG_BRIDGE_EBT_LIMIT=m
|
||||
CONFIG_BRIDGE_EBT_MARK=m
|
||||
CONFIG_BRIDGE_EBT_PKTTYPE=m
|
||||
CONFIG_BRIDGE_EBT_STP=m
|
||||
CONFIG_BRIDGE_EBT_VLAN=m
|
||||
CONFIG_BRIDGE_EBT_ARPREPLY=m
|
||||
CONFIG_BRIDGE_EBT_DNAT=m
|
||||
CONFIG_BRIDGE_EBT_MARK_T=m
|
||||
CONFIG_BRIDGE_EBT_REDIRECT=m
|
||||
CONFIG_BRIDGE_EBT_SNAT=m
|
||||
CONFIG_BRIDGE_EBT_LOG=m
|
||||
CONFIG_BRIDGE_EBT_NFLOG=m
|
||||
CONFIG_IP_SCTP=m
|
||||
CONFIG_SCTP_DEFAULT_COOKIE_HMAC_SHA1=y
|
||||
CONFIG_L2TP=m
|
||||
CONFIG_L2TP_DEBUGFS=m
|
||||
CONFIG_L2TP_V3=y
|
||||
CONFIG_L2TP_IP=m
|
||||
CONFIG_L2TP_ETH=m
|
||||
CONFIG_BRIDGE=y
|
||||
CONFIG_BRIDGE_VLAN_FILTERING=y
|
||||
CONFIG_VLAN_8021Q=y
|
||||
CONFIG_VLAN_8021Q_GVRP=y
|
||||
CONFIG_VLAN_8021Q_MVRP=y
|
||||
CONFIG_NET_SCHED=y
|
||||
CONFIG_NET_SCH_HTB=m
|
||||
CONFIG_NET_SCH_HFSC=m
|
||||
CONFIG_NET_SCH_PRIO=m
|
||||
CONFIG_NET_SCH_MULTIQ=m
|
||||
CONFIG_NET_SCH_RED=m
|
||||
CONFIG_NET_SCH_SFB=m
|
||||
CONFIG_NET_SCH_SFQ=m
|
||||
CONFIG_NET_SCH_TEQL=m
|
||||
CONFIG_NET_SCH_TBF=m
|
||||
CONFIG_NET_SCH_CBS=m
|
||||
CONFIG_NET_SCH_ETF=m
|
||||
CONFIG_NET_SCH_TAPRIO=m
|
||||
CONFIG_NET_SCH_GRED=m
|
||||
CONFIG_NET_SCH_NETEM=m
|
||||
CONFIG_NET_SCH_DRR=m
|
||||
CONFIG_NET_SCH_MQPRIO=m
|
||||
CONFIG_NET_SCH_SKBPRIO=m
|
||||
CONFIG_NET_SCH_CHOKE=m
|
||||
CONFIG_NET_SCH_QFQ=m
|
||||
CONFIG_NET_SCH_CODEL=m
|
||||
CONFIG_NET_SCH_FQ_CODEL=m
|
||||
CONFIG_NET_SCH_CAKE=m
|
||||
CONFIG_NET_SCH_FQ=m
|
||||
CONFIG_NET_SCH_HHF=m
|
||||
CONFIG_NET_SCH_PIE=m
|
||||
CONFIG_NET_SCH_INGRESS=m
|
||||
CONFIG_NET_SCH_PLUG=m
|
||||
CONFIG_NET_CLS_BASIC=m
|
||||
CONFIG_NET_CLS_ROUTE4=m
|
||||
CONFIG_NET_CLS_FW=m
|
||||
CONFIG_NET_CLS_U32=m
|
||||
CONFIG_CLS_U32_PERF=y
|
||||
CONFIG_CLS_U32_MARK=y
|
||||
CONFIG_NET_CLS_FLOW=m
|
||||
CONFIG_NET_CLS_CGROUP=y
|
||||
CONFIG_NET_CLS_BPF=m
|
||||
CONFIG_NET_CLS_FLOWER=m
|
||||
CONFIG_NET_CLS_MATCHALL=m
|
||||
CONFIG_NET_EMATCH=y
|
||||
CONFIG_NET_EMATCH_CMP=m
|
||||
CONFIG_NET_EMATCH_NBYTE=m
|
||||
CONFIG_NET_EMATCH_U32=m
|
||||
CONFIG_NET_EMATCH_META=m
|
||||
CONFIG_NET_EMATCH_TEXT=m
|
||||
CONFIG_NET_EMATCH_IPSET=m
|
||||
CONFIG_NET_CLS_ACT=y
|
||||
CONFIG_NET_ACT_POLICE=m
|
||||
CONFIG_NET_ACT_GACT=m
|
||||
CONFIG_GACT_PROB=y
|
||||
CONFIG_NET_ACT_MIRRED=m
|
||||
CONFIG_NET_ACT_IPT=m
|
||||
CONFIG_NET_ACT_NAT=m
|
||||
CONFIG_NET_ACT_PEDIT=m
|
||||
CONFIG_NET_ACT_SIMP=m
|
||||
CONFIG_NET_ACT_SKBEDIT=m
|
||||
CONFIG_NET_ACT_CSUM=m
|
||||
CONFIG_NET_ACT_VLAN=m
|
||||
CONFIG_NET_ACT_BPF=m
|
||||
CONFIG_NET_ACT_CONNMARK=m
|
||||
CONFIG_DNS_RESOLVER=y
|
||||
CONFIG_OPENVSWITCH=m
|
||||
CONFIG_VSOCKETS=m
|
||||
CONFIG_VIRTIO_VSOCKETS=m
|
||||
CONFIG_NETLINK_DIAG=m
|
||||
CONFIG_CGROUP_NET_PRIO=y
|
||||
CONFIG_NET_PKTGEN=m
|
||||
# CONFIG_WIRELESS is not set
|
||||
CONFIG_PCI=y
|
||||
CONFIG_PCIEPORTBUS=y
|
||||
CONFIG_HOTPLUG_PCI_PCIE=y
|
||||
CONFIG_PCIEASPM_PERFORMANCE=y
|
||||
CONFIG_PCI_STUB=m
|
||||
CONFIG_PCI_PF_STUB=m
|
||||
CONFIG_VGA_ARB_MAX_GPUS=64
|
||||
CONFIG_HOTPLUG_PCI=y
|
||||
CONFIG_HOTPLUG_PCI_ACPI=y
|
||||
CONFIG_VMD=m
|
||||
CONFIG_DEVTMPFS=y
|
||||
CONFIG_DEVTMPFS_MOUNT=y
|
||||
CONFIG_FW_LOADER_USER_HELPER=y
|
||||
CONFIG_CONNECTOR=y
|
||||
CONFIG_DMI_SYSFS=y
|
||||
CONFIG_ISCSI_IBFT=m
|
||||
CONFIG_EFI_VARS_PSTORE=m
|
||||
CONFIG_EFI_VARS_PSTORE_DEFAULT_DISABLE=y
|
||||
CONFIG_EFI_CUSTOM_SSDT_OVERLAYS=y
|
||||
# CONFIG_PNP_DEBUG_MESSAGES is not set
|
||||
CONFIG_BLK_DEV_NULL_BLK=m
|
||||
CONFIG_BLK_DEV_FD=m
|
||||
CONFIG_BLK_DEV_PCIESSD_MTIP32XX=y
|
||||
CONFIG_ZRAM=m
|
||||
CONFIG_ZRAM_MULTI_COMP=y
|
||||
CONFIG_BLK_DEV_LOOP=m
|
||||
CONFIG_BLK_DEV_DRBD=m
|
||||
CONFIG_BLK_DEV_NBD=m
|
||||
CONFIG_BLK_DEV_RAM=m
|
||||
CONFIG_BLK_DEV_RAM_SIZE=16384
|
||||
CONFIG_VIRTIO_BLK=y
|
||||
CONFIG_BLK_DEV_RBD=m
|
||||
CONFIG_BLK_DEV_NVME=y
|
||||
CONFIG_EEPROM_93CX6=m
|
||||
CONFIG_RAID_ATTRS=y
|
||||
CONFIG_BLK_DEV_SD=y
|
||||
CONFIG_CHR_DEV_ST=m
|
||||
CONFIG_BLK_DEV_SR=m
|
||||
CONFIG_CHR_DEV_SG=y
|
||||
CONFIG_CHR_DEV_SCH=m
|
||||
CONFIG_SCSI_CONSTANTS=y
|
||||
CONFIG_SCSI_LOGGING=y
|
||||
CONFIG_SCSI_SCAN_ASYNC=y
|
||||
CONFIG_SCSI_FC_ATTRS=m
|
||||
CONFIG_SCSI_SAS_ATA=y
|
||||
CONFIG_ISCSI_TCP=m
|
||||
CONFIG_SCSI_CXGB3_ISCSI=m
|
||||
CONFIG_BE2ISCSI=m
|
||||
CONFIG_SCSI_HPSA=m
|
||||
CONFIG_SCSI_AIC94XX=m
|
||||
# CONFIG_AIC94XX_DEBUG is not set
|
||||
CONFIG_SCSI_MVSAS=m
|
||||
# CONFIG_SCSI_MVSAS_DEBUG is not set
|
||||
CONFIG_SCSI_MVUMI=m
|
||||
CONFIG_SCSI_ARCMSR=m
|
||||
CONFIG_SCSI_HPTIOP=m
|
||||
CONFIG_SCSI_BUSLOGIC=m
|
||||
CONFIG_VMWARE_PVSCSI=m
|
||||
CONFIG_LIBFC=m
|
||||
CONFIG_LIBFCOE=m
|
||||
CONFIG_FCOE=m
|
||||
CONFIG_FCOE_FNIC=m
|
||||
CONFIG_SCSI_ISCI=m
|
||||
CONFIG_SCSI_IPS=m
|
||||
CONFIG_SCSI_INITIO=m
|
||||
CONFIG_SCSI_STEX=m
|
||||
CONFIG_SCSI_QLA_FC=m
|
||||
CONFIG_SCSI_QLA_ISCSI=m
|
||||
CONFIG_SCSI_DEBUG=m
|
||||
CONFIG_SCSI_PMCRAID=m
|
||||
CONFIG_SCSI_PM8001=m
|
||||
CONFIG_SCSI_BFA_FC=m
|
||||
CONFIG_SCSI_VIRTIO=y
|
||||
CONFIG_ATA=y
|
||||
CONFIG_SATA_AHCI=y
|
||||
CONFIG_SATA_AHCI_PLATFORM=y
|
||||
CONFIG_SATA_ACARD_AHCI=y
|
||||
CONFIG_SATA_SIL24=m
|
||||
CONFIG_PDC_ADMA=m
|
||||
CONFIG_SATA_QSTOR=m
|
||||
CONFIG_SATA_SX4=m
|
||||
CONFIG_ATA_PIIX=y
|
||||
CONFIG_SATA_MV=m
|
||||
CONFIG_SATA_NV=m
|
||||
CONFIG_SATA_PROMISE=m
|
||||
CONFIG_SATA_SIL=m
|
||||
CONFIG_SATA_SIS=m
|
||||
CONFIG_SATA_SVW=m
|
||||
CONFIG_SATA_ULI=m
|
||||
CONFIG_SATA_VIA=m
|
||||
CONFIG_SATA_VITESSE=m
|
||||
CONFIG_PATA_SCH=y
|
||||
CONFIG_PATA_MPIIX=y
|
||||
CONFIG_PATA_ACPI=m
|
||||
CONFIG_ATA_GENERIC=y
|
||||
CONFIG_MD=y
|
||||
CONFIG_BLK_DEV_MD=y
|
||||
CONFIG_MD_LINEAR=m
|
||||
CONFIG_MD_MULTIPATH=m
|
||||
CONFIG_MD_FAULTY=m
|
||||
CONFIG_BLK_DEV_DM=m
|
||||
CONFIG_DM_DEBUG=y
|
||||
CONFIG_DM_CRYPT=m
|
||||
CONFIG_DM_SNAPSHOT=m
|
||||
CONFIG_DM_THIN_PROVISIONING=m
|
||||
CONFIG_DM_CACHE=m
|
||||
CONFIG_DM_ERA=m
|
||||
CONFIG_DM_MIRROR=m
|
||||
CONFIG_DM_LOG_USERSPACE=m
|
||||
CONFIG_DM_RAID=m
|
||||
CONFIG_DM_ZERO=m
|
||||
CONFIG_DM_MULTIPATH=m
|
||||
CONFIG_DM_MULTIPATH_QL=m
|
||||
CONFIG_DM_MULTIPATH_ST=m
|
||||
CONFIG_DM_DELAY=m
|
||||
CONFIG_DM_FLAKEY=m
|
||||
CONFIG_DM_VERITY=m
|
||||
CONFIG_DM_SWITCH=m
|
||||
CONFIG_DM_LOG_WRITES=m
|
||||
CONFIG_TARGET_CORE=m
|
||||
CONFIG_TCM_IBLOCK=m
|
||||
CONFIG_TCM_FILEIO=m
|
||||
CONFIG_TCM_PSCSI=m
|
||||
CONFIG_TCM_USER2=m
|
||||
CONFIG_LOOPBACK_TARGET=m
|
||||
CONFIG_TCM_FC=m
|
||||
CONFIG_ISCSI_TARGET=m
|
||||
CONFIG_BONDING=m
|
||||
CONFIG_DUMMY=m
|
||||
CONFIG_IFB=m
|
||||
CONFIG_NET_TEAM=m
|
||||
CONFIG_NET_TEAM_MODE_BROADCAST=m
|
||||
CONFIG_NET_TEAM_MODE_ROUNDROBIN=m
|
||||
CONFIG_NET_TEAM_MODE_RANDOM=m
|
||||
CONFIG_NET_TEAM_MODE_ACTIVEBACKUP=m
|
||||
CONFIG_NET_TEAM_MODE_LOADBALANCE=m
|
||||
CONFIG_MACVLAN=y
|
||||
CONFIG_MACVTAP=y
|
||||
CONFIG_IPVLAN=m
|
||||
CONFIG_IPVTAP=m
|
||||
CONFIG_GENEVE=m
|
||||
CONFIG_NETCONSOLE=m
|
||||
CONFIG_NETCONSOLE_DYNAMIC=y
|
||||
CONFIG_TUN=y
|
||||
CONFIG_VETH=m
|
||||
CONFIG_VIRTIO_NET=m
|
||||
CONFIG_NET_VRF=m
|
||||
CONFIG_VSOCKMON=m
|
||||
# CONFIG_NET_VENDOR_3COM is not set
|
||||
# CONFIG_NET_VENDOR_ADAPTEC is not set
|
||||
# CONFIG_NET_VENDOR_AGERE is not set
|
||||
# CONFIG_NET_VENDOR_ALACRITECH is not set
|
||||
# CONFIG_NET_VENDOR_ALTEON is not set
|
||||
# CONFIG_NET_VENDOR_AMAZON is not set
|
||||
# CONFIG_NET_VENDOR_AMD is not set
|
||||
# CONFIG_NET_VENDOR_AQUANTIA is not set
|
||||
# CONFIG_NET_VENDOR_ARC is not set
|
||||
CONFIG_ATL2=m
|
||||
CONFIG_ATL1=m
|
||||
CONFIG_ATL1E=m
|
||||
CONFIG_ATL1C=m
|
||||
CONFIG_ALX=m
|
||||
CONFIG_BNX2=m
|
||||
CONFIG_BNX2X=m
|
||||
CONFIG_MACB=m
|
||||
# CONFIG_NET_VENDOR_CAVIUM is not set
|
||||
CONFIG_CHELSIO_T4=m
|
||||
CONFIG_CHELSIO_T4VF=m
|
||||
CONFIG_ENIC=m
|
||||
CONFIG_DNET=m
|
||||
# CONFIG_NET_VENDOR_DEC is not set
|
||||
# CONFIG_NET_VENDOR_DLINK is not set
|
||||
# CONFIG_NET_VENDOR_I825XX is not set
|
||||
CONFIG_E1000=m
|
||||
CONFIG_E1000E=m
|
||||
CONFIG_IGB=m
|
||||
# CONFIG_IGB_HWMON is not set
|
||||
CONFIG_IGBVF=m
|
||||
CONFIG_FM10K=m
|
||||
CONFIG_IGC=m
|
||||
CONFIG_JME=m
|
||||
CONFIG_MVMDIO=m
|
||||
CONFIG_MLX4_EN=m
|
||||
CONFIG_MLX5_CORE=m
|
||||
CONFIG_MLX5_FPGA=y
|
||||
CONFIG_MLX5_CORE_EN=y
|
||||
CONFIG_MLX5_CORE_IPOIB=y
|
||||
CONFIG_MLX5_EN_IPSEC=y
|
||||
CONFIG_MLXSW_CORE=m
|
||||
# CONFIG_NET_VENDOR_MICREL is not set
|
||||
# CONFIG_NET_VENDOR_MICROCHIP is not set
|
||||
# CONFIG_NET_VENDOR_MICROSEMI is not set
|
||||
# CONFIG_NET_VENDOR_MYRI is not set
|
||||
# CONFIG_NET_VENDOR_NATSEMI is not set
|
||||
# CONFIG_NET_VENDOR_NETERION is not set
|
||||
# CONFIG_NET_VENDOR_NVIDIA is not set
|
||||
# CONFIG_NET_VENDOR_OKI is not set
|
||||
CONFIG_ETHOC=m
|
||||
# CONFIG_NET_VENDOR_PENSANDO is not set
|
||||
CONFIG_QLA3XXX=m
|
||||
CONFIG_QLCNIC=m
|
||||
CONFIG_NETXEN_NIC=m
|
||||
# CONFIG_NET_VENDOR_BROCADE is not set
|
||||
# CONFIG_NET_VENDOR_QUALCOMM is not set
|
||||
# CONFIG_NET_VENDOR_RDC is not set
|
||||
CONFIG_8139CP=m
|
||||
CONFIG_8139TOO=m
|
||||
# CONFIG_8139TOO_PIO is not set
|
||||
CONFIG_8139TOO_8129=y
|
||||
CONFIG_R8169=m
|
||||
# CONFIG_NET_VENDOR_RENESAS is not set
|
||||
# CONFIG_NET_VENDOR_ROCKER is not set
|
||||
# CONFIG_NET_VENDOR_SAMSUNG is not set
|
||||
# CONFIG_NET_VENDOR_SEEQ is not set
|
||||
# CONFIG_NET_VENDOR_SILAN is not set
|
||||
# CONFIG_NET_VENDOR_SIS is not set
|
||||
# CONFIG_NET_VENDOR_SMSC is not set
|
||||
# CONFIG_NET_VENDOR_STMICRO is not set
|
||||
# CONFIG_NET_VENDOR_SUN is not set
|
||||
# CONFIG_NET_VENDOR_SYNOPSYS is not set
|
||||
# CONFIG_NET_VENDOR_TEHUTI is not set
|
||||
# CONFIG_NET_VENDOR_TI is not set
|
||||
# CONFIG_NET_VENDOR_VIA is not set
|
||||
# CONFIG_NET_VENDOR_WIZNET is not set
|
||||
# CONFIG_NET_VENDOR_XILINX is not set
|
||||
CONFIG_AMD_PHY=m
|
||||
CONFIG_BROADCOM_PHY=m
|
||||
CONFIG_BCM87XX_PHY=m
|
||||
CONFIG_CICADA_PHY=m
|
||||
CONFIG_DAVICOM_PHY=m
|
||||
CONFIG_ICPLUS_PHY=m
|
||||
CONFIG_LXT_PHY=m
|
||||
CONFIG_LSI_ET1011C_PHY=m
|
||||
CONFIG_MARVELL_PHY=m
|
||||
CONFIG_MICREL_PHY=m
|
||||
CONFIG_NATIONAL_PHY=m
|
||||
CONFIG_QSEMI_PHY=m
|
||||
CONFIG_STE10XP=m
|
||||
CONFIG_VITESSE_PHY=m
|
||||
CONFIG_MDIO_BITBANG=m
|
||||
CONFIG_PPP=m
|
||||
CONFIG_PPP_BSDCOMP=m
|
||||
CONFIG_PPP_DEFLATE=m
|
||||
CONFIG_PPP_MPPE=m
|
||||
CONFIG_PPPOE=m
|
||||
CONFIG_PPTP=m
|
||||
CONFIG_PPPOL2TP=m
|
||||
CONFIG_PPP_ASYNC=m
|
||||
CONFIG_PPP_SYNC_TTY=m
|
||||
CONFIG_SLIP=m
|
||||
# CONFIG_WLAN is not set
|
||||
CONFIG_INPUT_SPARSEKMAP=m
|
||||
CONFIG_INPUT_EVDEV=y
|
||||
CONFIG_KEYBOARD_ATKBD=m
|
||||
# CONFIG_INPUT_MOUSE is not set
|
||||
CONFIG_INPUT_MISC=y
|
||||
CONFIG_INPUT_UINPUT=m
|
||||
CONFIG_SERIO_I8042=m
|
||||
CONFIG_SERIO_SERPORT=m
|
||||
CONFIG_SERIO_RAW=m
|
||||
# CONFIG_LEGACY_PTYS is not set
|
||||
CONFIG_SERIAL_8250=y
|
||||
# CONFIG_SERIAL_8250_DEPRECATED_OPTIONS is not set
|
||||
CONFIG_SERIAL_8250_CONSOLE=y
|
||||
CONFIG_SERIAL_8250_EXAR=m
|
||||
CONFIG_SERIAL_8250_LPSS=m
|
||||
CONFIG_SERIAL_8250_MID=m
|
||||
CONFIG_SERIAL_JSM=m
|
||||
CONFIG_SERIAL_ARC=m
|
||||
CONFIG_N_GSM=m
|
||||
CONFIG_NULL_TTY=m
|
||||
CONFIG_TTY_PRINTK=y
|
||||
CONFIG_VIRTIO_CONSOLE=y
|
||||
CONFIG_IPMI_HANDLER=m
|
||||
CONFIG_IPMI_DEVICE_INTERFACE=m
|
||||
CONFIG_IPMI_WATCHDOG=m
|
||||
CONFIG_IPMI_POWEROFF=m
|
||||
CONFIG_HW_RANDOM=y
|
||||
CONFIG_HW_RANDOM_TIMERIOMEM=m
|
||||
CONFIG_HW_RANDOM_INTEL=m
|
||||
CONFIG_HW_RANDOM_AMD=m
|
||||
# CONFIG_HW_RANDOM_VIA is not set
|
||||
CONFIG_HW_RANDOM_VIRTIO=m
|
||||
CONFIG_NVRAM=m
|
||||
CONFIG_HPET=y
|
||||
CONFIG_HANGCHECK_TIMER=m
|
||||
CONFIG_TCG_TPM=m
|
||||
CONFIG_TCG_TIS=m
|
||||
CONFIG_TCG_NSC=m
|
||||
CONFIG_TCG_ATMEL=m
|
||||
CONFIG_TCG_INFINEON=m
|
||||
CONFIG_TELCLOCK=m
|
||||
# CONFIG_I2C_COMPAT is not set
|
||||
# CONFIG_I2C_HELPER_AUTO is not set
|
||||
CONFIG_SENSORS_FAM15H_POWER=m
|
||||
CONFIG_SENSORS_CORETEMP=m
|
||||
# CONFIG_THERMAL_HWMON is not set
|
||||
CONFIG_DRM=m
|
||||
CONFIG_DRM_AST=m
|
||||
CONFIG_DRM_QXL=m
|
||||
CONFIG_DRM_VIRTIO_GPU=m
|
||||
CONFIG_DRM_BOCHS=m
|
||||
CONFIG_DRM_CIRRUS_QEMU=m
|
||||
CONFIG_FB=y
|
||||
CONFIG_FIRMWARE_EDID=y
|
||||
CONFIG_LCD_CLASS_DEVICE=m
|
||||
CONFIG_FRAMEBUFFER_CONSOLE_ROTATION=y
|
||||
# CONFIG_HID is not set
|
||||
# CONFIG_USB_SUPPORT is not set
|
||||
CONFIG_SCSI_UFSHCD=m
|
||||
CONFIG_SCSI_UFSHCD_PCI=m
|
||||
CONFIG_RTC_CLASS=y
|
||||
# CONFIG_RTC_HCTOSYS is not set
|
||||
# CONFIG_RTC_SYSTOHC is not set
|
||||
# CONFIG_RTC_NVMEM is not set
|
||||
CONFIG_DMADEVICES=y
|
||||
CONFIG_UIO=m
|
||||
CONFIG_UIO_PDRV_GENIRQ=m
|
||||
CONFIG_UIO_DMEM_GENIRQ=m
|
||||
CONFIG_UIO_PCI_GENERIC=m
|
||||
CONFIG_VFIO=m
|
||||
CONFIG_VFIO_PCI=m
|
||||
CONFIG_VIRT_DRIVERS=y
|
||||
CONFIG_VIRTIO_PCI=y
|
||||
CONFIG_VIRTIO_PMEM=m
|
||||
CONFIG_VIRTIO_BALLOON=m
|
||||
CONFIG_VIRTIO_INPUT=m
|
||||
CONFIG_VIRTIO_MMIO=m
|
||||
CONFIG_VHOST_NET=m
|
||||
CONFIG_VHOST_SCSI=m
|
||||
CONFIG_VHOST_VSOCK=m
|
||||
CONFIG_AMD_IOMMU=y
|
||||
CONFIG_AMD_IOMMU_V2=m
|
||||
CONFIG_INTEL_IOMMU=y
|
||||
CONFIG_IRQ_REMAP=y
|
||||
CONFIG_RAS_CEC=y
|
||||
CONFIG_LIBNVDIMM=y
|
||||
CONFIG_BLK_DEV_PMEM=m
|
||||
CONFIG_DEV_DAX=m
|
||||
CONFIG_NVMEM=y
|
||||
CONFIG_COUNTER=m
|
||||
CONFIG_EXT4_FS=y
|
||||
CONFIG_EXT4_FS_POSIX_ACL=y
|
||||
CONFIG_EXT4_FS_SECURITY=y
|
||||
CONFIG_XFS_FS=m
|
||||
CONFIG_XFS_QUOTA=y
|
||||
CONFIG_XFS_POSIX_ACL=y
|
||||
CONFIG_XFS_RT=y
|
||||
CONFIG_XFS_WARN=y
|
||||
CONFIG_FS_ENCRYPTION=y
|
||||
CONFIG_FANOTIFY=y
|
||||
CONFIG_FANOTIFY_ACCESS_PERMISSIONS=y
|
||||
CONFIG_QUOTA_NETLINK_INTERFACE=y
|
||||
CONFIG_FUSE_FS=m
|
||||
CONFIG_CUSE=m
|
||||
CONFIG_VIRTIO_FS=m
|
||||
CONFIG_OVERLAY_FS=m
|
||||
CONFIG_OVERLAY_FS_INDEX=y
|
||||
CONFIG_OVERLAY_FS_METACOPY=y
|
||||
CONFIG_FSCACHE=m
|
||||
CONFIG_FSCACHE_STATS=y
|
||||
CONFIG_CACHEFILES=m
|
||||
CONFIG_ISO9660_FS=m
|
||||
CONFIG_JOLIET=y
|
||||
CONFIG_ZISOFS=y
|
||||
CONFIG_UDF_FS=m
|
||||
CONFIG_MSDOS_FS=m
|
||||
CONFIG_VFAT_FS=m
|
||||
CONFIG_FAT_DEFAULT_IOCHARSET="ascii"
|
||||
CONFIG_NTFS_FS=m
|
||||
CONFIG_NTFS_RW=y
|
||||
CONFIG_PROC_KCORE=y
|
||||
CONFIG_TMPFS=y
|
||||
CONFIG_TMPFS_POSIX_ACL=y
|
||||
CONFIG_HUGETLBFS=y
|
||||
CONFIG_CONFIGFS_FS=y
|
||||
CONFIG_EFIVAR_FS=y
|
||||
CONFIG_ECRYPT_FS=m
|
||||
CONFIG_HFSPLUS_FS=m
|
||||
CONFIG_CRAMFS=m
|
||||
CONFIG_SQUASHFS=y
|
||||
CONFIG_SQUASHFS_XATTR=y
|
||||
CONFIG_SQUASHFS_LZ4=y
|
||||
CONFIG_SQUASHFS_LZO=y
|
||||
CONFIG_SQUASHFS_XZ=y
|
||||
CONFIG_NFS_FS=m
|
||||
CONFIG_NFS_V3_ACL=y
|
||||
CONFIG_NFS_V4=m
|
||||
CONFIG_NFS_V4_1=y
|
||||
CONFIG_NFS_V4_2=y
|
||||
CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN=""
|
||||
CONFIG_NFS_FSCACHE=y
|
||||
CONFIG_NFSD=m
|
||||
CONFIG_NFSD_V3_ACL=y
|
||||
CONFIG_NFSD_V4=y
|
||||
CONFIG_NFSD_BLOCKLAYOUT=y
|
||||
CONFIG_NFSD_SCSILAYOUT=y
|
||||
CONFIG_NFSD_FLEXFILELAYOUT=y
|
||||
CONFIG_SUNRPC_DEBUG=y
|
||||
CONFIG_CEPH_FS=m
|
||||
CONFIG_CEPH_FSCACHE=y
|
||||
CONFIG_CEPH_FS_POSIX_ACL=y
|
||||
CONFIG_CIFS=m
|
||||
CONFIG_CIFS_UPCALL=y
|
||||
CONFIG_CIFS_XATTR=y
|
||||
CONFIG_CIFS_POSIX=y
|
||||
# CONFIG_CIFS_DEBUG is not set
|
||||
CONFIG_CIFS_DFS_UPCALL=y
|
||||
CONFIG_CIFS_FSCACHE=y
|
||||
CONFIG_NLS_DEFAULT="utf8"
|
||||
CONFIG_NLS_CODEPAGE_437=y
|
||||
CONFIG_NLS_ASCII=y
|
||||
CONFIG_NLS_ISO8859_1=y
|
||||
CONFIG_NLS_UTF8=y
|
||||
CONFIG_TRUSTED_KEYS=m
|
||||
CONFIG_ENCRYPTED_KEYS=m
|
||||
CONFIG_SECURITY_DMESG_RESTRICT=y
|
||||
CONFIG_SECURITY=y
|
||||
CONFIG_SECURITY_NETWORK=y
|
||||
CONFIG_SECURITY_PATH=y
|
||||
CONFIG_FORTIFY_SOURCE=y
|
||||
CONFIG_SECURITY_SAFESETID=y
|
||||
# CONFIG_INTEGRITY is not set
|
||||
CONFIG_LSM="loadpin,safesetid,integrity,bpf"
|
||||
CONFIG_INIT_ON_ALLOC_DEFAULT_ON=y
|
||||
CONFIG_BUG_ON_DATA_CORRUPTION=y
|
||||
CONFIG_CRYPTO_USER=m
|
||||
CONFIG_CRYPTO_NULL=y
|
||||
CONFIG_CRYPTO_PCRYPT=m
|
||||
CONFIG_CRYPTO_TEST=m
|
||||
CONFIG_CRYPTO_ANUBIS=m
|
||||
CONFIG_CRYPTO_BLOWFISH=m
|
||||
CONFIG_CRYPTO_CAMELLIA=m
|
||||
CONFIG_CRYPTO_FCRYPT=m
|
||||
CONFIG_CRYPTO_KHAZAD=m
|
||||
CONFIG_CRYPTO_SEED=m
|
||||
CONFIG_CRYPTO_TEA=m
|
||||
CONFIG_CRYPTO_TWOFISH=m
|
||||
CONFIG_CRYPTO_ARC4=m
|
||||
CONFIG_CRYPTO_CFB=m
|
||||
CONFIG_CRYPTO_KEYWRAP=m
|
||||
CONFIG_CRYPTO_LRW=m
|
||||
CONFIG_CRYPTO_PCBC=m
|
||||
CONFIG_CRYPTO_CHACHA20POLY1305=m
|
||||
CONFIG_CRYPTO_MICHAEL_MIC=m
|
||||
CONFIG_CRYPTO_RMD160=m
|
||||
CONFIG_CRYPTO_SHA1=y
|
||||
CONFIG_CRYPTO_VMAC=m
|
||||
CONFIG_CRYPTO_WP512=m
|
||||
CONFIG_CRYPTO_XCBC=m
|
||||
CONFIG_CRYPTO_CRC32=m
|
||||
CONFIG_CRYPTO_842=m
|
||||
CONFIG_CRYPTO_ANSI_CPRNG=m
|
||||
CONFIG_CRYPTO_USER_API_HASH=y
|
||||
CONFIG_CRYPTO_USER_API_SKCIPHER=y
|
||||
CONFIG_CRYPTO_USER_API_RNG=m
|
||||
CONFIG_CRYPTO_USER_API_AEAD=m
|
||||
CONFIG_CRYPTO_AES_NI_INTEL=m
|
||||
CONFIG_CRYPTO_BLOWFISH_X86_64=m
|
||||
CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64=m
|
||||
CONFIG_CRYPTO_CAST5_AVX_X86_64=m
|
||||
CONFIG_CRYPTO_CAST6_AVX_X86_64=m
|
||||
CONFIG_CRYPTO_SERPENT_SSE2_X86_64=m
|
||||
CONFIG_CRYPTO_SERPENT_AVX2_X86_64=m
|
||||
CONFIG_CRYPTO_TWOFISH_AVX_X86_64=m
|
||||
CONFIG_CRYPTO_SHA1_SSSE3=m
|
||||
CONFIG_CRYPTO_SHA256_SSSE3=m
|
||||
CONFIG_CRYPTO_SHA512_SSSE3=m
|
||||
CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL=m
|
||||
CONFIG_CRYPTO_CRC32_PCLMUL=m
|
||||
CONFIG_CRYPTO_DEV_PADLOCK=m
|
||||
CONFIG_CRYPTO_DEV_PADLOCK_AES=m
|
||||
CONFIG_CRYPTO_DEV_PADLOCK_SHA=m
|
||||
CONFIG_CRYPTO_DEV_CCP=y
|
||||
CONFIG_CRYPTO_DEV_QAT_DH895xCC=m
|
||||
CONFIG_CRYPTO_DEV_QAT_C3XXX=m
|
||||
CONFIG_CRYPTO_DEV_QAT_C62X=m
|
||||
CONFIG_CRYPTO_DEV_QAT_DH895xCCVF=m
|
||||
CONFIG_CRYPTO_DEV_QAT_C3XXXVF=m
|
||||
CONFIG_CRYPTO_DEV_QAT_C62XVF=m
|
||||
CONFIG_CORDIC=m
|
||||
CONFIG_CRC7=m
|
||||
CONFIG_LIBCRC32C=y
|
||||
CONFIG_PRINTK_TIME=y
|
||||
CONFIG_DYNAMIC_DEBUG=y
|
||||
CONFIG_STRIP_ASM_SYMS=y
|
||||
CONFIG_DEBUG_SECTION_MISMATCH=y
|
||||
CONFIG_MAGIC_SYSRQ=y
|
||||
# CONFIG_MAGIC_SYSRQ_SERIAL is not set
|
||||
CONFIG_SCHED_STACK_END_CHECK=y
|
||||
CONFIG_PANIC_ON_OOPS=y
|
||||
CONFIG_HARDLOCKUP_DETECTOR=y
|
||||
CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y
|
||||
CONFIG_RCU_CPU_STALL_TIMEOUT=60
|
||||
# CONFIG_RCU_TRACE is not set
|
||||
CONFIG_LATENCYTOP=y
|
||||
CONFIG_FUNCTION_PROFILER=y
|
||||
CONFIG_STACK_TRACER=y
|
||||
CONFIG_SCHED_TRACER=y
|
||||
CONFIG_FTRACE_SYSCALLS=y
|
||||
CONFIG_BLK_DEV_IO_TRACE=y
|
||||
CONFIG_BPF_KPROBE_OVERRIDE=y
|
||||
# CONFIG_X86_VERBOSE_BOOTUP is not set
|
||||
# CONFIG_EARLY_PRINTK is not set
|
||||
# CONFIG_X86_DEBUG_FPU is not set
|
||||
CONFIG_NOTIFIER_ERROR_INJECTION=m
|
||||
CONFIG_FUNCTION_ERROR_INJECTION=y
|
||||
# CONFIG_RUNTIME_TESTING_MENU is not set
|
|
@@ -139,6 +139,8 @@ BuildRequires: gcc-plugin-devel
# glibc-static is required for a consistent build environment (specifically
# CONFIG_CC_CAN_LINK_STATIC=y).
BuildRequires: glibc-static
# Kernel could be compressed with lz4
BuildRequires: lz4

%if %{with_perf}
BuildRequires: zlib-devel binutils-devel newt-devel perl(ExtUtils::Embed) bison flex xz-devel

@@ -360,6 +360,9 @@ static int brd_alloc(int i)
	blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
	blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, disk->queue);
	blk_queue_flag_set(QUEUE_FLAG_NOWAIT, disk->queue);
#ifdef CONFIG_EMM_RAMDISK_SWAP
	blk_queue_flag_set(QUEUE_FLAG_RAMDISK, disk->queue);
#endif
	err = add_disk(disk);
	if (err)
		goto out_cleanup_disk;

@@ -2220,6 +2220,9 @@ static int zram_add(void)
	/* zram devices sort of resembles non-rotational disks */
	blk_queue_flag_set(QUEUE_FLAG_NONROT, zram->disk->queue);
	blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, zram->disk->queue);
#ifdef CONFIG_EMM_RAMDISK_SWAP
	blk_queue_flag_set(QUEUE_FLAG_RAMDISK, zram->disk->queue);
#endif

	/*
	 * To ensure that we always get PAGE_SIZE aligned

@@ -573,6 +573,9 @@ struct request_queue {
#define QUEUE_FLAG_NOWAIT	29	/* device supports NOWAIT */
#define QUEUE_FLAG_SQ_SCHED	30	/* single queue style io dispatch */
#define QUEUE_FLAG_SKIP_TAGSET_QUIESCE	31 /* quiesce_tagset skip the queue*/
#ifdef CONFIG_EMM_RAMDISK_SWAP
#define QUEUE_FLAG_RAMDISK	32	/* ramdisk requires runtime page alloc */
#endif

#define QUEUE_FLAG_MQ_DEFAULT	((1UL << QUEUE_FLAG_IO_STAT) |		\
				 (1UL << QUEUE_FLAG_SAME_COMP) |	\
@@ -1396,6 +1399,7 @@ struct block_device_operations {
			unsigned int flags);
	int (*open)(struct gendisk *disk, blk_mode_t mode);
	void (*release)(struct gendisk *disk);
	int (*swap_folio)(struct block_device *, sector_t, struct folio *, enum req_op);
	int (*ioctl)(struct block_device *bdev, blk_mode_t mode,
			unsigned cmd, unsigned long arg);
	int (*compat_ioctl)(struct block_device *bdev, blk_mode_t mode,
@@ -1437,6 +1441,10 @@ extern int blkdev_compat_ptr_ioctl(struct block_device *, blk_mode_t,
#define blkdev_compat_ptr_ioctl NULL
#endif

extern int bdev_swapin_folio(struct block_device *, sector_t, struct folio *);
extern int bdev_swapout_folio(struct block_device *, sector_t, struct folio *,
		struct writeback_control *);

static inline void blk_wake_io_task(struct task_struct *waiter)
{
	/*
@@ -1564,4 +1572,15 @@ struct io_comp_batch {

#define DEFINE_IO_COMP_BATCH(name)	struct io_comp_batch name = { }

#ifdef CONFIG_EMM_RAMDISK_SWAP
/*
 * Check if a bdev is ramdisk based
 */
static inline bool bdev_ramdisk(struct block_device *bdev)
{
	return test_bit(QUEUE_FLAG_RAMDISK,
			&bdev_get_queue(bdev)->queue_flags);
}
#endif

#endif /* _LINUX_BLKDEV_H */

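Editor's note: for the provider side of this interface, a driver would fill in the swap_folio member added to block_device_operations above. The sketch below is illustrative only: the driver name, its copy helpers and the synchronous-completion assumption are not part of this diff, and the folio handling simply mirrors the contract documented in block/bdev.c.

#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/pagemap.h>

struct mydrv_device;					/* hypothetical driver state */
int mydrv_copy_to_folio(struct mydrv_device *dev, sector_t sector,
			struct folio *folio);		/* assumed helper */
int mydrv_copy_from_folio(struct mydrv_device *dev, sector_t sector,
			  struct folio *folio);		/* assumed helper */

/* Hypothetical synchronous implementation of the new hook. */
static int mydrv_swap_folio(struct block_device *bdev, sector_t sector,
			    struct folio *folio, enum req_op op)
{
	struct mydrv_device *dev = bdev->bd_disk->private_data;

	if (op == REQ_OP_READ) {
		if (mydrv_copy_to_folio(dev, sector, folio))
			return -EIO;
		folio_mark_uptodate(folio);
		folio_unlock(folio);		/* swapin contract: unlock once read */
	} else {
		if (mydrv_copy_from_folio(dev, sector, folio))
			return -EIO;	/* caller ends writeback on error */
		folio_end_writeback(folio);	/* write finished synchronously */
	}
	return 0;
}

static const struct block_device_operations mydrv_fops = {
	.owner		= THIS_MODULE,
	.swap_folio	= mydrv_swap_folio,
};
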
@@ -0,0 +1,37 @@
#include <linux/memcontrol.h>

#ifdef CONFIG_EMM_MEMCG

struct emm_memcg_ops {
	int (*init)(struct mem_cgroup *memcg);
	void (*exit)(struct mem_cgroup *memcg);
};

int emm_memcg_init(struct mem_cgroup *memcg);
void emm_memcg_exit(struct mem_cgroup *memcg);

int emm_init(struct emm_memcg_ops *ops);
int emm_exit(void);

#else

static inline int emm_memcg_init(struct mem_cgroup *memcg)
{
	return 0;
}

static inline void emm_memcg_exit(struct mem_cgroup *memcg)
{
}

#endif

#ifdef CONFIG_EMM_RECLAIM

enum {
	EMM_RECLAIM,
	EMM_AGE,
	EMM_MIX,
};

#endif

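Editor's note: based on the header above, an EMM policy module registers per-memcg init/exit callbacks through emm_init() and drops them with emm_exit(); the per-cgroup pointer such a module can hang state on (emm_memcg_data) is added to struct mem_cgroup in the memcontrol.h hunk below. A minimal registration sketch follows; the module name, the header path and the stored state are illustrative assumptions.

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/memcontrol.h>
#include <linux/emm.h>			/* assumed path of the new header */

/* Illustrative per-memcg state hung off emm_memcg_data. */
struct my_emm_state {
	unsigned long sampled_events;
};

static int my_emm_memcg_init(struct mem_cgroup *memcg)
{
	struct my_emm_state *state = kzalloc(sizeof(*state), GFP_KERNEL);

	if (!state)
		return -ENOMEM;
	memcg->emm_memcg_data = state;
	return 0;
}

static void my_emm_memcg_exit(struct mem_cgroup *memcg)
{
	kfree(memcg->emm_memcg_data);
	memcg->emm_memcg_data = NULL;
}

static struct emm_memcg_ops my_emm_ops = {
	.init	= my_emm_memcg_init,
	.exit	= my_emm_memcg_exit,
};

static int __init my_emm_mod_init(void)
{
	return emm_init(&my_emm_ops);
}

static void __exit my_emm_mod_exit(void)
{
	emm_exit();
}

module_init(my_emm_mod_init);
module_exit(my_emm_mod_exit);
MODULE_LICENSE("GPL");
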
@@ -37,6 +37,10 @@ enum memcg_stat_item {
	MEMCG_KMEM,
	MEMCG_ZSWAP_B,
	MEMCG_ZSWAPPED,
#ifdef CONFIG_MEMCG_ZRAM
	MEMCG_ZRAM_B,
	MEMCG_ZRAMED,
#endif
	MEMCG_NR_STAT,
};

@@ -231,6 +235,11 @@ struct mem_cgroup {
	unsigned long zswap_max;
#endif

#ifdef CONFIG_MEMCG_ZRAM
	unsigned long zram_max;
	unsigned short zram_prio;
#endif

	unsigned long soft_limit;

	/* vmpressure notifications */
@@ -326,11 +335,6 @@ struct mem_cgroup {
	struct list_head event_list;
	spinlock_t event_list_lock;

	KABI_RESERVE(1);
	KABI_RESERVE(2);
	KABI_RESERVE(3);
	KABI_RESERVE(4);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	struct deferred_split deferred_split_queue;
#endif
@@ -340,6 +344,27 @@ struct mem_cgroup {
	struct lru_gen_mm_list mm_list;
#endif

#ifdef CONFIG_EMM_MEMCG
	/* EMM: for tracking cgroup level info on the fly and with high performance */
	void *emm_memcg_data;
#endif

#ifdef CONFIG_TEXT_UNEVICTABLE
	bool allow_unevictable;
	unsigned int unevictable_percent;
	/*
	 * the unevictable_size is larger than the real unevictable memory
	 * size, due to there may be multiple tasks sharing the same memory,
	 * such as binary and dynamic library sharing.
	 */
	atomic_long_t unevictable_size;
#endif

	KABI_RESERVE(1);
	KABI_RESERVE(2);
	KABI_RESERVE(3);
	KABI_RESERVE(4);

	struct mem_cgroup_per_node *nodeinfo[];
};

@@ -1046,8 +1071,8 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
	return x;
}

void mem_cgroup_flush_stats(void);
void mem_cgroup_flush_stats_ratelimited(void);
void mem_cgroup_flush_stats(struct mem_cgroup *memcg);
void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg);

void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
			      int val);
@@ -1531,11 +1556,11 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
	return node_page_state(lruvec_pgdat(lruvec), idx);
}

static inline void mem_cgroup_flush_stats(void)
static inline void mem_cgroup_flush_stats(struct mem_cgroup *memcg)
{
}

static inline void mem_cgroup_flush_stats_ratelimited(void)
static inline void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg)
{
}

@@ -444,6 +444,9 @@ extern unsigned int kobjsize(const void *objp);
/* This mask represents all the VMA flag bits used by mlock */
#define VM_LOCKED_MASK	(VM_LOCKED | VM_LOCKONFAULT)

/* This mask is used to clear all the VMA flags used by mlock */
#define VM_LOCKED_CLEAR_MASK	(~(VM_LOCKED | VM_LOCKONFAULT))

/* Arch-specific flags to clear when updating VM flags on protection change */
#ifndef VM_ARCH_CLEAR
# define VM_ARCH_CLEAR VM_NONE
@@ -2330,6 +2333,9 @@ static inline bool can_do_mlock(void) { return false; }
#endif
extern int user_shm_lock(size_t, struct ucounts *);
extern void user_shm_unlock(size_t, struct ucounts *);
extern int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
		struct vm_area_struct **prev, unsigned long start,
		unsigned long end, vm_flags_t newflags);

struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr,
			     pte_t pte);

@@ -132,6 +132,14 @@ static inline int lru_hist_from_seq(unsigned long seq)
	return seq % NR_HIST_GENS;
}

static inline int lru_hist_of_min_seq(struct lruvec *lruvec, bool type)
{
	if (IS_ENABLED(CONFIG_LRU_GEN_STATS))
		return lru_gen_from_seq(READ_ONCE(lruvec->lrugen.min_seq[type]));
	else
		return 0;
}

static inline int lru_tier_from_refs(int refs)
{
	VM_WARN_ON_ONCE(refs > BIT(LRU_REFS_WIDTH));
@@ -231,27 +239,22 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio,
	if (folio_test_unevictable(folio) || !lrugen->enabled)
		return false;
	/*
	 * There are four common cases for this page:
	 * 1. If it's hot, i.e., freshly faulted in, add it to the youngest
	 *    generation, and it's protected over the rest below.
	 * 2. If it can't be evicted immediately, i.e., a dirty page pending
	 *    writeback, add it to the second youngest generation.
	 * 3. If it should be evicted first, e.g., cold and clean from
	 *    folio_rotate_reclaimable(), add it to the oldest generation.
	 * 4. Everything else falls between 2 & 3 above and is added to the
	 *    second oldest generation if it's considered inactive, or the
	 *    oldest generation otherwise. See lru_gen_is_active().
	 * There are three common cases for this page:
	 * 1. If it's hot, e.g., freshly faulted in or previously hot and
	 *    migrated, add it to the youngest generation.
	 * 2. If it's cold but can't be evicted immediately, i.e., an anon page
	 *    not in swapcache or a dirty page pending writeback, add it to the
	 *    second oldest generation.
	 * 3. Everything else (clean, cold) is added to the oldest generation.
	 */
	if (folio_test_active(folio))
		seq = lrugen->max_seq;
	else if ((type == LRU_GEN_ANON && !folio_test_swapcache(folio)) ||
		 (folio_test_reclaim(folio) &&
		  (folio_test_dirty(folio) || folio_test_writeback(folio))))
		seq = lrugen->max_seq - 1;
	else if (reclaiming || lrugen->min_seq[type] + MIN_NR_GENS >= lrugen->max_seq)
		seq = lrugen->min_seq[type];
	else
		seq = lrugen->min_seq[type] + 1;
	else
		seq = lrugen->min_seq[type];

	gen = lru_gen_from_seq(seq);
	flags = (gen + 1UL) << LRU_GEN_PGOFF;

@@ -425,9 +425,9 @@ struct lru_gen_folio {
	/* the multi-gen LRU sizes, eventually consistent */
	long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
	/* the exponential moving average of refaulted */
	unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS];
	atomic_long_t avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS];
	/* the exponential moving average of evicted+protected */
	unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS];
	atomic_long_t avg_total[ANON_AND_FILE][MAX_NR_TIERS];
	/* the first tier doesn't need protection, hence the minus one */
	unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS - 1];
	/* can be modified without holding the LRU lock */
@@ -486,6 +486,9 @@ struct lru_gen_mm_walk {
	int batched;
	bool can_swap;
	bool force_scan;
#ifdef CONFIG_EMM_RECLAIM
	bool force_full_scan;
#endif
};

void lru_gen_init_lruvec(struct lruvec *lruvec);
@@ -626,8 +629,8 @@ struct lruvec {
	 */
	unsigned long anon_cost;
	unsigned long file_cost;
	/* Non-resident age, driven by LRU movement */
	atomic_long_t nonresident_age;
	/* Number of evictions (non-resident age) */
	atomic_long_t evictions[ANON_AND_FILE];
	/* Refaults at the time of last reclaim cycle */
	unsigned long refaults[ANON_AND_FILE];
	/* Various lruvec state flags (enum lruvec_flags) */
@@ -641,6 +644,18 @@ struct lruvec {
#ifdef CONFIG_MEMCG
	struct pglist_data *pgdat;
#endif

#ifdef CONFIG_EMM_WORKINGSET_TRACKING
	/* Non-resident file age, driven by LRU movement */
	atomic_long_t evicted_file;
	/* For estimating avg refault distance */
	unsigned long refault_count;
	unsigned long total_distance;
#endif

#ifdef CONFIG_EMM_MEMCG
	void *emm_lruvec_data;
#endif
};

/* Isolate unmapped pages */

@@ -112,4 +112,5 @@ extern void oom_killer_enable(void);

extern struct task_struct *find_lock_task_mm(struct task_struct *p);

extern int sysctl_oom_kill_largest_task;
#endif /* _INCLUDE_LINUX_OOM_H */

@@ -206,6 +206,9 @@ enum mapping_flags {
	AS_RELEASE_ALWAYS,	/* Call ->release_folio(), even if no private data */
	AS_STABLE_WRITES,	/* must wait for writeback before modifying
				   folio contents */
#ifdef CONFIG_EMM_RAMDISK_SWAP
	AS_RAM_SWAP,		/* ramdisk based swap space, XXX: rename to some thing commonly used */
#endif
};

/**
@@ -306,6 +309,13 @@ static inline void mapping_clear_stable_writes(struct address_space *mapping)
	clear_bit(AS_STABLE_WRITES, &mapping->flags);
}

#ifdef CONFIG_EMM_RAMDISK_SWAP
static inline int mapping_ram_swap(struct address_space *mapping)
{
	return !test_bit(AS_RAM_SWAP, &mapping->flags);
}
#endif

static inline gfp_t mapping_gfp_mask(struct address_space * mapping)
{
	return mapping->gfp_mask;

@@ -348,11 +348,9 @@ static inline swp_entry_t page_swap_entry(struct page *page)
}

/* linux/mm/workingset.c */
bool workingset_test_recent(void *shadow, bool file, bool *workingset);
void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages);
bool workingset_test_recent(void *shadow, bool file, bool *workingset, bool tracking);
void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg);
void workingset_refault(struct folio *folio, void *shadow);
void workingset_activation(struct folio *folio);

/* Only track the nodes of mappings with shadow entries */
void workingset_update_node(struct xa_node *node);
@@ -524,7 +522,7 @@ extern int swp_swapcount(swp_entry_t entry);
extern struct swap_info_struct *page_swap_info(struct page *);
extern struct swap_info_struct *swp_swap_info(swp_entry_t entry);
struct backing_dev_info;
extern int init_swap_address_space(unsigned int type, unsigned long nr_pages);
extern int init_swap_address_space(struct swap_info_struct *si, unsigned long nr_pages);
extern void exit_swap_address_space(unsigned int type);
extern struct swap_info_struct *get_swap_device(swp_entry_t entry);
sector_t swap_page_sector(struct page *page);

@@ -0,0 +1,67 @@
/* SPDX-License-Identifier: GPL-2.0 */

#ifndef _TEXT_UNEVICTABLE_H
#define _TEXT_UNEVICTABLE_H

struct mem_cgroup;

#ifdef CONFIG_TEXT_UNEVICTABLE
DECLARE_STATIC_KEY_FALSE(unevictable_enabled_key);

static inline bool unevictable_enabled(void)
{
	return static_branch_unlikely(&unevictable_enabled_key);
}
bool is_memcg_unevictable_enabled(struct mem_cgroup *memcg);
void memcg_increase_unevict_size(struct mem_cgroup *memcg, unsigned long size);
void memcg_decrease_unevict_size(struct mem_cgroup *memcg, unsigned long size);
bool is_unevictable_size_overflow(struct mem_cgroup *memcg);
unsigned long memcg_exstat_text_unevict_gather(struct mem_cgroup *memcg);
void mem_cgroup_can_unevictable(struct task_struct *tsk, struct mem_cgroup *to);
void mem_cgroup_cancel_unevictable(struct cgroup_taskset *tset);
void memcg_all_processes_unevict(struct mem_cgroup *memcg, bool enable);
void del_unevict_task(struct task_struct *tsk);
void clean_task_unevict_size(struct task_struct *tsk);
#else
static inline bool unevictable_enabled(void)
{
	return false;
}
static inline bool is_memcg_unevictable_enabled(struct mem_cgroup *memcg)
{
	return false;
}
static inline void memcg_increase_unevict_size(struct mem_cgroup *memcg,
					       unsigned long size)
{
}
static inline void memcg_decrease_unevict_size(struct mem_cgroup *memcg,
					       unsigned long size)
{
}
static inline bool is_unevictable_size_overflow(struct mem_cgroup *memcg)
{
	return false;
}
static inline unsigned long memcg_exstat_text_unevict_gather(struct mem_cgroup *memcg)
{
	return 0;
}
static inline void mem_cgroup_can_unevictable(struct task_struct *tsk,
					      struct mem_cgroup *to)
{
}
static inline void mem_cgroup_cancel_unevictable(struct cgroup_taskset *tset)
{
}
static inline void memcg_all_processes_unevict(struct mem_cgroup *memcg, bool enable)
{
}
static inline void del_unevict_task(struct task_struct *tsk)
{
}
static inline void clean_task_unevict_size(struct task_struct *tsk)
{
}
#endif
#endif

init/Kconfig (16 changes)
@@ -958,6 +958,22 @@ config MEMCG_KMEM
	depends on MEMCG
	default y

config MEMCG_KMEM_DEFAULT_OFF
	bool "Disable kernel memory cgroup accounting by default"
	depends on MEMCG_KMEM
	help
	  Disable kernel memory cgroup accounting by default, since it
	  have extra overhead. User may override this at boot time
	  kmem by passing cgroup.memory=nokmem or cgroup.memory=kmem
	  to kernel cmdline.

	  If unsure, say N.

config MEMCG_ZRAM
	bool
	depends on MEMCG && SWAP
	default y

config BLK_CGROUP
	bool "IO controller"
	depends on BLOCK

@@ -4387,6 +4387,7 @@ int cgroup_rm_cftypes(struct cftype *cfts)
	cgroup_unlock();
	return 0;
}
EXPORT_SYMBOL_GPL(cgroup_rm_cftypes);

/**
 * cgroup_add_cftypes - add an array of cftypes to a subsystem
@@ -4443,6 +4444,7 @@ int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
		cft->flags |= __CFTYPE_ONLY_ON_DFL;
	return cgroup_add_cftypes(ss, cfts);
}
EXPORT_SYMBOL_GPL(cgroup_add_dfl_cftypes);

/**
 * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies
@@ -4460,6 +4462,7 @@ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
		cft->flags |= __CFTYPE_NOT_ON_DFL;
	return cgroup_add_cftypes(ss, cfts);
}
EXPORT_SYMBOL_GPL(cgroup_add_legacy_cftypes);

/**
 * cgroup_file_notify - generate a file modified event for a cgroup_file

@ -238,6 +238,7 @@ __bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp)
|
|||
cgroup_rstat_flush_locked(cgrp);
|
||||
spin_unlock_irq(&cgroup_rstat_lock);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(cgroup_rstat_flush);
|
||||
|
||||
/**
|
||||
* cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold
|
||||
|
|
|
@ -69,6 +69,9 @@
|
|||
#include <linux/rethook.h>
|
||||
#include <linux/sysfs.h>
|
||||
#include <linux/user_events.h>
|
||||
#ifdef CONFIG_TEXT_UNEVICTABLE
|
||||
#include <linux/unevictable.h>
|
||||
#endif
|
||||
|
||||
#include <linux/uaccess.h>
|
||||
#include <asm/unistd.h>
|
||||
|
@ -856,6 +859,9 @@ void __noreturn do_exit(long code)
|
|||
tsk->exit_code = code;
|
||||
taskstats_exit(tsk, group_dead);
|
||||
|
||||
#ifdef CONFIG_TEXT_UNEVICTABLE
|
||||
clean_task_unevict_size(tsk);
|
||||
#endif
|
||||
exit_mm();
|
||||
|
||||
if (group_dead)
|
||||
|
|
|
@ -134,12 +134,22 @@ static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT;
|
|||
defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT)
|
||||
int sysctl_legacy_va_layout;
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_CGROUPFS
|
||||
extern int container_cpuquota_aware;
|
||||
extern int cgroupfs_stat_show_cpuacct_info;
|
||||
int cgroupfs_mounted;
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_EMM_FORCE_SWAPPINESS
|
||||
extern int sysctl_vm_force_swappiness;
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_EMM_RAMDISK_SWAP
|
||||
extern int sysctl_vm_ramdisk_swaptune;
|
||||
extern int sysctl_vm_swapcache_fastfree;
|
||||
#endif
|
||||
|
||||
#endif /* CONFIG_SYSCTL */
|
||||
|
||||
/*
|
||||
|
@ -2231,6 +2241,15 @@ static struct ctl_table vm_table[] = {
|
|||
.mode = 0644,
|
||||
.proc_handler = overcommit_kbytes_handler,
|
||||
},
|
||||
{
|
||||
.procname = "oom_kill_largest_task",
|
||||
.data = &sysctl_oom_kill_largest_task,
|
||||
.maxlen = sizeof(sysctl_oom_kill_largest_task),
|
||||
.mode = 0644,
|
||||
.proc_handler = &proc_dointvec_minmax,
|
||||
.extra1 = SYSCTL_ZERO,
|
||||
.extra2 = SYSCTL_ONE,
|
||||
},
|
||||
{
|
||||
.procname = "page-cluster",
|
||||
.data = &page_cluster,
|
||||
|
@ -2257,6 +2276,37 @@ static struct ctl_table vm_table[] = {
|
|||
.extra1 = SYSCTL_ZERO,
|
||||
.extra2 = SYSCTL_TWO_HUNDRED,
|
||||
},
|
||||
#ifdef CONFIG_EMM_FORCE_SWAPPINESS
|
||||
{
|
||||
.procname = "force_swappiness",
|
||||
.data = &sysctl_vm_force_swappiness,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec_minmax,
|
||||
.extra1 = SYSCTL_ZERO,
|
||||
.extra2 = SYSCTL_ONE,
|
||||
},
|
||||
#endif
|
||||
#ifdef CONFIG_EMM_RAMDISK_SWAP
|
||||
{
|
||||
.procname = "ramdisk_swaptune",
|
||||
.data = &sysctl_vm_ramdisk_swaptune,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec_minmax,
|
||||
.extra1 = SYSCTL_ZERO,
|
||||
.extra2 = SYSCTL_ONE,
|
||||
},
|
||||
{
|
||||
.procname = "swapcache_fastfree",
|
||||
.data = &sysctl_vm_swapcache_fastfree,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec_minmax,
|
||||
.extra1 = SYSCTL_ZERO,
|
||||
.extra2 = SYSCTL_ONE,
|
||||
},
|
||||
#endif
|
||||
#ifdef CONFIG_NUMA
|
||||
{
|
||||
.procname = "numa_stat",
|
||||
|
|
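The vm_table entries above surface the new EMM knobs as 0/1 toggles under /proc/sys/vm, clamped by proc_dointvec_minmax. Below is a minimal userspace sketch of flipping them; the set_vm_knob() helper and hard-coded paths are illustrative only, and the knobs exist only when CONFIG_EMM_FORCE_SWAPPINESS and CONFIG_EMM_RAMDISK_SWAP are built in.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Write "0" or "1" to a /proc/sys/vm knob added by this series. */
static int set_vm_knob(const char *name, int val)
{
	char path[128], buf[4];
	int fd, ret = 0;

	snprintf(path, sizeof(path), "/proc/sys/vm/%s", name);
	snprintf(buf, sizeof(buf), "%d\n", !!val);

	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	if (write(fd, buf, strlen(buf)) < 0)
		ret = -1;
	close(fd);
	return ret;
}

int main(void)
{
	/* Only present when the corresponding EMM config options are enabled. */
	if (set_vm_knob("force_swappiness", 1))
		perror("force_swappiness");
	if (set_vm_knob("ramdisk_swaptune", 1))
		perror("ramdisk_swaptune");
	return 0;
}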
mm/Kconfig
@@ -1268,4 +1268,93 @@ config PAGECACHE_LIMIT

	  If unsure, say N.

config ENHANCED_MM
	bool "Enable enhanced mm support (EMM)"
	depends on MEMCG
	default n
	help
	  Support for EMM, including the in-kernel API, extended interfaces,
	  and reserved extra data structures.

	  If unsure, say N.

config EMM_FORCE_SWAPPINESS
	bool "Prevent kswapd from reclaiming any anon pages"
	depends on ENHANCED_MM
	depends on SWAP
	default n
	help
	  This option prevents kswapd from reclaiming any anon pages when
	  swappiness is set to 0.

	  By default, to prevent system-wide OOM, kswapd will reclaim anon
	  pages for the root cgroup even if swappiness is set to 0. This
	  option overrides that behaviour.

	  If unsure, say N.

config EMM_RAMDISK_SWAP
	bool "Tune ramdisk-based swaps"
	depends on ENHANCED_MM
	depends on SWAP
	default n
	help
	  This option enables a few tunings for ramdisk-based swaps, and
	  makes swap work better with a memcg-bounded ramdisk.

	  If unsure, say N.

config EMM_WORKINGSET_TRACKING
	bool "Evaluate memory eviction usage"
	depends on ENHANCED_MM
	default n
	help
	  Evaluate per-cgroup memory eviction status. This helps estimate
	  the maximum, or actual, potential workingset growth pattern when
	  active shrinkers are enabled. Because active shrinkers may reduce
	  the in-memory LRU size and change the workingset size, estimating
	  a "raw workingset" can help analyze and improve memory usage.

config EMM_MEMCG
	bool "Enhanced memory management support for memcg."
	depends on ENHANCED_MM
	depends on MEMCG
	default y
	help
	  This enables enhanced memory management support for memcg.

config EMM_RECLAIM
	bool "Enhanced memory reclaim support."
	depends on ENHANCED_MM
	depends on MEMCG
	default y
	help
	  This enables enhanced memory reclaim support.

config EMM_ZRAM_CONF
	bool "A placeholder to ensure required ZRAM configs are enabled."
	select CRYPTO_LZO
	select CRYPTO_ZSTD
	select CRYPTO_LZ4
	select CRYPTO_LZ4HC
	select MEMCG_ZRAM
	select ZSMALLOC
	default n
	help
	  A placeholder to ensure the required ZRAM configuration options
	  are enabled.

config TEXT_UNEVICTABLE
	bool "Enable memcg-granularity code section unevictable support"
	depends on MEMCG
	default n
	help
	  This feature pins the code sections of processes in a memcg, for
	  the corresponding VMAs, much like mlock does. It is added here to
	  prevent performance jitter when the code sections of key
	  applications are reclaimed in a memcg.

	  If unsure, say N.

endmenu

@ -99,6 +99,7 @@ obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
|
|||
ifdef CONFIG_SWAP
|
||||
obj-$(CONFIG_MEMCG) += swap_cgroup.o
|
||||
endif
|
||||
obj-$(CONFIG_EMM_MEMCG) += emm.o
|
||||
obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
|
||||
obj-$(CONFIG_GUP_TEST) += gup_test.o
|
||||
obj-$(CONFIG_DMAPOOL_TEST) += dmapool_test.o
|
||||
|
@ -138,3 +139,4 @@ obj-$(CONFIG_IO_MAPPING) += io-mapping.o
|
|||
obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o
|
||||
obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o
|
||||
obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
|
||||
obj-y += unevictable.o
|
|
@@ -0,0 +1,109 @@
// SPDX-License-Identifier: GPL-2.0

#include <linux/compiler.h>
#include <linux/mm_types.h>
#include <linux/cgroup.h>
#include <linux/memcontrol.h>
#include <linux/emm.h>

#include <asm-generic/bug.h>

struct emm_memcg_ops *__emm_memcg_ops __read_mostly;

static int _emm_do_memcg_init(struct mem_cgroup *memcg)
{
	struct emm_memcg_ops *ops;

	lockdep_assert_held(&cgroup_mutex);

	ops = READ_ONCE(__emm_memcg_ops);

	if (ops)
		return ops->init(memcg);

	return 0;
}

static void _emm_do_memcg_exit(struct mem_cgroup *memcg)
{
	struct emm_memcg_ops *ops;

	lockdep_assert_held(&cgroup_mutex);

	ops = READ_ONCE(__emm_memcg_ops);

	if (ops)
		ops->exit(memcg);
}

int emm_memcg_init(struct mem_cgroup *memcg)
{
	return _emm_do_memcg_init(memcg);
}

void emm_memcg_exit(struct mem_cgroup *memcg)
{
	/* cgroup should be dying */
	WARN_ON_ONCE(!css_is_dying(&memcg->css));

	_emm_do_memcg_exit(memcg);
}

int emm_init(struct emm_memcg_ops *ops)
{
	int ret = 0;
	struct mem_cgroup *memcg;

	/*
	 * Going to iterate through existing cgroups,
	 * also use it to protect __emm_memcg_ops
	 */
	cgroup_lock();

	if (READ_ONCE(__emm_memcg_ops)) {
		ret = -EBUSY;
		goto out;
	}

	WRITE_ONCE(__emm_memcg_ops, ops);

	memcg = mem_cgroup_iter(NULL, NULL, NULL);
	do {
		_emm_do_memcg_init(memcg);
	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));

out:
	cgroup_unlock();

	return ret;
}
EXPORT_SYMBOL(emm_init);

int emm_exit(void)
{
	int ret = 0;
	struct mem_cgroup *memcg;

	/*
	 * Going to iterate through existing cgroups,
	 * also use it to protect __emm_memcg_ops
	 */
	cgroup_lock();

	if (!READ_ONCE(__emm_memcg_ops)) {
		ret = -EINVAL;
		goto out;
	}

	memcg = mem_cgroup_iter(NULL, NULL, NULL);
	do {
		_emm_do_memcg_exit(memcg);
	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));

	WRITE_ONCE(__emm_memcg_ops, NULL);
out:
	cgroup_unlock();

	return ret;
}
EXPORT_SYMBOL(emm_exit);
|
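The EMM core above keeps a single __emm_memcg_ops table and, under cgroup_mutex, replays init()/exit() over every existing memcg when a provider registers or unregisters; emm_init() fails with -EBUSY so only one backend can be active at a time. A rough sketch of how an out-of-tree policy module might plug into it is shown below. It assumes struct emm_memcg_ops (declared in the new linux/emm.h, which is not part of this excerpt) carries exactly the init and exit callbacks invoked above; the my_emm_* names are made up for illustration.

#include <linux/module.h>
#include <linux/memcontrol.h>
#include <linux/emm.h>

/* Illustrative callbacks: set up / tear down per-memcg policy state. */
static int my_emm_memcg_init(struct mem_cgroup *memcg)
{
	/* Called under cgroup_mutex for every existing and future memcg. */
	return 0;
}

static void my_emm_memcg_exit(struct mem_cgroup *memcg)
{
	/* Called when the memcg dies, or when the provider unregisters. */
}

static struct emm_memcg_ops my_emm_ops = {
	.init = my_emm_memcg_init,
	.exit = my_emm_memcg_exit,
};

static int __init my_emm_module_init(void)
{
	/* -EBUSY if another EMM provider is already registered. */
	return emm_init(&my_emm_ops);
}

static void __exit my_emm_module_exit(void)
{
	emm_exit();
}

module_init(my_emm_module_init);
module_exit(my_emm_module_exit);
MODULE_LICENSE("GPL");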
@ -4195,7 +4195,7 @@ static void filemap_cachestat(struct address_space *mapping,
|
|||
shadow = get_shadow_from_swap_cache(swp);
|
||||
}
|
||||
#endif
|
||||
if (workingset_test_recent(shadow, true, &workingset))
|
||||
if (workingset_test_recent(shadow, true, &workingset, false))
|
||||
cs->nr_recently_evicted += nr_pages;
|
||||
|
||||
goto resched;
|
||||
|
|
mm/memcontrol.c
|
@ -63,12 +63,16 @@
|
|||
#include <linux/resume_user_mode.h>
|
||||
#include <linux/psi.h>
|
||||
#include <linux/seq_buf.h>
|
||||
#include <linux/emm.h>
|
||||
#include <linux/sched/isolation.h>
|
||||
#include "internal.h"
|
||||
#include <net/sock.h>
|
||||
#include <net/ip.h>
|
||||
#include "slab.h"
|
||||
#include "swap.h"
|
||||
#ifdef CONFIG_TEXT_UNEVICTABLE
|
||||
#include <linux/unevictable.h>
|
||||
#endif
|
||||
|
||||
#include <linux/uaccess.h>
|
||||
|
||||
|
@ -78,6 +82,7 @@ struct cgroup_subsys memory_cgrp_subsys __read_mostly;
|
|||
EXPORT_SYMBOL(memory_cgrp_subsys);
|
||||
|
||||
struct mem_cgroup *root_mem_cgroup __read_mostly;
|
||||
EXPORT_SYMBOL_GPL(root_mem_cgroup);
|
||||
|
||||
/* Active memory cgroup to use from an interrupt context */
|
||||
DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
|
||||
|
@ -87,7 +92,8 @@ EXPORT_PER_CPU_SYMBOL_GPL(int_active_memcg);
|
|||
static bool cgroup_memory_nosocket __ro_after_init;
|
||||
|
||||
/* Kernel memory accounting disabled? */
|
||||
static bool cgroup_memory_nokmem __ro_after_init;
|
||||
bool cgroup_memory_nokmem __ro_after_init = IS_ENABLED(CONFIG_MEMCG_KMEM_DEFAULT_OFF);
|
||||
EXPORT_SYMBOL(cgroup_memory_nokmem);
|
||||
|
||||
/* BPF memory accounting disabled? */
|
||||
static bool cgroup_memory_nobpf __ro_after_init;
|
||||
|
@ -570,116 +576,6 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
|
|||
return mz;
|
||||
}
|
||||
|
||||
/*
|
||||
* memcg and lruvec stats flushing
|
||||
*
|
||||
* Many codepaths leading to stats update or read are performance sensitive and
|
||||
* adding stats flushing in such codepaths is not desirable. So, to optimize the
|
||||
* flushing the kernel does:
|
||||
*
|
||||
* 1) Periodically and asynchronously flush the stats every 2 seconds to not let
|
||||
* rstat update tree grow unbounded.
|
||||
*
|
||||
* 2) Flush the stats synchronously on reader side only when there are more than
|
||||
* (MEMCG_CHARGE_BATCH * nr_cpus) update events. Though this optimization
|
||||
* will let stats be out of sync by atmost (MEMCG_CHARGE_BATCH * nr_cpus) but
|
||||
* only for 2 seconds due to (1).
|
||||
*/
|
||||
static void flush_memcg_stats_dwork(struct work_struct *w);
|
||||
static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
|
||||
static DEFINE_PER_CPU(unsigned int, stats_updates);
|
||||
static atomic_t stats_flush_ongoing = ATOMIC_INIT(0);
|
||||
static atomic_t stats_flush_threshold = ATOMIC_INIT(0);
|
||||
static u64 flush_next_time;
|
||||
|
||||
#define FLUSH_TIME (2UL*HZ)
|
||||
|
||||
/*
|
||||
* Accessors to ensure that preemption is disabled on PREEMPT_RT because it can
|
||||
* not rely on this as part of an acquired spinlock_t lock. These functions are
|
||||
* never used in hardirq context on PREEMPT_RT and therefore disabling preemtion
|
||||
* is sufficient.
|
||||
*/
|
||||
static void memcg_stats_lock(void)
|
||||
{
|
||||
preempt_disable_nested();
|
||||
VM_WARN_ON_IRQS_ENABLED();
|
||||
}
|
||||
|
||||
static void __memcg_stats_lock(void)
|
||||
{
|
||||
preempt_disable_nested();
|
||||
}
|
||||
|
||||
static void memcg_stats_unlock(void)
|
||||
{
|
||||
preempt_enable_nested();
|
||||
}
|
||||
|
||||
static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
|
||||
{
|
||||
unsigned int x;
|
||||
|
||||
if (!val)
|
||||
return;
|
||||
|
||||
cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
|
||||
|
||||
x = __this_cpu_add_return(stats_updates, abs(val));
|
||||
if (x > MEMCG_CHARGE_BATCH) {
|
||||
/*
|
||||
* If stats_flush_threshold exceeds the threshold
|
||||
* (>num_online_cpus()), cgroup stats update will be triggered
|
||||
* in __mem_cgroup_flush_stats(). Increasing this var further
|
||||
* is redundant and simply adds overhead in atomic update.
|
||||
*/
|
||||
if (atomic_read(&stats_flush_threshold) <= num_online_cpus())
|
||||
atomic_add(x / MEMCG_CHARGE_BATCH, &stats_flush_threshold);
|
||||
__this_cpu_write(stats_updates, 0);
|
||||
}
|
||||
}
|
||||
|
||||
static void do_flush_stats(void)
|
||||
{
|
||||
/*
|
||||
* We always flush the entire tree, so concurrent flushers can just
|
||||
* skip. This avoids a thundering herd problem on the rstat global lock
|
||||
* from memcg flushers (e.g. reclaim, refault, etc).
|
||||
*/
|
||||
if (atomic_read(&stats_flush_ongoing) ||
|
||||
atomic_xchg(&stats_flush_ongoing, 1))
|
||||
return;
|
||||
|
||||
WRITE_ONCE(flush_next_time, jiffies_64 + 2*FLUSH_TIME);
|
||||
|
||||
cgroup_rstat_flush(root_mem_cgroup->css.cgroup);
|
||||
|
||||
atomic_set(&stats_flush_threshold, 0);
|
||||
atomic_set(&stats_flush_ongoing, 0);
|
||||
}
|
||||
|
||||
void mem_cgroup_flush_stats(void)
|
||||
{
|
||||
if (atomic_read(&stats_flush_threshold) > num_online_cpus())
|
||||
do_flush_stats();
|
||||
}
|
||||
|
||||
void mem_cgroup_flush_stats_ratelimited(void)
|
||||
{
|
||||
if (time_after64(jiffies_64, READ_ONCE(flush_next_time)))
|
||||
mem_cgroup_flush_stats();
|
||||
}
|
||||
|
||||
static void flush_memcg_stats_dwork(struct work_struct *w)
|
||||
{
|
||||
/*
|
||||
* Always flush here so that flushing in latency-sensitive paths is
|
||||
* as cheap as possible.
|
||||
*/
|
||||
do_flush_stats();
|
||||
queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
|
||||
}
|
||||
|
||||
/* Subset of vm_event_item to report for memcg event stats */
|
||||
static const unsigned int memcg_vm_event_stat[] = {
|
||||
PGPGIN,
|
||||
|
@ -724,6 +620,15 @@ static inline int memcg_events_index(enum vm_event_item idx)
|
|||
}
|
||||
|
||||
struct memcg_vmstats_percpu {
|
||||
/* Stats updates since the last flush */
|
||||
unsigned int stats_updates;
|
||||
|
||||
/* Cached pointers for fast iteration in memcg_rstat_updated() */
|
||||
struct memcg_vmstats_percpu *parent;
|
||||
struct memcg_vmstats *vmstats;
|
||||
|
||||
/* The above should fit a single cacheline for memcg_rstat_updated() */
|
||||
|
||||
/* Local (CPU and cgroup) page state & events */
|
||||
long state[MEMCG_NR_STAT];
|
||||
unsigned long events[NR_MEMCG_EVENTS];
|
||||
|
@ -735,7 +640,7 @@ struct memcg_vmstats_percpu {
|
|||
/* Cgroup1: threshold notifications & softlimit tree updates */
|
||||
unsigned long nr_page_events;
|
||||
unsigned long targets[MEM_CGROUP_NTARGETS];
|
||||
};
|
||||
} ____cacheline_aligned;
|
||||
|
||||
struct memcg_vmstats {
|
||||
/* Aggregated (CPU and subtree) page state & events */
|
||||
|
@ -749,8 +654,133 @@ struct memcg_vmstats {
|
|||
/* Pending child counts during tree propagation */
|
||||
long state_pending[MEMCG_NR_STAT];
|
||||
unsigned long events_pending[NR_MEMCG_EVENTS];
|
||||
|
||||
/* Stats updates since the last flush */
|
||||
atomic64_t stats_updates;
|
||||
};
|
||||
|
||||
/*
|
||||
* memcg and lruvec stats flushing
|
||||
*
|
||||
* Many codepaths leading to stats update or read are performance sensitive and
|
||||
* adding stats flushing in such codepaths is not desirable. So, to optimize the
|
||||
* flushing the kernel does:
|
||||
*
|
||||
* 1) Periodically and asynchronously flush the stats every 2 seconds to not let
|
||||
* rstat update tree grow unbounded.
|
||||
*
|
||||
* 2) Flush the stats synchronously on reader side only when there are more than
|
||||
* (MEMCG_CHARGE_BATCH * nr_cpus) update events. Though this optimization
|
||||
* will let stats be out of sync by atmost (MEMCG_CHARGE_BATCH * nr_cpus) but
|
||||
* only for 2 seconds due to (1).
|
||||
*/
|
||||
static void flush_memcg_stats_dwork(struct work_struct *w);
|
||||
static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
|
||||
static u64 flush_last_time;
|
||||
|
||||
#define FLUSH_TIME (2UL*HZ)
|
||||
|
||||
/*
|
||||
* Accessors to ensure that preemption is disabled on PREEMPT_RT because it can
|
||||
* not rely on this as part of an acquired spinlock_t lock. These functions are
|
||||
* never used in hardirq context on PREEMPT_RT and therefore disabling preemtion
|
||||
* is sufficient.
|
||||
*/
|
||||
static void memcg_stats_lock(void)
|
||||
{
|
||||
preempt_disable_nested();
|
||||
VM_WARN_ON_IRQS_ENABLED();
|
||||
}
|
||||
|
||||
static void __memcg_stats_lock(void)
|
||||
{
|
||||
preempt_disable_nested();
|
||||
}
|
||||
|
||||
static void memcg_stats_unlock(void)
|
||||
{
|
||||
preempt_enable_nested();
|
||||
}
|
||||
|
||||
|
||||
static bool memcg_vmstats_needs_flush(struct memcg_vmstats *vmstats)
|
||||
{
|
||||
return atomic64_read(&vmstats->stats_updates) >
|
||||
MEMCG_CHARGE_BATCH * num_online_cpus();
|
||||
}
|
||||
|
||||
static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
|
||||
{
|
||||
struct memcg_vmstats_percpu *statc;
|
||||
int cpu = smp_processor_id();
|
||||
|
||||
if (!val)
|
||||
return;
|
||||
|
||||
cgroup_rstat_updated(memcg->css.cgroup, cpu);
|
||||
statc = this_cpu_ptr(memcg->vmstats_percpu);
|
||||
for (; statc; statc = statc->parent) {
|
||||
statc->stats_updates += abs(val);
|
||||
if (statc->stats_updates < MEMCG_CHARGE_BATCH)
|
||||
continue;
|
||||
|
||||
/*
|
||||
* If @memcg is already flush-able, increasing stats_updates is
|
||||
* redundant. Avoid the overhead of the atomic update.
|
||||
*/
|
||||
if (!memcg_vmstats_needs_flush(statc->vmstats))
|
||||
atomic64_add(statc->stats_updates,
|
||||
&statc->vmstats->stats_updates);
|
||||
statc->stats_updates = 0;
|
||||
}
|
||||
}
|
||||
|
||||
static void do_flush_stats(struct mem_cgroup *memcg)
|
||||
{
|
||||
if (mem_cgroup_is_root(memcg))
|
||||
WRITE_ONCE(flush_last_time, jiffies_64);
|
||||
|
||||
cgroup_rstat_flush(memcg->css.cgroup);
|
||||
}
|
||||
|
||||
/*
|
||||
* mem_cgroup_flush_stats - flush the stats of a memory cgroup subtree
|
||||
* @memcg: root of the subtree to flush
|
||||
*
|
||||
* Flushing is serialized by the underlying global rstat lock. There is also a
|
||||
* minimum amount of work to be done even if there are no stat updates to flush.
|
||||
* Hence, we only flush the stats if the updates delta exceeds a threshold. This
|
||||
* avoids unnecessary work and contention on the underlying lock.
|
||||
*/
|
||||
void mem_cgroup_flush_stats(struct mem_cgroup *memcg)
|
||||
{
|
||||
if (mem_cgroup_disabled())
|
||||
return;
|
||||
|
||||
if (!memcg)
|
||||
memcg = root_mem_cgroup;
|
||||
|
||||
if (memcg_vmstats_needs_flush(memcg->vmstats))
|
||||
do_flush_stats(memcg);
|
||||
}
|
||||
|
||||
void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg)
|
||||
{
|
||||
/* Only flush if the periodic flusher is one full cycle late */
|
||||
if (time_after64(jiffies_64, READ_ONCE(flush_last_time) + 2*FLUSH_TIME))
|
||||
mem_cgroup_flush_stats(memcg);
|
||||
}
|
||||
|
||||
static void flush_memcg_stats_dwork(struct work_struct *w)
|
||||
{
|
||||
/*
|
||||
* Deliberately ignore memcg_vmstats_needs_flush() here so that flushing
|
||||
* in latency-sensitive paths is as cheap as possible.
|
||||
*/
|
||||
do_flush_stats(root_mem_cgroup);
|
||||
queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
|
||||
}
|
||||
|
||||
unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
|
||||
{
|
||||
long x = READ_ONCE(memcg->vmstats->state[idx]);
|
||||
|
@ -760,6 +790,7 @@ unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
|
|||
#endif
|
||||
return x;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(memcg_page_state);
|
||||
|
||||
/**
|
||||
* __mod_memcg_state - update cgroup memory statistics
|
||||
|
@ -775,6 +806,7 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
|
|||
__this_cpu_add(memcg->vmstats_percpu->state[idx], val);
|
||||
memcg_rstat_updated(memcg, val);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__mod_memcg_state);
|
||||
|
||||
/* idx can be of type enum memcg_stat_item or node_stat_item. */
|
||||
static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
|
||||
|
@ -1198,6 +1230,7 @@ out_unlock:
|
|||
|
||||
return memcg;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(mem_cgroup_iter);
|
||||
|
||||
/**
|
||||
* mem_cgroup_iter_break - abort a hierarchy walk prematurely
|
||||
|
@ -1503,6 +1536,10 @@ static const struct memory_stat memory_stats[] = {
|
|||
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
|
||||
{ "zswap", MEMCG_ZSWAP_B },
|
||||
{ "zswapped", MEMCG_ZSWAPPED },
|
||||
#endif
|
||||
#ifdef CONFIG_MEMCG_ZRAM
|
||||
{ "zram", MEMCG_ZRAM_B },
|
||||
{ "zrammed", MEMCG_ZRAMED },
|
||||
#endif
|
||||
{ "file_mapped", NR_FILE_MAPPED },
|
||||
{ "file_dirty", NR_FILE_DIRTY },
|
||||
|
@ -1539,6 +1576,7 @@ static int memcg_page_state_unit(int item)
|
|||
switch (item) {
|
||||
case MEMCG_PERCPU_B:
|
||||
case MEMCG_ZSWAP_B:
|
||||
case MEMCG_ZRAM_B:
|
||||
case NR_SLAB_RECLAIMABLE_B:
|
||||
case NR_SLAB_UNRECLAIMABLE_B:
|
||||
case WORKINGSET_REFAULT_ANON:
|
||||
|
@ -1576,7 +1614,7 @@ static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
|
|||
*
|
||||
* Current memory state:
|
||||
*/
|
||||
mem_cgroup_flush_stats();
|
||||
mem_cgroup_flush_stats(memcg);
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
|
||||
u64 size;
|
||||
|
@ -3400,11 +3438,13 @@ int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
|
|||
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(obj_cgroup_charge);
|
||||
|
||||
void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
|
||||
{
|
||||
refill_obj_stock(objcg, size, true);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(obj_cgroup_uncharge);
|
||||
|
||||
#endif /* CONFIG_MEMCG_KMEM */
|
||||
|
||||
|
@ -3675,6 +3715,10 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
|
|||
global_node_page_state(NR_ANON_MAPPED);
|
||||
if (swap)
|
||||
val += total_swap_pages - get_nr_swap_pages();
|
||||
#ifdef CONFIG_MEMCG_ZRAM
|
||||
else
|
||||
val += memcg_page_state(memcg, MEMCG_ZRAM_B) / PAGE_SIZE;
|
||||
#endif
|
||||
} else {
|
||||
if (!swap)
|
||||
val = page_counter_read(&memcg->memory);
|
||||
|
@ -4026,7 +4070,7 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v)
|
|||
int nid;
|
||||
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
|
||||
|
||||
mem_cgroup_flush_stats();
|
||||
mem_cgroup_flush_stats(memcg);
|
||||
|
||||
for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
|
||||
seq_printf(m, "%s=%lu", stat->name,
|
||||
|
@ -4101,7 +4145,7 @@ static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
|
|||
|
||||
BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
|
||||
|
||||
mem_cgroup_flush_stats();
|
||||
mem_cgroup_flush_stats(memcg);
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
|
||||
unsigned long nr;
|
||||
|
@ -4173,6 +4217,18 @@ static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
|
|||
#endif
|
||||
}
|
||||
|
||||
#ifdef CONFIG_TEXT_UNEVICTABLE
|
||||
static int memcg_unevict_size_show(struct seq_file *m, void *v)
|
||||
{
|
||||
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
|
||||
|
||||
seq_printf(m, "unevictable_text_size_kb %lu\n",
|
||||
memcg_exstat_text_unevict_gather(memcg) >> 10);
|
||||
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
|
||||
struct cftype *cft)
|
||||
{
|
||||
|
@ -4603,7 +4659,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
|
|||
struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
|
||||
struct mem_cgroup *parent;
|
||||
|
||||
mem_cgroup_flush_stats();
|
||||
mem_cgroup_flush_stats(memcg);
|
||||
|
||||
*pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
|
||||
*pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
|
||||
|
@ -5339,8 +5395,6 @@ static int mem_cgroup_vmstat_read(struct seq_file *m, void *vv)
|
|||
return mem_cgroup_vmstat_read_comm(m, vv, memcg);
|
||||
}
|
||||
|
||||
static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
|
||||
size_t nbytes, loff_t off);
|
||||
static u64 memory_current_read(struct cgroup_subsys_state *css,
|
||||
struct cftype *cft);
|
||||
static int memory_low_show(struct seq_file *m, void *v);
|
||||
|
@ -5354,12 +5408,77 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
|
|||
char *buf, size_t nbytes, loff_t off);
|
||||
static int memory_events_show(struct seq_file *m, void *v);
|
||||
|
||||
#ifdef CONFIG_TEXT_UNEVICTABLE
|
||||
static u64 mem_cgroup_allow_unevictable_read(struct cgroup_subsys_state *css,
|
||||
struct cftype *cft)
|
||||
{
|
||||
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
|
||||
|
||||
return memcg->allow_unevictable;
|
||||
}
|
||||
|
||||
static int mem_cgroup_allow_unevictable_write(struct cgroup_subsys_state *css,
|
||||
struct cftype *cft, u64 val)
|
||||
{
|
||||
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
|
||||
|
||||
if (val > 1)
|
||||
return -EINVAL;
|
||||
if (memcg->allow_unevictable == val)
|
||||
return 0;
|
||||
|
||||
memcg->allow_unevictable = val;
|
||||
if (val)
|
||||
memcg_all_processes_unevict(memcg, true);
|
||||
else
|
||||
memcg_all_processes_unevict(memcg, false);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static u64 mem_cgroup_unevictable_percent_read(struct cgroup_subsys_state *css,
|
||||
struct cftype *cft)
|
||||
{
|
||||
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
|
||||
|
||||
return memcg->unevictable_percent;
|
||||
}
|
||||
|
||||
static int mem_cgroup_unevictable_percent_write(struct cgroup_subsys_state *css,
|
||||
struct cftype *cft, u64 val)
|
||||
{
|
||||
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
|
||||
|
||||
if (val > 100)
|
||||
return -EINVAL;
|
||||
|
||||
memcg->unevictable_percent = val;
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
static struct cftype mem_cgroup_legacy_files[] = {
|
||||
{
|
||||
.name = "usage_in_bytes",
|
||||
.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
|
||||
.read_u64 = mem_cgroup_read_u64,
|
||||
},
|
||||
#ifdef CONFIG_TEXT_UNEVICTABLE
|
||||
{
|
||||
.name = "allow_text_unevictable",
|
||||
.read_u64 = mem_cgroup_allow_unevictable_read,
|
||||
.write_u64 = mem_cgroup_allow_unevictable_write,
|
||||
},
|
||||
{
|
||||
.name = "text_unevictable_percent",
|
||||
.read_u64 = mem_cgroup_unevictable_percent_read,
|
||||
.write_u64 = mem_cgroup_unevictable_percent_write,
|
||||
},
|
||||
{
|
||||
.name = "text_unevictable_size",
|
||||
.seq_show = memcg_unevict_size_show,
|
||||
},
|
||||
#endif
|
||||
{
|
||||
.name = "max_usage_in_bytes",
|
||||
.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
|
||||
|
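The cftype entries above expose the text-unevictable controls as per-memcg files on the cgroup v1 memory hierarchy. A hedged usage sketch follows: the memory.* file names are inferred from the legacy memory controller's cftype naming, and /sys/fs/cgroup/memory/app is only an assumed mount point and group, not something defined by this series.

#include <stdio.h>

/* Enable text pinning for one memcg and cap it at 50% of the memcg limit. */
int main(void)
{
	const char *grp = "/sys/fs/cgroup/memory/app"; /* assumed v1 mount + group */
	char path[256];
	FILE *f;

	snprintf(path, sizeof(path), "%s/memory.allow_text_unevictable", grp);
	f = fopen(path, "w");
	if (f) { fputs("1\n", f); fclose(f); }

	snprintf(path, sizeof(path), "%s/memory.text_unevictable_percent", grp);
	f = fopen(path, "w");
	if (f) { fputs("50\n", f); fclose(f); }

	/* memory.text_unevictable_size reports the pinned text in KiB (read-only). */
	return 0;
}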
@ -5529,11 +5648,6 @@ static struct cftype mem_cgroup_legacy_files[] = {
|
|||
.flags = CFTYPE_NOT_ON_ROOT,
|
||||
.seq_show = memory_events_show,
|
||||
},
|
||||
{
|
||||
.name = "reclaim",
|
||||
.flags = CFTYPE_NS_DELEGATABLE,
|
||||
.write = memory_reclaim,
|
||||
},
|
||||
{ }, /* terminate */
|
||||
};
|
||||
|
||||
|
@ -5679,10 +5793,11 @@ static void mem_cgroup_free(struct mem_cgroup *memcg)
|
|||
__mem_cgroup_free(memcg);
|
||||
}
|
||||
|
||||
static struct mem_cgroup *mem_cgroup_alloc(void)
|
||||
static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
|
||||
{
|
||||
struct memcg_vmstats_percpu *statc, *pstatc;
|
||||
struct mem_cgroup *memcg;
|
||||
int node;
|
||||
int node, cpu;
|
||||
int __maybe_unused i;
|
||||
long error = -ENOMEM;
|
||||
|
||||
|
@ -5706,10 +5821,21 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
|
|||
if (!memcg->vmstats_percpu)
|
||||
goto fail;
|
||||
|
||||
for_each_possible_cpu(cpu) {
|
||||
if (parent)
|
||||
pstatc = per_cpu_ptr(parent->vmstats_percpu, cpu);
|
||||
statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
|
||||
statc->parent = parent ? pstatc : NULL;
|
||||
statc->vmstats = memcg->vmstats;
|
||||
}
|
||||
|
||||
for_each_node(node)
|
||||
if (alloc_mem_cgroup_per_node_info(memcg, node))
|
||||
goto fail;
|
||||
|
||||
if (emm_memcg_init(memcg))
|
||||
goto fail;
|
||||
|
||||
if (memcg_wb_domain_init(memcg, GFP_KERNEL))
|
||||
goto fail;
|
||||
|
||||
|
@ -5751,7 +5877,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
|
|||
struct mem_cgroup *memcg, *old_memcg;
|
||||
|
||||
old_memcg = set_active_memcg(parent);
|
||||
memcg = mem_cgroup_alloc();
|
||||
memcg = mem_cgroup_alloc(parent);
|
||||
set_active_memcg(old_memcg);
|
||||
if (IS_ERR(memcg))
|
||||
return ERR_CAST(memcg);
|
||||
|
@ -5760,12 +5886,24 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
|
|||
WRITE_ONCE(memcg->soft_limit, PAGE_COUNTER_MAX);
|
||||
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
|
||||
memcg->zswap_max = PAGE_COUNTER_MAX;
|
||||
#endif
|
||||
#ifdef CONFIG_MEMCG_ZRAM
|
||||
memcg->zram_max = PAGE_COUNTER_MAX;
|
||||
#endif
|
||||
page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
|
||||
#ifdef CONFIG_TEXT_UNEVICTABLE
|
||||
memcg->unevictable_percent = 100;
|
||||
atomic_long_set(&memcg->unevictable_size, 0);
|
||||
#endif
|
||||
if (parent) {
|
||||
#ifdef CONFIG_TEXT_UNEVICTABLE
|
||||
memcg->allow_unevictable = parent->allow_unevictable;
|
||||
#endif
|
||||
WRITE_ONCE(memcg->swappiness, mem_cgroup_swappiness(parent));
|
||||
WRITE_ONCE(memcg->oom_kill_disable, READ_ONCE(parent->oom_kill_disable));
|
||||
|
||||
#ifdef CONFIG_MEMCG_ZRAM
|
||||
memcg->zram_prio = parent->zram_prio;
|
||||
#endif
|
||||
page_counter_init(&memcg->memory, &parent->memory);
|
||||
page_counter_init(&memcg->swap, &parent->swap);
|
||||
page_counter_init(&memcg->kmem, &parent->kmem);
|
||||
|
@ -5872,6 +6010,8 @@ static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
|
|||
|
||||
invalidate_reclaim_iterators(memcg);
|
||||
lru_gen_release_memcg(memcg);
|
||||
|
||||
emm_memcg_exit(memcg);
|
||||
}
|
||||
|
||||
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
|
||||
|
@ -6026,6 +6166,10 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
|
|||
}
|
||||
}
|
||||
}
|
||||
statc->stats_updates = 0;
|
||||
/* We are in a per-cpu loop here, only do the atomic write once */
|
||||
if (atomic64_read(&memcg->vmstats->stats_updates))
|
||||
atomic64_set(&memcg->vmstats->stats_updates, 0);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MMU
|
||||
|
@ -6554,6 +6698,10 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
|
|||
if (!p)
|
||||
return 0;
|
||||
|
||||
#ifdef CONFIG_TEXT_UNEVICTABLE
|
||||
mem_cgroup_can_unevictable(p, memcg);
|
||||
#endif
|
||||
|
||||
/*
|
||||
* We are now committed to this value whatever it is. Changes in this
|
||||
* tunable will only affect upcoming migrations, not the current one.
|
||||
|
@ -6597,6 +6745,9 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
|
|||
|
||||
static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
|
||||
{
|
||||
#ifdef CONFIG_TEXT_UNEVICTABLE
|
||||
mem_cgroup_cancel_unevictable(tset);
|
||||
#endif
|
||||
if (mc.to)
|
||||
mem_cgroup_clear_mc();
|
||||
}
|
||||
|
@ -7027,7 +7178,7 @@ static int memory_numa_stat_show(struct seq_file *m, void *v)
|
|||
int i;
|
||||
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
|
||||
|
||||
mem_cgroup_flush_stats();
|
||||
mem_cgroup_flush_stats(memcg);
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
|
||||
int nid;
|
||||
|
@ -7758,6 +7909,8 @@ static int __init cgroup_memory(char *s)
|
|||
cgroup_memory_nokmem = true;
|
||||
if (!strcmp(token, "nobpf"))
|
||||
cgroup_memory_nobpf = true;
|
||||
if (!strcmp(token, "kmem"))
|
||||
cgroup_memory_nokmem = false;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
@ -8189,7 +8342,11 @@ bool obj_cgroup_may_zswap(struct obj_cgroup *objcg)
|
|||
break;
|
||||
}
|
||||
|
||||
cgroup_rstat_flush(memcg->css.cgroup);
|
||||
/*
|
||||
* mem_cgroup_flush_stats() ignores small changes. Use
|
||||
* do_flush_stats() directly to get accurate stats for charging.
|
||||
*/
|
||||
do_flush_stats(memcg);
|
||||
pages = memcg_page_state(memcg, MEMCG_ZSWAP_B) / PAGE_SIZE;
|
||||
if (pages < max)
|
||||
continue;
|
||||
|
@ -8254,8 +8411,10 @@ void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size)
|
|||
static u64 zswap_current_read(struct cgroup_subsys_state *css,
|
||||
struct cftype *cft)
|
||||
{
|
||||
cgroup_rstat_flush(css->cgroup);
|
||||
return memcg_page_state(mem_cgroup_from_css(css), MEMCG_ZSWAP_B);
|
||||
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
|
||||
|
||||
mem_cgroup_flush_stats(memcg);
|
||||
return memcg_page_state(memcg, MEMCG_ZSWAP_B);
|
||||
}
|
||||
|
||||
static int zswap_max_show(struct seq_file *m, void *v)
|
||||
|
|
|
@ -409,7 +409,7 @@ static void mlock_vma_pages_range(struct vm_area_struct *vma,
|
|||
*
|
||||
* For vmas that pass the filters, merge/split as appropriate.
|
||||
*/
|
||||
static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
|
||||
int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
|
||||
struct vm_area_struct **prev, unsigned long start,
|
||||
unsigned long end, vm_flags_t newflags)
|
||||
{
|
||||
|
@ -420,7 +420,7 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
|
|||
vm_flags_t oldflags = vma->vm_flags;
|
||||
|
||||
if (newflags == oldflags || (oldflags & VM_SPECIAL) ||
|
||||
is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) ||
|
||||
is_vm_hugetlb_page(vma) || vma == get_gate_vma(mm) ||
|
||||
vma_is_dax(vma) || vma_is_secretmem(vma))
|
||||
/* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
|
||||
goto out;
|
||||
|
|
|
@ -55,6 +55,7 @@
|
|||
static int sysctl_panic_on_oom;
|
||||
static int sysctl_oom_kill_allocating_task;
|
||||
static int sysctl_oom_dump_tasks = 1;
|
||||
int sysctl_oom_kill_largest_task;
|
||||
|
||||
/*
|
||||
* Serializes oom killer invocations (out_of_memory()) from all contexts to
|
||||
|
@ -230,11 +231,14 @@ long oom_badness(struct task_struct *p, unsigned long totalpages)
|
|||
points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
|
||||
mm_pgtables_bytes(p->mm) / PAGE_SIZE;
|
||||
task_unlock(p);
|
||||
if (sysctl_oom_kill_largest_task)
|
||||
goto ret;
|
||||
|
||||
/* Normalize to oom_score_adj units */
|
||||
adj *= totalpages / 1000;
|
||||
points += adj;
|
||||
|
||||
ret:
|
||||
return points;
|
||||
}
|
||||
|
||||
|
|
mm/page_io.c
|
@ -201,8 +201,8 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
|
|||
folio_end_writeback(folio);
|
||||
return 0;
|
||||
}
|
||||
__swap_writepage(&folio->page, wbc);
|
||||
return 0;
|
||||
ret = __swap_writepage(&folio->page, wbc);
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL(swap_writepage);
|
||||
|
||||
|
@ -369,11 +369,21 @@ static void swap_writepage_bdev_async(struct page *page,
|
|||
submit_bio(bio);
|
||||
}
|
||||
|
||||
void __swap_writepage(struct page *page, struct writeback_control *wbc)
|
||||
int __swap_writepage(struct page *page, struct writeback_control *wbc)
|
||||
{
|
||||
struct swap_info_struct *sis = page_swap_info(page);
|
||||
|
||||
VM_BUG_ON_PAGE(!PageSwapCache(page), page);
|
||||
|
||||
if (data_race(sis->flags & SWP_SYNCHRONOUS_IO)) {
|
||||
int ret = bdev_swapout_folio(sis->bdev, swap_page_sector(page), page_folio(page), wbc);
|
||||
if (ret != -EOPNOTSUPP) {
|
||||
if (!ret)
|
||||
count_swpout_vm_event(page_folio(page));
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* ->flags can be updated non-atomicially (scan_swap_map_slots),
|
||||
* but that will never affect SWP_FS_OPS, so the data_race
|
||||
|
@ -385,6 +395,8 @@ void __swap_writepage(struct page *page, struct writeback_control *wbc)
|
|||
swap_writepage_bdev_sync(page, wbc, sis);
|
||||
else
|
||||
swap_writepage_bdev_async(page, wbc, sis);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void swap_write_unplug(struct swap_iocb *sio)
|
||||
|
@ -520,11 +532,18 @@ void swap_readpage(struct page *page, bool synchronous, struct swap_iocb **plug)
|
|||
} else if (data_race(sis->flags & SWP_FS_OPS)) {
|
||||
swap_readpage_fs(page, plug);
|
||||
} else if (synchronous || (sis->flags & SWP_SYNCHRONOUS_IO)) {
|
||||
int ret = bdev_swapin_folio(sis->bdev, swap_page_sector(page), folio);
|
||||
if (ret != -EOPNOTSUPP) {
|
||||
if (!ret)
|
||||
count_vm_event(PSWPIN);
|
||||
goto out;
|
||||
}
|
||||
swap_readpage_bdev_sync(page, sis);
|
||||
} else {
|
||||
swap_readpage_bdev_async(page, sis);
|
||||
}
|
||||
|
||||
out:
|
||||
if (workingset) {
|
||||
delayacct_thrashing_end(&in_thrashing);
|
||||
psi_memstall_leave(&pflags);
|
||||
|
|
|
@ -482,7 +482,6 @@ void folio_mark_accessed(struct folio *folio)
|
|||
else
|
||||
__lru_cache_activate_folio(folio);
|
||||
folio_clear_referenced(folio);
|
||||
workingset_activation(folio);
|
||||
}
|
||||
if (folio_test_idle(folio))
|
||||
folio_clear_idle(folio);
|
||||
|
@ -910,6 +909,7 @@ void lru_add_drain_all(void)
|
|||
lru_add_drain();
|
||||
}
|
||||
#endif /* CONFIG_SMP */
|
||||
EXPORT_SYMBOL_GPL(lru_add_drain_all);
|
||||
|
||||
atomic_t lru_disable_count = ATOMIC_INIT(0);
|
||||
|
||||
|
|
|
@ -17,7 +17,7 @@ static inline void swap_read_unplug(struct swap_iocb *plug)
|
|||
}
|
||||
void swap_write_unplug(struct swap_iocb *sio);
|
||||
int swap_writepage(struct page *page, struct writeback_control *wbc);
|
||||
void __swap_writepage(struct page *page, struct writeback_control *wbc);
|
||||
int __swap_writepage(struct page *page, struct writeback_control *wbc);
|
||||
|
||||
/* linux/mm/swap_state.c */
|
||||
/* One swap address space for each 64M swap space */
|
||||
|
|
|
@ -673,11 +673,12 @@ skip:
|
|||
return read_swap_cache_async(entry, gfp_mask, vma, addr, NULL);
|
||||
}
|
||||
|
||||
int init_swap_address_space(unsigned int type, unsigned long nr_pages)
|
||||
int init_swap_address_space(struct swap_info_struct *si, unsigned long nr_pages)
|
||||
{
|
||||
struct address_space *spaces, *space;
|
||||
unsigned int i, nr;
|
||||
unsigned int i, nr, type;
|
||||
|
||||
type = si->type;
|
||||
nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
|
||||
spaces = kvcalloc(nr, sizeof(struct address_space), GFP_KERNEL);
|
||||
if (!spaces)
|
||||
|
@ -689,6 +690,10 @@ int init_swap_address_space(unsigned int type, unsigned long nr_pages)
|
|||
space->a_ops = &swap_aops;
|
||||
/* swap cache doesn't use writeback related tags */
|
||||
mapping_set_no_writeback_tags(space);
|
||||
#ifdef CONFIG_EMM_RAMDISK_SWAP
|
||||
if (si->bdev && bdev_ramdisk(si->bdev))
|
||||
set_bit(AS_RAM_SWAP, &space->flags);
|
||||
#endif
|
||||
}
|
||||
nr_swapper_spaces[type] = nr;
|
||||
swapper_spaces[type] = spaces;
|
||||
|
|
|
@ -2802,6 +2802,11 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
|
|||
int error;
|
||||
|
||||
if (S_ISBLK(inode->i_mode)) {
|
||||
#ifdef CONFIG_ENHANCED_MM
|
||||
WARN(p->swap_file->f_mapping->a_ops->swap_activate,
|
||||
"Swapping on block file over filesystem %s, file system operations may get bypassed unexpectedly and lead to data loss.\n",
|
||||
p->swap_file->f_inode->i_sb->s_id);
|
||||
#endif
|
||||
p->bdev = blkdev_get_by_dev(inode->i_rdev,
|
||||
BLK_OPEN_READ | BLK_OPEN_WRITE, p, NULL);
|
||||
if (IS_ERR(p->bdev)) {
|
||||
|
@ -3199,7 +3204,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
|
|||
}
|
||||
}
|
||||
|
||||
error = init_swap_address_space(p->type, maxpages);
|
||||
error = init_swap_address_space(p, maxpages);
|
||||
if (error)
|
||||
goto bad_swap_unlock_inode;
|
||||
|
||||
|
|
|
@ -0,0 +1,865 @@
|
|||
// SPDX-License-Identifier: GPL-2.0-only
|
||||
/*
|
||||
* Pin Process Code Section:
|
||||
* echo PID > /proc/unevictable/add_pid
|
||||
* echo PID > /proc/unevictable/del_pid
|
||||
* cat /proc/unevictable/add_pid
|
||||
*
|
||||
* Copyright (C) 2019 Alibaba
|
||||
* Author: Xunlei Pang <xlpang@linux.alibaba.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*/
|
||||
#include <linux/types.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/proc_fs.h>
|
||||
#include <linux/sched/mm.h>
|
||||
#include <linux/swap.h>
|
||||
#include <linux/ksm.h>
|
||||
#include <linux/hugetlb.h>
|
||||
#include <linux/rbtree.h>
|
||||
#include <linux/uaccess.h>
|
||||
#include <linux/kprobes.h>
|
||||
#include <linux/workqueue.h>
|
||||
#include <linux/pid_namespace.h>
|
||||
#ifdef CONFIG_TEXT_UNEVICTABLE
|
||||
#include <linux/unevictable.h>
|
||||
#endif
|
||||
|
||||
#define PROC_NAME "unevictable"
|
||||
#define NAME_BUF 8
|
||||
|
||||
#ifdef CONFIG_TEXT_UNEVICTABLE
|
||||
DEFINE_STATIC_KEY_FALSE(unevictable_enabled_key);
|
||||
|
||||
#define for_each_mem_cgroup(iter) \
|
||||
for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
|
||||
iter != NULL; \
|
||||
iter = mem_cgroup_iter(NULL, iter, NULL))
|
||||
#endif
|
||||
|
||||
struct evict_pids_t {
|
||||
struct rb_root root;
|
||||
};
|
||||
|
||||
struct evict_pid_entry {
|
||||
struct rb_node node;
|
||||
struct list_head list;
|
||||
pid_t rootpid;
|
||||
u64 start_time;
|
||||
#ifdef CONFIG_TEXT_UNEVICTABLE
|
||||
u64 unevict_size;
|
||||
#endif
|
||||
struct task_struct *tsk;
|
||||
bool done;
|
||||
};
|
||||
|
||||
static void execute_vm_lock(struct work_struct *unused);
|
||||
static struct evict_pids_t *base_tree;
|
||||
static DEFINE_MUTEX(pid_mutex);
|
||||
|
||||
LIST_HEAD(pid_list);
|
||||
static int proc_pids_count;
|
||||
|
||||
static DECLARE_DELAYED_WORK(evict_work, execute_vm_lock);
|
||||
|
||||
struct proc_pids_t {
|
||||
struct rb_root proc_pids_tree;
|
||||
};
|
||||
|
||||
/* Called with pid_mutex held always */
|
||||
static void __remove_entry(struct evict_pid_entry *pid)
|
||||
{
|
||||
if (pid == NULL)
|
||||
return;
|
||||
|
||||
rb_erase(&pid->node, &base_tree->root);
|
||||
proc_pids_count--;
|
||||
}
|
||||
|
||||
/* should not be in atomic context(i.e. hrtimer) */
|
||||
static void __evict_pid(struct evict_pid_entry *pid)
|
||||
{
|
||||
struct task_struct *tsk;
|
||||
struct mm_struct *mm;
|
||||
|
||||
if (!pid)
|
||||
return;
|
||||
|
||||
rcu_read_lock();
|
||||
tsk = find_task_by_pid_ns(pid->rootpid, &init_pid_ns);
|
||||
if (tsk)
|
||||
get_task_struct(tsk);
|
||||
rcu_read_unlock();
|
||||
|
||||
if (!tsk)
|
||||
return;
|
||||
|
||||
if (tsk == pid->tsk && pid->start_time == tsk->start_boottime) {
|
||||
mm = get_task_mm(tsk);
|
||||
if (mm) {
|
||||
if (!(mm->def_flags & VM_LOCKED)) {
|
||||
struct vm_area_struct *vma, *prev = NULL;
|
||||
vm_flags_t flag;
|
||||
#ifdef CONFIG_TEXT_UNEVICTABLE
|
||||
unsigned long size = 0;
|
||||
struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
|
||||
#endif
|
||||
|
||||
VMA_ITERATOR(vmi, mm, 0);
|
||||
mmap_write_lock(mm);
|
||||
|
||||
for_each_vma(vmi, vma) {
|
||||
if (vma->vm_file &&
|
||||
(vma->vm_flags & VM_EXEC) &&
|
||||
(vma->vm_flags & VM_READ)) {
|
||||
flag = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
|
||||
mlock_fixup(&vmi, vma, &prev,
|
||||
vma->vm_start, vma->vm_end, flag);
|
||||
#ifdef CONFIG_TEXT_UNEVICTABLE
|
||||
size += vma->vm_end - vma->vm_start;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
mmap_write_unlock(mm);
|
||||
#ifdef CONFIG_TEXT_UNEVICTABLE
|
||||
memcg_decrease_unevict_size(memcg, size);
|
||||
css_put(&memcg->css);
|
||||
pid->unevict_size -= size;
|
||||
#endif
|
||||
}
|
||||
mmput(mm);
|
||||
}
|
||||
}
|
||||
put_task_struct(tsk);
|
||||
}
|
||||
|
||||
static struct evict_pid_entry *lookup_unevict_entry(struct task_struct *tsk)
|
||||
{
|
||||
struct evict_pid_entry *entry, *result;
|
||||
struct rb_node *parent = NULL;
|
||||
struct rb_node **link;
|
||||
pid_t rootpid;
|
||||
|
||||
if (!tsk)
|
||||
return NULL;
|
||||
|
||||
rcu_read_lock();
|
||||
get_task_struct(tsk);
|
||||
rootpid = __task_pid_nr_ns(tsk, PIDTYPE_PID, &init_pid_ns);
|
||||
put_task_struct(tsk);
|
||||
rcu_read_unlock();
|
||||
|
||||
result = NULL;
|
||||
link = &base_tree->root.rb_node;
|
||||
/* maybe the unevictable feature is not ready yet */
|
||||
while (*link) {
|
||||
parent = *link;
|
||||
entry = rb_entry(parent, struct evict_pid_entry, node);
|
||||
if (rootpid < entry->rootpid)
|
||||
link = &(*link)->rb_left;
|
||||
else if (rootpid > entry->rootpid)
|
||||
link = &(*link)->rb_right;
|
||||
else {
|
||||
result = entry;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
void del_unevict_task(struct task_struct *tsk)
|
||||
{
|
||||
struct evict_pid_entry *result;
|
||||
|
||||
if (!tsk) {
|
||||
struct evict_pid_entry *pid_entry, *tmp;
|
||||
|
||||
mutex_lock(&pid_mutex);
|
||||
list_for_each_entry_safe(pid_entry, tmp, &pid_list, list) {
|
||||
rcu_read_lock();
|
||||
tsk = find_task_by_pid_ns(pid_entry->rootpid,
|
||||
&init_pid_ns);
|
||||
rcu_read_unlock();
|
||||
if (!tsk) {
|
||||
list_del(&pid_entry->list);
|
||||
__remove_entry(pid_entry);
|
||||
kfree(pid_entry);
|
||||
}
|
||||
}
|
||||
mutex_unlock(&pid_mutex);
|
||||
return;
|
||||
}
|
||||
|
||||
mutex_lock(&pid_mutex);
|
||||
result = lookup_unevict_entry(tsk);
|
||||
if (result) {
|
||||
list_del(&result->list);
|
||||
__remove_entry(result);
|
||||
mutex_unlock(&pid_mutex);
|
||||
__evict_pid(result);
|
||||
kfree(result);
|
||||
} else
|
||||
mutex_unlock(&pid_mutex);
|
||||
}
|
||||
|
||||
static void evict_pid(pid_t pid)
|
||||
{
|
||||
struct task_struct *tsk;
|
||||
|
||||
if (pid <= 0)
|
||||
return;
|
||||
|
||||
rcu_read_lock();
|
||||
tsk = find_task_by_pid_ns(pid, task_active_pid_ns(current));
|
||||
if (!tsk) {
|
||||
rcu_read_unlock();
|
||||
return;
|
||||
}
|
||||
get_task_struct(tsk);
|
||||
rcu_read_unlock();
|
||||
|
||||
del_unevict_task(tsk);
|
||||
put_task_struct(tsk);
|
||||
}
|
||||
|
||||
static void add_unevict_task(struct task_struct *tsk)
|
||||
{
|
||||
struct evict_pid_entry *entry, *new_entry, *result;
|
||||
struct rb_node *parent = NULL;
|
||||
struct rb_node **link;
|
||||
pid_t rootpid;
|
||||
|
||||
if (!tsk)
|
||||
return;
|
||||
|
||||
new_entry = kzalloc(sizeof(*new_entry), GFP_NOWAIT);
|
||||
if (!new_entry)
|
||||
return;
|
||||
|
||||
result = NULL;
|
||||
get_task_struct(tsk);
|
||||
rootpid = __task_pid_nr_ns(tsk, PIDTYPE_PID, &init_pid_ns);
|
||||
put_task_struct(tsk);
|
||||
mutex_lock(&pid_mutex);
|
||||
link = &base_tree->root.rb_node;
|
||||
while (*link) {
|
||||
parent = *link;
|
||||
entry = rb_entry(parent, struct evict_pid_entry, node);
|
||||
if (rootpid < entry->rootpid) {
|
||||
link = &(*link)->rb_left;
|
||||
} else if (rootpid > entry->rootpid) {
|
||||
link = &(*link)->rb_right;
|
||||
} else {
|
||||
result = entry;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!result) {
|
||||
result = new_entry;
|
||||
result->rootpid = rootpid;
|
||||
#ifdef CONFIG_TEXT_UNEVICTABLE
|
||||
result->unevict_size = 0;
|
||||
#endif
|
||||
rb_link_node(&result->node, parent, link);
|
||||
rb_insert_color(&result->node, &base_tree->root);
|
||||
list_add_tail(&result->list, &pid_list);
|
||||
proc_pids_count++;
|
||||
mutex_unlock(&pid_mutex);
|
||||
} else {
|
||||
rcu_read_lock();
|
||||
tsk = find_task_by_pid_ns(rootpid, &init_pid_ns);
|
||||
if (tsk)
|
||||
get_task_struct(tsk);
|
||||
rcu_read_unlock();
|
||||
if (!tsk) {
|
||||
list_del(&result->list);
|
||||
__remove_entry(result);
|
||||
mutex_unlock(&pid_mutex);
|
||||
kfree(result);
|
||||
kfree(new_entry);
|
||||
return;
|
||||
} else if (tsk != result->tsk ||
|
||||
result->start_time != tsk->start_boottime) {
|
||||
result->done = false;
|
||||
}
|
||||
put_task_struct(tsk);
|
||||
mutex_unlock(&pid_mutex);
|
||||
kfree(new_entry);
|
||||
}
|
||||
}
|
||||
|
||||
static void unevict_pid(pid_t pid)
|
||||
{
|
||||
struct task_struct *tsk;
|
||||
|
||||
if (pid <= 0)
|
||||
return;
|
||||
|
||||
rcu_read_lock();
|
||||
tsk = find_task_by_pid_ns(pid, task_active_pid_ns(current));
|
||||
if (!tsk) {
|
||||
rcu_read_unlock();
|
||||
return;
|
||||
}
|
||||
get_task_struct(tsk);
|
||||
rcu_read_unlock();
|
||||
|
||||
#ifdef CONFIG_TEXT_UNEVICTABLE
|
||||
if (is_memcg_unevictable_enabled(mem_cgroup_from_task(tsk))) {
|
||||
put_task_struct(tsk);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
add_unevict_task(tsk);
|
||||
put_task_struct(tsk);
|
||||
}
|
||||
|
||||
struct add_pid_seq_context {
|
||||
int idx;
|
||||
int count;
|
||||
int pids[];
|
||||
};
|
||||
|
||||
/*
|
||||
 * Note there is a race: we may get inconsistent snapshots of the pid array
 * if add_pid_start() is called more than once while users add or delete
 * pids. This is acceptable, since the pids can change anyway even after we
 * return a perfectly consistent snapshot.
 */
|
||||
static void *add_pid_start(struct seq_file *m, loff_t *pos)
|
||||
{
|
||||
struct add_pid_seq_context *ctx = NULL;
|
||||
struct evict_pid_entry *pid_entry;
|
||||
struct task_struct *tsk;
|
||||
struct evict_pid_entry *tmp;
|
||||
pid_t pid;
|
||||
|
||||
mutex_lock(&pid_mutex);
|
||||
if (*pos >= proc_pids_count)
|
||||
goto done;
|
||||
ctx = kvzalloc(sizeof(*ctx) + proc_pids_count * sizeof(int), GFP_KERNEL);
|
||||
if (unlikely(!ctx))
|
||||
goto done;
|
||||
|
||||
if (proc_pids_count > 0) {
|
||||
list_for_each_entry_safe(pid_entry, tmp, &pid_list, list) {
|
||||
rcu_read_lock();
|
||||
tsk = find_task_by_pid_ns(pid_entry->rootpid,
|
||||
&init_pid_ns);
|
||||
if (tsk) {
|
||||
get_task_struct(tsk);
|
||||
pid = __task_pid_nr_ns(tsk, PIDTYPE_PID,
|
||||
task_active_pid_ns(current));
|
||||
put_task_struct(tsk);
|
||||
} else {
|
||||
pid = -1;
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
if (pid != -1) {
|
||||
ctx->pids[ctx->count++] = pid;
|
||||
} else {
|
||||
list_del(&pid_entry->list);
|
||||
__remove_entry(pid_entry);
|
||||
kfree(pid_entry);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (*pos >= ctx->count)
|
||||
goto done;
|
||||
mutex_unlock(&pid_mutex);
|
||||
ctx->idx = *pos;
|
||||
m->private = ctx;
|
||||
return ctx;
|
||||
done:
|
||||
mutex_unlock(&pid_mutex);
|
||||
kvfree(ctx);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void *add_pid_next(struct seq_file *m, void *p, loff_t *pos)
|
||||
{
|
||||
struct add_pid_seq_context *ctx = p;
|
||||
|
||||
ctx->idx = ++*pos;
|
||||
return (ctx->idx < ctx->count) ? ctx : NULL;
|
||||
}
|
||||
|
||||
static void add_pid_stop(struct seq_file *m, void *p)
|
||||
{
|
||||
kvfree(m->private);
|
||||
m->private = NULL;
|
||||
}
|
||||
|
||||
static int add_pid_show(struct seq_file *m, void *p)
|
||||
{
|
||||
struct add_pid_seq_context *ctx = p;
|
||||
|
||||
seq_printf(m, "%d", ctx->pids[ctx->idx]);
|
||||
seq_putc(m, (ctx->idx == ctx->count - 1) ? '\n' : ',');
|
||||
return 0;
|
||||
}
|
||||
|
||||
static const struct seq_operations seq_add_pid_op = {
|
||||
.start = add_pid_start,
|
||||
.next = add_pid_next,
|
||||
.stop = add_pid_stop,
|
||||
.show = add_pid_show,
|
||||
};
|
||||
|
||||
static int proc_open_add_pid(struct inode *inode, struct file *file)
|
||||
{
|
||||
return seq_open(file, &seq_add_pid_op);
|
||||
}
|
||||
|
||||
static void execute_vm_lock(struct work_struct *unused)
|
||||
{
|
||||
struct task_struct *tsk;
|
||||
struct mm_struct *mm;
|
||||
struct evict_pid_entry *result, *tmp;
|
||||
pid_t rootpid;
|
||||
|
||||
if (!mutex_trylock(&pid_mutex)) {
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (proc_pids_count <= 0) {
|
||||
mutex_unlock(&pid_mutex);
|
||||
goto out;
|
||||
}
|
||||
|
||||
list_for_each_entry_safe(result, tmp, &pid_list, list) {
|
||||
rootpid = result->rootpid;
|
||||
if (result->done || rootpid <= 0)
|
||||
continue;
|
||||
|
||||
rcu_read_lock();
|
||||
tsk = find_task_by_pid_ns(rootpid, &init_pid_ns);
|
||||
if (tsk)
|
||||
get_task_struct(tsk);
|
||||
rcu_read_unlock();
|
||||
if (!tsk) {
|
||||
list_del(&result->list);
|
||||
__remove_entry(result);
|
||||
kfree(result);
|
||||
continue;
|
||||
}
|
||||
|
||||
mm = get_task_mm(tsk);
|
||||
if (mm && !(mm->def_flags & VM_LOCKED)) {
|
||||
#ifdef CONFIG_TEXT_UNEVICTABLE
|
||||
struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
|
||||
#endif
|
||||
struct vm_area_struct *vma, *prev = NULL;
|
||||
vm_flags_t flag;
|
||||
|
||||
VMA_ITERATOR(vmi, mm, 0);
|
||||
mmap_write_lock(mm);
|
||||
|
||||
for_each_vma(vmi, vma) {
|
||||
#ifdef CONFIG_TEXT_UNEVICTABLE
|
||||
if (is_unevictable_size_overflow(memcg))
|
||||
break;
|
||||
#endif
|
||||
if (vma->vm_file &&
|
||||
(vma->vm_flags & VM_EXEC) &&
|
||||
(vma->vm_flags & VM_READ)) {
|
||||
flag = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
|
||||
flag |= (VM_LOCKED | VM_LOCKONFAULT);
|
||||
mlock_fixup(&vmi, vma, &prev,
|
||||
vma->vm_start, vma->vm_end, flag);
|
||||
#ifdef CONFIG_TEXT_UNEVICTABLE
|
||||
result->unevict_size += vma->vm_end - vma->vm_start;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
result->tsk = tsk;
|
||||
result->start_time = tsk->start_boottime;
|
||||
result->done = true;
|
||||
mmap_write_unlock(mm);
|
||||
#ifdef CONFIG_TEXT_UNEVICTABLE
|
||||
memcg_increase_unevict_size(memcg,
|
||||
result->unevict_size);
|
||||
css_put(&memcg->css);
|
||||
#endif
|
||||
} else {
|
||||
list_del(&result->list);
|
||||
__remove_entry(result);
|
||||
kfree(result);
|
||||
}
|
||||
|
||||
if (mm)
|
||||
mmput(mm);
|
||||
if (tsk)
|
||||
put_task_struct(tsk);
|
||||
}
|
||||
mutex_unlock(&pid_mutex);
|
||||
|
||||
out:
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
static ssize_t proc_write_add_pid(struct file *file,
|
||||
const char __user *buffer, size_t count, loff_t *ppos)
|
||||
{
|
||||
char buf[NAME_BUF];
|
||||
int err;
|
||||
long pid;
|
||||
int ret = count;
|
||||
|
||||
if (count > NAME_BUF - 1) {
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
memset(buf, 0, sizeof(buf));
|
||||
if (copy_from_user(buf, buffer, count)) {
|
||||
ret = -EFAULT;
|
||||
goto out;
|
||||
}
|
||||
|
||||
err = kstrtol(strstrip(buf), 0, &pid);
|
||||
if (err || pid <= 0) {
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
} else {
|
||||
unevict_pid((pid_t)pid);
|
||||
schedule_delayed_work(&evict_work, HZ);
|
||||
}
|
||||
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static ssize_t proc_write_del_pid(struct file *file,
|
||||
const char __user *buffer, size_t count, loff_t *ppos)
|
||||
{
|
||||
char buf[NAME_BUF];
|
||||
int err;
|
||||
long pid;
|
||||
int ret = count;
|
||||
|
||||
memset(buf, 0, sizeof(buf));
|
||||
if (count > NAME_BUF - 1) {
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (copy_from_user(buf, buffer, count)) {
|
||||
ret = -EFAULT;
|
||||
goto out;
|
||||
}
|
||||
|
||||
err = kstrtol(strstrip(buf), 0, &pid);
|
||||
if (err || pid <= 0) {
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
} else {
|
||||
evict_pid(pid);
|
||||
}
|
||||
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static const struct proc_ops add_proc_fops = {
|
||||
.proc_open = proc_open_add_pid,
|
||||
.proc_read = seq_read,
|
||||
.proc_write = proc_write_add_pid,
|
||||
.proc_lseek = seq_lseek,
|
||||
.proc_release = seq_release,
|
||||
};
|
||||
|
||||
static const struct proc_ops del_proc_fops = {
|
||||
.proc_write = proc_write_del_pid,
|
||||
};
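
For reference, a minimal userspace sketch of driving the two proc files registered later in unevictable_init(). The directory component comes from PROC_NAME, which is defined elsewhere in this file, so the paths below are placeholders; only the "write one decimal PID" protocol is taken from the handlers above.

/* Illustration-only userspace sketch; <PROC_NAME> is a placeholder for the
 * proc directory name defined elsewhere in this file. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

static int write_pid(const char *path, pid_t pid)
{
	char buf[32];
	int fd, len, ret = 0;

	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;

	len = snprintf(buf, sizeof(buf), "%d\n", pid);
	if (write(fd, buf, len) != len)
		ret = -1;

	close(fd);
	return ret;
}

int main(void)
{
	pid_t self = getpid();

	/* Mark this process's text as unevictable, then undo it. */
	if (write_pid("/proc/<PROC_NAME>/add_pid", self))
		perror("add_pid");
	if (write_pid("/proc/<PROC_NAME>/del_pid", self))
		perror("del_pid");
	return 0;
}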
|
||||
|
||||
#ifdef CONFIG_TEXT_UNEVICTABLE
|
||||
void clean_task_unevict_size(struct task_struct *tsk)
|
||||
{
|
||||
struct evict_pid_entry *result;
|
||||
struct mem_cgroup *memcg;
|
||||
|
||||
	/*
	 * We must make sure the unevictable work
	 * function has finished.
	 */
	if (!tsk || !base_tree)
		return;
|
||||
|
||||
mutex_lock(&pid_mutex);
|
||||
result = lookup_unevict_entry(tsk);
|
||||
if (result) {
|
||||
if (result->unevict_size) {
|
||||
rcu_read_lock();
|
||||
memcg = mem_cgroup_from_task(tsk);
|
||||
memcg_decrease_unevict_size(memcg, result->unevict_size);
|
||||
rcu_read_unlock();
|
||||
}
|
||||
list_del(&result->list);
|
||||
__remove_entry(result);
|
||||
mutex_unlock(&pid_mutex);
|
||||
kfree(result);
|
||||
} else
|
||||
mutex_unlock(&pid_mutex);
|
||||
}
|
||||
|
||||
bool is_memcg_unevictable_enabled(struct mem_cgroup *memcg)
|
||||
{
|
||||
if (!unevictable_enabled())
|
||||
return false;
|
||||
|
||||
if (!memcg)
|
||||
return false;
|
||||
|
||||
if (memcg->allow_unevictable)
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
void memcg_increase_unevict_size(struct mem_cgroup *memcg, unsigned long size)
|
||||
{
|
||||
atomic_long_add(size, &memcg->unevictable_size);
|
||||
}
|
||||
|
||||
void memcg_decrease_unevict_size(struct mem_cgroup *memcg, unsigned long size)
|
||||
{
|
||||
atomic_long_sub(size, &memcg->unevictable_size);
|
||||
}
|
||||
|
||||
bool is_unevictable_size_overflow(struct mem_cgroup *memcg)
|
||||
{
|
||||
struct page_counter *counter;
|
||||
u64 res_limit;
|
||||
u64 size;
|
||||
|
||||
counter = &memcg->memory;
|
||||
res_limit = (u64)counter->max * PAGE_SIZE;
|
||||
size = atomic_long_read(&memcg->unevictable_size);
|
||||
size = size * 100 / res_limit;
|
||||
if (size >= memcg->unevictable_percent)
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
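
A quick worked example of the check above, as a standalone sketch with made-up numbers: with a 1 GiB memcg limit, roughly 600 MiB of already-pinned text is about 58% of the limit, so a 50% unevictable_percent budget trips the check while a 70% budget does not.

/* Standalone sketch of the percentage check in is_unevictable_size_overflow();
 * all values are made up for illustration. */
#include <stdbool.h>
#include <stdio.h>

static bool unevict_overflow(unsigned long long limit_bytes,
			     unsigned long long unevict_bytes,
			     unsigned int percent)
{
	/* Same arithmetic as the kernel helper: integer percentage of the limit. */
	return unevict_bytes * 100 / limit_bytes >= percent;
}

int main(void)
{
	unsigned long long limit = 1ULL << 30;		/* 1 GiB memcg limit */
	unsigned long long pinned = 600ULL << 20;	/* ~600 MiB locked text */

	printf("overflow at 50%%: %d\n", unevict_overflow(limit, pinned, 50));	/* 1 */
	printf("overflow at 70%%: %d\n", unevict_overflow(limit, pinned, 70));	/* 0 */
	return 0;
}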
|
||||
|
||||
unsigned long memcg_exstat_text_unevict_gather(struct mem_cgroup *memcg)
|
||||
{
|
||||
return atomic_long_read(&memcg->unevictable_size);
|
||||
}
|
||||
|
||||
void mem_cgroup_can_unevictable(struct task_struct *tsk, struct mem_cgroup *to)
|
||||
{
|
||||
struct mem_cgroup *from;
|
||||
|
||||
if (!unevictable_enabled())
|
||||
return;
|
||||
|
||||
from = mem_cgroup_from_task(tsk);
|
||||
VM_BUG_ON(from == to);
|
||||
|
||||
if (to->allow_unevictable && !from->allow_unevictable) {
|
||||
add_unevict_task(tsk);
|
||||
schedule_delayed_work(&evict_work, HZ);
|
||||
}
|
||||
|
||||
if (!to->allow_unevictable && from->allow_unevictable)
|
||||
del_unevict_task(tsk);
|
||||
}
|
||||
|
||||
void mem_cgroup_cancel_unevictable(struct cgroup_taskset *tset)
|
||||
{
|
||||
struct task_struct *tsk;
|
||||
struct cgroup_subsys_state *dst_css;
|
||||
struct mem_cgroup *memcg;
|
||||
|
||||
if (!unevictable_enabled())
|
||||
return;
|
||||
|
||||
cgroup_taskset_for_each(tsk, dst_css, tset) {
|
||||
memcg = mem_cgroup_from_task(tsk);
|
||||
|
||||
if (memcg->allow_unevictable)
|
||||
del_unevict_task(tsk);
|
||||
}
|
||||
}
|
||||
|
||||
static inline int schedule_unevict_task(struct task_struct *tsk, void *arg)
|
||||
{
|
||||
add_unevict_task(tsk);
|
||||
schedule_delayed_work(&evict_work, HZ);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline int schedule_evict_task(struct task_struct *tsk, void *arg)
|
||||
{
|
||||
del_unevict_task(tsk);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline void make_all_memcg_evictable(void)
|
||||
{
|
||||
struct mem_cgroup *memcg;
|
||||
|
||||
for_each_mem_cgroup(memcg) {
|
||||
if (!memcg->allow_unevictable)
|
||||
continue;
|
||||
mem_cgroup_scan_tasks(memcg, schedule_unevict_task, NULL);
|
||||
memcg->allow_unevictable = 0;
|
||||
memcg->unevictable_percent = 100;
|
||||
atomic_long_set(&memcg->unevictable_size, 0);
|
||||
}
|
||||
}
|
||||
|
||||
void memcg_all_processes_unevict(struct mem_cgroup *memcg, bool enable)
|
||||
{
|
||||
struct mem_cgroup *tmp_memcg;
|
||||
|
||||
if (!unevictable_enabled())
|
||||
return;
|
||||
|
||||
if (!memcg)
|
||||
tmp_memcg = root_mem_cgroup;
|
||||
else
|
||||
tmp_memcg = memcg;
|
||||
|
||||
if (enable)
|
||||
mem_cgroup_scan_tasks(tmp_memcg, schedule_unevict_task, NULL);
|
||||
else
|
||||
mem_cgroup_scan_tasks(tmp_memcg, schedule_evict_task, NULL);
|
||||
}
|
||||
|
||||
static int __init setup_unevictable(char *s)
|
||||
{
|
||||
if (!strcmp(s, "1"))
|
||||
static_branch_enable(&unevictable_enabled_key);
|
||||
else if (!strcmp(s, "0"))
|
||||
static_branch_disable(&unevictable_enabled_key);
|
||||
return 1;
|
||||
}
|
||||
__setup("unevictable=", setup_unevictable);
|
||||
|
||||
#ifdef CONFIG_SYSFS
|
||||
static ssize_t unevictable_enabled_show(struct kobject *kobj,
|
||||
struct kobj_attribute *attr, char *buf)
|
||||
{
|
||||
return sprintf(buf, "%d\n", !!static_branch_unlikely(&unevictable_enabled_key));
|
||||
}
|
||||
static ssize_t unevictable_enabled_store(struct kobject *kobj,
|
||||
struct kobj_attribute *attr,
|
||||
const char *buf, size_t count)
|
||||
{
|
||||
static DEFINE_MUTEX(mutex);
|
||||
ssize_t ret = count;
|
||||
|
||||
mutex_lock(&mutex);
|
||||
|
||||
if (!strncmp(buf, "1", 1))
|
||||
static_branch_enable(&unevictable_enabled_key);
|
||||
else if (!strncmp(buf, "0", 1)) {
|
||||
static_branch_disable(&unevictable_enabled_key);
|
||||
make_all_memcg_evictable();
|
||||
} else
|
||||
ret = -EINVAL;
|
||||
|
||||
mutex_unlock(&mutex);
|
||||
return ret;
|
||||
}
|
||||
static struct kobj_attribute unevictable_enabled_attr =
|
||||
__ATTR(enabled, 0644, unevictable_enabled_show,
|
||||
unevictable_enabled_store);
|
||||
|
||||
static struct attribute *unevictable_attrs[] = {
|
||||
&unevictable_enabled_attr.attr,
|
||||
NULL,
|
||||
};
|
||||
|
||||
static struct attribute_group unevictable_attr_group = {
|
||||
.attrs = unevictable_attrs,
|
||||
};
|
||||
|
||||
static int __init unevictable_init_sysfs(void)
|
||||
{
|
||||
int err;
|
||||
struct kobject *unevictable_kobj;
|
||||
|
||||
unevictable_kobj = kobject_create_and_add("unevictable", mm_kobj);
|
||||
if (!unevictable_kobj) {
|
||||
pr_err("failed to create unevictable kobject\n");
|
||||
return -ENOMEM;
|
||||
}
|
||||
err = sysfs_create_group(unevictable_kobj, &unevictable_attr_group);
|
||||
if (err) {
|
||||
pr_err("failed to register unevictable group\n");
|
||||
goto delete_obj;
|
||||
}
|
||||
return 0;
|
||||
|
||||
delete_obj:
|
||||
kobject_put(unevictable_kobj);
|
||||
return err;
|
||||
}
|
||||
#endif /* CONFIG_SYSFS */
|
||||
#endif /* CONFIG_TEXT_UNEVICTABLE */
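
Assuming mm_kobj corresponds to the usual /sys/kernel/mm directory, the runtime switch registered by unevictable_init_sysfs() should show up as /sys/kernel/mm/unevictable/enabled (the path is inferred from the kobject name, not stated by this diff). A minimal sketch of toggling it from userspace:

/* Illustration-only sketch: toggle the runtime knob registered above. The path
 * is inferred from kobject_create_and_add("unevictable", mm_kobj). */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	const char *val = (argc > 1) ? argv[1] : "1";	/* "1" enable, "0" disable */
	int fd = open("/sys/kernel/mm/unevictable/enabled", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, val, 1) != 1)
		perror("write");
	close(fd);
	return 0;
}

The same switch can also be set at boot via the unevictable=0/1 command line parameter handled by setup_unevictable() above.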
|
||||
|
||||
static int __init unevictable_init(void)
|
||||
{
|
||||
struct proc_dir_entry *monitor_dir, *add_pid_file, *del_pid_file;
|
||||
|
||||
monitor_dir = proc_mkdir(PROC_NAME, NULL);
|
||||
if (!monitor_dir)
|
||||
goto out;
|
||||
|
||||
add_pid_file = proc_create("add_pid", 0600,
|
||||
monitor_dir, &add_proc_fops);
|
||||
if (!add_pid_file)
|
||||
goto out_dir;
|
||||
|
||||
del_pid_file = proc_create("del_pid", 0200,
|
||||
monitor_dir, &del_proc_fops);
|
||||
if (!del_pid_file)
|
||||
goto out_add_pid;
|
||||
|
||||
base_tree = kzalloc(sizeof(*base_tree), GFP_KERNEL);
|
||||
if (!base_tree)
|
||||
goto out_del_pid;
|
||||
|
||||
INIT_LIST_HEAD(&pid_list);
|
||||
|
||||
#if defined(CONFIG_SYSFS) && defined(CONFIG_TEXT_UNEVICTABLE)
|
||||
if (unevictable_init_sysfs())
|
||||
pr_err("memcg text unevictable sysfs create failed\n");
|
||||
#endif
|
||||
return 0;
|
||||
|
||||
pr_err("unevictpid create proc dir failed\n");
|
||||
|
||||
out_del_pid:
|
||||
remove_proc_entry("del_pid", monitor_dir);
|
||||
out_add_pid:
|
||||
remove_proc_entry("add_pid", monitor_dir);
|
||||
out_dir:
|
||||
remove_proc_entry(PROC_NAME, NULL);
|
||||
out:
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
module_init(unevictable_init);
|
mm/vmscan.c (337 lines changed)
|
@ -57,6 +57,7 @@
|
|||
#include <linux/khugepaged.h>
|
||||
#include <linux/rculist_nulls.h>
|
||||
#include <linux/random.h>
|
||||
#include <linux/emm.h>
|
||||
|
||||
#include <asm/tlbflush.h>
|
||||
#include <asm/div64.h>
|
||||
|
@ -167,6 +168,25 @@ struct scan_control {
|
|||
|
||||
/* for recording the reclaimed slab by now */
|
||||
struct reclaim_state reclaim_state;
|
||||
|
||||
#ifdef CONFIG_EMM_RECLAIM
	union {
		struct {
			/* Like setting may_writepage, may_swap and may_unmap all to zero, but also forbids dropping clean cache */
			unsigned int emm_aging:1;
			/* Do reclaim without aging any LRU. NOTE: MM may still promote pages during reclaim if a page is found active */
			unsigned int emm_reclaiming:1;
			/* Do both reclaim and aging, just like normal reclaim, but the other EMM data are also in effect. */
			unsigned int emm_mix:1;
		};
		/* If emm_aging, emm_reclaiming or emm_mix is set, EMM is being used for this reclaim */
		unsigned int emm_running:3;
	};
	/* One-time swappiness override. NOTE: this can be extended to 201 for anon-only reclaim */
	u8 emm_swappiness;
	/* Number of pages shrunk/aged/scanned; can be used by different EMM reclaim phases */
	unsigned long emm_nr_taken;
#endif
|
||||
};
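
As an aside on the union above: setting any of the three mode bits makes the combined emm_running field non-zero, which is how the rest of the file asks "is EMM driving this reclaim at all". A minimal standalone sketch of that trick, assuming the usual little-endian bitfield layout; this is illustration-only userspace code, not part of the patch:

/* Standalone sketch of the flag/union trick above: any individual mode bit
 * makes emm_running non-zero, mirroring how sc->emm_running is tested. */
#include <stdio.h>

struct emm_flags {
	union {
		struct {
			unsigned int emm_aging:1;
			unsigned int emm_reclaiming:1;
			unsigned int emm_mix:1;
		};
		unsigned int emm_running:3;
	};
};

int main(void)
{
	struct emm_flags f = { 0 };

	f.emm_reclaiming = 1;
	/* Prints a non-zero value (2 on typical little-endian bitfield layouts). */
	printf("emm_running = %u\n", f.emm_running);
	return 0;
}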
|
||||
|
||||
#ifdef ARCH_HAS_PREFETCHW
|
||||
|
@ -187,6 +207,20 @@ struct scan_control {
|
|||
* From 0 .. 200. Higher means more swappy.
|
||||
*/
|
||||
int vm_swappiness = 60;
|
||||
EXPORT_SYMBOL(vm_swappiness);
|
||||
|
||||
#ifdef CONFIG_EMM_FORCE_SWAPPINESS
|
||||
unsigned int sysctl_vm_force_swappiness __read_mostly;
|
||||
#else
|
||||
#define sysctl_vm_force_swappiness 0
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_EMM_RAMDISK_SWAP
|
||||
unsigned int sysctl_vm_ramdisk_swaptune __read_mostly;
|
||||
unsigned int sysctl_vm_swapcache_fastfree __read_mostly;
|
||||
#else
|
||||
#define sysctl_vm_swapcache_fastfree 0
|
||||
#endif
|
||||
|
||||
LIST_HEAD(shrinker_list);
|
||||
DECLARE_RWSEM(shrinker_rwsem);
|
||||
|
@ -1353,8 +1387,35 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping,
|
|||
|
||||
folio_set_reclaim(folio);
|
||||
res = mapping->a_ops->writepage(&folio->page, &wbc);
|
||||
if (res < 0)
|
||||
		if (res < 0) {
#ifdef CONFIG_EMM_RAMDISK_SWAP
			if (mapping_ram_swap(mapping) &&
			    sysctl_vm_ramdisk_swaptune) {
				/*
				 * Return the page as activated so that other
				 * pages can be tried when the ramdisk limit
				 * is hit (e.g. ZRAM may then be able to catch
				 * a few zero pages and create more space);
				 * also don't leave an error mark.
				 *
				 * try_to_free_swap will still set PageDirty,
				 * but nothing clears PageReclaim if we return
				 * here, so just in case, tidy up the page flags.
				 *
				 * TODO: We may also implement a secondary
				 * fallback swap layer later.
				 */
				if (res == -ENOMEM) {
					folio_set_dirty(folio);
					folio_clear_reclaim(folio);

					return PAGE_ACTIVATE;
				}
			}
#endif
			handle_write_error(mapping, folio, res);
		}
|
||||
folio_clear_reclaim(folio);
|
||||
return PAGE_ACTIVATE;
|
||||
|
@ -2132,8 +2193,8 @@ activate_locked_split:
|
|||
}
|
||||
activate_locked:
|
||||
/* Not a candidate for swapping, so reclaim swap space. */
|
||||
if (folio_test_swapcache(folio) &&
|
||||
(mem_cgroup_swap_full(folio) || folio_test_mlocked(folio)))
|
||||
if (folio_test_swapcache(folio) && (sysctl_vm_swapcache_fastfree ||
|
||||
mem_cgroup_swap_full(folio) || folio_test_mlocked(folio)))
|
||||
folio_free_swap(folio);
|
||||
VM_BUG_ON_FOLIO(folio_test_active(folio), folio);
|
||||
if (!folio_test_mlocked(folio)) {
|
||||
|
@ -2539,8 +2600,6 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec,
|
|||
lruvec_add_folio(lruvec, folio);
|
||||
nr_pages = folio_nr_pages(folio);
|
||||
nr_moved += nr_pages;
|
||||
if (folio_test_active(folio))
|
||||
workingset_age_nonresident(lruvec, nr_pages);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -2708,6 +2767,10 @@ static void shrink_active_list(unsigned long nr_to_scan,
|
|||
nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &l_hold,
|
||||
&nr_scanned, sc, lru);
|
||||
|
||||
#ifdef CONFIG_EMM_RECLAIM
|
||||
sc->emm_nr_taken += nr_taken;
|
||||
#endif
|
||||
|
||||
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
|
||||
|
||||
if (!cgroup_reclaim(sc))
|
||||
|
@ -2844,6 +2907,15 @@ unsigned long reclaim_pages(struct list_head *folio_list)
|
|||
static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
|
||||
struct lruvec *lruvec, struct scan_control *sc)
|
||||
{
|
||||
#ifdef CONFIG_EMM_RECLAIM
|
||||
	/* Don't set skipped_deactivate here; if all reclaim failed, simply bail out */
|
||||
if (sc->emm_reclaiming && is_active_lru(lru))
|
||||
return 0;
|
||||
|
||||
if (sc->emm_aging && !is_active_lru(lru))
|
||||
return 0;
|
||||
#endif
|
||||
|
||||
if (is_active_lru(lru)) {
|
||||
if (sc->may_deactivate & (1 << is_file_lru(lru)))
|
||||
shrink_active_list(nr_to_scan, lruvec, sc, lru);
|
||||
|
@ -2923,7 +2995,7 @@ static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc)
|
|||
* Flush the memory cgroup stats, so that we read accurate per-memcg
|
||||
* lruvec stats for heuristics.
|
||||
*/
|
||||
mem_cgroup_flush_stats();
|
||||
mem_cgroup_flush_stats(sc->target_mem_cgroup);
|
||||
|
||||
/*
|
||||
* Determine the scan balance between anon and file LRUs.
|
||||
|
@ -3035,6 +3107,20 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
|
|||
unsigned long ap, fp;
|
||||
enum lru_list lru;
|
||||
|
||||
#ifdef CONFIG_EMM_RECLAIM
|
||||
if (sc->emm_running) {
|
||||
swappiness = sc->emm_swappiness;
|
||||
if (swappiness == 201) {
|
||||
scan_balance = SCAN_ANON;
|
||||
swappiness = 200;
|
||||
goto out;
|
||||
} else if (!swappiness) {
|
||||
scan_balance = SCAN_FILE;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
/* If we have no swap space, do not bother scanning anon folios. */
|
||||
if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) {
|
||||
scan_balance = SCAN_FILE;
|
||||
|
@ -3047,8 +3133,11 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
|
|||
* disable swapping for individual groups completely when
|
||||
* using the memory controller's swap limit feature would be
|
||||
* too expensive.
|
||||
*
|
||||
* If sysctl_vm_force_swappiness is set, don't reclaim anon
|
||||
* page even if the system may hit OOM.
|
||||
*/
|
||||
if (cgroup_reclaim(sc) && !swappiness) {
|
||||
if ((sysctl_vm_force_swappiness || cgroup_reclaim(sc)) && !swappiness) {
|
||||
scan_balance = SCAN_FILE;
|
||||
goto out;
|
||||
}
|
||||
|
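
Taken together, the emm_swappiness override earlier in get_scan_count() treats swappiness as an extended 0-201 range: 0 selects file-only scanning, 1-200 keeps the normal anon/file balancing, and 201 forces anon-only scanning. A small standalone sketch of that mapping (the enum below is local to the sketch, not the kernel's scan_balance values):

/* Sketch of the extended swappiness encoding used by EMM above.
 * The enum is local to this sketch, not the kernel's scan_balance. */
#include <stdio.h>

enum balance { BALANCE_FILE, BALANCE_ANON, BALANCE_FRACT };

static enum balance emm_scan_balance(unsigned int emm_swappiness)
{
	if (emm_swappiness == 201)	/* extended value: anon-only reclaim */
		return BALANCE_ANON;
	if (emm_swappiness == 0)	/* no swap: file-only reclaim */
		return BALANCE_FILE;
	return BALANCE_FRACT;		/* 1..200: usual anon/file balancing */
}

int main(void)
{
	printf("%d %d %d\n",
	       emm_scan_balance(0), emm_scan_balance(60), emm_scan_balance(201));
	return 0;
}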
@ -3243,6 +3332,8 @@ DEFINE_STATIC_KEY_ARRAY_TRUE(lru_gen_caps, NR_LRU_GEN_CAPS);
|
|||
DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS);
|
||||
#define get_cap(cap) static_branch_unlikely(&lru_gen_caps[cap])
|
||||
#endif
|
||||
/* Some module may want to behave differently when lru_gen is enabled */
|
||||
EXPORT_SYMBOL_GPL(lru_gen_caps);
|
||||
|
||||
static bool should_walk_mmu(void)
|
||||
{
|
||||
|
@ -3707,9 +3798,9 @@ static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain,
|
|||
struct lru_gen_folio *lrugen = &lruvec->lrugen;
|
||||
int hist = lru_hist_from_seq(lrugen->min_seq[type]);
|
||||
|
||||
pos->refaulted = lrugen->avg_refaulted[type][tier] +
|
||||
pos->refaulted = atomic_long_read(&lrugen->avg_refaulted[type][tier]) +
|
||||
atomic_long_read(&lrugen->refaulted[hist][type][tier]);
|
||||
pos->total = lrugen->avg_total[type][tier] +
|
||||
pos->total = atomic_long_read(&lrugen->avg_total[type][tier]) +
|
||||
atomic_long_read(&lrugen->evicted[hist][type][tier]);
|
||||
if (tier)
|
||||
pos->total += lrugen->protected[hist][type][tier - 1];
|
||||
|
@ -3734,15 +3825,15 @@ static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover)
|
|||
if (carryover) {
|
||||
unsigned long sum;
|
||||
|
||||
sum = lrugen->avg_refaulted[type][tier] +
|
||||
sum = atomic_long_read(&lrugen->avg_refaulted[type][tier]) +
|
||||
atomic_long_read(&lrugen->refaulted[hist][type][tier]);
|
||||
WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2);
|
||||
atomic_long_set(&lrugen->avg_refaulted[type][tier], sum / 2);
|
||||
|
||||
sum = lrugen->avg_total[type][tier] +
|
||||
sum = atomic_long_read(&lrugen->avg_total[type][tier]) +
|
||||
atomic_long_read(&lrugen->evicted[hist][type][tier]);
|
||||
if (tier)
|
||||
sum += lrugen->protected[hist][type][tier - 1];
|
||||
WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2);
|
||||
atomic_long_set(&lrugen->avg_total[type][tier], sum / 2);
|
||||
}
|
||||
|
||||
if (clear) {
|
||||
|
@ -4260,6 +4351,11 @@ restart:
|
|||
|
||||
walk_pmd_range(&val, addr, next, args);
|
||||
|
||||
#ifdef CONFIG_EMM_RECLAIM
|
||||
if (walk->force_full_scan)
|
||||
continue;
|
||||
#endif
|
||||
|
||||
if (need_resched() || walk->batched >= MAX_LRU_BATCH) {
|
||||
end = (addr | ~PUD_MASK) + 1;
|
||||
goto done;
|
||||
|
@ -4363,7 +4459,12 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
|
|||
struct lru_gen_folio *lrugen = &lruvec->lrugen;
|
||||
int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
|
||||
|
||||
if (type == LRU_GEN_ANON && !can_swap)
|
||||
	/*
	 * Keep tracking the anon gen even if swappiness is not set: EMM can
	 * adjust swappiness dynamically and may drop it to 0 from time to
	 * time, so we can't afford to lose the hotness info here.
	 */
	if (type == LRU_GEN_ANON && !can_swap && !IS_ENABLED(CONFIG_EMM_RECLAIM))
		goto done;
|
||||
|
||||
/* prevent cold/hot inversion if force_scan is true */
|
||||
|
@ -5134,6 +5235,21 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw
|
|||
int tier = -1;
|
||||
DEFINE_MIN_SEQ(lruvec);
|
||||
|
||||
#ifdef CONFIG_EMM_RECLAIM
	/*
	 * When called by EMM we must get here from run_eviction directly;
	 * otherwise the code is broken and needs to be fixed.
	 *
	 * For swappiness == 0, type = LRU_GEN_FILE is set below. But to
	 * force an ANON isolation we have to extend swappiness to 201
	 * and return directly to avoid the FILE LRU fallback.
	 */
	if (sc->emm_running && sc->emm_swappiness == 201) {
		*type_scanned = LRU_GEN_ANON;
		return scan_folios(lruvec, sc, LRU_GEN_ANON, MAX_NR_TIERS, list);
	}
#endif
|
||||
|
||||
/*
|
||||
* Try to make the obvious choice first. When anon and file are both
|
||||
* available from the same generation, interpret swappiness 1 as file
|
||||
|
@ -5922,8 +6038,8 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
|
|||
|
||||
if (seq == max_seq) {
|
||||
s = "RT ";
|
||||
n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]);
|
||||
n[1] = READ_ONCE(lrugen->avg_total[type][tier]);
|
||||
n[0] = atomic_long_read(&lrugen->avg_refaulted[type][tier]);
|
||||
n[1] = atomic_long_read(&lrugen->avg_total[type][tier]);
|
||||
} else if (seq == min_seq[type] || NR_HIST_GENS > 1) {
|
||||
s = "rep";
|
||||
n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]);
|
||||
|
@ -6289,6 +6405,43 @@ static int __init init_lru_gen(void)
|
|||
};
|
||||
late_initcall(init_lru_gen);
|
||||
|
||||
#ifdef CONFIG_ENHANCED_MM
|
||||
static int __init parse_cmdlinelru_gen(char *s)
|
||||
{
|
||||
int i, nid;
|
||||
bool enable;
|
||||
|
||||
if (!strcmp(s, "1") || !strcmp(s, "y"))
|
||||
enable = 1;
|
||||
else if (!strcmp(s, "0") || !strcmp(s, "n"))
|
||||
enable = 0;
|
||||
else
|
||||
return 1;
|
||||
|
||||
for_each_node(nid) {
|
||||
struct lruvec *lruvec = get_lruvec(NULL, nid);
|
||||
|
||||
if (!lruvec)
|
||||
continue;
|
||||
|
||||
VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
|
||||
VM_WARN_ON_ONCE(!state_is_valid(lruvec));
|
||||
|
||||
lruvec->lrugen.enabled = enable;
|
||||
}
|
||||
|
||||
for (i = 0; i < NR_LRU_GEN_CAPS; i++) {
|
||||
if (enable)
|
||||
static_branch_enable(&lru_gen_caps[i]);
|
||||
else
|
||||
static_branch_disable(&lru_gen_caps[i]);
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
__setup("lru_gen=", parse_cmdlinelru_gen);
|
||||
#endif
|
||||
|
||||
#else /* !CONFIG_LRU_GEN */
|
||||
|
||||
static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
|
||||
|
@ -6342,6 +6495,9 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
|
|||
|
||||
blk_start_plug(&plug);
|
||||
while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
|
||||
#ifdef CONFIG_EMM_RECLAIM
|
||||
(sc->emm_aging && sc->emm_swappiness == 201 && nr[LRU_ACTIVE_ANON]) ||
|
||||
#endif
|
||||
nr[LRU_INACTIVE_FILE]) {
|
||||
unsigned long nr_anon, nr_file, percentage;
|
||||
unsigned long nr_scanned;
|
||||
|
@ -6878,6 +7034,11 @@ retry:
|
|||
if (sc->nr_reclaimed >= sc->nr_to_reclaim)
|
||||
break;
|
||||
|
||||
#ifdef CONFIG_EMM_RECLAIM
|
||||
if (sc->emm_aging && sc->emm_nr_taken >= sc->nr_to_reclaim)
|
||||
break;
|
||||
#endif
|
||||
|
||||
if (sc->compaction_ready)
|
||||
break;
|
||||
|
||||
|
@ -7196,6 +7357,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
|
|||
|
||||
return nr_reclaimed;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(try_to_free_mem_cgroup_pages);
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_PAGECACHE_LIMIT
|
||||
|
@ -8560,6 +8722,149 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
|
|||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_EMM_RECLAIM
|
||||
int memcg_emm_reclaim(struct mem_cgroup *memcg, int mode,
|
||||
unsigned long nr_pages, unsigned long swappiness)
|
||||
{
|
||||
unsigned int noreclaim_flag;
|
||||
struct zonelist *zonelist;
|
||||
unsigned long nr_shrinked;
|
||||
bool retry = true;
|
||||
|
||||
struct scan_control sc = {
|
||||
.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
|
||||
.gfp_mask = GFP_KERNEL,
|
||||
.reclaim_idx = MAX_NR_ZONES - 1,
|
||||
.target_mem_cgroup = memcg,
|
||||
.priority = DEF_PRIORITY,
|
||||
.may_writepage = true,
|
||||
.may_unmap = true,
|
||||
.may_swap = !!swappiness,
|
||||
.emm_swappiness = swappiness,
|
||||
.emm_reclaiming = mode == EMM_RECLAIM,
|
||||
.emm_aging = mode == EMM_AGE,
|
||||
};
|
||||
|
||||
again:
|
||||
/*
|
||||
* Copied from try_to_free_mem_cgroup_pages().
|
||||
*/
|
||||
zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
|
||||
|
||||
set_task_reclaim_state(current, &sc.reclaim_state);
|
||||
|
||||
noreclaim_flag = memalloc_noreclaim_save();
|
||||
|
||||
nr_shrinked = do_try_to_free_pages(zonelist, &sc);
|
||||
|
||||
if (mode != EMM_RECLAIM) {
|
||||
nr_shrinked += sc.emm_nr_taken;
|
||||
sc.emm_nr_taken = 0;
|
||||
}
|
||||
|
||||
if (nr_shrinked) {
|
||||
nr_pages -= min(nr_pages, nr_shrinked);
|
||||
} else if (retry) {
|
||||
retry = false;
|
||||
lru_add_drain_all();
|
||||
goto again;
|
||||
}
|
||||
|
||||
memalloc_noreclaim_restore(noreclaim_flag);
|
||||
|
||||
set_task_reclaim_state(current, NULL);
|
||||
|
||||
if (nr_pages)
|
||||
return -EAGAIN;
|
||||
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(memcg_emm_reclaim);
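
The return convention of memcg_emm_reclaim() is: decrement the requested nr_pages by whatever was shrunk, retry once after lru_add_drain_all() if nothing moved, and report -EAGAIN when the quota was not fully met. A standalone mock of just that control flow (do_reclaim_mock and emm_reclaim_mock are invented for this sketch; they only stand in for do_try_to_free_pages() and the function above):

/* Mock of the memcg_emm_reclaim() retry/accounting loop; do_reclaim_mock is a
 * stand-in for do_try_to_free_pages() and exists only for this sketch. */
#include <stdio.h>

#define EAGAIN 11

static unsigned long do_reclaim_mock(int attempt)
{
	/* Pretend the first pass frees nothing and the retry frees 32 pages. */
	return attempt == 0 ? 0 : 32;
}

static int emm_reclaim_mock(unsigned long nr_pages)
{
	unsigned long shrunk;
	int attempt = 0;
	int retried = 0;

again:
	shrunk = do_reclaim_mock(attempt++);

	if (shrunk) {
		nr_pages -= (shrunk < nr_pages) ? shrunk : nr_pages;
	} else if (!retried) {
		retried = 1;		/* mirrors the lru_add_drain_all() retry */
		goto again;
	}

	return nr_pages ? -EAGAIN : 0;
}

int main(void)
{
	printf("want 16 pages: %d\n", emm_reclaim_mock(16));	/* 0: quota met */
	printf("want 64 pages: %d\n", emm_reclaim_mock(64));	/* -11: -EAGAIN */
	return 0;
}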
|
||||
|
||||
#ifdef CONFIG_LRU_GEN
|
||||
int memcg_lru_gen_emm_reclaim(struct mem_cgroup *memcg, int mode,
|
||||
unsigned long nr_pages, unsigned long swappiness)
|
||||
{
|
||||
unsigned int flags;
|
||||
struct blk_plug plug;
|
||||
struct lruvec *lruvec;
|
||||
struct lru_gen_mm_walk *walk;
|
||||
int ret = 0, nid;
|
||||
|
||||
struct scan_control sc = {
|
||||
.gfp_mask = GFP_KERNEL,
|
||||
.reclaim_idx = MAX_NR_ZONES - 1,
|
||||
.may_writepage = true,
|
||||
.may_unmap = true,
|
||||
.may_swap = !!swappiness,
|
||||
.emm_swappiness = swappiness,
|
||||
.emm_reclaiming = mode == EMM_RECLAIM,
|
||||
.emm_aging = mode == EMM_AGE,
|
||||
};
|
||||
|
||||
set_task_reclaim_state(current, &sc.reclaim_state);
|
||||
flags = memalloc_noreclaim_save();
|
||||
|
||||
walk = set_mm_walk(NULL, true);
|
||||
if (!walk) {
|
||||
ret = -ENOMEM;
|
||||
goto done;
|
||||
}
|
||||
|
||||
if (nr_pages)
|
||||
walk->force_scan = true;
|
||||
|
||||
if (nr_pages == PAGE_COUNTER_MAX)
|
||||
walk->force_full_scan = true;
|
||||
|
||||
	/* Don't expose the extended swappiness to the rest of lru_gen */
|
||||
if (swappiness > 200)
|
||||
swappiness = 200;
|
||||
|
||||
blk_start_plug(&plug);
|
||||
for_each_node_state(nid, N_MEMORY) {
|
||||
lruvec = get_lruvec(memcg, nid);
|
||||
if (lruvec) {
|
||||
DEFINE_MAX_SEQ(lruvec);
|
||||
if (mode == EMM_AGE) {
|
||||
ret = run_aging(lruvec, max_seq, &sc,
|
||||
!!swappiness, !!nr_pages);
|
||||
} else if (mode == EMM_RECLAIM) {
|
||||
ret = run_eviction(lruvec, max_seq - MIN_NR_GENS, &sc,
|
||||
swappiness, nr_pages);
|
||||
nr_pages -= min(nr_pages, sc.nr_reclaimed);
|
||||
|
||||
/*
|
||||
* If swappiness is less than 100 (biased towards cache), also reclaim slab,
* using DEF_PRIORITY, which means 1/4096 of the inactive objects will be
* reclaimed. If many pages were asked to be reclaimed, shrink harder.
|
||||
*/
|
||||
if (swappiness <= 100)
|
||||
shrink_slab(GFP_KERNEL, nid, memcg,
|
||||
(nr_pages > MAX_LRU_BATCH) ? DEF_PRIORITY - 1 : DEF_PRIORITY);
|
||||
} else {
|
||||
ret = -EINVAL;
|
||||
}
|
||||
|
||||
if (ret < 0)
|
||||
break;
|
||||
}
|
||||
}
|
||||
blk_finish_plug(&plug);
|
||||
done:
|
||||
clear_mm_walk();
|
||||
memalloc_noreclaim_restore(flags);
|
||||
set_task_reclaim_state(current, NULL);
|
||||
|
||||
if (ret < 0 || mode != EMM_RECLAIM)
|
||||
return ret;
|
||||
|
||||
	return nr_pages ? -EAGAIN : 0;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(memcg_lru_gen_emm_reclaim);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/**
|
||||
* check_move_unevictable_folios - Move evictable folios to appropriate zone
|
||||
* lru list
|
||||
|
|
|
@ -1027,6 +1027,7 @@ unsigned long node_page_state(struct pglist_data *pgdat,
|
|||
|
||||
return node_page_state_pages(pgdat, item);
|
||||
}
|
||||
EXPORT_SYMBOL(node_page_state);
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_COMPACTION
|
||||
|
|
mm/workingset.c (665 lines changed)
|
@ -64,74 +64,64 @@
|
|||
* thrashing on the inactive list, after which refaulting pages can be
|
||||
* activated optimistically to compete with the existing active pages.
|
||||
*
|
||||
* Approximating inactive page access frequency - Observations:
|
||||
* For such approximation, we introduce a counter `eviction` (E)
|
||||
* here. This counter increases each time a page is evicted, and each evicted
|
||||
* page will have a shadow that stores the counter reading at the eviction
|
||||
* time as a timestamp. So when an evicted page was faulted again, we have:
|
||||
*
|
||||
* 1. When a page is accessed for the first time, it is added to the
|
||||
* head of the inactive list, slides every existing inactive page
|
||||
* towards the tail by one slot, and pushes the current tail page
|
||||
* out of memory.
|
||||
* Let SP = ((E's reading @ current) - (E's reading @ eviction))
|
||||
*
|
||||
* 2. When a page is accessed for the second time, it is promoted to
|
||||
* the active list, shrinking the inactive list by one slot. This
|
||||
* also slides all inactive pages that were faulted into the cache
|
||||
* more recently than the activated page towards the tail of the
|
||||
* inactive list.
|
||||
* +-memory available to cache-+
|
||||
* | |
|
||||
* +-------------------------+===============+===========+
|
||||
* | * shadows O O O | INACTIVE | ACTIVE |
|
||||
* +-+-----------------------+===============+===========+
|
||||
* | |
|
||||
* +-----------------------+
|
||||
* | SP
|
||||
* fault page O -> Hole left by previously faulted in pages
|
||||
* * -> The page corresponding to SP
|
||||
*
|
||||
* Thus:
|
||||
* Here SP stands for how far the current workload could push a page
* out of available memory. Since every evicted page was once the head of
* the INACTIVE list, the page could have an access distance of:
|
||||
*
|
||||
* 1. The sum of evictions and activations between any two points in
|
||||
* time indicate the minimum number of inactive pages accessed in
|
||||
* between.
|
||||
* SP + NR_INACTIVE
|
||||
*
|
||||
* 2. Moving one inactive page N page slots towards the tail of the
|
||||
* list requires at least N inactive page accesses.
|
||||
* So if:
|
||||
*
|
||||
* Combining these:
|
||||
* SP + NR_INACTIVE < NR_INACTIVE + NR_ACTIVE
|
||||
*
|
||||
* 1. When a page is finally evicted from memory, the number of
|
||||
* inactive pages accessed while the page was in cache is at least
|
||||
* the number of page slots on the inactive list.
|
||||
* Which can be simplified to:
|
||||
*
|
||||
* 2. In addition, measuring the sum of evictions and activations (E)
|
||||
* at the time of a page's eviction, and comparing it to another
|
||||
* reading (R) at the time the page faults back into memory tells
|
||||
* the minimum number of accesses while the page was not cached.
|
||||
* This is called the refault distance.
|
||||
* SP < NR_ACTIVE
|
||||
*
|
||||
* Because the first access of the page was the fault and the second
|
||||
* access the refault, we combine the in-cache distance with the
|
||||
* out-of-cache distance to get the complete minimum access distance
|
||||
* of this page:
|
||||
* Then the page is worth re-activating into the ACTIVE part, since its
* access distance is shorter than the total memory needed to make it stay.
|
||||
*
|
||||
* NR_inactive + (R - E)
|
||||
* Since this is only an estimate based on several hypotheses, and it
* could weaken the LRU's ability to distinguish a workingset from plain
* caches, throttle it by two factors:
|
||||
*
|
||||
* And knowing the minimum access distance of a page, we can easily
|
||||
* tell if the page would be able to stay in cache assuming all page
|
||||
* slots in the cache were available:
|
||||
* 1. Notice that refaulted-in pages may leave "holes" in the shadow
*    part of the LRU; that part is left unhandled on purpose to lower
*    the re-activation rate for pages with a large SP value (the larger
*    the SP value of a page, the more likely it is affected by such
*    holes).
|
||||
* 2. When the ACTIVE part of the LRU is long enough, challenging ACTIVE
*    pages by re-activating a previously INACTIVE page that faulted only
*    once may not be a good idea, so throttle re-activation when
*    ACTIVE > INACTIVE by comparing against INACTIVE instead.
|
||||
*
|
||||
* NR_inactive + (R - E) <= NR_inactive + NR_active
|
||||
* Combined all above, we have:
|
||||
* Upon refault, if any of the following conditions is met, mark the page
|
||||
* as active:
|
||||
*
|
||||
* If we have swap we should consider about NR_inactive_anon and
|
||||
* NR_active_anon, so for page cache and anonymous respectively:
|
||||
*
|
||||
* NR_inactive_file + (R - E) <= NR_inactive_file + NR_active_file
|
||||
* + NR_inactive_anon + NR_active_anon
|
||||
*
|
||||
* NR_inactive_anon + (R - E) <= NR_inactive_anon + NR_active_anon
|
||||
* + NR_inactive_file + NR_active_file
|
||||
*
|
||||
* Which can be further simplified to:
|
||||
*
|
||||
* (R - E) <= NR_active_file + NR_inactive_anon + NR_active_anon
|
||||
*
|
||||
* (R - E) <= NR_active_anon + NR_inactive_file + NR_active_file
|
||||
*
|
||||
* Put into words, the refault distance (out-of-cache) can be seen as
|
||||
* a deficit in inactive list space (in-cache). If the inactive list
|
||||
* had (R - E) more page slots, the page would not have been evicted
|
||||
* in between accesses, but activated instead. And on a full system,
|
||||
* the only thing eating into inactive list space is active pages.
|
||||
* - If ACTIVE LRU is low (NR_ACTIVE < NR_INACTIVE), check if:
|
||||
* SP < NR_ACTIVE
|
||||
*
|
||||
* - If ACTIVE LRU is high (NR_ACTIVE >= NR_INACTIVE), check if:
|
||||
* SP < NR_INACTIVE
|
||||
*
|
||||
* Refaulting inactive pages
|
||||
*
|
||||
|
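
As a worked example of the two throttled checks above (numbers invented for illustration): with 6 GiB of inactive and 2 GiB of active cache, a refaulting page whose SP corresponds to 1.5 GiB of evictions is re-activated (SP < NR_ACTIVE), while one with an SP of 3 GiB is not; with the split reversed (2 GiB inactive, 6 GiB active), the 3 GiB SP is still rejected because the comparison is throttled to NR_INACTIVE.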
@ -169,7 +159,7 @@
|
|||
* Implementation
|
||||
*
|
||||
* For each node's LRU lists, a counter for inactive evictions and
|
||||
* activations is maintained (node->nonresident_age).
|
||||
* activations is maintained (node->evictions).
|
||||
*
|
||||
* On eviction, a snapshot of this counter (along with some bits to
|
||||
* identify the node) is stored in the now empty page cache
|
||||
|
@ -180,10 +170,12 @@
|
|||
*/
|
||||
|
||||
#define WORKINGSET_SHIFT 1
|
||||
#define EVICTION_SHIFT ((BITS_PER_LONG - BITS_PER_XA_VALUE) + \
|
||||
#define EVICTION_SHIFT ((BITS_PER_LONG - BITS_PER_XA_VALUE) + \
|
||||
WORKINGSET_SHIFT + NODES_SHIFT + \
|
||||
MEM_CGROUP_ID_SHIFT)
|
||||
#define EVICTION_BITS (BITS_PER_LONG - (EVICTION_SHIFT))
|
||||
#define EVICTION_MASK (~0UL >> EVICTION_SHIFT)
|
||||
#define LRU_GEN_EVICTION_BITS (EVICTION_BITS - LRU_REFS_WIDTH)
|
||||
|
||||
/*
|
||||
* Eviction timestamps need to be able to cover the full range of
|
||||
|
@ -194,6 +186,7 @@
|
|||
* evictions into coarser buckets by shaving off lower timestamp bits.
|
||||
*/
|
||||
static unsigned int bucket_order __read_mostly;
|
||||
static unsigned int lru_gen_bucket_order __read_mostly;
|
||||
|
||||
static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
|
||||
bool workingset)
|
||||
|
@ -226,134 +219,100 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
|
|||
*workingsetp = workingset;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_LRU_GEN
|
||||
|
||||
static void *lru_gen_eviction(struct folio *folio)
|
||||
#ifdef CONFIG_EMM_WORKINGSET_TRACKING
|
||||
static void workingset_eviction_file(struct lruvec *lruvec, unsigned long nr_pages)
|
||||
{
|
||||
int hist;
|
||||
unsigned long token;
|
||||
unsigned long min_seq;
|
||||
struct lruvec *lruvec;
|
||||
struct lru_gen_folio *lrugen;
|
||||
int type = folio_is_file_lru(folio);
|
||||
int delta = folio_nr_pages(folio);
|
||||
int refs = folio_lru_refs(folio);
|
||||
int tier = lru_tier_from_refs(refs);
|
||||
struct mem_cgroup *memcg = folio_memcg(folio);
|
||||
struct pglist_data *pgdat = folio_pgdat(folio);
|
||||
|
||||
BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT);
|
||||
|
||||
lruvec = mem_cgroup_lruvec(memcg, pgdat);
|
||||
lrugen = &lruvec->lrugen;
|
||||
min_seq = READ_ONCE(lrugen->min_seq[type]);
|
||||
token = (min_seq << LRU_REFS_WIDTH) | max(refs - 1, 0);
|
||||
|
||||
hist = lru_hist_from_seq(min_seq);
|
||||
atomic_long_add(delta, &lrugen->evicted[hist][type][tier]);
|
||||
|
||||
return pack_shadow(mem_cgroup_id(memcg), pgdat, token, refs);
|
||||
do {
|
||||
atomic_long_add(nr_pages, &lruvec->evicted_file);
|
||||
} while ((lruvec = parent_lruvec(lruvec)));
|
||||
}
|
||||
|
||||
/*
|
||||
* Tests if the shadow entry is for a folio that was recently evicted.
|
||||
* Fills in @lruvec, @token, @workingset with the values unpacked from shadow.
|
||||
* If a page is evicted and never comes back, either the page is really cold or it
* has been deleted on disk.
*
* For a cold page, it could take up all of memory until kswapd starts shrinking it.
* For a deleted page, the shadow will be gone too, so there is no refault.
*
* If a page comes back before its shadow is released, that's a refault, which means
* file page reclaim has gone over-aggressive and the page would not have been evicted
* if all the pages, including itself, had stayed in memory.
|
||||
*/
|
||||
static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec,
|
||||
unsigned long *token, bool *workingset)
|
||||
static void workingset_refault_track(struct lruvec *lruvec, unsigned long refault_distance)
|
||||
{
|
||||
int memcg_id;
|
||||
unsigned long min_seq;
|
||||
struct mem_cgroup *memcg;
|
||||
struct pglist_data *pgdat;
|
||||
|
||||
unpack_shadow(shadow, &memcg_id, &pgdat, token, workingset);
|
||||
|
||||
memcg = mem_cgroup_from_id(memcg_id);
|
||||
*lruvec = mem_cgroup_lruvec(memcg, pgdat);
|
||||
|
||||
min_seq = READ_ONCE((*lruvec)->lrugen.min_seq[file]);
|
||||
return (*token >> LRU_REFS_WIDTH) == (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH));
|
||||
do {
|
||||
/*
|
||||
* No lock is taken here, for better performance; some events may
* get lost, but this is a rough estimate anyway.
|
||||
*/
|
||||
WRITE_ONCE(lruvec->refault_count, READ_ONCE(lruvec->refault_count) + 1);
|
||||
WRITE_ONCE(lruvec->total_distance, READ_ONCE(lruvec->total_distance) + refault_distance);
|
||||
} while ((lruvec = parent_lruvec(lruvec)));
|
||||
}
|
||||
|
||||
static void lru_gen_refault(struct folio *folio, void *shadow)
|
||||
#else
|
||||
static void workingset_eviction_file(struct lruvec *lruvec, unsigned long nr_pages)
|
||||
{
|
||||
bool recent;
|
||||
int hist, tier, refs;
|
||||
bool workingset;
|
||||
unsigned long token;
|
||||
struct lruvec *lruvec;
|
||||
struct lru_gen_folio *lrugen;
|
||||
int type = folio_is_file_lru(folio);
|
||||
int delta = folio_nr_pages(folio);
|
||||
}
|
||||
static void workingset_refault_track(struct lruvec *lruvec, unsigned long refault_distance)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
||||
rcu_read_lock();
|
||||
|
||||
recent = lru_gen_test_recent(shadow, type, &lruvec, &token, &workingset);
|
||||
if (lruvec != folio_lruvec(folio))
|
||||
goto unlock;
|
||||
|
||||
mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta);
|
||||
|
||||
if (!recent)
|
||||
goto unlock;
|
||||
|
||||
lrugen = &lruvec->lrugen;
|
||||
|
||||
hist = lru_hist_from_seq(READ_ONCE(lrugen->min_seq[type]));
|
||||
/* see the comment in folio_lru_refs() */
|
||||
refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + workingset;
|
||||
tier = lru_tier_from_refs(refs);
|
||||
|
||||
atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]);
|
||||
mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);
|
||||
static inline struct mem_cgroup *try_get_flush_memcg(int memcgid)
|
||||
{
|
||||
struct mem_cgroup *memcg;
|
||||
|
||||
/*
|
||||
* Count the following two cases as stalls:
|
||||
* 1. For pages accessed through page tables, hotter pages pushed out
|
||||
* hot pages which refaulted immediately.
|
||||
* 2. For pages accessed multiple times through file descriptors,
|
||||
* they would have been protected by sort_folio().
|
||||
* Look up the memcg associated with the stored ID. It might
|
||||
* have been deleted since the folio's eviction.
|
||||
*
|
||||
* Note that in rare events the ID could have been recycled
|
||||
* for a new cgroup that refaults a shared folio. This is
|
||||
* impossible to tell from the available data. However, this
|
||||
* should be a rare and limited disturbance, and activations
|
||||
* are always speculative anyway. Ultimately, it's the aging
|
||||
* algorithm's job to shake out the minimum access frequency
|
||||
* for the active cache.
|
||||
*
|
||||
* XXX: On !CONFIG_MEMCG, this will always return NULL; it
|
||||
* would be better if the root_mem_cgroup existed in all
|
||||
* configurations instead.
|
||||
*/
|
||||
if (lru_gen_in_fault() || refs >= BIT(LRU_REFS_WIDTH) - 1) {
|
||||
set_mask_bits(&folio->flags, 0, LRU_REFS_MASK | BIT(PG_workingset));
|
||||
mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta);
|
||||
rcu_read_lock();
|
||||
memcg = mem_cgroup_from_id(memcgid);
|
||||
if (!mem_cgroup_disabled() &&
|
||||
(!memcg || !mem_cgroup_tryget(memcg))) {
|
||||
rcu_read_unlock();
|
||||
return NULL;
|
||||
}
|
||||
unlock:
|
||||
rcu_read_unlock();
|
||||
|
||||
/*
|
||||
* Flush stats (and potentially sleep) outside the RCU read section.
|
||||
* XXX: With per-memcg flushing and thresholding, is ratelimiting
|
||||
* still needed here?
|
||||
*/
|
||||
mem_cgroup_flush_stats_ratelimited(memcg);
|
||||
|
||||
return memcg;
|
||||
}
|
||||
|
||||
#else /* !CONFIG_LRU_GEN */
|
||||
|
||||
static void *lru_gen_eviction(struct folio *folio)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec,
|
||||
unsigned long *token, bool *workingset)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
static void lru_gen_refault(struct folio *folio, void *shadow)
|
||||
{
|
||||
}
|
||||
|
||||
#endif /* CONFIG_LRU_GEN */
|
||||
|
||||
/**
|
||||
* workingset_age_nonresident - age non-resident entries as LRU ages
|
||||
* @lruvec: the lruvec that was aged
|
||||
* @nr_pages: the number of pages to count
|
||||
* lru_eviction - age non-resident entries as LRU ages
|
||||
*
|
||||
* As in-memory pages are aged, non-resident pages need to be aged as
|
||||
* well, in order for the refault distances later on to be comparable
|
||||
* to the in-memory dimensions. This function allows reclaim and LRU
|
||||
* operations to drive the non-resident aging along in parallel.
|
||||
*/
|
||||
void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages)
|
||||
static inline unsigned long lru_eviction(struct lruvec *lruvec, int type,
|
||||
int nr_pages, int bits, int bucket_order)
|
||||
{
|
||||
unsigned long eviction;
|
||||
|
||||
if (type)
|
||||
workingset_eviction_file(lruvec, nr_pages);
|
||||
|
||||
/*
|
||||
* Reclaiming a cgroup means reclaiming all its children in a
|
||||
* round-robin fashion. That means that each cgroup has an LRU
|
||||
|
@ -365,11 +324,241 @@ void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages)
|
|||
* the virtual inactive lists of all its parents, including
|
||||
* the root cgroup's, age as well.
|
||||
*/
|
||||
do {
|
||||
atomic_long_add(nr_pages, &lruvec->nonresident_age);
|
||||
} while ((lruvec = parent_lruvec(lruvec)));
|
||||
eviction = atomic_long_fetch_add_relaxed(nr_pages, &lruvec->evictions[type]);
|
||||
while ((lruvec = parent_lruvec(lruvec)))
|
||||
atomic_long_add(nr_pages, &lruvec->evictions[type]);
|
||||
|
||||
/* Truncate the timestamp to fit in limited bits */
|
||||
eviction >>= bucket_order;
|
||||
eviction &= ~0UL >> (BITS_PER_LONG - bits);
|
||||
return eviction;
|
||||
}
|
||||
|
||||
/*
|
||||
* lru_distance - calculate the refault distance based on non-resident age
|
||||
*/
|
||||
static inline unsigned long lru_distance(struct lruvec *lruvec, int type,
|
||||
unsigned long eviction, int bits,
|
||||
int bucket_order)
|
||||
{
|
||||
unsigned long refault = atomic_long_read(&lruvec->evictions[type]);
|
||||
|
||||
eviction &= ~0UL >> (BITS_PER_LONG - bits);
|
||||
eviction <<= bucket_order;
|
||||
|
||||
/*
|
||||
* The unsigned subtraction here gives an accurate distance
|
||||
* across non-resident age overflows in most cases. There is a
|
||||
* special case: usually, shadow entries have a short lifetime
|
||||
* and are either refaulted or reclaimed along with the inode
|
||||
* before they get too old. But it is not impossible for the
|
||||
* non-resident age to lap a shadow entry in the field, which
|
||||
* can then result in a false small refault distance, leading
|
||||
* to a false activation should this old entry actually
|
||||
* refault again. However, earlier kernels used to deactivate
|
||||
* unconditionally with *every* reclaim invocation for the
|
||||
* longest time, so the occasional inappropriate activation
|
||||
* leading to pressure on the active list is not a problem.
|
||||
*/
|
||||
return (refault - eviction) & (~0UL >> (BITS_PER_LONG - bits));
|
||||
}
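
A standalone demonstration of why the truncation plus unsigned subtraction above stays correct across counter wraparound, using made-up small parameters (8 timestamp bits, a bucket_order of 2); the real code uses EVICTION_BITS and a bucket_order computed at boot:

/* Demonstration of the masked, wraparound-safe distance idea used by
 * lru_eviction()/lru_distance() above; all parameters are made up. */
#include <stdio.h>

#define BITS		8
#define BUCKET_ORDER	2
#define MASK		((1UL << BITS) - 1)

static unsigned long pack_eviction(unsigned long counter)
{
	/* Shave off low bits and keep only BITS bits, as the shadow entry does. */
	return (counter >> BUCKET_ORDER) & MASK;
}

static unsigned long refault_distance(unsigned long counter_now, unsigned long packed)
{
	unsigned long now = (counter_now >> BUCKET_ORDER) & MASK;

	/* Unsigned subtraction + mask recovers the distance even after wraparound. */
	return ((now - packed) & MASK) << BUCKET_ORDER;
}

int main(void)
{
	unsigned long at_eviction = 1000;	/* eviction counter when the page left */
	unsigned long at_refault  = 1900;	/* eviction counter when it came back */
	unsigned long shadow = pack_eviction(at_eviction);

	/* The truncated timestamp wrapped (1900 >> 2 = 475 > 255), yet the
	 * recovered distance is still the true 900. */
	printf("distance = %lu\n", refault_distance(at_refault, shadow));
	return 0;
}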
|
||||
|
||||
#ifdef CONFIG_LRU_GEN
|
||||
|
||||
static void *lru_gen_eviction(struct folio *folio)
|
||||
{
|
||||
int hist;
|
||||
unsigned long token;
|
||||
struct lruvec *lruvec;
|
||||
struct lru_gen_folio *lrugen;
|
||||
int type = folio_is_file_lru(folio);
|
||||
int delta = folio_nr_pages(folio);
|
||||
int refs = folio_lru_refs(folio);
|
||||
int tier = lru_tier_from_refs(refs);
|
||||
struct mem_cgroup *memcg = folio_memcg(folio);
|
||||
struct pglist_data *pgdat = folio_pgdat(folio);
|
||||
|
||||
BUILD_BUG_ON(LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT);
|
||||
|
||||
lruvec = mem_cgroup_lruvec(memcg, pgdat);
|
||||
lrugen = &lruvec->lrugen;
|
||||
hist = lru_hist_of_min_seq(lruvec, type);
|
||||
|
||||
token = max(refs - 1, 0);
|
||||
token <<= LRU_GEN_EVICTION_BITS;
|
||||
token |= lru_eviction(lruvec, type, delta,
|
||||
LRU_GEN_EVICTION_BITS, lru_gen_bucket_order);
|
||||
atomic_long_add(delta, &lrugen->evicted[hist][type][tier]);
|
||||
|
||||
return pack_shadow(mem_cgroup_id(memcg), pgdat, token, refs);
|
||||
}
|
||||
|
||||
/*
|
||||
* Tests if the shadow entry is for a folio that was recently evicted.
|
||||
* Fills in @lruvec, @token, @workingset with the values unpacked from shadow.
|
||||
*/
|
||||
static inline bool lru_gen_test_recent(struct lruvec *lruvec, bool type,
|
||||
unsigned long distance)
|
||||
{
|
||||
int hist;
|
||||
unsigned long evicted = 0;
|
||||
struct lru_gen_folio *lrugen;
|
||||
|
||||
lrugen = &lruvec->lrugen;
|
||||
hist = lru_hist_of_min_seq(lruvec, type);
|
||||
|
||||
for (int tier = 0; tier < MAX_NR_TIERS; tier++)
|
||||
evicted += atomic_long_read(&lrugen->evicted[hist][type][tier]);
|
||||
|
||||
return distance <= evicted;
|
||||
}
|
||||
|
||||
enum lru_gen_refault_distance {
|
||||
DISTANCE_SHORT,
|
||||
DISTANCE_MID,
|
||||
DISTANCE_LONG,
|
||||
DISTANCE_NONE,
|
||||
};
|
||||
|
||||
static inline int lru_gen_test_refault(struct lruvec *lruvec, bool file,
|
||||
unsigned long distance, bool can_swap)
|
||||
{
|
||||
unsigned long total;
|
||||
|
||||
total = lruvec_page_state(lruvec, NR_ACTIVE_FILE) +
|
||||
lruvec_page_state(lruvec, NR_INACTIVE_FILE);
|
||||
|
||||
if (can_swap)
|
||||
total += lruvec_page_state(lruvec, NR_ACTIVE_ANON) +
|
||||
lruvec_page_state(lruvec, NR_INACTIVE_ANON);
|
||||
|
||||
/* Imagine having an extra gen outside of available memory */
|
||||
if (distance <= total / MAX_NR_GENS)
|
||||
return DISTANCE_SHORT;
|
||||
if (distance <= total / MIN_NR_GENS)
|
||||
return DISTANCE_MID;
|
||||
if (distance <= total)
|
||||
return DISTANCE_LONG;
|
||||
return DISTANCE_NONE;
|
||||
}
|
||||
|
||||
static void lru_gen_refault(struct folio *folio, void *shadow)
|
||||
{
|
||||
int memcgid;
|
||||
bool recent;
|
||||
bool workingset;
|
||||
unsigned long token;
|
||||
int hist, tier, refs;
|
||||
struct lruvec *lruvec;
|
||||
struct mem_cgroup *memcg;
|
||||
struct pglist_data *pgdat;
|
||||
struct lru_gen_folio *lrugen;
|
||||
int type = folio_is_file_lru(folio);
|
||||
int delta = folio_nr_pages(folio);
|
||||
int distance;
|
||||
unsigned long refault_distance, protect_tier;
|
||||
|
||||
unpack_shadow(shadow, &memcgid, &pgdat, &token, &workingset);
|
||||
memcg = try_get_flush_memcg(memcgid);
|
||||
if (!memcg)
|
||||
return;
|
||||
|
||||
lruvec = mem_cgroup_lruvec(memcg, pgdat);
|
||||
if (lruvec != folio_lruvec(folio))
|
||||
goto unlock;
|
||||
|
||||
mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta);
|
||||
refault_distance = lru_distance(lruvec, type, token,
|
||||
LRU_GEN_EVICTION_BITS, lru_gen_bucket_order);
|
||||
	workingset_refault_track(lruvec, refault_distance);
|
||||
/* Check if the gen the page was evicted from still exist */
|
||||
recent = lru_gen_test_recent(lruvec, type, refault_distance);
|
||||
/* Check if the distance indicates a refault */
|
||||
distance = lru_gen_test_refault(lruvec, type, refault_distance,
|
||||
mem_cgroup_get_nr_swap_pages(memcg));
|
||||
if (!recent && distance == DISTANCE_NONE)
|
||||
goto unlock;
|
||||
|
||||
/* see the comment in folio_lru_refs() */
|
||||
token >>= LRU_GEN_EVICTION_BITS;
|
||||
refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + workingset;
|
||||
tier = lru_tier_from_refs(refs);
|
||||
|
||||
/*
|
||||
* Count the following two cases as stalls:
|
||||
* 1. For pages accessed through page tables, hotter pages pushed out
|
||||
* hot pages which refaulted immediately.
|
||||
* 2. For pages accessed multiple times through file descriptors,
|
||||
* they would have been protected by sort_folio().
|
||||
*/
|
||||
if (lru_gen_in_fault() || refs >= BIT(LRU_REFS_WIDTH) - 1) {
|
||||
if (distance <= DISTANCE_SHORT) {
|
||||
/* Set ref bits and workingset (increase refs by one) */
|
||||
if (!lru_gen_in_fault())
|
||||
folio_set_active(folio);
|
||||
else
|
||||
set_mask_bits(&folio->flags, 0,
|
||||
min_t(unsigned long, refs, BIT(LRU_REFS_WIDTH) - 1)
|
||||
<< LRU_REFS_PGOFF);
|
||||
folio_set_workingset(folio);
|
||||
} else if (recent || distance <= DISTANCE_MID) {
|
||||
/*
|
||||
* Beyond the PID protection range there is no point increasing refs
* for the highest tier, but we can still activate the file page.
|
||||
*/
|
||||
set_mask_bits(&folio->flags, 0, (refs - workingset) << LRU_REFS_PGOFF);
|
||||
folio_set_workingset(folio);
|
||||
} else {
|
||||
set_mask_bits(&folio->flags, 0, 1 << LRU_REFS_PGOFF);
|
||||
}
|
||||
mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta);
|
||||
}
|
||||
|
||||
lrugen = &lruvec->lrugen;
|
||||
hist = lru_hist_of_min_seq(lruvec, type);
|
||||
protect_tier = tier;
|
||||
|
||||
/*
|
||||
* Don't over-protect clean cache pages (!tier pages): if the page wasn't accessed
* for a while (refault distance > LRU / MAX_NR_GENS), keeping it in memory doesn't
* help, so bias towards the higher tiers instead.
|
||||
*/
|
||||
if (distance <= DISTANCE_SHORT && !tier) {
|
||||
/* The folio is referenced one more time in the shadow gen */
|
||||
folio_set_workingset(folio);
|
||||
protect_tier = lru_tier_from_refs(1);
|
||||
mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);
|
||||
}
|
||||
|
||||
if (protect_tier == tier && recent) {
|
||||
atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]);
|
||||
} else {
|
||||
atomic_long_add(delta, &lrugen->avg_total[type][protect_tier]);
|
||||
atomic_long_add(delta, &lrugen->avg_refaulted[type][protect_tier]);
|
||||
}
|
||||
unlock:
|
||||
mem_cgroup_put(memcg);
|
||||
}
|
||||
|
||||
#else /* !CONFIG_LRU_GEN */
|
||||
|
||||
static void *lru_gen_eviction(struct folio *folio)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static bool lru_gen_test_recent(struct lruvec *lruvec, bool file,
|
||||
unsigned long token)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
static void lru_gen_refault(struct folio *folio, void *shadow)
|
||||
{
|
||||
}
|
||||
|
||||
#endif /* CONFIG_LRU_GEN */
|
||||
|
||||
/**
|
||||
* workingset_eviction - note the eviction of a folio from memory
|
||||
* @target_memcg: the cgroup that is causing the reclaim
|
||||
|
@ -396,9 +585,8 @@ void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg)
|
|||
lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
|
||||
/* XXX: target_memcg can be NULL, go through lruvec */
|
||||
memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
|
||||
eviction = atomic_long_read(&lruvec->nonresident_age);
|
||||
eviction >>= bucket_order;
|
||||
workingset_age_nonresident(lruvec, folio_nr_pages(folio));
|
||||
eviction = lru_eviction(lruvec, folio_is_file_lru(folio),
|
||||
folio_nr_pages(folio), EVICTION_BITS, bucket_order);
|
||||
return pack_shadow(memcgid, pgdat, eviction,
|
||||
folio_test_workingset(folio));
|
||||
}
|
||||
|
@ -411,25 +599,22 @@ void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg)
|
|||
* @file: whether the corresponding folio is from the file lru.
|
||||
* @workingset: where the workingset value unpacked from shadow should
|
||||
* be stored.
|
||||
* @tracking: whether do workingset tracking or not
|
||||
*
|
||||
* Return: true if the shadow is for a recently evicted folio; false otherwise.
|
||||
*/
|
||||
bool workingset_test_recent(void *shadow, bool file, bool *workingset)
|
||||
bool workingset_test_recent(void *shadow, bool file, bool *workingset, bool tracking)
|
||||
{
|
||||
struct mem_cgroup *eviction_memcg;
|
||||
struct lruvec *eviction_lruvec;
|
||||
unsigned long refault_distance;
|
||||
unsigned long workingset_size;
|
||||
unsigned long refault;
|
||||
unsigned long inactive;
|
||||
unsigned long active;
|
||||
int memcgid;
|
||||
struct pglist_data *pgdat;
|
||||
unsigned long eviction;
|
||||
|
||||
if (lru_gen_enabled())
|
||||
return lru_gen_test_recent(shadow, file, &eviction_lruvec, &eviction, workingset);
|
||||
|
||||
unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset);
|
||||
eviction <<= bucket_order;
|
||||
|
||||
/*
|
||||
* Look up the memcg associated with the stored ID. It might
|
||||
|
@ -447,30 +632,32 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset)
|
|||
* would be better if the root_mem_cgroup existed in all
|
||||
* configurations instead.
|
||||
*/
|
||||
eviction_memcg = mem_cgroup_from_id(memcgid);
|
||||
if (!mem_cgroup_disabled() && !eviction_memcg)
|
||||
eviction_memcg = try_get_flush_memcg(memcgid);
|
||||
if (!eviction_memcg)
|
||||
return false;
|
||||
|
||||
eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat);
|
||||
refault = atomic_long_read(&eviction_lruvec->nonresident_age);
|
||||
|
||||
/*
|
||||
* Calculate the refault distance
|
||||
*
|
||||
* The unsigned subtraction here gives an accurate distance
|
||||
* across nonresident_age overflows in most cases. There is a
|
||||
* special case: usually, shadow entries have a short lifetime
|
||||
* and are either refaulted or reclaimed along with the inode
|
||||
* before they get too old. But it is not impossible for the
|
||||
* nonresident_age to lap a shadow entry in the field, which
|
||||
* can then result in a false small refault distance, leading
|
||||
* to a false activation should this old entry actually
|
||||
* refault again. However, earlier kernels used to deactivate
|
||||
* unconditionally with *every* reclaim invocation for the
|
||||
* longest time, so the occasional inappropriate activation
|
||||
* leading to pressure on the active list is not a problem.
|
||||
* Flush stats (and potentially sleep) outside the RCU read section.
|
||||
* XXX: With per-memcg flushing and thresholding, is ratelimiting
|
||||
* still needed here?
|
||||
*/
|
||||
refault_distance = (refault - eviction) & EVICTION_MASK;
|
||||
mem_cgroup_flush_stats_ratelimited(eviction_memcg);
|
||||
eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat);
|
||||
|
||||
if (lru_gen_enabled()) {
|
||||
bool recent;
|
||||
refault_distance = lru_distance(eviction_lruvec, file, eviction,
|
||||
LRU_GEN_EVICTION_BITS, lru_gen_bucket_order);
|
||||
recent = lru_gen_test_recent(eviction_lruvec, file, refault_distance);
|
||||
mem_cgroup_put(eviction_memcg);
|
||||
return recent;
|
||||
}
|
||||
|
||||
refault_distance = lru_distance(eviction_lruvec, file,
|
||||
eviction, EVICTION_BITS, bucket_order);
|
||||
|
||||
if (tracking)
|
||||
workingset_refault_track(eviction_lruvec, refault_distance);
|
||||
|
||||
/*
|
||||
* Compare the distance to the existing workingset size. We
|
||||
|
@ -479,21 +666,22 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset)
|
|||
* workingset competition needs to consider anon or not depends
|
||||
* on having free swap space.
|
||||
*/
|
||||
workingset_size = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE);
|
||||
if (!file) {
|
||||
workingset_size += lruvec_page_state(eviction_lruvec,
|
||||
NR_INACTIVE_FILE);
|
||||
}
|
||||
active = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE);
|
||||
inactive = lruvec_page_state(eviction_lruvec, NR_INACTIVE_FILE);
|
||||
|
||||
if (mem_cgroup_get_nr_swap_pages(eviction_memcg) > 0) {
|
||||
workingset_size += lruvec_page_state(eviction_lruvec,
|
||||
NR_ACTIVE_ANON);
|
||||
if (file) {
|
||||
workingset_size += lruvec_page_state(eviction_lruvec,
|
||||
NR_INACTIVE_ANON);
|
||||
}
|
||||
active += lruvec_page_state(eviction_lruvec, NR_ACTIVE_ANON);
|
||||
inactive += lruvec_page_state(eviction_lruvec, NR_INACTIVE_ANON);
|
||||
}
|
||||
|
||||
return refault_distance <= workingset_size;
|
||||
mem_cgroup_put(eviction_memcg);
|
||||
|
||||
/*
|
||||
* When there are already enough active pages, be less aggressive
* about reactivating pages: challenging a large set of established
* active pages with a one-time refaulted page may not be a good idea.
|
||||
*/
|
||||
return refault_distance < min(active, inactive);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -514,24 +702,22 @@ void workingset_refault(struct folio *folio, void *shadow)
	bool workingset;
	long nr;

	if (lru_gen_enabled()) {
		lru_gen_refault(folio, shadow);
		return;
	}

	/* Flush stats (and potentially sleep) before holding RCU read lock */
	mem_cgroup_flush_stats_ratelimited();

	rcu_read_lock();

	/*
	 * The activation decision for this folio is made at the level
	 * where the eviction occurred, as that is where the LRU order
	 * during folio reclaim is being determined.
	 *
	 * However, the cgroup that will own the folio is the one that
	 * is actually experiencing the refault event.
	 * is actually experiencing the refault event. Make sure the folio is
	 * locked to guarantee folio_memcg() stability throughout.
	 */
	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

	if (lru_gen_enabled()) {
		lru_gen_refault(folio, shadow);
		return;
	}

	nr = folio_nr_pages(folio);
	memcg = folio_memcg(folio);
	pgdat = folio_pgdat(folio);
@ -539,11 +725,10 @@ void workingset_refault(struct folio *folio, void *shadow)

	mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr);

	if (!workingset_test_recent(shadow, file, &workingset))
		goto out;
	if (!workingset_test_recent(shadow, file, &workingset, true))
		return;

	folio_set_active(folio);
	workingset_age_nonresident(lruvec, nr);
	mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file, nr);

	/* Folio was active prior to eviction */
@ -556,32 +741,6 @@ void workingset_refault(struct folio *folio, void *shadow)
		lru_note_cost_refault(folio);
		mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr);
	}
out:
	rcu_read_unlock();
}

/**
 * workingset_activation - note a page activation
 * @folio: Folio that is being activated.
 */
void workingset_activation(struct folio *folio)
{
	struct mem_cgroup *memcg;

	rcu_read_lock();
	/*
	 * Filter non-memcg pages here, e.g. unmap can call
	 * mark_page_accessed() on VDSO pages.
	 *
	 * XXX: See workingset_refault() - this should return
	 * root_mem_cgroup even for !CONFIG_MEMCG.
	 */
	memcg = folio_memcg_rcu(folio);
	if (!mem_cgroup_disabled() && !memcg)
		goto out;
	workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio));
out:
	rcu_read_unlock();
}

/*
@ -664,7 +823,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
	struct lruvec *lruvec;
	int i;

	mem_cgroup_flush_stats();
	mem_cgroup_flush_stats(sc->memcg);
	lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
	for (pages = 0, i = 0; i < NR_LRU_LISTS; i++)
		pages += lruvec_page_state_local(lruvec,
@ -778,7 +937,6 @@ static struct lock_class_key shadow_nodes_key;

static int __init workingset_init(void)
{
	unsigned int timestamp_bits;
	unsigned int max_order;
	int ret;
@ -790,12 +948,17 @@ static int __init workingset_init(void)
	 * some more pages at runtime, so keep working with up to
	 * double the initial memory by using totalram_pages as-is.
	 */
	timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
	max_order = fls_long(totalram_pages() - 1);
	if (max_order > timestamp_bits)
		bucket_order = max_order - timestamp_bits;
	if (max_order > EVICTION_BITS)
		bucket_order = max_order - EVICTION_BITS;
	pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
		timestamp_bits, max_order, bucket_order);
		EVICTION_BITS, max_order, bucket_order);
#ifdef CONFIG_LRU_GEN
	if (max_order > LRU_GEN_EVICTION_BITS)
		lru_gen_bucket_order = max_order - LRU_GEN_EVICTION_BITS;
	pr_info("workingset: lru_gen_timestamp_bits=%d lru_gen_bucket_order=%u\n",
		LRU_GEN_EVICTION_BITS, lru_gen_bucket_order);
#endif

	ret = prealloc_shrinker(&workingset_shadow_shrinker, "mm-shadow");
	if (ret)
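The bucket_order logic above only kicks in when the page-frame order needed to address all of RAM exceeds the timestamp bits left in a shadow entry; evictions are then counted in power-of-two buckets so the age still fits. A standalone sketch of that calculation with made-up widths (the real EVICTION_BITS and LRU_GEN_EVICTION_BITS are derived from BITS_PER_LONG minus the bits reserved for node, memcg and flags):

#include <stdio.h>

/* fls_long() stand-in: index of the highest set bit, 1-based, 0 for 0. */
static unsigned int demo_fls_long(unsigned long x)
{
        unsigned int r = 0;

        while (x) {
                x >>= 1;
                r++;
        }
        return r;
}

int main(void)
{
        unsigned long totalram_pages = 1UL << 28;  /* 1 TiB of 4 KiB pages */
        unsigned int eviction_bits = 22;           /* made-up timestamp width */
        unsigned int bucket_order = 0;
        unsigned int max_order = demo_fls_long(totalram_pages - 1);

        if (max_order > eviction_bits)
                bucket_order = max_order - eviction_bits;

        /* max_order=28 bucket_order=6: ages are stored in 2^6-eviction buckets. */
        printf("max_order=%u bucket_order=%u\n", max_order, bucket_order);
        return 0;
}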
@ -1142,9 +1142,11 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
	SetPageReclaim(page);

	/* start writeback */
	__swap_writepage(page, &wbc);
	ret = __swap_writepage(page, &wbc);
	put_page(page);
	zswap_written_back_pages++;

	if (!ret)
		zswap_written_back_pages++;

	return ret;
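The zswap hunk above propagates the result of __swap_writepage() and only counts a page as written back when that call succeeded. A tiny model of the pattern; the helper names are made up and stand in for the real zswap/swap functions:

#include <stdio.h>

static unsigned long demo_written_back_pages;

/* Stand-in for __swap_writepage(): 0 on success, negative errno-style on failure. */
static int demo_writepage(int should_fail)
{
        return should_fail ? -5 : 0;
}

/* Patched flow: return the error to the caller, count only successes. */
static int demo_writeback_entry(int should_fail)
{
        int ret = demo_writepage(should_fail);

        if (!ret)
                demo_written_back_pages++;

        return ret;
}

int main(void)
{
        demo_writeback_entry(0);
        demo_writeback_entry(1);
        printf("written_back_pages=%lu\n", demo_written_back_pages);  /* 1 */
        return 0;
}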
@ -20,6 +20,9 @@
 *
 */

static int ali_cip;
module_param(ali_cip, int, 0600);
MODULE_PARM_DESC(ali_cip, "Enable ali cip option: 0xfe. Value could be 0 or 1, defaults to 0.");

/*
 * Statistics of toa in proc /proc/net/toa_stats
@ -78,7 +81,8 @@ static void *get_toa_data(struct sk_buff *skb)
				return NULL;
			if (opsize > length)
				return NULL;	/* don't parse partial options */
			if (TCPOPT_TOA == opcode && TCPOLEN_TOA == opsize) {
			if ((TCPOPT_TOA == opcode && TCPOLEN_TOA == opsize) ||
			    (ali_cip == 1 && TCPOPT_TOA_ALI_CIP == opcode && TCPOLEN_TOA_ALI_CIP == opsize)) {
				memcpy(&tdata, ptr - 2, sizeof(tdata));
				//TOA_DBG("find toa data: ip = %u.%u.%u.%u, port = %u\n", NIPQUAD(tdata.ip),
				//ntohs(tdata.port));
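To make the matching above concrete, here is a hedged userspace sketch of walking a TCP option block and accepting either the classic TOA option (opcode 200) or, when the module parameter allows it, the 0xfe variant. The buffer, struct layout and helpers are illustrative; the real module reads the options out of the skb's TCP header, and the field order inside the 8-byte option is an assumption here:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define TCPOPT_TOA              200
#define TCPOLEN_TOA             8
#define TCPOPT_TOA_ALI_CIP      0xfe
#define TCPOLEN_TOA_ALI_CIP     8

static int ali_cip = 1;                 /* the real value comes from module_param() */

struct demo_toa {                       /* assumed layout: |opcode|size|port|ip| */
        uint8_t  opcode;
        uint8_t  opsize;
        uint16_t port;                  /* network byte order */
        uint32_t ip;                    /* network byte order */
} __attribute__((packed));

/* Walk a TCP option block and copy out the first TOA-style option found. */
static int demo_find_toa(const uint8_t *opt, int length, struct demo_toa *out)
{
        while (length > 0) {
                uint8_t opcode = opt[0];
                uint8_t opsize;

                if (opcode == 0)        /* TCPOPT_EOL */
                        return 0;
                if (opcode == 1) {      /* TCPOPT_NOP */
                        opt++;
                        length--;
                        continue;
                }
                if (length < 2 || (opsize = opt[1]) < 2 || opsize > length)
                        return 0;       /* don't parse partial options */

                if ((opcode == TCPOPT_TOA && opsize == TCPOLEN_TOA) ||
                    (ali_cip == 1 && opcode == TCPOPT_TOA_ALI_CIP &&
                     opsize == TCPOLEN_TOA_ALI_CIP)) {
                        memcpy(out, opt, sizeof(*out));
                        return 1;
                }
                opt += opsize;
                length -= opsize;
        }
        return 0;
}

int main(void)
{
        /* Two NOPs, then a TOA option carrying port 0x1234 and ip 1.2.3.4. */
        const uint8_t opts[] = { 1, 1, TCPOPT_TOA, 8, 0x12, 0x34, 1, 2, 3, 4 };
        struct demo_toa toa;

        if (demo_find_toa(opts, sizeof(opts), &toa))
                printf("opcode=%u opsize=%u\n", toa.opcode, toa.opsize);
        return 0;
}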
@ -118,7 +122,8 @@ inet_getname_toa(struct socket *sock, struct sockaddr *uaddr, int peer)
	if (retval == 0 && NULL != sk->sk_user_data && peer) {
		if (sock_def_readable == sk->sk_data_ready) {
			memcpy(&tdata, &sk->sk_user_data, sizeof(tdata));
			if (TCPOPT_TOA == tdata.opcode && TCPOLEN_TOA == tdata.opsize) {
			if ((TCPOPT_TOA == tdata.opcode && TCPOLEN_TOA == tdata.opsize) ||
			    (ali_cip == 1 && TCPOPT_TOA_ALI_CIP == tdata.opcode && TCPOLEN_TOA_ALI_CIP == tdata.opsize)) {
				TOA_INC_STATS(ext_stats, GETNAME_TOA_OK_CNT);
				//TOA_DBG("inet_getname_toa: set new sockaddr, ip %u.%u.%u.%u -> %u.%u.%u.%u, port %u -> %u\n",
				//	NIPQUAD(sin->sin_addr.s_addr), NIPQUAD(tdata.ip), ntohs(sin->sin_port),
@ -158,7 +163,8 @@ inet6_getname_toa(struct socket *sock, struct sockaddr *uaddr, int peer)
	if (retval == 0 && NULL != sk->sk_user_data && peer) {
		if (sock_def_readable == sk->sk_data_ready) {
			memcpy(&tdata, &sk->sk_user_data, sizeof(tdata));
			if (TCPOPT_TOA == tdata.opcode && TCPOLEN_TOA == tdata.opsize) {
			if ((TCPOPT_TOA == tdata.opcode && TCPOLEN_TOA == tdata.opsize) ||
			    (ali_cip == 1 && TCPOPT_TOA_ALI_CIP == tdata.opcode && TCPOLEN_TOA_ALI_CIP == tdata.opsize)) {
				TOA_INC_STATS(ext_stats, GETNAME_TOA_OK_CNT);
				sin->sin6_port = tdata.port;
				ipv6_addr_set(&sin->sin6_addr, 0, 0, htonl(0x0000FFFF), tdata.ip);
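The inet6 hook above rewrites the peer address as an IPv4-mapped IPv6 address (::ffff:a.b.c.d) built from the IPv4 address carried in the option. A small userspace illustration of that mapping, using the plain sockets API rather than the kernel's ipv6_addr_set():

#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
        struct in_addr v4;
        struct in6_addr v6;
        unsigned char *p = v6.s6_addr;
        char buf[INET6_ADDRSTRLEN];

        inet_pton(AF_INET, "192.0.2.10", &v4);

        /* ::ffff:192.0.2.10 -- bytes 0..9 zero, 10..11 = 0xff, 12..15 = IPv4. */
        memset(&v6, 0, sizeof(v6));
        p[10] = 0xff;
        p[11] = 0xff;
        memcpy(&p[12], &v4.s_addr, sizeof(v4.s_addr));

        inet_ntop(AF_INET6, &v6, buf, sizeof(buf));
        printf("%s\n", buf);    /* ::ffff:192.0.2.10 */
        return 0;
}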
@ -34,9 +34,13 @@
} while (0)

#define TCPOPT_TOA		200

/* MUST be 4n !!!! */
#define TCPOLEN_TOA		8	/* |opcode|size|ip+port| = 1 + 1 + 6 */
/* |opcode|size|ip+port| = 1 + 1 + 6 */
#define TCPOLEN_TOA		8

#define TCPOPT_TOA_ALI_CIP	0xfe
/* |opcode|size|sport|sip| = 1 + 1 + 2 + 4 */
#define TCPOLEN_TOA_ALI_CIP	8

/* MUST be 4 bytes alignment */
struct toa_data {
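Both option formats end up being exactly 8 bytes, which matters because the hooks above copy the parsed data straight out of the pointer-sized sk->sk_user_data field; this is also why the header insists on 4-byte alignment. A hedged compile-time check of the assumed layouts (struct names and field order are illustrative):

#include <assert.h>
#include <stdint.h>

/* Classic TOA: |opcode|size|ip+port| = 1 + 1 + 6 = 8 bytes. */
struct demo_toa_data {
        uint8_t  opcode;
        uint8_t  opsize;
        uint16_t port;
        uint32_t ip;
} __attribute__((packed));

/* ali_cip variant: |opcode|size|sport|sip| = 1 + 1 + 2 + 4 = 8 bytes. */
struct demo_toa_ali_cip {
        uint8_t  opcode;
        uint8_t  opsize;
        uint16_t sport;
        uint32_t sip;
} __attribute__((packed));

/* Both must fit in a 64-bit pointer so they can be parked in sk_user_data. */
static_assert(sizeof(struct demo_toa_data) == 8, "TCPOLEN_TOA must be 8");
static_assert(sizeof(struct demo_toa_ali_cip) == 8, "TCPOLEN_TOA_ALI_CIP must be 8");

int main(void)
{
        return 0;
}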