Merge branch 'kasong/tk5/0001/emm' into 'master' (merge request !45)

EKS and EMM Support
The first 4 and the last 9 commits are for EKS; the rest are for the EMM modular interface and core updates.

Stress tested with MySQL + UMRD, except for the last 9 commits.
Commit 4e5f174036 by frankjpliu, 2024-04-03 12:08:57 +00:00
44 changed files with 3543 additions and 450 deletions


@ -829,6 +829,14 @@ config KVM_GUEST
underlying device model, the host provides the guest with
timing infrastructure such as time of day, and system time
config KVM_FORCE_PVCLOCK
bool "Force using pvclock"
depends on KVM_GUEST
default n
help
Use pvclock even if the host tells us not to. Don't select this unless
you know what you are doing.
config ARCH_CPUIDLE_HALTPOLL
def_bool n
prompt "Disable host haltpoll when loading haltpoll driver"


@ -23,6 +23,13 @@ CONFIG_LOG_BUF_SHIFT=19
CONFIG_NUMA_BALANCING=y
# CONFIG_NUMA_BALANCING_DEFAULT_ENABLED is not set
CONFIG_MEMCG=y
CONFIG_ENHANCED_MM=y
CONFIG_EMM_FORCE_SWAPPINESS=y
CONFIG_EMM_RAMDISK_SWAP=y
CONFIG_EMM_WORKINGSET_TRACKING=y
CONFIG_EMM_MEMCG=y
CONFIG_EMM_RECLAIM=y
CONFIG_EMM_ZRAM_CONF=y
CONFIG_BLK_CGROUP=y
CONFIG_CFS_BANDWIDTH=y
CONFIG_RT_GROUP_SCHED=y
@ -57,7 +64,7 @@ CONFIG_PVH=y
CONFIG_PARAVIRT_TIME_ACCOUNTING=y
CONFIG_JAILHOUSE_GUEST=y
CONFIG_GART_IOMMU=y
CONFIG_MAXSMP=y
CONFIG_NR_CPUS=8192
CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y
CONFIG_X86_MCELOG_LEGACY=y
CONFIG_X86_MCE_INJECT=m
@ -66,6 +73,7 @@ CONFIG_X86_MSR=y
CONFIG_X86_CPUID=y
CONFIG_AMD_MEM_ENCRYPT=y
CONFIG_NUMA=y
CONFIG_NODES_SHIFT=8
CONFIG_ARCH_MEMORY_PROBE=y
CONFIG_X86_CHECK_BIOS_CORRUPTION=y
# CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK is not set
@ -1347,6 +1355,7 @@ CONFIG_CRYPTO_DEV_VIRTIO=m
CONFIG_CORDIC=m
CONFIG_CRC7=m
CONFIG_LIBCRC32C=y
CONFIG_CPUMASK_OFFSTACK=y
CONFIG_PRINTK_TIME=y
CONFIG_BOOT_PRINTK_DELAY=y
CONFIG_DYNAMIC_DEBUG=y
@ -1358,6 +1367,7 @@ CONFIG_STACK_VALIDATION=y
CONFIG_MAGIC_SYSRQ=y
CONFIG_SCHED_STACK_END_CHECK=y
CONFIG_DEBUG_MEMORY_INIT=y
CONFIG_DEBUG_PER_CPU_MAPS=y
CONFIG_DEBUG_SHIRQ=y
CONFIG_PANIC_ON_OOPS=y
CONFIG_HARDLOCKUP_DETECTOR=y


@ -225,7 +225,8 @@ static u64 vread_pvclock(void)
do {
version = pvclock_read_begin(pvti);
if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT)))
if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT)) &&
!IS_ENABLED(CONFIG_KVM_FORCE_PVCLOCK))
return U64_MAX;
ret = __pvclock_read_cycles(pvti, rdtsc_ordered());


@ -250,8 +250,13 @@ static int __init kvm_setup_vsyscall_timeinfo(void)
u8 flags;
flags = pvclock_read_flags(&hv_clock_boot[0].pvti);
if (!(flags & PVCLOCK_TSC_STABLE_BIT))
if (!(flags & PVCLOCK_TSC_STABLE_BIT)) {
if (IS_ENABLED(CONFIG_KVM_FORCE_PVCLOCK)) {
pr_info("Forcing vclock_mode = VCLOCK_PVCLOCK\n");
kvm_clock.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK;
}
return 0;
}
kvm_clock.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK;
}


@ -287,6 +287,83 @@ out:
}
EXPORT_SYMBOL(thaw_bdev);
/**
* bdev_swapin_folio() - Start reading a folio from a block device
* @bdev: The device to read the folio from
* @sector: The offset on the device to read the folio to (need not be aligned)
* @folio: The folio to read
*
* On entry, the folio should be locked. It will be unlocked when the folio
* has been read. If the block driver implements swap_folio synchronously,
* that will be true on exit from this function, but it need not be.
*
* Errors returned by this function are usually "soft", eg out of memory, or
* queue full; callers should try a different route to read this folio rather
* than propagate an error back up the stack.
*
* Return: negative errno if an error occurs, 0 if submission was successful.
*/
int bdev_swapin_folio(struct block_device *bdev, sector_t sector,
struct folio *folio)
{
const struct block_device_operations *ops = bdev->bd_disk->fops;
int result;
if (!ops->swap_folio || bdev_get_integrity(bdev))
return -EOPNOTSUPP;
result = blk_queue_enter(bdev_get_queue(bdev), 0);
if (result)
return -EOPNOTSUPP;
result = ops->swap_folio(bdev, sector + get_start_sect(bdev), folio,
REQ_OP_READ);
blk_queue_exit(bdev_get_queue(bdev));
return result;
}
/**
* bdev_swapout_folio() - Start writing a folio to a block device
* @bdev: The device to write the folio to
* @sector: The offset on the device to write the folio to (need not be aligned)
* @folio: The folio to write
* @wbc: The writeback_control for the write
*
* On entry, the folio should be locked and not currently under writeback.
* On exit, if the write started successfully, the folio will be unlocked and
* under writeback. If the write failed already (eg the driver failed to
* queue the folio to the device), the folio will still be locked. If the
* caller is a ->writefolio implementation, it will need to unlock the folio.
*
* Errors returned by this function are usually "soft", eg out of memory, or
* queue full; callers should try a different route to write this folio rather
* than propagate an error back up the stack.
*
* Return: negative errno if an error occurs, 0 if submission was successful.
*/
int bdev_swapout_folio(struct block_device *bdev, sector_t sector,
struct folio *folio, struct writeback_control *wbc)
{
int result;
const struct block_device_operations *ops = bdev->bd_disk->fops;
if (!ops->swap_folio || bdev_get_integrity(bdev))
return -EOPNOTSUPP;
result = blk_queue_enter(bdev_get_queue(bdev), 0);
if (result)
return -EOPNOTSUPP;
folio_start_writeback(folio);
result = ops->swap_folio(bdev, sector + get_start_sect(bdev), folio,
REQ_OP_WRITE);
if (result) {
folio_end_writeback(folio);
} else {
folio_unlock(folio);
}
blk_queue_exit(bdev_get_queue(bdev));
return result;
}
/*
* pseudo-fs
*/
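As a reading aid, here is a hedged sketch (not part of this merge request) of how a swap read path could consume bdev_swapin_folio(): try the driver's synchronous ->swap_folio hook first, and fall back to the normal bio path on any error, since the kernel-doc above describes those errors as soft. All demo_* names are hypothetical.

#include <linux/blkdev.h>
#include <linux/swap.h>

/* Hypothetical stand-in for the existing bio-based swap read path. */
static void demo_swap_read_bio(struct block_device *bdev, sector_t sector,
                               struct folio *folio)
{
        /* ...allocate and submit a REQ_OP_READ bio for the folio... */
}

static void demo_swap_read(struct block_device *bdev, sector_t sector,
                           struct folio *folio)
{
        /*
         * Let the driver handle the folio synchronously if it implements
         * ->swap_folio; any error is "soft", so fall back to a normal bio.
         */
        if (bdev_swapin_folio(bdev, sector, folio) == 0)
                return;

        demo_swap_read_bio(bdev, sector, folio);
}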

dist/Makefile vendored (4 lines changed)

@ -103,6 +103,10 @@ endif
# All params enabled by default (except kABI check, see below), ENABLED overrides DEFAULT_DISABLE.
DISABLED=$(DEFAULT_DISABLED)
ENABLED=$(DEFAULT_ENABLED)
# Automatically disable non-core packages for non-standard builds
ifneq ($(CONFIG),generic-release)
DISABLED=ofed bpftool perf tools
endif
## A few shortcut for commonly used params:
# Disable KABI check by default

dist/configs/00base/eks/default.config vendored (new file, 985 lines)

@ -0,0 +1,985 @@
# CONFIG_LOCALVERSION_AUTO is not set
CONFIG_KERNEL_LZ4=y
CONFIG_DEFAULT_HOSTNAME="eks-tkex"
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
CONFIG_NO_HZ=y
CONFIG_HIGH_RES_TIMERS=y
CONFIG_BPF_SYSCALL=y
CONFIG_BPF_JIT=y
CONFIG_BPF_JIT_ALWAYS_ON=y
CONFIG_PREEMPT_VOLUNTARY=y
CONFIG_IRQ_TIME_ACCOUNTING=y
CONFIG_BSD_PROCESS_ACCT=y
CONFIG_BSD_PROCESS_ACCT_V3=y
CONFIG_PSI=y
CONFIG_IKCONFIG=y
CONFIG_IKCONFIG_PROC=y
CONFIG_IKHEADERS=y
CONFIG_LOG_BUF_SHIFT=19
CONFIG_NUMA_BALANCING=y
# CONFIG_NUMA_BALANCING_DEFAULT_ENABLED is not set
CONFIG_MEMCG=y
CONFIG_BLK_CGROUP=y
CONFIG_CFS_BANDWIDTH=y
CONFIG_RT_GROUP_SCHED=y
CONFIG_CGROUP_PIDS=y
CONFIG_CGROUP_FREEZER=y
CONFIG_CGROUP_HUGETLB=y
CONFIG_CPUSETS=y
CONFIG_CGROUP_DEVICE=y
CONFIG_CGROUP_CPUACCT=y
CONFIG_CGROUP_PERF=y
CONFIG_CGROUP_BPF=y
CONFIG_CGROUP_MISC=y
CONFIG_NAMESPACES=y
CONFIG_USER_NS=y
CONFIG_CHECKPOINT_RESTORE=y
CONFIG_SCHED_AUTOGROUP=y
CONFIG_BLK_DEV_INITRD=y
CONFIG_EXPERT=y
CONFIG_PROFILING=y
CONFIG_KEXEC=y
CONFIG_KEXEC_FILE=y
CONFIG_CRASH_DUMP=y
CONFIG_SMP=y
CONFIG_X86_X2APIC=y
CONFIG_X86_CPU_RESCTRL=y
CONFIG_X86_AMD_PLATFORM_DEVICE=y
CONFIG_HYPERVISOR_GUEST=y
CONFIG_PARAVIRT=y
CONFIG_PARAVIRT_SPINLOCKS=y
CONFIG_KVM_FORCE_PVCLOCK=y
CONFIG_PARAVIRT_TIME_ACCOUNTING=y
CONFIG_MAXSMP=y
CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y
CONFIG_X86_MCELOG_LEGACY=y
CONFIG_X86_MCE_INJECT=m
CONFIG_PERF_EVENTS_AMD_POWER=y
CONFIG_X86_MSR=y
CONFIG_X86_CPUID=y
CONFIG_NUMA=y
# CONFIG_MTRR is not set
CONFIG_EFI=y
CONFIG_EFI_STUB=y
# CONFIG_RANDOMIZE_BASE is not set
CONFIG_PHYSICAL_ALIGN=0x1000000
CONFIG_COMPAT_VDSO=y
CONFIG_LIVEPATCH=y
# CONFIG_RETPOLINE is not set
# CONFIG_ACPI_SPCR_TABLE is not set
# CONFIG_ACPI_REV_OVERRIDE_POSSIBLE is not set
CONFIG_ACPI_EC_DEBUGFS=m
# CONFIG_ACPI_AC is not set
# CONFIG_ACPI_BATTERY is not set
CONFIG_ACPI_BUTTON=m
# CONFIG_ACPI_FAN is not set
CONFIG_ACPI_IPMI=m
CONFIG_ACPI_PROCESSOR_AGGREGATOR=m
# CONFIG_ACPI_THERMAL is not set
# CONFIG_ACPI_TABLE_UPGRADE is not set
CONFIG_ACPI_PCI_SLOT=y
CONFIG_ACPI_SBS=m
CONFIG_ACPI_CUSTOM_METHOD=m
CONFIG_ACPI_APEI=y
CONFIG_ACPI_APEI_GHES=y
CONFIG_ACPI_APEI_MEMORY_FAILURE=y
CONFIG_ACPI_APEI_EINJ=m
CONFIG_ACPI_APEI_ERST_DEBUG=m
CONFIG_CPU_FREQ_GOV_POWERSAVE=y
CONFIG_CPU_FREQ_GOV_USERSPACE=y
CONFIG_CPU_FREQ_GOV_ONDEMAND=y
CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y
CONFIG_X86_PCC_CPUFREQ=m
CONFIG_X86_ACPI_CPUFREQ=m
# CONFIG_X86_ACPI_CPUFREQ_CPB is not set
CONFIG_X86_POWERNOW_K8=m
CONFIG_X86_AMD_FREQ_SENSITIVITY=m
CONFIG_CPU_IDLE_GOV_LADDER=y
CONFIG_INTEL_IDLE=y
CONFIG_IA32_EMULATION=y
CONFIG_KVM=m
CONFIG_KVM_INTEL=m
CONFIG_KVM_AMD=m
# CONFIG_KVM_AMD_SEV is not set
CONFIG_KPROBES=y
CONFIG_JUMP_LABEL=y
CONFIG_MODULES=y
CONFIG_MODULE_UNLOAD=y
CONFIG_MODVERSIONS=y
CONFIG_BLK_DEV_THROTTLING=y
CONFIG_BLK_WBT=y
CONFIG_BLK_CGROUP_IOLATENCY=y
CONFIG_BLK_CGROUP_IOCOST=y
CONFIG_PARTITION_ADVANCED=y
CONFIG_LDM_PARTITION=y
CONFIG_IOSCHED_BFQ=y
CONFIG_BINFMT_MISC=m
CONFIG_ZSMALLOC_STAT=y
# CONFIG_SLAB_MERGE_DEFAULT is not set
# CONFIG_COMPAT_BRK is not set
CONFIG_KSM=y
CONFIG_MEMORY_FAILURE=y
CONFIG_HWPOISON_INJECT=m
CONFIG_TRANSPARENT_HUGEPAGE=y
CONFIG_TRANSPARENT_HUGEPAGE_MADVISE=y
CONFIG_READ_ONLY_THP_FOR_FS=y
CONFIG_USERFAULTFD=y
CONFIG_LRU_GEN=y
CONFIG_DAMON=y
CONFIG_DAMON_PADDR=y
CONFIG_ENHANCED_MM=y
CONFIG_EMM_FORCE_SWAPPINESS=y
CONFIG_EMM_RAMDISK_SWAP=y
CONFIG_EMM_WORKINGSET_TRACKING=y
CONFIG_EMM_ZRAM_CONF=y
CONFIG_TEXT_UNEVICTABLE=y
CONFIG_NET=y
CONFIG_PACKET=y
CONFIG_PACKET_DIAG=m
CONFIG_UNIX=y
CONFIG_UNIX_DIAG=m
CONFIG_TLS=m
CONFIG_TLS_DEVICE=y
CONFIG_XFRM_USER=y
CONFIG_XFRM_SUB_POLICY=y
CONFIG_XFRM_STATISTICS=y
CONFIG_NET_KEY=m
CONFIG_NET_KEY_MIGRATE=y
CONFIG_XDP_SOCKETS=y
CONFIG_XDP_SOCKETS_DIAG=m
CONFIG_INET=y
CONFIG_IP_MULTICAST=y
CONFIG_IP_ADVANCED_ROUTER=y
CONFIG_IP_FIB_TRIE_STATS=y
CONFIG_IP_MULTIPLE_TABLES=y
CONFIG_IP_ROUTE_MULTIPATH=y
CONFIG_IP_ROUTE_VERBOSE=y
CONFIG_NET_IPIP=m
CONFIG_NET_IPGRE_DEMUX=m
CONFIG_NET_IPGRE=m
CONFIG_NET_IPGRE_BROADCAST=y
CONFIG_IP_MROUTE=y
CONFIG_IP_MROUTE_MULTIPLE_TABLES=y
CONFIG_IP_PIMSM_V1=y
CONFIG_IP_PIMSM_V2=y
CONFIG_NET_IPVTI=m
CONFIG_INET_AH=m
CONFIG_INET_ESP=m
CONFIG_INET_ESP_OFFLOAD=m
CONFIG_INET_IPCOMP=m
CONFIG_INET_DIAG=m
CONFIG_INET_UDP_DIAG=m
CONFIG_INET_RAW_DIAG=m
CONFIG_TCP_CONG_ADVANCED=y
CONFIG_TCP_CONG_HSTCP=m
CONFIG_TCP_CONG_HYBLA=m
CONFIG_TCP_CONG_NV=m
CONFIG_TCP_CONG_SCALABLE=m
CONFIG_TCP_CONG_LP=m
CONFIG_TCP_CONG_VENO=m
CONFIG_TCP_CONG_YEAH=m
CONFIG_TCP_CONG_ILLINOIS=m
CONFIG_TCP_CONG_DCTCP=m
CONFIG_TCP_CONG_CDG=m
CONFIG_TCP_CONG_BBR=m
CONFIG_TCP_MD5SIG=y
CONFIG_IPV6_ROUTER_PREF=y
CONFIG_IPV6_ROUTE_INFO=y
CONFIG_IPV6_OPTIMISTIC_DAD=y
CONFIG_INET6_AH=m
CONFIG_INET6_ESP=m
CONFIG_INET6_ESP_OFFLOAD=m
CONFIG_INET6_IPCOMP=m
CONFIG_IPV6_MIP6=m
CONFIG_IPV6_SIT=m
CONFIG_IPV6_SIT_6RD=y
CONFIG_IPV6_GRE=m
CONFIG_IPV6_MULTIPLE_TABLES=y
CONFIG_IPV6_MROUTE=y
CONFIG_IPV6_MROUTE_MULTIPLE_TABLES=y
CONFIG_IPV6_PIMSM_V2=y
CONFIG_NETWORK_SECMARK=y
CONFIG_NETWORK_PHY_TIMESTAMPING=y
CONFIG_NETFILTER=y
CONFIG_BRIDGE_NETFILTER=y
CONFIG_NF_CONNTRACK=m
CONFIG_NF_CONNTRACK_SECMARK=y
CONFIG_NF_CONNTRACK_ZONES=y
CONFIG_NF_CONNTRACK_PROCFS=y
CONFIG_NF_CONNTRACK_EVENTS=y
CONFIG_NF_CONNTRACK_TIMEOUT=y
CONFIG_NF_CONNTRACK_TIMESTAMP=y
CONFIG_NF_CONNTRACK_AMANDA=m
CONFIG_NF_CONNTRACK_FTP=m
CONFIG_NF_CONNTRACK_H323=m
CONFIG_NF_CONNTRACK_IRC=m
CONFIG_NF_CONNTRACK_NETBIOS_NS=m
CONFIG_NF_CONNTRACK_SNMP=m
CONFIG_NF_CONNTRACK_PPTP=m
CONFIG_NF_CONNTRACK_SANE=m
CONFIG_NF_CONNTRACK_SIP=m
CONFIG_NF_CONNTRACK_TFTP=m
CONFIG_NF_CT_NETLINK=m
CONFIG_NF_CT_NETLINK_TIMEOUT=m
CONFIG_NF_TABLES=m
CONFIG_NF_TABLES_INET=y
CONFIG_NF_TABLES_NETDEV=y
CONFIG_NFT_NUMGEN=m
CONFIG_NFT_CT=m
CONFIG_NFT_FLOW_OFFLOAD=m
CONFIG_NFT_CONNLIMIT=m
CONFIG_NFT_LOG=m
CONFIG_NFT_LIMIT=m
CONFIG_NFT_MASQ=m
CONFIG_NFT_REDIR=m
CONFIG_NFT_NAT=m
CONFIG_NFT_TUNNEL=m
CONFIG_NFT_QUEUE=m
CONFIG_NFT_QUOTA=m
CONFIG_NFT_REJECT=m
CONFIG_NFT_COMPAT=m
CONFIG_NFT_HASH=m
CONFIG_NFT_FIB_INET=m
CONFIG_NFT_XFRM=m
CONFIG_NFT_SOCKET=m
CONFIG_NFT_OSF=m
CONFIG_NFT_TPROXY=m
CONFIG_NFT_SYNPROXY=m
CONFIG_NFT_DUP_NETDEV=m
CONFIG_NFT_FWD_NETDEV=m
CONFIG_NFT_FIB_NETDEV=m
CONFIG_NF_FLOW_TABLE_INET=m
CONFIG_NF_FLOW_TABLE=m
CONFIG_NETFILTER_XTABLES=y
CONFIG_NETFILTER_XT_SET=m
CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m
CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m
CONFIG_NETFILTER_XT_TARGET_CONNMARK=m
CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=m
CONFIG_NETFILTER_XT_TARGET_DSCP=m
CONFIG_NETFILTER_XT_TARGET_HMARK=m
CONFIG_NETFILTER_XT_TARGET_IDLETIMER=m
CONFIG_NETFILTER_XT_TARGET_LOG=m
CONFIG_NETFILTER_XT_TARGET_MARK=m
CONFIG_NETFILTER_XT_TARGET_NETMAP=m
CONFIG_NETFILTER_XT_TARGET_NFLOG=m
CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m
CONFIG_NETFILTER_XT_TARGET_NOTRACK=m
CONFIG_NETFILTER_XT_TARGET_TEE=m
CONFIG_NETFILTER_XT_TARGET_TPROXY=m
CONFIG_NETFILTER_XT_TARGET_TRACE=m
CONFIG_NETFILTER_XT_TARGET_SECMARK=m
CONFIG_NETFILTER_XT_TARGET_TCPMSS=m
CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m
CONFIG_NETFILTER_XT_MATCH_ADDRTYPE=m
CONFIG_NETFILTER_XT_MATCH_BPF=m
CONFIG_NETFILTER_XT_MATCH_CGROUP=m
CONFIG_NETFILTER_XT_MATCH_CLUSTER=m
CONFIG_NETFILTER_XT_MATCH_COMMENT=m
CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m
CONFIG_NETFILTER_XT_MATCH_CONNLABEL=m
CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m
CONFIG_NETFILTER_XT_MATCH_CONNMARK=m
CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m
CONFIG_NETFILTER_XT_MATCH_CPU=m
CONFIG_NETFILTER_XT_MATCH_DEVGROUP=m
CONFIG_NETFILTER_XT_MATCH_DSCP=m
CONFIG_NETFILTER_XT_MATCH_ESP=m
CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m
CONFIG_NETFILTER_XT_MATCH_HELPER=m
CONFIG_NETFILTER_XT_MATCH_IPRANGE=m
CONFIG_NETFILTER_XT_MATCH_IPVS=m
CONFIG_NETFILTER_XT_MATCH_LENGTH=m
CONFIG_NETFILTER_XT_MATCH_LIMIT=m
CONFIG_NETFILTER_XT_MATCH_MAC=m
CONFIG_NETFILTER_XT_MATCH_MARK=m
CONFIG_NETFILTER_XT_MATCH_MULTIPORT=m
CONFIG_NETFILTER_XT_MATCH_NFACCT=m
CONFIG_NETFILTER_XT_MATCH_OSF=m
CONFIG_NETFILTER_XT_MATCH_OWNER=m
CONFIG_NETFILTER_XT_MATCH_POLICY=m
CONFIG_NETFILTER_XT_MATCH_PHYSDEV=m
CONFIG_NETFILTER_XT_MATCH_PKTTYPE=m
CONFIG_NETFILTER_XT_MATCH_QUOTA=m
CONFIG_NETFILTER_XT_MATCH_RATEEST=m
CONFIG_NETFILTER_XT_MATCH_REALM=m
CONFIG_NETFILTER_XT_MATCH_RECENT=m
CONFIG_NETFILTER_XT_MATCH_SOCKET=m
CONFIG_NETFILTER_XT_MATCH_STATE=m
CONFIG_NETFILTER_XT_MATCH_STATISTIC=m
CONFIG_NETFILTER_XT_MATCH_STRING=m
CONFIG_NETFILTER_XT_MATCH_TCPMSS=m
CONFIG_NETFILTER_XT_MATCH_TIME=m
CONFIG_NETFILTER_XT_MATCH_U32=m
CONFIG_IP_SET=m
CONFIG_IP_SET_BITMAP_IP=m
CONFIG_IP_SET_BITMAP_IPMAC=m
CONFIG_IP_SET_BITMAP_PORT=m
CONFIG_IP_SET_HASH_IP=m
CONFIG_IP_SET_HASH_IPMARK=m
CONFIG_IP_SET_HASH_IPPORT=m
CONFIG_IP_SET_HASH_IPPORTIP=m
CONFIG_IP_SET_HASH_IPPORTNET=m
CONFIG_IP_SET_HASH_NET=m
CONFIG_IP_SET_HASH_NETPORT=m
CONFIG_IP_SET_HASH_NETIFACE=m
CONFIG_IP_SET_LIST_SET=m
CONFIG_IP_VS=m
CONFIG_IP_VS_IPV6=y
CONFIG_IP_VS_PROTO_TCP=y
CONFIG_IP_VS_PROTO_UDP=y
CONFIG_IP_VS_PROTO_ESP=y
CONFIG_IP_VS_PROTO_AH=y
CONFIG_IP_VS_PROTO_SCTP=y
CONFIG_IP_VS_RR=m
CONFIG_IP_VS_WRR=m
CONFIG_IP_VS_LC=m
CONFIG_IP_VS_WLC=m
CONFIG_IP_VS_FO=m
CONFIG_IP_VS_OVF=m
CONFIG_IP_VS_LBLC=m
CONFIG_IP_VS_LBLCR=m
CONFIG_IP_VS_DH=m
CONFIG_IP_VS_SH=m
CONFIG_IP_VS_MH=m
CONFIG_IP_VS_SED=m
CONFIG_IP_VS_NQ=m
CONFIG_IP_VS_SH_TAB_BITS=10
CONFIG_IP_VS_FTP=m
CONFIG_IP_VS_PE_SIP=m
CONFIG_NFT_DUP_IPV4=m
CONFIG_NFT_FIB_IPV4=m
CONFIG_NF_TABLES_ARP=y
CONFIG_NF_LOG_ARP=m
CONFIG_IP_NF_IPTABLES=m
CONFIG_IP_NF_MATCH_AH=m
CONFIG_IP_NF_MATCH_ECN=m
CONFIG_IP_NF_MATCH_RPFILTER=m
CONFIG_IP_NF_MATCH_TTL=m
CONFIG_IP_NF_FILTER=m
CONFIG_IP_NF_TARGET_REJECT=m
CONFIG_IP_NF_NAT=m
CONFIG_IP_NF_TARGET_MASQUERADE=m
CONFIG_IP_NF_TARGET_REDIRECT=m
CONFIG_IP_NF_MANGLE=m
CONFIG_IP_NF_TARGET_ECN=m
CONFIG_IP_NF_TARGET_TTL=m
CONFIG_IP_NF_RAW=m
CONFIG_IP_NF_ARPTABLES=m
CONFIG_IP_NF_ARPFILTER=m
CONFIG_IP_NF_ARP_MANGLE=m
CONFIG_NFT_DUP_IPV6=m
CONFIG_NFT_FIB_IPV6=m
CONFIG_NF_TABLES_BRIDGE=m
CONFIG_NFT_BRIDGE_META=m
CONFIG_NFT_BRIDGE_REJECT=m
CONFIG_BRIDGE_NF_EBTABLES=m
CONFIG_BRIDGE_EBT_BROUTE=m
CONFIG_BRIDGE_EBT_T_FILTER=m
CONFIG_BRIDGE_EBT_T_NAT=m
CONFIG_BRIDGE_EBT_802_3=m
CONFIG_BRIDGE_EBT_AMONG=m
CONFIG_BRIDGE_EBT_ARP=m
CONFIG_BRIDGE_EBT_IP=m
CONFIG_BRIDGE_EBT_IP6=m
CONFIG_BRIDGE_EBT_LIMIT=m
CONFIG_BRIDGE_EBT_MARK=m
CONFIG_BRIDGE_EBT_PKTTYPE=m
CONFIG_BRIDGE_EBT_STP=m
CONFIG_BRIDGE_EBT_VLAN=m
CONFIG_BRIDGE_EBT_ARPREPLY=m
CONFIG_BRIDGE_EBT_DNAT=m
CONFIG_BRIDGE_EBT_MARK_T=m
CONFIG_BRIDGE_EBT_REDIRECT=m
CONFIG_BRIDGE_EBT_SNAT=m
CONFIG_BRIDGE_EBT_LOG=m
CONFIG_BRIDGE_EBT_NFLOG=m
CONFIG_IP_SCTP=m
CONFIG_SCTP_DEFAULT_COOKIE_HMAC_SHA1=y
CONFIG_L2TP=m
CONFIG_L2TP_DEBUGFS=m
CONFIG_L2TP_V3=y
CONFIG_L2TP_IP=m
CONFIG_L2TP_ETH=m
CONFIG_BRIDGE=y
CONFIG_BRIDGE_VLAN_FILTERING=y
CONFIG_VLAN_8021Q=y
CONFIG_VLAN_8021Q_GVRP=y
CONFIG_VLAN_8021Q_MVRP=y
CONFIG_NET_SCHED=y
CONFIG_NET_SCH_HTB=m
CONFIG_NET_SCH_HFSC=m
CONFIG_NET_SCH_PRIO=m
CONFIG_NET_SCH_MULTIQ=m
CONFIG_NET_SCH_RED=m
CONFIG_NET_SCH_SFB=m
CONFIG_NET_SCH_SFQ=m
CONFIG_NET_SCH_TEQL=m
CONFIG_NET_SCH_TBF=m
CONFIG_NET_SCH_CBS=m
CONFIG_NET_SCH_ETF=m
CONFIG_NET_SCH_TAPRIO=m
CONFIG_NET_SCH_GRED=m
CONFIG_NET_SCH_NETEM=m
CONFIG_NET_SCH_DRR=m
CONFIG_NET_SCH_MQPRIO=m
CONFIG_NET_SCH_SKBPRIO=m
CONFIG_NET_SCH_CHOKE=m
CONFIG_NET_SCH_QFQ=m
CONFIG_NET_SCH_CODEL=m
CONFIG_NET_SCH_FQ_CODEL=m
CONFIG_NET_SCH_CAKE=m
CONFIG_NET_SCH_FQ=m
CONFIG_NET_SCH_HHF=m
CONFIG_NET_SCH_PIE=m
CONFIG_NET_SCH_INGRESS=m
CONFIG_NET_SCH_PLUG=m
CONFIG_NET_CLS_BASIC=m
CONFIG_NET_CLS_ROUTE4=m
CONFIG_NET_CLS_FW=m
CONFIG_NET_CLS_U32=m
CONFIG_CLS_U32_PERF=y
CONFIG_CLS_U32_MARK=y
CONFIG_NET_CLS_FLOW=m
CONFIG_NET_CLS_CGROUP=y
CONFIG_NET_CLS_BPF=m
CONFIG_NET_CLS_FLOWER=m
CONFIG_NET_CLS_MATCHALL=m
CONFIG_NET_EMATCH=y
CONFIG_NET_EMATCH_CMP=m
CONFIG_NET_EMATCH_NBYTE=m
CONFIG_NET_EMATCH_U32=m
CONFIG_NET_EMATCH_META=m
CONFIG_NET_EMATCH_TEXT=m
CONFIG_NET_EMATCH_IPSET=m
CONFIG_NET_CLS_ACT=y
CONFIG_NET_ACT_POLICE=m
CONFIG_NET_ACT_GACT=m
CONFIG_GACT_PROB=y
CONFIG_NET_ACT_MIRRED=m
CONFIG_NET_ACT_IPT=m
CONFIG_NET_ACT_NAT=m
CONFIG_NET_ACT_PEDIT=m
CONFIG_NET_ACT_SIMP=m
CONFIG_NET_ACT_SKBEDIT=m
CONFIG_NET_ACT_CSUM=m
CONFIG_NET_ACT_VLAN=m
CONFIG_NET_ACT_BPF=m
CONFIG_NET_ACT_CONNMARK=m
CONFIG_DNS_RESOLVER=y
CONFIG_OPENVSWITCH=m
CONFIG_VSOCKETS=m
CONFIG_VIRTIO_VSOCKETS=m
CONFIG_NETLINK_DIAG=m
CONFIG_CGROUP_NET_PRIO=y
CONFIG_NET_PKTGEN=m
# CONFIG_WIRELESS is not set
CONFIG_PCI=y
CONFIG_PCIEPORTBUS=y
CONFIG_HOTPLUG_PCI_PCIE=y
CONFIG_PCIEASPM_PERFORMANCE=y
CONFIG_PCI_STUB=m
CONFIG_PCI_PF_STUB=m
CONFIG_VGA_ARB_MAX_GPUS=64
CONFIG_HOTPLUG_PCI=y
CONFIG_HOTPLUG_PCI_ACPI=y
CONFIG_VMD=m
CONFIG_DEVTMPFS=y
CONFIG_DEVTMPFS_MOUNT=y
CONFIG_FW_LOADER_USER_HELPER=y
CONFIG_CONNECTOR=y
CONFIG_DMI_SYSFS=y
CONFIG_ISCSI_IBFT=m
CONFIG_EFI_VARS_PSTORE=m
CONFIG_EFI_VARS_PSTORE_DEFAULT_DISABLE=y
CONFIG_EFI_CUSTOM_SSDT_OVERLAYS=y
# CONFIG_PNP_DEBUG_MESSAGES is not set
CONFIG_BLK_DEV_NULL_BLK=m
CONFIG_BLK_DEV_FD=m
CONFIG_BLK_DEV_PCIESSD_MTIP32XX=y
CONFIG_ZRAM=m
CONFIG_ZRAM_MULTI_COMP=y
CONFIG_BLK_DEV_LOOP=m
CONFIG_BLK_DEV_DRBD=m
CONFIG_BLK_DEV_NBD=m
CONFIG_BLK_DEV_RAM=m
CONFIG_BLK_DEV_RAM_SIZE=16384
CONFIG_VIRTIO_BLK=y
CONFIG_BLK_DEV_RBD=m
CONFIG_BLK_DEV_NVME=y
CONFIG_EEPROM_93CX6=m
CONFIG_RAID_ATTRS=y
CONFIG_BLK_DEV_SD=y
CONFIG_CHR_DEV_ST=m
CONFIG_BLK_DEV_SR=m
CONFIG_CHR_DEV_SG=y
CONFIG_CHR_DEV_SCH=m
CONFIG_SCSI_CONSTANTS=y
CONFIG_SCSI_LOGGING=y
CONFIG_SCSI_SCAN_ASYNC=y
CONFIG_SCSI_FC_ATTRS=m
CONFIG_SCSI_SAS_ATA=y
CONFIG_ISCSI_TCP=m
CONFIG_SCSI_CXGB3_ISCSI=m
CONFIG_BE2ISCSI=m
CONFIG_SCSI_HPSA=m
CONFIG_SCSI_AIC94XX=m
# CONFIG_AIC94XX_DEBUG is not set
CONFIG_SCSI_MVSAS=m
# CONFIG_SCSI_MVSAS_DEBUG is not set
CONFIG_SCSI_MVUMI=m
CONFIG_SCSI_ARCMSR=m
CONFIG_SCSI_HPTIOP=m
CONFIG_SCSI_BUSLOGIC=m
CONFIG_VMWARE_PVSCSI=m
CONFIG_LIBFC=m
CONFIG_LIBFCOE=m
CONFIG_FCOE=m
CONFIG_FCOE_FNIC=m
CONFIG_SCSI_ISCI=m
CONFIG_SCSI_IPS=m
CONFIG_SCSI_INITIO=m
CONFIG_SCSI_STEX=m
CONFIG_SCSI_QLA_FC=m
CONFIG_SCSI_QLA_ISCSI=m
CONFIG_SCSI_DEBUG=m
CONFIG_SCSI_PMCRAID=m
CONFIG_SCSI_PM8001=m
CONFIG_SCSI_BFA_FC=m
CONFIG_SCSI_VIRTIO=y
CONFIG_ATA=y
CONFIG_SATA_AHCI=y
CONFIG_SATA_AHCI_PLATFORM=y
CONFIG_SATA_ACARD_AHCI=y
CONFIG_SATA_SIL24=m
CONFIG_PDC_ADMA=m
CONFIG_SATA_QSTOR=m
CONFIG_SATA_SX4=m
CONFIG_ATA_PIIX=y
CONFIG_SATA_MV=m
CONFIG_SATA_NV=m
CONFIG_SATA_PROMISE=m
CONFIG_SATA_SIL=m
CONFIG_SATA_SIS=m
CONFIG_SATA_SVW=m
CONFIG_SATA_ULI=m
CONFIG_SATA_VIA=m
CONFIG_SATA_VITESSE=m
CONFIG_PATA_SCH=y
CONFIG_PATA_MPIIX=y
CONFIG_PATA_ACPI=m
CONFIG_ATA_GENERIC=y
CONFIG_MD=y
CONFIG_BLK_DEV_MD=y
CONFIG_MD_LINEAR=m
CONFIG_MD_MULTIPATH=m
CONFIG_MD_FAULTY=m
CONFIG_BLK_DEV_DM=m
CONFIG_DM_DEBUG=y
CONFIG_DM_CRYPT=m
CONFIG_DM_SNAPSHOT=m
CONFIG_DM_THIN_PROVISIONING=m
CONFIG_DM_CACHE=m
CONFIG_DM_ERA=m
CONFIG_DM_MIRROR=m
CONFIG_DM_LOG_USERSPACE=m
CONFIG_DM_RAID=m
CONFIG_DM_ZERO=m
CONFIG_DM_MULTIPATH=m
CONFIG_DM_MULTIPATH_QL=m
CONFIG_DM_MULTIPATH_ST=m
CONFIG_DM_DELAY=m
CONFIG_DM_FLAKEY=m
CONFIG_DM_VERITY=m
CONFIG_DM_SWITCH=m
CONFIG_DM_LOG_WRITES=m
CONFIG_TARGET_CORE=m
CONFIG_TCM_IBLOCK=m
CONFIG_TCM_FILEIO=m
CONFIG_TCM_PSCSI=m
CONFIG_TCM_USER2=m
CONFIG_LOOPBACK_TARGET=m
CONFIG_TCM_FC=m
CONFIG_ISCSI_TARGET=m
CONFIG_BONDING=m
CONFIG_DUMMY=m
CONFIG_IFB=m
CONFIG_NET_TEAM=m
CONFIG_NET_TEAM_MODE_BROADCAST=m
CONFIG_NET_TEAM_MODE_ROUNDROBIN=m
CONFIG_NET_TEAM_MODE_RANDOM=m
CONFIG_NET_TEAM_MODE_ACTIVEBACKUP=m
CONFIG_NET_TEAM_MODE_LOADBALANCE=m
CONFIG_MACVLAN=y
CONFIG_MACVTAP=y
CONFIG_IPVLAN=m
CONFIG_IPVTAP=m
CONFIG_GENEVE=m
CONFIG_NETCONSOLE=m
CONFIG_NETCONSOLE_DYNAMIC=y
CONFIG_TUN=y
CONFIG_VETH=m
CONFIG_VIRTIO_NET=m
CONFIG_NET_VRF=m
CONFIG_VSOCKMON=m
# CONFIG_NET_VENDOR_3COM is not set
# CONFIG_NET_VENDOR_ADAPTEC is not set
# CONFIG_NET_VENDOR_AGERE is not set
# CONFIG_NET_VENDOR_ALACRITECH is not set
# CONFIG_NET_VENDOR_ALTEON is not set
# CONFIG_NET_VENDOR_AMAZON is not set
# CONFIG_NET_VENDOR_AMD is not set
# CONFIG_NET_VENDOR_AQUANTIA is not set
# CONFIG_NET_VENDOR_ARC is not set
CONFIG_ATL2=m
CONFIG_ATL1=m
CONFIG_ATL1E=m
CONFIG_ATL1C=m
CONFIG_ALX=m
CONFIG_BNX2=m
CONFIG_BNX2X=m
CONFIG_MACB=m
# CONFIG_NET_VENDOR_CAVIUM is not set
CONFIG_CHELSIO_T4=m
CONFIG_CHELSIO_T4VF=m
CONFIG_ENIC=m
CONFIG_DNET=m
# CONFIG_NET_VENDOR_DEC is not set
# CONFIG_NET_VENDOR_DLINK is not set
# CONFIG_NET_VENDOR_I825XX is not set
CONFIG_E1000=m
CONFIG_E1000E=m
CONFIG_IGB=m
# CONFIG_IGB_HWMON is not set
CONFIG_IGBVF=m
CONFIG_FM10K=m
CONFIG_IGC=m
CONFIG_JME=m
CONFIG_MVMDIO=m
CONFIG_MLX4_EN=m
CONFIG_MLX5_CORE=m
CONFIG_MLX5_FPGA=y
CONFIG_MLX5_CORE_EN=y
CONFIG_MLX5_CORE_IPOIB=y
CONFIG_MLX5_EN_IPSEC=y
CONFIG_MLXSW_CORE=m
# CONFIG_NET_VENDOR_MICREL is not set
# CONFIG_NET_VENDOR_MICROCHIP is not set
# CONFIG_NET_VENDOR_MICROSEMI is not set
# CONFIG_NET_VENDOR_MYRI is not set
# CONFIG_NET_VENDOR_NATSEMI is not set
# CONFIG_NET_VENDOR_NETERION is not set
# CONFIG_NET_VENDOR_NVIDIA is not set
# CONFIG_NET_VENDOR_OKI is not set
CONFIG_ETHOC=m
# CONFIG_NET_VENDOR_PENSANDO is not set
CONFIG_QLA3XXX=m
CONFIG_QLCNIC=m
CONFIG_NETXEN_NIC=m
# CONFIG_NET_VENDOR_BROCADE is not set
# CONFIG_NET_VENDOR_QUALCOMM is not set
# CONFIG_NET_VENDOR_RDC is not set
CONFIG_8139CP=m
CONFIG_8139TOO=m
# CONFIG_8139TOO_PIO is not set
CONFIG_8139TOO_8129=y
CONFIG_R8169=m
# CONFIG_NET_VENDOR_RENESAS is not set
# CONFIG_NET_VENDOR_ROCKER is not set
# CONFIG_NET_VENDOR_SAMSUNG is not set
# CONFIG_NET_VENDOR_SEEQ is not set
# CONFIG_NET_VENDOR_SILAN is not set
# CONFIG_NET_VENDOR_SIS is not set
# CONFIG_NET_VENDOR_SMSC is not set
# CONFIG_NET_VENDOR_STMICRO is not set
# CONFIG_NET_VENDOR_SUN is not set
# CONFIG_NET_VENDOR_SYNOPSYS is not set
# CONFIG_NET_VENDOR_TEHUTI is not set
# CONFIG_NET_VENDOR_TI is not set
# CONFIG_NET_VENDOR_VIA is not set
# CONFIG_NET_VENDOR_WIZNET is not set
# CONFIG_NET_VENDOR_XILINX is not set
CONFIG_AMD_PHY=m
CONFIG_BROADCOM_PHY=m
CONFIG_BCM87XX_PHY=m
CONFIG_CICADA_PHY=m
CONFIG_DAVICOM_PHY=m
CONFIG_ICPLUS_PHY=m
CONFIG_LXT_PHY=m
CONFIG_LSI_ET1011C_PHY=m
CONFIG_MARVELL_PHY=m
CONFIG_MICREL_PHY=m
CONFIG_NATIONAL_PHY=m
CONFIG_QSEMI_PHY=m
CONFIG_STE10XP=m
CONFIG_VITESSE_PHY=m
CONFIG_MDIO_BITBANG=m
CONFIG_PPP=m
CONFIG_PPP_BSDCOMP=m
CONFIG_PPP_DEFLATE=m
CONFIG_PPP_MPPE=m
CONFIG_PPPOE=m
CONFIG_PPTP=m
CONFIG_PPPOL2TP=m
CONFIG_PPP_ASYNC=m
CONFIG_PPP_SYNC_TTY=m
CONFIG_SLIP=m
# CONFIG_WLAN is not set
CONFIG_INPUT_SPARSEKMAP=m
CONFIG_INPUT_EVDEV=y
CONFIG_KEYBOARD_ATKBD=m
# CONFIG_INPUT_MOUSE is not set
CONFIG_INPUT_MISC=y
CONFIG_INPUT_UINPUT=m
CONFIG_SERIO_I8042=m
CONFIG_SERIO_SERPORT=m
CONFIG_SERIO_RAW=m
# CONFIG_LEGACY_PTYS is not set
CONFIG_SERIAL_8250=y
# CONFIG_SERIAL_8250_DEPRECATED_OPTIONS is not set
CONFIG_SERIAL_8250_CONSOLE=y
CONFIG_SERIAL_8250_EXAR=m
CONFIG_SERIAL_8250_LPSS=m
CONFIG_SERIAL_8250_MID=m
CONFIG_SERIAL_JSM=m
CONFIG_SERIAL_ARC=m
CONFIG_N_GSM=m
CONFIG_NULL_TTY=m
CONFIG_TTY_PRINTK=y
CONFIG_VIRTIO_CONSOLE=y
CONFIG_IPMI_HANDLER=m
CONFIG_IPMI_DEVICE_INTERFACE=m
CONFIG_IPMI_WATCHDOG=m
CONFIG_IPMI_POWEROFF=m
CONFIG_HW_RANDOM=y
CONFIG_HW_RANDOM_TIMERIOMEM=m
CONFIG_HW_RANDOM_INTEL=m
CONFIG_HW_RANDOM_AMD=m
# CONFIG_HW_RANDOM_VIA is not set
CONFIG_HW_RANDOM_VIRTIO=m
CONFIG_NVRAM=m
CONFIG_HPET=y
CONFIG_HANGCHECK_TIMER=m
CONFIG_TCG_TPM=m
CONFIG_TCG_TIS=m
CONFIG_TCG_NSC=m
CONFIG_TCG_ATMEL=m
CONFIG_TCG_INFINEON=m
CONFIG_TELCLOCK=m
# CONFIG_I2C_COMPAT is not set
# CONFIG_I2C_HELPER_AUTO is not set
CONFIG_SENSORS_FAM15H_POWER=m
CONFIG_SENSORS_CORETEMP=m
# CONFIG_THERMAL_HWMON is not set
CONFIG_DRM=m
CONFIG_DRM_AST=m
CONFIG_DRM_QXL=m
CONFIG_DRM_VIRTIO_GPU=m
CONFIG_DRM_BOCHS=m
CONFIG_DRM_CIRRUS_QEMU=m
CONFIG_FB=y
CONFIG_FIRMWARE_EDID=y
CONFIG_LCD_CLASS_DEVICE=m
CONFIG_FRAMEBUFFER_CONSOLE_ROTATION=y
# CONFIG_HID is not set
# CONFIG_USB_SUPPORT is not set
CONFIG_SCSI_UFSHCD=m
CONFIG_SCSI_UFSHCD_PCI=m
CONFIG_RTC_CLASS=y
# CONFIG_RTC_HCTOSYS is not set
# CONFIG_RTC_SYSTOHC is not set
# CONFIG_RTC_NVMEM is not set
CONFIG_DMADEVICES=y
CONFIG_UIO=m
CONFIG_UIO_PDRV_GENIRQ=m
CONFIG_UIO_DMEM_GENIRQ=m
CONFIG_UIO_PCI_GENERIC=m
CONFIG_VFIO=m
CONFIG_VFIO_PCI=m
CONFIG_VIRT_DRIVERS=y
CONFIG_VIRTIO_PCI=y
CONFIG_VIRTIO_PMEM=m
CONFIG_VIRTIO_BALLOON=m
CONFIG_VIRTIO_INPUT=m
CONFIG_VIRTIO_MMIO=m
CONFIG_VHOST_NET=m
CONFIG_VHOST_SCSI=m
CONFIG_VHOST_VSOCK=m
CONFIG_AMD_IOMMU=y
CONFIG_AMD_IOMMU_V2=m
CONFIG_INTEL_IOMMU=y
CONFIG_IRQ_REMAP=y
CONFIG_RAS_CEC=y
CONFIG_LIBNVDIMM=y
CONFIG_BLK_DEV_PMEM=m
CONFIG_DEV_DAX=m
CONFIG_NVMEM=y
CONFIG_COUNTER=m
CONFIG_EXT4_FS=y
CONFIG_EXT4_FS_POSIX_ACL=y
CONFIG_EXT4_FS_SECURITY=y
CONFIG_XFS_FS=m
CONFIG_XFS_QUOTA=y
CONFIG_XFS_POSIX_ACL=y
CONFIG_XFS_RT=y
CONFIG_XFS_WARN=y
CONFIG_FS_ENCRYPTION=y
CONFIG_FANOTIFY=y
CONFIG_FANOTIFY_ACCESS_PERMISSIONS=y
CONFIG_QUOTA_NETLINK_INTERFACE=y
CONFIG_FUSE_FS=m
CONFIG_CUSE=m
CONFIG_VIRTIO_FS=m
CONFIG_OVERLAY_FS=m
CONFIG_OVERLAY_FS_INDEX=y
CONFIG_OVERLAY_FS_METACOPY=y
CONFIG_FSCACHE=m
CONFIG_FSCACHE_STATS=y
CONFIG_CACHEFILES=m
CONFIG_ISO9660_FS=m
CONFIG_JOLIET=y
CONFIG_ZISOFS=y
CONFIG_UDF_FS=m
CONFIG_MSDOS_FS=m
CONFIG_VFAT_FS=m
CONFIG_FAT_DEFAULT_IOCHARSET="ascii"
CONFIG_NTFS_FS=m
CONFIG_NTFS_RW=y
CONFIG_PROC_KCORE=y
CONFIG_TMPFS=y
CONFIG_TMPFS_POSIX_ACL=y
CONFIG_HUGETLBFS=y
CONFIG_CONFIGFS_FS=y
CONFIG_EFIVAR_FS=y
CONFIG_ECRYPT_FS=m
CONFIG_HFSPLUS_FS=m
CONFIG_CRAMFS=m
CONFIG_SQUASHFS=y
CONFIG_SQUASHFS_XATTR=y
CONFIG_SQUASHFS_LZ4=y
CONFIG_SQUASHFS_LZO=y
CONFIG_SQUASHFS_XZ=y
CONFIG_NFS_FS=m
CONFIG_NFS_V3_ACL=y
CONFIG_NFS_V4=m
CONFIG_NFS_V4_1=y
CONFIG_NFS_V4_2=y
CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN=""
CONFIG_NFS_FSCACHE=y
CONFIG_NFSD=m
CONFIG_NFSD_V3_ACL=y
CONFIG_NFSD_V4=y
CONFIG_NFSD_BLOCKLAYOUT=y
CONFIG_NFSD_SCSILAYOUT=y
CONFIG_NFSD_FLEXFILELAYOUT=y
CONFIG_SUNRPC_DEBUG=y
CONFIG_CEPH_FS=m
CONFIG_CEPH_FSCACHE=y
CONFIG_CEPH_FS_POSIX_ACL=y
CONFIG_CIFS=m
CONFIG_CIFS_UPCALL=y
CONFIG_CIFS_XATTR=y
CONFIG_CIFS_POSIX=y
# CONFIG_CIFS_DEBUG is not set
CONFIG_CIFS_DFS_UPCALL=y
CONFIG_CIFS_FSCACHE=y
CONFIG_NLS_DEFAULT="utf8"
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ASCII=y
CONFIG_NLS_ISO8859_1=y
CONFIG_NLS_UTF8=y
CONFIG_TRUSTED_KEYS=m
CONFIG_ENCRYPTED_KEYS=m
CONFIG_SECURITY_DMESG_RESTRICT=y
CONFIG_SECURITY=y
CONFIG_SECURITY_NETWORK=y
CONFIG_SECURITY_PATH=y
CONFIG_FORTIFY_SOURCE=y
CONFIG_SECURITY_SAFESETID=y
# CONFIG_INTEGRITY is not set
CONFIG_LSM="loadpin,safesetid,integrity,bpf"
CONFIG_INIT_ON_ALLOC_DEFAULT_ON=y
CONFIG_BUG_ON_DATA_CORRUPTION=y
CONFIG_CRYPTO_USER=m
CONFIG_CRYPTO_NULL=y
CONFIG_CRYPTO_PCRYPT=m
CONFIG_CRYPTO_TEST=m
CONFIG_CRYPTO_ANUBIS=m
CONFIG_CRYPTO_BLOWFISH=m
CONFIG_CRYPTO_CAMELLIA=m
CONFIG_CRYPTO_FCRYPT=m
CONFIG_CRYPTO_KHAZAD=m
CONFIG_CRYPTO_SEED=m
CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_ARC4=m
CONFIG_CRYPTO_CFB=m
CONFIG_CRYPTO_KEYWRAP=m
CONFIG_CRYPTO_LRW=m
CONFIG_CRYPTO_PCBC=m
CONFIG_CRYPTO_CHACHA20POLY1305=m
CONFIG_CRYPTO_MICHAEL_MIC=m
CONFIG_CRYPTO_RMD160=m
CONFIG_CRYPTO_SHA1=y
CONFIG_CRYPTO_VMAC=m
CONFIG_CRYPTO_WP512=m
CONFIG_CRYPTO_XCBC=m
CONFIG_CRYPTO_CRC32=m
CONFIG_CRYPTO_842=m
CONFIG_CRYPTO_ANSI_CPRNG=m
CONFIG_CRYPTO_USER_API_HASH=y
CONFIG_CRYPTO_USER_API_SKCIPHER=y
CONFIG_CRYPTO_USER_API_RNG=m
CONFIG_CRYPTO_USER_API_AEAD=m
CONFIG_CRYPTO_AES_NI_INTEL=m
CONFIG_CRYPTO_BLOWFISH_X86_64=m
CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64=m
CONFIG_CRYPTO_CAST5_AVX_X86_64=m
CONFIG_CRYPTO_CAST6_AVX_X86_64=m
CONFIG_CRYPTO_SERPENT_SSE2_X86_64=m
CONFIG_CRYPTO_SERPENT_AVX2_X86_64=m
CONFIG_CRYPTO_TWOFISH_AVX_X86_64=m
CONFIG_CRYPTO_SHA1_SSSE3=m
CONFIG_CRYPTO_SHA256_SSSE3=m
CONFIG_CRYPTO_SHA512_SSSE3=m
CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL=m
CONFIG_CRYPTO_CRC32_PCLMUL=m
CONFIG_CRYPTO_DEV_PADLOCK=m
CONFIG_CRYPTO_DEV_PADLOCK_AES=m
CONFIG_CRYPTO_DEV_PADLOCK_SHA=m
CONFIG_CRYPTO_DEV_CCP=y
CONFIG_CRYPTO_DEV_QAT_DH895xCC=m
CONFIG_CRYPTO_DEV_QAT_C3XXX=m
CONFIG_CRYPTO_DEV_QAT_C62X=m
CONFIG_CRYPTO_DEV_QAT_DH895xCCVF=m
CONFIG_CRYPTO_DEV_QAT_C3XXXVF=m
CONFIG_CRYPTO_DEV_QAT_C62XVF=m
CONFIG_CORDIC=m
CONFIG_CRC7=m
CONFIG_LIBCRC32C=y
CONFIG_PRINTK_TIME=y
CONFIG_DYNAMIC_DEBUG=y
CONFIG_STRIP_ASM_SYMS=y
CONFIG_DEBUG_SECTION_MISMATCH=y
CONFIG_MAGIC_SYSRQ=y
# CONFIG_MAGIC_SYSRQ_SERIAL is not set
CONFIG_SCHED_STACK_END_CHECK=y
CONFIG_PANIC_ON_OOPS=y
CONFIG_HARDLOCKUP_DETECTOR=y
CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y
CONFIG_RCU_CPU_STALL_TIMEOUT=60
# CONFIG_RCU_TRACE is not set
CONFIG_LATENCYTOP=y
CONFIG_FUNCTION_PROFILER=y
CONFIG_STACK_TRACER=y
CONFIG_SCHED_TRACER=y
CONFIG_FTRACE_SYSCALLS=y
CONFIG_BLK_DEV_IO_TRACE=y
CONFIG_BPF_KPROBE_OVERRIDE=y
# CONFIG_X86_VERBOSE_BOOTUP is not set
# CONFIG_EARLY_PRINTK is not set
# CONFIG_X86_DEBUG_FPU is not set
CONFIG_NOTIFIER_ERROR_INJECTION=m
CONFIG_FUNCTION_ERROR_INJECTION=y
# CONFIG_RUNTIME_TESTING_MENU is not set


@ -139,6 +139,8 @@ BuildRequires: gcc-plugin-devel
# glibc-static is required for a consistent build environment (specifically
# CONFIG_CC_CAN_LINK_STATIC=y).
BuildRequires: glibc-static
# Kernel could be compressed with lz4
BuildRequires: lz4
%if %{with_perf}
BuildRequires: zlib-devel binutils-devel newt-devel perl(ExtUtils::Embed) bison flex xz-devel


@ -360,6 +360,9 @@ static int brd_alloc(int i)
blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, disk->queue);
blk_queue_flag_set(QUEUE_FLAG_NOWAIT, disk->queue);
#ifdef CONFIG_EMM_RAMDISK_SWAP
blk_queue_flag_set(QUEUE_FLAG_RAMDISK, disk->queue);
#endif
err = add_disk(disk);
if (err)
goto out_cleanup_disk;


@ -2220,6 +2220,9 @@ static int zram_add(void)
/* zram devices sort of resembles non-rotational disks */
blk_queue_flag_set(QUEUE_FLAG_NONROT, zram->disk->queue);
blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, zram->disk->queue);
#ifdef CONFIG_EMM_RAMDISK_SWAP
blk_queue_flag_set(QUEUE_FLAG_RAMDISK, zram->disk->queue);
#endif
/*
* To ensure that we always get PAGE_SIZE aligned


@ -573,6 +573,9 @@ struct request_queue {
#define QUEUE_FLAG_NOWAIT 29 /* device supports NOWAIT */
#define QUEUE_FLAG_SQ_SCHED 30 /* single queue style io dispatch */
#define QUEUE_FLAG_SKIP_TAGSET_QUIESCE 31 /* quiesce_tagset skip the queue*/
#ifdef CONFIG_EMM_RAMDISK_SWAP
#define QUEUE_FLAG_RAMDISK 32 /* ramdisk requires runtime page alloc */
#endif
#define QUEUE_FLAG_MQ_DEFAULT ((1UL << QUEUE_FLAG_IO_STAT) | \
(1UL << QUEUE_FLAG_SAME_COMP) | \
@ -1396,6 +1399,7 @@ struct block_device_operations {
unsigned int flags);
int (*open)(struct gendisk *disk, blk_mode_t mode);
void (*release)(struct gendisk *disk);
int (*swap_folio)(struct block_device *, sector_t, struct folio *, enum req_op);
int (*ioctl)(struct block_device *bdev, blk_mode_t mode,
unsigned cmd, unsigned long arg);
int (*compat_ioctl)(struct block_device *bdev, blk_mode_t mode,
@ -1437,6 +1441,10 @@ extern int blkdev_compat_ptr_ioctl(struct block_device *, blk_mode_t,
#define blkdev_compat_ptr_ioctl NULL
#endif
extern int bdev_swapin_folio(struct block_device *, sector_t, struct folio *);
extern int bdev_swapout_folio(struct block_device *, sector_t, struct folio *,
struct writeback_control *);
static inline void blk_wake_io_task(struct task_struct *waiter)
{
/*
@ -1564,4 +1572,15 @@ struct io_comp_batch {
#define DEFINE_IO_COMP_BATCH(name) struct io_comp_batch name = { }
#ifdef CONFIG_EMM_RAMDISK_SWAP
/*
* Check if a bdev is ramdisk based
*/
static inline bool bdev_ramdisk(struct block_device *bdev)
{
return test_bit(QUEUE_FLAG_RAMDISK,
&bdev_get_queue(bdev)->queue_flags);
}
#endif
#endif /* _LINUX_BLKDEV_H */
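A hedged sketch of how swap-side code might consult the new bdev_ramdisk() helper. sysctl_vm_swapcache_fastfree is the knob added later in this series, but pairing the two as below is only an assumption for illustration, and the function name is hypothetical.

#include <linux/blkdev.h>

extern int sysctl_vm_swapcache_fastfree;

static bool demo_can_fastfree_swapcache(struct block_device *bdev)
{
#ifdef CONFIG_EMM_RAMDISK_SWAP
        /* Only worth doing when swap "I/O" is really just a memory copy. */
        return READ_ONCE(sysctl_vm_swapcache_fastfree) && bdev_ramdisk(bdev);
#else
        return false;
#endif
}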

include/linux/emm.h (new file, 37 lines)

@ -0,0 +1,37 @@
#include <linux/memcontrol.h>
#ifdef CONFIG_EMM_MEMCG
struct emm_memcg_ops {
int (*init)(struct mem_cgroup *memcg);
void (*exit)(struct mem_cgroup *memcg);
};
int emm_memcg_init(struct mem_cgroup *memcg);
void emm_memcg_exit(struct mem_cgroup *memcg);
int emm_init(struct emm_memcg_ops *ops);
int emm_exit(void);
#else
static inline int emm_memcg_init(struct mem_cgroup *memcg)
{
return 0;
}
static inline void emm_memcg_exit(struct mem_cgroup *memcg)
{
}
#endif
#ifdef CONFIG_EMM_RECLAIM
enum {
EMM_RECLAIM,
EMM_AGE,
EMM_MIX,
};
#endif


@ -37,6 +37,10 @@ enum memcg_stat_item {
MEMCG_KMEM,
MEMCG_ZSWAP_B,
MEMCG_ZSWAPPED,
#ifdef CONFIG_MEMCG_ZRAM
MEMCG_ZRAM_B,
MEMCG_ZRAMED,
#endif
MEMCG_NR_STAT,
};
@ -231,6 +235,11 @@ struct mem_cgroup {
unsigned long zswap_max;
#endif
#ifdef CONFIG_MEMCG_ZRAM
unsigned long zram_max;
unsigned short zram_prio;
#endif
unsigned long soft_limit;
/* vmpressure notifications */
@ -326,11 +335,6 @@ struct mem_cgroup {
struct list_head event_list;
spinlock_t event_list_lock;
KABI_RESERVE(1);
KABI_RESERVE(2);
KABI_RESERVE(3);
KABI_RESERVE(4);
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
struct deferred_split deferred_split_queue;
#endif
@ -340,6 +344,27 @@ struct mem_cgroup {
struct lru_gen_mm_list mm_list;
#endif
#ifdef CONFIG_EMM_MEMCG
/* EMM: for tracking cgroup-level info on the fly with minimal overhead */
void *emm_memcg_data;
#endif
#ifdef CONFIG_TEXT_UNEVICTABLE
bool allow_unevictable;
unsigned int unevictable_percent;
/*
* unevictable_size may be larger than the real unevictable memory
* size, because multiple tasks may share the same memory, e.g.
* shared binaries and dynamic libraries.
*/
atomic_long_t unevictable_size;
#endif
KABI_RESERVE(1);
KABI_RESERVE(2);
KABI_RESERVE(3);
KABI_RESERVE(4);
struct mem_cgroup_per_node *nodeinfo[];
};
@ -1046,8 +1071,8 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
return x;
}
void mem_cgroup_flush_stats(void);
void mem_cgroup_flush_stats_ratelimited(void);
void mem_cgroup_flush_stats(struct mem_cgroup *memcg);
void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg);
void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
int val);
@ -1531,11 +1556,11 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
return node_page_state(lruvec_pgdat(lruvec), idx);
}
static inline void mem_cgroup_flush_stats(void)
static inline void mem_cgroup_flush_stats(struct mem_cgroup *memcg)
{
}
static inline void mem_cgroup_flush_stats_ratelimited(void)
static inline void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg)
{
}
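The signature change above turns the global stats flush into a targeted one. A hedged example of a reader using the new form (memcg_page_state() is the existing accessor; the function name is hypothetical):

#include <linux/memcontrol.h>

static unsigned long demo_anon_bytes(struct mem_cgroup *memcg)
{
        /* Flush only this memcg's subtree instead of the whole rstat tree. */
        mem_cgroup_flush_stats(memcg);

        return memcg_page_state(memcg, NR_ANON_MAPPED) * PAGE_SIZE;
}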


@ -444,6 +444,9 @@ extern unsigned int kobjsize(const void *objp);
/* This mask represents all the VMA flag bits used by mlock */
#define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT)
/* This mask is used to clear all the VMA flags used by mlock */
#define VM_LOCKED_CLEAR_MASK (~(VM_LOCKED | VM_LOCKONFAULT))
/* Arch-specific flags to clear when updating VM flags on protection change */
#ifndef VM_ARCH_CLEAR
# define VM_ARCH_CLEAR VM_NONE
@ -2330,6 +2333,9 @@ static inline bool can_do_mlock(void) { return false; }
#endif
extern int user_shm_lock(size_t, struct ucounts *);
extern void user_shm_unlock(size_t, struct ucounts *);
extern int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
struct vm_area_struct **prev, unsigned long start,
unsigned long end, vm_flags_t newflags);
struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr,
pte_t pte);


@ -132,6 +132,14 @@ static inline int lru_hist_from_seq(unsigned long seq)
return seq % NR_HIST_GENS;
}
static inline int lru_hist_of_min_seq(struct lruvec *lruvec, bool type)
{
if (IS_ENABLED(CONFIG_LRU_GEN_STATS))
return lru_gen_from_seq(READ_ONCE(lruvec->lrugen.min_seq[type]));
else
return 0;
}
static inline int lru_tier_from_refs(int refs)
{
VM_WARN_ON_ONCE(refs > BIT(LRU_REFS_WIDTH));
@ -231,27 +239,22 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio,
if (folio_test_unevictable(folio) || !lrugen->enabled)
return false;
/*
* There are four common cases for this page:
* 1. If it's hot, i.e., freshly faulted in, add it to the youngest
* generation, and it's protected over the rest below.
* 2. If it can't be evicted immediately, i.e., a dirty page pending
* writeback, add it to the second youngest generation.
* 3. If it should be evicted first, e.g., cold and clean from
* folio_rotate_reclaimable(), add it to the oldest generation.
* 4. Everything else falls between 2 & 3 above and is added to the
* second oldest generation if it's considered inactive, or the
* oldest generation otherwise. See lru_gen_is_active().
* There are three common cases for this page:
* 1. If it's hot, e.g., freshly faulted in or previously hot and
* migrated, add it to the youngest generation.
* 2. If it's cold but can't be evicted immediately, i.e., an anon page
* not in swapcache or a dirty page pending writeback, add it to the
* second oldest generation.
* 3. Everything else (clean, cold) is added to the oldest generation.
*/
if (folio_test_active(folio))
seq = lrugen->max_seq;
else if ((type == LRU_GEN_ANON && !folio_test_swapcache(folio)) ||
(folio_test_reclaim(folio) &&
(folio_test_dirty(folio) || folio_test_writeback(folio))))
seq = lrugen->max_seq - 1;
else if (reclaiming || lrugen->min_seq[type] + MIN_NR_GENS >= lrugen->max_seq)
seq = lrugen->min_seq[type];
else
seq = lrugen->min_seq[type] + 1;
else
seq = lrugen->min_seq[type];
gen = lru_gen_from_seq(seq);
flags = (gen + 1UL) << LRU_GEN_PGOFF;


@ -425,9 +425,9 @@ struct lru_gen_folio {
/* the multi-gen LRU sizes, eventually consistent */
long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
/* the exponential moving average of refaulted */
unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS];
atomic_long_t avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS];
/* the exponential moving average of evicted+protected */
unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS];
atomic_long_t avg_total[ANON_AND_FILE][MAX_NR_TIERS];
/* the first tier doesn't need protection, hence the minus one */
unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS - 1];
/* can be modified without holding the LRU lock */
@ -486,6 +486,9 @@ struct lru_gen_mm_walk {
int batched;
bool can_swap;
bool force_scan;
#ifdef CONFIG_EMM_RECLAIM
bool force_full_scan;
#endif
};
void lru_gen_init_lruvec(struct lruvec *lruvec);
@ -626,8 +629,8 @@ struct lruvec {
*/
unsigned long anon_cost;
unsigned long file_cost;
/* Non-resident age, driven by LRU movement */
atomic_long_t nonresident_age;
/* Number of evictions (non-resident age) */
atomic_long_t evictions[ANON_AND_FILE];
/* Refaults at the time of last reclaim cycle */
unsigned long refaults[ANON_AND_FILE];
/* Various lruvec state flags (enum lruvec_flags) */
@ -641,6 +644,18 @@ struct lruvec {
#ifdef CONFIG_MEMCG
struct pglist_data *pgdat;
#endif
#ifdef CONFIG_EMM_WORKINGSET_TRACKING
/* Non-resident file age, driven by LRU movement */
atomic_long_t evicted_file;
/* For estimating avg refault distance */
unsigned long refault_count;
unsigned long total_distance;
#endif
#ifdef CONFIG_EMM_MEMCG
void *emm_lruvec_data;
#endif
};
/* Isolate unmapped pages */
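For the CONFIG_EMM_WORKINGSET_TRACKING fields added to struct lruvec above, here is a hedged sketch of the derived metric they appear intended for (an average refault distance). The function name is hypothetical and the lockless read is intentionally racy, matching the "eventually consistent" style of the surrounding counters.

#include <linux/mmzone.h>

static unsigned long demo_avg_refault_distance(struct lruvec *lruvec)
{
#ifdef CONFIG_EMM_WORKINGSET_TRACKING
        unsigned long n = READ_ONCE(lruvec->refault_count);

        return n ? READ_ONCE(lruvec->total_distance) / n : 0;
#else
        return 0;
#endif
}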


@ -112,4 +112,5 @@ extern void oom_killer_enable(void);
extern struct task_struct *find_lock_task_mm(struct task_struct *p);
extern int sysctl_oom_kill_largest_task;
#endif /* _INCLUDE_LINUX_OOM_H */


@ -206,6 +206,9 @@ enum mapping_flags {
AS_RELEASE_ALWAYS, /* Call ->release_folio(), even if no private data */
AS_STABLE_WRITES, /* must wait for writeback before modifying
folio contents */
#ifdef CONFIG_EMM_RAMDISK_SWAP
AS_RAM_SWAP, /* ramdisk-based swap space; XXX: rename to something more commonly used */
#endif
};
/**
@ -306,6 +309,13 @@ static inline void mapping_clear_stable_writes(struct address_space *mapping)
clear_bit(AS_STABLE_WRITES, &mapping->flags);
}
#ifdef CONFIG_EMM_RAMDISK_SWAP
static inline int mapping_ram_swap(struct address_space *mapping)
{
return test_bit(AS_RAM_SWAP, &mapping->flags);
}
#endif
static inline gfp_t mapping_gfp_mask(struct address_space * mapping)
{
return mapping->gfp_mask;


@ -348,11 +348,9 @@ static inline swp_entry_t page_swap_entry(struct page *page)
}
/* linux/mm/workingset.c */
bool workingset_test_recent(void *shadow, bool file, bool *workingset);
void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages);
bool workingset_test_recent(void *shadow, bool file, bool *workingset, bool tracking);
void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg);
void workingset_refault(struct folio *folio, void *shadow);
void workingset_activation(struct folio *folio);
/* Only track the nodes of mappings with shadow entries */
void workingset_update_node(struct xa_node *node);
@ -524,7 +522,7 @@ extern int swp_swapcount(swp_entry_t entry);
extern struct swap_info_struct *page_swap_info(struct page *);
extern struct swap_info_struct *swp_swap_info(swp_entry_t entry);
struct backing_dev_info;
extern int init_swap_address_space(unsigned int type, unsigned long nr_pages);
extern int init_swap_address_space(struct swap_info_struct *si, unsigned long nr_pages);
extern void exit_swap_address_space(unsigned int type);
extern struct swap_info_struct *get_swap_device(swp_entry_t entry);
sector_t swap_page_sector(struct page *page);
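One plausible reason init_swap_address_space() now takes the swap_info_struct is so the swapon path can tag ramdisk-backed swap address_spaces. This is only an assumption; the hedged sketch below uses just the primitives added in this series, and the function name is hypothetical.

#include <linux/blkdev.h>
#include <linux/pagemap.h>
#include <linux/swap.h>

static void demo_mark_ram_swap(struct swap_info_struct *si,
                               struct address_space *mapping)
{
#ifdef CONFIG_EMM_RAMDISK_SWAP
        if (bdev_ramdisk(si->bdev))
                set_bit(AS_RAM_SWAP, &mapping->flags);
#endif
}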


@ -0,0 +1,67 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _TEXT_UNEVICTABLE_H
#define _TEXT_UNEVICTABLE_H
struct mem_cgroup;
#ifdef CONFIG_TEXT_UNEVICTABLE
DECLARE_STATIC_KEY_FALSE(unevictable_enabled_key);
static inline bool unevictable_enabled(void)
{
return static_branch_unlikely(&unevictable_enabled_key);
}
bool is_memcg_unevictable_enabled(struct mem_cgroup *memcg);
void memcg_increase_unevict_size(struct mem_cgroup *memcg, unsigned long size);
void memcg_decrease_unevict_size(struct mem_cgroup *memcg, unsigned long size);
bool is_unevictable_size_overflow(struct mem_cgroup *memcg);
unsigned long memcg_exstat_text_unevict_gather(struct mem_cgroup *memcg);
void mem_cgroup_can_unevictable(struct task_struct *tsk, struct mem_cgroup *to);
void mem_cgroup_cancel_unevictable(struct cgroup_taskset *tset);
void memcg_all_processes_unevict(struct mem_cgroup *memcg, bool enable);
void del_unevict_task(struct task_struct *tsk);
void clean_task_unevict_size(struct task_struct *tsk);
#else
static inline bool unevictable_enabled(void)
{
return false;
}
static inline bool is_memcg_unevictable_enabled(struct mem_cgroup *memcg)
{
return false;
}
static inline void memcg_increase_unevict_size(struct mem_cgroup *memcg,
unsigned long size)
{
}
static inline void memcg_decrease_unevict_size(struct mem_cgroup *memcg,
unsigned long size)
{
}
static inline bool is_unevictable_size_overflow(struct mem_cgroup *memcg)
{
return false;
}
static inline unsigned long memcg_exstat_text_unevict_gather(struct mem_cgroup *memcg)
{
return 0;
}
static inline void mem_cgroup_can_unevictable(struct task_struct *tsk,
struct mem_cgroup *to)
{
}
static inline void mem_cgroup_cancel_unevictable(struct cgroup_taskset *tset)
{
}
static inline void memcg_all_processes_unevict(struct mem_cgroup *memcg, bool enable)
{
}
static inline void del_unevict_task(struct task_struct *tsk)
{
}
static inline void clean_task_unevict_size(struct task_struct *tsk)
{
}
#endif
#endif
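A hedged sketch of how a reclaim-side check might combine the gates declared above (the static key keeps the common path cheap when the feature is off). demo_skip_text_folio() is a hypothetical name, and the caller is assumed to hold the usual RCU/folio references.

#include <linux/mm.h>
#include <linux/memcontrol.h>
#include <linux/unevictable.h>

static bool demo_skip_text_folio(struct folio *folio, struct vm_area_struct *vma)
{
        if (!unevictable_enabled())
                return false;           /* feature globally disabled (static key) */

        if (!(vma->vm_flags & VM_EXEC))
                return false;           /* only code sections are pinned */

        return is_memcg_unevictable_enabled(folio_memcg(folio));
}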


@ -958,6 +958,22 @@ config MEMCG_KMEM
depends on MEMCG
default y
config MEMCG_KMEM_DEFAULT_OFF
bool "Disable kernel memory cgroup accounting by default"
depends on MEMCG_KMEM
help
Disable kernel memory cgroup accounting by default, since it
has extra overhead. Users may override this at boot time by
passing cgroup.memory=nokmem or cgroup.memory=kmem on the
kernel cmdline.
If unsure, say N.
config MEMCG_ZRAM
bool
depends on MEMCG && SWAP
default y
config BLK_CGROUP
bool "IO controller"
depends on BLOCK


@ -4387,6 +4387,7 @@ int cgroup_rm_cftypes(struct cftype *cfts)
cgroup_unlock();
return 0;
}
EXPORT_SYMBOL_GPL(cgroup_rm_cftypes);
/**
* cgroup_add_cftypes - add an array of cftypes to a subsystem
@ -4443,6 +4444,7 @@ int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
cft->flags |= __CFTYPE_ONLY_ON_DFL;
return cgroup_add_cftypes(ss, cfts);
}
EXPORT_SYMBOL_GPL(cgroup_add_dfl_cftypes);
/**
* cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies
@ -4460,6 +4462,7 @@ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
cft->flags |= __CFTYPE_NOT_ON_DFL;
return cgroup_add_cftypes(ss, cfts);
}
EXPORT_SYMBOL_GPL(cgroup_add_legacy_cftypes);
/**
* cgroup_file_notify - generate a file modified event for a cgroup_file


@ -238,6 +238,7 @@ __bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp)
cgroup_rstat_flush_locked(cgrp);
spin_unlock_irq(&cgroup_rstat_lock);
}
EXPORT_SYMBOL_GPL(cgroup_rstat_flush);
/**
* cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold


@ -69,6 +69,9 @@
#include <linux/rethook.h>
#include <linux/sysfs.h>
#include <linux/user_events.h>
#ifdef CONFIG_TEXT_UNEVICTABLE
#include <linux/unevictable.h>
#endif
#include <linux/uaccess.h>
#include <asm/unistd.h>
@ -856,6 +859,9 @@ void __noreturn do_exit(long code)
tsk->exit_code = code;
taskstats_exit(tsk, group_dead);
#ifdef CONFIG_TEXT_UNEVICTABLE
clean_task_unevict_size(tsk);
#endif
exit_mm();
if (group_dead)


@ -134,12 +134,22 @@ static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT;
defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT)
int sysctl_legacy_va_layout;
#endif
#ifdef CONFIG_CGROUPFS
extern int container_cpuquota_aware;
extern int cgroupfs_stat_show_cpuacct_info;
int cgroupfs_mounted;
#endif
#ifdef CONFIG_EMM_FORCE_SWAPPINESS
extern int sysctl_vm_force_swappiness;
#endif
#ifdef CONFIG_EMM_RAMDISK_SWAP
extern int sysctl_vm_ramdisk_swaptune;
extern int sysctl_vm_swapcache_fastfree;
#endif
#endif /* CONFIG_SYSCTL */
/*
@ -2231,6 +2241,15 @@ static struct ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = overcommit_kbytes_handler,
},
{
.procname = "oom_kill_largest_task",
.data = &sysctl_oom_kill_largest_task,
.maxlen = sizeof(sysctl_oom_kill_largest_task),
.mode = 0644,
.proc_handler = &proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
{
.procname = "page-cluster",
.data = &page_cluster,
@ -2257,6 +2276,37 @@ static struct ctl_table vm_table[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_TWO_HUNDRED,
},
#ifdef CONFIG_EMM_FORCE_SWAPPINESS
{
.procname = "force_swappiness",
.data = &sysctl_vm_force_swappiness,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
#endif
#ifdef CONFIG_EMM_RAMDISK_SWAP
{
.procname = "ramdisk_swaptune",
.data = &sysctl_vm_ramdisk_swaptune,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
{
.procname = "swapcache_fastfree",
.data = &sysctl_vm_swapcache_fastfree,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
#endif
#ifdef CONFIG_NUMA
{
.procname = "numa_stat",


@ -1268,4 +1268,93 @@ config PAGECACHE_LIMIT
If unsure, say N.
config ENHANCED_MM
bool "Enable enhanced mm support (EMM)"
depends on MEMCG
default n
help
Support for EMM, including in-kernel API, extended interface, and reserved
extra data structures.
If unsure, say N.
config EMM_FORCE_SWAPPINESS
bool "Prevent kswapd from reclaim any anon pages"
depends on ENHANCED_MM
depends on SWAP
default n
help
This option prevents kswapd from reclaiming any anon pages
when swappiness is set to 0.
By default, to prevent system-wide OOM, kswapd will still reclaim
anon pages for the root cgroup even if swappiness is set to 0. This
option overrides that behaviour.
If unsure, say N.
config EMM_RAMDISK_SWAP
bool "Tune ramdisks based swaps"
depends on ENHANCED_MM
depends on SWAP
default n
help
This option enables a few tunings for ramdisk-based swap and makes
swap work better with a memcg-bound ramdisk.
If unsure, say N.
config EMM_WORKINGSET_TRACKING
bool "Evaluate memory eviction usage"
depends on ENHANCED_MM
default n
help
Evaluate per-cgroup memory eviction status. This helps estimate
the maximum (or actual) potential workingset growth pattern when
active shrinkers are enabled. Because active shrinkers may reduce
the in-memory LRU size and change the workingset size, estimating
a "raw workingset" can help analyze and improve memory usage.
config EMM_MEMCG
bool "Enhanced memory management support for memcg."
depends on ENHANCED_MM
depends on MEMCG
default y
help
This enables enhanced memory management support for memcg.
config EMM_RECLAIM
bool "Enhanced memory reclaim support."
depends on ENHANCED_MM
depends on MEMCG
default y
help
This enables enhanced memory reclaim support.
config EMM_ZRAM_CONF
bool "A place holder to Ensure required ZRAM configures are enabled."
select CRYPTO_LZO
select CRYPTO_ZSTD
select CRYPTO_LZ4
select CRYPTO_LZO
select CRYPTO_LZ4HC
select MEMCG_ZRAM
select ZSMALLOC
default n
help
A placeholder to ensure the required ZRAM configs are enabled.
config TEXT_UNEVICTABLE
bool "Enable memcg granularity code section unevictable"
depends on MEMCG
default n
help
This feature pins the code sections of processes in a memcg, locking
the corresponding VMAs like mlock does. It is added to prevent
performance jitter when the code sections of key applications are
reclaimed within a memcg.
If unsure, say N.
endmenu


@ -99,6 +99,7 @@ obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
ifdef CONFIG_SWAP
obj-$(CONFIG_MEMCG) += swap_cgroup.o
endif
obj-$(CONFIG_EMM_MEMCG) += emm.o
obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
obj-$(CONFIG_GUP_TEST) += gup_test.o
obj-$(CONFIG_DMAPOOL_TEST) += dmapool_test.o
@ -138,3 +139,4 @@ obj-$(CONFIG_IO_MAPPING) += io-mapping.o
obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o
obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o
obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
obj-y += unevictable.o

mm/emm.c (new file, 109 lines)

@ -0,0 +1,109 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/compiler.h>
#include <linux/mm_types.h>
#include <linux/cgroup.h>
#include <linux/memcontrol.h>
#include <linux/emm.h>
#include <asm-generic/bug.h>
struct emm_memcg_ops *__emm_memcg_ops __read_mostly;
static int _emm_do_memcg_init(struct mem_cgroup *memcg)
{
struct emm_memcg_ops *ops;
lockdep_assert_held(&cgroup_mutex);
ops = READ_ONCE(__emm_memcg_ops);
if (ops)
return ops->init(memcg);
return 0;
}
static void _emm_do_memcg_exit(struct mem_cgroup *memcg)
{
struct emm_memcg_ops *ops;
lockdep_assert_held(&cgroup_mutex);
ops = READ_ONCE(__emm_memcg_ops);
if (ops)
ops->exit(memcg);
}
int emm_memcg_init(struct mem_cgroup *memcg)
{
return _emm_do_memcg_init(memcg);
}
void emm_memcg_exit(struct mem_cgroup *memcg)
{
/* cgroup should be dying */
WARN_ON_ONCE(!css_is_dying(&memcg->css));
_emm_do_memcg_exit(memcg);
}
int emm_init(struct emm_memcg_ops *ops)
{
int ret = 0;
struct mem_cgroup *memcg;
/*
* Going to iterate through existing cgroups,
* also use it to protect __emm_memcg_ops
*/
cgroup_lock();
if (READ_ONCE(__emm_memcg_ops)) {
ret = -EBUSY;
goto out;
}
WRITE_ONCE(__emm_memcg_ops, ops);
memcg = mem_cgroup_iter(NULL, NULL, NULL);
do {
_emm_do_memcg_init(memcg);
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
out:
cgroup_unlock();
return ret;
}
EXPORT_SYMBOL(emm_init);
int emm_exit(void)
{
int ret = 0;
struct mem_cgroup *memcg;
/*
* Going to iterate through existing cgroups,
* also use it to protect __emm_memcg_ops
*/
cgroup_lock();
if (!READ_ONCE(__emm_memcg_ops)) {
ret = -EINVAL;
goto out;
}
memcg = mem_cgroup_iter(NULL, NULL, NULL);
do {
_emm_do_memcg_exit(memcg);
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
WRITE_ONCE(__emm_memcg_ops, NULL);
out:
cgroup_unlock();
return ret;
}
EXPORT_SYMBOL(emm_exit);
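Since emm_init()/emm_exit() are exported, an EMM provider can be a loadable module. Below is a hedged sketch of such a provider (all demo_* names are hypothetical): emm_init() walks every existing memcg under cgroup_mutex and returns -EBUSY if another provider is already registered.

#include <linux/module.h>
#include <linux/memcontrol.h>
#include <linux/emm.h>

static int demo_memcg_init(struct mem_cgroup *memcg)
{
        /* Attach per-cgroup state here, e.g. via memcg->emm_memcg_data. */
        return 0;
}

static void demo_memcg_exit(struct mem_cgroup *memcg)
{
        /* Release whatever demo_memcg_init() attached. */
}

static struct emm_memcg_ops demo_ops = {
        .init = demo_memcg_init,
        .exit = demo_memcg_exit,
};

static int __init demo_emm_init(void)
{
        return emm_init(&demo_ops);     /* -EBUSY if another provider is active */
}

static void __exit demo_emm_exit(void)
{
        emm_exit();
}

module_init(demo_emm_init);
module_exit(demo_emm_exit);
MODULE_LICENSE("GPL");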


@ -4195,7 +4195,7 @@ static void filemap_cachestat(struct address_space *mapping,
shadow = get_shadow_from_swap_cache(swp);
}
#endif
if (workingset_test_recent(shadow, true, &workingset))
if (workingset_test_recent(shadow, true, &workingset, false))
cs->nr_recently_evicted += nr_pages;
goto resched;


@ -63,12 +63,16 @@
#include <linux/resume_user_mode.h>
#include <linux/psi.h>
#include <linux/seq_buf.h>
#include <linux/emm.h>
#include <linux/sched/isolation.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
#include "slab.h"
#include "swap.h"
#ifdef CONFIG_TEXT_UNEVICTABLE
#include <linux/unevictable.h>
#endif
#include <linux/uaccess.h>
@ -78,6 +82,7 @@ struct cgroup_subsys memory_cgrp_subsys __read_mostly;
EXPORT_SYMBOL(memory_cgrp_subsys);
struct mem_cgroup *root_mem_cgroup __read_mostly;
EXPORT_SYMBOL_GPL(root_mem_cgroup);
/* Active memory cgroup to use from an interrupt context */
DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
@ -87,7 +92,8 @@ EXPORT_PER_CPU_SYMBOL_GPL(int_active_memcg);
static bool cgroup_memory_nosocket __ro_after_init;
/* Kernel memory accounting disabled? */
static bool cgroup_memory_nokmem __ro_after_init;
bool cgroup_memory_nokmem __ro_after_init = IS_ENABLED(CONFIG_MEMCG_KMEM_DEFAULT_OFF);
EXPORT_SYMBOL(cgroup_memory_nokmem);
/* BPF memory accounting disabled? */
static bool cgroup_memory_nobpf __ro_after_init;
@ -570,116 +576,6 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
return mz;
}
/*
* memcg and lruvec stats flushing
*
* Many codepaths leading to stats update or read are performance sensitive and
* adding stats flushing in such codepaths is not desirable. So, to optimize the
* flushing the kernel does:
*
* 1) Periodically and asynchronously flush the stats every 2 seconds to not let
* rstat update tree grow unbounded.
*
* 2) Flush the stats synchronously on reader side only when there are more than
* (MEMCG_CHARGE_BATCH * nr_cpus) update events. Though this optimization
* will let stats be out of sync by atmost (MEMCG_CHARGE_BATCH * nr_cpus) but
* only for 2 seconds due to (1).
*/
static void flush_memcg_stats_dwork(struct work_struct *w);
static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
static DEFINE_PER_CPU(unsigned int, stats_updates);
static atomic_t stats_flush_ongoing = ATOMIC_INIT(0);
static atomic_t stats_flush_threshold = ATOMIC_INIT(0);
static u64 flush_next_time;
#define FLUSH_TIME (2UL*HZ)
/*
* Accessors to ensure that preemption is disabled on PREEMPT_RT because it can
* not rely on this as part of an acquired spinlock_t lock. These functions are
* never used in hardirq context on PREEMPT_RT and therefore disabling preemtion
* is sufficient.
*/
static void memcg_stats_lock(void)
{
preempt_disable_nested();
VM_WARN_ON_IRQS_ENABLED();
}
static void __memcg_stats_lock(void)
{
preempt_disable_nested();
}
static void memcg_stats_unlock(void)
{
preempt_enable_nested();
}
static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
{
unsigned int x;
if (!val)
return;
cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
x = __this_cpu_add_return(stats_updates, abs(val));
if (x > MEMCG_CHARGE_BATCH) {
/*
* If stats_flush_threshold exceeds the threshold
* (>num_online_cpus()), cgroup stats update will be triggered
* in __mem_cgroup_flush_stats(). Increasing this var further
* is redundant and simply adds overhead in atomic update.
*/
if (atomic_read(&stats_flush_threshold) <= num_online_cpus())
atomic_add(x / MEMCG_CHARGE_BATCH, &stats_flush_threshold);
__this_cpu_write(stats_updates, 0);
}
}
static void do_flush_stats(void)
{
/*
* We always flush the entire tree, so concurrent flushers can just
* skip. This avoids a thundering herd problem on the rstat global lock
* from memcg flushers (e.g. reclaim, refault, etc).
*/
if (atomic_read(&stats_flush_ongoing) ||
atomic_xchg(&stats_flush_ongoing, 1))
return;
WRITE_ONCE(flush_next_time, jiffies_64 + 2*FLUSH_TIME);
cgroup_rstat_flush(root_mem_cgroup->css.cgroup);
atomic_set(&stats_flush_threshold, 0);
atomic_set(&stats_flush_ongoing, 0);
}
void mem_cgroup_flush_stats(void)
{
if (atomic_read(&stats_flush_threshold) > num_online_cpus())
do_flush_stats();
}
void mem_cgroup_flush_stats_ratelimited(void)
{
if (time_after64(jiffies_64, READ_ONCE(flush_next_time)))
mem_cgroup_flush_stats();
}
static void flush_memcg_stats_dwork(struct work_struct *w)
{
/*
* Always flush here so that flushing in latency-sensitive paths is
* as cheap as possible.
*/
do_flush_stats();
queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
}
/* Subset of vm_event_item to report for memcg event stats */
static const unsigned int memcg_vm_event_stat[] = {
PGPGIN,
@ -724,6 +620,15 @@ static inline int memcg_events_index(enum vm_event_item idx)
}
struct memcg_vmstats_percpu {
/* Stats updates since the last flush */
unsigned int stats_updates;
/* Cached pointers for fast iteration in memcg_rstat_updated() */
struct memcg_vmstats_percpu *parent;
struct memcg_vmstats *vmstats;
/* The above should fit a single cacheline for memcg_rstat_updated() */
/* Local (CPU and cgroup) page state & events */
long state[MEMCG_NR_STAT];
unsigned long events[NR_MEMCG_EVENTS];
@ -735,7 +640,7 @@ struct memcg_vmstats_percpu {
/* Cgroup1: threshold notifications & softlimit tree updates */
unsigned long nr_page_events;
unsigned long targets[MEM_CGROUP_NTARGETS];
};
} ____cacheline_aligned;
struct memcg_vmstats {
/* Aggregated (CPU and subtree) page state & events */
@ -749,8 +654,133 @@ struct memcg_vmstats {
/* Pending child counts during tree propagation */
long state_pending[MEMCG_NR_STAT];
unsigned long events_pending[NR_MEMCG_EVENTS];
/* Stats updates since the last flush */
atomic64_t stats_updates;
};
/*
* memcg and lruvec stats flushing
*
* Many codepaths leading to stats update or read are performance sensitive and
* adding stats flushing in such codepaths is not desirable. So, to optimize the
* flushing the kernel does:
*
* 1) Periodically and asynchronously flush the stats every 2 seconds to not let
* rstat update tree grow unbounded.
*
* 2) Flush the stats synchronously on reader side only when there are more than
* (MEMCG_CHARGE_BATCH * nr_cpus) update events. Though this optimization
* can let the stats go out of sync by at most (MEMCG_CHARGE_BATCH * nr_cpus)
* events, but only for 2 seconds due to (1).
*/
static void flush_memcg_stats_dwork(struct work_struct *w);
static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
static u64 flush_last_time;
#define FLUSH_TIME (2UL*HZ)
/*
* Accessors to ensure that preemption is disabled on PREEMPT_RT because it can
* not rely on this as part of an acquired spinlock_t lock. These functions are
* never used in hardirq context on PREEMPT_RT and therefore disabling preemption
* is sufficient.
*/
static void memcg_stats_lock(void)
{
preempt_disable_nested();
VM_WARN_ON_IRQS_ENABLED();
}
static void __memcg_stats_lock(void)
{
preempt_disable_nested();
}
static void memcg_stats_unlock(void)
{
preempt_enable_nested();
}
static bool memcg_vmstats_needs_flush(struct memcg_vmstats *vmstats)
{
return atomic64_read(&vmstats->stats_updates) >
MEMCG_CHARGE_BATCH * num_online_cpus();
}
static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
{
struct memcg_vmstats_percpu *statc;
int cpu = smp_processor_id();
if (!val)
return;
cgroup_rstat_updated(memcg->css.cgroup, cpu);
statc = this_cpu_ptr(memcg->vmstats_percpu);
for (; statc; statc = statc->parent) {
statc->stats_updates += abs(val);
if (statc->stats_updates < MEMCG_CHARGE_BATCH)
continue;
/*
* If @memcg is already flush-able, increasing stats_updates is
* redundant. Avoid the overhead of the atomic update.
*/
if (!memcg_vmstats_needs_flush(statc->vmstats))
atomic64_add(statc->stats_updates,
&statc->vmstats->stats_updates);
statc->stats_updates = 0;
}
}
static void do_flush_stats(struct mem_cgroup *memcg)
{
if (mem_cgroup_is_root(memcg))
WRITE_ONCE(flush_last_time, jiffies_64);
cgroup_rstat_flush(memcg->css.cgroup);
}
/*
* mem_cgroup_flush_stats - flush the stats of a memory cgroup subtree
* @memcg: root of the subtree to flush
*
* Flushing is serialized by the underlying global rstat lock. There is also a
* minimum amount of work to be done even if there are no stat updates to flush.
* Hence, we only flush the stats if the updates delta exceeds a threshold. This
* avoids unnecessary work and contention on the underlying lock.
*/
void mem_cgroup_flush_stats(struct mem_cgroup *memcg)
{
if (mem_cgroup_disabled())
return;
if (!memcg)
memcg = root_mem_cgroup;
if (memcg_vmstats_needs_flush(memcg->vmstats))
do_flush_stats(memcg);
}
void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg)
{
/* Only flush if the periodic flusher is one full cycle late */
if (time_after64(jiffies_64, READ_ONCE(flush_last_time) + 2*FLUSH_TIME))
mem_cgroup_flush_stats(memcg);
}
static void flush_memcg_stats_dwork(struct work_struct *w)
{
/*
* Deliberately ignore memcg_vmstats_needs_flush() here so that flushing
* in latency-sensitive paths is as cheap as possible.
*/
do_flush_stats(root_mem_cgroup);
queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
}
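To make the batching above concrete, here is a small user-space model (an editor's illustration, not part of the patch; the constant and the two counters borrow their names from the kernel code above). Per-CPU deltas are folded into the shared counter only in MEMCG_CHARGE_BATCH-sized chunks, and a reader-side flush is considered worthwhile only once the shared counter exceeds MEMCG_CHARGE_BATCH * nr_cpus:

#include <stdio.h>
#include <stdlib.h>

#define MEMCG_CHARGE_BATCH 64		/* same name as the kernel constant */
#define NR_CPUS 4

static long percpu_updates[NR_CPUS];	/* models memcg_vmstats_percpu::stats_updates */
static long pending_updates;		/* models memcg_vmstats::stats_updates */

/* Models memcg_rstat_updated(): fold per-CPU deltas in batches */
static void record_update(int cpu, int val)
{
	percpu_updates[cpu] += labs(val);
	if (percpu_updates[cpu] < MEMCG_CHARGE_BATCH)
		return;
	pending_updates += percpu_updates[cpu];
	percpu_updates[cpu] = 0;
}

/* Models memcg_vmstats_needs_flush() */
static int needs_flush(void)
{
	return pending_updates > MEMCG_CHARGE_BATCH * NR_CPUS;
}

int main(void)
{
	int i;

	for (i = 0; i < 1000; i++)	/* 1000 single-page updates */
		record_update(i % NR_CPUS, 1);
	printf("pending=%ld flush=%s\n", pending_updates,
	       needs_flush() ? "yes" : "no");
	return 0;
}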
unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
{
long x = READ_ONCE(memcg->vmstats->state[idx]);
@ -760,6 +790,7 @@ unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
#endif
return x;
}
EXPORT_SYMBOL_GPL(memcg_page_state);
/**
* __mod_memcg_state - update cgroup memory statistics
@ -775,6 +806,7 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
__this_cpu_add(memcg->vmstats_percpu->state[idx], val);
memcg_rstat_updated(memcg, val);
}
EXPORT_SYMBOL_GPL(__mod_memcg_state);
/* idx can be of type enum memcg_stat_item or node_stat_item. */
static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
@ -1198,6 +1230,7 @@ out_unlock:
return memcg;
}
EXPORT_SYMBOL_GPL(mem_cgroup_iter);
/**
* mem_cgroup_iter_break - abort a hierarchy walk prematurely
@ -1503,6 +1536,10 @@ static const struct memory_stat memory_stats[] = {
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
{ "zswap", MEMCG_ZSWAP_B },
{ "zswapped", MEMCG_ZSWAPPED },
#endif
#ifdef CONFIG_MEMCG_ZRAM
{ "zram", MEMCG_ZRAM_B },
{ "zrammed", MEMCG_ZRAMED },
#endif
{ "file_mapped", NR_FILE_MAPPED },
{ "file_dirty", NR_FILE_DIRTY },
@ -1539,6 +1576,7 @@ static int memcg_page_state_unit(int item)
switch (item) {
case MEMCG_PERCPU_B:
case MEMCG_ZSWAP_B:
case MEMCG_ZRAM_B:
case NR_SLAB_RECLAIMABLE_B:
case NR_SLAB_UNRECLAIMABLE_B:
case WORKINGSET_REFAULT_ANON:
@ -1576,7 +1614,7 @@ static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
*
* Current memory state:
*/
mem_cgroup_flush_stats();
mem_cgroup_flush_stats(memcg);
for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
u64 size;
@ -3400,11 +3438,13 @@ int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
return ret;
}
EXPORT_SYMBOL_GPL(obj_cgroup_charge);
void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
{
refill_obj_stock(objcg, size, true);
}
EXPORT_SYMBOL_GPL(obj_cgroup_uncharge);
#endif /* CONFIG_MEMCG_KMEM */
@ -3675,6 +3715,10 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
global_node_page_state(NR_ANON_MAPPED);
if (swap)
val += total_swap_pages - get_nr_swap_pages();
#ifdef CONFIG_MEMCG_ZRAM
else
val += memcg_page_state(memcg, MEMCG_ZRAM_B) / PAGE_SIZE;
#endif
} else {
if (!swap)
val = page_counter_read(&memcg->memory);
@ -4026,7 +4070,7 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v)
int nid;
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
mem_cgroup_flush_stats();
mem_cgroup_flush_stats(memcg);
for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
seq_printf(m, "%s=%lu", stat->name,
@ -4101,7 +4145,7 @@ static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
mem_cgroup_flush_stats();
mem_cgroup_flush_stats(memcg);
for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
unsigned long nr;
@ -4173,6 +4217,18 @@ static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
#endif
}
#ifdef CONFIG_TEXT_UNEVICTABLE
static int memcg_unevict_size_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
seq_printf(m, "unevictable_text_size_kb %lu\n",
memcg_exstat_text_unevict_gather(memcg) >> 10);
return 0;
}
#endif
static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
@ -4603,7 +4659,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
struct mem_cgroup *parent;
mem_cgroup_flush_stats();
mem_cgroup_flush_stats(memcg);
*pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
*pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
@ -5339,8 +5395,6 @@ static int mem_cgroup_vmstat_read(struct seq_file *m, void *vv)
return mem_cgroup_vmstat_read_comm(m, vv, memcg);
}
static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off);
static u64 memory_current_read(struct cgroup_subsys_state *css,
struct cftype *cft);
static int memory_low_show(struct seq_file *m, void *v);
@ -5354,12 +5408,77 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off);
static int memory_events_show(struct seq_file *m, void *v);
#ifdef CONFIG_TEXT_UNEVICTABLE
static u64 mem_cgroup_allow_unevictable_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
return memcg->allow_unevictable;
}
static int mem_cgroup_allow_unevictable_write(struct cgroup_subsys_state *css,
struct cftype *cft, u64 val)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
if (val > 1)
return -EINVAL;
if (memcg->allow_unevictable == val)
return 0;
memcg->allow_unevictable = val;
if (val)
memcg_all_processes_unevict(memcg, true);
else
memcg_all_processes_unevict(memcg, false);
return 0;
}
static u64 mem_cgroup_unevictable_percent_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
return memcg->unevictable_percent;
}
static int mem_cgroup_unevictable_percent_write(struct cgroup_subsys_state *css,
struct cftype *cft, u64 val)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
if (val > 100)
return -EINVAL;
memcg->unevictable_percent = val;
return 0;
}
#endif
static struct cftype mem_cgroup_legacy_files[] = {
{
.name = "usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
.read_u64 = mem_cgroup_read_u64,
},
#ifdef CONFIG_TEXT_UNEVICTABLE
{
.name = "allow_text_unevictable",
.read_u64 = mem_cgroup_allow_unevictable_read,
.write_u64 = mem_cgroup_allow_unevictable_write,
},
{
.name = "text_unevictable_percent",
.read_u64 = mem_cgroup_unevictable_percent_read,
.write_u64 = mem_cgroup_unevictable_percent_write,
},
{
.name = "text_unevictable_size",
.seq_show = memcg_unevict_size_show,
},
#endif
{
.name = "max_usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
@ -5529,11 +5648,6 @@ static struct cftype mem_cgroup_legacy_files[] = {
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = memory_events_show,
},
{
.name = "reclaim",
.flags = CFTYPE_NS_DELEGATABLE,
.write = memory_reclaim,
},
{ }, /* terminate */
};
@ -5679,10 +5793,11 @@ static void mem_cgroup_free(struct mem_cgroup *memcg)
__mem_cgroup_free(memcg);
}
static struct mem_cgroup *mem_cgroup_alloc(void)
static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
{
struct memcg_vmstats_percpu *statc, *pstatc;
struct mem_cgroup *memcg;
int node;
int node, cpu;
int __maybe_unused i;
long error = -ENOMEM;
@ -5706,10 +5821,21 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
if (!memcg->vmstats_percpu)
goto fail;
for_each_possible_cpu(cpu) {
if (parent)
pstatc = per_cpu_ptr(parent->vmstats_percpu, cpu);
statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
statc->parent = parent ? pstatc : NULL;
statc->vmstats = memcg->vmstats;
}
for_each_node(node)
if (alloc_mem_cgroup_per_node_info(memcg, node))
goto fail;
if (emm_memcg_init(memcg))
goto fail;
if (memcg_wb_domain_init(memcg, GFP_KERNEL))
goto fail;
@ -5751,7 +5877,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
struct mem_cgroup *memcg, *old_memcg;
old_memcg = set_active_memcg(parent);
memcg = mem_cgroup_alloc();
memcg = mem_cgroup_alloc(parent);
set_active_memcg(old_memcg);
if (IS_ERR(memcg))
return ERR_CAST(memcg);
@ -5760,12 +5886,24 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
WRITE_ONCE(memcg->soft_limit, PAGE_COUNTER_MAX);
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
memcg->zswap_max = PAGE_COUNTER_MAX;
#endif
#ifdef CONFIG_MEMCG_ZRAM
memcg->zram_max = PAGE_COUNTER_MAX;
#endif
page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
#ifdef CONFIG_TEXT_UNEVICTABLE
memcg->unevictable_percent = 100;
atomic_long_set(&memcg->unevictable_size, 0);
#endif
if (parent) {
#ifdef CONFIG_TEXT_UNEVICTABLE
memcg->allow_unevictable = parent->allow_unevictable;
#endif
WRITE_ONCE(memcg->swappiness, mem_cgroup_swappiness(parent));
WRITE_ONCE(memcg->oom_kill_disable, READ_ONCE(parent->oom_kill_disable));
#ifdef CONFIG_MEMCG_ZRAM
memcg->zram_prio = parent->zram_prio;
#endif
page_counter_init(&memcg->memory, &parent->memory);
page_counter_init(&memcg->swap, &parent->swap);
page_counter_init(&memcg->kmem, &parent->kmem);
@ -5872,6 +6010,8 @@ static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
invalidate_reclaim_iterators(memcg);
lru_gen_release_memcg(memcg);
emm_memcg_exit(memcg);
}
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
@ -6026,6 +6166,10 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
}
}
}
statc->stats_updates = 0;
/* We are in a per-cpu loop here, only do the atomic write once */
if (atomic64_read(&memcg->vmstats->stats_updates))
atomic64_set(&memcg->vmstats->stats_updates, 0);
}
#ifdef CONFIG_MMU
@ -6554,6 +6698,10 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
if (!p)
return 0;
#ifdef CONFIG_TEXT_UNEVICTABLE
mem_cgroup_can_unevictable(p, memcg);
#endif
/*
* We are now committed to this value whatever it is. Changes in this
* tunable will only affect upcoming migrations, not the current one.
@ -6597,6 +6745,9 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
{
#ifdef CONFIG_TEXT_UNEVICTABLE
mem_cgroup_cancel_unevictable(tset);
#endif
if (mc.to)
mem_cgroup_clear_mc();
}
@ -7027,7 +7178,7 @@ static int memory_numa_stat_show(struct seq_file *m, void *v)
int i;
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
mem_cgroup_flush_stats();
mem_cgroup_flush_stats(memcg);
for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
int nid;
@ -7758,6 +7909,8 @@ static int __init cgroup_memory(char *s)
cgroup_memory_nokmem = true;
if (!strcmp(token, "nobpf"))
cgroup_memory_nobpf = true;
if (!strcmp(token, "kmem"))
cgroup_memory_nokmem = false;
}
return 1;
}
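The new "kmem" token gives a boot-time opt back in when the kernel is built with CONFIG_MEMCG_KMEM_DEFAULT_OFF=y (which makes cgroup_memory_nokmem default to true earlier in this file). For example, appending the following to the kernel command line, via the existing cgroup.memory= parameter this parser handles, re-enables kernel memory accounting:

	cgroup.memory=kmem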
@ -8189,7 +8342,11 @@ bool obj_cgroup_may_zswap(struct obj_cgroup *objcg)
break;
}
cgroup_rstat_flush(memcg->css.cgroup);
/*
* mem_cgroup_flush_stats() ignores small changes. Use
* do_flush_stats() directly to get accurate stats for charging.
*/
do_flush_stats(memcg);
pages = memcg_page_state(memcg, MEMCG_ZSWAP_B) / PAGE_SIZE;
if (pages < max)
continue;
@ -8254,8 +8411,10 @@ void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size)
static u64 zswap_current_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
cgroup_rstat_flush(css->cgroup);
return memcg_page_state(mem_cgroup_from_css(css), MEMCG_ZSWAP_B);
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
mem_cgroup_flush_stats(memcg);
return memcg_page_state(memcg, MEMCG_ZSWAP_B);
}
static int zswap_max_show(struct seq_file *m, void *v)

View File

@ -409,7 +409,7 @@ static void mlock_vma_pages_range(struct vm_area_struct *vma,
*
* For vmas that pass the filters, merge/split as appropriate.
*/
static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
struct vm_area_struct **prev, unsigned long start,
unsigned long end, vm_flags_t newflags)
{
@ -420,7 +420,7 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
vm_flags_t oldflags = vma->vm_flags;
if (newflags == oldflags || (oldflags & VM_SPECIAL) ||
is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) ||
is_vm_hugetlb_page(vma) || vma == get_gate_vma(mm) ||
vma_is_dax(vma) || vma_is_secretmem(vma))
/* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
goto out;

View File

@ -55,6 +55,7 @@
static int sysctl_panic_on_oom;
static int sysctl_oom_kill_allocating_task;
static int sysctl_oom_dump_tasks = 1;
int sysctl_oom_kill_largest_task;
/*
* Serializes oom killer invocations (out_of_memory()) from all contexts to
@ -230,11 +231,14 @@ long oom_badness(struct task_struct *p, unsigned long totalpages)
points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
mm_pgtables_bytes(p->mm) / PAGE_SIZE;
task_unlock(p);
if (sysctl_oom_kill_largest_task)
goto ret;
/* Normalize to oom_score_adj units */
adj *= totalpages / 1000;
points += adj;
ret:
return points;
}
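A worked example of the new knob, with made-up numbers: on a host with totalpages = 1048576 (4 GiB of 4 KiB pages), a task using 200000 pages of RSS, swap and page tables with oom_score_adj = -500 normally gets adj *= 1048576 / 1000, i.e. adj = -500 * 1048 = -524000, so points = 200000 - 524000 = -324000 and the task is effectively shielded from the OOM killer. With sysctl_oom_kill_largest_task set, the normalization is skipped and the task scores its raw 200000 pages, so the largest consumer is chosen regardless of its oom_score_adj.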

View File

@ -201,8 +201,8 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
folio_end_writeback(folio);
return 0;
}
__swap_writepage(&folio->page, wbc);
return 0;
ret = __swap_writepage(&folio->page, wbc);
return ret;
}
EXPORT_SYMBOL(swap_writepage);
@ -369,11 +369,21 @@ static void swap_writepage_bdev_async(struct page *page,
submit_bio(bio);
}
void __swap_writepage(struct page *page, struct writeback_control *wbc)
int __swap_writepage(struct page *page, struct writeback_control *wbc)
{
struct swap_info_struct *sis = page_swap_info(page);
VM_BUG_ON_PAGE(!PageSwapCache(page), page);
if (data_race(sis->flags & SWP_SYNCHRONOUS_IO)) {
int ret = bdev_swapout_folio(sis->bdev, swap_page_sector(page), page_folio(page), wbc);
if (ret != -EOPNOTSUPP) {
if (!ret)
count_swpout_vm_event(page_folio(page));
return ret;
}
}
/*
* ->flags can be updated non-atomicially (scan_swap_map_slots),
* but that will never affect SWP_FS_OPS, so the data_race
@ -385,6 +395,8 @@ void __swap_writepage(struct page *page, struct writeback_control *wbc)
swap_writepage_bdev_sync(page, wbc, sis);
else
swap_writepage_bdev_async(page, wbc, sis);
return 0;
}
void swap_write_unplug(struct swap_iocb *sio)
@ -520,11 +532,18 @@ void swap_readpage(struct page *page, bool synchronous, struct swap_iocb **plug)
} else if (data_race(sis->flags & SWP_FS_OPS)) {
swap_readpage_fs(page, plug);
} else if (synchronous || (sis->flags & SWP_SYNCHRONOUS_IO)) {
int ret = bdev_swapin_folio(sis->bdev, swap_page_sector(page), folio);
if (ret != -EOPNOTSUPP) {
if (!ret)
count_vm_event(PSWPIN);
goto out;
}
swap_readpage_bdev_sync(page, sis);
} else {
swap_readpage_bdev_async(page, sis);
}
out:
if (workingset) {
delayacct_thrashing_end(&in_thrashing);
psi_memstall_leave(&pflags);

View File

@ -482,7 +482,6 @@ void folio_mark_accessed(struct folio *folio)
else
__lru_cache_activate_folio(folio);
folio_clear_referenced(folio);
workingset_activation(folio);
}
if (folio_test_idle(folio))
folio_clear_idle(folio);
@ -910,6 +909,7 @@ void lru_add_drain_all(void)
lru_add_drain();
}
#endif /* CONFIG_SMP */
EXPORT_SYMBOL_GPL(lru_add_drain_all);
atomic_t lru_disable_count = ATOMIC_INIT(0);

View File

@ -17,7 +17,7 @@ static inline void swap_read_unplug(struct swap_iocb *plug)
}
void swap_write_unplug(struct swap_iocb *sio);
int swap_writepage(struct page *page, struct writeback_control *wbc);
void __swap_writepage(struct page *page, struct writeback_control *wbc);
int __swap_writepage(struct page *page, struct writeback_control *wbc);
/* linux/mm/swap_state.c */
/* One swap address space for each 64M swap space */

View File

@ -673,11 +673,12 @@ skip:
return read_swap_cache_async(entry, gfp_mask, vma, addr, NULL);
}
int init_swap_address_space(unsigned int type, unsigned long nr_pages)
int init_swap_address_space(struct swap_info_struct *si, unsigned long nr_pages)
{
struct address_space *spaces, *space;
unsigned int i, nr;
unsigned int i, nr, type;
type = si->type;
nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
spaces = kvcalloc(nr, sizeof(struct address_space), GFP_KERNEL);
if (!spaces)
@ -689,6 +690,10 @@ int init_swap_address_space(unsigned int type, unsigned long nr_pages)
space->a_ops = &swap_aops;
/* swap cache doesn't use writeback related tags */
mapping_set_no_writeback_tags(space);
#ifdef CONFIG_EMM_RAMDISK_SWAP
if (si->bdev && bdev_ramdisk(si->bdev))
set_bit(AS_RAM_SWAP, &space->flags);
#endif
}
nr_swapper_spaces[type] = nr;
swapper_spaces[type] = spaces;

View File

@ -2802,6 +2802,11 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
int error;
if (S_ISBLK(inode->i_mode)) {
#ifdef CONFIG_ENHANCED_MM
WARN(p->swap_file->f_mapping->a_ops->swap_activate,
"Swapping on block file over filesystem %s, file system operations may get bypassed unexpectedly and lead to data loss.\n",
p->swap_file->f_inode->i_sb->s_id);
#endif
p->bdev = blkdev_get_by_dev(inode->i_rdev,
BLK_OPEN_READ | BLK_OPEN_WRITE, p, NULL);
if (IS_ERR(p->bdev)) {
@ -3199,7 +3204,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
}
}
error = init_swap_address_space(p->type, maxpages);
error = init_swap_address_space(p, maxpages);
if (error)
goto bad_swap_unlock_inode;

865
mm/unevictable.c Normal file
View File

@ -0,0 +1,865 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Pin Process Code Section:
* echo PID > /proc/unevictable/add_pid
* echo PID > /proc/unevictable/del_pid
* cat /proc/unevictable/add_pid
*
* Copyright (C) 2019 Alibaba
* Author: Xunlei Pang <xlpang@linux.alibaba.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include <linux/types.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/proc_fs.h>
#include <linux/sched/mm.h>
#include <linux/swap.h>
#include <linux/ksm.h>
#include <linux/hugetlb.h>
#include <linux/rbtree.h>
#include <linux/uaccess.h>
#include <linux/kprobes.h>
#include <linux/workqueue.h>
#include <linux/pid_namespace.h>
#ifdef CONFIG_TEXT_UNEVICTABLE
#include <linux/unevictable.h>
#endif
#define PROC_NAME "unevictable"
#define NAME_BUF 8
#ifdef CONFIG_TEXT_UNEVICTABLE
DEFINE_STATIC_KEY_FALSE(unevictable_enabled_key);
#define for_each_mem_cgroup(iter) \
for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
iter != NULL; \
iter = mem_cgroup_iter(NULL, iter, NULL))
#endif
struct evict_pids_t {
struct rb_root root;
};
struct evict_pid_entry {
struct rb_node node;
struct list_head list;
pid_t rootpid;
u64 start_time;
#ifdef CONFIG_TEXT_UNEVICTABLE
u64 unevict_size;
#endif
struct task_struct *tsk;
bool done;
};
static void execute_vm_lock(struct work_struct *unused);
static struct evict_pids_t *base_tree;
static DEFINE_MUTEX(pid_mutex);
LIST_HEAD(pid_list);
static int proc_pids_count;
static DECLARE_DELAYED_WORK(evict_work, execute_vm_lock);
struct proc_pids_t {
struct rb_root proc_pids_tree;
};
/* Called with pid_mutex held always */
static void __remove_entry(struct evict_pid_entry *pid)
{
if (pid == NULL)
return;
rb_erase(&pid->node, &base_tree->root);
proc_pids_count--;
}
/* should not be in atomic context(i.e. hrtimer) */
static void __evict_pid(struct evict_pid_entry *pid)
{
struct task_struct *tsk;
struct mm_struct *mm;
if (!pid)
return;
rcu_read_lock();
tsk = find_task_by_pid_ns(pid->rootpid, &init_pid_ns);
if (tsk)
get_task_struct(tsk);
rcu_read_unlock();
if (!tsk)
return;
if (tsk == pid->tsk && pid->start_time == tsk->start_boottime) {
mm = get_task_mm(tsk);
if (mm) {
if (!(mm->def_flags & VM_LOCKED)) {
struct vm_area_struct *vma, *prev = NULL;
vm_flags_t flag;
#ifdef CONFIG_TEXT_UNEVICTABLE
unsigned long size = 0;
struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
#endif
VMA_ITERATOR(vmi, mm, 0);
mmap_write_lock(mm);
for_each_vma(vmi, vma) {
if (vma->vm_file &&
(vma->vm_flags & VM_EXEC) &&
(vma->vm_flags & VM_READ)) {
flag = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
mlock_fixup(&vmi, vma, &prev,
vma->vm_start, vma->vm_end, flag);
#ifdef CONFIG_TEXT_UNEVICTABLE
size += vma->vm_end - vma->vm_start;
#endif
}
}
mmap_write_unlock(mm);
#ifdef CONFIG_TEXT_UNEVICTABLE
memcg_decrease_unevict_size(memcg, size);
css_put(&memcg->css);
pid->unevict_size -= size;
#endif
}
mmput(mm);
}
}
put_task_struct(tsk);
}
static struct evict_pid_entry *lookup_unevict_entry(struct task_struct *tsk)
{
struct evict_pid_entry *entry, *result;
struct rb_node *parent = NULL;
struct rb_node **link;
pid_t rootpid;
if (!tsk)
return NULL;
rcu_read_lock();
get_task_struct(tsk);
rootpid = __task_pid_nr_ns(tsk, PIDTYPE_PID, &init_pid_ns);
put_task_struct(tsk);
rcu_read_unlock();
result = NULL;
link = &base_tree->root.rb_node;
/* Maybe the unevictable feature is not ready yet */
while (*link) {
parent = *link;
entry = rb_entry(parent, struct evict_pid_entry, node);
if (rootpid < entry->rootpid)
link = &(*link)->rb_left;
else if (rootpid > entry->rootpid)
link = &(*link)->rb_right;
else {
result = entry;
break;
}
}
return result;
}
void del_unevict_task(struct task_struct *tsk)
{
struct evict_pid_entry *result;
if (!tsk) {
struct evict_pid_entry *pid_entry, *tmp;
mutex_lock(&pid_mutex);
list_for_each_entry_safe(pid_entry, tmp, &pid_list, list) {
rcu_read_lock();
tsk = find_task_by_pid_ns(pid_entry->rootpid,
&init_pid_ns);
rcu_read_unlock();
if (!tsk) {
list_del(&pid_entry->list);
__remove_entry(pid_entry);
kfree(pid_entry);
}
}
mutex_unlock(&pid_mutex);
return;
}
mutex_lock(&pid_mutex);
result = lookup_unevict_entry(tsk);
if (result) {
list_del(&result->list);
__remove_entry(result);
mutex_unlock(&pid_mutex);
__evict_pid(result);
kfree(result);
} else
mutex_unlock(&pid_mutex);
}
static void evict_pid(pid_t pid)
{
struct task_struct *tsk;
if (pid <= 0)
return;
rcu_read_lock();
tsk = find_task_by_pid_ns(pid, task_active_pid_ns(current));
if (!tsk) {
rcu_read_unlock();
return;
}
get_task_struct(tsk);
rcu_read_unlock();
del_unevict_task(tsk);
put_task_struct(tsk);
}
static void add_unevict_task(struct task_struct *tsk)
{
struct evict_pid_entry *entry, *new_entry, *result;
struct rb_node *parent = NULL;
struct rb_node **link;
pid_t rootpid;
if (!tsk)
return;
new_entry = kzalloc(sizeof(*new_entry), GFP_NOWAIT);
if (!new_entry)
return;
result = NULL;
get_task_struct(tsk);
rootpid = __task_pid_nr_ns(tsk, PIDTYPE_PID, &init_pid_ns);
put_task_struct(tsk);
mutex_lock(&pid_mutex);
link = &base_tree->root.rb_node;
while (*link) {
parent = *link;
entry = rb_entry(parent, struct evict_pid_entry, node);
if (rootpid < entry->rootpid) {
link = &(*link)->rb_left;
} else if (rootpid > entry->rootpid) {
link = &(*link)->rb_right;
} else {
result = entry;
break;
}
}
if (!result) {
result = new_entry;
result->rootpid = rootpid;
#ifdef CONFIG_TEXT_UNEVICTABLE
result->unevict_size = 0;
#endif
rb_link_node(&result->node, parent, link);
rb_insert_color(&result->node, &base_tree->root);
list_add_tail(&result->list, &pid_list);
proc_pids_count++;
mutex_unlock(&pid_mutex);
} else {
rcu_read_lock();
tsk = find_task_by_pid_ns(rootpid, &init_pid_ns);
if (tsk)
get_task_struct(tsk);
rcu_read_unlock();
if (!tsk) {
list_del(&result->list);
__remove_entry(result);
mutex_unlock(&pid_mutex);
kfree(result);
kfree(new_entry);
return;
} else if (tsk != result->tsk ||
result->start_time != tsk->start_boottime) {
result->done = false;
}
put_task_struct(tsk);
mutex_unlock(&pid_mutex);
kfree(new_entry);
}
}
static void unevict_pid(pid_t pid)
{
struct task_struct *tsk;
if (pid <= 0)
return;
rcu_read_lock();
tsk = find_task_by_pid_ns(pid, task_active_pid_ns(current));
if (!tsk) {
rcu_read_unlock();
return;
}
get_task_struct(tsk);
rcu_read_unlock();
#ifdef CONFIG_TEXT_UNEVICTABLE
if (is_memcg_unevictable_enabled(mem_cgroup_from_task(tsk))) {
put_task_struct(tsk);
return;
}
#endif
add_unevict_task(tsk);
put_task_struct(tsk);
}
struct add_pid_seq_context {
int idx;
int count;
int pids[];
};
/*
* Note there exists a race condition: we may get inconsistent snapshots of
* the pid array if add_pid_start() is called for more than one round while
* users add or delete pids. This is acceptable because the pids may still
* change even after we take a consistent snapshot to show.
*/
static void *add_pid_start(struct seq_file *m, loff_t *pos)
{
struct add_pid_seq_context *ctx = NULL;
struct evict_pid_entry *pid_entry;
struct task_struct *tsk;
struct evict_pid_entry *tmp;
pid_t pid;
mutex_lock(&pid_mutex);
if (*pos >= proc_pids_count)
goto done;
ctx = kvzalloc(sizeof(*ctx) + proc_pids_count * sizeof(int), GFP_KERNEL);
if (unlikely(!ctx))
goto done;
if (proc_pids_count > 0) {
list_for_each_entry_safe(pid_entry, tmp, &pid_list, list) {
rcu_read_lock();
tsk = find_task_by_pid_ns(pid_entry->rootpid,
&init_pid_ns);
if (tsk) {
get_task_struct(tsk);
pid = __task_pid_nr_ns(tsk, PIDTYPE_PID,
task_active_pid_ns(current));
put_task_struct(tsk);
} else {
pid = -1;
}
rcu_read_unlock();
if (pid != -1) {
ctx->pids[ctx->count++] = pid;
} else {
list_del(&pid_entry->list);
__remove_entry(pid_entry);
kfree(pid_entry);
}
}
}
if (*pos >= ctx->count)
goto done;
mutex_unlock(&pid_mutex);
ctx->idx = *pos;
m->private = ctx;
return ctx;
done:
mutex_unlock(&pid_mutex);
kvfree(ctx);
return NULL;
}
static void *add_pid_next(struct seq_file *m, void *p, loff_t *pos)
{
struct add_pid_seq_context *ctx = p;
ctx->idx = ++*pos;
return (ctx->idx < ctx->count) ? ctx : NULL;
}
static void add_pid_stop(struct seq_file *m, void *p)
{
kvfree(m->private);
m->private = NULL;
}
static int add_pid_show(struct seq_file *m, void *p)
{
struct add_pid_seq_context *ctx = p;
seq_printf(m, "%d", ctx->pids[ctx->idx]);
seq_putc(m, (ctx->idx == ctx->count - 1) ? '\n' : ',');
return 0;
}
static const struct seq_operations seq_add_pid_op = {
.start = add_pid_start,
.next = add_pid_next,
.stop = add_pid_stop,
.show = add_pid_show,
};
static int proc_open_add_pid(struct inode *inode, struct file *file)
{
return seq_open(file, &seq_add_pid_op);
}
static void execute_vm_lock(struct work_struct *unused)
{
struct task_struct *tsk;
struct mm_struct *mm;
struct evict_pid_entry *result, *tmp;
pid_t rootpid;
if (!mutex_trylock(&pid_mutex)) {
goto out;
}
if (proc_pids_count <= 0) {
mutex_unlock(&pid_mutex);
goto out;
}
list_for_each_entry_safe(result, tmp, &pid_list, list) {
rootpid = result->rootpid;
if (result->done || rootpid <= 0)
continue;
rcu_read_lock();
tsk = find_task_by_pid_ns(rootpid, &init_pid_ns);
if (tsk)
get_task_struct(tsk);
rcu_read_unlock();
if (!tsk) {
list_del(&result->list);
__remove_entry(result);
kfree(result);
continue;
}
mm = get_task_mm(tsk);
if (mm && !(mm->def_flags & VM_LOCKED)) {
#ifdef CONFIG_TEXT_UNEVICTABLE
struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
#endif
struct vm_area_struct *vma, *prev = NULL;
vm_flags_t flag;
VMA_ITERATOR(vmi, mm, 0);
mmap_write_lock(mm);
for_each_vma(vmi, vma) {
#ifdef CONFIG_TEXT_UNEVICTABLE
if (is_unevictable_size_overflow(memcg))
break;
#endif
if (vma->vm_file &&
(vma->vm_flags & VM_EXEC) &&
(vma->vm_flags & VM_READ)) {
flag = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
flag |= (VM_LOCKED | VM_LOCKONFAULT);
mlock_fixup(&vmi, vma, &prev,
vma->vm_start, vma->vm_end, flag);
#ifdef CONFIG_TEXT_UNEVICTABLE
result->unevict_size += vma->vm_end - vma->vm_start;
#endif
}
}
result->tsk = tsk;
result->start_time = tsk->start_boottime;
result->done = true;
mmap_write_unlock(mm);
#ifdef CONFIG_TEXT_UNEVICTABLE
memcg_increase_unevict_size(memcg,
result->unevict_size);
css_put(&memcg->css);
#endif
} else {
list_del(&result->list);
__remove_entry(result);
kfree(result);
}
if (mm)
mmput(mm);
if (tsk)
put_task_struct(tsk);
}
mutex_unlock(&pid_mutex);
out:
return;
}
static ssize_t proc_write_add_pid(struct file *file,
const char __user *buffer, size_t count, loff_t *ppos)
{
char buf[NAME_BUF];
int err;
long pid;
int ret = count;
if (count > NAME_BUF - 1) {
ret = -EINVAL;
goto out;
}
memset(buf, 0, sizeof(buf));
if (copy_from_user(buf, buffer, count)) {
ret = -EFAULT;
goto out;
}
err = kstrtol(strstrip(buf), 0, &pid);
if (err || pid <= 0) {
ret = -EINVAL;
goto out;
} else {
unevict_pid((pid_t)pid);
schedule_delayed_work(&evict_work, HZ);
}
out:
return ret;
}
static ssize_t proc_write_del_pid(struct file *file,
const char __user *buffer, size_t count, loff_t *ppos)
{
char buf[NAME_BUF];
int err;
long pid;
int ret = count;
memset(buf, 0, sizeof(buf));
if (count > NAME_BUF - 1) {
ret = -EINVAL;
goto out;
}
if (copy_from_user(buf, buffer, count)) {
ret = -EFAULT;
goto out;
}
err = kstrtol(strstrip(buf), 0, &pid);
if (err || pid <= 0) {
ret = -EINVAL;
goto out;
} else {
evict_pid(pid);
}
out:
return ret;
}
static const struct proc_ops add_proc_fops = {
.proc_open = proc_open_add_pid,
.proc_read = seq_read,
.proc_write = proc_write_add_pid,
.proc_lseek = seq_lseek,
.proc_release = seq_release,
};
static const struct proc_ops del_proc_fops = {
.proc_write = proc_write_del_pid,
};
#ifdef CONFIG_TEXT_UNEVICTABLE
void clean_task_unevict_size(struct task_struct *tsk)
{
struct evict_pid_entry *result;
struct mem_cgroup *memcg;
/*
* Make sure the unevictable machinery has been initialized
* (base_tree is allocated) before touching it.
*/
if (!tsk || !base_tree)
return;
mutex_lock(&pid_mutex);
result = lookup_unevict_entry(tsk);
if (result) {
if (result->unevict_size) {
rcu_read_lock();
memcg = mem_cgroup_from_task(tsk);
memcg_decrease_unevict_size(memcg, result->unevict_size);
rcu_read_unlock();
}
list_del(&result->list);
__remove_entry(result);
mutex_unlock(&pid_mutex);
kfree(result);
} else
mutex_unlock(&pid_mutex);
}
bool is_memcg_unevictable_enabled(struct mem_cgroup *memcg)
{
if (!unevictable_enabled())
return false;
if (!memcg)
return false;
if (memcg->allow_unevictable)
return true;
return false;
}
void memcg_increase_unevict_size(struct mem_cgroup *memcg, unsigned long size)
{
atomic_long_add(size, &memcg->unevictable_size);
}
void memcg_decrease_unevict_size(struct mem_cgroup *memcg, unsigned long size)
{
atomic_long_sub(size, &memcg->unevictable_size);
}
bool is_unevictable_size_overflow(struct mem_cgroup *memcg)
{
struct page_counter *counter;
u64 res_limit;
u64 size;
counter = &memcg->memory;
res_limit = (u64)counter->max * PAGE_SIZE;
size = atomic_long_read(&memcg->unevictable_size);
size = size * 100 / res_limit;
if (size >= memcg->unevictable_percent)
return true;
return false;
}
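A quick worked example with illustrative numbers: with a memcg memory limit of 8 GiB and text_unevictable_percent set to 10, size * 100 / res_limit reaches 10 once the pinned text size hits 8 GiB / 10 ≈ 819 MiB, at which point is_unevictable_size_overflow() returns true and execute_vm_lock() stops pinning further VMAs for tasks in that memcg.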
unsigned long memcg_exstat_text_unevict_gather(struct mem_cgroup *memcg)
{
return atomic_long_read(&memcg->unevictable_size);
}
void mem_cgroup_can_unevictable(struct task_struct *tsk, struct mem_cgroup *to)
{
struct mem_cgroup *from;
if (!unevictable_enabled())
return;
from = mem_cgroup_from_task(tsk);
VM_BUG_ON(from == to);
if (to->allow_unevictable && !from->allow_unevictable) {
add_unevict_task(tsk);
schedule_delayed_work(&evict_work, HZ);
}
if (!to->allow_unevictable && from->allow_unevictable)
del_unevict_task(tsk);
}
void mem_cgroup_cancel_unevictable(struct cgroup_taskset *tset)
{
struct task_struct *tsk;
struct cgroup_subsys_state *dst_css;
struct mem_cgroup *memcg;
if (!unevictable_enabled())
return;
cgroup_taskset_for_each(tsk, dst_css, tset) {
memcg = mem_cgroup_from_task(tsk);
if (memcg->allow_unevictable)
del_unevict_task(tsk);
}
}
static inline int schedule_unevict_task(struct task_struct *tsk, void *arg)
{
add_unevict_task(tsk);
schedule_delayed_work(&evict_work, HZ);
return 0;
}
static inline int schedule_evict_task(struct task_struct *tsk, void *arg)
{
del_unevict_task(tsk);
return 0;
}
static inline void make_all_memcg_evictable(void)
{
struct mem_cgroup *memcg;
for_each_mem_cgroup(memcg) {
if (!memcg->allow_unevictable)
continue;
mem_cgroup_scan_tasks(memcg, schedule_unevict_task, NULL);
memcg->allow_unevictable = 0;
memcg->unevictable_percent = 100;
atomic_long_set(&memcg->unevictable_size, 0);
}
}
void memcg_all_processes_unevict(struct mem_cgroup *memcg, bool enable)
{
struct mem_cgroup *tmp_memcg;
if (!unevictable_enabled())
return;
if (!memcg)
tmp_memcg = root_mem_cgroup;
else
tmp_memcg = memcg;
if (enable)
mem_cgroup_scan_tasks(tmp_memcg, schedule_unevict_task, NULL);
else
mem_cgroup_scan_tasks(tmp_memcg, schedule_evict_task, NULL);
}
static int __init setup_unevictable(char *s)
{
if (!strcmp(s, "1"))
static_branch_enable(&unevictable_enabled_key);
else if (!strcmp(s, "0"))
static_branch_disable(&unevictable_enabled_key);
return 1;
}
__setup("unevictable=", setup_unevictable);
#ifdef CONFIG_SYSFS
static ssize_t unevictable_enabled_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return sprintf(buf, "%d\n", !!static_branch_unlikely(&unevictable_enabled_key));
}
static ssize_t unevictable_enabled_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
static DEFINE_MUTEX(mutex);
ssize_t ret = count;
mutex_lock(&mutex);
if (!strncmp(buf, "1", 1))
static_branch_enable(&unevictable_enabled_key);
else if (!strncmp(buf, "0", 1)) {
static_branch_disable(&unevictable_enabled_key);
make_all_memcg_evictable();
} else
ret = -EINVAL;
mutex_unlock(&mutex);
return ret;
}
static struct kobj_attribute unevictable_enabled_attr =
__ATTR(enabled, 0644, unevictable_enabled_show,
unevictable_enabled_store);
static struct attribute *unevictable_attrs[] = {
&unevictable_enabled_attr.attr,
NULL,
};
static struct attribute_group unevictable_attr_group = {
.attrs = unevictable_attrs,
};
static int __init unevictable_init_sysfs(void)
{
int err;
struct kobject *unevictable_kobj;
unevictable_kobj = kobject_create_and_add("unevictable", mm_kobj);
if (!unevictable_kobj) {
pr_err("failed to create unevictable kobject\n");
return -ENOMEM;
}
err = sysfs_create_group(unevictable_kobj, &unevictable_attr_group);
if (err) {
pr_err("failed to register unevictable group\n");
goto delete_obj;
}
return 0;
delete_obj:
kobject_put(unevictable_kobj);
return err;
}
#endif /* CONFIG_SYSFS */
#endif /* CONFIG_TEXT_UNEVICTABLE */
static int __init unevictable_init(void)
{
struct proc_dir_entry *monitor_dir, *add_pid_file, *del_pid_file;
monitor_dir = proc_mkdir(PROC_NAME, NULL);
if (!monitor_dir) {
pr_err("unevictpid create proc dir failed\n");
goto out;
}
add_pid_file = proc_create("add_pid", 0600,
monitor_dir, &add_proc_fops);
if (!add_pid_file)
goto out_dir;
del_pid_file = proc_create("del_pid", 0200,
monitor_dir, &del_proc_fops);
if (!del_pid_file)
goto out_add_pid;
base_tree = kzalloc(sizeof(*base_tree), GFP_KERNEL);
if (!base_tree)
goto out_del_pid;
INIT_LIST_HEAD(&pid_list);
#if defined(CONFIG_SYSFS) && defined(CONFIG_TEXT_UNEVICTABLE)
if (unevictable_init_sysfs())
pr_err("memcg text unevictable sysfs create failed\n");
#endif
return 0;
pr_err("unevictpid create proc dir failed\n");
out_del_pid:
remove_proc_entry("del_pid", monitor_dir);
out_add_pid:
remove_proc_entry("add_pid", monitor_dir);
out_dir:
remove_proc_entry(PROC_NAME, NULL);
out:
return -ENOMEM;
}
module_init(unevictable_init);
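A minimal user-space sketch of driving this interface (an editor's illustration; the /proc path comes from the header comment of this file, the /sys path is derived from the "unevictable" kobject created under mm_kobj above, and PID 1234 is made up):

#include <stdio.h>

static int write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	/* Turn the feature on (CONFIG_TEXT_UNEVICTABLE sysfs knob) ... */
	if (write_str("/sys/kernel/mm/unevictable/enabled", "1"))
		perror("enable unevictable");
	/* ... then ask for PID 1234's text VMAs to be pinned. */
	if (write_str("/proc/unevictable/add_pid", "1234"))
		perror("add_pid");
	return 0;
}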

View File

@ -57,6 +57,7 @@
#include <linux/khugepaged.h>
#include <linux/rculist_nulls.h>
#include <linux/random.h>
#include <linux/emm.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@ -167,6 +168,25 @@ struct scan_control {
/* for recording the reclaimed slab by now */
struct reclaim_state reclaim_state;
#ifdef CONFIG_EMM_RECLAIM
union {
struct {
/* Just like setting may_writepage, may_swap and may_unmap all to zero, but also forbids dropping clean cache */
unsigned int emm_aging:1;
/* Do reclaim without aging any LRU. NOTE: MM may still promote pages during reclaim if a page is found active */
unsigned int emm_reclaiming:1;
/* Do both reclaim and aging, just like normal reclaim, but the other EMM fields are also in effect. */
unsigned int emm_mix:1;
};
/* If emm_aging, emm_reclaiming or emm_mix is set, EMM is being used for this reclaim */
unsigned int emm_running:3;
};
/* One-time swappiness override. NOTE: this can be extended to 201 for anon-only reclaim */
u8 emm_swappiness;
/* Number of pages shrunk/aged/scanned, can be used by the different emm reclaim phases */
unsigned long emm_nr_taken;
#endif
};
#ifdef ARCH_HAS_PREFETCHW
@ -187,6 +207,20 @@ struct scan_control {
* From 0 .. 200. Higher means more swappy.
*/
int vm_swappiness = 60;
EXPORT_SYMBOL(vm_swappiness);
#ifdef CONFIG_EMM_FORCE_SWAPPINESS
unsigned int sysctl_vm_force_swappiness __read_mostly;
#else
#define sysctl_vm_force_swappiness 0
#endif
#ifdef CONFIG_EMM_RAMDISK_SWAP
unsigned int sysctl_vm_ramdisk_swaptune __read_mostly;
unsigned int sysctl_vm_swapcache_fastfree __read_mostly;
#else
#define sysctl_vm_swapcache_fastfree 0
#endif
LIST_HEAD(shrinker_list);
DECLARE_RWSEM(shrinker_rwsem);
@ -1353,8 +1387,35 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping,
folio_set_reclaim(folio);
res = mapping->a_ops->writepage(&folio->page, &wbc);
if (res < 0)
if (res < 0) {
#ifdef CONFIG_EMM_RAMDISK_SWAP
if (mapping_ram_swap(mapping) &&
sysctl_vm_ramdisk_swaptune) {
/*
* Return the page as activated so other
* pages could be tried when the ramdisk
* limit is hit (eg. ZRAM may then be
* able to catch a few zero pages and
* create more space), also don't leave
* a error mark.
*
* try_to_free_swap will still set PageDirty
* but nothing cleans PageReclaim if we return
* here, so just in case, tidy up page flags.
*
* TODO: We may also implement
* secondary fall back swap layer later.
*/
if (res == -ENOMEM) {
folio_set_dirty(folio);
folio_clear_reclaim(folio);
return PAGE_ACTIVATE;
}
}
#endif
handle_write_error(mapping, folio, res);
}
if (res == AOP_WRITEPAGE_ACTIVATE) {
folio_clear_reclaim(folio);
return PAGE_ACTIVATE;
@ -2132,8 +2193,8 @@ activate_locked_split:
}
activate_locked:
/* Not a candidate for swapping, so reclaim swap space. */
if (folio_test_swapcache(folio) &&
(mem_cgroup_swap_full(folio) || folio_test_mlocked(folio)))
if (folio_test_swapcache(folio) && (sysctl_vm_swapcache_fastfree ||
mem_cgroup_swap_full(folio) || folio_test_mlocked(folio)))
folio_free_swap(folio);
VM_BUG_ON_FOLIO(folio_test_active(folio), folio);
if (!folio_test_mlocked(folio)) {
@ -2539,8 +2600,6 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec,
lruvec_add_folio(lruvec, folio);
nr_pages = folio_nr_pages(folio);
nr_moved += nr_pages;
if (folio_test_active(folio))
workingset_age_nonresident(lruvec, nr_pages);
}
/*
@ -2708,6 +2767,10 @@ static void shrink_active_list(unsigned long nr_to_scan,
nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &l_hold,
&nr_scanned, sc, lru);
#ifdef CONFIG_EMM_RECLAIM
sc->emm_nr_taken += nr_taken;
#endif
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
if (!cgroup_reclaim(sc))
@ -2844,6 +2907,15 @@ unsigned long reclaim_pages(struct list_head *folio_list)
static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
struct lruvec *lruvec, struct scan_control *sc)
{
#ifdef CONFIG_EMM_RECLAIM
/* Don't set skipped_deactivate here; if all reclaim failed, simply bail out */
if (sc->emm_reclaiming && is_active_lru(lru))
return 0;
if (sc->emm_aging && !is_active_lru(lru))
return 0;
#endif
if (is_active_lru(lru)) {
if (sc->may_deactivate & (1 << is_file_lru(lru)))
shrink_active_list(nr_to_scan, lruvec, sc, lru);
@ -2923,7 +2995,7 @@ static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc)
* Flush the memory cgroup stats, so that we read accurate per-memcg
* lruvec stats for heuristics.
*/
mem_cgroup_flush_stats();
mem_cgroup_flush_stats(sc->target_mem_cgroup);
/*
* Determine the scan balance between anon and file LRUs.
@ -3035,6 +3107,20 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
unsigned long ap, fp;
enum lru_list lru;
#ifdef CONFIG_EMM_RECLAIM
if (sc->emm_running) {
swappiness = sc->emm_swappiness;
if (swappiness == 201) {
scan_balance = SCAN_ANON;
swappiness = 200;
goto out;
} else if (!swappiness) {
scan_balance = SCAN_FILE;
goto out;
}
}
#endif
/* If we have no swap space, do not bother scanning anon folios. */
if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) {
scan_balance = SCAN_FILE;
@ -3047,8 +3133,11 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
* disable swapping for individual groups completely when
* using the memory controller's swap limit feature would be
* too expensive.
*
* If sysctl_vm_force_swappiness is set, don't reclaim anon
* pages even if the system may hit OOM.
*/
if (cgroup_reclaim(sc) && !swappiness) {
if ((sysctl_vm_force_swappiness || cgroup_reclaim(sc)) && !swappiness) {
scan_balance = SCAN_FILE;
goto out;
}
@ -3243,6 +3332,8 @@ DEFINE_STATIC_KEY_ARRAY_TRUE(lru_gen_caps, NR_LRU_GEN_CAPS);
DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS);
#define get_cap(cap) static_branch_unlikely(&lru_gen_caps[cap])
#endif
/* Some modules may want to behave differently when lru_gen is enabled */
EXPORT_SYMBOL_GPL(lru_gen_caps);
static bool should_walk_mmu(void)
{
@ -3707,9 +3798,9 @@ static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain,
struct lru_gen_folio *lrugen = &lruvec->lrugen;
int hist = lru_hist_from_seq(lrugen->min_seq[type]);
pos->refaulted = lrugen->avg_refaulted[type][tier] +
pos->refaulted = atomic_long_read(&lrugen->avg_refaulted[type][tier]) +
atomic_long_read(&lrugen->refaulted[hist][type][tier]);
pos->total = lrugen->avg_total[type][tier] +
pos->total = atomic_long_read(&lrugen->avg_total[type][tier]) +
atomic_long_read(&lrugen->evicted[hist][type][tier]);
if (tier)
pos->total += lrugen->protected[hist][type][tier - 1];
@ -3734,15 +3825,15 @@ static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover)
if (carryover) {
unsigned long sum;
sum = lrugen->avg_refaulted[type][tier] +
sum = atomic_long_read(&lrugen->avg_refaulted[type][tier]) +
atomic_long_read(&lrugen->refaulted[hist][type][tier]);
WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2);
atomic_long_set(&lrugen->avg_refaulted[type][tier], sum / 2);
sum = lrugen->avg_total[type][tier] +
sum = atomic_long_read(&lrugen->avg_total[type][tier]) +
atomic_long_read(&lrugen->evicted[hist][type][tier]);
if (tier)
sum += lrugen->protected[hist][type][tier - 1];
WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2);
atomic_long_set(&lrugen->avg_total[type][tier], sum / 2);
}
if (clear) {
@ -4260,6 +4351,11 @@ restart:
walk_pmd_range(&val, addr, next, args);
#ifdef CONFIG_EMM_RECLAIM
if (walk->force_full_scan)
continue;
#endif
if (need_resched() || walk->batched >= MAX_LRU_BATCH) {
end = (addr | ~PUD_MASK) + 1;
goto done;
@ -4363,7 +4459,12 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
struct lru_gen_folio *lrugen = &lruvec->lrugen;
int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
if (type == LRU_GEN_ANON && !can_swap)
/*
* Keep tracking anon generations even if swappiness is not set when EMM
* is enabled: EMM can adjust swappiness dynamically and it may drop to 0
* from time to time, so we can't afford to lose hotness info here.
*/
if (type == LRU_GEN_ANON && !can_swap && !IS_ENABLED(CONFIG_EMM_RECLAIM))
goto done;
/* prevent cold/hot inversion if force_scan is true */
@ -5134,6 +5235,21 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw
int tier = -1;
DEFINE_MIN_SEQ(lruvec);
#ifdef CONFIG_EMM_RECLAIM
/*
* When called by EMM we must get here from run_eviction directly; otherwise
* the code is broken and needs to be fixed.
*
* For swappiness == 0, type = LRU_GEN_FILE is set below. But to force an
* ANON-only isolation we have to extend swappiness to 201 and return
* directly to avoid the FILE LRU fallback.
*/
if (sc->emm_running && sc->emm_swappiness == 201) {
*type_scanned = LRU_GEN_ANON;
return scan_folios(lruvec, sc, LRU_GEN_ANON, MAX_NR_TIERS, list);
}
#endif
/*
* Try to make the obvious choice first. When anon and file are both
* available from the same generation, interpret swappiness 1 as file
@ -5922,8 +6038,8 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
if (seq == max_seq) {
s = "RT ";
n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]);
n[1] = READ_ONCE(lrugen->avg_total[type][tier]);
n[0] = atomic_long_read(&lrugen->avg_refaulted[type][tier]);
n[1] = atomic_long_read(&lrugen->avg_total[type][tier]);
} else if (seq == min_seq[type] || NR_HIST_GENS > 1) {
s = "rep";
n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]);
@ -6289,6 +6405,43 @@ static int __init init_lru_gen(void)
};
late_initcall(init_lru_gen);
#ifdef CONFIG_ENHANCED_MM
static int __init parse_cmdlinelru_gen(char *s)
{
int i, nid;
bool enable;
if (!strcmp(s, "1") || !strcmp(s, "y"))
enable = 1;
else if (!strcmp(s, "0") || !strcmp(s, "n"))
enable = 0;
else
return 1;
for_each_node(nid) {
struct lruvec *lruvec = get_lruvec(NULL, nid);
if (!lruvec)
continue;
VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
VM_WARN_ON_ONCE(!state_is_valid(lruvec));
lruvec->lrugen.enabled = enable;
}
for (i = 0; i < NR_LRU_GEN_CAPS; i++) {
if (enable)
static_branch_enable(&lru_gen_caps[i]);
else
static_branch_disable(&lru_gen_caps[i]);
}
return 1;
}
__setup("lru_gen=", parse_cmdlinelru_gen);
#endif
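For example, booting with lru_gen=0 (or lru_gen=n) disables the multi-gen LRU on every node and clears all lru_gen_caps static keys before userspace starts, while lru_gen=1 (or lru_gen=y) force-enables them; any other value is silently ignored by the parser above.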
#else /* !CONFIG_LRU_GEN */
static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
@ -6342,6 +6495,9 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
blk_start_plug(&plug);
while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
#ifdef CONFIG_EMM_RECLAIM
(sc->emm_aging && sc->emm_swappiness == 201 && nr[LRU_ACTIVE_ANON]) ||
#endif
nr[LRU_INACTIVE_FILE]) {
unsigned long nr_anon, nr_file, percentage;
unsigned long nr_scanned;
@ -6878,6 +7034,11 @@ retry:
if (sc->nr_reclaimed >= sc->nr_to_reclaim)
break;
#ifdef CONFIG_EMM_RECLAIM
if (sc->emm_aging && sc->emm_nr_taken >= sc->nr_to_reclaim)
break;
#endif
if (sc->compaction_ready)
break;
@ -7196,6 +7357,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
return nr_reclaimed;
}
EXPORT_SYMBOL_GPL(try_to_free_mem_cgroup_pages);
#endif
#ifdef CONFIG_PAGECACHE_LIMIT
@ -8560,6 +8722,149 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
}
#endif
#ifdef CONFIG_EMM_RECLAIM
int memcg_emm_reclaim(struct mem_cgroup *memcg, int mode,
unsigned long nr_pages, unsigned long swappiness)
{
unsigned int noreclaim_flag;
struct zonelist *zonelist;
unsigned long nr_shrinked;
bool retry = true;
struct scan_control sc = {
.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
.gfp_mask = GFP_KERNEL,
.reclaim_idx = MAX_NR_ZONES - 1,
.target_mem_cgroup = memcg,
.priority = DEF_PRIORITY,
.may_writepage = true,
.may_unmap = true,
.may_swap = !!swappiness,
.emm_swappiness = swappiness,
.emm_reclaiming = mode == EMM_RECLAIM,
.emm_aging = mode == EMM_AGE,
};
again:
/*
* Copy & paste from try_to_free_mem_cgroup_pages
*/
zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
set_task_reclaim_state(current, &sc.reclaim_state);
noreclaim_flag = memalloc_noreclaim_save();
nr_shrinked = do_try_to_free_pages(zonelist, &sc);
if (mode != EMM_RECLAIM) {
nr_shrinked += sc.emm_nr_taken;
sc.emm_nr_taken = 0;
}
if (nr_shrinked) {
nr_pages -= min(nr_pages, nr_shrinked);
} else if (retry) {
retry = false;
lru_add_drain_all();
goto again;
}
memalloc_noreclaim_restore(noreclaim_flag);
set_task_reclaim_state(current, NULL);
if (nr_pages)
return -EAGAIN;
return 0;
}
EXPORT_SYMBOL_GPL(memcg_emm_reclaim);
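Because memcg_emm_reclaim() is exported for the EMM modular interface, a policy module can drive aging and reclaim as separate passes. Below is a minimal sketch (an editor's illustration, not part of the patch): emm_policy_shrink() and its 64 MiB target are hypothetical, the memcg pointer is assumed to come from the module's own bookkeeping, and EMM_AGE/EMM_RECLAIM plus the extended swappiness value 201 (anon-only) are the ones handled by the code above:

/* Hypothetical EMM policy helper: age first, then reclaim anon-only. */
static int emm_policy_shrink(struct mem_cgroup *memcg)
{
	unsigned long nr = (64UL << 20) >> PAGE_SHIFT;	/* 64 MiB worth of pages */
	int err;

	/* Aging pass: only ages the LRUs, reclaims nothing */
	err = memcg_emm_reclaim(memcg, EMM_AGE, nr, 200);
	if (err && err != -EAGAIN)
		return err;

	/* Reclaim pass: swappiness 201 selects anon-only eviction */
	return memcg_emm_reclaim(memcg, EMM_RECLAIM, nr, 201);
}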
#ifdef CONFIG_LRU_GEN
int memcg_lru_gen_emm_reclaim(struct mem_cgroup *memcg, int mode,
unsigned long nr_pages, unsigned long swappiness)
{
unsigned int flags;
struct blk_plug plug;
struct lruvec *lruvec;
struct lru_gen_mm_walk *walk;
int ret = 0, nid;
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
.reclaim_idx = MAX_NR_ZONES - 1,
.may_writepage = true,
.may_unmap = true,
.may_swap = !!swappiness,
.emm_swappiness = swappiness,
.emm_reclaiming = mode == EMM_RECLAIM,
.emm_aging = mode == EMM_AGE,
};
set_task_reclaim_state(current, &sc.reclaim_state);
flags = memalloc_noreclaim_save();
walk = set_mm_walk(NULL, true);
if (!walk) {
ret = -ENOMEM;
goto done;
}
if (nr_pages)
walk->force_scan = true;
if (nr_pages == PAGE_COUNTER_MAX)
walk->force_full_scan = true;
/* Don't expose extended swappiness to rest of lru_gen */
if (swappiness > 200)
swappiness = 200;
blk_start_plug(&plug);
for_each_node_state(nid, N_MEMORY) {
lruvec = get_lruvec(memcg, nid);
if (lruvec) {
DEFINE_MAX_SEQ(lruvec);
if (mode == EMM_AGE) {
ret = run_aging(lruvec, max_seq, &sc,
!!swappiness, !!nr_pages);
} else if (mode == EMM_RECLAIM) {
ret = run_eviction(lruvec, max_seq - MIN_NR_GENS, &sc,
swappiness, nr_pages);
nr_pages -= min(nr_pages, sc.nr_reclaimed);
/*
* If swappiness is at most 100 (bias towards cache), also reclaim slab,
* using DEF_PRIORITY, which means roughly 1/4096 of the freeable objects
* will be scanned. If a large number of pages was asked to be reclaimed,
* shrink harder.
*/
if (swappiness <= 100)
shrink_slab(GFP_KERNEL, nid, memcg,
(nr_pages > MAX_LRU_BATCH) ? DEF_PRIORITY - 1 : DEF_PRIORITY);
} else {
ret = -EINVAL;
}
if (ret < 0)
break;
}
}
blk_finish_plug(&plug);
done:
clear_mm_walk();
memalloc_noreclaim_restore(flags);
set_task_reclaim_state(current, NULL);
if (ret < 0 || mode != EMM_RECLAIM)
return ret;
return nr_pages ? -EAGAIN : 0;
}
EXPORT_SYMBOL_GPL(memcg_lru_gen_emm_reclaim);
#endif
#endif
/**
* check_move_unevictable_folios - Move evictable folios to appropriate zone
* lru list

View File

@ -1027,6 +1027,7 @@ unsigned long node_page_state(struct pglist_data *pgdat,
return node_page_state_pages(pgdat, item);
}
EXPORT_SYMBOL(node_page_state);
#endif
#ifdef CONFIG_COMPACTION

View File

@ -64,74 +64,64 @@
* thrashing on the inactive list, after which refaulting pages can be
* activated optimistically to compete with the existing active pages.
*
* Approximating inactive page access frequency - Observations:
* For this approximation, we introduce a counter `eviction` (E)
* here. This counter increases each time a page is evicted, and each evicted
* page has a shadow entry that stores the counter reading at the eviction
* time as a timestamp. So when an evicted page is faulted in again, we have:
*
* 1. When a page is accessed for the first time, it is added to the
* head of the inactive list, slides every existing inactive page
* towards the tail by one slot, and pushes the current tail page
* out of memory.
* Let SP = ((E's reading @ current) - (E's reading @ eviction))
*
* 2. When a page is accessed for the second time, it is promoted to
* the active list, shrinking the inactive list by one slot. This
* also slides all inactive pages that were faulted into the cache
* more recently than the activated page towards the tail of the
* inactive list.
* +-memory available to cache-+
* | |
* +-------------------------+===============+===========+
* | * shadows O O O | INACTIVE | ACTIVE |
* +-+-----------------------+===============+===========+
* | |
* +-----------------------+
* | SP
* fault page O -> Hole left by previously faulted in pages
* * -> The page corresponding to SP
*
* Thus:
* Here SP stands for how far the current workload could push a page
* out of available memory. Since every evicted page was once the head of
* the INACTIVE list, the page could have an access distance of:
*
* 1. The sum of evictions and activations between any two points in
* time indicate the minimum number of inactive pages accessed in
* between.
* SP + NR_INACTIVE
*
* 2. Moving one inactive page N page slots towards the tail of the
* list requires at least N inactive page accesses.
* So if:
*
* Combining these:
* SP + NR_INACTIVE < NR_INACTIVE + NR_ACTIVE
*
* 1. When a page is finally evicted from memory, the number of
* inactive pages accessed while the page was in cache is at least
* the number of page slots on the inactive list.
* Which can be simplified to:
*
* 2. In addition, measuring the sum of evictions and activations (E)
* at the time of a page's eviction, and comparing it to another
* reading (R) at the time the page faults back into memory tells
* the minimum number of accesses while the page was not cached.
* This is called the refault distance.
* SP < NR_ACTIVE
*
* Because the first access of the page was the fault and the second
* access the refault, we combine the in-cache distance with the
* out-of-cache distance to get the complete minimum access distance
* of this page:
* Then the page is worth re-activating into the ACTIVE part, since its
* access distance is shorter than the total memory available to keep it
* resident.
*
* NR_inactive + (R - E)
* Since this is only an estimation based on several hypotheses, and it
* could weaken the ability of the LRU to distinguish a working set from
* caches, throttle it by two factors:
*
* And knowing the minimum access distance of a page, we can easily
* tell if the page would be able to stay in cache assuming all page
* slots in the cache were available:
* 1. Notice that refaulted pages may leave "holes" in the shadow part of
* the LRU; those holes are left unhandled on purpose to lower the
* re-activation rate for pages with a large SP value (the larger a
* page's SP value, the more likely it is affected by such holes).
* 2. When the ACTIVE part of the LRU is long enough, challenging ACTIVE
* pages by re-activating a previously INACTIVE page that faulted only
* once may not be a good idea, so throttle re-activation when
* ACTIVE > INACTIVE by comparing against INACTIVE instead.
*
* NR_inactive + (R - E) <= NR_inactive + NR_active
* Combined all above, we have:
* Upon refault, if any of the following conditions is met, mark the page
* as active:
*
* If we have swap we should consider about NR_inactive_anon and
* NR_active_anon, so for page cache and anonymous respectively:
*
* NR_inactive_file + (R - E) <= NR_inactive_file + NR_active_file
* + NR_inactive_anon + NR_active_anon
*
* NR_inactive_anon + (R - E) <= NR_inactive_anon + NR_active_anon
* + NR_inactive_file + NR_active_file
*
* Which can be further simplified to:
*
* (R - E) <= NR_active_file + NR_inactive_anon + NR_active_anon
*
* (R - E) <= NR_active_anon + NR_inactive_file + NR_active_file
*
* Put into words, the refault distance (out-of-cache) can be seen as
* a deficit in inactive list space (in-cache). If the inactive list
* had (R - E) more page slots, the page would not have been evicted
* in between accesses, but activated instead. And on a full system,
* the only thing eating into inactive list space is active pages.
* - If ACTIVE LRU is low (NR_ACTIVE < NR_INACTIVE), check if:
* SP < NR_ACTIVE
*
* - If ACTIVE LRU is high (NR_ACTIVE >= NR_INACTIVE), check if:
* SP < NR_INACTIVE
*
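* (Illustrative numbers, not from this patch: with NR_ACTIVE = 300k and
* NR_INACTIVE = 700k the ACTIVE part is low, so a refault with SP = 250k
* is re-activated because SP < NR_ACTIVE; with the ratio reversed the
* ACTIVE part is high, and the same refault is re-activated only if
* SP < NR_INACTIVE = 300k.)
*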
* Refaulting inactive pages
*
@ -169,7 +159,7 @@
* Implementation
*
* For each node's LRU lists, a counter for inactive evictions and
* activations is maintained (node->nonresident_age).
* activations is maintained (node->evictions).
*
* On eviction, a snapshot of this counter (along with some bits to
* identify the node) is stored in the now empty page cache
@ -180,10 +170,12 @@
*/
#define WORKINGSET_SHIFT 1
#define EVICTION_SHIFT ((BITS_PER_LONG - BITS_PER_XA_VALUE) + \
#define EVICTION_SHIFT ((BITS_PER_LONG - BITS_PER_XA_VALUE) + \
WORKINGSET_SHIFT + NODES_SHIFT + \
MEM_CGROUP_ID_SHIFT)
#define EVICTION_BITS (BITS_PER_LONG - (EVICTION_SHIFT))
#define EVICTION_MASK (~0UL >> EVICTION_SHIFT)
#define LRU_GEN_EVICTION_BITS (EVICTION_BITS - LRU_REFS_WIDTH)
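As a rough sanity check of the bit budget above (not part of the patch; every value here depends on the final kernel config), assuming 64-bit longs, NODES_SHIFT = 8, MEM_CGROUP_ID_SHIFT = 16, WORKINGSET_SHIFT = 1, BITS_PER_XA_VALUE = 63 and LRU_REFS_WIDTH = 2:
/*
 *   EVICTION_SHIFT        = (64 - 63) + 1 + 8 + 16 = 26
 *   EVICTION_BITS         = 64 - 26                = 38
 *   LRU_GEN_EVICTION_BITS = 38 - 2                 = 36
 *
 * i.e. the plain shadow timestamp keeps 38 usable bits, and the MGLRU
 * variant gives up LRU_REFS_WIDTH of them to store the refs/tier token.
 */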
/*
* Eviction timestamps need to be able to cover the full range of
@ -194,6 +186,7 @@
* evictions into coarser buckets by shaving off lower timestamp bits.
*/
static unsigned int bucket_order __read_mostly;
static unsigned int lru_gen_bucket_order __read_mostly;
static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
bool workingset)
@ -226,134 +219,100 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
*workingsetp = workingset;
}
#ifdef CONFIG_LRU_GEN
static void *lru_gen_eviction(struct folio *folio)
#ifdef CONFIG_EMM_WORKINGSET_TRACKING
static void workingset_eviction_file(struct lruvec *lruvec, unsigned long nr_pages)
{
int hist;
unsigned long token;
unsigned long min_seq;
struct lruvec *lruvec;
struct lru_gen_folio *lrugen;
int type = folio_is_file_lru(folio);
int delta = folio_nr_pages(folio);
int refs = folio_lru_refs(folio);
int tier = lru_tier_from_refs(refs);
struct mem_cgroup *memcg = folio_memcg(folio);
struct pglist_data *pgdat = folio_pgdat(folio);
BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT);
lruvec = mem_cgroup_lruvec(memcg, pgdat);
lrugen = &lruvec->lrugen;
min_seq = READ_ONCE(lrugen->min_seq[type]);
token = (min_seq << LRU_REFS_WIDTH) | max(refs - 1, 0);
hist = lru_hist_from_seq(min_seq);
atomic_long_add(delta, &lrugen->evicted[hist][type][tier]);
return pack_shadow(mem_cgroup_id(memcg), pgdat, token, refs);
do {
atomic_long_add(nr_pages, &lruvec->evicted_file);
} while ((lruvec = parent_lruvec(lruvec)));
}
/*
* Tests if the shadow entry is for a folio that was recently evicted.
* Fills in @lruvec, @token, @workingset with the values unpacked from shadow.
* If a page is evicted and never comes back, either the page is really cold or it
* has been deleted on disk.
*
* For a cold page, its shadow could take up memory until kswapd starts to shrink
* the shadow entries. For a deleted page, the shadow is gone too, so no refault.
*
* If a page comes back before its shadow is released, that's a refault, which means
* file page reclaim has gone over-aggressive and the page would not have been evicted
* if all pages, including itself, had stayed in memory.
*/
static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec,
unsigned long *token, bool *workingset)
static void workingset_refault_track(struct lruvec *lruvec, unsigned long refault_distance)
{
int memcg_id;
unsigned long min_seq;
struct mem_cgroup *memcg;
struct pglist_data *pgdat;
unpack_shadow(shadow, &memcg_id, &pgdat, token, workingset);
memcg = mem_cgroup_from_id(memcg_id);
*lruvec = mem_cgroup_lruvec(memcg, pgdat);
min_seq = READ_ONCE((*lruvec)->lrugen.min_seq[file]);
return (*token >> LRU_REFS_WIDTH) == (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH));
do {
/*
* No lock is taken, for better performance; some events may get
* lost, but this is just a rough estimation anyway.
*/
WRITE_ONCE(lruvec->refault_count, READ_ONCE(lruvec->refault_count) + 1);
WRITE_ONCE(lruvec->total_distance, READ_ONCE(lruvec->total_distance) + refault_distance);
} while ((lruvec = parent_lruvec(lruvec)));
}
static void lru_gen_refault(struct folio *folio, void *shadow)
#else
static void workingset_eviction_file(struct lruvec *lruvec, unsigned long nr_pages)
{
bool recent;
int hist, tier, refs;
bool workingset;
unsigned long token;
struct lruvec *lruvec;
struct lru_gen_folio *lrugen;
int type = folio_is_file_lru(folio);
int delta = folio_nr_pages(folio);
}
static void workingset_refault_track(struct lruvec *lruvec, unsigned long refault_distance)
{
}
#endif
rcu_read_lock();
recent = lru_gen_test_recent(shadow, type, &lruvec, &token, &workingset);
if (lruvec != folio_lruvec(folio))
goto unlock;
mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta);
if (!recent)
goto unlock;
lrugen = &lruvec->lrugen;
hist = lru_hist_from_seq(READ_ONCE(lrugen->min_seq[type]));
/* see the comment in folio_lru_refs() */
refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + workingset;
tier = lru_tier_from_refs(refs);
atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]);
mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);
static inline struct mem_cgroup *try_get_flush_memcg(int memcgid)
{
struct mem_cgroup *memcg;
/*
* Count the following two cases as stalls:
* 1. For pages accessed through page tables, hotter pages pushed out
* hot pages which refaulted immediately.
* 2. For pages accessed multiple times through file descriptors,
* they would have been protected by sort_folio().
* Look up the memcg associated with the stored ID. It might
* have been deleted since the folio's eviction.
*
* Note that in rare events the ID could have been recycled
* for a new cgroup that refaults a shared folio. This is
* impossible to tell from the available data. However, this
* should be a rare and limited disturbance, and activations
* are always speculative anyway. Ultimately, it's the aging
* algorithm's job to shake out the minimum access frequency
* for the active cache.
*
* XXX: On !CONFIG_MEMCG, this will always return NULL; it
* would be better if the root_mem_cgroup existed in all
* configurations instead.
*/
if (lru_gen_in_fault() || refs >= BIT(LRU_REFS_WIDTH) - 1) {
set_mask_bits(&folio->flags, 0, LRU_REFS_MASK | BIT(PG_workingset));
mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta);
rcu_read_lock();
memcg = mem_cgroup_from_id(memcgid);
if (!mem_cgroup_disabled() &&
(!memcg || !mem_cgroup_tryget(memcg))) {
rcu_read_unlock();
return NULL;
}
unlock:
rcu_read_unlock();
/*
* Flush stats (and potentially sleep) outside the RCU read section.
* XXX: With per-memcg flushing and thresholding, is ratelimiting
* still needed here?
*/
mem_cgroup_flush_stats_ratelimited(memcg);
return memcg;
}
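Callers of try_get_flush_memcg() must drop the reference once they are done. A minimal sketch of the intended calling pattern (the helper name is invented for illustration; it mirrors what lru_gen_refault() below does):
static void toy_refault_stat_reader(int memcgid, struct pglist_data *pgdat)
{
	struct mem_cgroup *memcg = try_get_flush_memcg(memcgid);
	struct lruvec *lruvec;

	if (!memcg)
		return;

	lruvec = mem_cgroup_lruvec(memcg, pgdat);
	/* ... read lruvec_page_state() counters, make decisions ... */
	mem_cgroup_put(memcg);	/* drop the reference taken above */
}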
#else /* !CONFIG_LRU_GEN */
static void *lru_gen_eviction(struct folio *folio)
{
return NULL;
}
static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec,
unsigned long *token, bool *workingset)
{
return false;
}
static void lru_gen_refault(struct folio *folio, void *shadow)
{
}
#endif /* CONFIG_LRU_GEN */
/**
* workingset_age_nonresident - age non-resident entries as LRU ages
* @lruvec: the lruvec that was aged
* @nr_pages: the number of pages to count
* lru_eviction - age non-resident entries as LRU ages
*
* As in-memory pages are aged, non-resident pages need to be aged as
* well, in order for the refault distances later on to be comparable
* to the in-memory dimensions. This function allows reclaim and LRU
* operations to drive the non-resident aging along in parallel.
*/
void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages)
static inline unsigned long lru_eviction(struct lruvec *lruvec, int type,
int nr_pages, int bits, int bucket_order)
{
unsigned long eviction;
if (type)
workingset_eviction_file(lruvec, nr_pages);
/*
* Reclaiming a cgroup means reclaiming all its children in a
* round-robin fashion. That means that each cgroup has an LRU
@ -365,11 +324,241 @@ void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages)
* the virtual inactive lists of all its parents, including
* the root cgroup's, age as well.
*/
do {
atomic_long_add(nr_pages, &lruvec->nonresident_age);
} while ((lruvec = parent_lruvec(lruvec)));
eviction = atomic_long_fetch_add_relaxed(nr_pages, &lruvec->evictions[type]);
while ((lruvec = parent_lruvec(lruvec)))
atomic_long_add(nr_pages, &lruvec->evictions[type]);
/* Truncate the timestamp to fit in limited bits */
eviction >>= bucket_order;
eviction &= ~0UL >> (BITS_PER_LONG - bits);
return eviction;
}
/*
* lru_distance - calculate the refault distance based on non-resident age
*/
static inline unsigned long lru_distance(struct lruvec *lruvec, int type,
unsigned long eviction, int bits,
int bucket_order)
{
unsigned long refault = atomic_long_read(&lruvec->evictions[type]);
eviction &= ~0UL >> (BITS_PER_LONG - bits);
eviction <<= bucket_order;
/*
* The unsigned subtraction here gives an accurate distance
* across non-resident age overflows in most cases. There is a
* special case: usually, shadow entries have a short lifetime
* and are either refaulted or reclaimed along with the inode
* before they get too old. But it is not impossible for the
* non-resident age to lap a shadow entry in the field, which
* can then result in a false small refault distance, leading
* to a false activation should this old entry actually
* refault again. However, earlier kernels used to deactivate
* unconditionally with *every* reclaim invocation for the
* longest time, so the occasional inappropriate activation
* leading to pressure on the active list is not a problem.
*/
return (refault - eviction) & (~0UL >> (BITS_PER_LONG - bits));
}
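A standalone userspace sketch (not kernel code; the 8-bit/bucket_order-2 parameters and all names are made up for illustration) of how the truncation in lru_eviction() and the wraparound-safe subtraction in lru_distance() interact:
#include <stdio.h>

/* Toy parameters: 8 usable timestamp bits, bucket_order = 2, 64-bit longs. */
#define TOY_BITS         8
#define TOY_BUCKET_ORDER 2
#define TOY_MASK         (~0UL >> (64 - TOY_BITS))

/* Truncate a counter reading into a shadow timestamp, as lru_eviction() does. */
static unsigned long toy_pack(unsigned long counter)
{
	return (counter >> TOY_BUCKET_ORDER) & TOY_MASK;
}

/* Recover the refault distance from a shadow timestamp, as lru_distance() does. */
static unsigned long toy_distance(unsigned long counter_now, unsigned long shadow)
{
	unsigned long eviction = (shadow & TOY_MASK) << TOY_BUCKET_ORDER;

	return (counter_now - eviction) & TOY_MASK;
}

int main(void)
{
	/* Counter read 1000 at eviction time, 1036 at refault time. */
	unsigned long shadow = toy_pack(1000);

	printf("distance = %lu\n", toy_distance(1036, shadow));	/* prints 36 */
	/*
	 * If the counter advances far enough to lap the truncated timestamp,
	 * the recovered distance becomes falsely small -- the rare case the
	 * comment in lru_distance() accepts as harmless.
	 */
	return 0;
}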
#ifdef CONFIG_LRU_GEN
static void *lru_gen_eviction(struct folio *folio)
{
int hist;
unsigned long token;
struct lruvec *lruvec;
struct lru_gen_folio *lrugen;
int type = folio_is_file_lru(folio);
int delta = folio_nr_pages(folio);
int refs = folio_lru_refs(folio);
int tier = lru_tier_from_refs(refs);
struct mem_cgroup *memcg = folio_memcg(folio);
struct pglist_data *pgdat = folio_pgdat(folio);
BUILD_BUG_ON(LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT);
lruvec = mem_cgroup_lruvec(memcg, pgdat);
lrugen = &lruvec->lrugen;
hist = lru_hist_of_min_seq(lruvec, type);
token = max(refs - 1, 0);
token <<= LRU_GEN_EVICTION_BITS;
token |= lru_eviction(lruvec, type, delta,
LRU_GEN_EVICTION_BITS, lru_gen_bucket_order);
atomic_long_add(delta, &lrugen->evicted[hist][type][tier]);
return pack_shadow(mem_cgroup_id(memcg), pgdat, token, refs);
}
/*
* Tests whether a refault with the given distance is within the number of
* folios evicted from @lruvec's oldest (min_seq) generation, i.e. whether
* the folio was evicted recently.
*/
static inline bool lru_gen_test_recent(struct lruvec *lruvec, bool type,
unsigned long distance)
{
int hist;
unsigned long evicted = 0;
struct lru_gen_folio *lrugen;
lrugen = &lruvec->lrugen;
hist = lru_hist_of_min_seq(lruvec, type);
for (int tier = 0; tier < MAX_NR_TIERS; tier++)
evicted += atomic_long_read(&lrugen->evicted[hist][type][tier]);
return distance <= evicted;
}
enum lru_gen_refault_distance {
DISTANCE_SHORT,
DISTANCE_MID,
DISTANCE_LONG,
DISTANCE_NONE,
};
static inline int lru_gen_test_refault(struct lruvec *lruvec, bool file,
unsigned long distance, bool can_swap)
{
unsigned long total;
total = lruvec_page_state(lruvec, NR_ACTIVE_FILE) +
lruvec_page_state(lruvec, NR_INACTIVE_FILE);
if (can_swap)
total += lruvec_page_state(lruvec, NR_ACTIVE_ANON) +
lruvec_page_state(lruvec, NR_INACTIVE_ANON);
/* Imagine having an extra gen outside of available memory */
if (distance <= total / MAX_NR_GENS)
return DISTANCE_SHORT;
if (distance <= total / MIN_NR_GENS)
return DISTANCE_MID;
if (distance <= total)
return DISTANCE_LONG;
return DISTANCE_NONE;
}
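A small self-contained check (not from the patch; the page counts are made up, and MAX_NR_GENS/MIN_NR_GENS are assumed to be the usual 4 and 2) of how the distance classes above fall out:
#include <assert.h>

enum toy_distance { TOY_SHORT, TOY_MID, TOY_LONG, TOY_NONE };

/* Mirrors the threshold logic of lru_gen_test_refault(). */
static enum toy_distance toy_classify(unsigned long distance, unsigned long total,
				      int max_gens, int min_gens)
{
	if (distance <= total / max_gens)
		return TOY_SHORT;
	if (distance <= total / min_gens)
		return TOY_MID;
	if (distance <= total)
		return TOY_LONG;
	return TOY_NONE;
}

int main(void)
{
	/* 1.2M resident file pages, no swap, MAX_NR_GENS = 4, MIN_NR_GENS = 2. */
	assert(toy_classify(250000, 1200000, 4, 2) == TOY_SHORT);
	assert(toy_classify(500000, 1200000, 4, 2) == TOY_MID);
	assert(toy_classify(1000000, 1200000, 4, 2) == TOY_LONG);
	assert(toy_classify(2000000, 1200000, 4, 2) == TOY_NONE);
	return 0;
}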
static void lru_gen_refault(struct folio *folio, void *shadow)
{
int memcgid;
bool recent;
bool workingset;
unsigned long token;
int hist, tier, refs;
struct lruvec *lruvec;
struct mem_cgroup *memcg;
struct pglist_data *pgdat;
struct lru_gen_folio *lrugen;
int type = folio_is_file_lru(folio);
int delta = folio_nr_pages(folio);
int distance;
unsigned long refault_distance, protect_tier;
unpack_shadow(shadow, &memcgid, &pgdat, &token, &workingset);
memcg = try_get_flush_memcg(memcgid);
if (!memcg)
return;
lruvec = mem_cgroup_lruvec(memcg, pgdat);
if (lruvec != folio_lruvec(folio))
goto unlock;
mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta);
refault_distance = lru_distance(lruvec, type, token,
LRU_GEN_EVICTION_BITS, lru_gen_bucket_order);
workingset_refault_track(lruvec, refault_distance);
/* Check if the gen the page was evicted from still exists */
recent = lru_gen_test_recent(lruvec, type, refault_distance);
/* Check if the distance indicates a refault */
distance = lru_gen_test_refault(lruvec, type, refault_distance,
				mem_cgroup_get_nr_swap_pages(memcg) > 0);
if (!recent && distance == DISTANCE_NONE)
goto unlock;
/* see the comment in folio_lru_refs() */
token >>= LRU_GEN_EVICTION_BITS;
refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + workingset;
tier = lru_tier_from_refs(refs);
/*
* Count the following two cases as stalls:
* 1. For pages accessed through page tables, hotter pages pushed out
* hot pages which refaulted immediately.
* 2. For pages accessed multiple times through file descriptors,
* they would have been protected by sort_folio().
*/
if (lru_gen_in_fault() || refs >= BIT(LRU_REFS_WIDTH) - 1) {
if (distance <= DISTANCE_SHORT) {
/* Set ref bits and workingset (increase refs by one) */
if (!lru_gen_in_fault())
folio_set_active(folio);
else
set_mask_bits(&folio->flags, 0,
min_t(unsigned long, refs, BIT(LRU_REFS_WIDTH) - 1)
<< LRU_REFS_PGOFF);
folio_set_workingset(folio);
} else if (recent || distance <= DISTANCE_MID) {
/*
* Beyond the PID protection range there is no point increasing refs
* for the highest tier, but we can still activate the file page.
*/
set_mask_bits(&folio->flags, 0, (refs - workingset) << LRU_REFS_PGOFF);
folio_set_workingset(folio);
} else {
set_mask_bits(&folio->flags, 0, 1 << LRU_REFS_PGOFF);
}
mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta);
}
lrugen = &lruvec->lrugen;
hist = lru_hist_of_min_seq(lruvec, type);
protect_tier = tier;
/*
* Don't over-protect clean cache pages (!tier pages): if the page wasn't accessed
* for a while (refault distance > LRU size / MAX_NR_GENS), keeping it in memory
* doesn't help, so bias toward a higher tier instead.
*/
if (distance <= DISTANCE_SHORT && !tier) {
/* The folio is referenced one more time in the shadow gen */
folio_set_workingset(folio);
protect_tier = lru_tier_from_refs(1);
mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);
}
if (protect_tier == tier && recent) {
atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]);
} else {
atomic_long_add(delta, &lrugen->avg_total[type][protect_tier]);
atomic_long_add(delta, &lrugen->avg_refaulted[type][protect_tier]);
}
unlock:
mem_cgroup_put(memcg);
}
#else /* !CONFIG_LRU_GEN */
static void *lru_gen_eviction(struct folio *folio)
{
return NULL;
}
static bool lru_gen_test_recent(struct lruvec *lruvec, bool file,
unsigned long token)
{
return false;
}
static void lru_gen_refault(struct folio *folio, void *shadow)
{
}
#endif /* CONFIG_LRU_GEN */
/**
* workingset_eviction - note the eviction of a folio from memory
* @target_memcg: the cgroup that is causing the reclaim
@ -396,9 +585,8 @@ void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg)
lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
/* XXX: target_memcg can be NULL, go through lruvec */
memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
eviction = atomic_long_read(&lruvec->nonresident_age);
eviction >>= bucket_order;
workingset_age_nonresident(lruvec, folio_nr_pages(folio));
eviction = lru_eviction(lruvec, folio_is_file_lru(folio),
folio_nr_pages(folio), EVICTION_BITS, bucket_order);
return pack_shadow(memcgid, pgdat, eviction,
folio_test_workingset(folio));
}
@ -411,25 +599,22 @@ void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg)
* @file: whether the corresponding folio is from the file lru.
* @workingset: where the workingset value unpacked from shadow should
* be stored.
* @tracking: whether to do workingset tracking or not
*
* Return: true if the shadow is for a recently evicted folio; false otherwise.
*/
bool workingset_test_recent(void *shadow, bool file, bool *workingset)
bool workingset_test_recent(void *shadow, bool file, bool *workingset, bool tracking)
{
struct mem_cgroup *eviction_memcg;
struct lruvec *eviction_lruvec;
unsigned long refault_distance;
unsigned long workingset_size;
unsigned long refault;
unsigned long inactive;
unsigned long active;
int memcgid;
struct pglist_data *pgdat;
unsigned long eviction;
if (lru_gen_enabled())
return lru_gen_test_recent(shadow, file, &eviction_lruvec, &eviction, workingset);
unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset);
eviction <<= bucket_order;
/*
* Look up the memcg associated with the stored ID. It might
@ -447,30 +632,32 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset)
* would be better if the root_mem_cgroup existed in all
* configurations instead.
*/
eviction_memcg = mem_cgroup_from_id(memcgid);
if (!mem_cgroup_disabled() && !eviction_memcg)
eviction_memcg = try_get_flush_memcg(memcgid);
if (!eviction_memcg)
return false;
eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat);
refault = atomic_long_read(&eviction_lruvec->nonresident_age);
/*
* Calculate the refault distance
*
* The unsigned subtraction here gives an accurate distance
* across nonresident_age overflows in most cases. There is a
* special case: usually, shadow entries have a short lifetime
* and are either refaulted or reclaimed along with the inode
* before they get too old. But it is not impossible for the
* nonresident_age to lap a shadow entry in the field, which
* can then result in a false small refault distance, leading
* to a false activation should this old entry actually
* refault again. However, earlier kernels used to deactivate
* unconditionally with *every* reclaim invocation for the
* longest time, so the occasional inappropriate activation
* leading to pressure on the active list is not a problem.
* Flush stats (and potentially sleep) outside the RCU read section.
* XXX: With per-memcg flushing and thresholding, is ratelimiting
* still needed here?
*/
refault_distance = (refault - eviction) & EVICTION_MASK;
mem_cgroup_flush_stats_ratelimited(eviction_memcg);
eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat);
if (lru_gen_enabled()) {
bool recent;
refault_distance = lru_distance(eviction_lruvec, file, eviction,
LRU_GEN_EVICTION_BITS, lru_gen_bucket_order);
recent = lru_gen_test_recent(eviction_lruvec, file, refault_distance);
mem_cgroup_put(eviction_memcg);
return recent;
}
refault_distance = lru_distance(eviction_lruvec, file,
eviction, EVICTION_BITS, bucket_order);
if (tracking)
workingset_refault_track(eviction_lruvec, refault_distance);
/*
* Compare the distance to the existing workingset size. We
@ -479,21 +666,22 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset)
* workingset competition needs to consider anon or not depends
* on having free swap space.
*/
workingset_size = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE);
if (!file) {
workingset_size += lruvec_page_state(eviction_lruvec,
NR_INACTIVE_FILE);
}
active = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE);
inactive = lruvec_page_state(eviction_lruvec, NR_INACTIVE_FILE);
if (mem_cgroup_get_nr_swap_pages(eviction_memcg) > 0) {
workingset_size += lruvec_page_state(eviction_lruvec,
NR_ACTIVE_ANON);
if (file) {
workingset_size += lruvec_page_state(eviction_lruvec,
NR_INACTIVE_ANON);
}
active += lruvec_page_state(eviction_lruvec, NR_ACTIVE_ANON);
inactive += lruvec_page_state(eviction_lruvec, NR_INACTIVE_ANON);
}
return refault_distance <= workingset_size;
mem_cgroup_put(eviction_memcg);
/*
* When there are already enough active pages, be less aggressive
* about reactivating pages: challenging a large set of established
* active pages with a one-time refaulted page may not be a good idea.
*/
return refault_distance < min(active, inactive);
}
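A worked example of the new activation rule (made-up numbers, not from the patch):
/*
 * 600k active file pages, 200k inactive file pages, no free swap:
 * min(active, inactive) = 200k, so
 *
 *   refault_distance = 150,000 -> recent, the folio gets activated;
 *   refault_distance = 450,000 -> not recent, even though it is below the
 *                                 old file-only workingset_size of 600k.
 */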
/**
@ -514,24 +702,22 @@ void workingset_refault(struct folio *folio, void *shadow)
bool workingset;
long nr;
if (lru_gen_enabled()) {
lru_gen_refault(folio, shadow);
return;
}
/* Flush stats (and potentially sleep) before holding RCU read lock */
mem_cgroup_flush_stats_ratelimited();
rcu_read_lock();
/*
* The activation decision for this folio is made at the level
* where the eviction occurred, as that is where the LRU order
* during folio reclaim is being determined.
*
* However, the cgroup that will own the folio is the one that
* is actually experiencing the refault event.
* is actually experiencing the refault event. Make sure the folio is
* locked to guarantee folio_memcg() stability throughout.
*/
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
if (lru_gen_enabled()) {
lru_gen_refault(folio, shadow);
return;
}
nr = folio_nr_pages(folio);
memcg = folio_memcg(folio);
pgdat = folio_pgdat(folio);
@ -539,11 +725,10 @@ void workingset_refault(struct folio *folio, void *shadow)
mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr);
if (!workingset_test_recent(shadow, file, &workingset))
goto out;
if (!workingset_test_recent(shadow, file, &workingset, true))
return;
folio_set_active(folio);
workingset_age_nonresident(lruvec, nr);
mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file, nr);
/* Folio was active prior to eviction */
@ -556,32 +741,6 @@ void workingset_refault(struct folio *folio, void *shadow)
lru_note_cost_refault(folio);
mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr);
}
out:
rcu_read_unlock();
}
/**
* workingset_activation - note a page activation
* @folio: Folio that is being activated.
*/
void workingset_activation(struct folio *folio)
{
struct mem_cgroup *memcg;
rcu_read_lock();
/*
* Filter non-memcg pages here, e.g. unmap can call
* mark_page_accessed() on VDSO pages.
*
* XXX: See workingset_refault() - this should return
* root_mem_cgroup even for !CONFIG_MEMCG.
*/
memcg = folio_memcg_rcu(folio);
if (!mem_cgroup_disabled() && !memcg)
goto out;
workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio));
out:
rcu_read_unlock();
}
/*
@ -664,7 +823,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
struct lruvec *lruvec;
int i;
mem_cgroup_flush_stats();
mem_cgroup_flush_stats(sc->memcg);
lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
for (pages = 0, i = 0; i < NR_LRU_LISTS; i++)
pages += lruvec_page_state_local(lruvec,
@ -778,7 +937,6 @@ static struct lock_class_key shadow_nodes_key;
static int __init workingset_init(void)
{
unsigned int timestamp_bits;
unsigned int max_order;
int ret;
@ -790,12 +948,17 @@ static int __init workingset_init(void)
* some more pages at runtime, so keep working with up to
* double the initial memory by using totalram_pages as-is.
*/
timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
max_order = fls_long(totalram_pages() - 1);
if (max_order > timestamp_bits)
bucket_order = max_order - timestamp_bits;
if (max_order > EVICTION_BITS)
bucket_order = max_order - EVICTION_BITS;
pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
timestamp_bits, max_order, bucket_order);
EVICTION_BITS, max_order, bucket_order);
#ifdef CONFIG_LRU_GEN
if (max_order > LRU_GEN_EVICTION_BITS)
lru_gen_bucket_order = max_order - LRU_GEN_EVICTION_BITS;
pr_info("workingset: lru_gen_timestamp_bits=%d lru_gen_bucket_order=%u\n",
LRU_GEN_EVICTION_BITS, lru_gen_bucket_order);
#endif
ret = prealloc_shrinker(&workingset_shadow_shrinker, "mm-shadow");
if (ret)

View File

@ -1142,9 +1142,11 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
SetPageReclaim(page);
/* start writeback */
__swap_writepage(page, &wbc);
ret = __swap_writepage(page, &wbc);
put_page(page);
zswap_written_back_pages++;
if (!ret)
zswap_written_back_pages++;
return ret;

View File

@ -20,6 +20,9 @@
*
*/
static int ali_cip;
module_param(ali_cip, int, 0600);
MODULE_PARM_DESC(ali_cip, "Enable parsing of the ali cip TCP option (opcode 0xfe). Value can be 0 or 1; defaults to 0.");
/*
* Statistics of toa in proc /proc/net/toa_stats
@ -78,7 +81,8 @@ static void *get_toa_data(struct sk_buff *skb)
return NULL;
if (opsize > length)
return NULL; /* don't parse partial options */
if (TCPOPT_TOA == opcode && TCPOLEN_TOA == opsize) {
if ((TCPOPT_TOA == opcode && TCPOLEN_TOA == opsize) ||
(ali_cip == 1 && TCPOPT_TOA_ALI_CIP == opcode && TCPOLEN_TOA_ALI_CIP == opsize)) {
memcpy(&tdata, ptr - 2, sizeof (tdata));
//TOA_DBG("find toa data: ip = %u.%u.%u.%u, port = %u\n", NIPQUAD(tdata.ip),
//ntohs(tdata.port));
@ -118,7 +122,8 @@ inet_getname_toa(struct socket *sock, struct sockaddr *uaddr, int peer)
if (retval == 0 && NULL != sk->sk_user_data && peer) {
if (sock_def_readable == sk->sk_data_ready) {
memcpy(&tdata, &sk->sk_user_data, sizeof (tdata));
if (TCPOPT_TOA == tdata.opcode && TCPOLEN_TOA == tdata.opsize) {
if ((TCPOPT_TOA == tdata.opcode && TCPOLEN_TOA == tdata.opsize) ||
(ali_cip == 1 && TCPOPT_TOA_ALI_CIP == tdata.opcode && TCPOLEN_TOA_ALI_CIP == tdata.opsize)) {
TOA_INC_STATS(ext_stats, GETNAME_TOA_OK_CNT);
//TOA_DBG("inet_getname_toa: set new sockaddr, ip %u.%u.%u.%u -> %u.%u.%u.%u, port %u -> %u\n",
// NIPQUAD(sin->sin_addr.s_addr), NIPQUAD(tdata.ip), ntohs(sin->sin_port),
@ -158,7 +163,8 @@ inet6_getname_toa(struct socket *sock, struct sockaddr *uaddr, int peer)
if (retval == 0 && NULL != sk->sk_user_data && peer) {
if (sock_def_readable == sk->sk_data_ready) {
memcpy(&tdata, &sk->sk_user_data, sizeof (tdata));
if (TCPOPT_TOA == tdata.opcode && TCPOLEN_TOA == tdata.opsize) {
if ((TCPOPT_TOA == tdata.opcode && TCPOLEN_TOA == tdata.opsize) ||
(ali_cip == 1 && TCPOPT_TOA_ALI_CIP == tdata.opcode && TCPOLEN_TOA_ALI_CIP == tdata.opsize)) {
TOA_INC_STATS(ext_stats, GETNAME_TOA_OK_CNT);
sin->sin6_port = tdata.port;
ipv6_addr_set(&sin->sin6_addr, 0, 0, htonl(0x0000FFFF), tdata.ip);

View File

@ -34,9 +34,13 @@
} while (0)
#define TCPOPT_TOA 200
/* MUST be 4n !!!! */
#define TCPOLEN_TOA 8 /* |opcode|size|ip+port| = 1 + 1 + 6 */
/* |opcode|size|ip+port| = 1 + 1 + 6 */
#define TCPOLEN_TOA 8
#define TCPOPT_TOA_ALI_CIP 0xfe
/* |opcode|size|sport|sip| = 1 + 1 + 2 + 4 */
#define TCPOLEN_TOA_ALI_CIP 8
/* MUST be 4-byte aligned */
struct toa_data {
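For reference, a sketch of the two on-wire option layouts described by the length macros above (the struct names are invented here; the field order is an assumption inferred from the length comments and from get_toa_data() copying either option into the same struct toa_data, whose definition is truncated in this hunk):
#include <stdint.h>

/* |opcode|size|port|ip| = 1 + 1 + 2 + 4, classic TOA, opcode 200 */
struct toa_opt_sketch {
	uint8_t  opcode;	/* TCPOPT_TOA */
	uint8_t  opsize;	/* TCPOLEN_TOA = 8 */
	uint16_t port;		/* client port, network byte order */
	uint32_t ip;		/* client IPv4 address, network byte order */
} __attribute__((packed));

/* |opcode|size|sport|sip| = 1 + 1 + 2 + 4, ali cip variant, opcode 0xfe */
struct toa_ali_cip_opt_sketch {
	uint8_t  opcode;	/* TCPOPT_TOA_ALI_CIP */
	uint8_t  opsize;	/* TCPOLEN_TOA_ALI_CIP = 8 */
	uint16_t sport;		/* client port, network byte order */
	uint32_t sip;		/* client IPv4 address, network byte order */
} __attribute__((packed));
Both options are 8 bytes and, under this assumption, share the same field layout, which is why the module can parse either one with the same memcpy() into struct toa_data when ali_cip=1.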