OpenCloudOS-Kernel/arch/x86/Makefile

316 lines
11 KiB
Makefile
Raw Normal View History

# Unified Makefile for i386 and x86_64
# select defconfig based on actual architecture
ifeq ($(ARCH),x86)
x86: Default to ARCH=x86 to avoid overriding CONFIG_64BIT It is easy to waste a bunch of time when one takes a 32-bit .config from a test machine and try to build it on a faster 64-bit system, and its existing setting of CONFIG_64BIT=n gets *changed* to match the build host. Similarly, if one has an existing build tree it is easy to trash an entire build tree that way. This is because the default setting for $ARCH when discovered from 'uname' is one of the legacy pre-x86-merge values (i386 or x86_64), which effectively force the setting of CONFIG_64BIT to match. We should default to ARCH=x86 instead, finally completing the merge that we started so long ago. This patch preserves the behaviour of the legacy ARCH settings for commands such as: make ARCH=x86_64 randconfig make ARCH=i386 randconfig ... since making the value of CONFIG_64BIT actually random in that situation is not desirable. In time, perhaps we can retire this legacy use of the old ARCH= values. We already have a way to override values for *any* config option, using $KCONFIG_ALLCONFIG, so it could be argued that we don't necessarily need to keep ARCH={i386,x86_64} around as a special case just for overriding CONFIG_64BIT. We'd probably at least want to add a way to override config options from the command line ('make CONFIG_FOO=y oldconfig') before we talk about doing that though. Signed-off-by: David Woodhouse <David.Woodhouse@intel.com> Link: http://lkml.kernel.org/r/1356040315.3198.51.camel@shinybook.infradead.org Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
2012-12-21 05:51:55 +08:00
ifeq ($(shell uname -m),x86_64)
KBUILD_DEFCONFIG := x86_64_defconfig
else
KBUILD_DEFCONFIG := i386_defconfig
x86: Default to ARCH=x86 to avoid overriding CONFIG_64BIT It is easy to waste a bunch of time when one takes a 32-bit .config from a test machine and try to build it on a faster 64-bit system, and its existing setting of CONFIG_64BIT=n gets *changed* to match the build host. Similarly, if one has an existing build tree it is easy to trash an entire build tree that way. This is because the default setting for $ARCH when discovered from 'uname' is one of the legacy pre-x86-merge values (i386 or x86_64), which effectively force the setting of CONFIG_64BIT to match. We should default to ARCH=x86 instead, finally completing the merge that we started so long ago. This patch preserves the behaviour of the legacy ARCH settings for commands such as: make ARCH=x86_64 randconfig make ARCH=i386 randconfig ... since making the value of CONFIG_64BIT actually random in that situation is not desirable. In time, perhaps we can retire this legacy use of the old ARCH= values. We already have a way to override values for *any* config option, using $KCONFIG_ALLCONFIG, so it could be argued that we don't necessarily need to keep ARCH={i386,x86_64} around as a special case just for overriding CONFIG_64BIT. We'd probably at least want to add a way to override config options from the command line ('make CONFIG_FOO=y oldconfig') before we talk about doing that though. Signed-off-by: David Woodhouse <David.Woodhouse@intel.com> Link: http://lkml.kernel.org/r/1356040315.3198.51.camel@shinybook.infradead.org Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
2012-12-21 05:51:55 +08:00
endif
else
KBUILD_DEFCONFIG := $(ARCH)_defconfig
endif
# How to compile the 16-bit code. Note we always compile for -march=i386;
# that way we can complain to the user if the CPU is insufficient.
#
# The -m16 option is supported by GCC >= 4.9 and clang >= 3.5. For
# older versions of GCC, include an *assembly* header to make sure that
# gcc doesn't play any games behind our back.
CODE16GCC_CFLAGS := -m32 -Wa,$(srctree)/arch/x86/boot/code16gcc.h
M16_CFLAGS := $(call cc-option, -m16, $(CODE16GCC_CFLAGS))
REALMODE_CFLAGS := $(M16_CFLAGS) -g -Os -D__KERNEL__ \
-DDISABLE_BRANCH_PROFILING \
-Wall -Wstrict-prototypes -march=i386 -mregparm=3 \
-fno-strict-aliasing -fomit-frame-pointer -fno-pic \
-mno-mmx -mno-sse \
$(call cc-option, -ffreestanding) \
$(call cc-option, -fno-stack-protector) \
$(call cc-option, -mpreferred-stack-boundary=2)
export REALMODE_CFLAGS
# BITS is used as extension for files which are available in a 32 bit
# and a 64 bit version to simplify shared Makefiles.
# e.g.: obj-y += foo_$(BITS).o
export BITS
ifdef CONFIG_X86_NEED_RELOCS
LDFLAGS_vmlinux := --emit-relocs
endif
#
# Prevent GCC from generating any FP code by mistake.
#
# This must happen before we try the -mpreferred-stack-boundary, see:
#
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383
#
KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow
KBUILD_CFLAGS += $(call cc-option,-mno-avx,)
ifeq ($(CONFIG_X86_32),y)
BITS := 32
UTS_MACHINE := i386
CHECKFLAGS += -D__i386__
biarch := $(call cc-option,-m32)
KBUILD_AFLAGS += $(biarch)
KBUILD_CFLAGS += $(biarch)
KBUILD_CFLAGS += -msoft-float -mregparm=3 -freg-struct-return
# Never want PIC in a 32-bit kernel, prevent breakage with GCC built
# with nonstandard options
KBUILD_CFLAGS += -fno-pic
# prevent gcc from keeping the stack 16 byte aligned
KBUILD_CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2)
# Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use
# a lot more stack due to the lack of sharing of stacklots:
KBUILD_CFLAGS += $(call cc-ifversion, -lt, 0400, \
$(call cc-option,-fno-unit-at-a-time))
# CPU-specific tuning. Anything which can be shared with UML should go here.
include arch/x86/Makefile_32.cpu
KBUILD_CFLAGS += $(cflags-y)
# temporary until string.h is fixed
KBUILD_CFLAGS += -ffreestanding
else
BITS := 64
UTS_MACHINE := x86_64
CHECKFLAGS += -D__x86_64__ -m64
biarch := -m64
KBUILD_AFLAGS += -m64
KBUILD_CFLAGS += -m64
x86: Align jump targets to 1-byte boundaries The following NOP in a hot function caught my attention: > 5a: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1) That's a dead NOP that bloats the function a bit, added for the default 16-byte alignment that GCC applies for jump targets. I realize that x86 CPU manufacturers recommend 16-byte jump target alignments (it's in the Intel optimization manual), to help their relatively narrow decoder prefetch alignment and uop cache constraints, but the cost of that is very significant: text data bss dec filename 12566391 1617840 1089536 15273767 vmlinux.align.16-byte 12224951 1617840 1089536 14932327 vmlinux.align.1-byte By using 1-byte jump target alignment (i.e. no alignment at all) we get an almost 3% reduction in kernel size (!) - and a probably similar reduction in I$ footprint. Now, the usual justification for jump target alignment is the following: - modern decoders tend to have 16-byte (effective) decoder prefetch windows. (AMD documents it higher but measurements suggest the effective prefetch window on curretn uarchs is still around 16 bytes) - on Intel there's also the uop-cache with cachelines that have 16-byte granularity and limited associativity. - older x86 uarchs had a penalty for decoder fetches that crossed 16-byte boundaries. These limits are mostly gone from recent uarchs. So if a forward jump target is aligned to cacheline boundary then prefetches will start from a new prefetch-cacheline and there's higher chance for decoding in fewer steps and packing tightly. But I think that argument is flawed for typical optimized kernel code flows: forward jumps often go to 'cold' (uncommon) pieces of code, and aligning cold code to cache lines does not bring a lot of advantages (they are uncommon), while it causes collateral damage: - their alignment 'spreads out' the cache footprint, it shifts followup hot code further out - plus it slows down even 'cold' code that immediately follows 'hot' code (like in the above case), which could have benefited from the partial cacheline that comes off the end of hot code. But even in the cache-hot case the 16 byte alignment brings disadvantages: - it spreads out the cache footprint, possibly making the code fall out of the L1 I$. - On Intel CPUs, recent microarchitectures have plenty of uop cache (typically doubling every 3 years) - while the size of the L1 cache grows much less aggressively. So workloads are rarely uop cache limited. The only situation where alignment might matter are tight loops that could fit into a single 16 byte chunk - but those are pretty rare in the kernel: if they exist they tend to be pointer chasing or generic memory ops, which both tend to be cache miss (or cache allocation) intensive and are not decoder bandwidth limited. So the balance of arguments strongly favors packing kernel instructions tightly versus maximizing for decoder bandwidth: this patch changes the jump target alignment from 16 bytes to 1 byte (tightly packed, unaligned). Acked-by: Denys Vlasenko <dvlasenk@redhat.com> Cc: Andy Lutomirski <luto@amacapital.net> Cc: Aswin Chandramouleeswaran <aswin@hp.com> Cc: Borislav Petkov <bp@alien8.de> Cc: Brian Gerst <brgerst@gmail.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Jason Low <jason.low2@hp.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Tim Chen <tim.c.chen@linux.intel.com> Link: http://lkml.kernel.org/r/20150410120846.GA17101@gmail.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-04-10 20:08:46 +08:00
# Align jump targets to 1 byte, not the default 16 bytes:
KBUILD_CFLAGS += $(call cc-option,-falign-jumps=1)
x86: Align jump targets to 1-byte boundaries The following NOP in a hot function caught my attention: > 5a: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1) That's a dead NOP that bloats the function a bit, added for the default 16-byte alignment that GCC applies for jump targets. I realize that x86 CPU manufacturers recommend 16-byte jump target alignments (it's in the Intel optimization manual), to help their relatively narrow decoder prefetch alignment and uop cache constraints, but the cost of that is very significant: text data bss dec filename 12566391 1617840 1089536 15273767 vmlinux.align.16-byte 12224951 1617840 1089536 14932327 vmlinux.align.1-byte By using 1-byte jump target alignment (i.e. no alignment at all) we get an almost 3% reduction in kernel size (!) - and a probably similar reduction in I$ footprint. Now, the usual justification for jump target alignment is the following: - modern decoders tend to have 16-byte (effective) decoder prefetch windows. (AMD documents it higher but measurements suggest the effective prefetch window on curretn uarchs is still around 16 bytes) - on Intel there's also the uop-cache with cachelines that have 16-byte granularity and limited associativity. - older x86 uarchs had a penalty for decoder fetches that crossed 16-byte boundaries. These limits are mostly gone from recent uarchs. So if a forward jump target is aligned to cacheline boundary then prefetches will start from a new prefetch-cacheline and there's higher chance for decoding in fewer steps and packing tightly. But I think that argument is flawed for typical optimized kernel code flows: forward jumps often go to 'cold' (uncommon) pieces of code, and aligning cold code to cache lines does not bring a lot of advantages (they are uncommon), while it causes collateral damage: - their alignment 'spreads out' the cache footprint, it shifts followup hot code further out - plus it slows down even 'cold' code that immediately follows 'hot' code (like in the above case), which could have benefited from the partial cacheline that comes off the end of hot code. But even in the cache-hot case the 16 byte alignment brings disadvantages: - it spreads out the cache footprint, possibly making the code fall out of the L1 I$. - On Intel CPUs, recent microarchitectures have plenty of uop cache (typically doubling every 3 years) - while the size of the L1 cache grows much less aggressively. So workloads are rarely uop cache limited. The only situation where alignment might matter are tight loops that could fit into a single 16 byte chunk - but those are pretty rare in the kernel: if they exist they tend to be pointer chasing or generic memory ops, which both tend to be cache miss (or cache allocation) intensive and are not decoder bandwidth limited. So the balance of arguments strongly favors packing kernel instructions tightly versus maximizing for decoder bandwidth: this patch changes the jump target alignment from 16 bytes to 1 byte (tightly packed, unaligned). Acked-by: Denys Vlasenko <dvlasenk@redhat.com> Cc: Andy Lutomirski <luto@amacapital.net> Cc: Aswin Chandramouleeswaran <aswin@hp.com> Cc: Borislav Petkov <bp@alien8.de> Cc: Brian Gerst <brgerst@gmail.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Jason Low <jason.low2@hp.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Tim Chen <tim.c.chen@linux.intel.com> Link: http://lkml.kernel.org/r/20150410120846.GA17101@gmail.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-04-10 20:08:46 +08:00
x86: Pack loops tightly as well Packing loops tightly (-falign-loops=1) is beneficial to code size: text data bss dec filename 12566391 1617840 1089536 15273767 vmlinux.align.16-byte 12224951 1617840 1089536 14932327 vmlinux.align.1-byte 11976567 1617840 1089536 14683943 vmlinux.align.1-byte.funcs-1-byte 11903735 1617840 1089536 14611111 vmlinux.align.1-byte.funcs-1-byte.loops-1-byte Which reduces the size of the kernel by another 0.6%, so the the total combined size reduction of the alignment-packing patches is ~5.5%. The x86 decoder bandwidth and caching arguments laid out in: be6cb02779ca ("x86: Align jump targets to 1-byte boundaries") apply to loop alignment as well. Furtermore, modern CPU uarchs have a loop cache/buffer that is a L0 cache before even any uop cache, covering a few dozen most recently executed instructions. This loop cache generally does not have the 16-byte alignment restrictions of the uop cache. Now loop alignment can still be beneficial if: - a loop is cache-hot and its surroundings are not. - if the loop is so cache hot that the instruction flow becomes x86 decoder bandwidth limited But loop alignment is harmful if: - a loop is cache-cold - a loop's surroundings are cache-hot as well - two cache-hot loops are close to each other - if the loop fits into the loop cache - if the code flow is not decoder bandwidth limited and I'd argue that the latter five scenarios are much more common in the kernel, as our hottest loops are typically: - pointer chasing: this should fit into the loop cache in most cases and is typically data cache and address generation limited - generic memory ops (memset, memcpy, etc.): these generally fit into the loop cache as well, and are likewise data cache limited. So this patch packs loop addresses tightly as well. Acked-by: Denys Vlasenko <dvlasenk@redhat.com> Cc: Andy Lutomirski <luto@amacapital.net> Cc: Aswin Chandramouleeswaran <aswin@hp.com> Cc: Borislav Petkov <bp@alien8.de> Cc: Brian Gerst <brgerst@gmail.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Jason Low <jason.low2@hp.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Tim Chen <tim.c.chen@linux.intel.com> Link: http://lkml.kernel.org/r/20150410123017.GB19918@gmail.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-05-17 13:56:54 +08:00
# Pack loops tightly as well:
KBUILD_CFLAGS += $(call cc-option,-falign-loops=1)
x86: Pack loops tightly as well Packing loops tightly (-falign-loops=1) is beneficial to code size: text data bss dec filename 12566391 1617840 1089536 15273767 vmlinux.align.16-byte 12224951 1617840 1089536 14932327 vmlinux.align.1-byte 11976567 1617840 1089536 14683943 vmlinux.align.1-byte.funcs-1-byte 11903735 1617840 1089536 14611111 vmlinux.align.1-byte.funcs-1-byte.loops-1-byte Which reduces the size of the kernel by another 0.6%, so the the total combined size reduction of the alignment-packing patches is ~5.5%. The x86 decoder bandwidth and caching arguments laid out in: be6cb02779ca ("x86: Align jump targets to 1-byte boundaries") apply to loop alignment as well. Furtermore, modern CPU uarchs have a loop cache/buffer that is a L0 cache before even any uop cache, covering a few dozen most recently executed instructions. This loop cache generally does not have the 16-byte alignment restrictions of the uop cache. Now loop alignment can still be beneficial if: - a loop is cache-hot and its surroundings are not. - if the loop is so cache hot that the instruction flow becomes x86 decoder bandwidth limited But loop alignment is harmful if: - a loop is cache-cold - a loop's surroundings are cache-hot as well - two cache-hot loops are close to each other - if the loop fits into the loop cache - if the code flow is not decoder bandwidth limited and I'd argue that the latter five scenarios are much more common in the kernel, as our hottest loops are typically: - pointer chasing: this should fit into the loop cache in most cases and is typically data cache and address generation limited - generic memory ops (memset, memcpy, etc.): these generally fit into the loop cache as well, and are likewise data cache limited. So this patch packs loop addresses tightly as well. Acked-by: Denys Vlasenko <dvlasenk@redhat.com> Cc: Andy Lutomirski <luto@amacapital.net> Cc: Aswin Chandramouleeswaran <aswin@hp.com> Cc: Borislav Petkov <bp@alien8.de> Cc: Brian Gerst <brgerst@gmail.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Jason Low <jason.low2@hp.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Tim Chen <tim.c.chen@linux.intel.com> Link: http://lkml.kernel.org/r/20150410123017.GB19918@gmail.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-05-17 13:56:54 +08:00
# Don't autogenerate traditional x87 instructions
KBUILD_CFLAGS += $(call cc-option,-mno-80387)
KBUILD_CFLAGS += $(call cc-option,-mno-fp-ret-in-387)
# Use -mpreferred-stack-boundary=3 if supported.
KBUILD_CFLAGS += $(call cc-option,-mpreferred-stack-boundary=3)
# Use -mskip-rax-setup if supported.
KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup)
# FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu)
cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8)
cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona)
cflags-$(CONFIG_MCORE2) += \
$(call cc-option,-march=core2,$(call cc-option,-mtune=generic))
cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom) \
$(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic))
cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic)
KBUILD_CFLAGS += $(cflags-y)
KBUILD_CFLAGS += -mno-red-zone
KBUILD_CFLAGS += -mcmodel=kernel
# -funit-at-a-time shrinks the kernel .text considerably
# unfortunately it makes reading oopses harder.
KBUILD_CFLAGS += $(call cc-option,-funit-at-a-time)
endif
ifdef CONFIG_X86_X32
x32_ld_ok := $(call try-run,\
/bin/echo -e '1: .quad 1b' | \
$(CC) $(KBUILD_AFLAGS) -c -x assembler -o "$$TMP" - && \
$(OBJCOPY) -O elf32-x86-64 "$$TMP" "$$TMPO" && \
$(LD) -m elf32_x86_64 "$$TMPO" -o "$$TMP",y,n)
ifeq ($(x32_ld_ok),y)
CONFIG_X86_X32_ABI := y
KBUILD_AFLAGS += -DCONFIG_X86_X32_ABI
KBUILD_CFLAGS += -DCONFIG_X86_X32_ABI
else
$(warning CONFIG_X86_X32 enabled but no binutils support)
endif
endif
export CONFIG_X86_X32_ABI
# Don't unroll struct assignments with kmemcheck enabled
ifeq ($(CONFIG_KMEMCHECK),y)
KBUILD_CFLAGS += $(call cc-option,-fno-builtin-memcpy)
endif
x86/build: Mostly disable '-maccumulate-outgoing-args' The GCC '-maccumulate-outgoing-args' flag is enabled for most configs, mostly because of issues which are no longer relevant. For most configs, and with most recent versions of GCC, it's no longer needed. Clarify which cases need it, and only enable it for those cases. Also produce a compile-time error for the ftrace graph + mcount + '-Os' case, which will otherwise cause runtime failures. The main benefit of '-maccumulate-outgoing-args' is that it prevents an ugly prologue for functions which have aligned stacks. But removing the option also has some benefits: more readable argument saves, smaller text size, and (presumably) slightly improved performance. Here are the object size savings for 32-bit and 64-bit defconfig kernels: text data bss dec hex filename 10006710 3543328 1773568 15323606 e9d1d6 vmlinux.x86-32.before 9706358 3547424 1773568 15027350 e54c96 vmlinux.x86-32.after text data bss dec hex filename 10652105 4537576 843776 16033457 f4a6b1 vmlinux.x86-64.before 10639629 4537576 843776 16020981 f475f5 vmlinux.x86-64.after That comes out to a 3% text size improvement on x86-32 and a 0.1% text size improvement on x86-64. Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com> Cc: Andrew Lutomirski <luto@kernel.org> Cc: Andy Lutomirski <luto@amacapital.net> Cc: Borislav Petkov <bp@alien8.de> Cc: Brian Gerst <brgerst@gmail.com> Cc: Denys Vlasenko <dvlasenk@redhat.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Pavel Machek <pavel@ucw.cz> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Thomas Gleixner <tglx@linutronix.de> Link: http://lkml.kernel.org/r/20170316193133.zrj6gug53766m6nn@treble Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-03-17 03:31:33 +08:00
#
# If the function graph tracer is used with mcount instead of fentry,
# '-maccumulate-outgoing-args' is needed to prevent a GCC bug
# (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=42109)
#
ifdef CONFIG_FUNCTION_GRAPH_TRACER
ifndef CONFIG_HAVE_FENTRY
ACCUMULATE_OUTGOING_ARGS := 1
else
ifeq ($(call cc-option-yn, -mfentry), n)
ACCUMULATE_OUTGOING_ARGS := 1
# GCC ignores '-maccumulate-outgoing-args' when used with '-Os'.
# If '-Os' is enabled, disable it and print a warning.
ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE
undefine CONFIG_CC_OPTIMIZE_FOR_SIZE
$(warning Disabling CONFIG_CC_OPTIMIZE_FOR_SIZE. Your compiler does not have -mfentry so you cannot optimize for size with CONFIG_FUNCTION_GRAPH_TRACER.)
endif
x86/build: Mostly disable '-maccumulate-outgoing-args' The GCC '-maccumulate-outgoing-args' flag is enabled for most configs, mostly because of issues which are no longer relevant. For most configs, and with most recent versions of GCC, it's no longer needed. Clarify which cases need it, and only enable it for those cases. Also produce a compile-time error for the ftrace graph + mcount + '-Os' case, which will otherwise cause runtime failures. The main benefit of '-maccumulate-outgoing-args' is that it prevents an ugly prologue for functions which have aligned stacks. But removing the option also has some benefits: more readable argument saves, smaller text size, and (presumably) slightly improved performance. Here are the object size savings for 32-bit and 64-bit defconfig kernels: text data bss dec hex filename 10006710 3543328 1773568 15323606 e9d1d6 vmlinux.x86-32.before 9706358 3547424 1773568 15027350 e54c96 vmlinux.x86-32.after text data bss dec hex filename 10652105 4537576 843776 16033457 f4a6b1 vmlinux.x86-64.before 10639629 4537576 843776 16020981 f475f5 vmlinux.x86-64.after That comes out to a 3% text size improvement on x86-32 and a 0.1% text size improvement on x86-64. Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com> Cc: Andrew Lutomirski <luto@kernel.org> Cc: Andy Lutomirski <luto@amacapital.net> Cc: Borislav Petkov <bp@alien8.de> Cc: Brian Gerst <brgerst@gmail.com> Cc: Denys Vlasenko <dvlasenk@redhat.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Pavel Machek <pavel@ucw.cz> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Thomas Gleixner <tglx@linutronix.de> Link: http://lkml.kernel.org/r/20170316193133.zrj6gug53766m6nn@treble Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-03-17 03:31:33 +08:00
endif
endif
endif
#
# Jump labels need '-maccumulate-outgoing-args' for gcc < 4.5.2 to prevent a
# GCC bug (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=46226). There's no way
# to test for this bug at compile-time because the test case needs to execute,
# which is a no-go for cross compilers. So check the GCC version instead.
#
ifdef CONFIG_JUMP_LABEL
ifneq ($(ACCUMULATE_OUTGOING_ARGS), 1)
ACCUMULATE_OUTGOING_ARGS = $(call cc-if-fullversion, -lt, 040502, 1)
endif
endif
ifeq ($(ACCUMULATE_OUTGOING_ARGS), 1)
KBUILD_CFLAGS += -maccumulate-outgoing-args
endif
# Stackpointer is addressed different for 32 bit and 64 bit x86
sp-$(CONFIG_X86_32) := esp
sp-$(CONFIG_X86_64) := rsp
# do binutils support CFI?
cfi := $(call as-instr,.cfi_startproc\n.cfi_rel_offset $(sp-y)$(comma)0\n.cfi_endproc,-DCONFIG_AS_CFI=1)
# is .cfi_signal_frame supported too?
cfi-sigframe := $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1)
cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTIONS=1)
# does binutils support specific instructions?
asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1)
asinstr += $(call as-instr,pshufb %xmm0$(comma)%xmm0,-DCONFIG_AS_SSSE3=1)
asinstr += $(call as-instr,crc32l %eax$(comma)%eax,-DCONFIG_AS_CRC32=1)
avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1)
avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1)
avx512_instr :=$(call as-instr,vpmovm2b %k1$(comma)%zmm5,-DCONFIG_AS_AVX512=1)
sha1_ni_instr :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA1_NI=1)
sha256_ni_instr :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA256_NI=1)
KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr)
KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr)
LDFLAGS := -m elf_$(UTS_MACHINE)
# Speed up the build
KBUILD_CFLAGS += -pipe
# Workaround for a gcc prelease that unfortunately was shipped in a suse release
KBUILD_CFLAGS += -Wno-sign-compare
#
KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
KBUILD_CFLAGS += $(mflags-y)
KBUILD_AFLAGS += $(mflags-y)
archscripts: scripts_basic
$(Q)$(MAKE) $(build)=arch/x86/tools relocs
###
# Syscall table generation
archheaders:
$(Q)$(MAKE) $(build)=arch/x86/entry/syscalls all
archprepare:
kexec: create a new config option CONFIG_KEXEC_FILE for new syscall Currently new system call kexec_file_load() and all the associated code compiles if CONFIG_KEXEC=y. But new syscall also compiles purgatory code which currently uses gcc option -mcmodel=large. This option seems to be available only gcc 4.4 onwards. Hiding new functionality behind a new config option will not break existing users of old gcc. Those who wish to enable new functionality will require new gcc. Having said that, I am trying to figure out how can I move away from using -mcmodel=large but that can take a while. I think there are other advantages of introducing this new config option. As this option will be enabled only on x86_64, other arches don't have to compile generic kexec code which will never be used. This new code selects CRYPTO=y and CRYPTO_SHA256=y. And all other arches had to do this for CONFIG_KEXEC. Now with introduction of new config option, we can remove crypto dependency from other arches. Now CONFIG_KEXEC_FILE is available only on x86_64. So whereever I had CONFIG_X86_64 defined, I got rid of that. For CONFIG_KEXEC_FILE, instead of doing select CRYPTO=y, I changed it to "depends on CRYPTO=y". This should be safer as "select" is not recursive. Signed-off-by: Vivek Goyal <vgoyal@redhat.com> Cc: Eric Biederman <ebiederm@xmission.com> Cc: H. Peter Anvin <hpa@zytor.com> Tested-by: Shaun Ruffell <sruffell@digium.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-30 06:18:46 +08:00
ifeq ($(CONFIG_KEXEC_FILE),y)
$(Q)$(MAKE) $(build)=arch/x86/purgatory arch/x86/purgatory/kexec-purgatory.c
endif
###
# Kernel objects
head-y := arch/x86/kernel/head_$(BITS).o
head-y += arch/x86/kernel/head$(BITS).o
head-y += arch/x86/kernel/ebda.o
x86/rtc: Replace paravirt rtc check with platform legacy quirk We have 4 types of x86 platforms that disable RTC: * Intel MID * Lguest - uses paravirt * Xen dom-U - uses paravirt * x86 on legacy systems annotated with an ACPI legacy flag We can consolidate all of these into a platform specific legacy quirk set early in boot through i386_start_kernel() and through x86_64_start_reservations(). This deals with the RTC quirks which we can rely on through the hardware subarch, the ACPI check can be dealt with separately. For Xen things are bit more complex given that the @X86_SUBARCH_XEN x86_hardware_subarch is shared on for Xen which uses the PV path for both domU and dom0. Since the semantics for differentiating between the two are Xen specific we provide a platform helper to help override default legacy features -- x86_platform.set_legacy_features(). Use of this helper is highly discouraged, its only purpose should be to account for the lack of semantics available within your given x86_hardware_subarch. As per 0-day, this bumps the vmlinux size using i386-tinyconfig as follows: TOTAL TEXT init.text x86_early_init_platform_quirks() +70 +62 +62 +43 Only 8 bytes overhead total, as the main increase in size is all removed via __init. Suggested-by: Ingo Molnar <mingo@kernel.org> Signed-off-by: Luis R. Rodriguez <mcgrof@kernel.org> Reviewed-by: Juergen Gross <jgross@suse.com> Cc: Andy Lutomirski <luto@amacapital.net> Cc: Borislav Petkov <bp@alien8.de> Cc: Brian Gerst <brgerst@gmail.com> Cc: Denys Vlasenko <dvlasenk@redhat.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: andrew.cooper3@citrix.com Cc: andriy.shevchenko@linux.intel.com Cc: bigeasy@linutronix.de Cc: boris.ostrovsky@oracle.com Cc: david.vrabel@citrix.com Cc: ffainelli@freebox.fr Cc: george.dunlap@citrix.com Cc: glin@suse.com Cc: jlee@suse.com Cc: josh@joshtriplett.org Cc: julien.grall@linaro.org Cc: konrad.wilk@oracle.com Cc: kozerkov@parallels.com Cc: lenb@kernel.org Cc: lguest@lists.ozlabs.org Cc: linux-acpi@vger.kernel.org Cc: lv.zheng@intel.com Cc: matt@codeblueprint.co.uk Cc: mbizon@freebox.fr Cc: rjw@rjwysocki.net Cc: robert.moore@intel.com Cc: rusty@rustcorp.com.au Cc: tiwai@suse.de Cc: toshi.kani@hp.com Cc: xen-devel@lists.xensource.com Link: http://lkml.kernel.org/r/1460592286-300-5-git-send-email-mcgrof@kernel.org Signed-off-by: Ingo Molnar <mingo@kernel.org>
2016-04-14 08:04:34 +08:00
head-y += arch/x86/kernel/platform-quirks.o
libs-y += arch/x86/lib/
# See arch/x86/Kbuild for content of core part of the kernel
core-y += arch/x86/
# drivers-y are linked after core-y
drivers-$(CONFIG_MATH_EMULATION) += arch/x86/math-emu/
drivers-$(CONFIG_PCI) += arch/x86/pci/
# must be linked after kernel/
drivers-$(CONFIG_OPROFILE) += arch/x86/oprofile/
# suspend and hibernation support
drivers-$(CONFIG_PM) += arch/x86/power/
drivers-$(CONFIG_FB) += arch/x86/video/
drivers-$(CONFIG_RAS) += arch/x86/ras/
####
# boot loader support. Several targets are kept for legacy purposes
boot := arch/x86/boot
BOOT_TARGETS = bzlilo bzdisk fdimage fdimage144 fdimage288 isoimage
PHONY += bzImage $(BOOT_TARGETS)
# Default kernel to build
all: bzImage
# KBUILD_IMAGE specify target image being built
KBUILD_IMAGE := $(boot)/bzImage
bzImage: vmlinux
ifeq ($(CONFIG_X86_DECODER_SELFTEST),y)
$(Q)$(MAKE) $(build)=arch/x86/tools posttest
endif
$(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE)
$(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot
$(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/$@
$(BOOT_TARGETS): vmlinux
$(Q)$(MAKE) $(build)=$(boot) $@
PHONY += install
install:
$(Q)$(MAKE) $(build)=$(boot) $@
PHONY += vdso_install
vdso_install:
$(Q)$(MAKE) $(build)=arch/x86/entry/vdso $@
archclean:
$(Q)rm -rf $(objtree)/arch/i386
$(Q)rm -rf $(objtree)/arch/x86_64
$(Q)$(MAKE) $(clean)=$(boot)
$(Q)$(MAKE) $(clean)=arch/x86/tools
$(Q)$(MAKE) $(clean)=arch/x86/purgatory
define archhelp
echo '* bzImage - Compressed kernel image (arch/x86/boot/bzImage)'
echo ' install - Install kernel using'
kbuild: use INSTALLKERNEL to select customized installkernel script Replace the use of CROSS_COMPILE to select a customized installkernel script with the possibility to set INSTALLKERNEL to select a custom installkernel script when running make: make INSTALLKERNEL=arm-installkernel install With this patch we are now more consistent across different architectures - they did not all support use of CROSS_COMPILE. The use of CROSS_COMPILE was a hack as this really belongs to gcc/binutils and the installkernel script does not change just because we change toolchain. The use of CROSS_COMPILE caused troubles with an upcoming patch that saves CROSS_COMPILE when a kernel is built - it would no longer be installable. [Thanks to Peter Z. for this hint] This patch undos what Ian did in commit: 0f8e2d62fa04441cd12c08ce521e84e5bd3f8a46 ("use ${CROSS_COMPILE}installkernel in arch/*/boot/install.sh") The patch has been lightly tested on x86 only - but all changes looks obvious. Acked-by: Peter Zijlstra <peterz@infradead.org> Acked-by: Mike Frysinger <vapier@gentoo.org> [blackfin] Acked-by: Russell King <linux@arm.linux.org.uk> [arm] Acked-by: Paul Mundt <lethal@linux-sh.org> [sh] Acked-by: "H. Peter Anvin" <hpa@zytor.com> [x86] Cc: Ian Campbell <icampbell@arcom.com> Cc: Tony Luck <tony.luck@intel.com> [ia64] Cc: Fenghua Yu <fenghua.yu@intel.com> [ia64] Cc: Hirokazu Takata <takata@linux-m32r.org> [m32r] Cc: Geert Uytterhoeven <geert@linux-m68k.org> [m68k] Cc: Kyle McMartin <kyle@mcmartin.ca> [parisc] Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> [powerpc] Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> [s390] Cc: Thomas Gleixner <tglx@linutronix.de> [x86] Cc: Ingo Molnar <mingo@redhat.com> [x86] Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
2009-07-21 03:37:11 +08:00
echo ' (your) ~/bin/$(INSTALLKERNEL) or'
echo ' (distribution) /sbin/$(INSTALLKERNEL) or'
echo ' install to $$(INSTALL_PATH) and run lilo'
echo ' fdimage - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)'
echo ' fdimage144 - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)'
echo ' fdimage288 - Create 2.8MB boot floppy image (arch/x86/boot/fdimage)'
echo ' isoimage - Create a boot CD-ROM image (arch/x86/boot/image.iso)'
echo ' bzdisk/fdimage*/isoimage also accept:'
echo ' FDARGS="..." arguments for the booted kernel'
echo ' FDINITRD=file initrd for the booted kernel'
endef