From 1a1d48a4a8fde49aedc045d894efe67173d59fe0 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 4 Aug 2015 16:15:14 +0200 Subject: [PATCH 1/3] linux/bitmap: Force inlining of bitmap weight functions With this config: http://busybox.net/~vda/kernel_config_OPTIMIZE_INLINING_and_Os gcc-4.7.2 generates many copies of these tiny functions: bitmap_weight (55 copies): 55 push %rbp 48 89 e5 mov %rsp,%rbp e8 3f 3a 8b 00 callq __bitmap_weight 5d pop %rbp c3 retq hweight_long (23 copies): 55 push %rbp e8 b5 65 8e 00 callq __sw_hweight64 48 89 e5 mov %rsp,%rbp 5d pop %rbp c3 retq See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66122 This patch fixes this via s/inline/__always_inline/ While at it, replaced two "__inline__" with usual "inline" (the rest of the source file uses the latter). text data bss dec filename 86971357 17195880 36659200 140826437 vmlinux.before 86971120 17195912 36659200 140826232 vmlinux Signed-off-by: Denys Vlasenko Cc: Andrew Morton Cc: David Rientjes Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Thomas Graf Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1438697716-28121-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- include/linux/bitmap.h | 2 +- include/linux/bitops.h | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index ea17cca9e685..9653fdb76a42 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -295,7 +295,7 @@ static inline int bitmap_full(const unsigned long *src, unsigned int nbits) return find_first_zero_bit(src, nbits) == nbits; } -static inline int bitmap_weight(const unsigned long *src, unsigned int nbits) +static __always_inline int bitmap_weight(const unsigned long *src, unsigned int nbits) { if (small_const_nbits(nbits)) return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits)); diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 297f5bda4fdf..e63553386ae7 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -57,7 +57,7 @@ extern unsigned long __sw_hweight64(__u64 w); (bit) < (size); \ (bit) = find_next_zero_bit((addr), (size), (bit) + 1)) -static __inline__ int get_bitmask_order(unsigned int count) +static inline int get_bitmask_order(unsigned int count) { int order; @@ -65,7 +65,7 @@ static __inline__ int get_bitmask_order(unsigned int count) return order; /* We could be slightly more clever with -1 here... */ } -static __inline__ int get_count_order(unsigned int count) +static inline int get_count_order(unsigned int count) { int order; @@ -75,7 +75,7 @@ static __inline__ int get_count_order(unsigned int count) return order; } -static inline unsigned long hweight_long(unsigned long w) +static __always_inline unsigned long hweight_long(unsigned long w) { return sizeof(w) == 4 ? hweight32(w) : hweight64(w); } From d14edb1648221e59fc9fd47127fcc57bf26d759f Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 4 Aug 2015 16:15:15 +0200 Subject: [PATCH 2/3] x86/hweight: Force inlining of __arch_hweight{32,64}() With this config: http://busybox.net/~vda/kernel_config_OPTIMIZE_INLINING_and_Os gcc-4.7.2 generates many copies of these tiny functions: __arch_hweight32 (35 copies): 55 push %rbp e8 66 9b 4a 00 callq __sw_hweight32 48 89 e5 mov %rsp,%rbp 5d pop %rbp c3 retq __arch_hweight64 (8 copies): 55 push %rbp e8 5e c2 8a 00 callq __sw_hweight64 48 89 e5 mov %rsp,%rbp 5d pop %rbp c3 retq See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66122 This patch fixes this via s/inline/__always_inline/ To avoid touching 32-bit case where such change was not tested to be a win, reformat __arch_hweight64() to have completely disjoint 64-bit and 32-bit implementations. IOW: made #ifdef / 32 bits and 64 bits instead of having #ifdef / #else / #endif inside a single function body. Only 64-bit __arch_hweight64() is __always_inline'd. text data bss dec filename 86971120 17195912 36659200 140826232 vmlinux.before 86970954 17195912 36659200 140826066 vmlinux Signed-off-by: Denys Vlasenko Cc: Andrew Morton Cc: David Rientjes Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Thomas Graf Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1438697716-28121-2-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/arch_hweight.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h index 9686c3d9ff73..259a7c1ef709 100644 --- a/arch/x86/include/asm/arch_hweight.h +++ b/arch/x86/include/asm/arch_hweight.h @@ -21,7 +21,7 @@ * ARCH_HWEIGHT_CFLAGS in for the respective * compiler switches. */ -static inline unsigned int __arch_hweight32(unsigned int w) +static __always_inline unsigned int __arch_hweight32(unsigned int w) { unsigned int res = 0; @@ -42,20 +42,23 @@ static inline unsigned int __arch_hweight8(unsigned int w) return __arch_hweight32(w & 0xff); } +#ifdef CONFIG_X86_32 static inline unsigned long __arch_hweight64(__u64 w) +{ + return __arch_hweight32((u32)w) + + __arch_hweight32((u32)(w >> 32)); +} +#else +static __always_inline unsigned long __arch_hweight64(__u64 w) { unsigned long res = 0; -#ifdef CONFIG_X86_32 - return __arch_hweight32((u32)w) + - __arch_hweight32((u32)(w >> 32)); -#else asm (ALTERNATIVE("call __sw_hweight64", POPCNT64, X86_FEATURE_POPCNT) : "="REG_OUT (res) : REG_IN (w)); -#endif /* CONFIG_X86_32 */ return res; } +#endif /* CONFIG_X86_32 */ #endif From accd0b9ec015d611eb7783dd86f1bb31bf8d62ab Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 4 Aug 2015 16:15:16 +0200 Subject: [PATCH 3/3] jiffies: Force inlining of {m,u}msecs_to_jiffies() With this config: http://busybox.net/~vda/kernel_config_OPTIMIZE_INLINING_and_Os gcc-4.7.2 generates many copies of these tiny functions: msecs_to_jiffies (45 copies): 55 push %rbp 48 89 e5 mov %rsp,%rbp e8 59 ec 03 00 callq __msecs_to_jiffies 5d pop %rbp c3 retq usecs_to_jiffies (10 copies): 55 push %rbp 48 89 e5 mov %rsp,%rbp e8 5d 54 5e ff callq __usecs_to_jiffies 5d pop %rbp c3 retq See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66122 This patch fixes this via s/inline/__always_inline/ text data bss dec filename 86970954 17195912 36659200 140826066 vmlinux.before 86966150 17195912 36659200 140821262 vmlinux Signed-off-by: Denys Vlasenko Cc: Andrew Morton Cc: David Rientjes Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Thomas Graf Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1438697716-28121-3-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- include/linux/jiffies.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 535fd3bb1ba8..1ba48a18c1d7 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -351,7 +351,7 @@ static inline unsigned long _msecs_to_jiffies(const unsigned int m) * directly here and from __msecs_to_jiffies() in the case where * constant folding is not possible. */ -static inline unsigned long msecs_to_jiffies(const unsigned int m) +static __always_inline unsigned long msecs_to_jiffies(const unsigned int m) { if (__builtin_constant_p(m)) { if ((int)m < 0) @@ -405,7 +405,7 @@ static inline unsigned long _usecs_to_jiffies(const unsigned int u) * directly here and from __msecs_to_jiffies() in the case where * constant folding is not possible. */ -static inline unsigned long usecs_to_jiffies(const unsigned int u) +static __always_inline unsigned long usecs_to_jiffies(const unsigned int u) { if (__builtin_constant_p(u)) { if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET))