diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 37a7cdd779d5..888af176a86d 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -288,6 +288,13 @@ def FeatureERMSB
           "ermsb", "HasERMSB", "true",
           "REP MOVS/STOS are fast">;
 
+// Sandy Bridge and newer processors have many instructions that can be
+// fused with conditional branches and pass through the CPU as a single
+// operation.
+def FeatureMacroFusion
+    : SubtargetFeature<"macrofusion", "HasMacroFusion", "true",
+                       "Various instructions can be fused with conditional branches">;
+
 //===----------------------------------------------------------------------===//
 // X86 processors supported.
 //===----------------------------------------------------------------------===//
@@ -372,7 +379,8 @@ def : ProcessorModel<"core2", SandyBridgeModel, [
   FeatureFXSR,
   FeatureCMPXCHG16B,
   FeatureSlowBTMem,
-  FeatureLAHFSAHF
+  FeatureLAHFSAHF,
+  FeatureMacroFusion
 ]>;
 def : ProcessorModel<"penryn", SandyBridgeModel, [
   FeatureX87,
@@ -382,7 +390,8 @@ def : ProcessorModel<"penryn", SandyBridgeModel, [
   FeatureFXSR,
   FeatureCMPXCHG16B,
   FeatureSlowBTMem,
-  FeatureLAHFSAHF
+  FeatureLAHFSAHF,
+  FeatureMacroFusion
 ]>;
 
 // Atom CPUs.
@@ -468,7 +477,8 @@ class NehalemProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [
   FeatureCMPXCHG16B,
   FeatureSlowBTMem,
   FeaturePOPCNT,
-  FeatureLAHFSAHF
+  FeatureLAHFSAHF,
+  FeatureMacroFusion
 ]>;
 def : NehalemProc<"nehalem">;
 def : NehalemProc<"corei7">;
@@ -485,7 +495,8 @@ class WestmereProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [
   FeaturePOPCNT,
   FeatureAES,
   FeaturePCLMUL,
-  FeatureLAHFSAHF
+  FeatureLAHFSAHF,
+  FeatureMacroFusion
 ]>;
 def : WestmereProc<"westmere">;
@@ -516,7 +527,8 @@ def SNBFeatures : ProcessorFeatures<[], [
   FeatureLAHFSAHF,
   FeatureSlow3OpsLEA,
   FeatureFastScalarFSQRT,
-  FeatureFastSHLDRotate
+  FeatureFastSHLDRotate,
+  FeatureMacroFusion
 ]>;
 
 class SandyBridgeProc<string Name> : ProcModel<Name, SandyBridgeModel,
@@ -755,7 +768,8 @@ def : Proc<"bdver2", [
   FeatureLWP,
   FeatureFMA,
   FeatureSlowSHLD,
-  FeatureLAHFSAHF
+  FeatureLAHFSAHF,
+  FeatureMacroFusion
 ]>;
 
 // Steamroller
@@ -782,7 +796,8 @@ def : Proc<"bdver3", [
   FeatureXSAVEOPT,
   FeatureSlowSHLD,
   FeatureFSGSBase,
-  FeatureLAHFSAHF
+  FeatureLAHFSAHF,
+  FeatureMacroFusion
 ]>;
 
 // Excavator
@@ -810,7 +825,8 @@ def : Proc<"bdver4", [
   FeatureSlowSHLD,
   FeatureFSGSBase,
   FeatureLAHFSAHF,
-  FeatureMWAITX
+  FeatureMWAITX,
+  FeatureMacroFusion
 ]>;
 
 // Znver1
@@ -830,6 +846,7 @@ def: ProcessorModel<"znver1", Znver1Model, [
   FeatureFastLZCNT,
   FeatureLAHFSAHF,
   FeatureLZCNT,
+  FeatureMacroFusion,
   FeatureMMX,
   FeatureMOVBE,
   FeatureMWAITX,
@@ -873,7 +890,8 @@ def : ProcessorModel<"x86-64", SandyBridgeModel, [
   Feature64Bit,
   FeatureSlow3OpsLEA,
   FeatureSlowBTMem,
-  FeatureSlowIncDec
+  FeatureSlowIncDec,
+  FeatureMacroFusion
 ]>;
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/X86/X86MacroFusion.cpp b/llvm/lib/Target/X86/X86MacroFusion.cpp
index 8fdf10617059..d3ef7aa8d6c6 100644
--- a/llvm/lib/Target/X86/X86MacroFusion.cpp
+++ b/llvm/lib/Target/X86/X86MacroFusion.cpp
@@ -27,10 +27,8 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
                                    const MachineInstr *FirstMI,
                                    const MachineInstr &SecondMI) {
   const X86Subtarget &ST = static_cast<const X86Subtarget &>(TSI);
-  // Check if this processor supports macro-fusion. Since this is a minor
-  // heuristic, we haven't specifically reserved a feature. hasAVX is a decent
-  // proxy for SandyBridge+.
-  if (!ST.hasAVX())
+  // Check if this processor supports macro-fusion.
+  if (!ST.hasMacroFusion())
     return false;
 
   enum {
diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp
index 6ad6da95d7b0..2a7733996c4b 100644
--- a/llvm/lib/Target/X86/X86Subtarget.cpp
+++ b/llvm/lib/Target/X86/X86Subtarget.cpp
@@ -347,6 +347,7 @@ void X86Subtarget::initializeEnvironment() {
   HasFastVectorFSQRT = false;
   HasFastLZCNT = false;
   HasFastSHLDRotate = false;
+  HasMacroFusion = false;
   HasERMSB = false;
   HasSlowDivide32 = false;
   HasSlowDivide64 = false;
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
index 8b869022d761..7c85e9c2eee0 100644
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -238,6 +238,9 @@ protected:
   /// True if SHLD based rotate is fast.
   bool HasFastSHLDRotate;
 
+  /// True if the processor supports macrofusion.
+  bool HasMacroFusion;
+
   /// True if the processor has enhanced REP MOVSB/STOSB.
   bool HasERMSB;
 
@@ -488,6 +491,7 @@ public:
   bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; }
   bool hasFastLZCNT() const { return HasFastLZCNT; }
   bool hasFastSHLDRotate() const { return HasFastSHLDRotate; }
+  bool hasMacroFusion() const { return HasMacroFusion; }
   bool hasERMSB() const { return HasERMSB; }
   bool hasSlowDivide32() const { return HasSlowDivide32; }
   bool hasSlowDivide64() const { return HasSlowDivide64; }
diff --git a/llvm/test/CodeGen/X86/avx-select.ll b/llvm/test/CodeGen/X86/avx-select.ll
index 7484f8257ca1..f5ab0cab17f5 100644
--- a/llvm/test/CodeGen/X86/avx-select.ll
+++ b/llvm/test/CodeGen/X86/avx-select.ll
@@ -16,8 +16,8 @@ define <8 x i32> @select00(i32 %a, <8 x i32> %b) nounwind {
 ;
 ; X64-LABEL: select00:
 ; X64:       # BB#0:
-; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X64-NEXT:    cmpl $255, %edi
+; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X64-NEXT:    je .LBB0_2
 ; X64-NEXT:  # BB#1:
 ; X64-NEXT:    vmovaps %ymm0, %ymm1
@@ -44,8 +44,8 @@ define <4 x i64> @select01(i32 %a, <4 x i64> %b) nounwind {
 ;
 ; X64-LABEL: select01:
 ; X64:       # BB#0:
-; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X64-NEXT:    cmpl $255, %edi
+; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X64-NEXT:    je .LBB1_2
 ; X64-NEXT:  # BB#1:
 ; X64-NEXT:    vmovaps %ymm0, %ymm1
diff --git a/llvm/test/CodeGen/X86/avx-splat.ll b/llvm/test/CodeGen/X86/avx-splat.ll
index 91d1f64c6706..0f3f3e5fb6e3 100644
--- a/llvm/test/CodeGen/X86/avx-splat.ll
+++ b/llvm/test/CodeGen/X86/avx-splat.ll
@@ -60,8 +60,8 @@ define <8 x float> @funcE() nounwind {
 ; CHECK-LABEL: funcE:
 ; CHECK:       # BB#0: # %for_exit499
 ; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    # implicit-def: %YMM0
 ; CHECK-NEXT:    testb %al, %al
+; CHECK-NEXT:    # implicit-def: %YMM0
 ; CHECK-NEXT:    jne .LBB4_2
 ; CHECK-NEXT:  # BB#1: # %load.i1247
 ; CHECK-NEXT:    pushq %rbp
diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll
index f6d752ddc3c8..77a2a021416f 100644
--- a/llvm/test/CodeGen/X86/avx512-mask-op.ll
+++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll
@@ -692,8 +692,8 @@ define <16 x i8> @test8(<16 x i32>%a, <16 x i32>%b, i32 %a1, i32 %b1) {
 ;
 ; AVX512BW-LABEL: test8:
 ; AVX512BW:       ## BB#0:
-; AVX512BW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX512BW-NEXT:    cmpl %esi, %edi
+; AVX512BW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX512BW-NEXT:    jg LBB17_1
 ; AVX512BW-NEXT:  ## BB#2:
 ; AVX512BW-NEXT:    vpcmpltud %zmm2, %zmm1, %k0
@@ -708,8 +708,8 @@ define <16 x i8> @test8(<16 x i32>%a, <16 x i32>%b, i32 %a1, i32 %b1) {
 ;
 ; AVX512DQ-LABEL: test8:
 ; AVX512DQ:       ## BB#0:
-; AVX512DQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX512DQ-NEXT:    cmpl %esi, %edi
+; AVX512DQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX512DQ-NEXT:    jg LBB17_1
 ; AVX512DQ-NEXT:  ## BB#2:
 ; AVX512DQ-NEXT:    vpcmpltud %zmm2, %zmm1, %k0
diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
index 7cb1c95cb01a..3e36969f879c 100644
--- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
@@ -1678,8 +1678,8 @@ define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) {
 ; VEX-NEXT:    vaddss %xmm0, %xmm0, %xmm0
 ; VEX-NEXT:  .LBB39_6:
 ; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; VEX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; VEX-NEXT:    testq %rax, %rax
+; VEX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; VEX-NEXT:    js .LBB39_8
 ; VEX-NEXT:  # BB#7:
 ; VEX-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm1
@@ -1914,8 +1914,8 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) {
 ; VEX-NEXT:    vaddss %xmm0, %xmm0, %xmm0
 ; VEX-NEXT:  .LBB41_6:
 ; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; VEX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; VEX-NEXT:    testq %rax, %rax
+; VEX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; VEX-NEXT:    js .LBB41_8
 ; VEX-NEXT:  # BB#7:
 ; VEX-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm1
diff --git a/llvm/test/CodeGen/X86/x86-cmov-converter.ll b/llvm/test/CodeGen/X86/x86-cmov-converter.ll
index cdb8894bfd91..5fec1380e14b 100644
--- a/llvm/test/CodeGen/X86/x86-cmov-converter.ll
+++ b/llvm/test/CodeGen/X86/x86-cmov-converter.ll
@@ -296,9 +296,9 @@ while.end: ; preds = %while.body, %entry
 ; CHECK-LABEL: Transform
 ; CHECK-NOT: cmov
 ; CHECK: divl [[a:%[0-9a-z]*]]
-; CHECK: cmpl [[a]], %eax
 ; CHECK: movl $11, [[s1:%[0-9a-z]*]]
 ; CHECK: movl [[a]], [[s2:%[0-9a-z]*]]
+; CHECK: cmpl [[a]], %edx
 ; CHECK: ja [[SinkBB:.*]]
 ; CHECK: [[FalseBB:.*]]:
 ; CHECK: movl $22, [[s1]]
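
Illustration, not part of the patch: with the feature in place, opting a CPU into macro-fusion is a one-line addition to its feature list. The "mycpu" name and its abbreviated feature list below are hypothetical; only FeatureMacroFusion comes from this change:

    def : ProcessorModel<"mycpu", SandyBridgeModel, [
      FeatureX87,
      FeatureCMPXCHG16B,
      FeatureLAHFSAHF,
      // Tell the machine scheduler to keep fusible cmp/test + jcc pairs adjacent.
      FeatureMacroFusion
    ]>;

Because the SubtargetFeature string is "macrofusion", the behavior should also be toggleable per compile via the standard attribute syntax, e.g. llc -mcpu=x86-64 -mattr=-macrofusion, which is handy for comparing schedules with fusion on and off.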