diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 4a0a57ad426f..6574108df613 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -58,6 +58,17 @@ static cl::opt<bool>
 ReMatPICStubLoad("remat-pic-stub-load",
                  cl::desc("Re-materialize load from stub in PIC mode"),
                  cl::init(false), cl::Hidden);
+static cl::opt<unsigned>
+PartialRegUpdateClearance("partial-reg-update-clearance",
+                          cl::desc("Clearance between two register writes "
+                                   "for inserting XOR to avoid partial "
+                                   "register update"),
+                          cl::init(64), cl::Hidden);
+static cl::opt<unsigned>
+UndefRegClearance("undef-reg-clearance",
+                  cl::desc("How many idle instructions we would like before "
+                           "certain undef register reads"),
+                  cl::init(64), cl::Hidden);
 
 enum {
   // Select which memory operand is being unfolded.
@@ -5972,10 +5983,10 @@ getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum,
     return 0;
   }
 
-  // If any of the preceding 16 instructions are reading Reg, insert a
-  // dependency breaking instruction. The magic number is based on a few
-  // Nehalem experiments.
-  return 16;
+  // If any instructions in the clearance range are reading Reg, insert a
+  // dependency breaking instruction, which is inexpensive and is likely to
+  // be hidden in other instruction's cycles.
+  return PartialRegUpdateClearance;
 }
 
 // Return true for any instruction the copies the high bits of the first source
@@ -6060,8 +6071,7 @@ getUndefRegClearance(const MachineInstr *MI, unsigned &OpNum,
 
   const MachineOperand &MO = MI->getOperand(OpNum);
   if (MO.isUndef() && TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
-    // Use the same magic number as getPartialRegUpdateClearance.
-    return 16;
+    return UndefRegClearance;
   }
   return 0;
 }
diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
index 1540b272857f..6cb4b180b2e0 100644
--- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
@@ -1580,6 +1580,7 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
 ; AVX1-NEXT:  .LBB45_10:
 ; AVX1-NEXT:    shrq %rax
 ; AVX1-NEXT:    orq %rax, %rcx
+; AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm0
 ; AVX1-NEXT:    vaddss %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
@@ -1647,6 +1648,7 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
 ; AVX2-NEXT:  .LBB45_10:
 ; AVX2-NEXT:    shrq %rax
 ; AVX2-NEXT:    orq %rax, %rcx
+; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm0
 ; AVX2-NEXT:    vaddss %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
@@ -2773,6 +2775,7 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
 ; AVX1-NEXT:  .LBB74_10:
 ; AVX1-NEXT:    shrq %rax
 ; AVX1-NEXT:    orq %rax, %rcx
+; AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm0
 ; AVX1-NEXT:    vaddss %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
@@ -2841,6 +2844,7 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
 ; AVX2-NEXT:  .LBB74_10:
 ; AVX2-NEXT:    shrq %rax
 ; AVX2-NEXT:    orq %rax, %rcx
+; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm0
 ; AVX2-NEXT:    vaddss %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
@@ -2993,6 +2997,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
 ; SSE-NEXT:  .LBB78_10:
 ; SSE-NEXT:    shrq %rax
 ; SSE-NEXT:    orq %rax, %rcx
+; SSE-NEXT:    xorps %xmm5, %xmm5
 ; SSE-NEXT:    cvtsi2ssq %rcx, %xmm5
 ; SSE-NEXT:    addss %xmm5, %xmm5
 ; SSE-NEXT:  .LBB78_12:
@@ -3016,11 +3021,13 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
 ; SSE-NEXT:    testq %rax, %rax
 ; SSE-NEXT:    js .LBB78_16
 ; SSE-NEXT:  # BB#17:
+; SSE-NEXT:    xorps %xmm1, %xmm1
 ; SSE-NEXT:    cvtsi2ssq %rax, %xmm1
 ; SSE-NEXT:    jmp .LBB78_18
 ; SSE-NEXT:  .LBB78_16:
 ; SSE-NEXT:    shrq %rax
 ; SSE-NEXT:    orq %rax, %rcx
+; SSE-NEXT:    xorps %xmm1, %xmm1
 ; SSE-NEXT:    cvtsi2ssq %rcx, %xmm1
 ; SSE-NEXT:    addss %xmm1, %xmm1
 ; SSE-NEXT:  .LBB78_18:
@@ -3165,11 +3172,13 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
 ; AVX1-NEXT:    testq %rax, %rax
 ; AVX1-NEXT:    js .LBB78_19
 ; AVX1-NEXT:  # BB#20:
+; AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm5
 ; AVX1-NEXT:    jmp .LBB78_21
 ; AVX1-NEXT:  .LBB78_19:
 ; AVX1-NEXT:    shrq %rax
 ; AVX1-NEXT:    orq %rax, %rcx
+; AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm0
 ; AVX1-NEXT:    vaddss %xmm0, %xmm0, %xmm5
 ; AVX1-NEXT:  .LBB78_21:
@@ -3292,11 +3301,13 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
 ; AVX2-NEXT:    testq %rax, %rax
 ; AVX2-NEXT:    js .LBB78_19
 ; AVX2-NEXT:  # BB#20:
+; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm5
 ; AVX2-NEXT:    jmp .LBB78_21
 ; AVX2-NEXT:  .LBB78_19:
 ; AVX2-NEXT:    shrq %rax
 ; AVX2-NEXT:    orq %rax, %rcx
+; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm0
 ; AVX2-NEXT:    vaddss %xmm0, %xmm0, %xmm5
 ; AVX2-NEXT:  .LBB78_21:
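Note (not part of the patch): the two new knobs follow LLVM's usual `cl::opt` pattern, a hidden `unsigned` command-line option with a default value that replaces the previously hard-coded clearance of 16. Below is a minimal, self-contained sketch of that pattern under stated assumptions: the tool and the `example-clearance` option are hypothetical, chosen only to mirror the declarations added above rather than reproduce them.

```cpp
// Minimal sketch of the cl::opt pattern used by the patch. The
// "example-clearance" option and this standalone tool are illustrative
// only; they are not part of the actual change.
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

// Hidden unsigned option with a default, mirroring
// -partial-reg-update-clearance / -undef-reg-clearance.
static cl::opt<unsigned>
ExampleClearance("example-clearance",
                 cl::desc("Illustrative clearance threshold"),
                 cl::init(64), cl::Hidden);

int main(int argc, char **argv) {
  // Parses -example-clearance=<N>; hidden options appear under -help-hidden.
  cl::ParseCommandLineOptions(argc, argv);

  // A cl::opt<unsigned> converts implicitly to its unsigned value, which is
  // how the backend can simply return it from the clearance hooks.
  unsigned Clearance = ExampleClearance;
  outs() << "clearance = " << Clearance << "\n";
  return 0;
}
```

With the patch applied, the old behaviour should likewise be recoverable by passing `-partial-reg-update-clearance=16 -undef-reg-clearance=16` to llc; since both options are `cl::Hidden`, they are listed only under `-help-hidden`.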