[X86][AVX] combineSignExtendInReg - promote mask arithmetic before v4i64 canonicalization

We rely on the combine

(sext_in_reg (v4i64 anyext/sext (v4i32 x)), v4i1) -> (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))

to avoid complex v4i64 ashr codegen, but doing so prevents v4i64 comparison mask promotion, so ensure we attempt to promote before canonicalizing the (hopefully now redundant) sext_in_reg.

Helps with the poor codegen in PR45808.
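
For context, a reduced reproducer in the spirit of the PR45808 test updated below, reconstructed from the function signature and CHECK lines in the diff (the xor mask constant is inferred from the AVX1 output, so the exact upstream test body may differ):

define <4 x i64> @PR45808(<4 x i64> %0, <4 x i64> %1) {
  %3 = icmp sgt <4 x i64> %0, %1
  ; flip only the low two mask lanes (constant inferred from the AVX1 codegen)
  %4 = xor <4 x i1> %3, <i1 true, i1 true, i1 false, i1 false>
  %5 = select <4 x i1> %4, <4 x i64> %0, <4 x i64> %1
  ret <4 x i64> %5
}

The v4i1 xor on the compare result is roughly what legalizes into the (sext_in_reg ... v4i1) pattern above, which is why promoting the mask arithmetic before the canonicalization matters here.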
Simon Pilgrim 2020-05-07 12:28:31 +01:00
parent 751da4d596
commit b8a725274c
2 changed files with 14 additions and 14 deletions


@@ -44615,7 +44615,7 @@ static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
   //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
   // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
   if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
-      N0.getOpcode() == ISD::SIGN_EXTEND)) {
+                           N0.getOpcode() == ISD::SIGN_EXTEND)) {
     SDValue N00 = N0.getOperand(0);
 
     // EXTLOAD has a better solution on AVX2,
@@ -44624,9 +44624,14 @@ static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
       if (!ISD::isNormalLoad(N00.getNode()))
         return SDValue();
 
+    // Attempt to promote any comparison mask ops before moving the
+    // SIGN_EXTEND_INREG in the way.
+    if (SDValue Promote = PromoteMaskArithmetic(N0.getNode(), DAG, Subtarget))
+      return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
+
     if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
-      SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
-                                N00, N1);
+      SDValue Tmp =
+          DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
       return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
     }
   }


@@ -72,12 +72,10 @@ define <4 x i64> @PR45808(<4 x i64> %0, <4 x i64> %1) {
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm3
-; AVX1-NEXT:    vpackssdw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpxor {{.*}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT:    vpslld $31, %xmm2, %xmm2
-; AVX1-NEXT:    vpmovsxdq %xmm2, %xmm3
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; AVX1-NEXT:    vpmovsxdq %xmm2, %xmm2
+; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpsllq $63, %xmm3, %xmm3
+; AVX1-NEXT:    vpsllq $63, %xmm2, %xmm2
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
 ; AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
@@ -85,11 +83,8 @@ define <4 x i64> @PR45808(<4 x i64> %0, <4 x i64> %1) {
 ; AVX2-LABEL: PR45808:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX2-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
-; AVX2-NEXT:    vpxor {{.*}}(%rip), %xmm2, %xmm2
-; AVX2-NEXT:    vpslld $31, %xmm2, %xmm2
-; AVX2-NEXT:    vpmovsxdq %xmm2, %ymm2
+; AVX2-NEXT:    vpxor {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT:    vpsllq $63, %ymm2, %ymm2
 ; AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
   %3 = icmp sgt <4 x i64> %0, %1