[ARM] Convert a bitcast VDUP to a VDUP

The idea, under MVE, is to introduce more bitcasts around VDUPs in an
attempt to get the type correct across basic block boundaries. In order
to do that without other regressions we need a few fixups, of which this
is the first. If the code is a bitcast of a VDUP, we can convert that
straight into a VDUP of the new type, so long as they have the same
size.

Differential Revision: https://reviews.llvm.org/D78706
This commit is contained in:
David Green 2020-05-06 14:06:02 +01:00
parent 06591b6d19
commit ed7db68c35
3 changed files with 16 additions and 13 deletions

View File

@@ -15244,8 +15244,17 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
return Res;
}
static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG) {
static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG,
const ARMSubtarget *ST) {
SDValue Src = N->getOperand(0);
EVT DstVT = N->getValueType(0);
// Convert v4f32 bitcast (v4i32 vdup (i32)) -> v4f32 vdup (i32) under MVE.
if (ST->hasMVEIntegerOps() && Src.getOpcode() == ARMISD::VDUP) {
EVT SrcVT = Src.getValueType();
if (SrcVT.getScalarSizeInBits() == DstVT.getScalarSizeInBits())
return DAG.getNode(ARMISD::VDUP, SDLoc(N), DstVT, Src.getOperand(0));
}
// We may have a bitcast of something that has already had this bitcast
// combine performed on it, so skip past any VECTOR_REG_CASTs.
@@ -15255,7 +15264,6 @@ static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG) {
// Bitcast from element-wise VMOV or VMVN doesn't need VREV if the VREV that
// would be generated is at least the width of the element type.
EVT SrcVT = Src.getValueType();
EVT DstVT = N->getValueType(0);
if ((Src.getOpcode() == ARMISD::VMOVIMM ||
Src.getOpcode() == ARMISD::VMVNIMM ||
Src.getOpcode() == ARMISD::VMOVFPIMM) &&
@@ -15321,7 +15329,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
case ARMISD::BUILD_VECTOR:
return PerformARMBUILD_VECTORCombine(N, DCI);
case ISD::BITCAST:
return PerformBITCASTCombine(N, DCI.DAG);
return PerformBITCASTCombine(N, DCI.DAG, Subtarget);
case ARMISD::PREDICATE_CAST:
return PerformPREDICATE_CASTCombine(N, DCI);
case ARMISD::VECTOR_REG_CAST:

View File

@@ -131,8 +131,7 @@ define arm_aapcs_vfpcc <4 x float> @vaddqr_v4f32_3(<4 x float> %src, float %src2
; CHECK-LABEL: vaddqr_v4f32_3:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov r0, s4
; CHECK-NEXT: vdup.32 q1, r0
; CHECK-NEXT: vadd.f32 q0, q0, q1
; CHECK-NEXT: vadd.f32 q0, q0, r0
; CHECK-NEXT: bx lr
entry:
%src2bc = bitcast float %src2 to i32
@@ -147,8 +146,7 @@ define arm_aapcs_vfpcc <8 x half> @vaddqr_v8f16_3(<8 x half> %src, half *%src2p,
; CHECK-LABEL: vaddqr_v8f16_3:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldrh r0, [r0]
; CHECK-NEXT: vdup.16 q1, r0
; CHECK-NEXT: vadd.f16 q0, q0, q1
; CHECK-NEXT: vadd.f16 q0, q0, r0
; CHECK-NEXT: bx lr
entry:
%src2 = load half, half *%src2p, align 2
@@ -164,8 +162,7 @@ define arm_aapcs_vfpcc <4 x float> @vaddqr_v4f32_4(<4 x float> %src, float %src2
; CHECK-LABEL: vaddqr_v4f32_4:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov r0, s4
; CHECK-NEXT: vdup.32 q1, r0
; CHECK-NEXT: vadd.f32 q0, q1, q0
; CHECK-NEXT: vadd.f32 q0, q0, r0
; CHECK-NEXT: bx lr
entry:
%src2bc = bitcast float %src2 to i32
@@ -180,8 +177,7 @@ define arm_aapcs_vfpcc <8 x half> @vaddqr_v8f16_4(<8 x half> %src, half *%src2p,
; CHECK-LABEL: vaddqr_v8f16_4:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldrh r0, [r0]
; CHECK-NEXT: vdup.16 q1, r0
; CHECK-NEXT: vadd.f16 q0, q1, q0
; CHECK-NEXT: vadd.f16 q0, q0, r0
; CHECK-NEXT: bx lr
entry:
%src2 = load half, half *%src2p, align 2

View File

@@ -5289,8 +5289,7 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oeq_v8f16_bc(<8 x half> %src, half* %src
; CHECK-MVEFP-LABEL: vcmp_oeq_v8f16_bc:
; CHECK-MVEFP: @ %bb.0: @ %entry
; CHECK-MVEFP-NEXT: ldrh r0, [r0]
; CHECK-MVEFP-NEXT: vdup.16 q3, r0
; CHECK-MVEFP-NEXT: vcmp.f16 eq, q0, q3
; CHECK-MVEFP-NEXT: vcmp.f16 eq, q0, r0
; CHECK-MVEFP-NEXT: vpsel q0, q1, q2
; CHECK-MVEFP-NEXT: bx lr
entry: