[ARM] Remove PromotedBitwiseVT for NEON types

This removes the promotion of NEON AND, OR and XOR nodes to v2i32/v4i32, treating them the same way the AArch64 and MVE backends do, where we just add the relevant patterns for each legal type. This prevents a lot of bitcasts from being added to the DAG, which have the potential to make optimizations more difficult. It does mean adding extra patterns, and some codegen can change because the types are now legal rather than promoted.

Differential Revision: https://reviews.llvm.org/D105588
2021-07-19 16:36:33 +01:00 · 2021-07-19 16:36:33 +01:00 · 5561ad8b36
parent 74f0f9a455
commit 5561ad8b36
7 changed files with 72 additions and 38 deletions
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1388,7 +1388,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
  PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
 }

-void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
+void AArch64TargetLowering::addTypeForNEON(MVT VT) {
  assert(VT.isVector() && "VT should be a vector type");

  if (VT.isFloatingPoint()) {
@@ -1589,12 +1589,12 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {

 void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &AArch64::FPR64RegClass);
-  addTypeForNEON(VT, MVT::v2i32);
+  addTypeForNEON(VT);
 }

 void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &AArch64::FPR128RegClass);
-  addTypeForNEON(VT, MVT::v4i32);
+  addTypeForNEON(VT);
 }

 EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &,
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -831,7 +831,7 @@ private:

  bool isExtFreeImpl(const Instruction *Ext) const override;

-  void addTypeForNEON(MVT VT, MVT PromotedBitwiseVT);
+  void addTypeForNEON(MVT VT);
  void addTypeForFixedLengthSVE(MVT VT);
  void addDRTypeForNEON(MVT VT);
  void addQRTypeForNEON(MVT VT);
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -154,8 +154,7 @@ static const MCPhysReg GPRArgRegs[] = {
  ARM::R0, ARM::R1, ARM::R2, ARM::R3
 };

-void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT,
-                                       MVT PromotedBitwiseVT) {
+void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT) {
  if (VT != PromotedLdStVT) {
    setOperationAction(ISD::LOAD, VT, Promote);
    AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);
@@ -194,16 +193,6 @@ void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT,
    setOperationAction(ISD::SRL, VT, Custom);
  }

-  // Promote all bit-wise operations.
-  if (VT.isInteger() && VT != PromotedBitwiseVT) {
-    setOperationAction(ISD::AND, VT, Promote);
-    AddPromotedToType (ISD::AND, VT, PromotedBitwiseVT);
-    setOperationAction(ISD::OR,  VT, Promote);
-    AddPromotedToType (ISD::OR,  VT, PromotedBitwiseVT);
-    setOperationAction(ISD::XOR, VT, Promote);
-    AddPromotedToType (ISD::XOR, VT, PromotedBitwiseVT);
-  }
-
  // Neon does not support vector divide/remainder operations.
  setOperationAction(ISD::SDIV, VT, Expand);
  setOperationAction(ISD::UDIV, VT, Expand);
@@ -225,12 +214,12 @@ void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT,

 void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &ARM::DPRRegClass);
-  addTypeForNEON(VT, MVT::f64, MVT::v2i32);
+  addTypeForNEON(VT, MVT::f64);
 }

 void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &ARM::DPairRegClass);
-  addTypeForNEON(VT, MVT::v2f64, MVT::v4i32);
+  addTypeForNEON(VT, MVT::v2f64);
 }

 void ARMTargetLowering::setAllExpand(MVT VT) {
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -756,7 +756,7 @@ class VectorType;

    bool HasStandaloneRem = true;

-    void addTypeForNEON(MVT VT, MVT PromotedLdStVT, MVT PromotedBitwiseVT);
+    void addTypeForNEON(MVT VT, MVT PromotedLdStVT);
    void addDRTypeForNEON(MVT VT);
    void addQRTypeForNEON(MVT VT);
    std::pair<SDValue, SDValue> getARMXALUOOp(SDValue Op, SelectionDAG &DAG, SDValue &ARMcc) const;
--- a/llvm/lib/Target/ARM/ARMInstrNEON.td
+++ b/llvm/lib/Target/ARM/ARMInstrNEON.td
@@ -5341,6 +5341,29 @@ def  VORRd    : N3VDX<0, 0, 0b10, 0b0001, 1, IIC_VBINiD, "vorr",
 def  VORRq    : N3VQX<0, 0, 0b10, 0b0001, 1, IIC_VBINiQ, "vorr",
                      v4i32, v4i32, or, 1>;

+multiclass BitwisePatterns<string Name, SDPatternOperator OpNodeD,
+                           SDPatternOperator OpNodeQ> {
+  def : Pat<(v8i8 (OpNodeD DPR:$LHS, DPR:$RHS)),
+            (!cast<Instruction>(Name#"d") DPR:$LHS, DPR:$RHS)>;
+  def : Pat<(v4i16 (OpNodeD DPR:$LHS, DPR:$RHS)),
+            (!cast<Instruction>(Name#"d") DPR:$LHS, DPR:$RHS)>;
+  def : Pat<(v1i64 (OpNodeD DPR:$LHS, DPR:$RHS)),
+            (!cast<Instruction>(Name#"d") DPR:$LHS, DPR:$RHS)>;
+
+  def : Pat<(v16i8 (OpNodeQ QPR:$LHS, QPR:$RHS)),
+            (!cast<Instruction>(Name#"q") QPR:$LHS, QPR:$RHS)>;
+  def : Pat<(v8i16 (OpNodeQ QPR:$LHS, QPR:$RHS)),
+            (!cast<Instruction>(Name#"q") QPR:$LHS, QPR:$RHS)>;
+  def : Pat<(v2i64 (OpNodeQ QPR:$LHS, QPR:$RHS)),
+            (!cast<Instruction>(Name#"q") QPR:$LHS, QPR:$RHS)>;
+}
+
+let Predicates = [HasNEON] in {
+  defm : BitwisePatterns<"VAND", and, and>;
+  defm : BitwisePatterns<"VORR", or, or>;
+  defm : BitwisePatterns<"VEOR", xor, xor>;
+}
+
 def VORRiv4i16 : N1ModImm<1, 0b000, {1,0,?,1}, 0, 0, 0, 1,
                          (outs DPR:$Vd), (ins nImmSplatI16:$SIMM, DPR:$src),
                          IIC_VMOVImm,
@@ -5392,6 +5415,11 @@ def  VBICq    : N3VX<0, 0, 0b01, 0b0001, 1, 1, (outs QPR:$Vd),
                                                 (vnotq QPR:$Vm))))]>;
 }

+let Predicates = [HasNEON] in {
+  defm : BitwisePatterns<"VBIC", BinOpFrag<(and node:$LHS, (vnotd node:$RHS))>,
+                                 BinOpFrag<(and node:$LHS, (vnotq node:$RHS))>>;
+}
+
 def VBICiv4i16 : N1ModImm<1, 0b000, {1,0,?,1}, 0, 0, 1, 1,
                          (outs DPR:$Vd), (ins nImmSplatI16:$SIMM, DPR:$src),
                          IIC_VMOVImm,
@@ -5440,6 +5468,11 @@ def  VORNq    : N3VX<0, 0, 0b11, 0b0001, 1, 1, (outs QPR:$Vd),
                     [(set QPR:$Vd, (v4i32 (or QPR:$Vn,
                                                (vnotq QPR:$Vm))))]>;

+let Predicates = [HasNEON] in {
+  defm : BitwisePatterns<"VORN", BinOpFrag<(or node:$LHS, (vnotd node:$RHS))>,
+                                 BinOpFrag<(or node:$LHS, (vnotq node:$RHS))>>;
+}
+
 //   VMVN     : Vector Bitwise NOT (Immediate)

 let isReMaterializable = 1 in {
@@ -5483,8 +5516,18 @@ def  VMVNq    : N2VX<0b11, 0b11, 0b00, 0b00, 0b01011, 1, 0,
                     "vmvn", "$Vd, $Vm", "",
                     [(set QPR:$Vd, (v4i32 (vnotq QPR:$Vm)))]>;
 let Predicates = [HasNEON] in {
-def : Pat<(v2i32 (vnotd DPR:$src)), (VMVNd DPR:$src)>;
-def : Pat<(v4i32 (vnotq QPR:$src)), (VMVNq QPR:$src)>;
+def : Pat<(v1i64 (vnotd DPR:$src)),
+          (VMVNd DPR:$src)>;
+def : Pat<(v4i16 (vnotd DPR:$src)),
+          (VMVNd DPR:$src)>;
+def : Pat<(v8i8 (vnotd DPR:$src)),
+          (VMVNd DPR:$src)>;
+def : Pat<(v2i64 (vnotq QPR:$src)),
+          (VMVNq QPR:$src)>;
+def : Pat<(v8i16 (vnotq QPR:$src)),
+          (VMVNq QPR:$src)>;
+def : Pat<(v16i8 (vnotq QPR:$src)),
+          (VMVNq QPR:$src)>;
 }

 // The TwoAddress pass will not go looking for equivalent operations
@@ -5513,10 +5556,15 @@ def : Pat<(v1i64 (int_arm_neon_vbsl (v1i64 DPR:$src1),
                                    (v1i64 DPR:$Vn), (v1i64 DPR:$Vm))),
          (VBSPd DPR:$src1, DPR:$Vn, DPR:$Vm)>;

+def : Pat<(v8i8 (or (and DPR:$Vn, DPR:$Vd),
+                    (and DPR:$Vm, (vnotd DPR:$Vd)))),
+          (VBSPd DPR:$Vd, DPR:$Vn, DPR:$Vm)>;
+def : Pat<(v4i16 (or (and DPR:$Vn, DPR:$Vd),
+                     (and DPR:$Vm, (vnotd DPR:$Vd)))),
+          (VBSPd DPR:$Vd, DPR:$Vn, DPR:$Vm)>;
 def : Pat<(v2i32 (or (and DPR:$Vn, DPR:$Vd),
                     (and DPR:$Vm, (vnotd DPR:$Vd)))),
          (VBSPd DPR:$Vd, DPR:$Vn, DPR:$Vm)>;
-
 def : Pat<(v1i64 (or (and DPR:$Vn, DPR:$Vd),
                     (and DPR:$Vm, (vnotd DPR:$Vd)))),
          (VBSPd DPR:$Vd, DPR:$Vn, DPR:$Vm)>;
@@ -5544,6 +5592,12 @@ def : Pat<(v2i64 (int_arm_neon_vbsl (v2i64 QPR:$src1),
                                    (v2i64 QPR:$Vn), (v2i64 QPR:$Vm))),
          (VBSPq QPR:$src1, QPR:$Vn, QPR:$Vm)>;

+def : Pat<(v16i8 (or (and QPR:$Vn, QPR:$Vd),
+                     (and QPR:$Vm, (vnotq QPR:$Vd)))),
+          (VBSPq QPR:$Vd, QPR:$Vn, QPR:$Vm)>;
+def : Pat<(v8i16 (or (and QPR:$Vn, QPR:$Vd),
+                     (and QPR:$Vm, (vnotq QPR:$Vd)))),
+          (VBSPq QPR:$Vd, QPR:$Vn, QPR:$Vm)>;
 def : Pat<(v4i32 (or (and QPR:$Vn, QPR:$Vd),
                     (and QPR:$Vm, (vnotq QPR:$Vd)))),
          (VBSPq QPR:$Vd, QPR:$Vn, QPR:$Vm)>;
@@ -5633,10 +5687,10 @@ def abd_shr :
                            (zext node:$in2)), (i32 $shift))>;

 let Predicates = [HasNEON] in {
-def : Pat<(xor (v4i32 (bitconvert (v2i64 (abd_shr (v2i32 DPR:$opA), (v2i32 DPR:$opB), 63)))),
-               (v4i32 (bitconvert (v2i64 (add (sub (zext (v2i32 DPR:$opA)),
-                                                   (zext (v2i32 DPR:$opB))),
-                                         (abd_shr (v2i32 DPR:$opA), (v2i32 DPR:$opB), 63)))))),
+def : Pat<(xor (v2i64 (abd_shr (v2i32 DPR:$opA), (v2i32 DPR:$opB), 63)),
+               (v2i64 (add (sub (zext (v2i32 DPR:$opA)),
+                                (zext (v2i32 DPR:$opB))),
+                           (abd_shr (v2i32 DPR:$opA), (v2i32 DPR:$opB), 63)))),
          (VABDLuv2i64 DPR:$opA, DPR:$opB)>;
 }

--- a/llvm/test/CodeGen/ARM/vector-promotion.ll
+++ b/llvm/test/CodeGen/ARM/vector-promotion.ll
@@ -356,18 +356,10 @@ define void @simpleOneInstructionPromotionVariableIdx(<2 x i32>* %addr1, i32* %d
 }

 ; Check a vector with more than 2 elements.
-; This requires the STRESS mode because currently 'or v8i8' is not marked
-; as legal or custom, althought the actual assembly is better if we were
-; promoting it.
 ; IR-BOTH-LABEL: @simpleOneInstructionPromotion8x8
 ; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <8 x i8>, <8 x i8>* %addr1
-; Scalar version:  
-; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <8 x i8> [[LOAD]], i32 1
-; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = or i8 [[EXTRACT]], 1
-; Vector version:  
-; IR-STRESS-NEXT: [[OR:%[a-zA-Z_0-9-]+]] = or <8 x i8> [[LOAD]], <i8 undef, i8 1, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>
-; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <8 x i8> [[OR]], i32 1
-;
+; IR-BOTH-NEXT: [[OR:%[a-zA-Z_0-9-]+]] = or <8 x i8> [[LOAD]], <i8 undef, i8 1, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>
+; IR-BOTH-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <8 x i8> [[OR]], i32 1
 ; IR-BOTH-NEXT: store i8 [[RES]], i8* %dest
 ; IR-BOTH-NEXT: ret
 define void @simpleOneInstructionPromotion8x8(<8 x i8>* %addr1, i8* %dest) {
--- a/llvm/test/CodeGen/ARM/vmov.ll
+++ b/llvm/test/CodeGen/ARM/vmov.ll
@@ -676,10 +676,9 @@ define arm_aapcs_vfpcc void @any_extend(<4 x i1> %x, <4 x i32> %y) nounwind ssp
 ; CHECK-BE-LABEL: any_extend:
 ; CHECK-BE:       @ %bb.0: @ %entry
 ; CHECK-BE-NEXT:    vmov.i16 d16, #0x1
-; CHECK-BE-NEXT:    vrev64.32 d17, d0
+; CHECK-BE-NEXT:    vrev64.16 d17, d0
 ; CHECK-BE-NEXT:    vrev64.32 q9, q1
 ; CHECK-BE-NEXT:    vand d16, d17, d16
-; CHECK-BE-NEXT:    vrev32.16 d16, d16
 ; CHECK-BE-NEXT:    vmovl.u16 q8, d16
 ; CHECK-BE-NEXT:    vsub.i32 q8, q8, q9
 ; CHECK-BE-NEXT:    vmovn.i32 d16, q8