[ARM][MVE] Restrict allowed types of gather/scatter offsets

The MVE gather instructions with offsets smaller than 32 bits zero-extend
the values in the offset register, as opposed to sign-extending them. We
need to make sure that the code we select from is suitably extended;
this patch fixes that by tightening up the offset checks.

Differential Revision: https://reviews.llvm.org/D75361
This commit is contained in:
Anna Welker 2020-03-02 09:14:37 +00:00
parent 4962a0b26a
commit 394974111b
2 changed files with 152 additions and 12 deletions

View File

@ -145,26 +145,35 @@ Value *MVEGatherScatterLowering::checkGEP(Value *&Offsets, Type *Ty, Value *Ptr,
return nullptr;
}
Offsets = GEP->getOperand(1);
// SExt offsets inside masked gathers are not permitted by the architecture;
// we therefore can't fold them
// Paranoid check whether the number of parallel lanes is the same
assert(Ty->getVectorNumElements() ==
Offsets->getType()->getVectorNumElements());
// Only <N x i32> offsets can be integrated into an arm gather, any smaller
// type would have to be sign extended by the gep - and arm gathers can only
// zero extend. Additionally, the offsets do have to originate from a zext of
// a vector with element types smaller or equal the type of the gather we're
// looking at
if (Offsets->getType()->getScalarSizeInBits() != 32)
return nullptr;
if (ZExtInst *ZextOffs = dyn_cast<ZExtInst>(Offsets))
Offsets = ZextOffs->getOperand(0);
Type *OffsType = VectorType::getInteger(cast<VectorType>(Ty));
// If the offset we found does not have the type the intrinsic expects,
// i.e., the same type as the gather (or scatter input) itself, we need to
// convert it (only i types) or fall back to expanding the gather
if (OffsType != Offsets->getType()) {
if (OffsType->getScalarSizeInBits() >
Offsets->getType()->getScalarSizeInBits()) {
LLVM_DEBUG(dbgs() << "masked gathers/scatters: extending offsets\n");
Offsets = Builder.CreateZExt(Offsets, OffsType, "");
} else {
else if (!(Offsets->getType()->getVectorNumElements() == 4 &&
Offsets->getType()->getScalarSizeInBits() == 32))
return nullptr;
if (Ty != Offsets->getType()) {
if ((Ty->getScalarSizeInBits() <
Offsets->getType()->getScalarSizeInBits())) {
LLVM_DEBUG(dbgs() << "masked gathers/scatters: no correct offset type."
<< " Can't create intrinsic.\n");
return nullptr;
} else {
Offsets = Builder.CreateZExt(
Offsets, VectorType::getInteger(cast<VectorType>(Ty)));
}
}
// If none of the checks failed, return the gep's base pointer
LLVM_DEBUG(dbgs() << "masked gathers/scatters: found correct offsets\n");
return GEPPtr;
}

View File

@ -16,6 +16,137 @@ entry:
ret <8 x i16> %gather.zext
}
; Gather of i8 values via a GEP whose <8 x i8> offsets are NOT explicitly
; extended. GEP semantics would sign-extend these narrow offsets, but MVE
; gathers can only zero-extend the offset register, so the pass must not
; fold this into a hardware gather. The CHECK lines below show the gather
; being expanded into scalar ldrb loads instead (offsets loaded with
; vldrb.s32, i.e. sign-extended, matching the GEP semantics).
define arm_aapcs_vfpcc <8 x i16> @zext_unscaled_i8_i16_noext(i8* %base, <8 x i8>* %offptr) {
; CHECK-LABEL: zext_unscaled_i8_i16_noext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r7, lr}
; CHECK-NEXT: push {r4, r5, r7, lr}
; CHECK-NEXT: vldrb.s32 q0, [r1]
; CHECK-NEXT: vldrb.s32 q1, [r1, #4]
; CHECK-NEXT: vadd.i32 q0, q0, r0
; CHECK-NEXT: vadd.i32 q1, q1, r0
; CHECK-NEXT: vmov r2, s3
; CHECK-NEXT: vmov r3, s1
; CHECK-NEXT: vmov r5, s0
; CHECK-NEXT: vmov r0, s4
; CHECK-NEXT: vmov r1, s5
; CHECK-NEXT: vmov r4, s7
; CHECK-NEXT: ldrb.w r12, [r2]
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: ldrb.w lr, [r3]
; CHECK-NEXT: vmov r3, s6
; CHECK-NEXT: ldrb r5, [r5]
; CHECK-NEXT: ldrb r0, [r0]
; CHECK-NEXT: vmov.16 q0[0], r5
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: vmov.16 q0[1], lr
; CHECK-NEXT: ldrb r4, [r4]
; CHECK-NEXT: ldrb r2, [r2]
; CHECK-NEXT: ldrb r3, [r3]
; CHECK-NEXT: vmov.16 q0[2], r2
; CHECK-NEXT: vmov.16 q0[3], r12
; CHECK-NEXT: vmov.16 q0[4], r0
; CHECK-NEXT: vmov.16 q0[5], r1
; CHECK-NEXT: vmov.16 q0[6], r3
; CHECK-NEXT: vmov.16 q0[7], r4
; CHECK-NEXT: vmovlb.u8 q0, q0
; CHECK-NEXT: pop {r4, r5, r7, pc}
entry:
  %offs = load <8 x i8>, <8 x i8>* %offptr, align 2
  %ptrs = getelementptr inbounds i8, i8* %base, <8 x i8> %offs
  %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %ptrs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
  %gather.zext = zext <8 x i8> %gather to <8 x i16>
  ret <8 x i16> %gather.zext
}
; Scaled i16 gather whose offsets come from a sext of <8 x i8>. MVE gather
; instructions zero-extend their narrow offsets, so a sign-extended offset
; vector cannot be folded into a hardware gather; the CHECK lines show the
; gather expanded into scalar ldrh loads (offsets built with vldrb.s32 and
; a vshl.i32 #1 to apply the i16 scale, matching the sext GEP semantics).
define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_sext(i16* %base, <8 x i8>* %offptr) {
; CHECK-LABEL: scaled_v8i16_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r7, lr}
; CHECK-NEXT: push {r4, r5, r7, lr}
; CHECK-NEXT: vldrb.s32 q0, [r1]
; CHECK-NEXT: vldrb.s32 q1, [r1, #4]
; CHECK-NEXT: vshl.i32 q0, q0, #1
; CHECK-NEXT: vshl.i32 q1, q1, #1
; CHECK-NEXT: vadd.i32 q0, q0, r0
; CHECK-NEXT: vadd.i32 q1, q1, r0
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: vmov r3, s3
; CHECK-NEXT: vmov r5, s1
; CHECK-NEXT: vmov r0, s4
; CHECK-NEXT: vmov r1, s5
; CHECK-NEXT: vmov r4, s7
; CHECK-NEXT: ldrh.w r12, [r2]
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: ldrh.w lr, [r3]
; CHECK-NEXT: vmov r3, s6
; CHECK-NEXT: ldrh r5, [r5]
; CHECK-NEXT: ldrh r0, [r0]
; CHECK-NEXT: ldrh r1, [r1]
; CHECK-NEXT: ldrh r4, [r4]
; CHECK-NEXT: ldrh r2, [r2]
; CHECK-NEXT: ldrh r3, [r3]
; CHECK-NEXT: vmov.16 q0[0], r2
; CHECK-NEXT: vmov.16 q0[1], r5
; CHECK-NEXT: vmov.16 q0[2], r12
; CHECK-NEXT: vmov.16 q0[3], lr
; CHECK-NEXT: vmov.16 q0[4], r0
; CHECK-NEXT: vmov.16 q0[5], r1
; CHECK-NEXT: vmov.16 q0[6], r3
; CHECK-NEXT: vmov.16 q0[7], r4
; CHECK-NEXT: pop {r4, r5, r7, pc}
entry:
  %offs = load <8 x i8>, <8 x i8>* %offptr, align 2
  %offs.sext = sext <8 x i8> %offs to <8 x i16>
  %ptrs = getelementptr inbounds i16, i16* %base, <8 x i16> %offs.sext
  %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
  ret <8 x i16> %gather
}
; Same as scaled_v8i16_sext but with a zext of the <8 x i8> offsets. The
; zero-extension matches the MVE gather's offset semantics, yet per the
; CHECK lines this case is still expanded into scalar ldrh loads (offsets
; loaded with vldrb.u32, i.e. zero-extended, then scaled by vshl.i32 #1).
; NOTE(review): presumably the <8 x i16> offset type does not match what
; the v8i16 gather intrinsic requires here — confirm against the pass.
define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_zext(i16* %base, <8 x i8>* %offptr) {
; CHECK-LABEL: scaled_v8i16_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r7, lr}
; CHECK-NEXT: push {r4, r5, r7, lr}
; CHECK-NEXT: vldrb.u32 q0, [r1]
; CHECK-NEXT: vldrb.u32 q1, [r1, #4]
; CHECK-NEXT: vshl.i32 q0, q0, #1
; CHECK-NEXT: vshl.i32 q1, q1, #1
; CHECK-NEXT: vadd.i32 q0, q0, r0
; CHECK-NEXT: vadd.i32 q1, q1, r0
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: vmov r3, s3
; CHECK-NEXT: vmov r5, s1
; CHECK-NEXT: vmov r0, s4
; CHECK-NEXT: vmov r1, s5
; CHECK-NEXT: vmov r4, s7
; CHECK-NEXT: ldrh.w r12, [r2]
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: ldrh.w lr, [r3]
; CHECK-NEXT: vmov r3, s6
; CHECK-NEXT: ldrh r5, [r5]
; CHECK-NEXT: ldrh r0, [r0]
; CHECK-NEXT: ldrh r1, [r1]
; CHECK-NEXT: ldrh r4, [r4]
; CHECK-NEXT: ldrh r2, [r2]
; CHECK-NEXT: ldrh r3, [r3]
; CHECK-NEXT: vmov.16 q0[0], r2
; CHECK-NEXT: vmov.16 q0[1], r5
; CHECK-NEXT: vmov.16 q0[2], r12
; CHECK-NEXT: vmov.16 q0[3], lr
; CHECK-NEXT: vmov.16 q0[4], r0
; CHECK-NEXT: vmov.16 q0[5], r1
; CHECK-NEXT: vmov.16 q0[6], r3
; CHECK-NEXT: vmov.16 q0[7], r4
; CHECK-NEXT: pop {r4, r5, r7, pc}
entry:
  %offs = load <8 x i8>, <8 x i8>* %offptr, align 2
  %offs.zext = zext <8 x i8> %offs to <8 x i16>
  %ptrs = getelementptr inbounds i16, i16* %base, <8 x i16> %offs.zext
  %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
  ret <8 x i16> %gather
}
define arm_aapcs_vfpcc <8 x i16> @sext_unscaled_i8_i16(i8* %base, <8 x i16>* %offptr) {
; CHECK-LABEL: sext_unscaled_i8_i16:
; CHECK: @ %bb.0: @ %entry