[ARM] Use rq gather/scatters for smaller v4 vectors

A pointer will always fit into an i32, so an rq offset gather/scatter can
be used with v4i8 and v4i16 gathers, using a base of 0 and the Ptr as
the offsets. The rq gather can then correctly extend the type, allowing
us to use the gathers without falling back to scalarizing.
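
As a rough illustration (a hand-written sketch with illustrative function
names, not taken from this patch's tests), a sign-extending gather of four
arbitrary i16 pointers such as

  declare <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*>, i32, <4 x i1>, <4 x i16>)

  define <4 x i32> @gather_sext(<4 x i16*> %ptrs) {
    %g = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
    %e = sext <4 x i16> %g to <4 x i32>
    ret <4 x i32> %e
  }

can now be lowered to an rq gather that reads the pointers as 32-bit offsets
from a zero base, roughly:

  declare <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i8.v4i32(i8*, <4 x i32>, i32, i32, i32)

  define <4 x i32> @gather_rq(<4 x i16*> %ptrs) {
    ; The pointers become the offsets; the operands are base, offsets, memory
    ; element size in bits (16), scale shift (0, i.e. byte offsets, a scale
    ; of 1) and the unsigned flag (0 here, so the load sign-extends to i32).
    %offs = ptrtoint <4 x i16*> %ptrs to <4 x i32>
    %g = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i8.v4i32(i8* null, <4 x i32> %offs, i32 16, i32 0, i32 0)
    ret <4 x i32> %g
  }

This ends up as a single vldrh.s32 with a zero base register, as in the
updated tests below.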

This patch rejigs tryCreateMaskedGatherOffset in the
MVEGatherScatterLowering pass to decompose the Ptr into Base:0 +
Offset:Ptr (with a scale of 1), if the Ptr could not be decomposed from
a GEP. v4i32 gathers already use qi gathers; this extends that to v4i8
and v4i16 gathers using the extending rq variants.
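
The scatter side is handled the same way (again a hand-written sketch rather
than code from the patch): a truncating v4i16 scatter to arbitrary pointers
becomes a zero-base rq scatter, roughly:

  declare void @llvm.arm.mve.vstr.scatter.offset.p0i8.v4i32.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32)

  define void @scatter_trunc(<4 x i32> %v, <4 x i16*> %ptrs) {
    ; Operands are base, offsets, data, memory element size in bits (16) and
    ; scale shift (0); each i32 lane is truncated to i16 as it is stored.
    %offs = ptrtoint <4 x i16*> %ptrs to <4 x i32>
    call void @llvm.arm.mve.vstr.scatter.offset.p0i8.v4i32.v4i32(i8* null, <4 x i32> %offs, <4 x i32> %v, i32 16, i32 0)
    ret void
  }

which selects to a vstrh.32 with a zero base register, matching the updated
scatter tests below.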

Differential Revision: https://reviews.llvm.org/D103674
Author: David Green
Date:   2021-06-15 17:06:15 +01:00
commit 680d3f8f17 (parent 2c21278e74)
4 changed files with 65 additions and 100 deletions


@@ -81,6 +81,12 @@ private:
                     Align Alignment);
   // Check whether Ptr is hidden behind a bitcast and look through it
   void lookThroughBitcast(Value *&Ptr);
+  // Decompose a ptr into Base and Offsets, potentially using a GEP to return a
+  // scalar base and vector offsets, or else fallback to using a base of 0 and
+  // offset of Ptr where possible.
+  Value *decomposePtr(Value *Ptr, Value *&Offsets, int &Scale,
+                      FixedVectorType *Ty, Type *MemoryTy,
+                      IRBuilder<> &Builder);
   // Check for a getelementptr and deduce base and offsets from it, on success
   // returning the base directly and the offsets indirectly using the Offsets
   // argument
@@ -213,6 +219,33 @@ static bool checkOffsetSize(Value *Offsets, unsigned TargetElemCount) {
   return true;
 }
 
+Value *MVEGatherScatterLowering::decomposePtr(Value *Ptr, Value *&Offsets,
+                                              int &Scale, FixedVectorType *Ty,
+                                              Type *MemoryTy,
+                                              IRBuilder<> &Builder) {
+  if (auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
+    if (Value *V = decomposeGEP(Offsets, Ty, GEP, Builder)) {
+      Scale =
+          computeScale(GEP->getSourceElementType()->getPrimitiveSizeInBits(),
+                       MemoryTy->getScalarSizeInBits());
+      return Scale == -1 ? nullptr : V;
+    }
+  }
+
+  // If we couldn't use the GEP (or it doesn't exist), attempt to use a
+  // BasePtr of 0 with Ptr as the Offsets, so long as there are only 4
+  // elements.
+  FixedVectorType *PtrTy = cast<FixedVectorType>(Ptr->getType());
+  if (PtrTy->getNumElements() != 4 || MemoryTy->getScalarSizeInBits() == 32)
+    return nullptr;
+  Value *Zero = ConstantInt::get(Builder.getInt32Ty(), 0);
+  Value *BasePtr = Builder.CreateIntToPtr(Zero, Builder.getInt8PtrTy());
+  Offsets = Builder.CreatePtrToInt(
+      Ptr, FixedVectorType::get(Builder.getInt32Ty(), 4));
+  Scale = 0;
+  return BasePtr;
+}
+
 Value *MVEGatherScatterLowering::decomposeGEP(Value *&Offsets,
                                               FixedVectorType *Ty,
                                               GetElementPtrInst *GEP,
@@ -446,14 +479,14 @@ Value *MVEGatherScatterLowering::tryCreateMaskedGatherOffset(
     IntrinsicInst *I, Value *Ptr, Instruction *&Root, IRBuilder<> &Builder) {
   using namespace PatternMatch;
 
-  Type *OriginalTy = I->getType();
-  Type *ResultTy = OriginalTy;
+  Type *MemoryTy = I->getType();
+  Type *ResultTy = MemoryTy;
 
   unsigned Unsigned = 1;
   // The size of the gather was already checked in isLegalTypeAndAlignment;
   // if it was not a full vector width an appropriate extend should follow.
   auto *Extend = Root;
-  if (OriginalTy->getPrimitiveSizeInBits() < 128) {
+  if (MemoryTy->getPrimitiveSizeInBits() < 128) {
     // Only transform gathers with exactly one use
     if (!I->hasOneUse())
       return nullptr;
@@ -478,32 +511,26 @@ Value *MVEGatherScatterLowering::tryCreateMaskedGatherOffset(
     }
   }
 
-  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
   Value *Offsets;
-  Value *BasePtr =
-      decomposeGEP(Offsets, cast<FixedVectorType>(ResultTy), GEP, Builder);
+  int Scale;
+  Value *BasePtr = decomposePtr(
+      Ptr, Offsets, Scale, cast<FixedVectorType>(ResultTy), MemoryTy, Builder);
   if (!BasePtr)
     return nullptr;
-  int Scale =
-      computeScale(GEP->getSourceElementType()->getPrimitiveSizeInBits(),
-                   OriginalTy->getScalarSizeInBits());
-  if (Scale == -1)
-    return nullptr;
 
   Root = Extend;
   Value *Mask = I->getArgOperand(2);
   if (!match(Mask, m_One()))
     return Builder.CreateIntrinsic(
         Intrinsic::arm_mve_vldr_gather_offset_predicated,
         {ResultTy, BasePtr->getType(), Offsets->getType(), Mask->getType()},
-        {BasePtr, Offsets, Builder.getInt32(OriginalTy->getScalarSizeInBits()),
+        {BasePtr, Offsets, Builder.getInt32(MemoryTy->getScalarSizeInBits()),
         Builder.getInt32(Scale), Builder.getInt32(Unsigned), Mask});
  else
    return Builder.CreateIntrinsic(
        Intrinsic::arm_mve_vldr_gather_offset,
        {ResultTy, BasePtr->getType(), Offsets->getType()},
-        {BasePtr, Offsets, Builder.getInt32(OriginalTy->getScalarSizeInBits()),
+        {BasePtr, Offsets, Builder.getInt32(MemoryTy->getScalarSizeInBits()),
         Builder.getInt32(Scale), Builder.getInt32(Unsigned)});
 }
@@ -617,19 +644,13 @@ Value *MVEGatherScatterLowering::tryCreateMaskedScatterOffset(
       return nullptr;
   }
 
-  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
   Value *Offsets;
-  Value *BasePtr =
-      decomposeGEP(Offsets, cast<FixedVectorType>(InputTy), GEP, Builder);
+  int Scale;
+  Value *BasePtr = decomposePtr(
+      Ptr, Offsets, Scale, cast<FixedVectorType>(InputTy), MemoryTy, Builder);
   if (!BasePtr)
     return nullptr;
-  int Scale =
-      computeScale(GEP->getSourceElementType()->getPrimitiveSizeInBits(),
-                   MemoryTy->getScalarSizeInBits());
-  if (Scale == -1)
-    return nullptr;
 
   if (!match(Mask, m_One()))
     return Builder.CreateIntrinsic(
         Intrinsic::arm_mve_vstr_scatter_offset_predicated,


@@ -234,16 +234,9 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @ptr_v4i16_sext(<4 x i16*>* %offptr) {
 ; CHECK-LABEL: ptr_v4i16_sext:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r0]
-; CHECK-NEXT: vmov r0, r1, d1
-; CHECK-NEXT: vmov r2, r3, d0
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov q0[2], q0[0], r2, r0
-; CHECK-NEXT: vmov q0[3], q0[1], r3, r1
-; CHECK-NEXT: vmovlb.s16 q0, q0
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: movs r1, #0
+; CHECK-NEXT: vldrh.s32 q0, [r1, q1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i16*>, <4 x i16*>* %offptr, align 4
@@ -255,16 +248,9 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @ptr_v4i16_zext(<4 x i16*>* %offptr) {
 ; CHECK-LABEL: ptr_v4i16_zext:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r0]
-; CHECK-NEXT: vmov r0, r1, d1
-; CHECK-NEXT: vmov r2, r3, d0
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov q0[2], q0[0], r2, r0
-; CHECK-NEXT: vmov q0[3], q0[1], r3, r1
-; CHECK-NEXT: vmovlb.u16 q0, q0
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: movs r1, #0
+; CHECK-NEXT: vldrh.u32 q0, [r1, q1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i16*>, <4 x i16*>* %offptr, align 4
@@ -574,17 +560,9 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @ptr_v4i8_sext32(<4 x i8*>* %offptr) {
 ; CHECK-LABEL: ptr_v4i8_sext32:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r0]
-; CHECK-NEXT: vmov r0, r1, d1
-; CHECK-NEXT: vmov r2, r3, d0
-; CHECK-NEXT: ldrb r0, [r0]
-; CHECK-NEXT: ldrb r2, [r2]
-; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov q0[2], q0[0], r2, r0
-; CHECK-NEXT: vmov q0[3], q0[1], r3, r1
-; CHECK-NEXT: vmovlb.s8 q0, q0
-; CHECK-NEXT: vmovlb.s16 q0, q0
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: movs r1, #0
+; CHECK-NEXT: vldrb.s32 q0, [r1, q1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i8*>, <4 x i8*>* %offptr, align 4
@@ -596,17 +574,9 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @ptr_v4i8_zext32(<4 x i8*>* %offptr) {
 ; CHECK-LABEL: ptr_v4i8_zext32:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r0]
-; CHECK-NEXT: vmov r0, r1, d1
-; CHECK-NEXT: vmov r2, r3, d0
-; CHECK-NEXT: vmov.i32 q0, #0xff
-; CHECK-NEXT: ldrb r0, [r0]
-; CHECK-NEXT: ldrb r2, [r2]
-; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov q1[2], q1[0], r2, r0
-; CHECK-NEXT: vmov q1[3], q1[1], r3, r1
-; CHECK-NEXT: vand q0, q1, q0
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: movs r1, #0
+; CHECK-NEXT: vldrb.u32 q0, [r1, q1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i8*>, <4 x i8*>* %offptr, align 4


@@ -236,22 +236,14 @@ entry:
 define arm_aapcs_vfpcc void @ext_scaled_i16_i32_2gep(i16* %base, <4 x i32>* %offptr, <4 x i32> %input) {
 ; CHECK-LABEL: ext_scaled_i16_i32_2gep:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r7, lr}
-; CHECK-NEXT: push {r4, r5, r7, lr}
 ; CHECK-NEXT: vldrw.u32 q2, [r1]
 ; CHECK-NEXT: vmov.i32 q1, #0xa
-; CHECK-NEXT: vmov r1, r3, d0
+; CHECK-NEXT: movs r2, #0
 ; CHECK-NEXT: vshl.i32 q2, q2, #1
-; CHECK-NEXT: vmov r4, r5, d1
 ; CHECK-NEXT: vadd.i32 q2, q2, r0
 ; CHECK-NEXT: vadd.i32 q1, q2, q1
-; CHECK-NEXT: vmov r0, r12, d2
-; CHECK-NEXT: vmov r2, lr, d3
-; CHECK-NEXT: strh r1, [r0]
-; CHECK-NEXT: strh.w r3, [r12]
-; CHECK-NEXT: strh r4, [r2]
-; CHECK-NEXT: strh.w r5, [lr]
-; CHECK-NEXT: pop {r4, r5, r7, pc}
+; CHECK-NEXT: vstrh.32 q0, [r2, q1]
+; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
   %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs


@@ -228,22 +228,13 @@ entry:
   ret void
 }
 
-; Expand
 define arm_aapcs_vfpcc void @ptr_v4i16_trunc(<4 x i32> %v, <4 x i16*>* %offptr) {
 ; CHECK-LABEL: ptr_v4i16_trunc:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r7, lr}
-; CHECK-NEXT: push {r4, r5, r7, lr}
 ; CHECK-NEXT: vldrw.u32 q1, [r0]
-; CHECK-NEXT: vmov r1, r3, d0
-; CHECK-NEXT: vmov r4, r5, d1
-; CHECK-NEXT: vmov r0, r12, d2
-; CHECK-NEXT: vmov r2, lr, d3
-; CHECK-NEXT: strh r1, [r0]
-; CHECK-NEXT: strh.w r3, [r12]
-; CHECK-NEXT: strh r4, [r2]
-; CHECK-NEXT: strh.w r5, [lr]
-; CHECK-NEXT: pop {r4, r5, r7, pc}
+; CHECK-NEXT: movs r0, #0
+; CHECK-NEXT: vstrh.32 q0, [r0, q1]
+; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i16*>, <4 x i16*>* %offptr, align 4
   %ext = trunc <4 x i32> %v to <4 x i16>
@@ -463,22 +454,13 @@ entry:
   ret void
 }
 
-; Expand
 define arm_aapcs_vfpcc void @ptr_v4i8_trunc32(<4 x i32> %v, <4 x i8*>* %offptr) {
 ; CHECK-LABEL: ptr_v4i8_trunc32:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r7, lr}
-; CHECK-NEXT: push {r4, r5, r7, lr}
 ; CHECK-NEXT: vldrw.u32 q1, [r0]
-; CHECK-NEXT: vmov r1, r3, d0
-; CHECK-NEXT: vmov r4, r5, d1
-; CHECK-NEXT: vmov r0, r12, d2
-; CHECK-NEXT: vmov r2, lr, d3
-; CHECK-NEXT: strb r1, [r0]
-; CHECK-NEXT: strb.w r3, [r12]
-; CHECK-NEXT: strb r4, [r2]
-; CHECK-NEXT: strb.w r5, [lr]
-; CHECK-NEXT: pop {r4, r5, r7, pc}
+; CHECK-NEXT: movs r0, #0
+; CHECK-NEXT: vstrb.32 q0, [r0, q1]
+; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i8*>, <4 x i8*>* %offptr, align 4
   %ext = trunc <4 x i32> %v to <4 x i8>