forked from OSchip/llvm-project
[ARM] Extend narrow values to allow using truncating scatters
As a minor adjustment to the existing lowering of offset scatters, this extends any smaller-than-legal integer vector into a full 128-bit vector using a zext, so that the truncating scatters can be used. Because of the way MVE legalizes these vectors, the extension should be cheap in most situations, and it will prevent the vector from being scalarized. Differential Revision: https://reviews.llvm.org/D103704
This commit is contained in:
parent
a6948da86a
commit
b9bd2936f9
|
@ -638,6 +638,18 @@ Value *MVEGatherScatterLowering::tryCreateMaskedScatterOffset(
|
||||||
InputTy = PreTruncTy;
|
InputTy = PreTruncTy;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (InputTy->getPrimitiveSizeInBits() < 128 &&
|
||||||
|
InputTy->isIntOrIntVectorTy()) {
|
||||||
|
// If we can't find a trunc to incorporate into the instruction, create an
|
||||||
|
// implicit one with a zext, so that we can still create a scatter. We know
|
||||||
|
// that the input type is 4x/8x/16x and of type i8/i16/i32, so any type
|
||||||
|
// smaller than 128 bits will divide evenly into a 128bit vector.
|
||||||
|
InputTy = InputTy->getWithNewBitWidth(
|
||||||
|
128 / cast<FixedVectorType>(InputTy)->getNumElements());
|
||||||
|
Input = Builder.CreateZExt(Input, InputTy);
|
||||||
|
LLVM_DEBUG(dbgs() << "masked scatters: Small input type, extending with:\n"
|
||||||
|
<< *Input << "\n");
|
||||||
|
}
|
||||||
if (InputTy->getPrimitiveSizeInBits() != 128) {
|
if (InputTy->getPrimitiveSizeInBits() != 128) {
|
||||||
LLVM_DEBUG(dbgs() << "masked scatters: cannot create scatters for "
|
LLVM_DEBUG(dbgs() << "masked scatters: cannot create scatters for "
|
||||||
"non-standard input types. Expanding.\n");
|
"non-standard input types. Expanding.\n");
|
||||||
|
|
|
@ -361,19 +361,11 @@ entry:
|
||||||
define arm_aapcs_vfpcc void @trunc_signed_unscaled_i16_i8(i8* %base, <4 x i8>* %offptr, <4 x i16> %input) {
|
define arm_aapcs_vfpcc void @trunc_signed_unscaled_i16_i8(i8* %base, <4 x i8>* %offptr, <4 x i16> %input) {
|
||||||
; CHECK-LABEL: trunc_signed_unscaled_i16_i8:
|
; CHECK-LABEL: trunc_signed_unscaled_i16_i8:
|
||||||
; CHECK: @ %bb.0: @ %entry
|
; CHECK: @ %bb.0: @ %entry
|
||||||
; CHECK-NEXT: .save {r4, r5, r7, lr}
|
; CHECK-NEXT: vmov.i32 q1, #0xff
|
||||||
; CHECK-NEXT: push {r4, r5, r7, lr}
|
; CHECK-NEXT: vldrb.s32 q2, [r1]
|
||||||
; CHECK-NEXT: vldrb.s32 q1, [r1]
|
; CHECK-NEXT: vand q0, q0, q1
|
||||||
; CHECK-NEXT: vmov r1, r3, d0
|
; CHECK-NEXT: vstrb.32 q0, [r0, q2]
|
||||||
; CHECK-NEXT: vmov r4, r5, d1
|
; CHECK-NEXT: bx lr
|
||||||
; CHECK-NEXT: vadd.i32 q1, q1, r0
|
|
||||||
; CHECK-NEXT: vmov r0, r12, d2
|
|
||||||
; CHECK-NEXT: vmov r2, lr, d3
|
|
||||||
; CHECK-NEXT: strb r1, [r0]
|
|
||||||
; CHECK-NEXT: strb.w r3, [r12]
|
|
||||||
; CHECK-NEXT: strb r4, [r2]
|
|
||||||
; CHECK-NEXT: strb.w r5, [lr]
|
|
||||||
; CHECK-NEXT: pop {r4, r5, r7, pc}
|
|
||||||
entry:
|
entry:
|
||||||
%offs = load <4 x i8>, <4 x i8>* %offptr, align 1
|
%offs = load <4 x i8>, <4 x i8>* %offptr, align 1
|
||||||
%offs.sext = sext <4 x i8> %offs to <4 x i32>
|
%offs.sext = sext <4 x i8> %offs to <4 x i32>
|
||||||
|
@ -386,19 +378,11 @@ entry:
|
||||||
define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i16_i8(i8* %base, <4 x i8>* %offptr, <4 x i16> %input) {
|
define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i16_i8(i8* %base, <4 x i8>* %offptr, <4 x i16> %input) {
|
||||||
; CHECK-LABEL: trunc_unsigned_unscaled_i16_i8:
|
; CHECK-LABEL: trunc_unsigned_unscaled_i16_i8:
|
||||||
; CHECK: @ %bb.0: @ %entry
|
; CHECK: @ %bb.0: @ %entry
|
||||||
; CHECK-NEXT: .save {r4, r5, r7, lr}
|
; CHECK-NEXT: vmov.i32 q1, #0xff
|
||||||
; CHECK-NEXT: push {r4, r5, r7, lr}
|
; CHECK-NEXT: vldrb.u32 q2, [r1]
|
||||||
; CHECK-NEXT: vldrb.u32 q1, [r1]
|
; CHECK-NEXT: vand q0, q0, q1
|
||||||
; CHECK-NEXT: vmov r1, r3, d0
|
; CHECK-NEXT: vstrb.32 q0, [r0, q2]
|
||||||
; CHECK-NEXT: vmov r4, r5, d1
|
; CHECK-NEXT: bx lr
|
||||||
; CHECK-NEXT: vadd.i32 q1, q1, r0
|
|
||||||
; CHECK-NEXT: vmov r0, r12, d2
|
|
||||||
; CHECK-NEXT: vmov r2, lr, d3
|
|
||||||
; CHECK-NEXT: strb r1, [r0]
|
|
||||||
; CHECK-NEXT: strb.w r3, [r12]
|
|
||||||
; CHECK-NEXT: strb r4, [r2]
|
|
||||||
; CHECK-NEXT: strb.w r5, [lr]
|
|
||||||
; CHECK-NEXT: pop {r4, r5, r7, pc}
|
|
||||||
entry:
|
entry:
|
||||||
%offs = load <4 x i8>, <4 x i8>* %offptr, align 1
|
%offs = load <4 x i8>, <4 x i8>* %offptr, align 1
|
||||||
%offs.zext = zext <4 x i8> %offs to <4 x i32>
|
%offs.zext = zext <4 x i8> %offs to <4 x i32>
|
||||||
|
|
|
@ -16,37 +16,13 @@ entry:
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; Expanded ?
|
|
||||||
define arm_aapcs_vfpcc void @unscaled_v8i8_i8(i8* %base, <8 x i8>* %offptr, <8 x i8> %input) {
|
define arm_aapcs_vfpcc void @unscaled_v8i8_i8(i8* %base, <8 x i8>* %offptr, <8 x i8> %input) {
|
||||||
; CHECK-LABEL: unscaled_v8i8_i8:
|
; CHECK-LABEL: unscaled_v8i8_i8:
|
||||||
; CHECK: @ %bb.0: @ %entry
|
; CHECK: @ %bb.0: @ %entry
|
||||||
; CHECK-NEXT: .save {r4, r5, r6, lr}
|
; CHECK-NEXT: vldrb.u16 q1, [r1]
|
||||||
; CHECK-NEXT: push {r4, r5, r6, lr}
|
; CHECK-NEXT: vmovlb.u8 q0, q0
|
||||||
; CHECK-NEXT: vldrb.u32 q1, [r1]
|
; CHECK-NEXT: vstrb.16 q0, [r0, q1]
|
||||||
; CHECK-NEXT: vmov.u16 r6, q0[0]
|
; CHECK-NEXT: bx lr
|
||||||
; CHECK-NEXT: vadd.i32 q1, q1, r0
|
|
||||||
; CHECK-NEXT: vmov r2, r3, d2
|
|
||||||
; CHECK-NEXT: vmov r12, lr, d3
|
|
||||||
; CHECK-NEXT: vldrb.u32 q1, [r1, #4]
|
|
||||||
; CHECK-NEXT: vadd.i32 q1, q1, r0
|
|
||||||
; CHECK-NEXT: vmov r0, r1, d2
|
|
||||||
; CHECK-NEXT: vmov r4, r5, d3
|
|
||||||
; CHECK-NEXT: strb r6, [r2]
|
|
||||||
; CHECK-NEXT: vmov.u16 r2, q0[1]
|
|
||||||
; CHECK-NEXT: strb r2, [r3]
|
|
||||||
; CHECK-NEXT: vmov.u16 r2, q0[2]
|
|
||||||
; CHECK-NEXT: strb.w r2, [r12]
|
|
||||||
; CHECK-NEXT: vmov.u16 r2, q0[3]
|
|
||||||
; CHECK-NEXT: strb.w r2, [lr]
|
|
||||||
; CHECK-NEXT: vmov.u16 r2, q0[4]
|
|
||||||
; CHECK-NEXT: strb r2, [r0]
|
|
||||||
; CHECK-NEXT: vmov.u16 r0, q0[5]
|
|
||||||
; CHECK-NEXT: strb r0, [r1]
|
|
||||||
; CHECK-NEXT: vmov.u16 r0, q0[6]
|
|
||||||
; CHECK-NEXT: strb r0, [r4]
|
|
||||||
; CHECK-NEXT: vmov.u16 r0, q0[7]
|
|
||||||
; CHECK-NEXT: strb r0, [r5]
|
|
||||||
; CHECK-NEXT: pop {r4, r5, r6, pc}
|
|
||||||
entry:
|
entry:
|
||||||
%offs = load <8 x i8>, <8 x i8>* %offptr, align 1
|
%offs = load <8 x i8>, <8 x i8>* %offptr, align 1
|
||||||
%offs.zext = zext <8 x i8> %offs to <8 x i32>
|
%offs.zext = zext <8 x i8> %offs to <8 x i32>
|
||||||
|
|
|
@ -245,12 +245,10 @@ entry:
|
||||||
define arm_aapcs_vfpcc void @ptr_v4i16_dup(i32 %v, <4 x i16*> %offs) {
|
define arm_aapcs_vfpcc void @ptr_v4i16_dup(i32 %v, <4 x i16*> %offs) {
|
||||||
; CHECK-LABEL: ptr_v4i16_dup:
|
; CHECK-LABEL: ptr_v4i16_dup:
|
||||||
; CHECK: @ %bb.0: @ %entry
|
; CHECK: @ %bb.0: @ %entry
|
||||||
; CHECK-NEXT: vmov r1, r2, d0
|
; CHECK-NEXT: vdup.32 q1, r0
|
||||||
; CHECK-NEXT: vmov r3, r12, d1
|
; CHECK-NEXT: movs r1, #0
|
||||||
; CHECK-NEXT: strh r0, [r1]
|
; CHECK-NEXT: vmovlb.u16 q1, q1
|
||||||
; CHECK-NEXT: strh r0, [r2]
|
; CHECK-NEXT: vstrh.32 q1, [r1, q0]
|
||||||
; CHECK-NEXT: strh r0, [r3]
|
|
||||||
; CHECK-NEXT: strh.w r0, [r12]
|
|
||||||
; CHECK-NEXT: bx lr
|
; CHECK-NEXT: bx lr
|
||||||
entry:
|
entry:
|
||||||
%ext = trunc i32 %v to i16
|
%ext = trunc i32 %v to i16
|
||||||
|
|
Loading…
Reference in New Issue