[ARM] MVE Masked loads and stores

Masked loads and stores fit naturally with MVE, as the instructions are easily
predicated. This adds lowering for the simple cases of masked loads and stores.
It does not yet deal with widening/narrowing or pre/post-increment addressing.

The llvm masked load intrinsic accepts a "passthru" value, dictating the
values used for the zero-masked lanes. In MVE the instructions write 0 to the
zero-predicated lanes, so a passthru that isn't 0 (or undef) needs to be
matched with a select instruction to pull in the correct data after the load.
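
For illustration only (the IR and function name here are made up, not taken
from the tests in this patch), a masked load along these lines is the case
that needs the extra select, since its passthru is neither zero nor undef:

  define <4 x i32> @load_with_passthru(<4 x i32>* %addr, <4 x i1> %mask, <4 x i32> %passthru) {
  entry:
    ; The predicated VLDR zeroes the inactive lanes, so the lowering emits a
    ; select (VPSEL) afterwards to put %passthru back into those lanes.
    %l = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1> %mask, <4 x i32> %passthru)
    ret <4 x i32> %l
  }
  declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)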

We also need to do something with unaligned loads/stores. Currently this uses
a method similar to the one used for big endian: a VLDRB.8 (and potentially a
VREV in BE). This does mean that the predicate mask is converted from, for
example, a v4i1 to a v16i1. The VLDR instructions are defined as using the
first bit of the relevant mask lane, so this could potentially load different
results if the predicate is a little odd. As the input is a v4i1, however, I
believe this is OK: all the bits required should be set in the predicate,
making the VLDRB.8 load the same data.
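
As a sketch (again illustrative, not one of the tests in this patch), an
align-1 load such as the following is the kind that goes down the VLDRB.8
path, with its v4i1 predicate reinterpreted as a v16i1:

  define <4 x i32> @unaligned_load(<4 x i32>* %addr, <4 x i1> %mask) {
  entry:
    ; Alignment 1 means the v4i32 load is done as a byte-wise VLDRB.8 (plus a
    ; VREV32.8 in big endian); each active v4i1 lane should set all four of
    ; its byte-lane predicate bits, so the VLDRB.8 loads the same data.
    %l = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 1, <4 x i1> %mask, <4 x i32> zeroinitializer)
    ret <4 x i32> %l
  }
  declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)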

Differential Revision: https://reviews.llvm.org/D66534

llvm-svn: 370329
David Green 2019-08-29 10:54:35 +00:00
parent 313d2ce999
commit 942c2e3795
6 changed files with 567 additions and 5297 deletions

@ -259,6 +259,8 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
setOperationAction(ISD::UMAX, VT, Legal);
setOperationAction(ISD::ABS, VT, Legal);
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::MLOAD, VT, Custom);
setOperationAction(ISD::MSTORE, VT, Legal);
// No native support for these.
setOperationAction(ISD::UDIV, VT, Expand);
@ -300,6 +302,8 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
setOperationAction(ISD::BUILD_VECTOR, VT.getVectorElementType(), Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal);
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::MLOAD, VT, Custom);
setOperationAction(ISD::MSTORE, VT, Legal);
// Pre and Post inc are supported on loads and stores
for (unsigned im = (unsigned)ISD::PRE_INC;
@ -8726,6 +8730,31 @@ void ARMTargetLowering::ExpandDIV_Windows(
Results.push_back(Upper);
}
static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) {
MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
MVT VT = Op.getSimpleValueType();
SDValue Mask = N->getMask();
SDValue PassThru = N->getPassThru();
SDLoc dl(Op);
if (ISD::isBuildVectorAllZeros(PassThru.getNode()) ||
(PassThru->getOpcode() == ARMISD::VMOVIMM &&
isNullConstant(PassThru->getOperand(0))))
return Op;
// MVE Masked loads use zero as the passthru value. Here we convert undef to
// zero too, and other values are lowered to a select.
SDValue ZeroVec = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
DAG.getTargetConstant(0, dl, MVT::i32));
SDValue NewLoad = DAG.getMaskedLoad(
VT, dl, N->getChain(), N->getBasePtr(), Mask, ZeroVec, N->getMemoryVT(),
N->getMemOperand(), N->getExtensionType(), N->isExpandingLoad());
SDValue Combo = NewLoad;
if (!PassThru.isUndef())
Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl);
}
static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getOrdering()))
// Acquire/Release load/store is not legal for targets without a dmb or
@ -8925,6 +8954,8 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::UADDO:
case ISD::USUBO:
return LowerUnsignedALUO(Op, DAG);
case ISD::MLOAD:
return LowerMLOAD(Op, DAG);
case ISD::ATOMIC_LOAD:
case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG);
case ISD::FSINCOS: return LowerFSINCOS(Op, DAG);

@ -4810,6 +4810,10 @@ class MVE_vector_store_typed<ValueType Ty, Instruction RegImmInst,
PatFrag StoreKind, int shift>
: Pat<(StoreKind (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr),
(RegImmInst (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr)>;
class MVE_vector_maskedstore_typed<ValueType Ty, Instruction RegImmInst,
PatFrag StoreKind, int shift>
: Pat<(StoreKind (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr, VCCR:$pred),
(RegImmInst (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr, (i32 1), VCCR:$pred)>;
multiclass MVE_vector_store<Instruction RegImmInst, PatFrag StoreKind,
int shift> {
@ -4826,6 +4830,10 @@ class MVE_vector_load_typed<ValueType Ty, Instruction RegImmInst,
PatFrag LoadKind, int shift>
: Pat<(Ty (LoadKind t2addrmode_imm7<shift>:$addr)),
(Ty (RegImmInst t2addrmode_imm7<shift>:$addr))>;
class MVE_vector_maskedload_typed<ValueType Ty, Instruction RegImmInst,
PatFrag LoadKind, int shift>
: Pat<(Ty (LoadKind t2addrmode_imm7<shift>:$addr, VCCR:$pred, (Ty NEONimmAllZerosV))),
(Ty (RegImmInst t2addrmode_imm7<shift>:$addr, (i32 1), VCCR:$pred))>;
multiclass MVE_vector_load<Instruction RegImmInst, PatFrag LoadKind,
int shift> {
@ -4871,6 +4879,28 @@ def aligned16_post_store : PatFrag<(ops node:$val, node:$ptr, node:$offset),
return cast<StoreSDNode>(N)->getAlignment() >= 2;
}]>;
def alignedmaskedload32 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
(masked_ld node:$ptr, node:$pred, node:$passthru), [{
return cast<MaskedLoadSDNode>(N)->getAlignment() >= 4;
}]>;
def alignedmaskedload16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
(masked_ld node:$ptr, node:$pred, node:$passthru), [{
return cast<MaskedLoadSDNode>(N)->getAlignment() >= 2;
}]>;
def maskedload : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
(masked_ld node:$ptr, node:$pred, node:$passthru)>;
def alignedmaskedstore32 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
(masked_st node:$val, node:$ptr, node:$pred), [{
return cast<MaskedStoreSDNode>(N)->getAlignment() >= 4;
}]>;
def alignedmaskedstore16 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
(masked_st node:$val, node:$ptr, node:$pred), [{
return cast<MaskedStoreSDNode>(N)->getAlignment() >= 2;
}]>;
def maskedstore : PatFrag<(ops node:$val, node:$ptr, node:$pred),
(masked_st node:$val, node:$ptr, node:$pred)>;
let Predicates = [HasMVEInt, IsLE] in {
// Stores
defm : MVE_vector_store<MVE_VSTRBU8, byte_alignedstore, 0>;
@ -4889,6 +4919,26 @@ let Predicates = [HasMVEInt, IsLE] in {
defm : MVE_vector_offset_store<MVE_VSTRHU16_post, aligned16_post_store, 1>;
defm : MVE_vector_offset_store<MVE_VSTRWU32_pre, aligned32_pre_store, 2>;
defm : MVE_vector_offset_store<MVE_VSTRWU32_post, aligned32_post_store, 2>;
// Unaligned masked stores (aligned are below)
def : Pat<(maskedstore (v4i32 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
(MVE_VSTRBU8 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
def : Pat<(maskedstore (v4f32 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
(MVE_VSTRBU8 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
def : Pat<(maskedstore (v8i16 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
(MVE_VSTRBU8 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
def : Pat<(maskedstore (v8f16 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
(MVE_VSTRBU8 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
// Unaligned masked loads
def : Pat<(v4i32 (maskedload t2addrmode_imm7<0>:$addr, VCCR:$pred, (v4i32 NEONimmAllZerosV))),
(v4i32 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
def : Pat<(v4f32 (maskedload t2addrmode_imm7<0>:$addr, VCCR:$pred, (v4f32 NEONimmAllZerosV))),
(v4f32 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
def : Pat<(v8i16 (maskedload t2addrmode_imm7<0>:$addr, VCCR:$pred, (v8i16 NEONimmAllZerosV))),
(v8i16 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
def : Pat<(v8f16 (maskedload t2addrmode_imm7<0>:$addr, VCCR:$pred, (v8f16 NEONimmAllZerosV))),
(v8f16 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
}
let Predicates = [HasMVEInt, IsBE] in {
@ -4943,9 +4993,41 @@ let Predicates = [HasMVEInt, IsBE] in {
def : MVE_vector_offset_store_typed<v4i32, MVE_VSTRWU32_post, aligned32_post_store, 2>;
def : MVE_vector_offset_store_typed<v4f32, MVE_VSTRWU32_pre, aligned32_pre_store, 2>;
def : MVE_vector_offset_store_typed<v4f32, MVE_VSTRWU32_post, aligned32_post_store, 2>;
// Unaligned masked stores (aligned are below)
def : Pat<(maskedstore (v4i32 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
(MVE_VSTRBU8 (MVE_VREV32_8 MQPR:$val), t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
def : Pat<(maskedstore (v4f32 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
(MVE_VSTRBU8 (MVE_VREV32_8 MQPR:$val), t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
def : Pat<(maskedstore (v8i16 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
(MVE_VSTRBU8 (MVE_VREV16_8 MQPR:$val), t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
def : Pat<(maskedstore (v8f16 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
(MVE_VSTRBU8 (MVE_VREV16_8 MQPR:$val), t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
// Unaligned masked loads
def : Pat<(v4i32 (maskedload t2addrmode_imm7<0>:$addr, VCCR:$pred, (v4i32 NEONimmAllZerosV))),
(v4i32 (MVE_VREV32_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)))>;
def : Pat<(v4f32 (maskedload t2addrmode_imm7<0>:$addr, VCCR:$pred, (v4f32 NEONimmAllZerosV))),
(v4f32 (MVE_VREV32_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)))>;
def : Pat<(v8i16 (maskedload t2addrmode_imm7<0>:$addr, VCCR:$pred, (v8i16 NEONimmAllZerosV))),
(v8i16 (MVE_VREV16_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)))>;
def : Pat<(v8f16 (maskedload t2addrmode_imm7<0>:$addr, VCCR:$pred, (v8f16 NEONimmAllZerosV))),
(v8f16 (MVE_VREV16_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)))>;
}
let Predicates = [HasMVEInt] in {
// Aligned masked store, shared between LE and BE
def : MVE_vector_maskedstore_typed<v16i8, MVE_VSTRBU8, maskedstore, 0>;
def : MVE_vector_maskedstore_typed<v8i16, MVE_VSTRHU16, alignedmaskedstore16, 1>;
def : MVE_vector_maskedstore_typed<v8f16, MVE_VSTRHU16, alignedmaskedstore16, 1>;
def : MVE_vector_maskedstore_typed<v4i32, MVE_VSTRWU32, alignedmaskedstore32, 2>;
def : MVE_vector_maskedstore_typed<v4f32, MVE_VSTRWU32, alignedmaskedstore32, 2>;
// Aligned masked loads
def : MVE_vector_maskedload_typed<v16i8, MVE_VLDRBU8, maskedload, 0>;
def : MVE_vector_maskedload_typed<v8i16, MVE_VLDRHU16, alignedmaskedload16, 1>;
def : MVE_vector_maskedload_typed<v8f16, MVE_VLDRHU16, alignedmaskedload16, 1>;
def : MVE_vector_maskedload_typed<v4i32, MVE_VLDRWU32, alignedmaskedload32, 2>;
def : MVE_vector_maskedload_typed<v4f32, MVE_VLDRWU32, alignedmaskedload32, 2>;
// Predicate loads
def : Pat<(v16i1 (load t2addrmode_imm7<2>:$addr)),
(v16i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>;

@ -106,6 +106,20 @@ public:
return !ST->isTargetDarwin() && !ST->hasMVEFloatOps();
}
bool isLegalMaskedLoad(Type *DataTy) {
if (!ST->hasMVEIntegerOps())
return false;
unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
if (VecWidth != 128)
return false;
unsigned EltWidth = DataTy->getScalarSizeInBits();
return EltWidth == 32 || EltWidth == 16 || EltWidth == 8;
}
bool isLegalMaskedStore(Type *DataTy) { return isLegalMaskedLoad(DataTy); }
/// \name Scalar TTI Implementations
/// @{

@ -5,50 +5,11 @@
define void @foo_v4i32_v4i32(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i32> *%src) {
; CHECK-LABEL: foo_v4i32_v4i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .pad #8
; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: add r3, sp, #4
; CHECK-NEXT: vcmp.s32 gt, q0, zr
; CHECK-NEXT: @ implicit-def: $q0
; CHECK-NEXT: vstr p0, [r3]
; CHECK-NEXT: ldrb.w r1, [sp, #4]
; CHECK-NEXT: lsls r3, r1, #31
; CHECK-NEXT: itt ne
; CHECK-NEXT: ldrne r3, [r2]
; CHECK-NEXT: vmovne.32 q0[0], r3
; CHECK-NEXT: lsls r3, r1, #30
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrmi r3, [r2, #4]
; CHECK-NEXT: vmovmi.32 q0[1], r3
; CHECK-NEXT: lsls r3, r1, #29
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrmi r3, [r2, #8]
; CHECK-NEXT: vmovmi.32 q0[2], r3
; CHECK-NEXT: lsls r1, r1, #28
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrmi r1, [r2, #12]
; CHECK-NEXT: vmovmi.32 q0[3], r1
; CHECK-NEXT: mov r1, sp
; CHECK-NEXT: vstr p0, [r1]
; CHECK-NEXT: ldrb.w r1, [sp]
; CHECK-NEXT: lsls r2, r1, #31
; CHECK-NEXT: itt ne
; CHECK-NEXT: vmovne r2, s0
; CHECK-NEXT: strne r2, [r0]
; CHECK-NEXT: lsls r2, r1, #30
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi r2, s1
; CHECK-NEXT: strmi r2, [r0, #4]
; CHECK-NEXT: lsls r2, r1, #29
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi r2, s2
; CHECK-NEXT: strmi r2, [r0, #8]
; CHECK-NEXT: lsls r1, r1, #28
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi r1, s3
; CHECK-NEXT: strmi r1, [r0, #12]
; CHECK-NEXT: add sp, #8
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrwt.u32 q0, [r2]
; CHECK-NEXT: vstrwt.32 q0, [r0]
; CHECK-NEXT: bx lr
entry:
%0 = load <4 x i32>, <4 x i32>* %mask, align 4
@ -61,14 +22,14 @@ entry:
define void @foo_sext_v4i32_v4i8(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i8> *%src) {
; CHECK-LABEL: foo_sext_v4i32_v4i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .pad #8
; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: add r3, sp, #4
; CHECK-NEXT: mov r3, sp
; CHECK-NEXT: vcmp.s32 gt, q0, zr
; CHECK-NEXT: @ implicit-def: $q0
; CHECK-NEXT: vstr p0, [r3]
; CHECK-NEXT: ldrb.w r1, [sp, #4]
; CHECK-NEXT: ldrb.w r1, [sp]
; CHECK-NEXT: lsls r3, r1, #31
; CHECK-NEXT: itt ne
; CHECK-NEXT: ldrbne r3, [r2]
@ -85,28 +46,11 @@ define void @foo_sext_v4i32_v4i8(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i8> *%
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrbmi r1, [r2, #3]
; CHECK-NEXT: vmovmi.32 q0[3], r1
; CHECK-NEXT: mov r1, sp
; CHECK-NEXT: vmovlb.s8 q0, q0
; CHECK-NEXT: vstr p0, [r1]
; CHECK-NEXT: vmovlb.s16 q0, q0
; CHECK-NEXT: ldrb.w r1, [sp]
; CHECK-NEXT: lsls r2, r1, #31
; CHECK-NEXT: itt ne
; CHECK-NEXT: vmovne r2, s0
; CHECK-NEXT: strne r2, [r0]
; CHECK-NEXT: lsls r2, r1, #30
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi r2, s1
; CHECK-NEXT: strmi r2, [r0, #4]
; CHECK-NEXT: lsls r2, r1, #29
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi r2, s2
; CHECK-NEXT: strmi r2, [r0, #8]
; CHECK-NEXT: lsls r1, r1, #28
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi r1, s3
; CHECK-NEXT: strmi r1, [r0, #12]
; CHECK-NEXT: add sp, #8
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrwt.32 q0, [r0]
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: bx lr
entry:
%0 = load <4 x i32>, <4 x i32>* %mask, align 4
@ -120,14 +64,14 @@ entry:
define void @foo_sext_v4i32_v4i16(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i16> *%src) {
; CHECK-LABEL: foo_sext_v4i32_v4i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .pad #8
; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: add r3, sp, #4
; CHECK-NEXT: mov r3, sp
; CHECK-NEXT: vcmp.s32 gt, q0, zr
; CHECK-NEXT: @ implicit-def: $q0
; CHECK-NEXT: vstr p0, [r3]
; CHECK-NEXT: ldrb.w r1, [sp, #4]
; CHECK-NEXT: ldrb.w r1, [sp]
; CHECK-NEXT: lsls r3, r1, #31
; CHECK-NEXT: itt ne
; CHECK-NEXT: ldrhne r3, [r2]
@ -144,27 +88,10 @@ define void @foo_sext_v4i32_v4i16(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i16>
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrhmi r1, [r2, #6]
; CHECK-NEXT: vmovmi.32 q0[3], r1
; CHECK-NEXT: mov r1, sp
; CHECK-NEXT: vmovlb.s16 q0, q0
; CHECK-NEXT: vstr p0, [r1]
; CHECK-NEXT: ldrb.w r1, [sp]
; CHECK-NEXT: lsls r2, r1, #31
; CHECK-NEXT: itt ne
; CHECK-NEXT: vmovne r2, s0
; CHECK-NEXT: strne r2, [r0]
; CHECK-NEXT: lsls r2, r1, #30
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi r2, s1
; CHECK-NEXT: strmi r2, [r0, #4]
; CHECK-NEXT: lsls r2, r1, #29
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi r2, s2
; CHECK-NEXT: strmi r2, [r0, #8]
; CHECK-NEXT: lsls r1, r1, #28
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi r1, s3
; CHECK-NEXT: strmi r1, [r0, #12]
; CHECK-NEXT: add sp, #8
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrwt.32 q0, [r0]
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: bx lr
entry:
%0 = load <4 x i32>, <4 x i32>* %mask, align 4
@ -178,15 +105,15 @@ entry:
define void @foo_zext_v4i32_v4i8(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i8> *%src) {
; CHECK-LABEL: foo_zext_v4i32_v4i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .pad #8
; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: add r3, sp, #4
; CHECK-NEXT: mov r3, sp
; CHECK-NEXT: vmov.i32 q1, #0xff
; CHECK-NEXT: vcmp.s32 gt, q0, zr
; CHECK-NEXT: @ implicit-def: $q0
; CHECK-NEXT: vstr p0, [r3]
; CHECK-NEXT: ldrb.w r1, [sp, #4]
; CHECK-NEXT: ldrb.w r1, [sp]
; CHECK-NEXT: lsls r3, r1, #31
; CHECK-NEXT: itt ne
; CHECK-NEXT: ldrbne r3, [r2]
@ -203,27 +130,10 @@ define void @foo_zext_v4i32_v4i8(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i8> *%
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrbmi r1, [r2, #3]
; CHECK-NEXT: vmovmi.32 q0[3], r1
; CHECK-NEXT: mov r1, sp
; CHECK-NEXT: vand q0, q0, q1
; CHECK-NEXT: vstr p0, [r1]
; CHECK-NEXT: ldrb.w r1, [sp]
; CHECK-NEXT: lsls r2, r1, #31
; CHECK-NEXT: itt ne
; CHECK-NEXT: vmovne r2, s0
; CHECK-NEXT: strne r2, [r0]
; CHECK-NEXT: lsls r2, r1, #30
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi r2, s1
; CHECK-NEXT: strmi r2, [r0, #4]
; CHECK-NEXT: lsls r2, r1, #29
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi r2, s2
; CHECK-NEXT: strmi r2, [r0, #8]
; CHECK-NEXT: lsls r1, r1, #28
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi r1, s3
; CHECK-NEXT: strmi r1, [r0, #12]
; CHECK-NEXT: add sp, #8
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrwt.32 q0, [r0]
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: bx lr
entry:
%0 = load <4 x i32>, <4 x i32>* %mask, align 4
@ -237,14 +147,14 @@ entry:
define void @foo_zext_v4i32_v4i16(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i16> *%src) {
; CHECK-LABEL: foo_zext_v4i32_v4i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .pad #8
; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: add r3, sp, #4
; CHECK-NEXT: mov r3, sp
; CHECK-NEXT: vcmp.s32 gt, q0, zr
; CHECK-NEXT: @ implicit-def: $q0
; CHECK-NEXT: vstr p0, [r3]
; CHECK-NEXT: ldrb.w r1, [sp, #4]
; CHECK-NEXT: ldrb.w r1, [sp]
; CHECK-NEXT: lsls r3, r1, #31
; CHECK-NEXT: itt ne
; CHECK-NEXT: ldrhne r3, [r2]
@ -261,27 +171,10 @@ define void @foo_zext_v4i32_v4i16(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i16>
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrhmi r1, [r2, #6]
; CHECK-NEXT: vmovmi.32 q0[3], r1
; CHECK-NEXT: mov r1, sp
; CHECK-NEXT: vmovlb.u16 q0, q0
; CHECK-NEXT: vstr p0, [r1]
; CHECK-NEXT: ldrb.w r1, [sp]
; CHECK-NEXT: lsls r2, r1, #31
; CHECK-NEXT: itt ne
; CHECK-NEXT: vmovne r2, s0
; CHECK-NEXT: strne r2, [r0]
; CHECK-NEXT: lsls r2, r1, #30
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi r2, s1
; CHECK-NEXT: strmi r2, [r0, #4]
; CHECK-NEXT: lsls r2, r1, #29
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi r2, s2
; CHECK-NEXT: strmi r2, [r0, #8]
; CHECK-NEXT: lsls r1, r1, #28
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi r1, s3
; CHECK-NEXT: strmi r1, [r0, #12]
; CHECK-NEXT: add sp, #8
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrwt.32 q0, [r0]
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: bx lr
entry:
%0 = load <4 x i32>, <4 x i32>* %mask, align 4
@ -295,82 +188,11 @@ entry:
define void @foo_v8i16_v8i16(<8 x i16> *%dest, <8 x i16> *%mask, <8 x i16> *%src) {
; CHECK-LABEL: foo_v8i16_v8i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .pad #16
; CHECK-NEXT: sub sp, #16
; CHECK-NEXT: vldrh.u16 q0, [r1]
; CHECK-NEXT: add r3, sp, #8
; CHECK-NEXT: vcmp.s16 gt, q0, zr
; CHECK-NEXT: @ implicit-def: $q0
; CHECK-NEXT: vstr p0, [r3]
; CHECK-NEXT: ldrb.w r1, [sp, #8]
; CHECK-NEXT: lsls r3, r1, #31
; CHECK-NEXT: itt ne
; CHECK-NEXT: ldrhne r3, [r2]
; CHECK-NEXT: vmovne.16 q0[0], r3
; CHECK-NEXT: lsls r3, r1, #30
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrhmi r3, [r2, #2]
; CHECK-NEXT: vmovmi.16 q0[1], r3
; CHECK-NEXT: lsls r3, r1, #29
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrhmi r3, [r2, #4]
; CHECK-NEXT: vmovmi.16 q0[2], r3
; CHECK-NEXT: lsls r3, r1, #28
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrhmi r3, [r2, #6]
; CHECK-NEXT: vmovmi.16 q0[3], r3
; CHECK-NEXT: lsls r3, r1, #27
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrhmi r3, [r2, #8]
; CHECK-NEXT: vmovmi.16 q0[4], r3
; CHECK-NEXT: lsls r3, r1, #26
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrhmi r3, [r2, #10]
; CHECK-NEXT: vmovmi.16 q0[5], r3
; CHECK-NEXT: lsls r3, r1, #25
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrhmi r3, [r2, #12]
; CHECK-NEXT: vmovmi.16 q0[6], r3
; CHECK-NEXT: lsls r1, r1, #24
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrhmi r1, [r2, #14]
; CHECK-NEXT: vmovmi.16 q0[7], r1
; CHECK-NEXT: mov r1, sp
; CHECK-NEXT: vstr p0, [r1]
; CHECK-NEXT: ldrb.w r1, [sp]
; CHECK-NEXT: lsls r2, r1, #31
; CHECK-NEXT: itt ne
; CHECK-NEXT: vmovne.u16 r2, q0[0]
; CHECK-NEXT: strhne r2, [r0]
; CHECK-NEXT: lsls r2, r1, #30
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi.u16 r2, q0[1]
; CHECK-NEXT: strhmi r2, [r0, #2]
; CHECK-NEXT: lsls r2, r1, #29
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi.u16 r2, q0[2]
; CHECK-NEXT: strhmi r2, [r0, #4]
; CHECK-NEXT: lsls r2, r1, #28
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi.u16 r2, q0[3]
; CHECK-NEXT: strhmi r2, [r0, #6]
; CHECK-NEXT: lsls r2, r1, #27
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi.u16 r2, q0[4]
; CHECK-NEXT: strhmi r2, [r0, #8]
; CHECK-NEXT: lsls r2, r1, #26
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi.u16 r2, q0[5]
; CHECK-NEXT: strhmi r2, [r0, #10]
; CHECK-NEXT: lsls r2, r1, #25
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi.u16 r2, q0[6]
; CHECK-NEXT: strhmi r2, [r0, #12]
; CHECK-NEXT: lsls r1, r1, #24
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi.u16 r1, q0[7]
; CHECK-NEXT: strhmi r1, [r0, #14]
; CHECK-NEXT: add sp, #16
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrht.u16 q0, [r2]
; CHECK-NEXT: vstrht.16 q0, [r0]
; CHECK-NEXT: bx lr
entry:
%0 = load <8 x i16>, <8 x i16>* %mask, align 2
@ -383,14 +205,14 @@ entry:
define void @foo_sext_v8i16_v8i8(<8 x i16> *%dest, <8 x i16> *%mask, <8 x i8> *%src) {
; CHECK-LABEL: foo_sext_v8i16_v8i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .pad #16
; CHECK-NEXT: sub sp, #16
; CHECK-NEXT: .pad #8
; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: vldrh.u16 q0, [r1]
; CHECK-NEXT: add r3, sp, #8
; CHECK-NEXT: mov r3, sp
; CHECK-NEXT: vcmp.s16 gt, q0, zr
; CHECK-NEXT: @ implicit-def: $q0
; CHECK-NEXT: vstr p0, [r3]
; CHECK-NEXT: ldrb.w r1, [sp, #8]
; CHECK-NEXT: ldrb.w r1, [sp]
; CHECK-NEXT: lsls r3, r1, #31
; CHECK-NEXT: itt ne
; CHECK-NEXT: ldrbne r3, [r2]
@ -423,43 +245,10 @@ define void @foo_sext_v8i16_v8i8(<8 x i16> *%dest, <8 x i16> *%mask, <8 x i8> *%
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrbmi r1, [r2, #7]
; CHECK-NEXT: vmovmi.16 q0[7], r1
; CHECK-NEXT: mov r1, sp
; CHECK-NEXT: vmovlb.s8 q0, q0
; CHECK-NEXT: vstr p0, [r1]
; CHECK-NEXT: ldrb.w r1, [sp]
; CHECK-NEXT: lsls r2, r1, #31
; CHECK-NEXT: itt ne
; CHECK-NEXT: vmovne.u16 r2, q0[0]
; CHECK-NEXT: strhne r2, [r0]
; CHECK-NEXT: lsls r2, r1, #30
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi.u16 r2, q0[1]
; CHECK-NEXT: strhmi r2, [r0, #2]
; CHECK-NEXT: lsls r2, r1, #29
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi.u16 r2, q0[2]
; CHECK-NEXT: strhmi r2, [r0, #4]
; CHECK-NEXT: lsls r2, r1, #28
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi.u16 r2, q0[3]
; CHECK-NEXT: strhmi r2, [r0, #6]
; CHECK-NEXT: lsls r2, r1, #27
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi.u16 r2, q0[4]
; CHECK-NEXT: strhmi r2, [r0, #8]
; CHECK-NEXT: lsls r2, r1, #26
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi.u16 r2, q0[5]
; CHECK-NEXT: strhmi r2, [r0, #10]
; CHECK-NEXT: lsls r2, r1, #25
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi.u16 r2, q0[6]
; CHECK-NEXT: strhmi r2, [r0, #12]
; CHECK-NEXT: lsls r1, r1, #24
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi.u16 r1, q0[7]
; CHECK-NEXT: strhmi r1, [r0, #14]
; CHECK-NEXT: add sp, #16
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrht.16 q0, [r0]
; CHECK-NEXT: add sp, #8
; CHECK-NEXT: bx lr
entry:
%0 = load <8 x i16>, <8 x i16>* %mask, align 2
@ -473,14 +262,14 @@ entry:
define void @foo_zext_v8i16_v8i8(<8 x i16> *%dest, <8 x i16> *%mask, <8 x i8> *%src) {
; CHECK-LABEL: foo_zext_v8i16_v8i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .pad #16
; CHECK-NEXT: sub sp, #16
; CHECK-NEXT: .pad #8
; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: vldrh.u16 q0, [r1]
; CHECK-NEXT: add r3, sp, #8
; CHECK-NEXT: mov r3, sp
; CHECK-NEXT: vcmp.s16 gt, q0, zr
; CHECK-NEXT: @ implicit-def: $q0
; CHECK-NEXT: vstr p0, [r3]
; CHECK-NEXT: ldrb.w r1, [sp, #8]
; CHECK-NEXT: ldrb.w r1, [sp]
; CHECK-NEXT: lsls r3, r1, #31
; CHECK-NEXT: itt ne
; CHECK-NEXT: ldrbne r3, [r2]
@ -513,43 +302,10 @@ define void @foo_zext_v8i16_v8i8(<8 x i16> *%dest, <8 x i16> *%mask, <8 x i8> *%
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrbmi r1, [r2, #7]
; CHECK-NEXT: vmovmi.16 q0[7], r1
; CHECK-NEXT: mov r1, sp
; CHECK-NEXT: vmovlb.u8 q0, q0
; CHECK-NEXT: vstr p0, [r1]
; CHECK-NEXT: ldrb.w r1, [sp]
; CHECK-NEXT: lsls r2, r1, #31
; CHECK-NEXT: itt ne
; CHECK-NEXT: vmovne.u16 r2, q0[0]
; CHECK-NEXT: strhne r2, [r0]
; CHECK-NEXT: lsls r2, r1, #30
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi.u16 r2, q0[1]
; CHECK-NEXT: strhmi r2, [r0, #2]
; CHECK-NEXT: lsls r2, r1, #29
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi.u16 r2, q0[2]
; CHECK-NEXT: strhmi r2, [r0, #4]
; CHECK-NEXT: lsls r2, r1, #28
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi.u16 r2, q0[3]
; CHECK-NEXT: strhmi r2, [r0, #6]
; CHECK-NEXT: lsls r2, r1, #27
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi.u16 r2, q0[4]
; CHECK-NEXT: strhmi r2, [r0, #8]
; CHECK-NEXT: lsls r2, r1, #26
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi.u16 r2, q0[5]
; CHECK-NEXT: strhmi r2, [r0, #10]
; CHECK-NEXT: lsls r2, r1, #25
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi.u16 r2, q0[6]
; CHECK-NEXT: strhmi r2, [r0, #12]
; CHECK-NEXT: lsls r1, r1, #24
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi.u16 r1, q0[7]
; CHECK-NEXT: strhmi r1, [r0, #14]
; CHECK-NEXT: add sp, #16
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrht.16 q0, [r0]
; CHECK-NEXT: add sp, #8
; CHECK-NEXT: bx lr
entry:
%0 = load <8 x i16>, <8 x i16>* %mask, align 2
@ -563,155 +319,12 @@ entry:
define void @foo_v16i8_v16i8(<16 x i8> *%dest, <16 x i8> *%mask, <16 x i8> *%src) {
; CHECK-LABEL: foo_v16i8_v16i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r6, r7, lr}
; CHECK-NEXT: push {r4, r6, r7, lr}
; CHECK-NEXT: .setfp r7, sp, #8
; CHECK-NEXT: add r7, sp, #8
; CHECK-NEXT: .pad #32
; CHECK-NEXT: sub sp, #32
; CHECK-NEXT: mov r4, sp
; CHECK-NEXT: bfc r4, #0, #4
; CHECK-NEXT: mov sp, r4
; CHECK-NEXT: vldrb.u8 q0, [r1]
; CHECK-NEXT: add r3, sp, #16
; CHECK-NEXT: sub.w r4, r7, #8
; CHECK-NEXT: vcmp.s8 gt, q0, zr
; CHECK-NEXT: @ implicit-def: $q0
; CHECK-NEXT: vstr p0, [r3]
; CHECK-NEXT: ldrh.w r1, [sp, #16]
; CHECK-NEXT: lsls r3, r1, #31
; CHECK-NEXT: itt ne
; CHECK-NEXT: ldrbne r3, [r2]
; CHECK-NEXT: vmovne.8 q0[0], r3
; CHECK-NEXT: lsls r3, r1, #30
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrbmi r3, [r2, #1]
; CHECK-NEXT: vmovmi.8 q0[1], r3
; CHECK-NEXT: lsls r3, r1, #29
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrbmi r3, [r2, #2]
; CHECK-NEXT: vmovmi.8 q0[2], r3
; CHECK-NEXT: lsls r3, r1, #28
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrbmi r3, [r2, #3]
; CHECK-NEXT: vmovmi.8 q0[3], r3
; CHECK-NEXT: lsls r3, r1, #27
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrbmi r3, [r2, #4]
; CHECK-NEXT: vmovmi.8 q0[4], r3
; CHECK-NEXT: lsls r3, r1, #26
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrbmi r3, [r2, #5]
; CHECK-NEXT: vmovmi.8 q0[5], r3
; CHECK-NEXT: lsls r3, r1, #25
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrbmi r3, [r2, #6]
; CHECK-NEXT: vmovmi.8 q0[6], r3
; CHECK-NEXT: lsls r3, r1, #24
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrbmi r3, [r2, #7]
; CHECK-NEXT: vmovmi.8 q0[7], r3
; CHECK-NEXT: lsls r3, r1, #23
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrbmi r3, [r2, #8]
; CHECK-NEXT: vmovmi.8 q0[8], r3
; CHECK-NEXT: lsls r3, r1, #22
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrbmi r3, [r2, #9]
; CHECK-NEXT: vmovmi.8 q0[9], r3
; CHECK-NEXT: lsls r3, r1, #21
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrbmi r3, [r2, #10]
; CHECK-NEXT: vmovmi.8 q0[10], r3
; CHECK-NEXT: lsls r3, r1, #20
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrbmi r3, [r2, #11]
; CHECK-NEXT: vmovmi.8 q0[11], r3
; CHECK-NEXT: lsls r3, r1, #19
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrbmi r3, [r2, #12]
; CHECK-NEXT: vmovmi.8 q0[12], r3
; CHECK-NEXT: lsls r3, r1, #18
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrbmi r3, [r2, #13]
; CHECK-NEXT: vmovmi.8 q0[13], r3
; CHECK-NEXT: lsls r3, r1, #17
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrbmi r3, [r2, #14]
; CHECK-NEXT: vmovmi.8 q0[14], r3
; CHECK-NEXT: lsls r1, r1, #16
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrbmi r1, [r2, #15]
; CHECK-NEXT: vmovmi.8 q0[15], r1
; CHECK-NEXT: mov r1, sp
; CHECK-NEXT: vstr p0, [r1]
; CHECK-NEXT: ldrh.w r1, [sp]
; CHECK-NEXT: lsls r2, r1, #31
; CHECK-NEXT: itt ne
; CHECK-NEXT: vmovne.u8 r2, q0[0]
; CHECK-NEXT: strbne r2, [r0]
; CHECK-NEXT: lsls r2, r1, #30
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi.u8 r2, q0[1]
; CHECK-NEXT: strbmi r2, [r0, #1]
; CHECK-NEXT: lsls r2, r1, #29
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi.u8 r2, q0[2]
; CHECK-NEXT: strbmi r2, [r0, #2]
; CHECK-NEXT: lsls r2, r1, #28
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi.u8 r2, q0[3]
; CHECK-NEXT: strbmi r2, [r0, #3]
; CHECK-NEXT: lsls r2, r1, #27
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi.u8 r2, q0[4]
; CHECK-NEXT: strbmi r2, [r0, #4]
; CHECK-NEXT: lsls r2, r1, #26
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi.u8 r2, q0[5]
; CHECK-NEXT: strbmi r2, [r0, #5]
; CHECK-NEXT: lsls r2, r1, #25
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi.u8 r2, q0[6]
; CHECK-NEXT: strbmi r2, [r0, #6]
; CHECK-NEXT: lsls r2, r1, #24
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi.u8 r2, q0[7]
; CHECK-NEXT: strbmi r2, [r0, #7]
; CHECK-NEXT: lsls r2, r1, #23
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi.u8 r2, q0[8]
; CHECK-NEXT: strbmi r2, [r0, #8]
; CHECK-NEXT: lsls r2, r1, #22
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi.u8 r2, q0[9]
; CHECK-NEXT: strbmi r2, [r0, #9]
; CHECK-NEXT: lsls r2, r1, #21
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi.u8 r2, q0[10]
; CHECK-NEXT: strbmi r2, [r0, #10]
; CHECK-NEXT: lsls r2, r1, #20
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi.u8 r2, q0[11]
; CHECK-NEXT: strbmi r2, [r0, #11]
; CHECK-NEXT: lsls r2, r1, #19
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi.u8 r2, q0[12]
; CHECK-NEXT: strbmi r2, [r0, #12]
; CHECK-NEXT: lsls r2, r1, #18
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi.u8 r2, q0[13]
; CHECK-NEXT: strbmi r2, [r0, #13]
; CHECK-NEXT: lsls r2, r1, #17
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi.u8 r2, q0[14]
; CHECK-NEXT: strbmi r2, [r0, #14]
; CHECK-NEXT: lsls r1, r1, #16
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi.u8 r1, q0[15]
; CHECK-NEXT: strbmi r1, [r0, #15]
; CHECK-NEXT: mov sp, r4
; CHECK-NEXT: pop {r4, r6, r7, pc}
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrbt.u8 q0, [r2]
; CHECK-NEXT: vstrbt.8 q0, [r0]
; CHECK-NEXT: bx lr
entry:
%0 = load <16 x i8>, <16 x i8>* %mask, align 1
%1 = icmp sgt <16 x i8> %0, zeroinitializer
@ -723,48 +336,14 @@ entry:
define void @foo_trunc_v8i8_v8i16(<8 x i8> *%dest, <8 x i16> *%mask, <8 x i16> *%src) {
; CHECK-LABEL: foo_trunc_v8i8_v8i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .pad #16
; CHECK-NEXT: sub sp, #16
; CHECK-NEXT: .pad #8
; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: vldrh.u16 q0, [r1]
; CHECK-NEXT: add r3, sp, #8
; CHECK-NEXT: mov r3, sp
; CHECK-NEXT: vcmp.s16 gt, q0, zr
; CHECK-NEXT: @ implicit-def: $q0
; CHECK-NEXT: vstr p0, [r3]
; CHECK-NEXT: ldrb.w r1, [sp, #8]
; CHECK-NEXT: lsls r3, r1, #31
; CHECK-NEXT: itt ne
; CHECK-NEXT: ldrhne r3, [r2]
; CHECK-NEXT: vmovne.16 q0[0], r3
; CHECK-NEXT: lsls r3, r1, #30
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrhmi r3, [r2, #2]
; CHECK-NEXT: vmovmi.16 q0[1], r3
; CHECK-NEXT: lsls r3, r1, #29
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrhmi r3, [r2, #4]
; CHECK-NEXT: vmovmi.16 q0[2], r3
; CHECK-NEXT: lsls r3, r1, #28
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrhmi r3, [r2, #6]
; CHECK-NEXT: vmovmi.16 q0[3], r3
; CHECK-NEXT: lsls r3, r1, #27
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrhmi r3, [r2, #8]
; CHECK-NEXT: vmovmi.16 q0[4], r3
; CHECK-NEXT: lsls r3, r1, #26
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrhmi r3, [r2, #10]
; CHECK-NEXT: vmovmi.16 q0[5], r3
; CHECK-NEXT: lsls r3, r1, #25
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrhmi r3, [r2, #12]
; CHECK-NEXT: vmovmi.16 q0[6], r3
; CHECK-NEXT: lsls r1, r1, #24
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrhmi r1, [r2, #14]
; CHECK-NEXT: vmovmi.16 q0[7], r1
; CHECK-NEXT: mov r1, sp
; CHECK-NEXT: vstr p0, [r1]
; CHECK-NEXT: vpst
; CHECK-NEXT: vldrht.u16 q0, [r2]
; CHECK-NEXT: ldrb.w r1, [sp]
; CHECK-NEXT: lsls r2, r1, #31
; CHECK-NEXT: itt ne
@ -798,7 +377,7 @@ define void @foo_trunc_v8i8_v8i16(<8 x i8> *%dest, <8 x i16> *%mask, <8 x i16> *
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi.u16 r1, q0[7]
; CHECK-NEXT: strbmi r1, [r0, #7]
; CHECK-NEXT: add sp, #16
; CHECK-NEXT: add sp, #8
; CHECK-NEXT: bx lr
entry:
%0 = load <8 x i16>, <8 x i16>* %mask, align 2
@ -812,32 +391,14 @@ entry:
define void @foo_trunc_v4i8_v4i32(<4 x i8> *%dest, <4 x i32> *%mask, <4 x i32> *%src) {
; CHECK-LABEL: foo_trunc_v4i8_v4i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .pad #8
; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: add r3, sp, #4
; CHECK-NEXT: mov r3, sp
; CHECK-NEXT: vcmp.s32 gt, q0, zr
; CHECK-NEXT: @ implicit-def: $q0
; CHECK-NEXT: vstr p0, [r3]
; CHECK-NEXT: ldrb.w r1, [sp, #4]
; CHECK-NEXT: lsls r3, r1, #31
; CHECK-NEXT: itt ne
; CHECK-NEXT: ldrne r3, [r2]
; CHECK-NEXT: vmovne.32 q0[0], r3
; CHECK-NEXT: lsls r3, r1, #30
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrmi r3, [r2, #4]
; CHECK-NEXT: vmovmi.32 q0[1], r3
; CHECK-NEXT: lsls r3, r1, #29
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrmi r3, [r2, #8]
; CHECK-NEXT: vmovmi.32 q0[2], r3
; CHECK-NEXT: lsls r1, r1, #28
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrmi r1, [r2, #12]
; CHECK-NEXT: vmovmi.32 q0[3], r1
; CHECK-NEXT: mov r1, sp
; CHECK-NEXT: vstr p0, [r1]
; CHECK-NEXT: vpst
; CHECK-NEXT: vldrwt.u32 q0, [r2]
; CHECK-NEXT: ldrb.w r1, [sp]
; CHECK-NEXT: lsls r2, r1, #31
; CHECK-NEXT: itt ne
@ -855,7 +416,7 @@ define void @foo_trunc_v4i8_v4i32(<4 x i8> *%dest, <4 x i32> *%mask, <4 x i32> *
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi r1, s3
; CHECK-NEXT: strbmi r1, [r0, #3]
; CHECK-NEXT: add sp, #8
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: bx lr
entry:
%0 = load <4 x i32>, <4 x i32>* %mask, align 4
@ -869,32 +430,14 @@ entry:
define void @foo_trunc_v4i16_v4i32(<4 x i16> *%dest, <4 x i32> *%mask, <4 x i32> *%src) {
; CHECK-LABEL: foo_trunc_v4i16_v4i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .pad #8
; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: add r3, sp, #4
; CHECK-NEXT: mov r3, sp
; CHECK-NEXT: vcmp.s32 gt, q0, zr
; CHECK-NEXT: @ implicit-def: $q0
; CHECK-NEXT: vstr p0, [r3]
; CHECK-NEXT: ldrb.w r1, [sp, #4]
; CHECK-NEXT: lsls r3, r1, #31
; CHECK-NEXT: itt ne
; CHECK-NEXT: ldrne r3, [r2]
; CHECK-NEXT: vmovne.32 q0[0], r3
; CHECK-NEXT: lsls r3, r1, #30
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrmi r3, [r2, #4]
; CHECK-NEXT: vmovmi.32 q0[1], r3
; CHECK-NEXT: lsls r3, r1, #29
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrmi r3, [r2, #8]
; CHECK-NEXT: vmovmi.32 q0[2], r3
; CHECK-NEXT: lsls r1, r1, #28
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrmi r1, [r2, #12]
; CHECK-NEXT: vmovmi.32 q0[3], r1
; CHECK-NEXT: mov r1, sp
; CHECK-NEXT: vstr p0, [r1]
; CHECK-NEXT: vpst
; CHECK-NEXT: vldrwt.u32 q0, [r2]
; CHECK-NEXT: ldrb.w r1, [sp]
; CHECK-NEXT: lsls r2, r1, #31
; CHECK-NEXT: itt ne
@ -912,7 +455,7 @@ define void @foo_trunc_v4i16_v4i32(<4 x i16> *%dest, <4 x i32> *%mask, <4 x i32>
; CHECK-NEXT: itt mi
; CHECK-NEXT: vmovmi r1, s3
; CHECK-NEXT: strhmi r1, [r0, #6]
; CHECK-NEXT: add sp, #8
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: bx lr
entry:
%0 = load <4 x i32>, <4 x i32>* %mask, align 4
@ -926,42 +469,11 @@ entry:
define void @foo_v4f32_v4f32(<4 x float> *%dest, <4 x i32> *%mask, <4 x float> *%src) {
; CHECK-LABEL: foo_v4f32_v4f32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .pad #8
; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: add r3, sp, #4
; CHECK-NEXT: vcmp.s32 gt, q0, zr
; CHECK-NEXT: @ implicit-def: $q0
; CHECK-NEXT: vstr p0, [r3]
; CHECK-NEXT: ldrb.w r1, [sp, #4]
; CHECK-NEXT: lsls r3, r1, #31
; CHECK-NEXT: it ne
; CHECK-NEXT: vldrne s0, [r2]
; CHECK-NEXT: lsls r3, r1, #30
; CHECK-NEXT: it mi
; CHECK-NEXT: vldrmi s1, [r2, #4]
; CHECK-NEXT: lsls r3, r1, #29
; CHECK-NEXT: it mi
; CHECK-NEXT: vldrmi s2, [r2, #8]
; CHECK-NEXT: lsls r1, r1, #28
; CHECK-NEXT: it mi
; CHECK-NEXT: vldrmi s3, [r2, #12]
; CHECK-NEXT: mov r1, sp
; CHECK-NEXT: vstr p0, [r1]
; CHECK-NEXT: ldrb.w r1, [sp]
; CHECK-NEXT: lsls r2, r1, #31
; CHECK-NEXT: it ne
; CHECK-NEXT: vstrne s0, [r0]
; CHECK-NEXT: lsls r2, r1, #30
; CHECK-NEXT: it mi
; CHECK-NEXT: vstrmi s1, [r0, #4]
; CHECK-NEXT: lsls r2, r1, #29
; CHECK-NEXT: it mi
; CHECK-NEXT: vstrmi s2, [r0, #8]
; CHECK-NEXT: lsls r1, r1, #28
; CHECK-NEXT: it mi
; CHECK-NEXT: vstrmi s3, [r0, #12]
; CHECK-NEXT: add sp, #8
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrwt.u32 q0, [r2]
; CHECK-NEXT: vstrwt.32 q0, [r0]
; CHECK-NEXT: bx lr
entry:
%0 = load <4 x i32>, <4 x i32>* %mask, align 4
@ -974,147 +486,11 @@ entry:
define void @foo_v8f16_v8f16(<8 x half> *%dest, <8 x i16> *%mask, <8 x half> *%src) {
; CHECK-LABEL: foo_v8f16_v8f16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .pad #16
; CHECK-NEXT: sub sp, #16
; CHECK-NEXT: vldrh.u16 q0, [r1]
; CHECK-NEXT: add r3, sp, #8
; CHECK-NEXT: vcmp.s16 gt, q0, zr
; CHECK-NEXT: @ implicit-def: $q0
; CHECK-NEXT: vstr p0, [r3]
; CHECK-NEXT: ldrb.w r1, [sp, #8]
; CHECK-NEXT: lsls r3, r1, #31
; CHECK-NEXT: bne .LBB13_18
; CHECK-NEXT: @ %bb.1: @ %else
; CHECK-NEXT: lsls r3, r1, #30
; CHECK-NEXT: bmi .LBB13_19
; CHECK-NEXT: .LBB13_2: @ %else2
; CHECK-NEXT: lsls r3, r1, #29
; CHECK-NEXT: bmi .LBB13_20
; CHECK-NEXT: .LBB13_3: @ %else5
; CHECK-NEXT: lsls r3, r1, #28
; CHECK-NEXT: bmi .LBB13_21
; CHECK-NEXT: .LBB13_4: @ %else8
; CHECK-NEXT: lsls r3, r1, #27
; CHECK-NEXT: bmi .LBB13_22
; CHECK-NEXT: .LBB13_5: @ %else11
; CHECK-NEXT: lsls r3, r1, #26
; CHECK-NEXT: bmi .LBB13_23
; CHECK-NEXT: .LBB13_6: @ %else14
; CHECK-NEXT: lsls r3, r1, #25
; CHECK-NEXT: bmi .LBB13_24
; CHECK-NEXT: .LBB13_7: @ %else17
; CHECK-NEXT: lsls r1, r1, #24
; CHECK-NEXT: bpl .LBB13_9
; CHECK-NEXT: .LBB13_8: @ %cond.load19
; CHECK-NEXT: vldr.16 s4, [r2, #14]
; CHECK-NEXT: vmov r1, s4
; CHECK-NEXT: vmov.16 q0[7], r1
; CHECK-NEXT: .LBB13_9: @ %else20
; CHECK-NEXT: mov r1, sp
; CHECK-NEXT: vstr p0, [r1]
; CHECK-NEXT: ldrb.w r1, [sp]
; CHECK-NEXT: lsls r2, r1, #31
; CHECK-NEXT: bne .LBB13_25
; CHECK-NEXT: @ %bb.10: @ %else23
; CHECK-NEXT: lsls r2, r1, #30
; CHECK-NEXT: bmi .LBB13_26
; CHECK-NEXT: .LBB13_11: @ %else25
; CHECK-NEXT: lsls r2, r1, #29
; CHECK-NEXT: bmi .LBB13_27
; CHECK-NEXT: .LBB13_12: @ %else27
; CHECK-NEXT: lsls r2, r1, #28
; CHECK-NEXT: bmi .LBB13_28
; CHECK-NEXT: .LBB13_13: @ %else29
; CHECK-NEXT: lsls r2, r1, #27
; CHECK-NEXT: bmi .LBB13_29
; CHECK-NEXT: .LBB13_14: @ %else31
; CHECK-NEXT: lsls r2, r1, #26
; CHECK-NEXT: bmi .LBB13_30
; CHECK-NEXT: .LBB13_15: @ %else33
; CHECK-NEXT: lsls r2, r1, #25
; CHECK-NEXT: bmi .LBB13_31
; CHECK-NEXT: .LBB13_16: @ %else35
; CHECK-NEXT: lsls r1, r1, #24
; CHECK-NEXT: bmi .LBB13_32
; CHECK-NEXT: .LBB13_17: @ %else37
; CHECK-NEXT: add sp, #16
; CHECK-NEXT: bx lr
; CHECK-NEXT: .LBB13_18: @ %cond.load
; CHECK-NEXT: vldr.16 s0, [r2]
; CHECK-NEXT: lsls r3, r1, #30
; CHECK-NEXT: bpl .LBB13_2
; CHECK-NEXT: .LBB13_19: @ %cond.load1
; CHECK-NEXT: vldr.16 s4, [r2, #2]
; CHECK-NEXT: vmov r3, s4
; CHECK-NEXT: vmov.16 q0[1], r3
; CHECK-NEXT: lsls r3, r1, #29
; CHECK-NEXT: bpl .LBB13_3
; CHECK-NEXT: .LBB13_20: @ %cond.load4
; CHECK-NEXT: vldr.16 s4, [r2, #4]
; CHECK-NEXT: vmov r3, s4
; CHECK-NEXT: vmov.16 q0[2], r3
; CHECK-NEXT: lsls r3, r1, #28
; CHECK-NEXT: bpl .LBB13_4
; CHECK-NEXT: .LBB13_21: @ %cond.load7
; CHECK-NEXT: vldr.16 s4, [r2, #6]
; CHECK-NEXT: vmov r3, s4
; CHECK-NEXT: vmov.16 q0[3], r3
; CHECK-NEXT: lsls r3, r1, #27
; CHECK-NEXT: bpl .LBB13_5
; CHECK-NEXT: .LBB13_22: @ %cond.load10
; CHECK-NEXT: vldr.16 s4, [r2, #8]
; CHECK-NEXT: vmov r3, s4
; CHECK-NEXT: vmov.16 q0[4], r3
; CHECK-NEXT: lsls r3, r1, #26
; CHECK-NEXT: bpl .LBB13_6
; CHECK-NEXT: .LBB13_23: @ %cond.load13
; CHECK-NEXT: vldr.16 s4, [r2, #10]
; CHECK-NEXT: vmov r3, s4
; CHECK-NEXT: vmov.16 q0[5], r3
; CHECK-NEXT: lsls r3, r1, #25
; CHECK-NEXT: bpl .LBB13_7
; CHECK-NEXT: .LBB13_24: @ %cond.load16
; CHECK-NEXT: vldr.16 s4, [r2, #12]
; CHECK-NEXT: vmov r3, s4
; CHECK-NEXT: vmov.16 q0[6], r3
; CHECK-NEXT: lsls r1, r1, #24
; CHECK-NEXT: bmi .LBB13_8
; CHECK-NEXT: b .LBB13_9
; CHECK-NEXT: .LBB13_25: @ %cond.store
; CHECK-NEXT: vstr.16 s0, [r0]
; CHECK-NEXT: lsls r2, r1, #30
; CHECK-NEXT: bpl .LBB13_11
; CHECK-NEXT: .LBB13_26: @ %cond.store24
; CHECK-NEXT: vmovx.f16 s4, s0
; CHECK-NEXT: vstr.16 s4, [r0, #2]
; CHECK-NEXT: lsls r2, r1, #29
; CHECK-NEXT: bpl .LBB13_12
; CHECK-NEXT: .LBB13_27: @ %cond.store26
; CHECK-NEXT: vstr.16 s1, [r0, #4]
; CHECK-NEXT: lsls r2, r1, #28
; CHECK-NEXT: bpl .LBB13_13
; CHECK-NEXT: .LBB13_28: @ %cond.store28
; CHECK-NEXT: vmovx.f16 s4, s1
; CHECK-NEXT: vstr.16 s4, [r0, #6]
; CHECK-NEXT: lsls r2, r1, #27
; CHECK-NEXT: bpl .LBB13_14
; CHECK-NEXT: .LBB13_29: @ %cond.store30
; CHECK-NEXT: vstr.16 s2, [r0, #8]
; CHECK-NEXT: lsls r2, r1, #26
; CHECK-NEXT: bpl .LBB13_15
; CHECK-NEXT: .LBB13_30: @ %cond.store32
; CHECK-NEXT: vmovx.f16 s4, s2
; CHECK-NEXT: vstr.16 s4, [r0, #10]
; CHECK-NEXT: lsls r2, r1, #25
; CHECK-NEXT: bpl .LBB13_16
; CHECK-NEXT: .LBB13_31: @ %cond.store34
; CHECK-NEXT: vstr.16 s3, [r0, #12]
; CHECK-NEXT: lsls r1, r1, #24
; CHECK-NEXT: bpl .LBB13_17
; CHECK-NEXT: .LBB13_32: @ %cond.store36
; CHECK-NEXT: vmovx.f16 s0, s3
; CHECK-NEXT: vstr.16 s0, [r0, #14]
; CHECK-NEXT: add sp, #16
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrht.u16 q0, [r2]
; CHECK-NEXT: vstrht.16 q0, [r0]
; CHECK-NEXT: bx lr
entry:
%0 = load <8 x i16>, <8 x i16>* %mask, align 2

File diff suppressed because it is too large

File diff suppressed because it is too large