[ARM] Fix loads and stores for predicate vectors

These predicate vectors can usually be loaded and stored with a single
instruction, a VSTR_P0. However this instruction will store the entire P0
predicate, 16 bits, zero-extended to 32 bits, with each lane of the
v4i1/v8i1/v16i1 occupying 4/2/1 bits respectively.

As far as I understand, when llvm says "store this v4i1", it really does need
to store 4 bits (or 8, that being the size of a byte, with the bottom 4 as the
interesting bits). For example a bitcast from a v8i1 to a i8 is defined as a
store followed by a load, which is how the code is expanded.

So this instead lowers the v4i1/v8i1 load/store through some shuffles to get
the bits into the correct positions. This, as you might imagine, is not as
efficient as a single instruction. But I believe it is needed for correctness.
v16i1 equally should not load/store 32bits, only storing the 16bits of data.
Stack loads/stores are still using the VSTR_P0 (as can be seen by the test not
changing). This is fine as they are self-consistent, it is only "externally
observable loads/stores" (from our point of view) that need to be corrected.

Differential revision: https://reviews.llvm.org/D67085

llvm-svn: 371419
This commit is contained in:
David Green 2019-09-09 16:35:49 +00:00
parent 63e6d8db1c
commit 2b7089949e
7 changed files with 3091 additions and 775 deletions

View File

@ -378,6 +378,8 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand);
setOperationAction(ISD::LOAD, VT, Custom);
setOperationAction(ISD::STORE, VT, Custom);
}
}
@ -8783,6 +8785,65 @@ void ARMTargetLowering::ExpandDIV_Windows(
Results.push_back(Upper);
}
/// Custom lowering for an unindexed, non-extending load of an MVE predicate
/// vector (v4i1, v8i1 or v16i1).
///
/// A plain predicate VLDR reads the whole 16-bit predicate (and, for v16i1,
/// 32 bits of memory, which is incorrect on big-endian), with the lanes of a
/// v4i1/v8i1 spread across those 16 bits. To get the 4/8 interesting bits into
/// the right place, the value is instead read as a scalar integer that is
/// exactly as wide as the predicate type, any-extended to i32, cast to a full
/// v16i1 predicate and, for the narrower types, cut down to the low lanes.
///
/// \param Op  The ISD::LOAD node being lowered; its memory type must be one of
///            the predicate vector types and equal to its result type.
/// \param DAG The SelectionDAG in which the replacement nodes are created.
/// \returns The loaded predicate merged with the chain result of the new
///          scalar load.
static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG) {
  auto *Ld = cast<LoadSDNode>(Op.getNode());
  const EVT PredVT = Ld->getMemoryVT();
  assert((PredVT == MVT::v4i1 || PredVT == MVT::v8i1 ||
          PredVT == MVT::v16i1) &&
         "Expected a predicate type!");
  assert(PredVT == Op.getValueType());
  assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
         "Expected a non-extending load");
  assert(Ld->isUnindexed() && "Expected a unindexed load");

  SDLoc DL(Op);

  // Integer type with one bit per predicate lane: i4, i8 or i16.
  const EVT PackedVT =
      EVT::getIntegerVT(*DAG.getContext(), PredVT.getSizeInBits());

  // Read only the packed bits from memory, widened into a 32-bit scalar.
  SDValue Scalar =
      DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Ld->getChain(),
                     Ld->getBasePtr(), PackedVT, Ld->getMemOperand());

  // Reinterpret the scalar as a full 16-lane predicate.
  SDValue Wide = DAG.getNode(ARMISD::PREDICATE_CAST, DL, MVT::v16i1, Scalar);

  // v4i1/v8i1 only want the lanes starting at index 0; v16i1 is used as is.
  const bool NeedsNarrowing = PredVT != MVT::v16i1;
  SDValue Result =
      NeedsNarrowing ? DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, PredVT, Wide,
                                   DAG.getConstant(0, DL, MVT::i32))
                     : Wide;

  // Hand back both the value and the output chain of the scalar load.
  SDValue Parts[] = {Result, Scalar.getValue(1)};
  return DAG.getMergeValues(Parts, DL);
}
/// Custom lowering for an unindexed, non-truncating store of an MVE predicate
/// vector (v4i1, v8i1 or v16i1).
///
/// Only as many bits as the predicate type has lanes are written. A v4i1/v8i1
/// value is first rebuilt as a v16i1 whose low lanes are the original elements
/// and whose remaining lanes are undefined; the v16i1 is then cast to an i32
/// and stored with a truncating scalar store of the packed width.
///
/// \param Op  The ISD::STORE node being lowered; its memory type must be one
///            of the predicate vector types and equal to the stored value's
///            type.
/// \param DAG The SelectionDAG in which the replacement nodes are created.
/// \returns The truncating scalar store that replaces the original store.
static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) {
  auto *St = cast<StoreSDNode>(Op.getNode());
  const EVT PredVT = St->getMemoryVT();
  assert((PredVT == MVT::v4i1 || PredVT == MVT::v8i1 ||
          PredVT == MVT::v16i1) &&
         "Expected a predicate type!");
  assert(PredVT == St->getValue().getValueType());
  assert(!St->isTruncatingStore() && "Expected a non-extending store");
  assert(St->isUnindexed() && "Expected a unindexed store");

  SDLoc DL(Op);
  SDValue Wide = St->getValue();

  if (PredVT != MVT::v16i1) {
    // Widen to 16 lanes: copy each real element across, then pad with undef.
    const unsigned NumLanes = PredVT.getVectorNumElements();
    SmallVector<SDValue, 16> Lanes;
    for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
      SDValue Idx = DAG.getConstant(Lane, DL, MVT::i32);
      Lanes.push_back(
          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Wide, Idx));
    }
    while (Lanes.size() < 16)
      Lanes.push_back(DAG.getUNDEF(MVT::i32));
    Wide = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i1, Lanes);
  }

  // Move the 16-lane predicate into a general-purpose 32-bit value.
  SDValue Scalar = DAG.getNode(ARMISD::PREDICATE_CAST, DL, MVT::i32, Wide);

  // Integer type with one bit per predicate lane: i4, i8 or i16. Truncating
  // to it ensures nothing beyond the predicate's own bits is stored.
  const EVT PackedVT =
      EVT::getIntegerVT(*DAG.getContext(), PredVT.getSizeInBits());
  return DAG.getTruncStore(St->getChain(), DL, Scalar, St->getBasePtr(),
                           PackedVT, St->getMemOperand());
}
static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getOrdering()))
// Acquire/Release load/store is not legal for targets without a dmb or
@ -8982,6 +9043,10 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::UADDO:
case ISD::USUBO:
return LowerUnsignedALUO(Op, DAG);
case ISD::LOAD:
return LowerPredicateLoad(Op, DAG);
case ISD::STORE:
return LowerPredicateStore(Op, DAG);
case ISD::ATOMIC_LOAD:
case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG);
case ISD::FSINCOS: return LowerFSINCOS(Op, DAG);

View File

@ -4999,24 +4999,6 @@ let Predicates = [HasMVEInt, IsBE] in {
def : MVE_vector_offset_store_typed<v4f32, MVE_VSTRWU32_post, aligned32_post_store, 2>;
}
// Anonymous selection patterns, available only when HasMVEInt holds, that map
// plain loads and stores of the MVE predicate vector types (v16i1, v8i1 and
// v4i1) onto the predicate-register transfer instructions. All three types
// share one instruction per direction and use a t2addrmode_imm7<2> address.
// NOTE(review): the instruction moves the whole predicate register rather than
// one packed bit per lane, so the memory image is presumably not a packed vNi1
// for the narrower types - confirm before using these for memory that is
// observable outside the function.
let Predicates = [HasMVEInt] in {
// Predicate loads: every predicate type is selected to VLDR_P0_off.
def : Pat<(v16i1 (load t2addrmode_imm7<2>:$addr)),
(v16i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>;
def : Pat<(v8i1 (load t2addrmode_imm7<2>:$addr)),
(v8i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>;
def : Pat<(v4i1 (load t2addrmode_imm7<2>:$addr)),
(v4i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>;
// Predicate stores: every predicate type, held in a VCCR operand, is selected
// to VSTR_P0_off.
def : Pat<(store (v4i1 VCCR:$val), t2addrmode_imm7<2>:$addr),
(VSTR_P0_off VCCR:$val, t2addrmode_imm7<2>:$addr)>;
def : Pat<(store (v8i1 VCCR:$val), t2addrmode_imm7<2>:$addr),
(VSTR_P0_off VCCR:$val, t2addrmode_imm7<2>:$addr)>;
def : Pat<(store (v16i1 VCCR:$val), t2addrmode_imm7<2>:$addr),
(VSTR_P0_off VCCR:$val, t2addrmode_imm7<2>:$addr)>;
}
// Widening/Narrowing Loads/Stores

View File

@ -8,11 +8,23 @@ define void @foo_v4i32_v4i32(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i32> *%src
; CHECK-NEXT: .pad #8
; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: add r3, sp, #4
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: vcmp.s32 gt, q0, zr
; CHECK-NEXT: @ implicit-def: $q0
; CHECK-NEXT: vstr p0, [r3]
; CHECK-NEXT: ldrb.w r1, [sp, #4]
; CHECK-NEXT: vmrs r12, p0
; CHECK-NEXT: and r1, r12, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #0, #1
; CHECK-NEXT: ubfx r1, r12, #4, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #1, #1
; CHECK-NEXT: ubfx r1, r12, #8, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #2, #1
; CHECK-NEXT: ubfx r1, r12, #12, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #3, #1
; CHECK-NEXT: and r1, r3, #15
; CHECK-NEXT: lsls r3, r1, #31
; CHECK-NEXT: itt ne
; CHECK-NEXT: ldrne r3, [r2]
@ -29,9 +41,21 @@ define void @foo_v4i32_v4i32(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i32> *%src
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrmi r1, [r2, #12]
; CHECK-NEXT: vmovmi.32 q0[3], r1
; CHECK-NEXT: mov r1, sp
; CHECK-NEXT: vstr p0, [r1]
; CHECK-NEXT: ldrb.w r1, [sp]
; CHECK-NEXT: vmrs r2, p0
; CHECK-NEXT: movs r1, #0
; CHECK-NEXT: and r3, r2, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r1, r3, #0, #1
; CHECK-NEXT: ubfx r3, r2, #4, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r1, r3, #1, #1
; CHECK-NEXT: ubfx r3, r2, #8, #1
; CHECK-NEXT: ubfx r2, r2, #12, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r1, r3, #2, #1
; CHECK-NEXT: rsbs r2, r2, #0
; CHECK-NEXT: bfi r1, r2, #3, #1
; CHECK-NEXT: and r1, r1, #15
; CHECK-NEXT: lsls r2, r1, #31
; CHECK-NEXT: itt ne
; CHECK-NEXT: vmovne r2, s0
@ -64,11 +88,23 @@ define void @foo_sext_v4i32_v4i8(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i8> *%
; CHECK-NEXT: .pad #8
; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: add r3, sp, #4
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: vcmp.s32 gt, q0, zr
; CHECK-NEXT: @ implicit-def: $q0
; CHECK-NEXT: vstr p0, [r3]
; CHECK-NEXT: ldrb.w r1, [sp, #4]
; CHECK-NEXT: vmrs r12, p0
; CHECK-NEXT: and r1, r12, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #0, #1
; CHECK-NEXT: ubfx r1, r12, #4, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #1, #1
; CHECK-NEXT: ubfx r1, r12, #8, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #2, #1
; CHECK-NEXT: ubfx r1, r12, #12, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #3, #1
; CHECK-NEXT: and r1, r3, #15
; CHECK-NEXT: lsls r3, r1, #31
; CHECK-NEXT: itt ne
; CHECK-NEXT: ldrbne r3, [r2]
@ -85,11 +121,23 @@ define void @foo_sext_v4i32_v4i8(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i8> *%
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrbmi r1, [r2, #3]
; CHECK-NEXT: vmovmi.32 q0[3], r1
; CHECK-NEXT: mov r1, sp
; CHECK-NEXT: vmrs r2, p0
; CHECK-NEXT: movs r1, #0
; CHECK-NEXT: vmovlb.s8 q0, q0
; CHECK-NEXT: vstr p0, [r1]
; CHECK-NEXT: vmovlb.s16 q0, q0
; CHECK-NEXT: ldrb.w r1, [sp]
; CHECK-NEXT: and r3, r2, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r1, r3, #0, #1
; CHECK-NEXT: ubfx r3, r2, #4, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r1, r3, #1, #1
; CHECK-NEXT: ubfx r3, r2, #8, #1
; CHECK-NEXT: ubfx r2, r2, #12, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r1, r3, #2, #1
; CHECK-NEXT: rsbs r2, r2, #0
; CHECK-NEXT: bfi r1, r2, #3, #1
; CHECK-NEXT: and r1, r1, #15
; CHECK-NEXT: lsls r2, r1, #31
; CHECK-NEXT: itt ne
; CHECK-NEXT: vmovne r2, s0
@ -123,11 +171,23 @@ define void @foo_sext_v4i32_v4i16(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i16>
; CHECK-NEXT: .pad #8
; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: add r3, sp, #4
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: vcmp.s32 gt, q0, zr
; CHECK-NEXT: @ implicit-def: $q0
; CHECK-NEXT: vstr p0, [r3]
; CHECK-NEXT: ldrb.w r1, [sp, #4]
; CHECK-NEXT: vmrs r12, p0
; CHECK-NEXT: and r1, r12, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #0, #1
; CHECK-NEXT: ubfx r1, r12, #4, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #1, #1
; CHECK-NEXT: ubfx r1, r12, #8, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #2, #1
; CHECK-NEXT: ubfx r1, r12, #12, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #3, #1
; CHECK-NEXT: and r1, r3, #15
; CHECK-NEXT: lsls r3, r1, #31
; CHECK-NEXT: itt ne
; CHECK-NEXT: ldrhne r3, [r2]
@ -144,10 +204,22 @@ define void @foo_sext_v4i32_v4i16(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i16>
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrhmi r1, [r2, #6]
; CHECK-NEXT: vmovmi.32 q0[3], r1
; CHECK-NEXT: mov r1, sp
; CHECK-NEXT: vmrs r2, p0
; CHECK-NEXT: movs r1, #0
; CHECK-NEXT: vmovlb.s16 q0, q0
; CHECK-NEXT: vstr p0, [r1]
; CHECK-NEXT: ldrb.w r1, [sp]
; CHECK-NEXT: and r3, r2, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r1, r3, #0, #1
; CHECK-NEXT: ubfx r3, r2, #4, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r1, r3, #1, #1
; CHECK-NEXT: ubfx r3, r2, #8, #1
; CHECK-NEXT: ubfx r2, r2, #12, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r1, r3, #2, #1
; CHECK-NEXT: rsbs r2, r2, #0
; CHECK-NEXT: bfi r1, r2, #3, #1
; CHECK-NEXT: and r1, r1, #15
; CHECK-NEXT: lsls r2, r1, #31
; CHECK-NEXT: itt ne
; CHECK-NEXT: vmovne r2, s0
@ -181,12 +253,24 @@ define void @foo_zext_v4i32_v4i8(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i8> *%
; CHECK-NEXT: .pad #8
; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: add r3, sp, #4
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: vmov.i32 q1, #0xff
; CHECK-NEXT: vcmp.s32 gt, q0, zr
; CHECK-NEXT: @ implicit-def: $q0
; CHECK-NEXT: vstr p0, [r3]
; CHECK-NEXT: ldrb.w r1, [sp, #4]
; CHECK-NEXT: vmrs r12, p0
; CHECK-NEXT: and r1, r12, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #0, #1
; CHECK-NEXT: ubfx r1, r12, #4, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #1, #1
; CHECK-NEXT: ubfx r1, r12, #8, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #2, #1
; CHECK-NEXT: ubfx r1, r12, #12, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #3, #1
; CHECK-NEXT: and r1, r3, #15
; CHECK-NEXT: lsls r3, r1, #31
; CHECK-NEXT: itt ne
; CHECK-NEXT: ldrbne r3, [r2]
@ -203,10 +287,22 @@ define void @foo_zext_v4i32_v4i8(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i8> *%
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrbmi r1, [r2, #3]
; CHECK-NEXT: vmovmi.32 q0[3], r1
; CHECK-NEXT: mov r1, sp
; CHECK-NEXT: vmrs r2, p0
; CHECK-NEXT: movs r1, #0
; CHECK-NEXT: vand q0, q0, q1
; CHECK-NEXT: vstr p0, [r1]
; CHECK-NEXT: ldrb.w r1, [sp]
; CHECK-NEXT: and r3, r2, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r1, r3, #0, #1
; CHECK-NEXT: ubfx r3, r2, #4, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r1, r3, #1, #1
; CHECK-NEXT: ubfx r3, r2, #8, #1
; CHECK-NEXT: ubfx r2, r2, #12, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r1, r3, #2, #1
; CHECK-NEXT: rsbs r2, r2, #0
; CHECK-NEXT: bfi r1, r2, #3, #1
; CHECK-NEXT: and r1, r1, #15
; CHECK-NEXT: lsls r2, r1, #31
; CHECK-NEXT: itt ne
; CHECK-NEXT: vmovne r2, s0
@ -240,11 +336,23 @@ define void @foo_zext_v4i32_v4i16(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i16>
; CHECK-NEXT: .pad #8
; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: add r3, sp, #4
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: vcmp.s32 gt, q0, zr
; CHECK-NEXT: @ implicit-def: $q0
; CHECK-NEXT: vstr p0, [r3]
; CHECK-NEXT: ldrb.w r1, [sp, #4]
; CHECK-NEXT: vmrs r12, p0
; CHECK-NEXT: and r1, r12, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #0, #1
; CHECK-NEXT: ubfx r1, r12, #4, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #1, #1
; CHECK-NEXT: ubfx r1, r12, #8, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #2, #1
; CHECK-NEXT: ubfx r1, r12, #12, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #3, #1
; CHECK-NEXT: and r1, r3, #15
; CHECK-NEXT: lsls r3, r1, #31
; CHECK-NEXT: itt ne
; CHECK-NEXT: ldrhne r3, [r2]
@ -261,10 +369,22 @@ define void @foo_zext_v4i32_v4i16(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i16>
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrhmi r1, [r2, #6]
; CHECK-NEXT: vmovmi.32 q0[3], r1
; CHECK-NEXT: mov r1, sp
; CHECK-NEXT: vmrs r2, p0
; CHECK-NEXT: movs r1, #0
; CHECK-NEXT: vmovlb.u16 q0, q0
; CHECK-NEXT: vstr p0, [r1]
; CHECK-NEXT: ldrb.w r1, [sp]
; CHECK-NEXT: and r3, r2, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r1, r3, #0, #1
; CHECK-NEXT: ubfx r3, r2, #4, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r1, r3, #1, #1
; CHECK-NEXT: ubfx r3, r2, #8, #1
; CHECK-NEXT: ubfx r2, r2, #12, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r1, r3, #2, #1
; CHECK-NEXT: rsbs r2, r2, #0
; CHECK-NEXT: bfi r1, r2, #3, #1
; CHECK-NEXT: and r1, r1, #15
; CHECK-NEXT: lsls r2, r1, #31
; CHECK-NEXT: itt ne
; CHECK-NEXT: vmovne r2, s0
@ -298,12 +418,36 @@ define void @foo_v8i16_v8i16(<8 x i16> *%dest, <8 x i16> *%mask, <8 x i16> *%src
; CHECK-NEXT: .pad #16
; CHECK-NEXT: sub sp, #16
; CHECK-NEXT: vldrh.u16 q0, [r1]
; CHECK-NEXT: add r3, sp, #8
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: vcmp.s16 gt, q0, zr
; CHECK-NEXT: @ implicit-def: $q0
; CHECK-NEXT: vstr p0, [r3]
; CHECK-NEXT: ldrb.w r1, [sp, #8]
; CHECK-NEXT: lsls r3, r1, #31
; CHECK-NEXT: vmrs r12, p0
; CHECK-NEXT: and r1, r12, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #0, #1
; CHECK-NEXT: ubfx r1, r12, #2, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #1, #1
; CHECK-NEXT: ubfx r1, r12, #4, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #2, #1
; CHECK-NEXT: ubfx r1, r12, #6, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #3, #1
; CHECK-NEXT: ubfx r1, r12, #8, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #4, #1
; CHECK-NEXT: ubfx r1, r12, #10, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #5, #1
; CHECK-NEXT: ubfx r1, r12, #12, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #6, #1
; CHECK-NEXT: ubfx r1, r12, #14, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #7, #1
; CHECK-NEXT: uxtb r1, r3
; CHECK-NEXT: lsls r3, r3, #31
; CHECK-NEXT: itt ne
; CHECK-NEXT: ldrhne r3, [r2]
; CHECK-NEXT: vmovne.16 q0[0], r3
@ -335,10 +479,34 @@ define void @foo_v8i16_v8i16(<8 x i16> *%dest, <8 x i16> *%mask, <8 x i16> *%src
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrhmi r1, [r2, #14]
; CHECK-NEXT: vmovmi.16 q0[7], r1
; CHECK-NEXT: mov r1, sp
; CHECK-NEXT: vstr p0, [r1]
; CHECK-NEXT: ldrb.w r1, [sp]
; CHECK-NEXT: lsls r2, r1, #31
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: vmrs r1, p0
; CHECK-NEXT: and r3, r1, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r2, r3, #0, #1
; CHECK-NEXT: ubfx r3, r1, #2, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r2, r3, #1, #1
; CHECK-NEXT: ubfx r3, r1, #4, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r2, r3, #2, #1
; CHECK-NEXT: ubfx r3, r1, #6, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r2, r3, #3, #1
; CHECK-NEXT: ubfx r3, r1, #8, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r2, r3, #4, #1
; CHECK-NEXT: ubfx r3, r1, #10, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r2, r3, #5, #1
; CHECK-NEXT: ubfx r3, r1, #12, #1
; CHECK-NEXT: ubfx r1, r1, #14, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r2, r3, #6, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r2, r1, #7, #1
; CHECK-NEXT: uxtb r1, r2
; CHECK-NEXT: lsls r2, r2, #31
; CHECK-NEXT: itt ne
; CHECK-NEXT: vmovne.u16 r2, q0[0]
; CHECK-NEXT: strhne r2, [r0]
@ -386,12 +554,36 @@ define void @foo_sext_v8i16_v8i8(<8 x i16> *%dest, <8 x i16> *%mask, <8 x i8> *%
; CHECK-NEXT: .pad #16
; CHECK-NEXT: sub sp, #16
; CHECK-NEXT: vldrh.u16 q0, [r1]
; CHECK-NEXT: add r3, sp, #8
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: vcmp.s16 gt, q0, zr
; CHECK-NEXT: @ implicit-def: $q0
; CHECK-NEXT: vstr p0, [r3]
; CHECK-NEXT: ldrb.w r1, [sp, #8]
; CHECK-NEXT: lsls r3, r1, #31
; CHECK-NEXT: vmrs r12, p0
; CHECK-NEXT: and r1, r12, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #0, #1
; CHECK-NEXT: ubfx r1, r12, #2, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #1, #1
; CHECK-NEXT: ubfx r1, r12, #4, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #2, #1
; CHECK-NEXT: ubfx r1, r12, #6, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #3, #1
; CHECK-NEXT: ubfx r1, r12, #8, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #4, #1
; CHECK-NEXT: ubfx r1, r12, #10, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #5, #1
; CHECK-NEXT: ubfx r1, r12, #12, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #6, #1
; CHECK-NEXT: ubfx r1, r12, #14, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #7, #1
; CHECK-NEXT: uxtb r1, r3
; CHECK-NEXT: lsls r3, r3, #31
; CHECK-NEXT: itt ne
; CHECK-NEXT: ldrbne r3, [r2]
; CHECK-NEXT: vmovne.16 q0[0], r3
@ -423,11 +615,35 @@ define void @foo_sext_v8i16_v8i8(<8 x i16> *%dest, <8 x i16> *%mask, <8 x i8> *%
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrbmi r1, [r2, #7]
; CHECK-NEXT: vmovmi.16 q0[7], r1
; CHECK-NEXT: mov r1, sp
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: vmrs r1, p0
; CHECK-NEXT: vmovlb.s8 q0, q0
; CHECK-NEXT: vstr p0, [r1]
; CHECK-NEXT: ldrb.w r1, [sp]
; CHECK-NEXT: lsls r2, r1, #31
; CHECK-NEXT: and r3, r1, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r2, r3, #0, #1
; CHECK-NEXT: ubfx r3, r1, #2, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r2, r3, #1, #1
; CHECK-NEXT: ubfx r3, r1, #4, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r2, r3, #2, #1
; CHECK-NEXT: ubfx r3, r1, #6, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r2, r3, #3, #1
; CHECK-NEXT: ubfx r3, r1, #8, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r2, r3, #4, #1
; CHECK-NEXT: ubfx r3, r1, #10, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r2, r3, #5, #1
; CHECK-NEXT: ubfx r3, r1, #12, #1
; CHECK-NEXT: ubfx r1, r1, #14, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r2, r3, #6, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r2, r1, #7, #1
; CHECK-NEXT: uxtb r1, r2
; CHECK-NEXT: lsls r2, r2, #31
; CHECK-NEXT: itt ne
; CHECK-NEXT: vmovne.u16 r2, q0[0]
; CHECK-NEXT: strhne r2, [r0]
@ -476,12 +692,36 @@ define void @foo_zext_v8i16_v8i8(<8 x i16> *%dest, <8 x i16> *%mask, <8 x i8> *%
; CHECK-NEXT: .pad #16
; CHECK-NEXT: sub sp, #16
; CHECK-NEXT: vldrh.u16 q0, [r1]
; CHECK-NEXT: add r3, sp, #8
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: vcmp.s16 gt, q0, zr
; CHECK-NEXT: @ implicit-def: $q0
; CHECK-NEXT: vstr p0, [r3]
; CHECK-NEXT: ldrb.w r1, [sp, #8]
; CHECK-NEXT: lsls r3, r1, #31
; CHECK-NEXT: vmrs r12, p0
; CHECK-NEXT: and r1, r12, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #0, #1
; CHECK-NEXT: ubfx r1, r12, #2, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #1, #1
; CHECK-NEXT: ubfx r1, r12, #4, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #2, #1
; CHECK-NEXT: ubfx r1, r12, #6, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #3, #1
; CHECK-NEXT: ubfx r1, r12, #8, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #4, #1
; CHECK-NEXT: ubfx r1, r12, #10, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #5, #1
; CHECK-NEXT: ubfx r1, r12, #12, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #6, #1
; CHECK-NEXT: ubfx r1, r12, #14, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #7, #1
; CHECK-NEXT: uxtb r1, r3
; CHECK-NEXT: lsls r3, r3, #31
; CHECK-NEXT: itt ne
; CHECK-NEXT: ldrbne r3, [r2]
; CHECK-NEXT: vmovne.16 q0[0], r3
@ -513,11 +753,35 @@ define void @foo_zext_v8i16_v8i8(<8 x i16> *%dest, <8 x i16> *%mask, <8 x i8> *%
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrbmi r1, [r2, #7]
; CHECK-NEXT: vmovmi.16 q0[7], r1
; CHECK-NEXT: mov r1, sp
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: vmrs r1, p0
; CHECK-NEXT: vmovlb.u8 q0, q0
; CHECK-NEXT: vstr p0, [r1]
; CHECK-NEXT: ldrb.w r1, [sp]
; CHECK-NEXT: lsls r2, r1, #31
; CHECK-NEXT: and r3, r1, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r2, r3, #0, #1
; CHECK-NEXT: ubfx r3, r1, #2, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r2, r3, #1, #1
; CHECK-NEXT: ubfx r3, r1, #4, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r2, r3, #2, #1
; CHECK-NEXT: ubfx r3, r1, #6, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r2, r3, #3, #1
; CHECK-NEXT: ubfx r3, r1, #8, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r2, r3, #4, #1
; CHECK-NEXT: ubfx r3, r1, #10, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r2, r3, #5, #1
; CHECK-NEXT: ubfx r3, r1, #12, #1
; CHECK-NEXT: ubfx r1, r1, #14, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r2, r3, #6, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r2, r1, #7, #1
; CHECK-NEXT: uxtb r1, r2
; CHECK-NEXT: lsls r2, r2, #31
; CHECK-NEXT: itt ne
; CHECK-NEXT: vmovne.u16 r2, q0[0]
; CHECK-NEXT: strhne r2, [r0]
@ -573,13 +837,12 @@ define void @foo_v16i8_v16i8(<16 x i8> *%dest, <16 x i8> *%mask, <16 x i8> *%src
; CHECK-NEXT: bfc r4, #0, #4
; CHECK-NEXT: mov sp, r4
; CHECK-NEXT: vldrb.u8 q0, [r1]
; CHECK-NEXT: add r3, sp, #16
; CHECK-NEXT: sub.w r4, r7, #8
; CHECK-NEXT: vcmp.s8 gt, q0, zr
; CHECK-NEXT: @ implicit-def: $q0
; CHECK-NEXT: vstr p0, [r3]
; CHECK-NEXT: ldrh.w r1, [sp, #16]
; CHECK-NEXT: lsls r3, r1, #31
; CHECK-NEXT: vmrs r3, p0
; CHECK-NEXT: uxth r1, r3
; CHECK-NEXT: lsls r3, r3, #31
; CHECK-NEXT: itt ne
; CHECK-NEXT: ldrbne r3, [r2]
; CHECK-NEXT: vmovne.8 q0[0], r3
@ -643,10 +906,9 @@ define void @foo_v16i8_v16i8(<16 x i8> *%dest, <16 x i8> *%mask, <16 x i8> *%src
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrbmi r1, [r2, #15]
; CHECK-NEXT: vmovmi.8 q0[15], r1
; CHECK-NEXT: mov r1, sp
; CHECK-NEXT: vstr p0, [r1]
; CHECK-NEXT: ldrh.w r1, [sp]
; CHECK-NEXT: lsls r2, r1, #31
; CHECK-NEXT: vmrs r2, p0
; CHECK-NEXT: uxth r1, r2
; CHECK-NEXT: lsls r2, r2, #31
; CHECK-NEXT: itt ne
; CHECK-NEXT: vmovne.u8 r2, q0[0]
; CHECK-NEXT: strbne r2, [r0]
@ -726,12 +988,36 @@ define void @foo_trunc_v8i8_v8i16(<8 x i8> *%dest, <8 x i16> *%mask, <8 x i16> *
; CHECK-NEXT: .pad #16
; CHECK-NEXT: sub sp, #16
; CHECK-NEXT: vldrh.u16 q0, [r1]
; CHECK-NEXT: add r3, sp, #8
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: vcmp.s16 gt, q0, zr
; CHECK-NEXT: @ implicit-def: $q0
; CHECK-NEXT: vstr p0, [r3]
; CHECK-NEXT: ldrb.w r1, [sp, #8]
; CHECK-NEXT: lsls r3, r1, #31
; CHECK-NEXT: vmrs r12, p0
; CHECK-NEXT: and r1, r12, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #0, #1
; CHECK-NEXT: ubfx r1, r12, #2, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #1, #1
; CHECK-NEXT: ubfx r1, r12, #4, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #2, #1
; CHECK-NEXT: ubfx r1, r12, #6, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #3, #1
; CHECK-NEXT: ubfx r1, r12, #8, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #4, #1
; CHECK-NEXT: ubfx r1, r12, #10, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #5, #1
; CHECK-NEXT: ubfx r1, r12, #12, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #6, #1
; CHECK-NEXT: ubfx r1, r12, #14, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #7, #1
; CHECK-NEXT: uxtb r1, r3
; CHECK-NEXT: lsls r3, r3, #31
; CHECK-NEXT: itt ne
; CHECK-NEXT: ldrhne r3, [r2]
; CHECK-NEXT: vmovne.16 q0[0], r3
@ -763,10 +1049,34 @@ define void @foo_trunc_v8i8_v8i16(<8 x i8> *%dest, <8 x i16> *%mask, <8 x i16> *
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrhmi r1, [r2, #14]
; CHECK-NEXT: vmovmi.16 q0[7], r1
; CHECK-NEXT: mov r1, sp
; CHECK-NEXT: vstr p0, [r1]
; CHECK-NEXT: ldrb.w r1, [sp]
; CHECK-NEXT: lsls r2, r1, #31
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: vmrs r1, p0
; CHECK-NEXT: and r3, r1, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r2, r3, #0, #1
; CHECK-NEXT: ubfx r3, r1, #2, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r2, r3, #1, #1
; CHECK-NEXT: ubfx r3, r1, #4, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r2, r3, #2, #1
; CHECK-NEXT: ubfx r3, r1, #6, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r2, r3, #3, #1
; CHECK-NEXT: ubfx r3, r1, #8, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r2, r3, #4, #1
; CHECK-NEXT: ubfx r3, r1, #10, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r2, r3, #5, #1
; CHECK-NEXT: ubfx r3, r1, #12, #1
; CHECK-NEXT: ubfx r1, r1, #14, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r2, r3, #6, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r2, r1, #7, #1
; CHECK-NEXT: uxtb r1, r2
; CHECK-NEXT: lsls r2, r2, #31
; CHECK-NEXT: itt ne
; CHECK-NEXT: vmovne.u16 r2, q0[0]
; CHECK-NEXT: strbne r2, [r0]
@ -815,11 +1125,23 @@ define void @foo_trunc_v4i8_v4i32(<4 x i8> *%dest, <4 x i32> *%mask, <4 x i32> *
; CHECK-NEXT: .pad #8
; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: add r3, sp, #4
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: vcmp.s32 gt, q0, zr
; CHECK-NEXT: @ implicit-def: $q0
; CHECK-NEXT: vstr p0, [r3]
; CHECK-NEXT: ldrb.w r1, [sp, #4]
; CHECK-NEXT: vmrs r12, p0
; CHECK-NEXT: and r1, r12, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #0, #1
; CHECK-NEXT: ubfx r1, r12, #4, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #1, #1
; CHECK-NEXT: ubfx r1, r12, #8, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #2, #1
; CHECK-NEXT: ubfx r1, r12, #12, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #3, #1
; CHECK-NEXT: and r1, r3, #15
; CHECK-NEXT: lsls r3, r1, #31
; CHECK-NEXT: itt ne
; CHECK-NEXT: ldrne r3, [r2]
@ -836,9 +1158,21 @@ define void @foo_trunc_v4i8_v4i32(<4 x i8> *%dest, <4 x i32> *%mask, <4 x i32> *
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrmi r1, [r2, #12]
; CHECK-NEXT: vmovmi.32 q0[3], r1
; CHECK-NEXT: mov r1, sp
; CHECK-NEXT: vstr p0, [r1]
; CHECK-NEXT: ldrb.w r1, [sp]
; CHECK-NEXT: vmrs r2, p0
; CHECK-NEXT: movs r1, #0
; CHECK-NEXT: and r3, r2, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r1, r3, #0, #1
; CHECK-NEXT: ubfx r3, r2, #4, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r1, r3, #1, #1
; CHECK-NEXT: ubfx r3, r2, #8, #1
; CHECK-NEXT: ubfx r2, r2, #12, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r1, r3, #2, #1
; CHECK-NEXT: rsbs r2, r2, #0
; CHECK-NEXT: bfi r1, r2, #3, #1
; CHECK-NEXT: and r1, r1, #15
; CHECK-NEXT: lsls r2, r1, #31
; CHECK-NEXT: itt ne
; CHECK-NEXT: vmovne r2, s0
@ -872,11 +1206,23 @@ define void @foo_trunc_v4i16_v4i32(<4 x i16> *%dest, <4 x i32> *%mask, <4 x i32>
; CHECK-NEXT: .pad #8
; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: add r3, sp, #4
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: vcmp.s32 gt, q0, zr
; CHECK-NEXT: @ implicit-def: $q0
; CHECK-NEXT: vstr p0, [r3]
; CHECK-NEXT: ldrb.w r1, [sp, #4]
; CHECK-NEXT: vmrs r12, p0
; CHECK-NEXT: and r1, r12, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #0, #1
; CHECK-NEXT: ubfx r1, r12, #4, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #1, #1
; CHECK-NEXT: ubfx r1, r12, #8, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #2, #1
; CHECK-NEXT: ubfx r1, r12, #12, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #3, #1
; CHECK-NEXT: and r1, r3, #15
; CHECK-NEXT: lsls r3, r1, #31
; CHECK-NEXT: itt ne
; CHECK-NEXT: ldrne r3, [r2]
@ -893,9 +1239,21 @@ define void @foo_trunc_v4i16_v4i32(<4 x i16> *%dest, <4 x i32> *%mask, <4 x i32>
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrmi r1, [r2, #12]
; CHECK-NEXT: vmovmi.32 q0[3], r1
; CHECK-NEXT: mov r1, sp
; CHECK-NEXT: vstr p0, [r1]
; CHECK-NEXT: ldrb.w r1, [sp]
; CHECK-NEXT: vmrs r2, p0
; CHECK-NEXT: movs r1, #0
; CHECK-NEXT: and r3, r2, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r1, r3, #0, #1
; CHECK-NEXT: ubfx r3, r2, #4, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r1, r3, #1, #1
; CHECK-NEXT: ubfx r3, r2, #8, #1
; CHECK-NEXT: ubfx r2, r2, #12, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r1, r3, #2, #1
; CHECK-NEXT: rsbs r2, r2, #0
; CHECK-NEXT: bfi r1, r2, #3, #1
; CHECK-NEXT: and r1, r1, #15
; CHECK-NEXT: lsls r2, r1, #31
; CHECK-NEXT: itt ne
; CHECK-NEXT: vmovne r2, s0
@ -929,11 +1287,23 @@ define void @foo_v4f32_v4f32(<4 x float> *%dest, <4 x i32> *%mask, <4 x float> *
; CHECK-NEXT: .pad #8
; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: add r3, sp, #4
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: vcmp.s32 gt, q0, zr
; CHECK-NEXT: @ implicit-def: $q0
; CHECK-NEXT: vstr p0, [r3]
; CHECK-NEXT: ldrb.w r1, [sp, #4]
; CHECK-NEXT: vmrs r12, p0
; CHECK-NEXT: and r1, r12, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #0, #1
; CHECK-NEXT: ubfx r1, r12, #4, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #1, #1
; CHECK-NEXT: ubfx r1, r12, #8, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #2, #1
; CHECK-NEXT: ubfx r1, r12, #12, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #3, #1
; CHECK-NEXT: and r1, r3, #15
; CHECK-NEXT: lsls r3, r1, #31
; CHECK-NEXT: it ne
; CHECK-NEXT: vldrne s0, [r2]
@ -946,9 +1316,21 @@ define void @foo_v4f32_v4f32(<4 x float> *%dest, <4 x i32> *%mask, <4 x float> *
; CHECK-NEXT: lsls r1, r1, #28
; CHECK-NEXT: it mi
; CHECK-NEXT: vldrmi s3, [r2, #12]
; CHECK-NEXT: mov r1, sp
; CHECK-NEXT: vstr p0, [r1]
; CHECK-NEXT: ldrb.w r1, [sp]
; CHECK-NEXT: vmrs r2, p0
; CHECK-NEXT: movs r1, #0
; CHECK-NEXT: and r3, r2, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r1, r3, #0, #1
; CHECK-NEXT: ubfx r3, r2, #4, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r1, r3, #1, #1
; CHECK-NEXT: ubfx r3, r2, #8, #1
; CHECK-NEXT: ubfx r2, r2, #12, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r1, r3, #2, #1
; CHECK-NEXT: rsbs r2, r2, #0
; CHECK-NEXT: bfi r1, r2, #3, #1
; CHECK-NEXT: and r1, r1, #15
; CHECK-NEXT: lsls r2, r1, #31
; CHECK-NEXT: it ne
; CHECK-NEXT: vstrne s0, [r0]
@ -977,12 +1359,36 @@ define void @foo_v8f16_v8f16(<8 x half> *%dest, <8 x i16> *%mask, <8 x half> *%s
; CHECK-NEXT: .pad #16
; CHECK-NEXT: sub sp, #16
; CHECK-NEXT: vldrh.u16 q0, [r1]
; CHECK-NEXT: add r3, sp, #8
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: vcmp.s16 gt, q0, zr
; CHECK-NEXT: @ implicit-def: $q0
; CHECK-NEXT: vstr p0, [r3]
; CHECK-NEXT: ldrb.w r1, [sp, #8]
; CHECK-NEXT: lsls r3, r1, #31
; CHECK-NEXT: vmrs r12, p0
; CHECK-NEXT: and r1, r12, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #0, #1
; CHECK-NEXT: ubfx r1, r12, #2, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #1, #1
; CHECK-NEXT: ubfx r1, r12, #4, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #2, #1
; CHECK-NEXT: ubfx r1, r12, #6, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #3, #1
; CHECK-NEXT: ubfx r1, r12, #8, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #4, #1
; CHECK-NEXT: ubfx r1, r12, #10, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #5, #1
; CHECK-NEXT: ubfx r1, r12, #12, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #6, #1
; CHECK-NEXT: ubfx r1, r12, #14, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r3, r1, #7, #1
; CHECK-NEXT: uxtb r1, r3
; CHECK-NEXT: lsls r3, r3, #31
; CHECK-NEXT: bne .LBB13_18
; CHECK-NEXT: @ %bb.1: @ %else
; CHECK-NEXT: lsls r3, r1, #30
@ -1010,10 +1416,34 @@ define void @foo_v8f16_v8f16(<8 x half> *%dest, <8 x i16> *%mask, <8 x half> *%s
; CHECK-NEXT: vmov r1, s4
; CHECK-NEXT: vmov.16 q0[7], r1
; CHECK-NEXT: .LBB13_9: @ %else20
; CHECK-NEXT: mov r1, sp
; CHECK-NEXT: vstr p0, [r1]
; CHECK-NEXT: ldrb.w r1, [sp]
; CHECK-NEXT: lsls r2, r1, #31
; CHECK-NEXT: vmrs r1, p0
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: and r3, r1, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r2, r3, #0, #1
; CHECK-NEXT: ubfx r3, r1, #2, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r2, r3, #1, #1
; CHECK-NEXT: ubfx r3, r1, #4, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r2, r3, #2, #1
; CHECK-NEXT: ubfx r3, r1, #6, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r2, r3, #3, #1
; CHECK-NEXT: ubfx r3, r1, #8, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r2, r3, #4, #1
; CHECK-NEXT: ubfx r3, r1, #10, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r2, r3, #5, #1
; CHECK-NEXT: ubfx r3, r1, #12, #1
; CHECK-NEXT: ubfx r1, r1, #14, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: bfi r2, r3, #6, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: bfi r2, r1, #7, #1
; CHECK-NEXT: uxtb r1, r2
; CHECK-NEXT: lsls r2, r2, #31
; CHECK-NEXT: bne .LBB13_25
; CHECK-NEXT: @ %bb.10: @ %else23
; CHECK-NEXT: lsls r2, r1, #30
@ -1072,13 +1502,13 @@ define void @foo_v8f16_v8f16(<8 x half> *%dest, <8 x i16> *%mask, <8 x half> *%s
; CHECK-NEXT: vmov r3, s4
; CHECK-NEXT: vmov.16 q0[5], r3
; CHECK-NEXT: lsls r3, r1, #25
; CHECK-NEXT: bpl .LBB13_7
; CHECK-NEXT: bpl.w .LBB13_7
; CHECK-NEXT: .LBB13_24: @ %cond.load16
; CHECK-NEXT: vldr.16 s4, [r2, #12]
; CHECK-NEXT: vmov r3, s4
; CHECK-NEXT: vmov.16 q0[6], r3
; CHECK-NEXT: lsls r1, r1, #24
; CHECK-NEXT: bmi .LBB13_8
; CHECK-NEXT: bmi.w .LBB13_8
; CHECK-NEXT: b .LBB13_9
; CHECK-NEXT: .LBB13_25: @ %cond.store
; CHECK-NEXT: vstr.16 s0, [r0]

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,19 +1,55 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK
; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LE
; RUN: llc -mtriple=thumbebv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BE
define arm_aapcs_vfpcc <4 x i32> @bitcast_to_v4i1(i4 %b, <4 x i32> %a) {
; CHECK-LABEL: bitcast_to_v4i1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: and r0, r0, #15
; CHECK-NEXT: strb.w r0, [sp]
; CHECK-NEXT: mov r0, sp
; CHECK-NEXT: vmov.i32 q1, #0x0
; CHECK-NEXT: vldr p0, [r0]
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: bx lr
; CHECK-LE-LABEL: bitcast_to_v4i1:
; CHECK-LE: @ %bb.0: @ %entry
; CHECK-LE-NEXT: .pad #4
; CHECK-LE-NEXT: sub sp, #4
; CHECK-LE-NEXT: and r0, r0, #15
; CHECK-LE-NEXT: vmov.i8 q1, #0x0
; CHECK-LE-NEXT: vmov.i8 q2, #0xff
; CHECK-LE-NEXT: vmsr p0, r0
; CHECK-LE-NEXT: vpsel q1, q2, q1
; CHECK-LE-NEXT: vmov.u8 r0, q1[0]
; CHECK-LE-NEXT: vmov.32 q2[0], r0
; CHECK-LE-NEXT: vmov.u8 r0, q1[1]
; CHECK-LE-NEXT: vmov.32 q2[1], r0
; CHECK-LE-NEXT: vmov.u8 r0, q1[2]
; CHECK-LE-NEXT: vmov.32 q2[2], r0
; CHECK-LE-NEXT: vmov.u8 r0, q1[3]
; CHECK-LE-NEXT: vmov.32 q2[3], r0
; CHECK-LE-NEXT: vmov.i32 q1, #0x0
; CHECK-LE-NEXT: vcmp.i32 ne, q2, zr
; CHECK-LE-NEXT: vpsel q0, q0, q1
; CHECK-LE-NEXT: add sp, #4
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: bitcast_to_v4i1:
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: .pad #4
; CHECK-BE-NEXT: sub sp, #4
; CHECK-BE-NEXT: and r0, r0, #15
; CHECK-BE-NEXT: vmov.i8 q1, #0x0
; CHECK-BE-NEXT: vmov.i8 q2, #0xff
; CHECK-BE-NEXT: vmsr p0, r0
; CHECK-BE-NEXT: vpsel q1, q2, q1
; CHECK-BE-NEXT: vmov.u8 r0, q1[0]
; CHECK-BE-NEXT: vmov.32 q2[0], r0
; CHECK-BE-NEXT: vmov.u8 r0, q1[1]
; CHECK-BE-NEXT: vmov.32 q2[1], r0
; CHECK-BE-NEXT: vmov.u8 r0, q1[2]
; CHECK-BE-NEXT: vmov.32 q2[2], r0
; CHECK-BE-NEXT: vmov.u8 r0, q1[3]
; CHECK-BE-NEXT: vmov.32 q2[3], r0
; CHECK-BE-NEXT: vrev64.32 q1, q0
; CHECK-BE-NEXT: vcmp.i32 ne, q2, zr
; CHECK-BE-NEXT: vmov.i32 q0, #0x0
; CHECK-BE-NEXT: vpsel q1, q1, q0
; CHECK-BE-NEXT: vrev64.32 q0, q1
; CHECK-BE-NEXT: add sp, #4
; CHECK-BE-NEXT: bx lr
entry:
%c = bitcast i4 %b to <4 x i1>
%s = select <4 x i1> %c, <4 x i32> %a, <4 x i32> zeroinitializer
@ -21,17 +57,70 @@ entry:
}
define arm_aapcs_vfpcc <8 x i16> @bitcast_to_v8i1(i8 %b, <8 x i16> %a) {
; CHECK-LABEL: bitcast_to_v8i1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .pad #8
; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: strb.w r0, [sp]
; CHECK-NEXT: mov r0, sp
; CHECK-NEXT: vldr p0, [r0]
; CHECK-NEXT: vmov.i32 q1, #0x0
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: add sp, #8
; CHECK-NEXT: bx lr
; CHECK-LE-LABEL: bitcast_to_v8i1:
; CHECK-LE: @ %bb.0: @ %entry
; CHECK-LE-NEXT: .pad #8
; CHECK-LE-NEXT: sub sp, #8
; CHECK-LE-NEXT: uxtb r0, r0
; CHECK-LE-NEXT: vmov.i8 q1, #0x0
; CHECK-LE-NEXT: vmov.i8 q2, #0xff
; CHECK-LE-NEXT: vmsr p0, r0
; CHECK-LE-NEXT: vpsel q2, q2, q1
; CHECK-LE-NEXT: vmov.u8 r0, q2[0]
; CHECK-LE-NEXT: vmov.16 q1[0], r0
; CHECK-LE-NEXT: vmov.u8 r0, q2[1]
; CHECK-LE-NEXT: vmov.16 q1[1], r0
; CHECK-LE-NEXT: vmov.u8 r0, q2[2]
; CHECK-LE-NEXT: vmov.16 q1[2], r0
; CHECK-LE-NEXT: vmov.u8 r0, q2[3]
; CHECK-LE-NEXT: vmov.16 q1[3], r0
; CHECK-LE-NEXT: vmov.u8 r0, q2[4]
; CHECK-LE-NEXT: vmov.16 q1[4], r0
; CHECK-LE-NEXT: vmov.u8 r0, q2[5]
; CHECK-LE-NEXT: vmov.16 q1[5], r0
; CHECK-LE-NEXT: vmov.u8 r0, q2[6]
; CHECK-LE-NEXT: vmov.16 q1[6], r0
; CHECK-LE-NEXT: vmov.u8 r0, q2[7]
; CHECK-LE-NEXT: vmov.16 q1[7], r0
; CHECK-LE-NEXT: vcmp.i16 ne, q1, zr
; CHECK-LE-NEXT: vmov.i32 q1, #0x0
; CHECK-LE-NEXT: vpsel q0, q0, q1
; CHECK-LE-NEXT: add sp, #8
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: bitcast_to_v8i1:
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: .pad #8
; CHECK-BE-NEXT: sub sp, #8
; CHECK-BE-NEXT: uxtb r0, r0
; CHECK-BE-NEXT: vmov.i8 q1, #0x0
; CHECK-BE-NEXT: vmov.i8 q2, #0xff
; CHECK-BE-NEXT: vmsr p0, r0
; CHECK-BE-NEXT: vpsel q2, q2, q1
; CHECK-BE-NEXT: vmov.u8 r0, q2[0]
; CHECK-BE-NEXT: vmov.16 q1[0], r0
; CHECK-BE-NEXT: vmov.u8 r0, q2[1]
; CHECK-BE-NEXT: vmov.16 q1[1], r0
; CHECK-BE-NEXT: vmov.u8 r0, q2[2]
; CHECK-BE-NEXT: vmov.16 q1[2], r0
; CHECK-BE-NEXT: vmov.u8 r0, q2[3]
; CHECK-BE-NEXT: vmov.16 q1[3], r0
; CHECK-BE-NEXT: vmov.u8 r0, q2[4]
; CHECK-BE-NEXT: vmov.16 q1[4], r0
; CHECK-BE-NEXT: vmov.u8 r0, q2[5]
; CHECK-BE-NEXT: vmov.16 q1[5], r0
; CHECK-BE-NEXT: vmov.u8 r0, q2[6]
; CHECK-BE-NEXT: vmov.16 q1[6], r0
; CHECK-BE-NEXT: vmov.u8 r0, q2[7]
; CHECK-BE-NEXT: vmov.16 q1[7], r0
; CHECK-BE-NEXT: vcmp.i16 ne, q1, zr
; CHECK-BE-NEXT: vrev64.16 q1, q0
; CHECK-BE-NEXT: vmov.i32 q0, #0x0
; CHECK-BE-NEXT: vrev32.16 q0, q0
; CHECK-BE-NEXT: vpsel q1, q1, q0
; CHECK-BE-NEXT: vrev64.16 q0, q1
; CHECK-BE-NEXT: add sp, #8
; CHECK-BE-NEXT: bx lr
entry:
%c = bitcast i8 %b to <8 x i1>
%s = select <8 x i1> %c, <8 x i16> %a, <8 x i16> zeroinitializer
@ -39,25 +128,46 @@ entry:
}
define arm_aapcs_vfpcc <16 x i8> @bitcast_to_v16i1(i16 %b, <16 x i8> %a) {
; CHECK-LABEL: bitcast_to_v16i1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r6, r7, lr}
; CHECK-NEXT: push {r4, r6, r7, lr}
; CHECK-NEXT: .setfp r7, sp, #8
; CHECK-NEXT: add r7, sp, #8
; CHECK-NEXT: .pad #16
; CHECK-NEXT: sub sp, #16
; CHECK-NEXT: mov r4, sp
; CHECK-NEXT: bfc r4, #0, #4
; CHECK-NEXT: mov sp, r4
; CHECK-NEXT: strh.w r0, [sp]
; CHECK-NEXT: mov r0, sp
; CHECK-NEXT: sub.w r4, r7, #8
; CHECK-NEXT: vldr p0, [r0]
; CHECK-NEXT: vmov.i32 q1, #0x0
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: mov sp, r4
; CHECK-NEXT: pop {r4, r6, r7, pc}
; CHECK-LE-LABEL: bitcast_to_v16i1:
; CHECK-LE: @ %bb.0: @ %entry
; CHECK-LE-NEXT: .save {r4, r6, r7, lr}
; CHECK-LE-NEXT: push {r4, r6, r7, lr}
; CHECK-LE-NEXT: .setfp r7, sp, #8
; CHECK-LE-NEXT: add r7, sp, #8
; CHECK-LE-NEXT: .pad #16
; CHECK-LE-NEXT: sub sp, #16
; CHECK-LE-NEXT: mov r4, sp
; CHECK-LE-NEXT: bfc r4, #0, #4
; CHECK-LE-NEXT: mov sp, r4
; CHECK-LE-NEXT: uxth r0, r0
; CHECK-LE-NEXT: sub.w r4, r7, #8
; CHECK-LE-NEXT: vmov.i32 q1, #0x0
; CHECK-LE-NEXT: vmsr p0, r0
; CHECK-LE-NEXT: vpsel q0, q0, q1
; CHECK-LE-NEXT: mov sp, r4
; CHECK-LE-NEXT: pop {r4, r6, r7, pc}
;
; CHECK-BE-LABEL: bitcast_to_v16i1:
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: .save {r4, r6, r7, lr}
; CHECK-BE-NEXT: push {r4, r6, r7, lr}
; CHECK-BE-NEXT: .setfp r7, sp, #8
; CHECK-BE-NEXT: add r7, sp, #8
; CHECK-BE-NEXT: .pad #16
; CHECK-BE-NEXT: sub sp, #16
; CHECK-BE-NEXT: mov r4, sp
; CHECK-BE-NEXT: bfc r4, #0, #4
; CHECK-BE-NEXT: mov sp, r4
; CHECK-BE-NEXT: vrev64.8 q1, q0
; CHECK-BE-NEXT: vmov.i32 q0, #0x0
; CHECK-BE-NEXT: uxth r0, r0
; CHECK-BE-NEXT: sub.w r4, r7, #8
; CHECK-BE-NEXT: vrev32.8 q0, q0
; CHECK-BE-NEXT: vmsr p0, r0
; CHECK-BE-NEXT: vpsel q1, q1, q0
; CHECK-BE-NEXT: vrev64.8 q0, q1
; CHECK-BE-NEXT: mov sp, r4
; CHECK-BE-NEXT: pop {r4, r6, r7, pc}
entry:
%c = bitcast i16 %b to <16 x i1>
%s = select <16 x i1> %c, <16 x i8> %a, <16 x i8> zeroinitializer
@ -65,20 +175,36 @@ entry:
}
define arm_aapcs_vfpcc <2 x i64> @bitcast_to_v2i1(i2 %b, <2 x i64> %a) {
; CHECK-LABEL: bitcast_to_v2i1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: and r0, r0, #3
; CHECK-NEXT: sbfx r1, r0, #0, #1
; CHECK-NEXT: sbfx r0, r0, #1, #1
; CHECK-NEXT: vmov.32 q1[0], r1
; CHECK-NEXT: vmov.32 q1[1], r1
; CHECK-NEXT: vmov.32 q1[2], r0
; CHECK-NEXT: vmov.32 q1[3], r0
; CHECK-NEXT: vand q0, q0, q1
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: bx lr
; CHECK-LE-LABEL: bitcast_to_v2i1:
; CHECK-LE: @ %bb.0: @ %entry
; CHECK-LE-NEXT: .pad #4
; CHECK-LE-NEXT: sub sp, #4
; CHECK-LE-NEXT: and r0, r0, #3
; CHECK-LE-NEXT: sbfx r1, r0, #0, #1
; CHECK-LE-NEXT: sbfx r0, r0, #1, #1
; CHECK-LE-NEXT: vmov.32 q1[0], r1
; CHECK-LE-NEXT: vmov.32 q1[1], r1
; CHECK-LE-NEXT: vmov.32 q1[2], r0
; CHECK-LE-NEXT: vmov.32 q1[3], r0
; CHECK-LE-NEXT: vand q0, q0, q1
; CHECK-LE-NEXT: add sp, #4
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: bitcast_to_v2i1:
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: .pad #4
; CHECK-BE-NEXT: sub sp, #4
; CHECK-BE-NEXT: and r0, r0, #3
; CHECK-BE-NEXT: sbfx r1, r0, #0, #1
; CHECK-BE-NEXT: sbfx r0, r0, #1, #1
; CHECK-BE-NEXT: vmov.32 q1[0], r1
; CHECK-BE-NEXT: vmov.32 q1[1], r1
; CHECK-BE-NEXT: vmov.32 q1[2], r0
; CHECK-BE-NEXT: vmov.32 q1[3], r0
; CHECK-BE-NEXT: vrev64.32 q2, q1
; CHECK-BE-NEXT: vand q0, q0, q2
; CHECK-BE-NEXT: add sp, #4
; CHECK-BE-NEXT: bx lr
entry:
%c = bitcast i2 %b to <2 x i1>
%s = select <2 x i1> %c, <2 x i64> %a, <2 x i64> zeroinitializer
@ -87,16 +213,52 @@ entry:
define arm_aapcs_vfpcc i4 @bitcast_from_v4i1(<4 x i32> %a) {
; CHECK-LABEL: bitcast_from_v4i1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: vcmp.i32 eq, q0, zr
; CHECK-NEXT: mov r0, sp
; CHECK-NEXT: vstr p0, [r0]
; CHECK-NEXT: ldrb.w r0, [sp]
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: bx lr
; CHECK-LE-LABEL: bitcast_from_v4i1:
; CHECK-LE: @ %bb.0: @ %entry
; CHECK-LE-NEXT: .pad #4
; CHECK-LE-NEXT: sub sp, #4
; CHECK-LE-NEXT: vcmp.i32 eq, q0, zr
; CHECK-LE-NEXT: movs r0, #0
; CHECK-LE-NEXT: vmrs r1, p0
; CHECK-LE-NEXT: and r2, r1, #1
; CHECK-LE-NEXT: rsbs r2, r2, #0
; CHECK-LE-NEXT: bfi r0, r2, #0, #1
; CHECK-LE-NEXT: ubfx r2, r1, #4, #1
; CHECK-LE-NEXT: rsbs r2, r2, #0
; CHECK-LE-NEXT: bfi r0, r2, #1, #1
; CHECK-LE-NEXT: ubfx r2, r1, #8, #1
; CHECK-LE-NEXT: ubfx r1, r1, #12, #1
; CHECK-LE-NEXT: rsbs r2, r2, #0
; CHECK-LE-NEXT: bfi r0, r2, #2, #1
; CHECK-LE-NEXT: rsbs r1, r1, #0
; CHECK-LE-NEXT: bfi r0, r1, #3, #1
; CHECK-LE-NEXT: and r0, r0, #15
; CHECK-LE-NEXT: add sp, #4
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: bitcast_from_v4i1:
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: .pad #4
; CHECK-BE-NEXT: sub sp, #4
; CHECK-BE-NEXT: vrev64.32 q1, q0
; CHECK-BE-NEXT: movs r3, #0
; CHECK-BE-NEXT: vcmp.i32 eq, q1, zr
; CHECK-BE-NEXT: vmrs r0, p0
; CHECK-BE-NEXT: and r2, r0, #1
; CHECK-BE-NEXT: ubfx r1, r0, #4, #1
; CHECK-BE-NEXT: rsbs r2, r2, #0
; CHECK-BE-NEXT: rsbs r1, r1, #0
; CHECK-BE-NEXT: bfi r3, r2, #0, #1
; CHECK-BE-NEXT: bfi r3, r1, #1, #1
; CHECK-BE-NEXT: ubfx r1, r0, #8, #1
; CHECK-BE-NEXT: ubfx r0, r0, #12, #1
; CHECK-BE-NEXT: rsbs r1, r1, #0
; CHECK-BE-NEXT: bfi r3, r1, #2, #1
; CHECK-BE-NEXT: rsbs r0, r0, #0
; CHECK-BE-NEXT: bfi r3, r0, #3, #1
; CHECK-BE-NEXT: and r0, r3, #15
; CHECK-BE-NEXT: add sp, #4
; CHECK-BE-NEXT: bx lr
entry:
%c = icmp eq <4 x i32> %a, zeroinitializer
%b = bitcast <4 x i1> %c to i4
@ -104,16 +266,76 @@ entry:
}
define arm_aapcs_vfpcc i8 @bitcast_from_v8i1(<8 x i16> %a) {
; CHECK-LABEL: bitcast_from_v8i1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .pad #8
; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: vcmp.i16 eq, q0, zr
; CHECK-NEXT: mov r0, sp
; CHECK-NEXT: vstr p0, [r0]
; CHECK-NEXT: ldrb.w r0, [sp]
; CHECK-NEXT: add sp, #8
; CHECK-NEXT: bx lr
; CHECK-LE-LABEL: bitcast_from_v8i1:
; CHECK-LE: @ %bb.0: @ %entry
; CHECK-LE-NEXT: .pad #8
; CHECK-LE-NEXT: sub sp, #8
; CHECK-LE-NEXT: vcmp.i16 eq, q0, zr
; CHECK-LE-NEXT: movs r0, #0
; CHECK-LE-NEXT: vmrs r1, p0
; CHECK-LE-NEXT: and r2, r1, #1
; CHECK-LE-NEXT: rsbs r2, r2, #0
; CHECK-LE-NEXT: bfi r0, r2, #0, #1
; CHECK-LE-NEXT: ubfx r2, r1, #2, #1
; CHECK-LE-NEXT: rsbs r2, r2, #0
; CHECK-LE-NEXT: bfi r0, r2, #1, #1
; CHECK-LE-NEXT: ubfx r2, r1, #4, #1
; CHECK-LE-NEXT: rsbs r2, r2, #0
; CHECK-LE-NEXT: bfi r0, r2, #2, #1
; CHECK-LE-NEXT: ubfx r2, r1, #6, #1
; CHECK-LE-NEXT: rsbs r2, r2, #0
; CHECK-LE-NEXT: bfi r0, r2, #3, #1
; CHECK-LE-NEXT: ubfx r2, r1, #8, #1
; CHECK-LE-NEXT: rsbs r2, r2, #0
; CHECK-LE-NEXT: bfi r0, r2, #4, #1
; CHECK-LE-NEXT: ubfx r2, r1, #10, #1
; CHECK-LE-NEXT: rsbs r2, r2, #0
; CHECK-LE-NEXT: bfi r0, r2, #5, #1
; CHECK-LE-NEXT: ubfx r2, r1, #12, #1
; CHECK-LE-NEXT: ubfx r1, r1, #14, #1
; CHECK-LE-NEXT: rsbs r2, r2, #0
; CHECK-LE-NEXT: bfi r0, r2, #6, #1
; CHECK-LE-NEXT: rsbs r1, r1, #0
; CHECK-LE-NEXT: bfi r0, r1, #7, #1
; CHECK-LE-NEXT: uxtb r0, r0
; CHECK-LE-NEXT: add sp, #8
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: bitcast_from_v8i1:
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: .pad #8
; CHECK-BE-NEXT: sub sp, #8
; CHECK-BE-NEXT: vrev64.16 q1, q0
; CHECK-BE-NEXT: vcmp.i16 eq, q1, zr
; CHECK-BE-NEXT: vmrs r1, p0
; CHECK-BE-NEXT: ubfx r0, r1, #2, #1
; CHECK-BE-NEXT: rsbs r2, r0, #0
; CHECK-BE-NEXT: and r0, r1, #1
; CHECK-BE-NEXT: rsbs r3, r0, #0
; CHECK-BE-NEXT: movs r0, #0
; CHECK-BE-NEXT: bfi r0, r3, #0, #1
; CHECK-BE-NEXT: bfi r0, r2, #1, #1
; CHECK-BE-NEXT: ubfx r2, r1, #4, #1
; CHECK-BE-NEXT: rsbs r2, r2, #0
; CHECK-BE-NEXT: bfi r0, r2, #2, #1
; CHECK-BE-NEXT: ubfx r2, r1, #6, #1
; CHECK-BE-NEXT: rsbs r2, r2, #0
; CHECK-BE-NEXT: bfi r0, r2, #3, #1
; CHECK-BE-NEXT: ubfx r2, r1, #8, #1
; CHECK-BE-NEXT: rsbs r2, r2, #0
; CHECK-BE-NEXT: bfi r0, r2, #4, #1
; CHECK-BE-NEXT: ubfx r2, r1, #10, #1
; CHECK-BE-NEXT: rsbs r2, r2, #0
; CHECK-BE-NEXT: bfi r0, r2, #5, #1
; CHECK-BE-NEXT: ubfx r2, r1, #12, #1
; CHECK-BE-NEXT: ubfx r1, r1, #14, #1
; CHECK-BE-NEXT: rsbs r2, r2, #0
; CHECK-BE-NEXT: bfi r0, r2, #6, #1
; CHECK-BE-NEXT: rsbs r1, r1, #0
; CHECK-BE-NEXT: bfi r0, r1, #7, #1
; CHECK-BE-NEXT: uxtb r0, r0
; CHECK-BE-NEXT: add sp, #8
; CHECK-BE-NEXT: bx lr
entry:
%c = icmp eq <8 x i16> %a, zeroinitializer
%b = bitcast <8 x i1> %c to i8
@ -121,24 +343,42 @@ entry:
}
define arm_aapcs_vfpcc i16 @bitcast_from_v16i1(<16 x i8> %a) {
; CHECK-LABEL: bitcast_from_v16i1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r6, r7, lr}
; CHECK-NEXT: push {r4, r6, r7, lr}
; CHECK-NEXT: .setfp r7, sp, #8
; CHECK-NEXT: add r7, sp, #8
; CHECK-NEXT: .pad #16
; CHECK-NEXT: sub sp, #16
; CHECK-NEXT: mov r4, sp
; CHECK-NEXT: bfc r4, #0, #4
; CHECK-NEXT: mov sp, r4
; CHECK-NEXT: sub.w r4, r7, #8
; CHECK-NEXT: vcmp.i8 eq, q0, zr
; CHECK-NEXT: mov r0, sp
; CHECK-NEXT: vstr p0, [r0]
; CHECK-NEXT: ldrh.w r0, [sp]
; CHECK-NEXT: mov sp, r4
; CHECK-NEXT: pop {r4, r6, r7, pc}
; CHECK-LE-LABEL: bitcast_from_v16i1:
; CHECK-LE: @ %bb.0: @ %entry
; CHECK-LE-NEXT: .save {r4, r6, r7, lr}
; CHECK-LE-NEXT: push {r4, r6, r7, lr}
; CHECK-LE-NEXT: .setfp r7, sp, #8
; CHECK-LE-NEXT: add r7, sp, #8
; CHECK-LE-NEXT: .pad #16
; CHECK-LE-NEXT: sub sp, #16
; CHECK-LE-NEXT: mov r4, sp
; CHECK-LE-NEXT: bfc r4, #0, #4
; CHECK-LE-NEXT: mov sp, r4
; CHECK-LE-NEXT: vcmp.i8 eq, q0, zr
; CHECK-LE-NEXT: sub.w r4, r7, #8
; CHECK-LE-NEXT: vmrs r0, p0
; CHECK-LE-NEXT: uxth r0, r0
; CHECK-LE-NEXT: mov sp, r4
; CHECK-LE-NEXT: pop {r4, r6, r7, pc}
;
; CHECK-BE-LABEL: bitcast_from_v16i1:
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: .save {r4, r6, r7, lr}
; CHECK-BE-NEXT: push {r4, r6, r7, lr}
; CHECK-BE-NEXT: .setfp r7, sp, #8
; CHECK-BE-NEXT: add r7, sp, #8
; CHECK-BE-NEXT: .pad #16
; CHECK-BE-NEXT: sub sp, #16
; CHECK-BE-NEXT: mov r4, sp
; CHECK-BE-NEXT: bfc r4, #0, #4
; CHECK-BE-NEXT: mov sp, r4
; CHECK-BE-NEXT: vrev64.8 q1, q0
; CHECK-BE-NEXT: sub.w r4, r7, #8
; CHECK-BE-NEXT: vcmp.i8 eq, q1, zr
; CHECK-BE-NEXT: vmrs r0, p0
; CHECK-BE-NEXT: uxth r0, r0
; CHECK-BE-NEXT: mov sp, r4
; CHECK-BE-NEXT: pop {r4, r6, r7, pc}
entry:
%c = icmp eq <16 x i8> %a, zeroinitializer
%b = bitcast <16 x i1> %c to i16
@ -146,25 +386,46 @@ entry:
}
define arm_aapcs_vfpcc i2 @bitcast_from_v2i1(<2 x i64> %a) {
; CHECK-LABEL: bitcast_from_v2i1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: vmov r0, s1
; CHECK-NEXT: vmov r1, s0
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: orrs r0, r1
; CHECK-NEXT: vmov r1, s3
; CHECK-NEXT: cset r0, eq
; CHECK-NEXT: orrs r1, r2
; CHECK-NEXT: cset r1, eq
; CHECK-NEXT: ands r1, r1, #1
; CHECK-NEXT: it ne
; CHECK-NEXT: mvnne r1, #1
; CHECK-NEXT: bfi r1, r0, #0, #1
; CHECK-NEXT: and r0, r1, #3
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: bx lr
; CHECK-LE-LABEL: bitcast_from_v2i1:
; CHECK-LE: @ %bb.0: @ %entry
; CHECK-LE-NEXT: .pad #4
; CHECK-LE-NEXT: sub sp, #4
; CHECK-LE-NEXT: vmov r0, s1
; CHECK-LE-NEXT: vmov r1, s0
; CHECK-LE-NEXT: vmov r2, s2
; CHECK-LE-NEXT: orrs r0, r1
; CHECK-LE-NEXT: vmov r1, s3
; CHECK-LE-NEXT: cset r0, eq
; CHECK-LE-NEXT: orrs r1, r2
; CHECK-LE-NEXT: cset r1, eq
; CHECK-LE-NEXT: ands r1, r1, #1
; CHECK-LE-NEXT: it ne
; CHECK-LE-NEXT: mvnne r1, #1
; CHECK-LE-NEXT: bfi r1, r0, #0, #1
; CHECK-LE-NEXT: and r0, r1, #3
; CHECK-LE-NEXT: add sp, #4
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: bitcast_from_v2i1:
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: .pad #4
; CHECK-BE-NEXT: sub sp, #4
; CHECK-BE-NEXT: vrev64.32 q1, q0
; CHECK-BE-NEXT: vmov r0, s6
; CHECK-BE-NEXT: vmov r1, s7
; CHECK-BE-NEXT: vmov r2, s5
; CHECK-BE-NEXT: orrs r0, r1
; CHECK-BE-NEXT: vmov r1, s4
; CHECK-BE-NEXT: cset r0, eq
; CHECK-BE-NEXT: orrs r1, r2
; CHECK-BE-NEXT: cset r1, eq
; CHECK-BE-NEXT: ands r1, r1, #1
; CHECK-BE-NEXT: it ne
; CHECK-BE-NEXT: mvnne r1, #1
; CHECK-BE-NEXT: bfi r1, r0, #0, #1
; CHECK-BE-NEXT: and r0, r1, #3
; CHECK-BE-NEXT: add sp, #4
; CHECK-BE-NEXT: bx lr
entry:
%c = icmp eq <2 x i64> %a, zeroinitializer
%b = bitcast <2 x i1> %c to i2

View File

@ -5,15 +5,41 @@
define arm_aapcs_vfpcc <4 x i32> @load_v4i1(<4 x i1> *%src, <4 x i32> %a) {
; CHECK-LE-LABEL: load_v4i1:
; CHECK-LE: @ %bb.0: @ %entry
; CHECK-LE-NEXT: vldr p0, [r0]
; CHECK-LE-NEXT: ldrb r0, [r0]
; CHECK-LE-NEXT: vmov.i8 q1, #0x0
; CHECK-LE-NEXT: vmov.i8 q2, #0xff
; CHECK-LE-NEXT: vmsr p0, r0
; CHECK-LE-NEXT: vpsel q1, q2, q1
; CHECK-LE-NEXT: vmov.u8 r0, q1[0]
; CHECK-LE-NEXT: vmov.32 q2[0], r0
; CHECK-LE-NEXT: vmov.u8 r0, q1[1]
; CHECK-LE-NEXT: vmov.32 q2[1], r0
; CHECK-LE-NEXT: vmov.u8 r0, q1[2]
; CHECK-LE-NEXT: vmov.32 q2[2], r0
; CHECK-LE-NEXT: vmov.u8 r0, q1[3]
; CHECK-LE-NEXT: vmov.32 q2[3], r0
; CHECK-LE-NEXT: vmov.i32 q1, #0x0
; CHECK-LE-NEXT: vcmp.i32 ne, q2, zr
; CHECK-LE-NEXT: vpsel q0, q0, q1
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: load_v4i1:
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: vldr p0, [r0]
; CHECK-BE-NEXT: ldrb r0, [r0]
; CHECK-BE-NEXT: vmov.i8 q1, #0x0
; CHECK-BE-NEXT: vmov.i8 q2, #0xff
; CHECK-BE-NEXT: vmsr p0, r0
; CHECK-BE-NEXT: vpsel q1, q2, q1
; CHECK-BE-NEXT: vmov.u8 r0, q1[0]
; CHECK-BE-NEXT: vmov.32 q2[0], r0
; CHECK-BE-NEXT: vmov.u8 r0, q1[1]
; CHECK-BE-NEXT: vmov.32 q2[1], r0
; CHECK-BE-NEXT: vmov.u8 r0, q1[2]
; CHECK-BE-NEXT: vmov.32 q2[2], r0
; CHECK-BE-NEXT: vmov.u8 r0, q1[3]
; CHECK-BE-NEXT: vmov.32 q2[3], r0
; CHECK-BE-NEXT: vrev64.32 q1, q0
; CHECK-BE-NEXT: vcmp.i32 ne, q2, zr
; CHECK-BE-NEXT: vmov.i32 q0, #0x0
; CHECK-BE-NEXT: vpsel q1, q1, q0
; CHECK-BE-NEXT: vrev64.32 q0, q1
@ -27,16 +53,58 @@ entry:
define arm_aapcs_vfpcc <8 x i16> @load_v8i1(<8 x i1> *%src, <8 x i16> %a) {
; CHECK-LE-LABEL: load_v8i1:
; CHECK-LE: @ %bb.0: @ %entry
; CHECK-LE-NEXT: vldr p0, [r0]
; CHECK-LE-NEXT: ldrb r0, [r0]
; CHECK-LE-NEXT: vmov.i8 q1, #0x0
; CHECK-LE-NEXT: vmov.i8 q2, #0xff
; CHECK-LE-NEXT: vmsr p0, r0
; CHECK-LE-NEXT: vpsel q2, q2, q1
; CHECK-LE-NEXT: vmov.u8 r0, q2[0]
; CHECK-LE-NEXT: vmov.16 q1[0], r0
; CHECK-LE-NEXT: vmov.u8 r0, q2[1]
; CHECK-LE-NEXT: vmov.16 q1[1], r0
; CHECK-LE-NEXT: vmov.u8 r0, q2[2]
; CHECK-LE-NEXT: vmov.16 q1[2], r0
; CHECK-LE-NEXT: vmov.u8 r0, q2[3]
; CHECK-LE-NEXT: vmov.16 q1[3], r0
; CHECK-LE-NEXT: vmov.u8 r0, q2[4]
; CHECK-LE-NEXT: vmov.16 q1[4], r0
; CHECK-LE-NEXT: vmov.u8 r0, q2[5]
; CHECK-LE-NEXT: vmov.16 q1[5], r0
; CHECK-LE-NEXT: vmov.u8 r0, q2[6]
; CHECK-LE-NEXT: vmov.16 q1[6], r0
; CHECK-LE-NEXT: vmov.u8 r0, q2[7]
; CHECK-LE-NEXT: vmov.16 q1[7], r0
; CHECK-LE-NEXT: vcmp.i16 ne, q1, zr
; CHECK-LE-NEXT: vmov.i32 q1, #0x0
; CHECK-LE-NEXT: vpsel q0, q0, q1
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: load_v8i1:
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: ldrb r0, [r0]
; CHECK-BE-NEXT: vmov.i8 q1, #0x0
; CHECK-BE-NEXT: vmov.i8 q2, #0xff
; CHECK-BE-NEXT: vmsr p0, r0
; CHECK-BE-NEXT: vpsel q2, q2, q1
; CHECK-BE-NEXT: vmov.u8 r0, q2[0]
; CHECK-BE-NEXT: vmov.16 q1[0], r0
; CHECK-BE-NEXT: vmov.u8 r0, q2[1]
; CHECK-BE-NEXT: vmov.16 q1[1], r0
; CHECK-BE-NEXT: vmov.u8 r0, q2[2]
; CHECK-BE-NEXT: vmov.16 q1[2], r0
; CHECK-BE-NEXT: vmov.u8 r0, q2[3]
; CHECK-BE-NEXT: vmov.16 q1[3], r0
; CHECK-BE-NEXT: vmov.u8 r0, q2[4]
; CHECK-BE-NEXT: vmov.16 q1[4], r0
; CHECK-BE-NEXT: vmov.u8 r0, q2[5]
; CHECK-BE-NEXT: vmov.16 q1[5], r0
; CHECK-BE-NEXT: vmov.u8 r0, q2[6]
; CHECK-BE-NEXT: vmov.16 q1[6], r0
; CHECK-BE-NEXT: vmov.u8 r0, q2[7]
; CHECK-BE-NEXT: vmov.16 q1[7], r0
; CHECK-BE-NEXT: vcmp.i16 ne, q1, zr
; CHECK-BE-NEXT: vrev64.16 q1, q0
; CHECK-BE-NEXT: vmov.i32 q0, #0x0
; CHECK-BE-NEXT: vldr p0, [r0]
; CHECK-BE-NEXT: vrev32.16 q0, q0
; CHECK-BE-NEXT: vpsel q1, q1, q0
; CHECK-BE-NEXT: vrev64.16 q0, q1
@ -50,17 +118,19 @@ entry:
define arm_aapcs_vfpcc <16 x i8> @load_v16i1(<16 x i1> *%src, <16 x i8> %a) {
; CHECK-LE-LABEL: load_v16i1:
; CHECK-LE: @ %bb.0: @ %entry
; CHECK-LE-NEXT: vldr p0, [r0]
; CHECK-LE-NEXT: ldrh r0, [r0]
; CHECK-LE-NEXT: vmov.i32 q1, #0x0
; CHECK-LE-NEXT: vmsr p0, r0
; CHECK-LE-NEXT: vpsel q0, q0, q1
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: load_v16i1:
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: ldrh r0, [r0]
; CHECK-BE-NEXT: vrev64.8 q1, q0
; CHECK-BE-NEXT: vmov.i32 q0, #0x0
; CHECK-BE-NEXT: vldr p0, [r0]
; CHECK-BE-NEXT: vrev32.8 q0, q0
; CHECK-BE-NEXT: vmsr p0, r0
; CHECK-BE-NEXT: vpsel q1, q1, q0
; CHECK-BE-NEXT: vrev64.8 q0, q1
; CHECK-BE-NEXT: bx lr
@ -106,14 +176,44 @@ define arm_aapcs_vfpcc void @store_v4i1(<4 x i1> *%dst, <4 x i32> %a) {
; CHECK-LE-LABEL: store_v4i1:
; CHECK-LE: @ %bb.0: @ %entry
; CHECK-LE-NEXT: vcmp.i32 eq, q0, zr
; CHECK-LE-NEXT: vstr p0, [r0]
; CHECK-LE-NEXT: movs r1, #0
; CHECK-LE-NEXT: vmrs r2, p0
; CHECK-LE-NEXT: and r3, r2, #1
; CHECK-LE-NEXT: rsbs r3, r3, #0
; CHECK-LE-NEXT: bfi r1, r3, #0, #1
; CHECK-LE-NEXT: ubfx r3, r2, #4, #1
; CHECK-LE-NEXT: rsbs r3, r3, #0
; CHECK-LE-NEXT: bfi r1, r3, #1, #1
; CHECK-LE-NEXT: ubfx r3, r2, #8, #1
; CHECK-LE-NEXT: ubfx r2, r2, #12, #1
; CHECK-LE-NEXT: rsbs r3, r3, #0
; CHECK-LE-NEXT: bfi r1, r3, #2, #1
; CHECK-LE-NEXT: rsbs r2, r2, #0
; CHECK-LE-NEXT: bfi r1, r2, #3, #1
; CHECK-LE-NEXT: and r1, r1, #15
; CHECK-LE-NEXT: strb r1, [r0]
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: store_v4i1:
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: vrev64.32 q1, q0
; CHECK-BE-NEXT: vcmp.i32 eq, q1, zr
; CHECK-BE-NEXT: vstr p0, [r0]
; CHECK-BE-NEXT: vmrs r1, p0
; CHECK-BE-NEXT: and r3, r1, #1
; CHECK-BE-NEXT: ubfx r2, r1, #4, #1
; CHECK-BE-NEXT: rsbs r3, r3, #0
; CHECK-BE-NEXT: rsb.w r12, r2, #0
; CHECK-BE-NEXT: movs r2, #0
; CHECK-BE-NEXT: bfi r2, r3, #0, #1
; CHECK-BE-NEXT: ubfx r3, r1, #8, #1
; CHECK-BE-NEXT: ubfx r1, r1, #12, #1
; CHECK-BE-NEXT: bfi r2, r12, #1, #1
; CHECK-BE-NEXT: rsbs r3, r3, #0
; CHECK-BE-NEXT: rsbs r1, r1, #0
; CHECK-BE-NEXT: bfi r2, r3, #2, #1
; CHECK-BE-NEXT: bfi r2, r1, #3, #1
; CHECK-BE-NEXT: and r1, r2, #15
; CHECK-BE-NEXT: strb r1, [r0]
; CHECK-BE-NEXT: bx lr
entry:
%c = icmp eq <4 x i32> %a, zeroinitializer
@ -125,14 +225,66 @@ define arm_aapcs_vfpcc void @store_v8i1(<8 x i1> *%dst, <8 x i16> %a) {
; CHECK-LE-LABEL: store_v8i1:
; CHECK-LE: @ %bb.0: @ %entry
; CHECK-LE-NEXT: vcmp.i16 eq, q0, zr
; CHECK-LE-NEXT: vstr p0, [r0]
; CHECK-LE-NEXT: movs r1, #0
; CHECK-LE-NEXT: vmrs r2, p0
; CHECK-LE-NEXT: and r3, r2, #1
; CHECK-LE-NEXT: rsbs r3, r3, #0
; CHECK-LE-NEXT: bfi r1, r3, #0, #1
; CHECK-LE-NEXT: ubfx r3, r2, #2, #1
; CHECK-LE-NEXT: rsbs r3, r3, #0
; CHECK-LE-NEXT: bfi r1, r3, #1, #1
; CHECK-LE-NEXT: ubfx r3, r2, #4, #1
; CHECK-LE-NEXT: rsbs r3, r3, #0
; CHECK-LE-NEXT: bfi r1, r3, #2, #1
; CHECK-LE-NEXT: ubfx r3, r2, #6, #1
; CHECK-LE-NEXT: rsbs r3, r3, #0
; CHECK-LE-NEXT: bfi r1, r3, #3, #1
; CHECK-LE-NEXT: ubfx r3, r2, #8, #1
; CHECK-LE-NEXT: rsbs r3, r3, #0
; CHECK-LE-NEXT: bfi r1, r3, #4, #1
; CHECK-LE-NEXT: ubfx r3, r2, #10, #1
; CHECK-LE-NEXT: rsbs r3, r3, #0
; CHECK-LE-NEXT: bfi r1, r3, #5, #1
; CHECK-LE-NEXT: ubfx r3, r2, #12, #1
; CHECK-LE-NEXT: ubfx r2, r2, #14, #1
; CHECK-LE-NEXT: rsbs r3, r3, #0
; CHECK-LE-NEXT: bfi r1, r3, #6, #1
; CHECK-LE-NEXT: rsbs r2, r2, #0
; CHECK-LE-NEXT: bfi r1, r2, #7, #1
; CHECK-LE-NEXT: strb r1, [r0]
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: store_v8i1:
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: vrev64.16 q1, q0
; CHECK-BE-NEXT: vcmp.i16 eq, q1, zr
; CHECK-BE-NEXT: vstr p0, [r0]
; CHECK-BE-NEXT: vmrs r2, p0
; CHECK-BE-NEXT: ubfx r1, r2, #2, #1
; CHECK-BE-NEXT: rsb.w r12, r1, #0
; CHECK-BE-NEXT: and r1, r2, #1
; CHECK-BE-NEXT: rsbs r3, r1, #0
; CHECK-BE-NEXT: movs r1, #0
; CHECK-BE-NEXT: bfi r1, r3, #0, #1
; CHECK-BE-NEXT: ubfx r3, r2, #4, #1
; CHECK-BE-NEXT: bfi r1, r12, #1, #1
; CHECK-BE-NEXT: rsbs r3, r3, #0
; CHECK-BE-NEXT: bfi r1, r3, #2, #1
; CHECK-BE-NEXT: ubfx r3, r2, #6, #1
; CHECK-BE-NEXT: rsbs r3, r3, #0
; CHECK-BE-NEXT: bfi r1, r3, #3, #1
; CHECK-BE-NEXT: ubfx r3, r2, #8, #1
; CHECK-BE-NEXT: rsbs r3, r3, #0
; CHECK-BE-NEXT: bfi r1, r3, #4, #1
; CHECK-BE-NEXT: ubfx r3, r2, #10, #1
; CHECK-BE-NEXT: rsbs r3, r3, #0
; CHECK-BE-NEXT: bfi r1, r3, #5, #1
; CHECK-BE-NEXT: ubfx r3, r2, #12, #1
; CHECK-BE-NEXT: ubfx r2, r2, #14, #1
; CHECK-BE-NEXT: rsbs r3, r3, #0
; CHECK-BE-NEXT: bfi r1, r3, #6, #1
; CHECK-BE-NEXT: rsbs r2, r2, #0
; CHECK-BE-NEXT: bfi r1, r2, #7, #1
; CHECK-BE-NEXT: strb r1, [r0]
; CHECK-BE-NEXT: bx lr
entry:
%c = icmp eq <8 x i16> %a, zeroinitializer
@ -144,14 +296,16 @@ define arm_aapcs_vfpcc void @store_v16i1(<16 x i1> *%dst, <16 x i8> %a) {
; CHECK-LE-LABEL: store_v16i1:
; CHECK-LE: @ %bb.0: @ %entry
; CHECK-LE-NEXT: vcmp.i8 eq, q0, zr
; CHECK-LE-NEXT: vstr p0, [r0]
; CHECK-LE-NEXT: vmrs r1, p0
; CHECK-LE-NEXT: strh r1, [r0]
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: store_v16i1:
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: vrev64.8 q1, q0
; CHECK-BE-NEXT: vcmp.i8 eq, q1, zr
; CHECK-BE-NEXT: vstr p0, [r0]
; CHECK-BE-NEXT: vmrs r1, p0
; CHECK-BE-NEXT: strh r1, [r0]
; CHECK-BE-NEXT: bx lr
entry:
%c = icmp eq <16 x i8> %a, zeroinitializer