forked from OSchip/llvm-project
[ARM] MVE VMULL patterns
This adds MVE vmull patterns, which are conceptually the same as mul(vmovl, vmovl), and so the tablegen patterns follow the same structure. For i8 and i16 this is simple enough, but in the i32 version the multiply (in 64 bits) is illegal, meaning we need to catch the pattern earlier in a dag fold. Because bitcasts are involved in the zext versions and the patterns are a little different in little and big endian, I have only added little endian support in this patch. Differential Revision: https://reviews.llvm.org/D76740
This commit is contained in:
parent
c697dd9ffd
commit
fbd53ffc3a
|
@ -12125,18 +12125,86 @@ static SDValue PerformVMULCombine(SDNode *N,
|
|||
DAG.getNode(ISD::MUL, DL, VT, N01, N1));
|
||||
}
|
||||
|
||||
/// Try to rewrite a v2i64 multiply whose operands are both in-register
/// extensions of 32-bit elements into an MVE long multiply
/// (ARMISD::VMULLs for sign extension, ARMISD::VMULLu for zero extension)
/// operating on v4i32 inputs.
///
/// \param N         The multiply node being combined.
/// \param DAG       The selection DAG used to create replacement nodes.
/// \param Subtarget Queried for endianness; the zero-extend form is only
///                  matched on little-endian targets.
/// \returns the replacement node, or an empty SDValue if nothing matched.
static SDValue PerformMVEVMULLCombine(SDNode *N, SelectionDAG &DAG,
                                      const ARMSubtarget *Subtarget) {
  EVT ResultVT = N->getValueType(0);
  if (ResultVT != MVT::v2i64)
    return SDValue();

  // If Op is a SIGN_EXTEND_INREG from 32-bit elements, yield the value being
  // extended; otherwise yield an empty SDValue.
  auto StripSignExt = [](SDValue Op) -> SDValue {
    if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
      EVT FromVT = cast<VTSDNode>(Op->getOperand(1))->getVT();
      if (FromVT.getScalarSizeInBits() == 32)
        return Op->getOperand(0);
    }
    return SDValue();
  };

  // Zero extends are more awkward: at this point they appear as an AND with a
  // v4i32 (-1, 0, -1, 0) build-vector mask, and either the AND or the mask may
  // sit behind a bitcast. Because bitcasts have to be looked through, this is
  // only attempted on little-endian targets. Yields the unmasked value on a
  // match and an empty SDValue otherwise.
  auto StripZeroExt = [Subtarget](SDValue Op) -> SDValue {
    if (!Subtarget->isLittle())
      return SDValue();

    // Step over an optional bitcast to reach the AND.
    SDValue AndNode = Op;
    if (AndNode->getOpcode() == ISD::BITCAST)
      AndNode = AndNode->getOperand(0);
    if (AndNode->getOpcode() != ISD::AND)
      return SDValue();

    // Step over an optional bitcast to reach the mask itself.
    SDValue MaskVec = AndNode->getOperand(1);
    if (MaskVec->getOpcode() == ISD::BITCAST)
      MaskVec = MaskVec->getOperand(0);

    bool IsV4I32BuildVector = MaskVec->getOpcode() == ISD::BUILD_VECTOR &&
                              MaskVec.getValueType() == MVT::v4i32;
    if (!IsV4I32BuildVector)
      return SDValue();

    // Lanes 0 and 2 must be all-ones, lanes 1 and 3 must be zero.
    bool KeepsEvenLanesOnly = isAllOnesConstant(MaskVec->getOperand(0)) &&
                              isNullConstant(MaskVec->getOperand(1)) &&
                              isAllOnesConstant(MaskVec->getOperand(2)) &&
                              isNullConstant(MaskVec->getOperand(3));
    if (!KeepsEvenLanesOnly)
      return SDValue();
    return AndNode->getOperand(0);
  };

  SDLoc DL(N);

  // Reinterpret both stripped operands as v4i32 and emit the long multiply.
  auto BuildVMULL = [&](unsigned Opcode, SDValue LHS, SDValue RHS) -> SDValue {
    SDValue LHSCast =
        DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v4i32, LHS);
    SDValue RHSCast =
        DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v4i32, RHS);
    return DAG.getNode(Opcode, DL, ResultVT, LHSCast, RHSCast);
  };

  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  // Both operands must use the same kind of extension.
  if (SDValue SExtLHS = StripSignExt(LHS))
    if (SDValue SExtRHS = StripSignExt(RHS))
      return BuildVMULL(ARMISD::VMULLs, SExtLHS, SExtRHS);

  if (SDValue ZExtLHS = StripZeroExt(LHS))
    if (SDValue ZExtRHS = StripZeroExt(RHS))
      return BuildVMULL(ARMISD::VMULLu, ZExtLHS, ZExtRHS);

  return SDValue();
}
|
||||
|
||||
static SDValue PerformMULCombine(SDNode *N,
|
||||
TargetLowering::DAGCombinerInfo &DCI,
|
||||
const ARMSubtarget *Subtarget) {
|
||||
SelectionDAG &DAG = DCI.DAG;
|
||||
|
||||
EVT VT = N->getValueType(0);
|
||||
if (Subtarget->hasMVEIntegerOps() && VT == MVT::v2i64)
|
||||
return PerformMVEVMULLCombine(N, DAG, Subtarget);
|
||||
|
||||
if (Subtarget->isThumb1Only())
|
||||
return SDValue();
|
||||
|
||||
if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
|
||||
return SDValue();
|
||||
|
||||
EVT VT = N->getValueType(0);
|
||||
if (VT.is64BitVector() || VT.is128BitVector())
|
||||
return PerformVMULCombine(N, DCI, Subtarget);
|
||||
if (VT != MVT::i32)
|
||||
|
|
|
@ -289,6 +289,11 @@ def ARMvshruImm : SDNode<"ARMISD::VSHRuIMM", SDTARMVSHIMM>;
|
|||
def ARMvshls : SDNode<"ARMISD::VSHLs", SDTARMVSH>;
|
||||
def ARMvshlu : SDNode<"ARMISD::VSHLu", SDTARMVSH>;
|
||||
|
||||
// Type profile for the long-multiply nodes: one integer result and two
// integer operands that share the same type.
def SDTARMVMULL : SDTypeProfile<1, 2, [SDTCisInt<0>,
                                       SDTCisInt<1>,
                                       SDTCisSameAs<1, 2>]>;

// Signed and unsigned long multiply nodes.
def ARMvmulls : SDNode<"ARMISD::VMULLs", SDTARMVMULL>;
def ARMvmullu : SDNode<"ARMISD::VMULLu", SDTARMVMULL>;
|
||||
|
||||
def SDTARMVCMP : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>,
|
||||
SDTCisInt<3>]>;
|
||||
def SDTARMVCMPZ : SDTypeProfile<1, 2, [SDTCisInt<2>]>;
|
||||
|
|
|
@ -2558,22 +2558,23 @@ let Predicates = [HasMVEInt] in {
|
|||
def : Pat<(sext_inreg (v4i32 MQPR:$src), v4i8),
|
||||
(MVE_VMOVLs16bh (MVE_VMOVLs8bh MQPR:$src))>;
|
||||
|
||||
def : Pat<(sext_inreg (v8i16 (ARMVectorRegCast (ARMvrev16 (v16i8 MQPR:$src)))),
|
||||
v8i8), (MVE_VMOVLs8th MQPR:$src)>;
|
||||
def : Pat<(sext_inreg (v4i32 (ARMVectorRegCast (ARMvrev32 (v8i16 MQPR:$src)))),
|
||||
v4i16), (MVE_VMOVLs16th MQPR:$src)>;
|
||||
def : Pat<(ARMvbicImm (v8i16 (ARMVectorRegCast (ARMvrev16 (v16i8 MQPR:$src)))),
|
||||
(i32 0xAFF)), (MVE_VMOVLu8th MQPR:$src)>;
|
||||
def : Pat<(and (v4i32 (ARMVectorRegCast (ARMvrev32 (v8i16 MQPR:$src)))),
|
||||
(v4i32 (ARMvmovImm (i32 0xCFF)))),
|
||||
(MVE_VMOVLu16th MQPR:$src)>;
|
||||
def : Pat<(sext_inreg (v8i16 (ARMVectorRegCast (ARMvrev16 (v16i8 MQPR:$src)))), v8i8),
|
||||
(MVE_VMOVLs8th MQPR:$src)>;
|
||||
def : Pat<(sext_inreg (v4i32 (ARMVectorRegCast (ARMvrev32 (v8i16 MQPR:$src)))), v4i16),
|
||||
(MVE_VMOVLs16th MQPR:$src)>;
|
||||
|
||||
// zext_inreg 16 -> 32
|
||||
def : Pat<(and (v4i32 MQPR:$src), (v4i32 (ARMvmovImm (i32 0xCFF)))),
|
||||
(MVE_VMOVLu16bh MQPR:$src)>;
|
||||
// zext_inreg 8 -> 16
|
||||
def : Pat<(ARMvbicImm (v8i16 MQPR:$src), (i32 0xAFF)),
|
||||
(MVE_VMOVLu8bh MQPR:$src)>;
|
||||
// zext_inreg 16 -> 32
|
||||
def : Pat<(and (v4i32 MQPR:$src), (v4i32 (ARMvmovImm (i32 0xCFF)))),
|
||||
(MVE_VMOVLu16bh MQPR:$src)>;
|
||||
// Same zext_inreg with vrevs, picking the top half
|
||||
def : Pat<(ARMvbicImm (v8i16 (ARMVectorRegCast (ARMvrev16 (v16i8 MQPR:$src)))), (i32 0xAFF)),
|
||||
(MVE_VMOVLu8th MQPR:$src)>;
|
||||
def : Pat<(and (v4i32 (ARMVectorRegCast (ARMvrev32 (v8i16 MQPR:$src)))),
|
||||
(v4i32 (ARMvmovImm (i32 0xCFF)))),
|
||||
(MVE_VMOVLu16th MQPR:$src)>;
|
||||
}
|
||||
|
||||
|
||||
|
@ -4443,6 +4444,50 @@ defm MVE_VMULLBp16 : MVE_VMULL_m<MVE_v8p16, int_arm_mve_vmull_poly,
|
|||
defm MVE_VMULLTp16 : MVE_VMULL_m<MVE_v8p16, int_arm_mve_vmull_poly,
|
||||
int_arm_mve_mull_poly_predicated, 0b1>;
|
||||
|
||||
// Selection patterns for the MVE long multiplies. Each element size has a
// "bottom" form (VMULLB) matched on the plain inputs and a "top" form
// (VMULLT) matched when both inputs come through the corresponding vrev.
// The i32 -> i64 forms are matched through the ARMvmulls/ARMvmullu nodes;
// the narrower forms are matched as a mul of two in-register extensions.
let Predicates = [HasMVEInt] in {
  // ---- Signed ----

  // 32 -> 64
  def : Pat<(v2i64 (ARMvmulls (v4i32 MQPR:$src1), (v4i32 MQPR:$src2))),
            (MVE_VMULLBs32 MQPR:$src1, MQPR:$src2)>;
  def : Pat<(v2i64 (ARMvmulls (v4i32 (ARMvrev64 (v4i32 MQPR:$src1))),
                              (v4i32 (ARMvrev64 (v4i32 MQPR:$src2))))),
            (MVE_VMULLTs32 MQPR:$src1, MQPR:$src2)>;

  // 16 -> 32
  def : Pat<(mul (sext_inreg (v4i32 MQPR:$src1), v4i16),
                 (sext_inreg (v4i32 MQPR:$src2), v4i16)),
            (MVE_VMULLBs16 MQPR:$src1, MQPR:$src2)>;
  def : Pat<(mul (sext_inreg (v4i32 (ARMVectorRegCast (ARMvrev32 (v8i16 MQPR:$src1)))), v4i16),
                 (sext_inreg (v4i32 (ARMVectorRegCast (ARMvrev32 (v8i16 MQPR:$src2)))), v4i16)),
            (MVE_VMULLTs16 MQPR:$src1, MQPR:$src2)>;

  // 8 -> 16
  def : Pat<(mul (sext_inreg (v8i16 MQPR:$src1), v8i8),
                 (sext_inreg (v8i16 MQPR:$src2), v8i8)),
            (MVE_VMULLBs8 MQPR:$src1, MQPR:$src2)>;
  def : Pat<(mul (sext_inreg (v8i16 (ARMVectorRegCast (ARMvrev16 (v16i8 MQPR:$src1)))), v8i8),
                 (sext_inreg (v8i16 (ARMVectorRegCast (ARMvrev16 (v16i8 MQPR:$src2)))), v8i8)),
            (MVE_VMULLTs8 MQPR:$src1, MQPR:$src2)>;

  // ---- Unsigned ----

  // 32 -> 64
  def : Pat<(v2i64 (ARMvmullu (v4i32 MQPR:$src1), (v4i32 MQPR:$src2))),
            (MVE_VMULLBu32 MQPR:$src1, MQPR:$src2)>;
  def : Pat<(v2i64 (ARMvmullu (v4i32 (ARMvrev64 (v4i32 MQPR:$src1))),
                              (v4i32 (ARMvrev64 (v4i32 MQPR:$src2))))),
            (MVE_VMULLTu32 MQPR:$src1, MQPR:$src2)>;

  // 16 -> 32, with the zero extend expressed as an `and` with a vmov immediate
  def : Pat<(mul (and (v4i32 MQPR:$src1), (v4i32 (ARMvmovImm (i32 0xCFF)))),
                 (and (v4i32 MQPR:$src2), (v4i32 (ARMvmovImm (i32 0xCFF))))),
            (MVE_VMULLBu16 MQPR:$src1, MQPR:$src2)>;
  def : Pat<(mul (and (v4i32 (ARMVectorRegCast (ARMvrev32 (v8i16 MQPR:$src1)))),
                      (v4i32 (ARMvmovImm (i32 0xCFF)))),
                 (and (v4i32 (ARMVectorRegCast (ARMvrev32 (v8i16 MQPR:$src2)))),
                      (v4i32 (ARMvmovImm (i32 0xCFF))))),
            (MVE_VMULLTu16 MQPR:$src1, MQPR:$src2)>;

  // 8 -> 16, with the zero extend expressed as a vbic immediate
  def : Pat<(mul (ARMvbicImm (v8i16 MQPR:$src1), (i32 0xAFF)),
                 (ARMvbicImm (v8i16 MQPR:$src2), (i32 0xAFF))),
            (MVE_VMULLBu8 MQPR:$src1, MQPR:$src2)>;
  def : Pat<(mul (ARMvbicImm (v8i16 (ARMVectorRegCast (ARMvrev16 (v16i8 MQPR:$src1)))), (i32 0xAFF)),
                 (ARMvbicImm (v8i16 (ARMVectorRegCast (ARMvrev16 (v16i8 MQPR:$src2)))), (i32 0xAFF))),
            (MVE_VMULLTu8 MQPR:$src1, MQPR:$src2)>;
}
|
||||
|
||||
class MVE_VxMULH<string iname, string suffix, bit U, bits<2> size, bit round,
|
||||
list<dag> pattern=[]>
|
||||
: MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd),
|
||||
|
|
|
@ -526,11 +526,6 @@ def NEONzip : SDNode<"ARMISD::VZIP", SDTARMVSHUF2>;
|
|||
def NEONuzp : SDNode<"ARMISD::VUZP", SDTARMVSHUF2>;
|
||||
def NEONtrn : SDNode<"ARMISD::VTRN", SDTARMVSHUF2>;
|
||||
|
||||
def SDTARMVMULL : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>,
|
||||
SDTCisSameAs<1, 2>]>;
|
||||
def NEONvmulls : SDNode<"ARMISD::VMULLs", SDTARMVMULL>;
|
||||
def NEONvmullu : SDNode<"ARMISD::VMULLu", SDTARMVMULL>;
|
||||
|
||||
def SDTARMVTBL1 : SDTypeProfile<1, 2, [SDTCisVT<0, v8i8>, SDTCisVT<1, v8i8>,
|
||||
SDTCisVT<2, v8i8>]>;
|
||||
def SDTARMVTBL2 : SDTypeProfile<1, 3, [SDTCisVT<0, v8i8>, SDTCisVT<1, v8i8>,
|
||||
|
@ -4428,17 +4423,17 @@ def : Pat<(v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$src1),
|
|||
let PostEncoderMethod = "NEONThumb2DataIPostEncoder",
|
||||
DecoderNamespace = "NEONData" in {
|
||||
defm VMULLs : N3VL_QHS<0,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D,
|
||||
"vmull", "s", NEONvmulls, 1>;
|
||||
"vmull", "s", ARMvmulls, 1>;
|
||||
defm VMULLu : N3VL_QHS<1,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D,
|
||||
"vmull", "u", NEONvmullu, 1>;
|
||||
"vmull", "u", ARMvmullu, 1>;
|
||||
def VMULLp8 : N3VLInt<0, 1, 0b00, 0b1110, 0, IIC_VMULi16D, "vmull", "p8",
|
||||
v8i16, v8i8, int_arm_neon_vmullp, 1>;
|
||||
def VMULLp64 : N3VLIntnp<0b00101, 0b10, 0b1110, 0, 0, NoItinerary,
|
||||
"vmull", "p64", v2i64, v1i64, int_arm_neon_vmullp, 1>,
|
||||
Requires<[HasV8, HasCrypto]>;
|
||||
}
|
||||
defm VMULLsls : N3VLSL_HS<0, 0b1010, IIC_VMULi16D, "vmull", "s", NEONvmulls>;
|
||||
defm VMULLslu : N3VLSL_HS<1, 0b1010, IIC_VMULi16D, "vmull", "u", NEONvmullu>;
|
||||
defm VMULLsls : N3VLSL_HS<0, 0b1010, IIC_VMULi16D, "vmull", "s", ARMvmulls>;
|
||||
defm VMULLslu : N3VLSL_HS<1, 0b1010, IIC_VMULi16D, "vmull", "u", ARMvmullu>;
|
||||
|
||||
// VQDMULL : Vector Saturating Doubling Multiply Long (Q = D * D)
|
||||
defm VQDMULL : N3VLInt_HS<0,1,0b1101,0, IIC_VMULi16D, IIC_VMULi32D,
|
||||
|
@ -4508,12 +4503,12 @@ def : Pat<(v4f32 (fadd_mlx (v4f32 QPR:$src1),
|
|||
|
||||
// VMLAL : Vector Multiply Accumulate Long (Q += D * D)
|
||||
defm VMLALs : N3VLMulOp_QHS<0,1,0b1000,0, IIC_VMACi16D, IIC_VMACi32D,
|
||||
"vmlal", "s", NEONvmulls, add>;
|
||||
"vmlal", "s", ARMvmulls, add>;
|
||||
defm VMLALu : N3VLMulOp_QHS<1,1,0b1000,0, IIC_VMACi16D, IIC_VMACi32D,
|
||||
"vmlal", "u", NEONvmullu, add>;
|
||||
"vmlal", "u", ARMvmullu, add>;
|
||||
|
||||
defm VMLALsls : N3VLMulOpSL_HS<0, 0b0010, "vmlal", "s", NEONvmulls, add>;
|
||||
defm VMLALslu : N3VLMulOpSL_HS<1, 0b0010, "vmlal", "u", NEONvmullu, add>;
|
||||
defm VMLALsls : N3VLMulOpSL_HS<0, 0b0010, "vmlal", "s", ARMvmulls, add>;
|
||||
defm VMLALslu : N3VLMulOpSL_HS<1, 0b0010, "vmlal", "u", ARMvmullu, add>;
|
||||
|
||||
let Predicates = [HasNEON, HasV8_1a] in {
|
||||
// v8.1a Neon Rounding Double Multiply-Op vector operations,
|
||||
|
@ -4741,12 +4736,12 @@ def : Pat<(v4f32 (fsub_mlx (v4f32 QPR:$src1),
|
|||
|
||||
// VMLSL : Vector Multiply Subtract Long (Q -= D * D)
|
||||
defm VMLSLs : N3VLMulOp_QHS<0,1,0b1010,0, IIC_VMACi16D, IIC_VMACi32D,
|
||||
"vmlsl", "s", NEONvmulls, sub>;
|
||||
"vmlsl", "s", ARMvmulls, sub>;
|
||||
defm VMLSLu : N3VLMulOp_QHS<1,1,0b1010,0, IIC_VMACi16D, IIC_VMACi32D,
|
||||
"vmlsl", "u", NEONvmullu, sub>;
|
||||
"vmlsl", "u", ARMvmullu, sub>;
|
||||
|
||||
defm VMLSLsls : N3VLMulOpSL_HS<0, 0b0110, "vmlsl", "s", NEONvmulls, sub>;
|
||||
defm VMLSLslu : N3VLMulOpSL_HS<1, 0b0110, "vmlsl", "u", NEONvmullu, sub>;
|
||||
defm VMLSLsls : N3VLMulOpSL_HS<0, 0b0110, "vmlsl", "s", ARMvmulls, sub>;
|
||||
defm VMLSLslu : N3VLMulOpSL_HS<1, 0b0110, "vmlsl", "u", ARMvmullu, sub>;
|
||||
|
||||
// VQDMLSL : Vector Saturating Doubling Multiply Subtract Long (Q -= D * D)
|
||||
defm VQDMLSL : N3VLInt3_HS<0, 1, 0b1011, 0, IIC_VMACi16D, IIC_VMACi32D,
|
||||
|
|
|
@ -41,12 +41,13 @@ entry:
|
|||
define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_zext(<2 x i32> %x, <2 x i32> %y) {
|
||||
; CHECK-LABEL: add_v2i32_v2i64_zext:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vmov r0, s4
|
||||
; CHECK-NEXT: vmov r1, s0
|
||||
; CHECK-NEXT: vmov r2, s6
|
||||
; CHECK-NEXT: vmov r3, s2
|
||||
; CHECK-NEXT: umull r0, r1, r1, r0
|
||||
; CHECK-NEXT: umlal r0, r1, r3, r2
|
||||
; CHECK-NEXT: vmullb.u32 q2, q0, q1
|
||||
; CHECK-NEXT: vmov r0, s10
|
||||
; CHECK-NEXT: vmov r3, s8
|
||||
; CHECK-NEXT: vmov r1, s11
|
||||
; CHECK-NEXT: vmov r2, s9
|
||||
; CHECK-NEXT: adds r0, r0, r3
|
||||
; CHECK-NEXT: adcs r1, r2
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%xx = zext <2 x i32> %x to <2 x i64>
|
||||
|
@ -59,12 +60,13 @@ entry:
|
|||
define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_sext(<2 x i32> %x, <2 x i32> %y) {
|
||||
; CHECK-LABEL: add_v2i32_v2i64_sext:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vmov r0, s4
|
||||
; CHECK-NEXT: vmov r1, s0
|
||||
; CHECK-NEXT: vmov r2, s6
|
||||
; CHECK-NEXT: vmov r3, s2
|
||||
; CHECK-NEXT: smull r0, r1, r1, r0
|
||||
; CHECK-NEXT: smlal r0, r1, r3, r2
|
||||
; CHECK-NEXT: vmullb.s32 q2, q0, q1
|
||||
; CHECK-NEXT: vmov r0, s10
|
||||
; CHECK-NEXT: vmov r3, s8
|
||||
; CHECK-NEXT: vmov r1, s11
|
||||
; CHECK-NEXT: vmov r2, s9
|
||||
; CHECK-NEXT: adds r0, r0, r3
|
||||
; CHECK-NEXT: adcs r1, r2
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%xx = sext <2 x i32> %x to <2 x i64>
|
||||
|
@ -306,10 +308,8 @@ define arm_aapcs_vfpcc zeroext i16 @add_v16i8_v16i16_zext(<16 x i8> %x, <16 x i8
|
|||
; CHECK-NEXT: vmov.16 q3[6], r0
|
||||
; CHECK-NEXT: vmov.u8 r0, q0[15]
|
||||
; CHECK-NEXT: vmov.16 q3[7], r0
|
||||
; CHECK-NEXT: vmovlb.u8 q2, q2
|
||||
; CHECK-NEXT: vmovlb.u8 q3, q3
|
||||
; CHECK-NEXT: vmov.u8 r0, q1[0]
|
||||
; CHECK-NEXT: vmul.i16 q2, q3, q2
|
||||
; CHECK-NEXT: vmullb.u8 q2, q3, q2
|
||||
; CHECK-NEXT: vmov.16 q3[0], r0
|
||||
; CHECK-NEXT: vmov.u8 r0, q1[1]
|
||||
; CHECK-NEXT: vmov.16 q3[1], r0
|
||||
|
@ -326,24 +326,22 @@ define arm_aapcs_vfpcc zeroext i16 @add_v16i8_v16i16_zext(<16 x i8> %x, <16 x i8
|
|||
; CHECK-NEXT: vmov.u8 r0, q1[7]
|
||||
; CHECK-NEXT: vmov.16 q3[7], r0
|
||||
; CHECK-NEXT: vmov.u8 r0, q0[0]
|
||||
; CHECK-NEXT: vmovlb.u8 q1, q3
|
||||
; CHECK-NEXT: vmov.16 q3[0], r0
|
||||
; CHECK-NEXT: vmov.16 q1[0], r0
|
||||
; CHECK-NEXT: vmov.u8 r0, q0[1]
|
||||
; CHECK-NEXT: vmov.16 q3[1], r0
|
||||
; CHECK-NEXT: vmov.16 q1[1], r0
|
||||
; CHECK-NEXT: vmov.u8 r0, q0[2]
|
||||
; CHECK-NEXT: vmov.16 q3[2], r0
|
||||
; CHECK-NEXT: vmov.16 q1[2], r0
|
||||
; CHECK-NEXT: vmov.u8 r0, q0[3]
|
||||
; CHECK-NEXT: vmov.16 q3[3], r0
|
||||
; CHECK-NEXT: vmov.16 q1[3], r0
|
||||
; CHECK-NEXT: vmov.u8 r0, q0[4]
|
||||
; CHECK-NEXT: vmov.16 q3[4], r0
|
||||
; CHECK-NEXT: vmov.16 q1[4], r0
|
||||
; CHECK-NEXT: vmov.u8 r0, q0[5]
|
||||
; CHECK-NEXT: vmov.16 q3[5], r0
|
||||
; CHECK-NEXT: vmov.16 q1[5], r0
|
||||
; CHECK-NEXT: vmov.u8 r0, q0[6]
|
||||
; CHECK-NEXT: vmov.16 q3[6], r0
|
||||
; CHECK-NEXT: vmov.16 q1[6], r0
|
||||
; CHECK-NEXT: vmov.u8 r0, q0[7]
|
||||
; CHECK-NEXT: vmov.16 q3[7], r0
|
||||
; CHECK-NEXT: vmovlb.u8 q0, q3
|
||||
; CHECK-NEXT: vmul.i16 q0, q0, q1
|
||||
; CHECK-NEXT: vmov.16 q1[7], r0
|
||||
; CHECK-NEXT: vmullb.u8 q0, q1, q3
|
||||
; CHECK-NEXT: vadd.i16 q0, q0, q2
|
||||
; CHECK-NEXT: vaddv.u16 r0, q0
|
||||
; CHECK-NEXT: uxth r0, r0
|
||||
|
@ -391,10 +389,8 @@ define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_sext(<16 x i8> %x, <16 x i8
|
|||
; CHECK-NEXT: vmov.16 q3[6], r0
|
||||
; CHECK-NEXT: vmov.u8 r0, q0[15]
|
||||
; CHECK-NEXT: vmov.16 q3[7], r0
|
||||
; CHECK-NEXT: vmovlb.s8 q2, q2
|
||||
; CHECK-NEXT: vmovlb.s8 q3, q3
|
||||
; CHECK-NEXT: vmov.u8 r0, q1[0]
|
||||
; CHECK-NEXT: vmul.i16 q2, q3, q2
|
||||
; CHECK-NEXT: vmullb.s8 q2, q3, q2
|
||||
; CHECK-NEXT: vmov.16 q3[0], r0
|
||||
; CHECK-NEXT: vmov.u8 r0, q1[1]
|
||||
; CHECK-NEXT: vmov.16 q3[1], r0
|
||||
|
@ -411,24 +407,22 @@ define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_sext(<16 x i8> %x, <16 x i8
|
|||
; CHECK-NEXT: vmov.u8 r0, q1[7]
|
||||
; CHECK-NEXT: vmov.16 q3[7], r0
|
||||
; CHECK-NEXT: vmov.u8 r0, q0[0]
|
||||
; CHECK-NEXT: vmovlb.s8 q1, q3
|
||||
; CHECK-NEXT: vmov.16 q3[0], r0
|
||||
; CHECK-NEXT: vmov.16 q1[0], r0
|
||||
; CHECK-NEXT: vmov.u8 r0, q0[1]
|
||||
; CHECK-NEXT: vmov.16 q3[1], r0
|
||||
; CHECK-NEXT: vmov.16 q1[1], r0
|
||||
; CHECK-NEXT: vmov.u8 r0, q0[2]
|
||||
; CHECK-NEXT: vmov.16 q3[2], r0
|
||||
; CHECK-NEXT: vmov.16 q1[2], r0
|
||||
; CHECK-NEXT: vmov.u8 r0, q0[3]
|
||||
; CHECK-NEXT: vmov.16 q3[3], r0
|
||||
; CHECK-NEXT: vmov.16 q1[3], r0
|
||||
; CHECK-NEXT: vmov.u8 r0, q0[4]
|
||||
; CHECK-NEXT: vmov.16 q3[4], r0
|
||||
; CHECK-NEXT: vmov.16 q1[4], r0
|
||||
; CHECK-NEXT: vmov.u8 r0, q0[5]
|
||||
; CHECK-NEXT: vmov.16 q3[5], r0
|
||||
; CHECK-NEXT: vmov.16 q1[5], r0
|
||||
; CHECK-NEXT: vmov.u8 r0, q0[6]
|
||||
; CHECK-NEXT: vmov.16 q3[6], r0
|
||||
; CHECK-NEXT: vmov.16 q1[6], r0
|
||||
; CHECK-NEXT: vmov.u8 r0, q0[7]
|
||||
; CHECK-NEXT: vmov.16 q3[7], r0
|
||||
; CHECK-NEXT: vmovlb.s8 q0, q3
|
||||
; CHECK-NEXT: vmul.i16 q0, q0, q1
|
||||
; CHECK-NEXT: vmov.16 q1[7], r0
|
||||
; CHECK-NEXT: vmullb.s8 q0, q1, q3
|
||||
; CHECK-NEXT: vadd.i16 q0, q0, q2
|
||||
; CHECK-NEXT: vaddv.u16 r0, q0
|
||||
; CHECK-NEXT: sxth r0, r0
|
||||
|
@ -444,9 +438,7 @@ entry:
|
|||
define arm_aapcs_vfpcc zeroext i16 @add_v8i8_v8i16_zext(<8 x i8> %x, <8 x i8> %y) {
|
||||
; CHECK-LABEL: add_v8i8_v8i16_zext:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vmovlb.u8 q1, q1
|
||||
; CHECK-NEXT: vmovlb.u8 q0, q0
|
||||
; CHECK-NEXT: vmul.i16 q0, q0, q1
|
||||
; CHECK-NEXT: vmullb.u8 q0, q0, q1
|
||||
; CHECK-NEXT: vaddv.u16 r0, q0
|
||||
; CHECK-NEXT: uxth r0, r0
|
||||
; CHECK-NEXT: bx lr
|
||||
|
@ -461,9 +453,7 @@ entry:
|
|||
define arm_aapcs_vfpcc signext i16 @add_v8i8_v8i16_sext(<8 x i8> %x, <8 x i8> %y) {
|
||||
; CHECK-LABEL: add_v8i8_v8i16_sext:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vmovlb.s8 q1, q1
|
||||
; CHECK-NEXT: vmovlb.s8 q0, q0
|
||||
; CHECK-NEXT: vmul.i16 q0, q0, q1
|
||||
; CHECK-NEXT: vmullb.s8 q0, q0, q1
|
||||
; CHECK-NEXT: vaddv.u16 r0, q0
|
||||
; CHECK-NEXT: sxth r0, r0
|
||||
; CHECK-NEXT: bx lr
|
||||
|
@ -990,14 +980,15 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_zext(<2 x i32> %x, <2 x i32> %y,
|
|||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: .save {r7, lr}
|
||||
; CHECK-NEXT: push {r7, lr}
|
||||
; CHECK-NEXT: vmov r2, s4
|
||||
; CHECK-NEXT: vmov r3, s0
|
||||
; CHECK-NEXT: vmov r12, s6
|
||||
; CHECK-NEXT: umull r2, lr, r3, r2
|
||||
; CHECK-NEXT: vmov r3, s2
|
||||
; CHECK-NEXT: umlal r2, lr, r3, r12
|
||||
; CHECK-NEXT: vmullb.u32 q2, q0, q1
|
||||
; CHECK-NEXT: vmov r2, s10
|
||||
; CHECK-NEXT: vmov r3, s8
|
||||
; CHECK-NEXT: vmov r12, s11
|
||||
; CHECK-NEXT: vmov lr, s9
|
||||
; CHECK-NEXT: adds r2, r2, r3
|
||||
; CHECK-NEXT: adc.w r3, lr, r12
|
||||
; CHECK-NEXT: adds r0, r0, r2
|
||||
; CHECK-NEXT: adc.w r1, r1, lr
|
||||
; CHECK-NEXT: adcs r1, r3
|
||||
; CHECK-NEXT: pop {r7, pc}
|
||||
entry:
|
||||
%xx = zext <2 x i32> %x to <2 x i64>
|
||||
|
@ -1013,14 +1004,15 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_sext(<2 x i32> %x, <2 x i32> %y,
|
|||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: .save {r7, lr}
|
||||
; CHECK-NEXT: push {r7, lr}
|
||||
; CHECK-NEXT: vmov r2, s4
|
||||
; CHECK-NEXT: vmov r3, s0
|
||||
; CHECK-NEXT: vmov r12, s6
|
||||
; CHECK-NEXT: smull r2, lr, r3, r2
|
||||
; CHECK-NEXT: vmov r3, s2
|
||||
; CHECK-NEXT: smlal r2, lr, r3, r12
|
||||
; CHECK-NEXT: vmullb.s32 q2, q0, q1
|
||||
; CHECK-NEXT: vmov r2, s10
|
||||
; CHECK-NEXT: vmov r3, s8
|
||||
; CHECK-NEXT: vmov r12, s11
|
||||
; CHECK-NEXT: vmov lr, s9
|
||||
; CHECK-NEXT: adds r2, r2, r3
|
||||
; CHECK-NEXT: adc.w r3, lr, r12
|
||||
; CHECK-NEXT: adds r0, r0, r2
|
||||
; CHECK-NEXT: adc.w r1, r1, lr
|
||||
; CHECK-NEXT: adcs r1, r3
|
||||
; CHECK-NEXT: pop {r7, pc}
|
||||
entry:
|
||||
%xx = sext <2 x i32> %x to <2 x i64>
|
||||
|
@ -1284,10 +1276,8 @@ define arm_aapcs_vfpcc zeroext i16 @add_v16i8_v16i16_acc_zext(<16 x i8> %x, <16
|
|||
; CHECK-NEXT: vmov.16 q3[6], r1
|
||||
; CHECK-NEXT: vmov.u8 r1, q0[15]
|
||||
; CHECK-NEXT: vmov.16 q3[7], r1
|
||||
; CHECK-NEXT: vmovlb.u8 q2, q2
|
||||
; CHECK-NEXT: vmovlb.u8 q3, q3
|
||||
; CHECK-NEXT: vmov.u8 r1, q1[0]
|
||||
; CHECK-NEXT: vmul.i16 q2, q3, q2
|
||||
; CHECK-NEXT: vmullb.u8 q2, q3, q2
|
||||
; CHECK-NEXT: vmov.16 q3[0], r1
|
||||
; CHECK-NEXT: vmov.u8 r1, q1[1]
|
||||
; CHECK-NEXT: vmov.16 q3[1], r1
|
||||
|
@ -1304,24 +1294,22 @@ define arm_aapcs_vfpcc zeroext i16 @add_v16i8_v16i16_acc_zext(<16 x i8> %x, <16
|
|||
; CHECK-NEXT: vmov.u8 r1, q1[7]
|
||||
; CHECK-NEXT: vmov.16 q3[7], r1
|
||||
; CHECK-NEXT: vmov.u8 r1, q0[0]
|
||||
; CHECK-NEXT: vmovlb.u8 q1, q3
|
||||
; CHECK-NEXT: vmov.16 q3[0], r1
|
||||
; CHECK-NEXT: vmov.16 q1[0], r1
|
||||
; CHECK-NEXT: vmov.u8 r1, q0[1]
|
||||
; CHECK-NEXT: vmov.16 q3[1], r1
|
||||
; CHECK-NEXT: vmov.16 q1[1], r1
|
||||
; CHECK-NEXT: vmov.u8 r1, q0[2]
|
||||
; CHECK-NEXT: vmov.16 q3[2], r1
|
||||
; CHECK-NEXT: vmov.16 q1[2], r1
|
||||
; CHECK-NEXT: vmov.u8 r1, q0[3]
|
||||
; CHECK-NEXT: vmov.16 q3[3], r1
|
||||
; CHECK-NEXT: vmov.16 q1[3], r1
|
||||
; CHECK-NEXT: vmov.u8 r1, q0[4]
|
||||
; CHECK-NEXT: vmov.16 q3[4], r1
|
||||
; CHECK-NEXT: vmov.16 q1[4], r1
|
||||
; CHECK-NEXT: vmov.u8 r1, q0[5]
|
||||
; CHECK-NEXT: vmov.16 q3[5], r1
|
||||
; CHECK-NEXT: vmov.16 q1[5], r1
|
||||
; CHECK-NEXT: vmov.u8 r1, q0[6]
|
||||
; CHECK-NEXT: vmov.16 q3[6], r1
|
||||
; CHECK-NEXT: vmov.16 q1[6], r1
|
||||
; CHECK-NEXT: vmov.u8 r1, q0[7]
|
||||
; CHECK-NEXT: vmov.16 q3[7], r1
|
||||
; CHECK-NEXT: vmovlb.u8 q0, q3
|
||||
; CHECK-NEXT: vmul.i16 q0, q0, q1
|
||||
; CHECK-NEXT: vmov.16 q1[7], r1
|
||||
; CHECK-NEXT: vmullb.u8 q0, q1, q3
|
||||
; CHECK-NEXT: vadd.i16 q0, q0, q2
|
||||
; CHECK-NEXT: vaddva.u16 r0, q0
|
||||
; CHECK-NEXT: uxth r0, r0
|
||||
|
@ -1370,10 +1358,8 @@ define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_acc_sext(<16 x i8> %x, <16
|
|||
; CHECK-NEXT: vmov.16 q3[6], r1
|
||||
; CHECK-NEXT: vmov.u8 r1, q0[15]
|
||||
; CHECK-NEXT: vmov.16 q3[7], r1
|
||||
; CHECK-NEXT: vmovlb.s8 q2, q2
|
||||
; CHECK-NEXT: vmovlb.s8 q3, q3
|
||||
; CHECK-NEXT: vmov.u8 r1, q1[0]
|
||||
; CHECK-NEXT: vmul.i16 q2, q3, q2
|
||||
; CHECK-NEXT: vmullb.s8 q2, q3, q2
|
||||
; CHECK-NEXT: vmov.16 q3[0], r1
|
||||
; CHECK-NEXT: vmov.u8 r1, q1[1]
|
||||
; CHECK-NEXT: vmov.16 q3[1], r1
|
||||
|
@ -1390,24 +1376,22 @@ define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_acc_sext(<16 x i8> %x, <16
|
|||
; CHECK-NEXT: vmov.u8 r1, q1[7]
|
||||
; CHECK-NEXT: vmov.16 q3[7], r1
|
||||
; CHECK-NEXT: vmov.u8 r1, q0[0]
|
||||
; CHECK-NEXT: vmovlb.s8 q1, q3
|
||||
; CHECK-NEXT: vmov.16 q3[0], r1
|
||||
; CHECK-NEXT: vmov.16 q1[0], r1
|
||||
; CHECK-NEXT: vmov.u8 r1, q0[1]
|
||||
; CHECK-NEXT: vmov.16 q3[1], r1
|
||||
; CHECK-NEXT: vmov.16 q1[1], r1
|
||||
; CHECK-NEXT: vmov.u8 r1, q0[2]
|
||||
; CHECK-NEXT: vmov.16 q3[2], r1
|
||||
; CHECK-NEXT: vmov.16 q1[2], r1
|
||||
; CHECK-NEXT: vmov.u8 r1, q0[3]
|
||||
; CHECK-NEXT: vmov.16 q3[3], r1
|
||||
; CHECK-NEXT: vmov.16 q1[3], r1
|
||||
; CHECK-NEXT: vmov.u8 r1, q0[4]
|
||||
; CHECK-NEXT: vmov.16 q3[4], r1
|
||||
; CHECK-NEXT: vmov.16 q1[4], r1
|
||||
; CHECK-NEXT: vmov.u8 r1, q0[5]
|
||||
; CHECK-NEXT: vmov.16 q3[5], r1
|
||||
; CHECK-NEXT: vmov.16 q1[5], r1
|
||||
; CHECK-NEXT: vmov.u8 r1, q0[6]
|
||||
; CHECK-NEXT: vmov.16 q3[6], r1
|
||||
; CHECK-NEXT: vmov.16 q1[6], r1
|
||||
; CHECK-NEXT: vmov.u8 r1, q0[7]
|
||||
; CHECK-NEXT: vmov.16 q3[7], r1
|
||||
; CHECK-NEXT: vmovlb.s8 q0, q3
|
||||
; CHECK-NEXT: vmul.i16 q0, q0, q1
|
||||
; CHECK-NEXT: vmov.16 q1[7], r1
|
||||
; CHECK-NEXT: vmullb.s8 q0, q1, q3
|
||||
; CHECK-NEXT: vadd.i16 q0, q0, q2
|
||||
; CHECK-NEXT: vaddva.u16 r0, q0
|
||||
; CHECK-NEXT: sxth r0, r0
|
||||
|
@ -1424,9 +1408,7 @@ entry:
|
|||
define arm_aapcs_vfpcc zeroext i16 @add_v8i8_v8i16_acc_zext(<8 x i8> %x, <8 x i8> %y, i16 %a) {
|
||||
; CHECK-LABEL: add_v8i8_v8i16_acc_zext:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vmovlb.u8 q1, q1
|
||||
; CHECK-NEXT: vmovlb.u8 q0, q0
|
||||
; CHECK-NEXT: vmul.i16 q0, q0, q1
|
||||
; CHECK-NEXT: vmullb.u8 q0, q0, q1
|
||||
; CHECK-NEXT: vaddva.u16 r0, q0
|
||||
; CHECK-NEXT: uxth r0, r0
|
||||
; CHECK-NEXT: bx lr
|
||||
|
@ -1442,9 +1424,7 @@ entry:
|
|||
define arm_aapcs_vfpcc signext i16 @add_v8i8_v8i16_acc_sext(<8 x i8> %x, <8 x i8> %y, i16 %a) {
|
||||
; CHECK-LABEL: add_v8i8_v8i16_acc_sext:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vmovlb.s8 q1, q1
|
||||
; CHECK-NEXT: vmovlb.s8 q0, q0
|
||||
; CHECK-NEXT: vmul.i16 q0, q0, q1
|
||||
; CHECK-NEXT: vmullb.s8 q0, q0, q1
|
||||
; CHECK-NEXT: vaddva.u16 r0, q0
|
||||
; CHECK-NEXT: sxth r0, r0
|
||||
; CHECK-NEXT: bx lr
|
||||
|
|
|
@ -6,38 +6,33 @@ define arm_aapcs_vfpcc void @test32(i32* noalias nocapture readonly %x, i32* noa
|
|||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: .save {r5, lr}
|
||||
; CHECK-NEXT: push {r5, lr}
|
||||
; CHECK-NEXT: .vsave {d8, d9}
|
||||
; CHECK-NEXT: vpush {d8, d9}
|
||||
; CHECK-NEXT: cmp r3, #1
|
||||
; CHECK-NEXT: blt .LBB0_2
|
||||
; CHECK-NEXT: it lt
|
||||
; CHECK-NEXT: poplt {r5, pc}
|
||||
; CHECK-NEXT: .LBB0_1: @ %vector.body
|
||||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: vldrw.u32 q2, [r0], #16
|
||||
; CHECK-NEXT: vldrw.u32 q3, [r1], #16
|
||||
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
|
||||
; CHECK-NEXT: vldrw.u32 q2, [r1], #16
|
||||
; CHECK-NEXT: subs r3, #4
|
||||
; CHECK-NEXT: vrev64.32 q1, q2
|
||||
; CHECK-NEXT: vrev64.32 q4, q3
|
||||
; CHECK-NEXT: vmov r12, s4
|
||||
; CHECK-NEXT: vmov lr, s16
|
||||
; CHECK-NEXT: smull r12, r5, lr, r12
|
||||
; CHECK-NEXT: vmullt.s32 q3, q2, q1
|
||||
; CHECK-NEXT: vmov r5, s13
|
||||
; CHECK-NEXT: vmov r12, s12
|
||||
; CHECK-NEXT: lsrl r12, r5, #31
|
||||
; CHECK-NEXT: vmov.32 q0[0], r12
|
||||
; CHECK-NEXT: vmov r12, s6
|
||||
; CHECK-NEXT: vmov r12, s14
|
||||
; CHECK-NEXT: vmov.32 q0[1], r5
|
||||
; CHECK-NEXT: vmov r5, s18
|
||||
; CHECK-NEXT: smull r12, r5, r5, r12
|
||||
; CHECK-NEXT: vmov r5, s15
|
||||
; CHECK-NEXT: lsrl r12, r5, #31
|
||||
; CHECK-NEXT: vmullb.s32 q3, q2, q1
|
||||
; CHECK-NEXT: vmov.32 q0[2], r12
|
||||
; CHECK-NEXT: vmov r12, s8
|
||||
; CHECK-NEXT: vmov r12, s12
|
||||
; CHECK-NEXT: vmov.32 q0[3], r5
|
||||
; CHECK-NEXT: vmov r5, s12
|
||||
; CHECK-NEXT: smull r12, r5, r5, r12
|
||||
; CHECK-NEXT: vmov r5, s13
|
||||
; CHECK-NEXT: lsrl r12, r5, #31
|
||||
; CHECK-NEXT: vmov.32 q1[0], r12
|
||||
; CHECK-NEXT: vmov r12, s10
|
||||
; CHECK-NEXT: vmov r12, s14
|
||||
; CHECK-NEXT: vmov.32 q1[1], r5
|
||||
; CHECK-NEXT: vmov r5, s14
|
||||
; CHECK-NEXT: smull r12, r5, r5, r12
|
||||
; CHECK-NEXT: vmov r5, s15
|
||||
; CHECK-NEXT: lsrl r12, r5, #31
|
||||
; CHECK-NEXT: vmov.32 q1[2], r12
|
||||
; CHECK-NEXT: vmov.32 q1[3], r5
|
||||
|
@ -52,8 +47,7 @@ define arm_aapcs_vfpcc void @test32(i32* noalias nocapture readonly %x, i32* noa
|
|||
; CHECK-NEXT: vmov.f32 s7, s10
|
||||
; CHECK-NEXT: vstrb.8 q1, [r2], #16
|
||||
; CHECK-NEXT: bne .LBB0_1
|
||||
; CHECK-NEXT: .LBB0_2: @ %for.cond.cleanup
|
||||
; CHECK-NEXT: vpop {d8, d9}
|
||||
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
|
||||
; CHECK-NEXT: pop {r5, pc}
|
||||
entry:
|
||||
%0 = and i32 %n, 3
|
||||
|
@ -103,17 +97,13 @@ define arm_aapcs_vfpcc void @test16(i16* noalias nocapture readonly %x, i16* noa
|
|||
; CHECK-NEXT: .LBB1_1: @ %vector.body
|
||||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: vldrh.u16 q0, [r0], #16
|
||||
; CHECK-NEXT: vldrh.u16 q2, [r1], #16
|
||||
; CHECK-NEXT: vldrh.u16 q1, [r1], #16
|
||||
; CHECK-NEXT: subs r3, #8
|
||||
; CHECK-NEXT: vmovlt.s16 q1, q0
|
||||
; CHECK-NEXT: vmovlt.s16 q3, q2
|
||||
; CHECK-NEXT: vmovlb.s16 q0, q0
|
||||
; CHECK-NEXT: vmovlb.s16 q2, q2
|
||||
; CHECK-NEXT: vmul.i32 q1, q3, q1
|
||||
; CHECK-NEXT: vmul.i32 q0, q2, q0
|
||||
; CHECK-NEXT: vshr.u32 q1, q1, #15
|
||||
; CHECK-NEXT: vmullt.s16 q2, q1, q0
|
||||
; CHECK-NEXT: vmullb.s16 q0, q1, q0
|
||||
; CHECK-NEXT: vshr.u32 q2, q2, #15
|
||||
; CHECK-NEXT: vshr.u32 q0, q0, #15
|
||||
; CHECK-NEXT: vmovnt.i32 q0, q1
|
||||
; CHECK-NEXT: vmovnt.i32 q0, q2
|
||||
; CHECK-NEXT: vstrb.8 q0, [r2], #16
|
||||
; CHECK-NEXT: bne .LBB1_1
|
||||
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
|
||||
|
@ -166,17 +156,13 @@ define arm_aapcs_vfpcc void @test8(i8* noalias nocapture readonly %x, i8* noalia
|
|||
; CHECK-NEXT: .LBB2_1: @ %vector.body
|
||||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: vldrb.u8 q0, [r0], #16
|
||||
; CHECK-NEXT: vldrb.u8 q2, [r1], #16
|
||||
; CHECK-NEXT: vldrb.u8 q1, [r1], #16
|
||||
; CHECK-NEXT: subs r3, #16
|
||||
; CHECK-NEXT: vmovlt.u8 q1, q0
|
||||
; CHECK-NEXT: vmovlt.u8 q3, q2
|
||||
; CHECK-NEXT: vmovlb.u8 q0, q0
|
||||
; CHECK-NEXT: vmovlb.u8 q2, q2
|
||||
; CHECK-NEXT: vmul.i16 q1, q3, q1
|
||||
; CHECK-NEXT: vmul.i16 q0, q2, q0
|
||||
; CHECK-NEXT: vshr.u16 q1, q1, #7
|
||||
; CHECK-NEXT: vmullt.u8 q2, q1, q0
|
||||
; CHECK-NEXT: vmullb.u8 q0, q1, q0
|
||||
; CHECK-NEXT: vshr.u16 q2, q2, #7
|
||||
; CHECK-NEXT: vshr.u16 q0, q0, #7
|
||||
; CHECK-NEXT: vmovnt.i16 q0, q1
|
||||
; CHECK-NEXT: vmovnt.i16 q0, q2
|
||||
; CHECK-NEXT: vstrb.8 q0, [r2], #16
|
||||
; CHECK-NEXT: bne .LBB2_1
|
||||
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
|
||||
|
|
|
@ -4,16 +4,7 @@
|
|||
define arm_aapcs_vfpcc <2 x i64> @sext_02(<4 x i32> %src1, <4 x i32> %src2) {
|
||||
; CHECK-LABEL: sext_02:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vmov r0, s4
|
||||
; CHECK-NEXT: vmov r1, s0
|
||||
; CHECK-NEXT: smull r0, r1, r1, r0
|
||||
; CHECK-NEXT: vmov.32 q2[0], r0
|
||||
; CHECK-NEXT: vmov r0, s6
|
||||
; CHECK-NEXT: vmov.32 q2[1], r1
|
||||
; CHECK-NEXT: vmov r1, s2
|
||||
; CHECK-NEXT: smull r0, r1, r1, r0
|
||||
; CHECK-NEXT: vmov.32 q2[2], r0
|
||||
; CHECK-NEXT: vmov.32 q2[3], r1
|
||||
; CHECK-NEXT: vmullb.s32 q2, q0, q1
|
||||
; CHECK-NEXT: vmov q0, q2
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
|
@ -28,18 +19,8 @@ entry:
|
|||
define arm_aapcs_vfpcc <2 x i64> @sext_13(<4 x i32> %src1, <4 x i32> %src2) {
|
||||
; CHECK-LABEL: sext_13:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vrev64.32 q2, q1
|
||||
; CHECK-NEXT: vrev64.32 q1, q0
|
||||
; CHECK-NEXT: vmov r0, s8
|
||||
; CHECK-NEXT: vmov r1, s4
|
||||
; CHECK-NEXT: smull r0, r1, r1, r0
|
||||
; CHECK-NEXT: vmov.32 q0[0], r0
|
||||
; CHECK-NEXT: vmov r0, s10
|
||||
; CHECK-NEXT: vmov.32 q0[1], r1
|
||||
; CHECK-NEXT: vmov r1, s6
|
||||
; CHECK-NEXT: smull r0, r1, r1, r0
|
||||
; CHECK-NEXT: vmov.32 q0[2], r0
|
||||
; CHECK-NEXT: vmov.32 q0[3], r1
|
||||
; CHECK-NEXT: vmullt.s32 q2, q0, q1
|
||||
; CHECK-NEXT: vmov q0, q2
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%shuf1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
|
||||
|
@ -53,16 +34,7 @@ entry:
|
|||
define arm_aapcs_vfpcc <2 x i64> @zext_02(<4 x i32> %src1, <4 x i32> %src2) {
|
||||
; CHECK-LABEL: zext_02:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vmov r0, s4
|
||||
; CHECK-NEXT: vmov r1, s0
|
||||
; CHECK-NEXT: umull r0, r1, r1, r0
|
||||
; CHECK-NEXT: vmov.32 q2[0], r0
|
||||
; CHECK-NEXT: vmov r0, s6
|
||||
; CHECK-NEXT: vmov.32 q2[1], r1
|
||||
; CHECK-NEXT: vmov r1, s2
|
||||
; CHECK-NEXT: umull r0, r1, r1, r0
|
||||
; CHECK-NEXT: vmov.32 q2[2], r0
|
||||
; CHECK-NEXT: vmov.32 q2[3], r1
|
||||
; CHECK-NEXT: vmullb.u32 q2, q0, q1
|
||||
; CHECK-NEXT: vmov q0, q2
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
|
@ -77,18 +49,8 @@ entry:
|
|||
define arm_aapcs_vfpcc <2 x i64> @zext_13(<4 x i32> %src1, <4 x i32> %src2) {
|
||||
; CHECK-LABEL: zext_13:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vrev64.32 q2, q1
|
||||
; CHECK-NEXT: vrev64.32 q1, q0
|
||||
; CHECK-NEXT: vmov r0, s8
|
||||
; CHECK-NEXT: vmov r1, s4
|
||||
; CHECK-NEXT: umull r0, r1, r1, r0
|
||||
; CHECK-NEXT: vmov.32 q0[0], r0
|
||||
; CHECK-NEXT: vmov r0, s10
|
||||
; CHECK-NEXT: vmov.32 q0[1], r1
|
||||
; CHECK-NEXT: vmov r1, s6
|
||||
; CHECK-NEXT: umull r0, r1, r1, r0
|
||||
; CHECK-NEXT: vmov.32 q0[2], r0
|
||||
; CHECK-NEXT: vmov.32 q0[3], r1
|
||||
; CHECK-NEXT: vmullt.u32 q2, q0, q1
|
||||
; CHECK-NEXT: vmov q0, q2
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%shuf1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
|
||||
|
@ -103,9 +65,7 @@ entry:
|
|||
define arm_aapcs_vfpcc <4 x i32> @sext_0246(<8 x i16> %src1, <8 x i16> %src2) {
|
||||
; CHECK-LABEL: sext_0246:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vmovlb.s16 q1, q1
|
||||
; CHECK-NEXT: vmovlb.s16 q0, q0
|
||||
; CHECK-NEXT: vmul.i32 q0, q0, q1
|
||||
; CHECK-NEXT: vmullb.s16 q0, q0, q1
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%shuf1 = shufflevector <8 x i16> %src1, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
|
||||
|
@ -119,9 +79,7 @@ entry:
|
|||
define arm_aapcs_vfpcc <4 x i32> @sext_1357(<8 x i16> %src1, <8 x i16> %src2) {
|
||||
; CHECK-LABEL: sext_1357:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vmovlt.s16 q1, q1
|
||||
; CHECK-NEXT: vmovlt.s16 q0, q0
|
||||
; CHECK-NEXT: vmul.i32 q0, q0, q1
|
||||
; CHECK-NEXT: vmullt.s16 q0, q0, q1
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%shuf1 = shufflevector <8 x i16> %src1, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
|
||||
|
@ -135,9 +93,7 @@ entry:
|
|||
define arm_aapcs_vfpcc <4 x i32> @zext_0246(<8 x i16> %src1, <8 x i16> %src2) {
|
||||
; CHECK-LABEL: zext_0246:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vmovlb.u16 q1, q1
|
||||
; CHECK-NEXT: vmovlb.u16 q0, q0
|
||||
; CHECK-NEXT: vmul.i32 q0, q0, q1
|
||||
; CHECK-NEXT: vmullb.u16 q0, q0, q1
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%shuf1 = shufflevector <8 x i16> %src1, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
|
||||
|
@ -151,9 +107,7 @@ entry:
|
|||
define arm_aapcs_vfpcc <4 x i32> @zext_1357(<8 x i16> %src1, <8 x i16> %src2) {
|
||||
; CHECK-LABEL: zext_1357:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vmovlt.u16 q1, q1
|
||||
; CHECK-NEXT: vmovlt.u16 q0, q0
|
||||
; CHECK-NEXT: vmul.i32 q0, q0, q1
|
||||
; CHECK-NEXT: vmullt.u16 q0, q0, q1
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%shuf1 = shufflevector <8 x i16> %src1, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
|
||||
|
@ -167,9 +121,7 @@ entry:
|
|||
define arm_aapcs_vfpcc <8 x i16> @sext_02468101214(<16 x i8> %src1, <16 x i8> %src2) {
|
||||
; CHECK-LABEL: sext_02468101214:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vmovlb.s8 q1, q1
|
||||
; CHECK-NEXT: vmovlb.s8 q0, q0
|
||||
; CHECK-NEXT: vmul.i16 q0, q0, q1
|
||||
; CHECK-NEXT: vmullb.s8 q0, q0, q1
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%shuf1 = shufflevector <16 x i8> %src1, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
|
||||
|
@ -183,9 +135,7 @@ entry:
|
|||
define arm_aapcs_vfpcc <8 x i16> @sext_13579111315(<16 x i8> %src1, <16 x i8> %src2) {
|
||||
; CHECK-LABEL: sext_13579111315:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vmovlt.s8 q1, q1
|
||||
; CHECK-NEXT: vmovlt.s8 q0, q0
|
||||
; CHECK-NEXT: vmul.i16 q0, q0, q1
|
||||
; CHECK-NEXT: vmullt.s8 q0, q0, q1
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%shuf1 = shufflevector <16 x i8> %src1, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
|
||||
|
@ -199,9 +149,7 @@ entry:
|
|||
define arm_aapcs_vfpcc <8 x i16> @zext_02468101214(<16 x i8> %src1, <16 x i8> %src2) {
|
||||
; CHECK-LABEL: zext_02468101214:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vmovlb.u8 q1, q1
|
||||
; CHECK-NEXT: vmovlb.u8 q0, q0
|
||||
; CHECK-NEXT: vmul.i16 q0, q0, q1
|
||||
; CHECK-NEXT: vmullb.u8 q0, q0, q1
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%shuf1 = shufflevector <16 x i8> %src1, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
|
||||
|
@ -215,9 +163,7 @@ entry:
|
|||
define arm_aapcs_vfpcc <8 x i16> @zext_13579111315(<16 x i8> %src1, <16 x i8> %src2) {
|
||||
; CHECK-LABEL: zext_13579111315:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vmovlt.u8 q1, q1
|
||||
; CHECK-NEXT: vmovlt.u8 q0, q0
|
||||
; CHECK-NEXT: vmul.i16 q0, q0, q1
|
||||
; CHECK-NEXT: vmullt.u8 q0, q0, q1
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%shuf1 = shufflevector <16 x i8> %src1, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
|
||||
|
|
Loading…
Reference in New Issue