diff --git a/llvm/include/llvm/IR/IntrinsicsMips.td b/llvm/include/llvm/IR/IntrinsicsMips.td index abca235681d7..01158e34dddb 100644 --- a/llvm/include/llvm/IR/IntrinsicsMips.td +++ b/llvm/include/llvm/IR/IntrinsicsMips.td @@ -1728,13 +1728,17 @@ def int_mips_subvi_d : GCCBuiltin<"__builtin_msa_subvi_d">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem]>; def int_mips_vshf_b : GCCBuiltin<"__builtin_msa_vshf_b">, - Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; + Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty], + [IntrNoMem]>; def int_mips_vshf_h : GCCBuiltin<"__builtin_msa_vshf_h">, - Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; + Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty], + [IntrNoMem]>; def int_mips_vshf_w : GCCBuiltin<"__builtin_msa_vshf_w">, - Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; + Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], + [IntrNoMem]>; def int_mips_vshf_d : GCCBuiltin<"__builtin_msa_vshf_d">, - Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>; + Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty], + [IntrNoMem]>; def int_mips_xor_v : GCCBuiltin<"__builtin_msa_xor_v">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp index 5f019367b295..04f90833d827 100644 --- a/llvm/lib/Target/Mips/MipsISelLowering.cpp +++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp @@ -224,6 +224,7 @@ const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const { case MipsISD::VEXTRACT_SEXT_ELT: return "MipsISD::VEXTRACT_SEXT_ELT"; case MipsISD::VEXTRACT_ZEXT_ELT: return "MipsISD::VEXTRACT_ZEXT_ELT"; case MipsISD::VNOR: return "MipsISD::VNOR"; + case MipsISD::VSHF: return "MipsISD::VSHF"; default: return NULL; } } diff --git a/llvm/lib/Target/Mips/MipsISelLowering.h b/llvm/lib/Target/Mips/MipsISelLowering.h index ae82e7e7ed66..0cb67b8d4403 100644 --- a/llvm/lib/Target/Mips/MipsISelLowering.h +++ b/llvm/lib/Target/Mips/MipsISelLowering.h @@ -172,6 +172,9 @@ namespace llvm { VUMAX, VUMIN, + // Vector Shuffle with mask as an operand + VSHF, // Generic shuffle + // Combined (XOR (OR $a, $b), -1) VNOR, diff --git a/llvm/lib/Target/Mips/MipsMSAInstrInfo.td b/llvm/lib/Target/Mips/MipsMSAInstrInfo.td index 60dcdce0861c..4909743e3491 100644 --- a/llvm/lib/Target/Mips/MipsMSAInstrInfo.td +++ b/llvm/lib/Target/Mips/MipsMSAInstrInfo.td @@ -20,6 +20,9 @@ def SDT_VFSetCC : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisFP<1>, SDTCisSameAs<1, 2>, SDTCisVT<3, OtherVT>]>; +def SDT_VSHF : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisVec<0>, + SDTCisInt<1>, SDTCisVec<1>, + SDTCisSameAs<0, 2>, SDTCisSameAs<2, 3>]>; def MipsVAllNonZero : SDNode<"MipsISD::VALL_NONZERO", SDT_MipsVecCond>; def MipsVAnyNonZero : SDNode<"MipsISD::VANY_NONZERO", SDT_MipsVecCond>; @@ -35,6 +38,7 @@ def MipsVUMin : SDNode<"MipsISD::VUMIN", SDTIntBinOp, [SDNPCommutative, SDNPAssociative]>; def MipsVNOR : SDNode<"MipsISD::VNOR", SDTIntBinOp, [SDNPCommutative, SDNPAssociative]>; +def MipsVSHF : SDNode<"MipsISD::VSHF", SDT_VSHF>; def vsetcc : SDNode<"ISD::SETCC", SDT_VSetCC>; def vfsetcc : SDNode<"ISD::SETCC", SDT_VFSetCC>; @@ -1119,6 +1123,19 @@ class MSA_3R_DESC_BASE { + dag OutOperandList = (outs RCWD:$wd); + dag InOperandList = (ins RCWD:$wd_in, RCWS:$ws, RCWT:$wt); + string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $wt"); + list Pattern = [(set RCWD:$wd, (MipsVSHF RCWD:$wd_in, RCWS:$ws, + RCWT:$wt))]; + string Constraints = "$wd = $wd_in"; + InstrItinClass Itinerary = itin; +} + class MSA_3R_4R_DESC_BASE; class SUBVI_W_DESC : MSA_I5_DESC_BASE<"subvi.w", sub, vsplati32_uimm5, MSA128W>; class SUBVI_D_DESC : MSA_I5_DESC_BASE<"subvi.d", sub, vsplati64_uimm5, MSA128D>; -class VSHF_B_DESC : MSA_3R_DESC_BASE<"vshf.b", int_mips_vshf_b, MSA128B>; -class VSHF_H_DESC : MSA_3R_DESC_BASE<"vshf.h", int_mips_vshf_h, MSA128H>; -class VSHF_W_DESC : MSA_3R_DESC_BASE<"vshf.w", int_mips_vshf_w, MSA128W>; -class VSHF_D_DESC : MSA_3R_DESC_BASE<"vshf.d", int_mips_vshf_d, MSA128D>; +class VSHF_B_DESC : MSA_3R_VSHF_DESC_BASE<"vshf.b", MSA128B>; +class VSHF_H_DESC : MSA_3R_VSHF_DESC_BASE<"vshf.h", MSA128H>; +class VSHF_W_DESC : MSA_3R_VSHF_DESC_BASE<"vshf.w", MSA128W>; +class VSHF_D_DESC : MSA_3R_VSHF_DESC_BASE<"vshf.d", MSA128D>; class XOR_V_DESC : MSA_VEC_DESC_BASE<"xor.v", xor, MSA128B>; class XOR_V_H_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE; diff --git a/llvm/lib/Target/Mips/MipsSEISelLowering.cpp b/llvm/lib/Target/Mips/MipsSEISelLowering.cpp index f135b5f3857d..929e91eb60e5 100644 --- a/llvm/lib/Target/Mips/MipsSEISelLowering.cpp +++ b/llvm/lib/Target/Mips/MipsSEISelLowering.cpp @@ -180,6 +180,7 @@ addMSAIntType(MVT::SimpleValueType Ty, const TargetRegisterClass *RC) { setOperationAction(ISD::SRL, Ty, Legal); setOperationAction(ISD::SUB, Ty, Legal); setOperationAction(ISD::UDIV, Ty, Legal); + setOperationAction(ISD::VECTOR_SHUFFLE, Ty, Custom); setOperationAction(ISD::VSELECT, Ty, Legal); setOperationAction(ISD::XOR, Ty, Legal); @@ -259,6 +260,7 @@ SDValue MipsSETargetLowering::LowerOperation(SDValue Op, case ISD::INTRINSIC_VOID: return lowerINTRINSIC_VOID(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return lowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::BUILD_VECTOR: return lowerBUILD_VECTOR(Op, DAG); + case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, DAG); } return MipsTargetLowering::LowerOperation(Op, DAG); @@ -1470,6 +1472,12 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::mips_subvi_d: return lowerMSABinaryImmIntr(Op, DAG, ISD::SUB, lowerMSASplatImm(Op, 2, DAG)); + case Intrinsic::mips_vshf_b: + case Intrinsic::mips_vshf_h: + case Intrinsic::mips_vshf_w: + case Intrinsic::mips_vshf_d: + return DAG.getNode(MipsISD::VSHF, SDLoc(Op), Op->getValueType(0), + Op->getOperand(1), Op->getOperand(2), Op->getOperand(3)); case Intrinsic::mips_xor_v: return lowerMSABinaryIntr(Op, DAG, ISD::XOR); case Intrinsic::mips_xori_b: @@ -1727,6 +1735,76 @@ SDValue MipsSETargetLowering::lowerBUILD_VECTOR(SDValue Op, return SDValue(); } +// Lower VECTOR_SHUFFLE into VSHF. +// +// This mostly consists of converting the shuffle indices in Indices into a +// BUILD_VECTOR and adding it as an operand to the resulting VSHF. There is +// also code to eliminate unused operands of the VECTOR_SHUFFLE. For example, +// if the type is v8i16 and all the indices are less than 8 then the second +// operand is unused and can be replaced with anything. We choose to replace it +// with the used operand since this reduces the number of instructions overall. +static SDValue lowerVECTOR_SHUFFLE_VSHF(SDValue Op, EVT ResTy, + SmallVector Indices, + SelectionDAG &DAG) { + SmallVector Ops; + SDValue Op0; + SDValue Op1; + EVT MaskVecTy = ResTy.changeVectorElementTypeToInteger(); + EVT MaskEltTy = MaskVecTy.getVectorElementType(); + bool Using1stVec = false; + bool Using2ndVec = false; + SDLoc DL(Op); + int ResTyNumElts = ResTy.getVectorNumElements(); + + for (int i = 0; i < ResTyNumElts; ++i) { + // Idx == -1 means UNDEF + int Idx = Indices[i]; + + if (0 <= Idx && Idx < ResTyNumElts) + Using1stVec = true; + if (ResTyNumElts <= Idx && Idx < ResTyNumElts * 2) + Using2ndVec = true; + } + + for (SmallVector::iterator I = Indices.begin(); I != Indices.end(); + ++I) + Ops.push_back(DAG.getTargetConstant(*I, MaskEltTy)); + + SDValue MaskVec = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskVecTy, &Ops[0], + Ops.size()); + + if (Using1stVec && Using2ndVec) { + Op0 = Op->getOperand(0); + Op1 = Op->getOperand(1); + } else if (Using1stVec) + Op0 = Op1 = Op->getOperand(0); + else if (Using2ndVec) + Op0 = Op1 = Op->getOperand(1); + else + llvm_unreachable("shuffle vector mask references neither vector operand?"); + + return DAG.getNode(MipsISD::VSHF, DL, ResTy, MaskVec, Op0, Op1); +} + +// Lower VECTOR_SHUFFLE into one of a number of instructions depending on the +// indices in the shuffle. +SDValue MipsSETargetLowering::lowerVECTOR_SHUFFLE(SDValue Op, + SelectionDAG &DAG) const { + ShuffleVectorSDNode *Node = cast(Op); + EVT ResTy = Op->getValueType(0); + + if (!ResTy.is128BitVector()) + return SDValue(); + + int ResTyNumElts = ResTy.getVectorNumElements(); + SmallVector Indices; + + for (int i = 0; i < ResTyNumElts; ++i) + Indices.push_back(Node->getMaskElt(i)); + + return lowerVECTOR_SHUFFLE_VSHF(Op, ResTy, Indices, DAG); +} + MachineBasicBlock * MipsSETargetLowering:: emitBPOSGE32(MachineInstr *MI, MachineBasicBlock *BB) const{ // $bb: diff --git a/llvm/lib/Target/Mips/MipsSEISelLowering.h b/llvm/lib/Target/Mips/MipsSEISelLowering.h index 644fe02665cd..9b69fb5dc2a9 100644 --- a/llvm/lib/Target/Mips/MipsSEISelLowering.h +++ b/llvm/lib/Target/Mips/MipsSEISelLowering.h @@ -75,6 +75,9 @@ namespace llvm { SDValue lowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const; SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; + /// \brief Lower VECTOR_SHUFFLE into one of a number of instructions + /// depending on the indices in the shuffle. + SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; MachineBasicBlock *emitBPOSGE32(MachineInstr *MI, MachineBasicBlock *BB) const; diff --git a/llvm/test/CodeGen/Mips/msa/3r-v.ll b/llvm/test/CodeGen/Mips/msa/3r-v.ll index 055491d6a7f7..544ae9fd5d13 100644 --- a/llvm/test/CodeGen/Mips/msa/3r-v.ll +++ b/llvm/test/CodeGen/Mips/msa/3r-v.ll @@ -5,84 +5,95 @@ @llvm_mips_vshf_b_ARG1 = global <16 x i8> , align 16 @llvm_mips_vshf_b_ARG2 = global <16 x i8> , align 16 +@llvm_mips_vshf_b_ARG3 = global <16 x i8> , align 16 @llvm_mips_vshf_b_RES = global <16 x i8> , align 16 define void @llvm_mips_vshf_b_test() nounwind { entry: %0 = load <16 x i8>* @llvm_mips_vshf_b_ARG1 %1 = load <16 x i8>* @llvm_mips_vshf_b_ARG2 - %2 = tail call <16 x i8> @llvm.mips.vshf.b(<16 x i8> %0, <16 x i8> %1) - store <16 x i8> %2, <16 x i8>* @llvm_mips_vshf_b_RES + %2 = load <16 x i8>* @llvm_mips_vshf_b_ARG3 + %3 = tail call <16 x i8> @llvm.mips.vshf.b(<16 x i8> %0, <16 x i8> %1, <16 x i8> %2) + store <16 x i8> %3, <16 x i8>* @llvm_mips_vshf_b_RES ret void } -declare <16 x i8> @llvm.mips.vshf.b(<16 x i8>, <16 x i8>) nounwind +declare <16 x i8> @llvm.mips.vshf.b(<16 x i8>, <16 x i8>, <16 x i8>) nounwind ; CHECK: llvm_mips_vshf_b_test: ; CHECK: ld.b ; CHECK: ld.b +; CHECK: ld.b ; CHECK: vshf.b ; CHECK: st.b ; CHECK: .size llvm_mips_vshf_b_test ; @llvm_mips_vshf_h_ARG1 = global <8 x i16> , align 16 @llvm_mips_vshf_h_ARG2 = global <8 x i16> , align 16 +@llvm_mips_vshf_h_ARG3 = global <8 x i16> , align 16 @llvm_mips_vshf_h_RES = global <8 x i16> , align 16 define void @llvm_mips_vshf_h_test() nounwind { entry: %0 = load <8 x i16>* @llvm_mips_vshf_h_ARG1 %1 = load <8 x i16>* @llvm_mips_vshf_h_ARG2 - %2 = tail call <8 x i16> @llvm.mips.vshf.h(<8 x i16> %0, <8 x i16> %1) - store <8 x i16> %2, <8 x i16>* @llvm_mips_vshf_h_RES + %2 = load <8 x i16>* @llvm_mips_vshf_h_ARG3 + %3 = tail call <8 x i16> @llvm.mips.vshf.h(<8 x i16> %0, <8 x i16> %1, <8 x i16> %2) + store <8 x i16> %3, <8 x i16>* @llvm_mips_vshf_h_RES ret void } -declare <8 x i16> @llvm.mips.vshf.h(<8 x i16>, <8 x i16>) nounwind +declare <8 x i16> @llvm.mips.vshf.h(<8 x i16>, <8 x i16>, <8 x i16>) nounwind ; CHECK: llvm_mips_vshf_h_test: ; CHECK: ld.h ; CHECK: ld.h +; CHECK: ld.h ; CHECK: vshf.h ; CHECK: st.h ; CHECK: .size llvm_mips_vshf_h_test ; @llvm_mips_vshf_w_ARG1 = global <4 x i32> , align 16 @llvm_mips_vshf_w_ARG2 = global <4 x i32> , align 16 +@llvm_mips_vshf_w_ARG3 = global <4 x i32> , align 16 @llvm_mips_vshf_w_RES = global <4 x i32> , align 16 define void @llvm_mips_vshf_w_test() nounwind { entry: %0 = load <4 x i32>* @llvm_mips_vshf_w_ARG1 %1 = load <4 x i32>* @llvm_mips_vshf_w_ARG2 - %2 = tail call <4 x i32> @llvm.mips.vshf.w(<4 x i32> %0, <4 x i32> %1) - store <4 x i32> %2, <4 x i32>* @llvm_mips_vshf_w_RES + %2 = load <4 x i32>* @llvm_mips_vshf_w_ARG3 + %3 = tail call <4 x i32> @llvm.mips.vshf.w(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2) + store <4 x i32> %3, <4 x i32>* @llvm_mips_vshf_w_RES ret void } -declare <4 x i32> @llvm.mips.vshf.w(<4 x i32>, <4 x i32>) nounwind +declare <4 x i32> @llvm.mips.vshf.w(<4 x i32>, <4 x i32>, <4 x i32>) nounwind ; CHECK: llvm_mips_vshf_w_test: ; CHECK: ld.w ; CHECK: ld.w +; CHECK: ld.w ; CHECK: vshf.w ; CHECK: st.w ; CHECK: .size llvm_mips_vshf_w_test ; @llvm_mips_vshf_d_ARG1 = global <2 x i64> , align 16 @llvm_mips_vshf_d_ARG2 = global <2 x i64> , align 16 +@llvm_mips_vshf_d_ARG3 = global <2 x i64> , align 16 @llvm_mips_vshf_d_RES = global <2 x i64> , align 16 define void @llvm_mips_vshf_d_test() nounwind { entry: %0 = load <2 x i64>* @llvm_mips_vshf_d_ARG1 %1 = load <2 x i64>* @llvm_mips_vshf_d_ARG2 - %2 = tail call <2 x i64> @llvm.mips.vshf.d(<2 x i64> %0, <2 x i64> %1) - store <2 x i64> %2, <2 x i64>* @llvm_mips_vshf_d_RES + %2 = load <2 x i64>* @llvm_mips_vshf_d_ARG3 + %3 = tail call <2 x i64> @llvm.mips.vshf.d(<2 x i64> %0, <2 x i64> %1, <2 x i64> %2) + store <2 x i64> %3, <2 x i64>* @llvm_mips_vshf_d_RES ret void } -declare <2 x i64> @llvm.mips.vshf.d(<2 x i64>, <2 x i64>) nounwind +declare <2 x i64> @llvm.mips.vshf.d(<2 x i64>, <2 x i64>, <2 x i64>) nounwind ; CHECK: llvm_mips_vshf_d_test: ; CHECK: ld.d diff --git a/llvm/test/CodeGen/Mips/msa/shuffle.ll b/llvm/test/CodeGen/Mips/msa/shuffle.ll new file mode 100644 index 000000000000..35a5cf8658c0 --- /dev/null +++ b/llvm/test/CodeGen/Mips/msa/shuffle.ll @@ -0,0 +1,313 @@ +; RUN: llc -march=mips -mattr=+msa < %s | FileCheck %s + +define void @vshf_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind { + ; CHECK: vshf_v16i8_0: + + %1 = load <16 x i8>* %a + ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5) + %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> + ; CHECK-DAG: ld.b [[R3:\$w[0-9]+]], %lo + ; CHECK-DAG: vshf.b [[R3]], [[R1]], [[R1]] + store <16 x i8> %2, <16 x i8>* %c + ; CHECK-DAG: st.b [[R3]], 0($4) + + ret void + ; CHECK: .size vshf_v16i8_0 +} + +define void @vshf_v16i8_1(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind { + ; CHECK: vshf_v16i8_1: + + %1 = load <16 x i8>* %a + ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5) + %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> + ; CHECK-DAG: ldi.b [[R3:\$w[0-9]+]], 1 + ; CHECK-DAG: vshf.b [[R3]], [[R1]], [[R1]] + store <16 x i8> %2, <16 x i8>* %c + ; CHECK-DAG: st.b [[R3]], 0($4) + + ret void + ; CHECK: .size vshf_v16i8_1 +} + +define void @vshf_v16i8_2(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind { + ; CHECK: vshf_v16i8_2: + + %1 = load <16 x i8>* %a + %2 = load <16 x i8>* %b + ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6) + %3 = shufflevector <16 x i8> %1, <16 x i8> %2, <16 x i32> + ; CHECK-DAG: ld.b [[R3:\$w[0-9]+]], %lo + ; CHECK-DAG: vshf.b [[R3]], [[R2]], [[R2]] + store <16 x i8> %3, <16 x i8>* %c + ; CHECK-DAG: st.b [[R3]], 0($4) + + ret void + ; CHECK: .size vshf_v16i8_2 +} + +define void @vshf_v16i8_3(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind { + ; CHECK: vshf_v16i8_3: + + %1 = load <16 x i8>* %a + ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5) + %2 = load <16 x i8>* %b + ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6) + %3 = shufflevector <16 x i8> %1, <16 x i8> %2, <16 x i32> + ; CHECK-DAG: ld.b [[R3:\$w[0-9]+]], %lo + ; CHECK-DAG: vshf.b [[R3]], [[R1]], [[R2]] + store <16 x i8> %3, <16 x i8>* %c + ; CHECK-DAG: st.b [[R3]], 0($4) + + ret void + ; CHECK: .size vshf_v16i8_3 +} + +define void @vshf_v16i8_4(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind { + ; CHECK: vshf_v16i8_4: + + %1 = load <16 x i8>* %a + ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5) + %2 = shufflevector <16 x i8> %1, <16 x i8> %1, <16 x i32> + ; CHECK-DAG: ldi.b [[R3:\$w[0-9]+]], 1 + ; CHECK-DAG: vshf.b [[R3]], [[R1]], [[R1]] + store <16 x i8> %2, <16 x i8>* %c + ; CHECK-DAG: st.b [[R3]], 0($4) + + ret void + ; CHECK: .size vshf_v16i8_4 +} + +define void @vshf_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind { + ; CHECK: vshf_v8i16_0: + + %1 = load <8 x i16>* %a + ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5) + %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> + ; CHECK-DAG: ld.h [[R3:\$w[0-9]+]], %lo + ; CHECK-DAG: vshf.h [[R3]], [[R1]], [[R1]] + store <8 x i16> %2, <8 x i16>* %c + ; CHECK-DAG: st.h [[R3]], 0($4) + + ret void + ; CHECK: .size vshf_v8i16_0 +} + +define void @vshf_v8i16_1(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind { + ; CHECK: vshf_v8i16_1: + + %1 = load <8 x i16>* %a + ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5) + %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> + ; CHECK-DAG: ldi.h [[R3:\$w[0-9]+]], 1 + ; CHECK-DAG: vshf.h [[R3]], [[R1]], [[R1]] + store <8 x i16> %2, <8 x i16>* %c + ; CHECK-DAG: st.h [[R3]], 0($4) + + ret void + ; CHECK: .size vshf_v8i16_1 +} + +define void @vshf_v8i16_2(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind { + ; CHECK: vshf_v8i16_2: + + %1 = load <8 x i16>* %a + %2 = load <8 x i16>* %b + ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6) + %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> + ; CHECK-DAG: ld.h [[R3:\$w[0-9]+]], %lo + ; CHECK-DAG: vshf.h [[R3]], [[R2]], [[R2]] + store <8 x i16> %3, <8 x i16>* %c + ; CHECK-DAG: st.h [[R3]], 0($4) + + ret void + ; CHECK: .size vshf_v8i16_2 +} + +define void @vshf_v8i16_3(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind { + ; CHECK: vshf_v8i16_3: + + %1 = load <8 x i16>* %a + ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5) + %2 = load <8 x i16>* %b + ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6) + %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> + ; CHECK-DAG: ld.h [[R3:\$w[0-9]+]], %lo + ; CHECK-DAG: vshf.h [[R3]], [[R1]], [[R2]] + store <8 x i16> %3, <8 x i16>* %c + ; CHECK-DAG: st.h [[R3]], 0($4) + + ret void + ; CHECK: .size vshf_v8i16_3 +} + +define void @vshf_v8i16_4(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind { + ; CHECK: vshf_v8i16_4: + + %1 = load <8 x i16>* %a + ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5) + %2 = shufflevector <8 x i16> %1, <8 x i16> %1, <8 x i32> + ; CHECK-DAG: ldi.h [[R3:\$w[0-9]+]], 1 + ; CHECK-DAG: vshf.h [[R3]], [[R1]], [[R1]] + store <8 x i16> %2, <8 x i16>* %c + ; CHECK-DAG: st.h [[R3]], 0($4) + + ret void + ; CHECK: .size vshf_v8i16_4 +} + +define void @vshf_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind { + ; CHECK: vshf_v4i32_0: + + %1 = load <4 x i32>* %a + ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5) + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> + ; CHECK-DAG: ld.w [[R3:\$w[0-9]+]], %lo + ; CHECK-DAG: vshf.w [[R3:\$w[0-9]+]], [[R1]], [[R1]] + store <4 x i32> %2, <4 x i32>* %c + ; CHECK-DAG: st.w [[R3]], 0($4) + + ret void + ; CHECK: .size vshf_v4i32_0 +} + +define void @vshf_v4i32_1(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind { + ; CHECK: vshf_v4i32_1: + + %1 = load <4 x i32>* %a + ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5) + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> + ; CHECK-DAG: ldi.w [[R3:\$w[0-9]+]], 1 + ; CHECK-DAG: vshf.w [[R3:\$w[0-9]+]], [[R1]], [[R1]] + store <4 x i32> %2, <4 x i32>* %c + ; CHECK-DAG: st.w [[R3]], 0($4) + + ret void + ; CHECK: .size vshf_v4i32_1 +} + +define void @vshf_v4i32_2(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind { + ; CHECK: vshf_v4i32_2: + + %1 = load <4 x i32>* %a + %2 = load <4 x i32>* %b + ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6) + %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> + ; CHECK-DAG: ld.w [[R3:\$w[0-9]+]], %lo + ; CHECK-DAG: vshf.w [[R3:\$w[0-9]+]], [[R2]], [[R2]] + store <4 x i32> %3, <4 x i32>* %c + ; CHECK-DAG: st.w [[R3]], 0($4) + + ret void + ; CHECK: .size vshf_v4i32_2 +} + +define void @vshf_v4i32_3(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind { + ; CHECK: vshf_v4i32_3: + + %1 = load <4 x i32>* %a + ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5) + %2 = load <4 x i32>* %b + ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6) + %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> + ; CHECK-DAG: ld.w [[R3:\$w[0-9]+]], %lo + ; CHECK-DAG: vshf.w [[R3]], [[R1]], [[R2]] + store <4 x i32> %3, <4 x i32>* %c + ; CHECK-DAG: st.w [[R3]], 0($4) + + ret void + ; CHECK: .size vshf_v4i32_3 +} + +define void @vshf_v4i32_4(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind { + ; CHECK: vshf_v4i32_4: + + %1 = load <4 x i32>* %a + ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5) + %2 = shufflevector <4 x i32> %1, <4 x i32> %1, <4 x i32> + ; CHECK-DAG: ldi.w [[R3:\$w[0-9]+]], 1 + ; CHECK-DAG: vshf.w [[R3:\$w[0-9]+]], [[R1]], [[R1]] + store <4 x i32> %2, <4 x i32>* %c + ; CHECK-DAG: st.w [[R3]], 0($4) + + ret void + ; CHECK: .size vshf_v4i32_4 +} + +define void @vshf_v2i64_0(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind { + ; CHECK: vshf_v2i64_0: + + %1 = load <2 x i64>* %a + ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5) + %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> + ; CHECK-DAG: ld.d [[R3:\$w[0-9]+]], %lo + ; CHECK-DAG: vshf.d [[R3]], [[R1]], [[R1]] + store <2 x i64> %2, <2 x i64>* %c + ; CHECK-DAG: st.d [[R3]], 0($4) + + ret void + ; CHECK: .size vshf_v2i64_0 +} + +define void @vshf_v2i64_1(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind { + ; CHECK: vshf_v2i64_1: + + %1 = load <2 x i64>* %a + ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5) + %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> + ; CHECK-DAG: ldi.d [[R3:\$w[0-9]+]], 1 + ; CHECK-DAG: vshf.d [[R3]], [[R1]], [[R1]] + store <2 x i64> %2, <2 x i64>* %c + ; CHECK-DAG: st.d [[R3]], 0($4) + + ret void + ; CHECK: .size vshf_v2i64_1 +} + +define void @vshf_v2i64_2(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind { + ; CHECK: vshf_v2i64_2: + + %1 = load <2 x i64>* %a + %2 = load <2 x i64>* %b + ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6) + %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> + ; CHECK-DAG: ld.d [[R3:\$w[0-9]+]], %lo + ; CHECK-DAG: vshf.d [[R3]], [[R2]], [[R2]] + store <2 x i64> %3, <2 x i64>* %c + ; CHECK-DAG: st.d [[R3]], 0($4) + + ret void + ; CHECK: .size vshf_v2i64_2 +} + +define void @vshf_v2i64_3(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind { + ; CHECK: vshf_v2i64_3: + + %1 = load <2 x i64>* %a + ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5) + %2 = load <2 x i64>* %b + ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6) + %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> + ; CHECK-DAG: ld.d [[R3:\$w[0-9]+]], %lo + ; CHECK-DAG: vshf.d [[R3]], [[R1]], [[R2]] + store <2 x i64> %3, <2 x i64>* %c + ; CHECK-DAG: st.d [[R3]], 0($4) + + ret void + ; CHECK: .size vshf_v2i64_3 +} + +define void @vshf_v2i64_4(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind { + ; CHECK: vshf_v2i64_4: + + %1 = load <2 x i64>* %a + ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5) + %2 = shufflevector <2 x i64> %1, <2 x i64> %1, <2 x i32> + ; CHECK-DAG: ldi.d [[R3:\$w[0-9]+]], 1 + ; CHECK-DAG: vshf.d [[R3]], [[R1]], [[R1]] + store <2 x i64> %2, <2 x i64>* %c + ; CHECK-DAG: st.d [[R3]], 0($4) + + ret void + ; CHECK: .size vshf_v2i64_4 +}