Add lowering for AVX2 shift instructions.

llvm-svn: 144380
This commit is contained in:
Craig Topper 2011-11-11 07:39:23 +00:00
parent e89c3bd004
commit ea28a34c43
4 changed files with 377 additions and 190 deletions

View File

@ -1050,21 +1050,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::MUL, MVT::v4i64, Custom);
setOperationAction(ISD::MUL, MVT::v8i32, Legal);
setOperationAction(ISD::MUL, MVT::v16i16, Legal);
// Don't lower v32i8 because there is no 128-bit byte mul
setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
setOperationAction(ISD::SHL, MVT::v4i32, Legal);
setOperationAction(ISD::SHL, MVT::v2i64, Legal);
setOperationAction(ISD::SRL, MVT::v4i32, Legal);
setOperationAction(ISD::SRL, MVT::v2i64, Legal);
setOperationAction(ISD::SRA, MVT::v4i32, Legal);
setOperationAction(ISD::SHL, MVT::v8i32, Legal);
setOperationAction(ISD::SHL, MVT::v4i64, Legal);
setOperationAction(ISD::SRL, MVT::v8i32, Legal);
setOperationAction(ISD::SRL, MVT::v4i64, Legal);
setOperationAction(ISD::SRA, MVT::v8i32, Legal);
// Don't lower v32i8 because there is no 128-bit byte mul
} else {
setOperationAction(ISD::ADD, MVT::v4i64, Custom);
setOperationAction(ISD::ADD, MVT::v8i32, Custom);
@ -10130,47 +10118,6 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
if (!Subtarget->hasXMMInt())
return SDValue();
// Decompose 256-bit shifts into smaller 128-bit shifts.
if (VT.getSizeInBits() == 256) {
int NumElems = VT.getVectorNumElements();
MVT EltVT = VT.getVectorElementType().getSimpleVT();
EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
// Extract the two vectors
SDValue V1 = Extract128BitVector(R, DAG.getConstant(0, MVT::i32), DAG, dl);
SDValue V2 = Extract128BitVector(R, DAG.getConstant(NumElems/2, MVT::i32),
DAG, dl);
// Recreate the shift amount vectors
SDValue Amt1, Amt2;
if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
// Constant shift amount
SmallVector<SDValue, 4> Amt1Csts;
SmallVector<SDValue, 4> Amt2Csts;
for (int i = 0; i < NumElems/2; ++i)
Amt1Csts.push_back(Amt->getOperand(i));
for (int i = NumElems/2; i < NumElems; ++i)
Amt2Csts.push_back(Amt->getOperand(i));
Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
&Amt1Csts[0], NumElems/2);
Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
&Amt2Csts[0], NumElems/2);
} else {
// Variable shift amount
Amt1 = Extract128BitVector(Amt, DAG.getConstant(0, MVT::i32), DAG, dl);
Amt2 = Extract128BitVector(Amt, DAG.getConstant(NumElems/2, MVT::i32),
DAG, dl);
}
// Issue new vector shifts for the smaller types
V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1);
V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2);
// Concatenate the result back
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2);
}
// Optimize shl/srl/sra with constant shift amount.
if (isSplatVector(Amt.getNode())) {
SDValue SclrAmt = Amt->getOperand(0);
@ -10259,9 +10206,97 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
return Res;
}
if (Subtarget->hasAVX2()) {
if (VT == MVT::v4i64 && Op.getOpcode() == ISD::SHL)
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_avx2_pslli_q, MVT::i32),
R, DAG.getConstant(ShiftAmt, MVT::i32));
if (VT == MVT::v8i32 && Op.getOpcode() == ISD::SHL)
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_avx2_pslli_d, MVT::i32),
R, DAG.getConstant(ShiftAmt, MVT::i32));
if (VT == MVT::v16i16 && Op.getOpcode() == ISD::SHL)
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_avx2_pslli_w, MVT::i32),
R, DAG.getConstant(ShiftAmt, MVT::i32));
if (VT == MVT::v4i64 && Op.getOpcode() == ISD::SRL)
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_avx2_psrli_q, MVT::i32),
R, DAG.getConstant(ShiftAmt, MVT::i32));
if (VT == MVT::v8i32 && Op.getOpcode() == ISD::SRL)
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_avx2_psrli_d, MVT::i32),
R, DAG.getConstant(ShiftAmt, MVT::i32));
if (VT == MVT::v16i16 && Op.getOpcode() == ISD::SRL)
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_avx2_psrli_w, MVT::i32),
R, DAG.getConstant(ShiftAmt, MVT::i32));
if (VT == MVT::v8i32 && Op.getOpcode() == ISD::SRA)
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_avx2_psrai_d, MVT::i32),
R, DAG.getConstant(ShiftAmt, MVT::i32));
if (VT == MVT::v16i16 && Op.getOpcode() == ISD::SRA)
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_avx2_psrai_w, MVT::i32),
R, DAG.getConstant(ShiftAmt, MVT::i32));
}
}
}
// AVX2 variable shifts
if (Subtarget->hasAVX2()) {
if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL)
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_avx2_psllv_d, MVT::i32),
R, Amt);
if (VT == MVT::v8i32 && Op->getOpcode() == ISD::SHL)
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_avx2_psllv_d_256, MVT::i32),
R, Amt);
if (VT == MVT::v2i64 && Op->getOpcode() == ISD::SHL)
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_avx2_psllv_q, MVT::i32),
R, Amt);
if (VT == MVT::v4i64 && Op->getOpcode() == ISD::SHL)
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_avx2_psllv_q_256, MVT::i32),
R, Amt);
if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SRL)
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_avx2_psrlv_d, MVT::i32),
R, Amt);
if (VT == MVT::v8i32 && Op->getOpcode() == ISD::SRL)
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_avx2_psrlv_d_256, MVT::i32),
R, Amt);
if (VT == MVT::v2i64 && Op->getOpcode() == ISD::SRL)
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_avx2_psrlv_q, MVT::i32),
R, Amt);
if (VT == MVT::v4i64 && Op->getOpcode() == ISD::SRL)
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_avx2_psrlv_q_256, MVT::i32),
R, Amt);
if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SRA)
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_avx2_psrav_d, MVT::i32),
R, Amt);
if (VT == MVT::v8i32 && Op->getOpcode() == ISD::SRA)
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_avx2_psrav_d_256, MVT::i32),
R, Amt);
}
// Lower SHL with variable shift amount.
if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
@ -10328,6 +10363,48 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
R, DAG.getNode(ISD::ADD, dl, VT, R, R));
return R;
}
// Decompose 256-bit shifts into smaller 128-bit shifts.
if (VT.getSizeInBits() == 256) {
int NumElems = VT.getVectorNumElements();
MVT EltVT = VT.getVectorElementType().getSimpleVT();
EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
// Extract the two vectors
SDValue V1 = Extract128BitVector(R, DAG.getConstant(0, MVT::i32), DAG, dl);
SDValue V2 = Extract128BitVector(R, DAG.getConstant(NumElems/2, MVT::i32),
DAG, dl);
// Recreate the shift amount vectors
SDValue Amt1, Amt2;
if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
// Constant shift amount
SmallVector<SDValue, 4> Amt1Csts;
SmallVector<SDValue, 4> Amt2Csts;
for (int i = 0; i < NumElems/2; ++i)
Amt1Csts.push_back(Amt->getOperand(i));
for (int i = NumElems/2; i < NumElems; ++i)
Amt2Csts.push_back(Amt->getOperand(i));
Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
&Amt1Csts[0], NumElems/2);
Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
&Amt2Csts[0], NumElems/2);
} else {
// Variable shift amount
Amt1 = Extract128BitVector(Amt, DAG.getConstant(0, MVT::i32), DAG, dl);
Amt2 = Extract128BitVector(Amt, DAG.getConstant(NumElems/2, MVT::i32),
DAG, dl);
}
// Issue new vector shifts for the smaller types
V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1);
V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2);
// Concatenate the result back
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2);
}
return SDValue();
}

View File

@ -7655,7 +7655,6 @@ defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
// Variable Bit Shifts
//
multiclass avx2_var_shift<bits<8> opc, string OpcodeStr,
PatFrag pf128, PatFrag pf256,
Intrinsic Int128, Intrinsic Int256> {
def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
@ -7664,7 +7663,8 @@ multiclass avx2_var_shift<bits<8> opc, string OpcodeStr,
def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst, (Int128 VR128:$src1, (pf128 addr:$src2)))]>,
[(set VR128:$dst,
(Int128 VR128:$src1, (bitconvert (memopv2i64 addr:$src2))))]>,
VEX_4V;
def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2),
@ -7673,70 +7673,43 @@ multiclass avx2_var_shift<bits<8> opc, string OpcodeStr,
def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, i256mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst, (Int256 VR256:$src1, (pf256 addr:$src2)))]>,
[(set VR256:$dst,
(Int256 VR256:$src1, (bitconvert (memopv4i64 addr:$src2))))]>,
VEX_4V;
}
defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", memopv4i32, memopv8i32,
int_x86_avx2_psllv_d, int_x86_avx2_psllv_d_256>;
defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", memopv2i64, memopv4i64,
int_x86_avx2_psllv_q, int_x86_avx2_psllv_q_256>,
VEX_W;
defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", memopv4i32, memopv8i32,
int_x86_avx2_psrlv_d, int_x86_avx2_psrlv_d_256>;
defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", memopv2i64, memopv4i64,
int_x86_avx2_psrlv_q, int_x86_avx2_psrlv_q_256>,
VEX_W;
defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", memopv4i32, memopv8i32,
int_x86_avx2_psrav_d, int_x86_avx2_psrav_d_256>;
let Predicates = [HasAVX2] in {
def : Pat<(v4i32 (shl (v4i32 VR128:$src1), (v4i32 VR128:$src2))),
(VPSLLVDrr VR128:$src1, VR128:$src2)>;
def : Pat<(v2i64 (shl (v2i64 VR128:$src1), (v2i64 VR128:$src2))),
(VPSLLVQrr VR128:$src1, VR128:$src2)>;
def : Pat<(v4i32 (srl (v4i32 VR128:$src1), (v4i32 VR128:$src2))),
(VPSRLVDrr VR128:$src1, VR128:$src2)>;
def : Pat<(v2i64 (srl (v2i64 VR128:$src1), (v2i64 VR128:$src2))),
(VPSRLVQrr VR128:$src1, VR128:$src2)>;
def : Pat<(v4i32 (sra (v4i32 VR128:$src1), (v4i32 VR128:$src2))),
(VPSRAVDrr VR128:$src1, VR128:$src2)>;
def : Pat<(v8i32 (shl (v8i32 VR256:$src1), (v8i32 VR256:$src2))),
(VPSLLVDYrr VR256:$src1, VR256:$src2)>;
def : Pat<(v4i64 (shl (v4i64 VR256:$src1), (v4i64 VR256:$src2))),
(VPSLLVQYrr VR256:$src1, VR256:$src2)>;
def : Pat<(v8i32 (srl (v8i32 VR256:$src1), (v8i32 VR256:$src2))),
(VPSRLVDYrr VR256:$src1, VR256:$src2)>;
def : Pat<(v4i64 (srl (v4i64 VR256:$src1), (v4i64 VR256:$src2))),
(VPSRLVQYrr VR256:$src1, VR256:$src2)>;
def : Pat<(v8i32 (sra (v8i32 VR256:$src1), (v8i32 VR256:$src2))),
(VPSRAVDYrr VR256:$src1, VR256:$src2)>;
def : Pat<(v4i32 (shl (v4i32 VR128:$src1),(loadv4i32 addr:$src2))),
(VPSLLVDrm VR128:$src1, addr:$src2)>;
def : Pat<(v4i32 (shl (v4i32 VR128:$src1),(loadv2i64 addr:$src2))),
(VPSLLVDrm VR128:$src1, addr:$src2)>;
def : Pat<(v2i64 (shl (v2i64 VR128:$src1),(loadv2i64 addr:$src2))),
(VPSLLVQrm VR128:$src1, addr:$src2)>;
def : Pat<(v4i32 (srl (v4i32 VR128:$src1),(loadv4i32 addr:$src2))),
(VPSRLVDrm VR128:$src1, addr:$src2)>;
def : Pat<(v2i64 (srl (v2i64 VR128:$src1),(loadv2i64 addr:$src2))),
(VPSRLVQrm VR128:$src1, addr:$src2)>;
def : Pat<(v4i32 (sra (v4i32 VR128:$src1),(loadv4i32 addr:$src2))),
(VPSRAVDrm VR128:$src1, addr:$src2)>;
def : Pat<(v8i32 (shl (v8i32 VR256:$src1),(loadv8i32 addr:$src2))),
(VPSLLVDYrm VR256:$src1, addr:$src2)>;
def : Pat<(v4i64 (shl (v4i64 VR256:$src1),(loadv4i64 addr:$src2))),
(VPSLLVQYrm VR256:$src1, addr:$src2)>;
def : Pat<(v8i32 (srl (v8i32 VR256:$src1),(loadv8i32 addr:$src2))),
(VPSRLVDYrm VR256:$src1, addr:$src2)>;
def : Pat<(v4i64 (srl (v4i64 VR256:$src1),(loadv4i64 addr:$src2))),
(VPSRLVQYrm VR256:$src1, addr:$src2)>;
def : Pat<(v8i32 (sra (v8i32 VR256:$src1),(loadv8i32 addr:$src2))),
(VPSRAVDYrm VR256:$src1, addr:$src2)>;
multiclass avx2_var_shift_i64<bits<8> opc, string OpcodeStr,
Intrinsic Int128, Intrinsic Int256> {
def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst, (Int128 VR128:$src1, VR128:$src2))]>, VEX_4V;
def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
(Int128 VR128:$src1, (memopv2i64 addr:$src2)))]>,
VEX_4V;
def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst, (Int256 VR256:$src1, VR256:$src2))]>, VEX_4V;
def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, i256mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
(Int256 VR256:$src1, (memopv4i64 addr:$src2)))]>,
VEX_4V;
}
defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", int_x86_avx2_psllv_d,
int_x86_avx2_psllv_d_256>;
defm VPSLLVQ : avx2_var_shift_i64<0x47, "vpsllvq", int_x86_avx2_psllv_q,
int_x86_avx2_psllv_q_256>, VEX_W;
defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", int_x86_avx2_psrlv_d,
int_x86_avx2_psrlv_d_256>;
defm VPSRLVQ : avx2_var_shift_i64<0x45, "vpsrlvq", int_x86_avx2_psrlv_q,
int_x86_avx2_psrlv_q_256>, VEX_W;
defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", int_x86_avx2_psrav_d,
int_x86_avx2_psrav_d_256>;

View File

@ -53,76 +53,3 @@ define <32 x i8> @vpblendvb(<32 x i8> %x, <32 x i8> %y) {
%min = select <32 x i1> %min_is_x, <32 x i8> %x, <32 x i8> %y
ret <32 x i8> %min
}
; CHECK: variable_shl0
; CHECK: psllvd
; CHECK: ret
define <4 x i32> @variable_shl0(<4 x i32> %x, <4 x i32> %y) {
%k = shl <4 x i32> %x, %y
ret <4 x i32> %k
}
; CHECK: variable_shl1
; CHECK: psllvd
; CHECK: ret
define <8 x i32> @variable_shl1(<8 x i32> %x, <8 x i32> %y) {
%k = shl <8 x i32> %x, %y
ret <8 x i32> %k
}
; CHECK: variable_shl2
; CHECK: psllvq
; CHECK: ret
define <2 x i64> @variable_shl2(<2 x i64> %x, <2 x i64> %y) {
%k = shl <2 x i64> %x, %y
ret <2 x i64> %k
}
; CHECK: variable_shl3
; CHECK: psllvq
; CHECK: ret
define <4 x i64> @variable_shl3(<4 x i64> %x, <4 x i64> %y) {
%k = shl <4 x i64> %x, %y
ret <4 x i64> %k
}
; CHECK: variable_srl0
; CHECK: psrlvd
; CHECK: ret
define <4 x i32> @variable_srl0(<4 x i32> %x, <4 x i32> %y) {
%k = lshr <4 x i32> %x, %y
ret <4 x i32> %k
}
; CHECK: variable_srl1
; CHECK: psrlvd
; CHECK: ret
define <8 x i32> @variable_srl1(<8 x i32> %x, <8 x i32> %y) {
%k = lshr <8 x i32> %x, %y
ret <8 x i32> %k
}
; CHECK: variable_srl2
; CHECK: psrlvq
; CHECK: ret
define <2 x i64> @variable_srl2(<2 x i64> %x, <2 x i64> %y) {
%k = lshr <2 x i64> %x, %y
ret <2 x i64> %k
}
; CHECK: variable_srl3
; CHECK: psrlvq
; CHECK: ret
define <4 x i64> @variable_srl3(<4 x i64> %x, <4 x i64> %y) {
%k = lshr <4 x i64> %x, %y
ret <4 x i64> %k
}
; CHECK: variable_sra0
; CHECK: psravd
; CHECK: ret
define <4 x i32> @variable_sra0(<4 x i32> %x, <4 x i32> %y) {
%k = ashr <4 x i32> %x, %y
ret <4 x i32> %k
}
; CHECK: variable_sra1
; CHECK: psravd
; CHECK: ret
define <8 x i32> @variable_sra1(<8 x i32> %x, <8 x i32> %y) {
%k = ashr <8 x i32> %x, %y
ret <8 x i32> %k
}

View File

@ -0,0 +1,210 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s
; CHECK: variable_shl0
; CHECK: psllvd
; CHECK: ret
define <4 x i32> @variable_shl0(<4 x i32> %x, <4 x i32> %y) {
%k = shl <4 x i32> %x, %y
ret <4 x i32> %k
}
; CHECK: variable_shl1
; CHECK: psllvd
; CHECK: ret
define <8 x i32> @variable_shl1(<8 x i32> %x, <8 x i32> %y) {
%k = shl <8 x i32> %x, %y
ret <8 x i32> %k
}
; CHECK: variable_shl2
; CHECK: psllvq
; CHECK: ret
define <2 x i64> @variable_shl2(<2 x i64> %x, <2 x i64> %y) {
%k = shl <2 x i64> %x, %y
ret <2 x i64> %k
}
; CHECK: variable_shl3
; CHECK: psllvq
; CHECK: ret
define <4 x i64> @variable_shl3(<4 x i64> %x, <4 x i64> %y) {
%k = shl <4 x i64> %x, %y
ret <4 x i64> %k
}
; CHECK: variable_srl0
; CHECK: psrlvd
; CHECK: ret
define <4 x i32> @variable_srl0(<4 x i32> %x, <4 x i32> %y) {
%k = lshr <4 x i32> %x, %y
ret <4 x i32> %k
}
; CHECK: variable_srl1
; CHECK: psrlvd
; CHECK: ret
define <8 x i32> @variable_srl1(<8 x i32> %x, <8 x i32> %y) {
%k = lshr <8 x i32> %x, %y
ret <8 x i32> %k
}
; CHECK: variable_srl2
; CHECK: psrlvq
; CHECK: ret
define <2 x i64> @variable_srl2(<2 x i64> %x, <2 x i64> %y) {
%k = lshr <2 x i64> %x, %y
ret <2 x i64> %k
}
; CHECK: variable_srl3
; CHECK: psrlvq
; CHECK: ret
define <4 x i64> @variable_srl3(<4 x i64> %x, <4 x i64> %y) {
%k = lshr <4 x i64> %x, %y
ret <4 x i64> %k
}
; CHECK: variable_sra0
; CHECK: psravd
; CHECK: ret
define <4 x i32> @variable_sra0(<4 x i32> %x, <4 x i32> %y) {
%k = ashr <4 x i32> %x, %y
ret <4 x i32> %k
}
; CHECK: variable_sra1
; CHECK: psravd
; CHECK: ret
define <8 x i32> @variable_sra1(<8 x i32> %x, <8 x i32> %y) {
%k = ashr <8 x i32> %x, %y
ret <8 x i32> %k
}
;;; Shift left
; CHECK: vpslld
define <8 x i32> @vshift00(<8 x i32> %a) nounwind readnone {
%s = shl <8 x i32> %a, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32
2>
ret <8 x i32> %s
}
; CHECK: vpsllw
define <16 x i16> @vshift01(<16 x i16> %a) nounwind readnone {
%s = shl <16 x i16> %a, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
ret <16 x i16> %s
}
; CHECK: vpsllq
define <4 x i64> @vshift02(<4 x i64> %a) nounwind readnone {
%s = shl <4 x i64> %a, <i64 2, i64 2, i64 2, i64 2>
ret <4 x i64> %s
}
;;; Logical Shift right
; CHECK: vpsrld
define <8 x i32> @vshift03(<8 x i32> %a) nounwind readnone {
%s = lshr <8 x i32> %a, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32
2>
ret <8 x i32> %s
}
; CHECK: vpsrlw
define <16 x i16> @vshift04(<16 x i16> %a) nounwind readnone {
%s = lshr <16 x i16> %a, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
ret <16 x i16> %s
}
; CHECK: vpsrlq
define <4 x i64> @vshift05(<4 x i64> %a) nounwind readnone {
%s = lshr <4 x i64> %a, <i64 2, i64 2, i64 2, i64 2>
ret <4 x i64> %s
}
;;; Arithmetic Shift right
; CHECK: vpsrad
define <8 x i32> @vshift06(<8 x i32> %a) nounwind readnone {
%s = ashr <8 x i32> %a, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32
2>
ret <8 x i32> %s
}
; CHECK: vpsraw
define <16 x i16> @vshift07(<16 x i16> %a) nounwind readnone {
%s = ashr <16 x i16> %a, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
ret <16 x i16> %s
}
; CHECK: variable_sra0_load
; CHECK: psravd (%
; CHECK: ret
define <4 x i32> @variable_sra0_load(<4 x i32> %x, <4 x i32>* %y) {
%y1 = load <4 x i32>* %y
%k = ashr <4 x i32> %x, %y1
ret <4 x i32> %k
}
; CHECK: variable_sra1_load
; CHECK: psravd (%
; CHECK: ret
define <8 x i32> @variable_sra1_load(<8 x i32> %x, <8 x i32>* %y) {
%y1 = load <8 x i32>* %y
%k = ashr <8 x i32> %x, %y1
ret <8 x i32> %k
}
; CHECK: variable_shl0_load
; CHECK: psllvd (%
; CHECK: ret
define <4 x i32> @variable_shl0_load(<4 x i32> %x, <4 x i32>* %y) {
%y1 = load <4 x i32>* %y
%k = shl <4 x i32> %x, %y1
ret <4 x i32> %k
}
; CHECK: variable_shl1_load
; CHECK: psllvd (%
; CHECK: ret
define <8 x i32> @variable_shl1_load(<8 x i32> %x, <8 x i32>* %y) {
%y1 = load <8 x i32>* %y
%k = shl <8 x i32> %x, %y1
ret <8 x i32> %k
}
; CHECK: variable_shl2_load
; CHECK: psllvq (%
; CHECK: ret
define <2 x i64> @variable_shl2_load(<2 x i64> %x, <2 x i64>* %y) {
%y1 = load <2 x i64>* %y
%k = shl <2 x i64> %x, %y1
ret <2 x i64> %k
}
; CHECK: variable_shl3_load
; CHECK: psllvq (%
; CHECK: ret
define <4 x i64> @variable_shl3_load(<4 x i64> %x, <4 x i64>* %y) {
%y1 = load <4 x i64>* %y
%k = shl <4 x i64> %x, %y1
ret <4 x i64> %k
}
; CHECK: variable_srl0_load
; CHECK: psrlvd (%
; CHECK: ret
define <4 x i32> @variable_srl0_load(<4 x i32> %x, <4 x i32>* %y) {
%y1 = load <4 x i32>* %y
%k = lshr <4 x i32> %x, %y1
ret <4 x i32> %k
}
; CHECK: variable_srl1_load
; CHECK: psrlvd (%
; CHECK: ret
define <8 x i32> @variable_srl1_load(<8 x i32> %x, <8 x i32>* %y) {
%y1 = load <8 x i32>* %y
%k = lshr <8 x i32> %x, %y1
ret <8 x i32> %k
}
; CHECK: variable_srl2_load
; CHECK: psrlvq (%
; CHECK: ret
define <2 x i64> @variable_srl2_load(<2 x i64> %x, <2 x i64>* %y) {
%y1 = load <2 x i64>* %y
%k = lshr <2 x i64> %x, %y1
ret <2 x i64> %k
}
; CHECK: variable_srl3_load
; CHECK: psrlvq (%
; CHECK: ret
define <4 x i64> @variable_srl3_load(<4 x i64> %x, <4 x i64>* %y) {
%y1 = load <4 x i64>* %y
%k = lshr <4 x i64> %x, %y1
ret <4 x i64> %k
}