forked from OSchip/llvm-project
[X86][SSE] Replace 128-bit SSE41 PMOVSX intrinsics with native IR
This patch removes the x86.sse41.pmovsx* intrinsics, provides a suitable upgrade path, and updates the relevant tests to sign-extend a subvector instead. LLVM counterpart to D12835. Differential Revision: http://reviews.llvm.org/D13002. llvm-svn: 248368
This commit is contained in:
parent
d89ae9d3ac
commit
9cb018b6b6
|
@ -132,6 +132,7 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
|
|||
Name.startswith("x86.avx2.vbroadcast") ||
|
||||
Name.startswith("x86.avx2.pbroadcast") ||
|
||||
Name.startswith("x86.avx.vpermil.") ||
|
||||
Name.startswith("x86.sse41.pmovsx") ||
|
||||
Name == "x86.avx.vinsertf128.pd.256" ||
|
||||
Name == "x86.avx.vinsertf128.ps.256" ||
|
||||
Name == "x86.avx.vinsertf128.si.256" ||
|
||||
|
@ -440,6 +441,19 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
|
|||
for (unsigned I = 0; I < EltNum; ++I)
|
||||
Rep = Builder.CreateInsertElement(Rep, Load,
|
||||
ConstantInt::get(I32Ty, I));
|
||||
} else if (Name.startswith("llvm.x86.sse41.pmovsx")) {
|
||||
VectorType *SrcTy = cast<VectorType>(CI->getArgOperand(0)->getType());
|
||||
VectorType *DstTy = cast<VectorType>(CI->getType());
|
||||
unsigned NumDstElts = DstTy->getNumElements();
|
||||
|
||||
// Extract a subvector of the first NumDstElts lanes and sign extend.
|
||||
SmallVector<int, 8> ShuffleMask;
|
||||
for (int i = 0; i != (int)NumDstElts; ++i)
|
||||
ShuffleMask.push_back(i);
|
||||
|
||||
Value *SV = Builder.CreateShuffleVector(
|
||||
CI->getArgOperand(0), UndefValue::get(SrcTy), ShuffleMask);
|
||||
Rep = Builder.CreateSExt(SV, DstTy);
|
||||
} else if (Name == "llvm.x86.avx2.vbroadcasti128") {
|
||||
// Replace vbroadcasts with a vector shuffle.
|
||||
Type *VT = VectorType::get(Type::getInt64Ty(C), 2);
|
||||
|
@ -527,10 +541,10 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
|
|||
unsigned Imm = cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue();
|
||||
VectorType *VecTy = cast<VectorType>(CI->getType());
|
||||
unsigned NumElts = VecTy->getNumElements();
|
||||
|
||||
|
||||
// Mask off the high bits of the immediate value; hardware ignores those.
|
||||
Imm = Imm & 1;
|
||||
|
||||
|
||||
// Extend the second operand into a vector that is twice as big.
|
||||
Value *UndefV = UndefValue::get(Op1->getType());
|
||||
SmallVector<Constant*, 8> Idxs;
|
||||
|
@ -572,7 +586,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
|
|||
unsigned Imm = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
|
||||
VectorType *VecTy = cast<VectorType>(CI->getType());
|
||||
unsigned NumElts = VecTy->getNumElements();
|
||||
|
||||
|
||||
// Mask off the high bits of the immediate value; hardware ignores those.
|
||||
Imm = Imm & 1;
|
||||
|
||||
|
|
|
@ -681,13 +681,13 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
|
|||
X86ISD::EXPAND, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_expand_q_512, COMPRESS_EXPAND_IN_REG,
|
||||
X86ISD::EXPAND, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_128, FPCLASS, X86ISD::VFPCLASS, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_256, FPCLASS, X86ISD::VFPCLASS, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_512, FPCLASS, X86ISD::VFPCLASS, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_fpclass_ps_128, FPCLASS, X86ISD::VFPCLASS, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_fpclass_ps_256, FPCLASS, X86ISD::VFPCLASS, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_128, FPCLASS, X86ISD::VFPCLASS, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_256, FPCLASS, X86ISD::VFPCLASS, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_512, FPCLASS, X86ISD::VFPCLASS, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_fpclass_ps_128, FPCLASS, X86ISD::VFPCLASS, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_fpclass_ps_256, FPCLASS, X86ISD::VFPCLASS, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_fpclass_ps_512, FPCLASS, X86ISD::VFPCLASS, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_getexp_pd_128, INTR_TYPE_1OP_MASK_RM,
|
||||
X86_INTRINSIC_DATA(avx512_mask_getexp_pd_128, INTR_TYPE_1OP_MASK_RM,
|
||||
X86ISD::FGETEXP_RND, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_getexp_pd_256, INTR_TYPE_1OP_MASK_RM,
|
||||
X86ISD::FGETEXP_RND, 0),
|
||||
|
@ -1628,12 +1628,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
|
|||
X86_INTRINSIC_DATA(sse41_pminsd, INTR_TYPE_2OP, ISD::SMIN, 0),
|
||||
X86_INTRINSIC_DATA(sse41_pminud, INTR_TYPE_2OP, ISD::UMIN, 0),
|
||||
X86_INTRINSIC_DATA(sse41_pminuw, INTR_TYPE_2OP, ISD::UMIN, 0),
|
||||
X86_INTRINSIC_DATA(sse41_pmovsxbd, INTR_TYPE_1OP, X86ISD::VSEXT, 0),
|
||||
X86_INTRINSIC_DATA(sse41_pmovsxbq, INTR_TYPE_1OP, X86ISD::VSEXT, 0),
|
||||
X86_INTRINSIC_DATA(sse41_pmovsxbw, INTR_TYPE_1OP, X86ISD::VSEXT, 0),
|
||||
X86_INTRINSIC_DATA(sse41_pmovsxdq, INTR_TYPE_1OP, X86ISD::VSEXT, 0),
|
||||
X86_INTRINSIC_DATA(sse41_pmovsxwd, INTR_TYPE_1OP, X86ISD::VSEXT, 0),
|
||||
X86_INTRINSIC_DATA(sse41_pmovsxwq, INTR_TYPE_1OP, X86ISD::VSEXT, 0),
|
||||
X86_INTRINSIC_DATA(sse41_pmovzxbd, INTR_TYPE_1OP, X86ISD::VZEXT, 0),
|
||||
X86_INTRINSIC_DATA(sse41_pmovzxbq, INTR_TYPE_1OP, X86ISD::VZEXT, 0),
|
||||
X86_INTRINSIC_DATA(sse41_pmovzxbw, INTR_TYPE_1OP, X86ISD::VZEXT, 0),
|
||||
|
|
|
@ -936,12 +936,6 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
|
|||
break;
|
||||
}
|
||||
|
||||
case Intrinsic::x86_sse41_pmovsxbd:
|
||||
case Intrinsic::x86_sse41_pmovsxbq:
|
||||
case Intrinsic::x86_sse41_pmovsxbw:
|
||||
case Intrinsic::x86_sse41_pmovsxdq:
|
||||
case Intrinsic::x86_sse41_pmovsxwd:
|
||||
case Intrinsic::x86_sse41_pmovsxwq:
|
||||
case Intrinsic::x86_avx2_pmovsxbd:
|
||||
case Intrinsic::x86_avx2_pmovsxbq:
|
||||
case Intrinsic::x86_avx2_pmovsxbw:
|
||||
|
|
|
@ -143,3 +143,69 @@ define <8 x i16> @test_x86_sse41_pblendw(<8 x i16> %a0, <8 x i16> %a1) {
|
|||
ret <8 x i16> %res
|
||||
}
|
||||
declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind readnone
|
||||
|
||||
|
||||
define <4 x i32> @test_x86_sse41_pmovsxbd(<16 x i8> %a0) {
|
||||
; CHECK-LABEL: test_x86_sse41_pmovsxbd:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vpmovsxbd %xmm0, %xmm0
|
||||
; CHECK-NEXT: retl
|
||||
%res = call <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8> %a0) ; <<4 x i32>> [#uses=1]
|
||||
ret <4 x i32> %res
|
||||
}
|
||||
declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>) nounwind readnone
|
||||
|
||||
|
||||
define <2 x i64> @test_x86_sse41_pmovsxbq(<16 x i8> %a0) {
|
||||
; CHECK-LABEL: test_x86_sse41_pmovsxbq:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vpmovsxbq %xmm0, %xmm0
|
||||
; CHECK-NEXT: retl
|
||||
%res = call <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8> %a0) ; <<2 x i64>> [#uses=1]
|
||||
ret <2 x i64> %res
|
||||
}
|
||||
declare <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8>) nounwind readnone
|
||||
|
||||
|
||||
define <8 x i16> @test_x86_sse41_pmovsxbw(<16 x i8> %a0) {
|
||||
; CHECK-LABEL: test_x86_sse41_pmovsxbw:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vpmovsxbw %xmm0, %xmm0
|
||||
; CHECK-NEXT: retl
|
||||
%res = call <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8> %a0) ; <<8 x i16>> [#uses=1]
|
||||
ret <8 x i16> %res
|
||||
}
|
||||
declare <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8>) nounwind readnone
|
||||
|
||||
|
||||
define <2 x i64> @test_x86_sse41_pmovsxdq(<4 x i32> %a0) {
|
||||
; CHECK-LABEL: test_x86_sse41_pmovsxdq:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vpmovsxdq %xmm0, %xmm0
|
||||
; CHECK-NEXT: retl
|
||||
%res = call <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32> %a0) ; <<2 x i64>> [#uses=1]
|
||||
ret <2 x i64> %res
|
||||
}
|
||||
declare <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32>) nounwind readnone
|
||||
|
||||
|
||||
define <4 x i32> @test_x86_sse41_pmovsxwd(<8 x i16> %a0) {
|
||||
; CHECK-LABEL: test_x86_sse41_pmovsxwd:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vpmovsxwd %xmm0, %xmm0
|
||||
; CHECK-NEXT: retl
|
||||
%res = call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %a0) ; <<4 x i32>> [#uses=1]
|
||||
ret <4 x i32> %res
|
||||
}
|
||||
declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone
|
||||
|
||||
|
||||
define <2 x i64> @test_x86_sse41_pmovsxwq(<8 x i16> %a0) {
|
||||
; CHECK-LABEL: test_x86_sse41_pmovsxwq:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vpmovsxwq %xmm0, %xmm0
|
||||
; CHECK-NEXT: retl
|
||||
%res = call <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16> %a0) ; <<2 x i64>> [#uses=1]
|
||||
ret <2 x i64> %res
|
||||
}
|
||||
declare <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16>) nounwind readnone
|
||||
|
|
|
@ -1251,72 +1251,6 @@ define <8 x i16> @test_x86_sse41_pminuw(<8 x i16> %a0, <8 x i16> %a1) {
|
|||
declare <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16>, <8 x i16>) nounwind readnone
|
||||
|
||||
|
||||
define <4 x i32> @test_x86_sse41_pmovsxbd(<16 x i8> %a0) {
|
||||
; CHECK-LABEL: test_x86_sse41_pmovsxbd:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: vpmovsxbd %xmm0, %xmm0
|
||||
; CHECK-NEXT: retl
|
||||
%res = call <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8> %a0) ; <<4 x i32>> [#uses=1]
|
||||
ret <4 x i32> %res
|
||||
}
|
||||
declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>) nounwind readnone
|
||||
|
||||
|
||||
define <2 x i64> @test_x86_sse41_pmovsxbq(<16 x i8> %a0) {
|
||||
; CHECK-LABEL: test_x86_sse41_pmovsxbq:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: vpmovsxbq %xmm0, %xmm0
|
||||
; CHECK-NEXT: retl
|
||||
%res = call <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8> %a0) ; <<2 x i64>> [#uses=1]
|
||||
ret <2 x i64> %res
|
||||
}
|
||||
declare <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8>) nounwind readnone
|
||||
|
||||
|
||||
define <8 x i16> @test_x86_sse41_pmovsxbw(<16 x i8> %a0) {
|
||||
; CHECK-LABEL: test_x86_sse41_pmovsxbw:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: vpmovsxbw %xmm0, %xmm0
|
||||
; CHECK-NEXT: retl
|
||||
%res = call <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8> %a0) ; <<8 x i16>> [#uses=1]
|
||||
ret <8 x i16> %res
|
||||
}
|
||||
declare <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8>) nounwind readnone
|
||||
|
||||
|
||||
define <2 x i64> @test_x86_sse41_pmovsxdq(<4 x i32> %a0) {
|
||||
; CHECK-LABEL: test_x86_sse41_pmovsxdq:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: vpmovsxdq %xmm0, %xmm0
|
||||
; CHECK-NEXT: retl
|
||||
%res = call <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32> %a0) ; <<2 x i64>> [#uses=1]
|
||||
ret <2 x i64> %res
|
||||
}
|
||||
declare <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32>) nounwind readnone
|
||||
|
||||
|
||||
define <4 x i32> @test_x86_sse41_pmovsxwd(<8 x i16> %a0) {
|
||||
; CHECK-LABEL: test_x86_sse41_pmovsxwd:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: vpmovsxwd %xmm0, %xmm0
|
||||
; CHECK-NEXT: retl
|
||||
%res = call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %a0) ; <<4 x i32>> [#uses=1]
|
||||
ret <4 x i32> %res
|
||||
}
|
||||
declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone
|
||||
|
||||
|
||||
define <2 x i64> @test_x86_sse41_pmovsxwq(<8 x i16> %a0) {
|
||||
; CHECK-LABEL: test_x86_sse41_pmovsxwq:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: vpmovsxwq %xmm0, %xmm0
|
||||
; CHECK-NEXT: retl
|
||||
%res = call <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16> %a0) ; <<2 x i64>> [#uses=1]
|
||||
ret <2 x i64> %res
|
||||
}
|
||||
declare <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16>) nounwind readnone
|
||||
|
||||
|
||||
define <4 x i32> @test_x86_sse41_pmovzxbd(<16 x i8> %a0) {
|
||||
; CHECK-LABEL: test_x86_sse41_pmovzxbd:
|
||||
; CHECK: ## BB#0:
|
||||
|
@ -3378,7 +3312,7 @@ define void @movnt_dq(i8* %p, <2 x i64> %a1) nounwind {
|
|||
; CHECK-LABEL: movnt_dq:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; CHECK-NEXT: vpaddq LCPI282_0, %xmm0, %xmm0
|
||||
; CHECK-NEXT: vpaddq LCPI276_0, %xmm0, %xmm0
|
||||
; CHECK-NEXT: vmovntdq %ymm0, (%eax)
|
||||
; CHECK-NEXT: vzeroupper
|
||||
; CHECK-NEXT: retl
|
||||
|
|
|
@ -42,7 +42,6 @@ define <4 x float> @test_x86_sse41_insertps(<4 x float> %a0, <4 x float> %a1) {
|
|||
declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
|
||||
|
||||
|
||||
|
||||
define <8 x i16> @test_x86_sse41_mpsadbw(<16 x i8> %a0, <16 x i8> %a1) {
|
||||
; CHECK: mpsadbw
|
||||
%res = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i32 7) ; <<8 x i16>> [#uses=1]
|
||||
|
@ -59,3 +58,49 @@ define <8 x i16> @test_x86_sse41_pblendw(<8 x i16> %a0, <8 x i16> %a1) {
|
|||
declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i32) nounwind readnone
|
||||
|
||||
|
||||
define <4 x i32> @test_x86_sse41_pmovsxbd(<16 x i8> %a0) {
|
||||
; CHECK: pmovsxbd
|
||||
%res = call <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8> %a0) ; <<4 x i32>> [#uses=1]
|
||||
ret <4 x i32> %res
|
||||
}
|
||||
declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>) nounwind readnone
|
||||
|
||||
|
||||
define <2 x i64> @test_x86_sse41_pmovsxbq(<16 x i8> %a0) {
|
||||
; CHECK: pmovsxbq
|
||||
%res = call <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8> %a0) ; <<2 x i64>> [#uses=1]
|
||||
ret <2 x i64> %res
|
||||
}
|
||||
declare <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8>) nounwind readnone
|
||||
|
||||
|
||||
define <8 x i16> @test_x86_sse41_pmovsxbw(<16 x i8> %a0) {
|
||||
; CHECK: pmovsxbw
|
||||
%res = call <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8> %a0) ; <<8 x i16>> [#uses=1]
|
||||
ret <8 x i16> %res
|
||||
}
|
||||
declare <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8>) nounwind readnone
|
||||
|
||||
|
||||
define <2 x i64> @test_x86_sse41_pmovsxdq(<4 x i32> %a0) {
|
||||
; CHECK: pmovsxdq
|
||||
%res = call <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32> %a0) ; <<2 x i64>> [#uses=1]
|
||||
ret <2 x i64> %res
|
||||
}
|
||||
declare <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32>) nounwind readnone
|
||||
|
||||
|
||||
define <4 x i32> @test_x86_sse41_pmovsxwd(<8 x i16> %a0) {
|
||||
; CHECK: pmovsxwd
|
||||
%res = call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %a0) ; <<4 x i32>> [#uses=1]
|
||||
ret <4 x i32> %res
|
||||
}
|
||||
declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone
|
||||
|
||||
|
||||
define <2 x i64> @test_x86_sse41_pmovsxwq(<8 x i16> %a0) {
|
||||
; CHECK: pmovsxwq
|
||||
%res = call <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16> %a0) ; <<2 x i64>> [#uses=1]
|
||||
ret <2 x i64> %res
|
||||
}
|
||||
declare <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16>) nounwind readnone
|
||||
|
|
|
@ -162,54 +162,6 @@ define <8 x i16> @test_x86_sse41_pminuw(<8 x i16> %a0, <8 x i16> %a1) {
|
|||
declare <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16>, <8 x i16>) nounwind readnone
|
||||
|
||||
|
||||
define <4 x i32> @test_x86_sse41_pmovsxbd(<16 x i8> %a0) {
|
||||
; CHECK: pmovsxbd
|
||||
%res = call <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8> %a0) ; <<4 x i32>> [#uses=1]
|
||||
ret <4 x i32> %res
|
||||
}
|
||||
declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>) nounwind readnone
|
||||
|
||||
|
||||
define <2 x i64> @test_x86_sse41_pmovsxbq(<16 x i8> %a0) {
|
||||
; CHECK: pmovsxbq
|
||||
%res = call <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8> %a0) ; <<2 x i64>> [#uses=1]
|
||||
ret <2 x i64> %res
|
||||
}
|
||||
declare <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8>) nounwind readnone
|
||||
|
||||
|
||||
define <8 x i16> @test_x86_sse41_pmovsxbw(<16 x i8> %a0) {
|
||||
; CHECK: pmovsxbw
|
||||
%res = call <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8> %a0) ; <<8 x i16>> [#uses=1]
|
||||
ret <8 x i16> %res
|
||||
}
|
||||
declare <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8>) nounwind readnone
|
||||
|
||||
|
||||
define <2 x i64> @test_x86_sse41_pmovsxdq(<4 x i32> %a0) {
|
||||
; CHECK: pmovsxdq
|
||||
%res = call <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32> %a0) ; <<2 x i64>> [#uses=1]
|
||||
ret <2 x i64> %res
|
||||
}
|
||||
declare <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32>) nounwind readnone
|
||||
|
||||
|
||||
define <4 x i32> @test_x86_sse41_pmovsxwd(<8 x i16> %a0) {
|
||||
; CHECK: pmovsxwd
|
||||
%res = call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %a0) ; <<4 x i32>> [#uses=1]
|
||||
ret <4 x i32> %res
|
||||
}
|
||||
declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone
|
||||
|
||||
|
||||
define <2 x i64> @test_x86_sse41_pmovsxwq(<8 x i16> %a0) {
|
||||
; CHECK: pmovsxwq
|
||||
%res = call <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16> %a0) ; <<2 x i64>> [#uses=1]
|
||||
ret <2 x i64> %res
|
||||
}
|
||||
declare <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16>) nounwind readnone
|
||||
|
||||
|
||||
define <4 x i32> @test_x86_sse41_pmovzxbd(<16 x i8> %a0) {
|
||||
; CHECK: pmovzxbd
|
||||
%res = call <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8> %a0) ; <<4 x i32>> [#uses=1]
|
||||
|
|
|
@ -6,8 +6,9 @@ define <8 x i16> @test_llvm_x86_sse41_pmovsxbw(<16 x i8>* %a) {
|
|||
; SSE41: pmovsxbw (%rdi), %xmm0
|
||||
; AVX: vpmovsxbw (%rdi), %xmm0
|
||||
%1 = load <16 x i8>, <16 x i8>* %a, align 1
|
||||
%2 = call <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8> %1)
|
||||
ret <8 x i16> %2
|
||||
%2 = shufflevector <16 x i8> %1, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%3 = sext <8 x i8> %2 to <8 x i16>
|
||||
ret <8 x i16> %3
|
||||
}
|
||||
|
||||
define <4 x i32> @test_llvm_x86_sse41_pmovsxbd(<16 x i8>* %a) {
|
||||
|
@ -15,8 +16,9 @@ define <4 x i32> @test_llvm_x86_sse41_pmovsxbd(<16 x i8>* %a) {
|
|||
; SSE41: pmovsxbd (%rdi), %xmm0
|
||||
; AVX: vpmovsxbd (%rdi), %xmm0
|
||||
%1 = load <16 x i8>, <16 x i8>* %a, align 1
|
||||
%2 = call <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8> %1)
|
||||
ret <4 x i32> %2
|
||||
%2 = shufflevector <16 x i8> %1, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%3 = sext <4 x i8> %2 to <4 x i32>
|
||||
ret <4 x i32> %3
|
||||
}
|
||||
|
||||
define <2 x i64> @test_llvm_x86_sse41_pmovsxbq(<16 x i8>* %a) {
|
||||
|
@ -24,8 +26,9 @@ define <2 x i64> @test_llvm_x86_sse41_pmovsxbq(<16 x i8>* %a) {
|
|||
; SSE41: pmovsxbq (%rdi), %xmm0
|
||||
; AVX: vpmovsxbq (%rdi), %xmm0
|
||||
%1 = load <16 x i8>, <16 x i8>* %a, align 1
|
||||
%2 = call <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8> %1)
|
||||
ret <2 x i64> %2
|
||||
%2 = shufflevector <16 x i8> %1, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
|
||||
%3 = sext <2 x i8> %2 to <2 x i64>
|
||||
ret <2 x i64> %3
|
||||
}
|
||||
|
||||
define <4 x i32> @test_llvm_x86_sse41_pmovsxwd(<8 x i16>* %a) {
|
||||
|
@ -33,8 +36,9 @@ define <4 x i32> @test_llvm_x86_sse41_pmovsxwd(<8 x i16>* %a) {
|
|||
; SSE41: pmovsxwd (%rdi), %xmm0
|
||||
; AVX: vpmovsxwd (%rdi), %xmm0
|
||||
%1 = load <8 x i16>, <8 x i16>* %a, align 1
|
||||
%2 = call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %1)
|
||||
ret <4 x i32> %2
|
||||
%2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%3 = sext <4 x i16> %2 to <4 x i32>
|
||||
ret <4 x i32> %3
|
||||
}
|
||||
|
||||
define <2 x i64> @test_llvm_x86_sse41_pmovsxwq(<8 x i16>* %a) {
|
||||
|
@ -42,8 +46,9 @@ define <2 x i64> @test_llvm_x86_sse41_pmovsxwq(<8 x i16>* %a) {
|
|||
; SSE41: pmovsxwq (%rdi), %xmm0
|
||||
; AVX: vpmovsxwq (%rdi), %xmm0
|
||||
%1 = load <8 x i16>, <8 x i16>* %a, align 1
|
||||
%2 = call <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16> %1)
|
||||
ret <2 x i64> %2
|
||||
%2 = shufflevector <8 x i16> %1, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
|
||||
%3 = sext <2 x i16> %2 to <2 x i64>
|
||||
ret <2 x i64> %3
|
||||
}
|
||||
|
||||
define <2 x i64> @test_llvm_x86_sse41_pmovsxdq(<4 x i32>* %a) {
|
||||
|
@ -51,8 +56,9 @@ define <2 x i64> @test_llvm_x86_sse41_pmovsxdq(<4 x i32>* %a) {
|
|||
; SSE41: pmovsxdq (%rdi), %xmm0
|
||||
; AVX: vpmovsxdq (%rdi), %xmm0
|
||||
%1 = load <4 x i32>, <4 x i32>* %a, align 1
|
||||
%2 = call <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32> %1)
|
||||
ret <2 x i64> %2
|
||||
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
|
||||
%3 = sext <2 x i32> %2 to <2 x i64>
|
||||
ret <2 x i64> %3
|
||||
}
|
||||
|
||||
define <8 x i16> @test_llvm_x86_sse41_pmovzxbw(<16 x i8>* %a) {
|
||||
|
@ -115,9 +121,3 @@ declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>)
|
|||
declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>)
|
||||
declare <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8>)
|
||||
declare <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8>)
|
||||
declare <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32>)
|
||||
declare <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16>)
|
||||
declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>)
|
||||
declare <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8>)
|
||||
declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>)
|
||||
declare <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8>)
|
||||
|
|
|
@ -31,49 +31,6 @@ define <16 x i8> @pinsrb_1(i8 %s, <16 x i8> %tmp) nounwind {
|
|||
ret <16 x i8> %tmp1
|
||||
}
|
||||
|
||||
define <2 x i64> @pmovsxbd_1(i32* %p) nounwind {
|
||||
; X32-LABEL: pmovsxbd_1:
|
||||
; X32: ## BB#0: ## %entry
|
||||
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; X32-NEXT: pmovsxbd (%eax), %xmm0
|
||||
; X32-NEXT: retl
|
||||
;
|
||||
; X64-LABEL: pmovsxbd_1:
|
||||
; X64: ## BB#0: ## %entry
|
||||
; X64-NEXT: pmovsxbd (%rdi), %xmm0
|
||||
; X64-NEXT: retq
|
||||
entry:
|
||||
%0 = load i32, i32* %p, align 4
|
||||
%1 = insertelement <4 x i32> undef, i32 %0, i32 0
|
||||
%2 = insertelement <4 x i32> %1, i32 0, i32 1
|
||||
%3 = insertelement <4 x i32> %2, i32 0, i32 2
|
||||
%4 = insertelement <4 x i32> %3, i32 0, i32 3
|
||||
%5 = bitcast <4 x i32> %4 to <16 x i8>
|
||||
%6 = tail call <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8> %5) nounwind readnone
|
||||
%7 = bitcast <4 x i32> %6 to <2 x i64>
|
||||
ret <2 x i64> %7
|
||||
}
|
||||
|
||||
define <2 x i64> @pmovsxwd_1(i64* %p) nounwind readonly {
|
||||
; X32-LABEL: pmovsxwd_1:
|
||||
; X32: ## BB#0: ## %entry
|
||||
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; X32-NEXT: pmovsxwd (%eax), %xmm0
|
||||
; X32-NEXT: retl
|
||||
;
|
||||
; X64-LABEL: pmovsxwd_1:
|
||||
; X64: ## BB#0: ## %entry
|
||||
; X64-NEXT: pmovsxwd (%rdi), %xmm0
|
||||
; X64-NEXT: retq
|
||||
entry:
|
||||
%0 = load i64, i64* %p ; <i64> [#uses=1]
|
||||
%tmp2 = insertelement <2 x i64> zeroinitializer, i64 %0, i32 0 ; <<2 x i64>> [#uses=1]
|
||||
%1 = bitcast <2 x i64> %tmp2 to <8 x i16> ; <<8 x i16>> [#uses=1]
|
||||
%2 = tail call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %1) nounwind readnone ; <<4 x i32>> [#uses=1]
|
||||
%3 = bitcast <4 x i32> %2 to <2 x i64> ; <<2 x i64>> [#uses=1]
|
||||
ret <2 x i64> %3
|
||||
}
|
||||
|
||||
define <2 x i64> @pmovzxbq_1() nounwind {
|
||||
; X32-LABEL: pmovzxbq_1:
|
||||
; X32: ## BB#0: ## %entry
|
||||
|
@ -94,8 +51,6 @@ entry:
|
|||
ret <2 x i64> %3
|
||||
}
|
||||
|
||||
declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>) nounwind readnone
|
||||
declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone
|
||||
declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone
|
||||
|
||||
define i32 @extractps_1(<4 x float> %v) nounwind {
|
||||
|
@ -137,7 +92,7 @@ define float @ext_1(<4 x float> %v) nounwind {
|
|||
; X32: ## BB#0:
|
||||
; X32-NEXT: pushl %eax
|
||||
; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
|
||||
; X32-NEXT: addss LCPI7_0, %xmm0
|
||||
; X32-NEXT: addss LCPI5_0, %xmm0
|
||||
; X32-NEXT: movss %xmm0, (%esp)
|
||||
; X32-NEXT: flds (%esp)
|
||||
; X32-NEXT: popl %eax
|
||||
|
|
Loading…
Reference in New Issue