forked from OSchip/llvm-project
[X86] Remove SSE/AVX unaligned store intrinsics as clang no longer uses them. Auto upgrade to native unaligned store instructions.
llvm-svn: 271236
This commit is contained in:
parent
424b5ee8f7
commit
8287fd8abd
|
@ -259,13 +259,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
|
|||
llvm_x86mmx_ty], [IntrNoMem]>;
|
||||
}
|
||||
|
||||
// SIMD store ops
|
||||
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
|
||||
def int_x86_sse_storeu_ps : GCCBuiltin<"__builtin_ia32_storeups">,
|
||||
Intrinsic<[], [llvm_ptr_ty,
|
||||
llvm_v4f32_ty], [IntrArgMemOnly]>;
|
||||
}
|
||||
|
||||
// Cacheability support ops
|
||||
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
|
||||
def int_x86_sse_sfence : GCCBuiltin<"__builtin_ia32_sfence">,
|
||||
|
@ -525,16 +518,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
|
|||
Intrinsic<[llvm_v2f64_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
|
||||
}
|
||||
|
||||
// SIMD store ops
|
||||
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
|
||||
def int_x86_sse2_storeu_pd : GCCBuiltin<"__builtin_ia32_storeupd">,
|
||||
Intrinsic<[], [llvm_ptr_ty,
|
||||
llvm_v2f64_ty], [IntrArgMemOnly]>;
|
||||
def int_x86_sse2_storeu_dq : GCCBuiltin<"__builtin_ia32_storedqu">,
|
||||
Intrinsic<[], [llvm_ptr_ty,
|
||||
llvm_v16i8_ty], [IntrArgMemOnly]>;
|
||||
}
|
||||
|
||||
// Misc.
|
||||
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
|
||||
def int_x86_sse2_packsswb_128 : GCCBuiltin<"__builtin_ia32_packsswb128">,
|
||||
|
@ -1938,16 +1921,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
|
|||
Intrinsic<[llvm_v32i8_ty], [llvm_ptr_ty], [IntrReadMem]>;
|
||||
}
|
||||
|
||||
// SIMD store ops
|
||||
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
|
||||
def int_x86_avx_storeu_pd_256 : GCCBuiltin<"__builtin_ia32_storeupd256">,
|
||||
Intrinsic<[], [llvm_ptr_ty, llvm_v4f64_ty], [IntrArgMemOnly]>;
|
||||
def int_x86_avx_storeu_ps_256 : GCCBuiltin<"__builtin_ia32_storeups256">,
|
||||
Intrinsic<[], [llvm_ptr_ty, llvm_v8f32_ty], [IntrArgMemOnly]>;
|
||||
def int_x86_avx_storeu_dq_256 : GCCBuiltin<"__builtin_ia32_storedqu256">,
|
||||
Intrinsic<[], [llvm_ptr_ty, llvm_v32i8_ty], [IntrArgMemOnly]>;
|
||||
}
|
||||
|
||||
// Conditional load ops
|
||||
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
|
||||
def int_x86_avx_maskload_pd : GCCBuiltin<"__builtin_ia32_maskloadpd">,
|
||||
|
|
|
@ -191,6 +191,9 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
|
|||
Name == "x86.avx2.vextracti128" ||
|
||||
Name.startswith("x86.avx.movnt.") ||
|
||||
Name == "x86.sse2.storel.dq" ||
|
||||
Name.startswith("x86.sse.storeu.") ||
|
||||
Name.startswith("x86.sse2.storeu.") ||
|
||||
Name.startswith("x86.avx.storeu.") ||
|
||||
Name == "x86.sse42.crc32.64.8" ||
|
||||
Name.startswith("x86.avx.vbroadcast.s") ||
|
||||
Name.startswith("x86.sse2.psll.dq") ||
|
||||
|
@ -439,6 +442,20 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
|
|||
"cast");
|
||||
Builder.CreateAlignedStore(Elt, BC, 1);
|
||||
|
||||
// Remove intrinsic.
|
||||
CI->eraseFromParent();
|
||||
return;
|
||||
} else if (Name.startswith("llvm.x86.sse.storeu.") ||
|
||||
Name.startswith("llvm.x86.sse2.storeu.") ||
|
||||
Name.startswith("llvm.x86.avx.storeu.")) {
|
||||
Value *Arg0 = CI->getArgOperand(0);
|
||||
Value *Arg1 = CI->getArgOperand(1);
|
||||
|
||||
Arg0 = Builder.CreateBitCast(Arg0,
|
||||
PointerType::getUnqual(Arg1->getType()),
|
||||
"cast");
|
||||
Builder.CreateAlignedStore(Arg1, Arg0, 1);
|
||||
|
||||
// Remove intrinsic.
|
||||
CI->eraseFromParent();
|
||||
return;
|
||||
|
|
|
@ -905,11 +905,6 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
|
|||
IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
|
||||
}
|
||||
|
||||
def : Pat<(int_x86_avx_storeu_ps_256 addr:$dst, VR256:$src),
|
||||
(VMOVUPSYmr addr:$dst, VR256:$src)>;
|
||||
def : Pat<(int_x86_avx_storeu_pd_256 addr:$dst, VR256:$src),
|
||||
(VMOVUPDYmr addr:$dst, VR256:$src)>;
|
||||
|
||||
// Aliases to help the assembler pick two byte VEX encodings by swapping the
|
||||
// operands relative to the normal instructions to use VEX.R instead of VEX.B.
|
||||
def : InstAlias<"vmovaps\t{$src, $dst|$dst, $src}",
|
||||
|
@ -965,20 +960,6 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
|
|||
IIC_SSE_MOVU_P_RR>;
|
||||
}
|
||||
|
||||
let Predicates = [HasAVX] in {
|
||||
def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src),
|
||||
(VMOVUPSmr addr:$dst, VR128:$src)>;
|
||||
def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src),
|
||||
(VMOVUPDmr addr:$dst, VR128:$src)>;
|
||||
}
|
||||
|
||||
let Predicates = [UseSSE1] in
|
||||
def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src),
|
||||
(MOVUPSmr addr:$dst, VR128:$src)>;
|
||||
let Predicates = [UseSSE2] in
|
||||
def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src),
|
||||
(MOVUPDmr addr:$dst, VR128:$src)>;
|
||||
|
||||
// Use vmovaps/vmovups for AVX integer load/store.
|
||||
let Predicates = [HasAVX, NoVLX] in {
|
||||
// 128-bit load/store
|
||||
|
@ -3887,16 +3868,6 @@ def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
|
|||
|
||||
} // ExeDomain = SSEPackedInt
|
||||
|
||||
let Predicates = [HasAVX] in {
|
||||
def : Pat<(int_x86_sse2_storeu_dq addr:$dst, VR128:$src),
|
||||
(VMOVDQUmr addr:$dst, VR128:$src)>;
|
||||
def : Pat<(int_x86_avx_storeu_dq_256 addr:$dst, VR256:$src),
|
||||
(VMOVDQUYmr addr:$dst, VR256:$src)>;
|
||||
}
|
||||
let Predicates = [UseSSE2] in
|
||||
def : Pat<(int_x86_sse2_storeu_dq addr:$dst, VR128:$src),
|
||||
(MOVDQUmr addr:$dst, VR128:$src)>;
|
||||
|
||||
// Aliases to help the assembler pick two byte VEX encodings by swapping the
|
||||
// operands relative to the normal instructions to use VEX.R instead of VEX.B.
|
||||
def : InstAlias<"vmovdqa\t{$src, $dst|$dst, $src}",
|
||||
|
|
|
@ -1397,32 +1397,6 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
|
|||
}
|
||||
break;
|
||||
|
||||
case Intrinsic::x86_sse_storeu_ps:
|
||||
case Intrinsic::x86_sse2_storeu_pd:
|
||||
case Intrinsic::x86_sse2_storeu_dq:
|
||||
// Turn X86 storeu -> store if the pointer is known aligned.
|
||||
if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, II, AC, DT) >=
|
||||
16) {
|
||||
Type *OpPtrTy =
|
||||
PointerType::getUnqual(II->getArgOperand(1)->getType());
|
||||
Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0), OpPtrTy);
|
||||
return new StoreInst(II->getArgOperand(1), Ptr);
|
||||
}
|
||||
break;
|
||||
|
||||
case Intrinsic::x86_avx_storeu_ps_256:
|
||||
case Intrinsic::x86_avx_storeu_pd_256:
|
||||
case Intrinsic::x86_avx_storeu_dq_256:
|
||||
// Turn X86 storeu -> store if the pointer is known aligned.
|
||||
if (getOrEnforceKnownAlignment(II->getArgOperand(0), 32, DL, II, AC, DT) >=
|
||||
32) {
|
||||
Type *OpPtrTy =
|
||||
PointerType::getUnqual(II->getArgOperand(1)->getType());
|
||||
Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0), OpPtrTy);
|
||||
return new StoreInst(II->getArgOperand(1), Ptr);
|
||||
}
|
||||
break;
|
||||
|
||||
case Intrinsic::x86_vcvtph2ps_128:
|
||||
case Intrinsic::x86_vcvtph2ps_256: {
|
||||
auto Arg = II->getArgOperand(0);
|
||||
|
|
|
@ -684,12 +684,6 @@ static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
|
|||
switch (II->getIntrinsicID()) {
|
||||
default: break;
|
||||
case Intrinsic::prefetch:
|
||||
case Intrinsic::x86_sse_storeu_ps:
|
||||
case Intrinsic::x86_sse2_storeu_pd:
|
||||
case Intrinsic::x86_sse2_storeu_dq:
|
||||
case Intrinsic::x86_avx_storeu_ps_256:
|
||||
case Intrinsic::x86_avx_storeu_pd_256:
|
||||
case Intrinsic::x86_avx_storeu_dq_256:
|
||||
if (II->getArgOperand(0) == OperandVal)
|
||||
isAddress = true;
|
||||
break;
|
||||
|
@ -706,20 +700,6 @@ static MemAccessTy getAccessType(const Instruction *Inst) {
|
|||
AccessTy.AddrSpace = SI->getPointerAddressSpace();
|
||||
} else if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
|
||||
AccessTy.AddrSpace = LI->getPointerAddressSpace();
|
||||
} else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
|
||||
// Addressing modes can also be folded into prefetches and a variety
|
||||
// of intrinsics.
|
||||
switch (II->getIntrinsicID()) {
|
||||
default: break;
|
||||
case Intrinsic::x86_sse_storeu_ps:
|
||||
case Intrinsic::x86_sse2_storeu_pd:
|
||||
case Intrinsic::x86_sse2_storeu_dq:
|
||||
case Intrinsic::x86_avx_storeu_ps_256:
|
||||
case Intrinsic::x86_avx_storeu_pd_256:
|
||||
case Intrinsic::x86_avx_storeu_dq_256:
|
||||
AccessTy.MemTy = II->getArgOperand(0)->getType();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// All pointers have the same requirements, so canonicalize them to an
|
||||
|
|
|
@ -355,3 +355,98 @@ define <4 x double> @test_x86_avx_cvt_ps2_pd_256(<4 x float> %a0) {
|
|||
ret <4 x double> %res
|
||||
}
|
||||
declare <4 x double> @llvm.x86.avx.cvt.ps2.pd.256(<4 x float>) nounwind readnone
|
||||
|
||||
|
||||
define void @test_x86_sse2_storeu_dq(i8* %a0, <16 x i8> %a1) {
|
||||
; add operation forces the execution domain.
|
||||
; CHECK-LABEL: test_x86_sse2_storeu_dq:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; CHECK-NEXT: vpaddb LCPI32_0, %xmm0, %xmm0
|
||||
; CHECK-NEXT: vmovdqu %xmm0, (%eax)
|
||||
; CHECK-NEXT: retl
|
||||
%a2 = add <16 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
|
||||
call void @llvm.x86.sse2.storeu.dq(i8* %a0, <16 x i8> %a2)
|
||||
ret void
|
||||
}
|
||||
declare void @llvm.x86.sse2.storeu.dq(i8*, <16 x i8>) nounwind
|
||||
|
||||
|
||||
define void @test_x86_sse2_storeu_pd(i8* %a0, <2 x double> %a1) {
|
||||
; fadd operation forces the execution domain.
|
||||
; CHECK-LABEL: test_x86_sse2_storeu_pd:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; CHECK-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
|
||||
; CHECK-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
|
||||
; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
|
||||
; CHECK-NEXT: vmovupd %xmm0, (%eax)
|
||||
; CHECK-NEXT: retl
|
||||
%a2 = fadd <2 x double> %a1, <double 0x0, double 0x4200000000000000>
|
||||
call void @llvm.x86.sse2.storeu.pd(i8* %a0, <2 x double> %a2)
|
||||
ret void
|
||||
}
|
||||
declare void @llvm.x86.sse2.storeu.pd(i8*, <2 x double>) nounwind
|
||||
|
||||
|
||||
define void @test_x86_sse_storeu_ps(i8* %a0, <4 x float> %a1) {
|
||||
; CHECK-LABEL: test_x86_sse_storeu_ps:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; CHECK-NEXT: vmovups %xmm0, (%eax)
|
||||
; CHECK-NEXT: retl
|
||||
call void @llvm.x86.sse.storeu.ps(i8* %a0, <4 x float> %a1)
|
||||
ret void
|
||||
}
|
||||
declare void @llvm.x86.sse.storeu.ps(i8*, <4 x float>) nounwind
|
||||
|
||||
|
||||
define void @test_x86_avx_storeu_dq_256(i8* %a0, <32 x i8> %a1) {
|
||||
; FIXME: unfortunately the execution domain fix pass changes this to vmovups and it's hard to force with no 256-bit integer instructions
|
||||
; add operation forces the execution domain.
|
||||
; CHECK-LABEL: test_x86_avx_storeu_dq_256:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
|
||||
; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
|
||||
; CHECK-NEXT: vpaddb %xmm2, %xmm1, %xmm1
|
||||
; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0
|
||||
; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
|
||||
; CHECK-NEXT: vmovups %ymm0, (%eax)
|
||||
; CHECK-NEXT: vzeroupper
|
||||
; CHECK-NEXT: retl
|
||||
%a2 = add <32 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
|
||||
call void @llvm.x86.avx.storeu.dq.256(i8* %a0, <32 x i8> %a2)
|
||||
ret void
|
||||
}
|
||||
declare void @llvm.x86.avx.storeu.dq.256(i8*, <32 x i8>) nounwind
|
||||
|
||||
|
||||
define void @test_x86_avx_storeu_pd_256(i8* %a0, <4 x double> %a1) {
|
||||
; fadd operation forces the execution domain.
|
||||
; CHECK-LABEL: test_x86_avx_storeu_pd_256:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; CHECK-NEXT: vxorpd %ymm1, %ymm1, %ymm1
|
||||
; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0
|
||||
; CHECK-NEXT: vmovupd %ymm0, (%eax)
|
||||
; CHECK-NEXT: vzeroupper
|
||||
; CHECK-NEXT: retl
|
||||
%a2 = fadd <4 x double> %a1, <double 0x0, double 0x0, double 0x0, double 0x0>
|
||||
call void @llvm.x86.avx.storeu.pd.256(i8* %a0, <4 x double> %a2)
|
||||
ret void
|
||||
}
|
||||
declare void @llvm.x86.avx.storeu.pd.256(i8*, <4 x double>) nounwind
|
||||
|
||||
|
||||
define void @test_x86_avx_storeu_ps_256(i8* %a0, <8 x float> %a1) {
|
||||
; CHECK-LABEL: test_x86_avx_storeu_ps_256:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; CHECK-NEXT: vmovups %ymm0, (%eax)
|
||||
; CHECK-NEXT: vzeroupper
|
||||
; CHECK-NEXT: retl
|
||||
call void @llvm.x86.avx.storeu.ps.256(i8* %a0, <8 x float> %a1)
|
||||
ret void
|
||||
}
|
||||
declare void @llvm.x86.avx.storeu.ps.256(i8*, <8 x float>) nounwind
|
||||
|
|
|
@ -1221,54 +1221,6 @@ define <2 x double> @test_x86_sse2_sqrt_sd(<2 x double> %a0) {
|
|||
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
|
||||
|
||||
|
||||
define void @test_x86_sse2_storeu_dq(i8* %a0, <16 x i8> %a1) {
|
||||
; add operation forces the execution domain.
|
||||
; AVX-LABEL: test_x86_sse2_storeu_dq:
|
||||
; AVX: ## BB#0:
|
||||
; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; AVX-NEXT: vpaddb LCPI74_0, %xmm0, %xmm0
|
||||
; AVX-NEXT: vmovdqu %xmm0, (%eax)
|
||||
; AVX-NEXT: retl
|
||||
;
|
||||
; AVX512VL-LABEL: test_x86_sse2_storeu_dq:
|
||||
; AVX512VL: ## BB#0:
|
||||
; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; AVX512VL-NEXT: vpaddb LCPI74_0, %xmm0, %xmm0
|
||||
; AVX512VL-NEXT: vmovdqu %xmm0, (%eax)
|
||||
; AVX512VL-NEXT: retl
|
||||
%a2 = add <16 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
|
||||
call void @llvm.x86.sse2.storeu.dq(i8* %a0, <16 x i8> %a2)
|
||||
ret void
|
||||
}
|
||||
declare void @llvm.x86.sse2.storeu.dq(i8*, <16 x i8>) nounwind
|
||||
|
||||
|
||||
define void @test_x86_sse2_storeu_pd(i8* %a0, <2 x double> %a1) {
|
||||
; fadd operation forces the execution domain.
|
||||
; AVX-LABEL: test_x86_sse2_storeu_pd:
|
||||
; AVX: ## BB#0:
|
||||
; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
|
||||
; AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
|
||||
; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
|
||||
; AVX-NEXT: vmovupd %xmm0, (%eax)
|
||||
; AVX-NEXT: retl
|
||||
;
|
||||
; AVX512VL-LABEL: test_x86_sse2_storeu_pd:
|
||||
; AVX512VL: ## BB#0:
|
||||
; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
|
||||
; AVX512VL-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
|
||||
; AVX512VL-NEXT: vaddpd %xmm1, %xmm0, %xmm0
|
||||
; AVX512VL-NEXT: vmovups %xmm0, (%eax)
|
||||
; AVX512VL-NEXT: retl
|
||||
%a2 = fadd <2 x double> %a1, <double 0x0, double 0x4200000000000000>
|
||||
call void @llvm.x86.sse2.storeu.pd(i8* %a0, <2 x double> %a2)
|
||||
ret void
|
||||
}
|
||||
declare void @llvm.x86.sse2.storeu.pd(i8*, <2 x double>) nounwind
|
||||
|
||||
|
||||
define <2 x double> @test_x86_sse2_sub_sd(<2 x double> %a0, <2 x double> %a1) {
|
||||
; AVX-LABEL: test_x86_sse2_sub_sd:
|
||||
; AVX: ## BB#0:
|
||||
|
@ -2802,24 +2754,6 @@ define void @test_x86_sse_stmxcsr(i8* %a0) {
|
|||
declare void @llvm.x86.sse.stmxcsr(i8*) nounwind
|
||||
|
||||
|
||||
define void @test_x86_sse_storeu_ps(i8* %a0, <4 x float> %a1) {
|
||||
; AVX-LABEL: test_x86_sse_storeu_ps:
|
||||
; AVX: ## BB#0:
|
||||
; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; AVX-NEXT: vmovups %xmm0, (%eax)
|
||||
; AVX-NEXT: retl
|
||||
;
|
||||
; AVX512VL-LABEL: test_x86_sse_storeu_ps:
|
||||
; AVX512VL: ## BB#0:
|
||||
; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; AVX512VL-NEXT: vmovups %xmm0, (%eax)
|
||||
; AVX512VL-NEXT: retl
|
||||
call void @llvm.x86.sse.storeu.ps(i8* %a0, <4 x float> %a1)
|
||||
ret void
|
||||
}
|
||||
declare void @llvm.x86.sse.storeu.ps(i8*, <4 x float>) nounwind
|
||||
|
||||
|
||||
define <4 x float> @test_x86_sse_sub_ss(<4 x float> %a0, <4 x float> %a1) {
|
||||
; AVX-LABEL: test_x86_sse_sub_ss:
|
||||
; AVX: ## BB#0:
|
||||
|
@ -4012,78 +3946,6 @@ define <8 x float> @test_x86_avx_sqrt_ps_256(<8 x float> %a0) {
|
|||
declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
|
||||
|
||||
|
||||
define void @test_x86_avx_storeu_dq_256(i8* %a0, <32 x i8> %a1) {
|
||||
; FIXME: unfortunately the execution domain fix pass changes this to vmovups and it's hard to force with no 256-bit integer instructions
|
||||
; add operation forces the execution domain.
|
||||
; AVX-LABEL: test_x86_avx_storeu_dq_256:
|
||||
; AVX: ## BB#0:
|
||||
; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
|
||||
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
|
||||
; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1
|
||||
; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
|
||||
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
|
||||
; AVX-NEXT: vmovups %ymm0, (%eax)
|
||||
; AVX-NEXT: vzeroupper
|
||||
; AVX-NEXT: retl
|
||||
;
|
||||
; AVX512VL-LABEL: test_x86_avx_storeu_dq_256:
|
||||
; AVX512VL: ## BB#0:
|
||||
; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; AVX512VL-NEXT: vpaddb LCPI225_0, %ymm0, %ymm0
|
||||
; AVX512VL-NEXT: vmovdqu %ymm0, (%eax)
|
||||
; AVX512VL-NEXT: retl
|
||||
%a2 = add <32 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
|
||||
call void @llvm.x86.avx.storeu.dq.256(i8* %a0, <32 x i8> %a2)
|
||||
ret void
|
||||
}
|
||||
declare void @llvm.x86.avx.storeu.dq.256(i8*, <32 x i8>) nounwind
|
||||
|
||||
|
||||
define void @test_x86_avx_storeu_pd_256(i8* %a0, <4 x double> %a1) {
|
||||
; fadd operation forces the execution domain.
|
||||
; AVX-LABEL: test_x86_avx_storeu_pd_256:
|
||||
; AVX: ## BB#0:
|
||||
; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; AVX-NEXT: vxorpd %ymm1, %ymm1, %ymm1
|
||||
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
|
||||
; AVX-NEXT: vmovupd %ymm0, (%eax)
|
||||
; AVX-NEXT: vzeroupper
|
||||
; AVX-NEXT: retl
|
||||
;
|
||||
; AVX512VL-LABEL: test_x86_avx_storeu_pd_256:
|
||||
; AVX512VL: ## BB#0:
|
||||
; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; AVX512VL-NEXT: vpxord %ymm1, %ymm1, %ymm1
|
||||
; AVX512VL-NEXT: vaddpd %ymm1, %ymm0, %ymm0
|
||||
; AVX512VL-NEXT: vmovups %ymm0, (%eax)
|
||||
; AVX512VL-NEXT: retl
|
||||
%a2 = fadd <4 x double> %a1, <double 0x0, double 0x0, double 0x0, double 0x0>
|
||||
call void @llvm.x86.avx.storeu.pd.256(i8* %a0, <4 x double> %a2)
|
||||
ret void
|
||||
}
|
||||
declare void @llvm.x86.avx.storeu.pd.256(i8*, <4 x double>) nounwind
|
||||
|
||||
|
||||
define void @test_x86_avx_storeu_ps_256(i8* %a0, <8 x float> %a1) {
|
||||
; AVX-LABEL: test_x86_avx_storeu_ps_256:
|
||||
; AVX: ## BB#0:
|
||||
; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; AVX-NEXT: vmovups %ymm0, (%eax)
|
||||
; AVX-NEXT: vzeroupper
|
||||
; AVX-NEXT: retl
|
||||
;
|
||||
; AVX512VL-LABEL: test_x86_avx_storeu_ps_256:
|
||||
; AVX512VL: ## BB#0:
|
||||
; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; AVX512VL-NEXT: vmovups %ymm0, (%eax)
|
||||
; AVX512VL-NEXT: retl
|
||||
call void @llvm.x86.avx.storeu.ps.256(i8* %a0, <8 x float> %a1)
|
||||
ret void
|
||||
}
|
||||
declare void @llvm.x86.avx.storeu.ps.256(i8*, <8 x float>) nounwind
|
||||
|
||||
|
||||
define <4 x double> @test_x86_avx_vbroadcastf128_pd_256(i8* %a0) {
|
||||
; AVX-LABEL: test_x86_avx_vbroadcastf128_pd_256:
|
||||
; AVX: ## BB#0:
|
||||
|
@ -4271,7 +4133,7 @@ define <4 x double> @test_x86_avx_vpermilvar_pd_256_2(<4 x double> %a0) {
|
|||
;
|
||||
; AVX512VL-LABEL: test_x86_avx_vpermilvar_pd_256_2:
|
||||
; AVX512VL: ## BB#0:
|
||||
; AVX512VL-NEXT: vpermilpd LCPI239_0, %ymm0, %ymm0
|
||||
; AVX512VL-NEXT: vpermilpd LCPI233_0, %ymm0, %ymm0
|
||||
; AVX512VL-NEXT: retl
|
||||
%res = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> <i64 2, i64 0, i64 0, i64 2>) ; <<4 x double>> [#uses=1]
|
||||
ret <4 x double> %res
|
||||
|
@ -4763,7 +4625,7 @@ define void @movnt_dq(i8* %p, <2 x i64> %a1) nounwind {
|
|||
; AVX-LABEL: movnt_dq:
|
||||
; AVX: ## BB#0:
|
||||
; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; AVX-NEXT: vpaddq LCPI266_0, %xmm0, %xmm0
|
||||
; AVX-NEXT: vpaddq LCPI260_0, %xmm0, %xmm0
|
||||
; AVX-NEXT: vmovntdq %ymm0, (%eax)
|
||||
; AVX-NEXT: vzeroupper
|
||||
; AVX-NEXT: retl
|
||||
|
@ -4771,7 +4633,7 @@ define void @movnt_dq(i8* %p, <2 x i64> %a1) nounwind {
|
|||
; AVX512VL-LABEL: movnt_dq:
|
||||
; AVX512VL: ## BB#0:
|
||||
; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; AVX512VL-NEXT: vpaddq LCPI266_0, %xmm0, %xmm0
|
||||
; AVX512VL-NEXT: vpaddq LCPI260_0, %xmm0, %xmm0
|
||||
; AVX512VL-NEXT: vmovntdq %ymm0, (%eax)
|
||||
; AVX512VL-NEXT: retl
|
||||
%a2 = add <2 x i64> %a1, <i64 1, i64 1>
|
||||
|
|
|
@ -365,3 +365,19 @@ define <4 x i64> @test_x86_avx2_pmovzxwq(<8 x i16> %a0) {
|
|||
ret <4 x i64> %res
|
||||
}
|
||||
declare <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16>) nounwind readnone
|
||||
|
||||
; This is checked here because the execution dependency fix pass makes it hard to test in AVX mode since we don't have 256-bit integer instructions
|
||||
define void @test_x86_avx_storeu_dq_256(i8* %a0, <32 x i8> %a1) {
|
||||
; add operation forces the execution domain.
|
||||
; CHECK-LABEL: test_x86_avx_storeu_dq_256:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; CHECK-NEXT: vpaddb LCPI33_0, %ymm0, %ymm0
|
||||
; CHECK-NEXT: vmovdqu %ymm0, (%eax)
|
||||
; CHECK-NEXT: vzeroupper
|
||||
; CHECK-NEXT: retl
|
||||
%a2 = add <32 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
|
||||
call void @llvm.x86.avx.storeu.dq.256(i8* %a0, <32 x i8> %a2)
|
||||
ret void
|
||||
}
|
||||
declare void @llvm.x86.avx.storeu.dq.256(i8*, <32 x i8>) nounwind
|
||||
|
|
|
@ -1475,29 +1475,6 @@ define <8 x i32> @test_x86_avx2_psrav_d_256(<8 x i32> %a0, <8 x i32> %a1) {
|
|||
}
|
||||
declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone
|
||||
|
||||
; This is checked here because the execution dependency fix pass makes it hard to test in AVX mode since we don't have 256-bit integer instructions
|
||||
define void @test_x86_avx_storeu_dq_256(i8* %a0, <32 x i8> %a1) {
|
||||
; add operation forces the execution domain.
|
||||
; AVX2-LABEL: test_x86_avx_storeu_dq_256:
|
||||
; AVX2: ## BB#0:
|
||||
; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; AVX2-NEXT: vpaddb LCPI91_0, %ymm0, %ymm0
|
||||
; AVX2-NEXT: vmovdqu %ymm0, (%eax)
|
||||
; AVX2-NEXT: vzeroupper
|
||||
; AVX2-NEXT: retl
|
||||
;
|
||||
; AVX512VL-LABEL: test_x86_avx_storeu_dq_256:
|
||||
; AVX512VL: ## BB#0:
|
||||
; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; AVX512VL-NEXT: vpaddb LCPI91_0, %ymm0, %ymm0
|
||||
; AVX512VL-NEXT: vmovdqu %ymm0, (%eax)
|
||||
; AVX512VL-NEXT: retl
|
||||
%a2 = add <32 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
|
||||
call void @llvm.x86.avx.storeu.dq.256(i8* %a0, <32 x i8> %a2)
|
||||
ret void
|
||||
}
|
||||
declare void @llvm.x86.avx.storeu.dq.256(i8*, <32 x i8>) nounwind
|
||||
|
||||
define <2 x double> @test_x86_avx2_gather_d_pd(<2 x double> %a0, i8* %a1, <4 x i32> %idx, <2 x double> %mask) {
|
||||
; AVX2-LABEL: test_x86_avx2_gather_d_pd:
|
||||
; AVX2: ## BB#0:
|
||||
|
|
|
@ -0,0 +1,27 @@
|
|||
; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
|
||||
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
|
||||
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 | FileCheck %s
|
||||
|
||||
define void @test_x86_sse_storeu_ps(i8* %a0, <4 x float> %a1) {
|
||||
; SSE-LABEL: test_x86_sse_storeu_ps:
|
||||
; SSE: ## BB#0:
|
||||
; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; SSE-NEXT: movups %xmm0, (%eax)
|
||||
; SSE-NEXT: retl
|
||||
;
|
||||
; KNL-LABEL: test_x86_sse_storeu_ps:
|
||||
; KNL: ## BB#0:
|
||||
; KNL-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; KNL-NEXT: vmovups %xmm0, (%eax)
|
||||
; KNL-NEXT: retl
|
||||
; CHECK-LABEL: test_x86_sse_storeu_ps:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; CHECK-NEXT: movups %xmm0, (%eax)
|
||||
; CHECK-NEXT: retl
|
||||
call void @llvm.x86.sse.storeu.ps(i8* %a0, <4 x float> %a1)
|
||||
ret void
|
||||
}
|
||||
declare void @llvm.x86.sse.storeu.ps(i8*, <4 x float>) nounwind
|
||||
|
||||
|
|
@ -474,24 +474,6 @@ define void @test_x86_sse_stmxcsr(i8* %a0) {
|
|||
declare void @llvm.x86.sse.stmxcsr(i8*) nounwind
|
||||
|
||||
|
||||
define void @test_x86_sse_storeu_ps(i8* %a0, <4 x float> %a1) {
|
||||
; SSE-LABEL: test_x86_sse_storeu_ps:
|
||||
; SSE: ## BB#0:
|
||||
; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; SSE-NEXT: movups %xmm0, (%eax)
|
||||
; SSE-NEXT: retl
|
||||
;
|
||||
; KNL-LABEL: test_x86_sse_storeu_ps:
|
||||
; KNL: ## BB#0:
|
||||
; KNL-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; KNL-NEXT: vmovups %xmm0, (%eax)
|
||||
; KNL-NEXT: retl
|
||||
call void @llvm.x86.sse.storeu.ps(i8* %a0, <4 x float> %a1)
|
||||
ret void
|
||||
}
|
||||
declare void @llvm.x86.sse.storeu.ps(i8*, <4 x float>) nounwind
|
||||
|
||||
|
||||
define <4 x float> @test_x86_sse_sub_ss(<4 x float> %a0, <4 x float> %a1) {
|
||||
; SSE-LABEL: test_x86_sse_sub_ss:
|
||||
; SSE: ## BB#0:
|
||||
|
|
|
@ -96,4 +96,35 @@ define void @test_x86_sse2_storel_dq(i8* %a0, <4 x i32> %a1) {
|
|||
declare void @llvm.x86.sse2.storel.dq(i8*, <4 x i32>) nounwind
|
||||
|
||||
|
||||
define void @test_x86_sse2_storeu_dq(i8* %a0, <16 x i8> %a1) {
|
||||
; add operation forces the execution domain.
|
||||
; CHECK-LABEL: test_x86_sse2_storeu_dq:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; CHECK-NEXT: paddb LCPI7_0, %xmm0
|
||||
; CHECK-NEXT: movdqu %xmm0, (%eax)
|
||||
; CHECK-NEXT: retl
|
||||
%a2 = add <16 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
|
||||
call void @llvm.x86.sse2.storeu.dq(i8* %a0, <16 x i8> %a2)
|
||||
ret void
|
||||
}
|
||||
declare void @llvm.x86.sse2.storeu.dq(i8*, <16 x i8>) nounwind
|
||||
|
||||
|
||||
define void @test_x86_sse2_storeu_pd(i8* %a0, <2 x double> %a1) {
|
||||
; fadd operation forces the execution domain.
|
||||
; CHECK-LABEL: test_x86_sse2_storeu_pd:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
|
||||
; CHECK-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
|
||||
; CHECK-NEXT: addpd %xmm0, %xmm1
|
||||
; CHECK-NEXT: movupd %xmm1, (%eax)
|
||||
; CHECK-NEXT: retl
|
||||
%a2 = fadd <2 x double> %a1, <double 0x0, double 0x4200000000000000>
|
||||
call void @llvm.x86.sse2.storeu.pd(i8* %a0, <2 x double> %a2)
|
||||
ret void
|
||||
}
|
||||
declare void @llvm.x86.sse2.storeu.pd(i8*, <2 x double>) nounwind
|
||||
|
||||
|
||||
|
|
|
@ -1125,54 +1125,6 @@ define <2 x double> @test_x86_sse2_sqrt_sd(<2 x double> %a0) {
|
|||
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
|
||||
|
||||
|
||||
define void @test_x86_sse2_storeu_dq(i8* %a0, <16 x i8> %a1) {
|
||||
; add operation forces the execution domain.
|
||||
; SSE-LABEL: test_x86_sse2_storeu_dq:
|
||||
; SSE: ## BB#0:
|
||||
; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; SSE-NEXT: paddb LCPI68_0, %xmm0
|
||||
; SSE-NEXT: movdqu %xmm0, (%eax)
|
||||
; SSE-NEXT: retl
|
||||
;
|
||||
; KNL-LABEL: test_x86_sse2_storeu_dq:
|
||||
; KNL: ## BB#0:
|
||||
; KNL-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; KNL-NEXT: vpaddb LCPI68_0, %xmm0, %xmm0
|
||||
; KNL-NEXT: vmovdqu %xmm0, (%eax)
|
||||
; KNL-NEXT: retl
|
||||
%a2 = add <16 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
|
||||
call void @llvm.x86.sse2.storeu.dq(i8* %a0, <16 x i8> %a2)
|
||||
ret void
|
||||
}
|
||||
declare void @llvm.x86.sse2.storeu.dq(i8*, <16 x i8>) nounwind
|
||||
|
||||
|
||||
define void @test_x86_sse2_storeu_pd(i8* %a0, <2 x double> %a1) {
|
||||
; fadd operation forces the execution domain.
|
||||
; SSE-LABEL: test_x86_sse2_storeu_pd:
|
||||
; SSE: ## BB#0:
|
||||
; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
|
||||
; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
|
||||
; SSE-NEXT: addpd %xmm0, %xmm1
|
||||
; SSE-NEXT: movupd %xmm1, (%eax)
|
||||
; SSE-NEXT: retl
|
||||
;
|
||||
; KNL-LABEL: test_x86_sse2_storeu_pd:
|
||||
; KNL: ## BB#0:
|
||||
; KNL-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; KNL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
|
||||
; KNL-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
|
||||
; KNL-NEXT: vaddpd %xmm1, %xmm0, %xmm0
|
||||
; KNL-NEXT: vmovupd %xmm0, (%eax)
|
||||
; KNL-NEXT: retl
|
||||
%a2 = fadd <2 x double> %a1, <double 0x0, double 0x4200000000000000>
|
||||
call void @llvm.x86.sse2.storeu.pd(i8* %a0, <2 x double> %a2)
|
||||
ret void
|
||||
}
|
||||
declare void @llvm.x86.sse2.storeu.pd(i8*, <2 x double>) nounwind
|
||||
|
||||
|
||||
define <2 x double> @test_x86_sse2_sub_sd(<2 x double> %a0, <2 x double> %a1) {
|
||||
; SSE-LABEL: test_x86_sse2_sub_sd:
|
||||
; SSE: ## BB#0:
|
||||
|
|
|
@ -631,7 +631,7 @@ declare void @llvm.x86.sse.storeu.ps(i8*, <4 x float>) nounwind
|
|||
; CHECK-NOT: br
|
||||
; CHECK-NOT: = or
|
||||
; CHECK: store <4 x i32> {{.*}} align 1
|
||||
; CHECK: call void @llvm.x86.sse.storeu.ps
|
||||
; CHECK: store <4 x float> %{{.*}}, <4 x float>* %{{.*}}, align 1{{$}}
|
||||
; CHECK: ret void
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue