From 8287fd8abde6228ebab32953e85c78e75ce0fd30 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Mon, 30 May 2016 23:15:56 +0000
Subject: [PATCH] [X86] Remove SSE/AVX unaligned store intrinsics as clang no
 longer uses them. Auto upgrade to native unaligned store instructions.

llvm-svn: 271236
---
 llvm/include/llvm/IR/IntrinsicsX86.td         |  27 ----
 llvm/lib/IR/AutoUpgrade.cpp                   |  17 +++
 llvm/lib/Target/X86/X86InstrSSE.td            |  29 ----
 .../InstCombine/InstCombineCalls.cpp          |  26 ----
 .../Transforms/Scalar/LoopStrengthReduce.cpp  |  20 ---
 .../CodeGen/X86/avx-intrinsics-x86-upgrade.ll |  95 ++++++++++++
 llvm/test/CodeGen/X86/avx-intrinsics-x86.ll   | 144 +-----------------
 .../X86/avx2-intrinsics-x86-upgrade.ll        |  16 ++
 llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll  |  23 ---
 .../CodeGen/X86/sse-intrinsics-x86-upgrade.ll |  27 ++++
 llvm/test/CodeGen/X86/sse-intrinsics-x86.ll   |  18 ---
 .../X86/sse2-intrinsics-x86-upgrade.ll        |  31 ++++
 llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll  |  48 ------
 .../MemorySanitizer/msan_basic.ll             |   2 +-
 14 files changed, 190 insertions(+), 333 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll

diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index 99cfd6c276e1..92843097cae1 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -259,13 +259,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
                       llvm_x86mmx_ty], [IntrNoMem]>;
 }
 
-// SIMD store ops
-let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
-  def int_x86_sse_storeu_ps : GCCBuiltin<"__builtin_ia32_storeups">,
-              Intrinsic<[], [llvm_ptr_ty,
-                             llvm_v4f32_ty], [IntrArgMemOnly]>;
-}
-
 // Cacheability support ops
 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
   def int_x86_sse_sfence : GCCBuiltin<"__builtin_ia32_sfence">,
@@ -525,16 +518,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
                 Intrinsic<[llvm_v2f64_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
 }
 
-// SIMD store ops
-let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
-  def int_x86_sse2_storeu_pd : GCCBuiltin<"__builtin_ia32_storeupd">,
-              Intrinsic<[], [llvm_ptr_ty,
-                             llvm_v2f64_ty], [IntrArgMemOnly]>;
-  def int_x86_sse2_storeu_dq : GCCBuiltin<"__builtin_ia32_storedqu">,
-              Intrinsic<[], [llvm_ptr_ty,
-                             llvm_v16i8_ty], [IntrArgMemOnly]>;
-}
-
 // Misc.
 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
   def int_x86_sse2_packsswb_128 : GCCBuiltin<"__builtin_ia32_packsswb128">,
@@ -1938,16 +1921,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
                 Intrinsic<[llvm_v32i8_ty], [llvm_ptr_ty], [IntrReadMem]>;
 }
 
-// SIMD store ops
-let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
-  def int_x86_avx_storeu_pd_256 : GCCBuiltin<"__builtin_ia32_storeupd256">,
-    Intrinsic<[], [llvm_ptr_ty, llvm_v4f64_ty], [IntrArgMemOnly]>;
-  def int_x86_avx_storeu_ps_256 : GCCBuiltin<"__builtin_ia32_storeups256">,
-    Intrinsic<[], [llvm_ptr_ty, llvm_v8f32_ty], [IntrArgMemOnly]>;
-  def int_x86_avx_storeu_dq_256 : GCCBuiltin<"__builtin_ia32_storedqu256">,
-    Intrinsic<[], [llvm_ptr_ty, llvm_v32i8_ty], [IntrArgMemOnly]>;
-}
-
 // Conditional load ops
 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
   def int_x86_avx_maskload_pd : GCCBuiltin<"__builtin_ia32_maskloadpd">,
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index b679bd685c23..ce0b10d8a776 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -191,6 +191,9 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
         Name == "x86.avx2.vextracti128" ||
         Name.startswith("x86.avx.movnt.") ||
         Name == "x86.sse2.storel.dq" ||
+        Name.startswith("x86.sse.storeu.") ||
+        Name.startswith("x86.sse2.storeu.") ||
+        Name.startswith("x86.avx.storeu.") ||
         Name == "x86.sse42.crc32.64.8" ||
         Name.startswith("x86.avx.vbroadcast.s") ||
         Name.startswith("x86.sse2.psll.dq") ||
@@ -439,6 +442,20 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
                                       "cast");
     Builder.CreateAlignedStore(Elt, BC, 1);
 
+    // Remove intrinsic.
+    CI->eraseFromParent();
+    return;
+  } else if (Name.startswith("llvm.x86.sse.storeu.") ||
+             Name.startswith("llvm.x86.sse2.storeu.") ||
+             Name.startswith("llvm.x86.avx.storeu.")) {
+    Value *Arg0 = CI->getArgOperand(0);
+    Value *Arg1 = CI->getArgOperand(1);
+
+    Arg0 = Builder.CreateBitCast(Arg0,
+                                 PointerType::getUnqual(Arg1->getType()),
+                                 "cast");
+    Builder.CreateAlignedStore(Arg1, Arg0, 1);
+
     // Remove intrinsic.
     CI->eraseFromParent();
     return;
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index f5a845b08d9f..3c52d1db28fe 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -905,11 +905,6 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
                      IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
 }
 
-def : Pat<(int_x86_avx_storeu_ps_256 addr:$dst, VR256:$src),
-          (VMOVUPSYmr addr:$dst, VR256:$src)>;
-def : Pat<(int_x86_avx_storeu_pd_256 addr:$dst, VR256:$src),
-          (VMOVUPDYmr addr:$dst, VR256:$src)>;
-
 // Aliases to help the assembler pick two byte VEX encodings by swapping the
 // operands relative to the normal instructions to use VEX.R instead of VEX.B.
 def : InstAlias<"vmovaps\t{$src, $dst|$dst, $src}",
@@ -965,20 +960,6 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
                      IIC_SSE_MOVU_P_RR>;
 }
 
-let Predicates = [HasAVX] in {
-  def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src),
-            (VMOVUPSmr addr:$dst, VR128:$src)>;
-  def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src),
-            (VMOVUPDmr addr:$dst, VR128:$src)>;
-}
-
-let Predicates = [UseSSE1] in
-  def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src),
-            (MOVUPSmr addr:$dst, VR128:$src)>;
-let Predicates = [UseSSE2] in
-  def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src),
-            (MOVUPDmr addr:$dst, VR128:$src)>;
-
 // Use vmovaps/vmovups for AVX integer load/store.
 let Predicates = [HasAVX, NoVLX] in {
   // 128-bit load/store
@@ -3887,16 +3868,6 @@ def MOVDQUmr :   I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
 
 } // ExeDomain = SSEPackedInt
 
-let Predicates = [HasAVX] in {
-  def : Pat<(int_x86_sse2_storeu_dq addr:$dst, VR128:$src),
-            (VMOVDQUmr addr:$dst, VR128:$src)>;
-  def : Pat<(int_x86_avx_storeu_dq_256 addr:$dst, VR256:$src),
-            (VMOVDQUYmr addr:$dst, VR256:$src)>;
-}
-let Predicates = [UseSSE2] in
-def : Pat<(int_x86_sse2_storeu_dq addr:$dst, VR128:$src),
-          (MOVDQUmr addr:$dst, VR128:$src)>;
-
 // Aliases to help the assembler pick two byte VEX encodings by swapping the
 // operands relative to the normal instructions to use VEX.R instead of VEX.B.
 def : InstAlias<"vmovdqa\t{$src, $dst|$dst, $src}",
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index c87d0466a9d4..e0ec74f814b1 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1397,32 +1397,6 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
     }
     break;
 
-  case Intrinsic::x86_sse_storeu_ps:
-  case Intrinsic::x86_sse2_storeu_pd:
-  case Intrinsic::x86_sse2_storeu_dq:
-    // Turn X86 storeu -> store if the pointer is known aligned.
-    if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, II, AC, DT) >=
-        16) {
-      Type *OpPtrTy =
-        PointerType::getUnqual(II->getArgOperand(1)->getType());
-      Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0), OpPtrTy);
-      return new StoreInst(II->getArgOperand(1), Ptr);
-    }
-    break;
-
-  case Intrinsic::x86_avx_storeu_ps_256:
-  case Intrinsic::x86_avx_storeu_pd_256:
-  case Intrinsic::x86_avx_storeu_dq_256:
-    // Turn X86 storeu -> store if the pointer is known aligned.
-    if (getOrEnforceKnownAlignment(II->getArgOperand(0), 32, DL, II, AC, DT) >=
-        32) {
-      Type *OpPtrTy =
-        PointerType::getUnqual(II->getArgOperand(1)->getType());
-      Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0), OpPtrTy);
-      return new StoreInst(II->getArgOperand(1), Ptr);
-    }
-    break;
-
   case Intrinsic::x86_vcvtph2ps_128:
   case Intrinsic::x86_vcvtph2ps_256: {
     auto Arg = II->getArgOperand(0);
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index ad70a70a0d6c..125f2cbc516f 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -684,12 +684,6 @@ static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
     switch (II->getIntrinsicID()) {
       default: break;
       case Intrinsic::prefetch:
-      case Intrinsic::x86_sse_storeu_ps:
-      case Intrinsic::x86_sse2_storeu_pd:
-      case Intrinsic::x86_sse2_storeu_dq:
-      case Intrinsic::x86_avx_storeu_ps_256:
-      case Intrinsic::x86_avx_storeu_pd_256:
-      case Intrinsic::x86_avx_storeu_dq_256:
         if (II->getArgOperand(0) == OperandVal)
           isAddress = true;
         break;
@@ -706,20 +700,6 @@ static MemAccessTy getAccessType(const Instruction *Inst) {
     AccessTy.AddrSpace = SI->getPointerAddressSpace();
   } else if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
     AccessTy.AddrSpace = LI->getPointerAddressSpace();
-  } else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
-    // Addressing modes can also be folded into prefetches and a variety
-    // of intrinsics.
-    switch (II->getIntrinsicID()) {
-    default: break;
-    case Intrinsic::x86_sse_storeu_ps:
-    case Intrinsic::x86_sse2_storeu_pd:
-    case Intrinsic::x86_sse2_storeu_dq:
-    case Intrinsic::x86_avx_storeu_ps_256:
-    case Intrinsic::x86_avx_storeu_pd_256:
-    case Intrinsic::x86_avx_storeu_dq_256:
-      AccessTy.MemTy = II->getArgOperand(0)->getType();
-      break;
-    }
   }
 
   // All pointers have the same requirements, so canonicalize them to an
diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
index f3a3b774e7af..bc89f7c1eb2e 100644
--- a/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
@@ -355,3 +355,98 @@ define <4 x double> @test_x86_avx_cvt_ps2_pd_256(<4 x float> %a0) {
   ret <4 x double> %res
 }
 declare <4 x double> @llvm.x86.avx.cvt.ps2.pd.256(<4 x float>) nounwind readnone
+
+
+define void @test_x86_sse2_storeu_dq(i8* %a0, <16 x i8> %a1) {
+  ; add operation forces the execution domain.
+; CHECK-LABEL: test_x86_sse2_storeu_dq:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    vpaddb LCPI32_0, %xmm0, %xmm0
+; CHECK-NEXT:    vmovdqu %xmm0, (%eax)
+; CHECK-NEXT:    retl
+  %a2 = add <16 x i8> %a1,
+  call void @llvm.x86.sse2.storeu.dq(i8* %a0, <16 x i8> %a2)
+  ret void
+}
+declare void @llvm.x86.sse2.storeu.dq(i8*, <16 x i8>) nounwind
+
+
+define void @test_x86_sse2_storeu_pd(i8* %a0, <2 x double> %a1) {
+  ; fadd operation forces the execution domain.
+; CHECK-LABEL: test_x86_sse2_storeu_pd:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
+; CHECK-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vmovupd %xmm0, (%eax)
+; CHECK-NEXT:    retl
+  %a2 = fadd <2 x double> %a1,
+  call void @llvm.x86.sse2.storeu.pd(i8* %a0, <2 x double> %a2)
+  ret void
+}
+declare void @llvm.x86.sse2.storeu.pd(i8*, <2 x double>) nounwind
+
+
+define void @test_x86_sse_storeu_ps(i8* %a0, <4 x float> %a1) {
+; CHECK-LABEL: test_x86_sse_storeu_ps:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    vmovups %xmm0, (%eax)
+; CHECK-NEXT:    retl
+  call void @llvm.x86.sse.storeu.ps(i8* %a0, <4 x float> %a1)
+  ret void
+}
+declare void @llvm.x86.sse.storeu.ps(i8*, <4 x float>) nounwind
+
+
+define void @test_x86_avx_storeu_dq_256(i8* %a0, <32 x i8> %a1) {
+  ; FIXME: unfortunately the execution domain fix pass changes this to vmovups and its hard to force with no 256-bit integer instructions
+  ; add operation forces the execution domain.
+; CHECK-LABEL: test_x86_avx_storeu_dq_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; CHECK-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT:    vmovups %ymm0, (%eax)
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retl
+  %a2 = add <32 x i8> %a1,
+  call void @llvm.x86.avx.storeu.dq.256(i8* %a0, <32 x i8> %a2)
+  ret void
+}
+declare void @llvm.x86.avx.storeu.dq.256(i8*, <32 x i8>) nounwind
+
+
+define void @test_x86_avx_storeu_pd_256(i8* %a0, <4 x double> %a1) {
+  ; add operation forces the execution domain.
+; CHECK-LABEL: test_x86_avx_storeu_pd_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
+; CHECK-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    vmovupd %ymm0, (%eax)
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retl
+  %a2 = fadd <4 x double> %a1,
+  call void @llvm.x86.avx.storeu.pd.256(i8* %a0, <4 x double> %a2)
+  ret void
+}
+declare void @llvm.x86.avx.storeu.pd.256(i8*, <4 x double>) nounwind
+
+
+define void @test_x86_avx_storeu_ps_256(i8* %a0, <8 x float> %a1) {
+; CHECK-LABEL: test_x86_avx_storeu_ps_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    vmovups %ymm0, (%eax)
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retl
+  call void @llvm.x86.avx.storeu.ps.256(i8* %a0, <8 x float> %a1)
+  ret void
+}
+declare void @llvm.x86.avx.storeu.ps.256(i8*, <8 x float>) nounwind
diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll b/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll
index 84f8f3cd150a..f5e1f3e210e2 100644
--- a/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll
+++ b/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll
@@ -1221,54 +1221,6 @@ define <2 x double> @test_x86_sse2_sqrt_sd(<2 x double> %a0) {
 declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
 
 
-define void @test_x86_sse2_storeu_dq(i8* %a0, <16 x i8> %a1) {
-  ; add operation forces the execution domain.
-; AVX-LABEL: test_x86_sse2_storeu_dq:
-; AVX:       ## BB#0:
-; AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX-NEXT:    vpaddb LCPI74_0, %xmm0, %xmm0
-; AVX-NEXT:    vmovdqu %xmm0, (%eax)
-; AVX-NEXT:    retl
-;
-; AVX512VL-LABEL: test_x86_sse2_storeu_dq:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512VL-NEXT:    vpaddb LCPI74_0, %xmm0, %xmm0
-; AVX512VL-NEXT:    vmovdqu %xmm0, (%eax)
-; AVX512VL-NEXT:    retl
-  %a2 = add <16 x i8> %a1,
-  call void @llvm.x86.sse2.storeu.dq(i8* %a0, <16 x i8> %a2)
-  ret void
-}
-declare void @llvm.x86.sse2.storeu.dq(i8*, <16 x i8>) nounwind
-
-
-define void @test_x86_sse2_storeu_pd(i8* %a0, <2 x double> %a1) {
-  ; fadd operation forces the execution domain.
-; AVX-LABEL: test_x86_sse2_storeu_pd:
-; AVX:       ## BB#0:
-; AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovupd %xmm0, (%eax)
-; AVX-NEXT:    retl
-;
-; AVX512VL-LABEL: test_x86_sse2_storeu_pd:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512VL-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
-; AVX512VL-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT:    vmovups %xmm0, (%eax)
-; AVX512VL-NEXT:    retl
-  %a2 = fadd <2 x double> %a1,
-  call void @llvm.x86.sse2.storeu.pd(i8* %a0, <2 x double> %a2)
-  ret void
-}
-declare void @llvm.x86.sse2.storeu.pd(i8*, <2 x double>) nounwind
-
-
 define <2 x double> @test_x86_sse2_sub_sd(<2 x double> %a0, <2 x double> %a1) {
 ; AVX-LABEL: test_x86_sse2_sub_sd:
 ; AVX:       ## BB#0:
@@ -2802,24 +2754,6 @@ define void @test_x86_sse_stmxcsr(i8* %a0) {
 declare void @llvm.x86.sse.stmxcsr(i8*) nounwind
 
 
-define void @test_x86_sse_storeu_ps(i8* %a0, <4 x float> %a1) {
-; AVX-LABEL: test_x86_sse_storeu_ps:
-; AVX:       ## BB#0:
-; AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX-NEXT:    vmovups %xmm0, (%eax)
-; AVX-NEXT:    retl
-;
-; AVX512VL-LABEL: test_x86_sse_storeu_ps:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512VL-NEXT:    vmovups %xmm0, (%eax)
-; AVX512VL-NEXT:    retl
-  call void @llvm.x86.sse.storeu.ps(i8* %a0, <4 x float> %a1)
-  ret void
-}
-declare void @llvm.x86.sse.storeu.ps(i8*, <4 x float>) nounwind
-
-
 define <4 x float> @test_x86_sse_sub_ss(<4 x float> %a0, <4 x float> %a1) {
 ; AVX-LABEL: test_x86_sse_sub_ss:
 ; AVX:       ## BB#0:
@@ -4012,78 +3946,6 @@ define <8 x float> @test_x86_avx_sqrt_ps_256(<8 x float> %a0) {
 declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
 
 
-define void @test_x86_avx_storeu_dq_256(i8* %a0, <32 x i8> %a1) {
-  ; FIXME: unfortunately the execution domain fix pass changes this to vmovups and its hard to force with no 256-bit integer instructions
-  ; add operation forces the execution domain.
-; AVX-LABEL: test_x86_avx_storeu_dq_256:
-; AVX:       ## BB#0:
-; AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX-NEXT:    vmovups %ymm0, (%eax)
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    retl
-;
-; AVX512VL-LABEL: test_x86_avx_storeu_dq_256:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512VL-NEXT:    vpaddb LCPI225_0, %ymm0, %ymm0
-; AVX512VL-NEXT:    vmovdqu %ymm0, (%eax)
-; AVX512VL-NEXT:    retl
-  %a2 = add <32 x i8> %a1,
-  call void @llvm.x86.avx.storeu.dq.256(i8* %a0, <32 x i8> %a2)
-  ret void
-}
-declare void @llvm.x86.avx.storeu.dq.256(i8*, <32 x i8>) nounwind
-
-
-define void @test_x86_avx_storeu_pd_256(i8* %a0, <4 x double> %a1) {
-  ; add operation forces the execution domain.
-; AVX-LABEL: test_x86_avx_storeu_pd_256:
-; AVX:       ## BB#0:
-; AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
-; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    vmovupd %ymm0, (%eax)
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    retl
-;
-; AVX512VL-LABEL: test_x86_avx_storeu_pd_256:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512VL-NEXT:    vpxord %ymm1, %ymm1, %ymm1
-; AVX512VL-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
-; AVX512VL-NEXT:    vmovups %ymm0, (%eax)
-; AVX512VL-NEXT:    retl
-  %a2 = fadd <4 x double> %a1,
-  call void @llvm.x86.avx.storeu.pd.256(i8* %a0, <4 x double> %a2)
-  ret void
-}
-declare void @llvm.x86.avx.storeu.pd.256(i8*, <4 x double>) nounwind
-
-
-define void @test_x86_avx_storeu_ps_256(i8* %a0, <8 x float> %a1) {
-; AVX-LABEL: test_x86_avx_storeu_ps_256:
-; AVX:       ## BB#0:
-; AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX-NEXT:    vmovups %ymm0, (%eax)
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    retl
-;
-; AVX512VL-LABEL: test_x86_avx_storeu_ps_256:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512VL-NEXT:    vmovups %ymm0, (%eax)
-; AVX512VL-NEXT:    retl
-  call void @llvm.x86.avx.storeu.ps.256(i8* %a0, <8 x float> %a1)
-  ret void
-}
-declare void @llvm.x86.avx.storeu.ps.256(i8*, <8 x float>) nounwind
-
-
 define <4 x double> @test_x86_avx_vbroadcastf128_pd_256(i8* %a0) {
 ; AVX-LABEL: test_x86_avx_vbroadcastf128_pd_256:
 ; AVX:       ## BB#0:
@@ -4271,7 +4133,7 @@ define <4 x double> @test_x86_avx_vpermilvar_pd_256_2(<4 x double> %a0) {
 ;
 ; AVX512VL-LABEL: test_x86_avx_vpermilvar_pd_256_2:
 ; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpermilpd LCPI239_0, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpermilpd LCPI233_0, %ymm0, %ymm0
 ; AVX512VL-NEXT:    retl
   %res = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> ) ; <<4 x double>> [#uses=1]
   ret <4 x double> %res
@@ -4763,7 +4625,7 @@ define void @movnt_dq(i8* %p, <2 x i64> %a1) nounwind {
 ; AVX-LABEL: movnt_dq:
 ; AVX:       ## BB#0:
 ; AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX-NEXT:    vpaddq LCPI266_0, %xmm0, %xmm0
+; AVX-NEXT:    vpaddq LCPI260_0, %xmm0, %xmm0
 ; AVX-NEXT:    vmovntdq %ymm0, (%eax)
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retl
@@ -4771,7 +4633,7 @@ define void @movnt_dq(i8* %p, <2 x i64> %a1) nounwind {
 ; AVX512VL-LABEL: movnt_dq:
 ; AVX512VL:       ## BB#0:
 ; AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512VL-NEXT:    vpaddq LCPI266_0, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpaddq LCPI260_0, %xmm0, %xmm0
 ; AVX512VL-NEXT:    vmovntdq %ymm0, (%eax)
 ; AVX512VL-NEXT:    retl
   %a2 = add <2 x i64> %a1,
diff --git a/llvm/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
index b537a700852c..2c7d055fe2a0 100644
--- a/llvm/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
@@ -365,3 +365,19 @@ define <4 x i64> @test_x86_avx2_pmovzxwq(<8 x i16> %a0) {
   ret <4 x i64> %res
 }
 declare <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16>) nounwind readnone
+
+; This is checked here because the execution dependency fix pass makes it hard to test in AVX mode since we don't have 256-bit integer instructions
+define void @test_x86_avx_storeu_dq_256(i8* %a0, <32 x i8> %a1) {
+  ; add operation forces the execution domain.
+; CHECK-LABEL: test_x86_avx_storeu_dq_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    vpaddb LCPI33_0, %ymm0, %ymm0
+; CHECK-NEXT:    vmovdqu %ymm0, (%eax)
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retl
+  %a2 = add <32 x i8> %a1,
+  call void @llvm.x86.avx.storeu.dq.256(i8* %a0, <32 x i8> %a2)
+  ret void
+}
+declare void @llvm.x86.avx.storeu.dq.256(i8*, <32 x i8>) nounwind
diff --git a/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll b/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll
index b5c4dbcb777b..820a87aeab1f 100644
--- a/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll
+++ b/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll
@@ -1475,29 +1475,6 @@ define <8 x i32> @test_x86_avx2_psrav_d_256(<8 x i32> %a0, <8 x i32> %a1) {
 }
 declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone
 
-; This is checked here because the execution dependency fix pass makes it hard to test in AVX mode since we don't have 256-bit integer instructions
-define void @test_x86_avx_storeu_dq_256(i8* %a0, <32 x i8> %a1) {
-  ; add operation forces the execution domain.
-; AVX2-LABEL: test_x86_avx_storeu_dq_256:
-; AVX2:       ## BB#0:
-; AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX2-NEXT:    vpaddb LCPI91_0, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqu %ymm0, (%eax)
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retl
-;
-; AVX512VL-LABEL: test_x86_avx_storeu_dq_256:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512VL-NEXT:    vpaddb LCPI91_0, %ymm0, %ymm0
-; AVX512VL-NEXT:    vmovdqu %ymm0, (%eax)
-; AVX512VL-NEXT:    retl
-  %a2 = add <32 x i8> %a1,
-  call void @llvm.x86.avx.storeu.dq.256(i8* %a0, <32 x i8> %a2)
-  ret void
-}
-declare void @llvm.x86.avx.storeu.dq.256(i8*, <32 x i8>) nounwind
-
 define <2 x double> @test_x86_avx2_gather_d_pd(<2 x double> %a0, i8* %a1, <4 x i32> %idx, <2 x double> %mask) {
 ; AVX2-LABEL: test_x86_avx2_gather_d_pd:
 ; AVX2:       ## BB#0:
diff --git a/llvm/test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll
new file mode 100644
index 000000000000..2900c277f124
--- /dev/null
+++ b/llvm/test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll
@@ -0,0 +1,27 @@
+; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 | FileCheck %s
+
+define void @test_x86_sse_storeu_ps(i8* %a0, <4 x float> %a1) {
+; SSE-LABEL: test_x86_sse_storeu_ps:
+; SSE:       ## BB#0:
+; SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SSE-NEXT:    movups %xmm0, (%eax)
+; SSE-NEXT:    retl
+;
+; KNL-LABEL: test_x86_sse_storeu_ps:
+; KNL:       ## BB#0:
+; KNL-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; KNL-NEXT:    vmovups %xmm0, (%eax)
+; KNL-NEXT:    retl
+; CHECK-LABEL: test_x86_sse_storeu_ps:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movups %xmm0, (%eax)
+; CHECK-NEXT:    retl
+  call void @llvm.x86.sse.storeu.ps(i8* %a0, <4 x float> %a1)
+  ret void
+}
+declare void @llvm.x86.sse.storeu.ps(i8*, <4 x float>) nounwind
+
+
diff --git a/llvm/test/CodeGen/X86/sse-intrinsics-x86.ll b/llvm/test/CodeGen/X86/sse-intrinsics-x86.ll
index 86b52419a391..c346064e7aa8 100644
--- a/llvm/test/CodeGen/X86/sse-intrinsics-x86.ll
+++ b/llvm/test/CodeGen/X86/sse-intrinsics-x86.ll
@@ -474,24 +474,6 @@ define void @test_x86_sse_stmxcsr(i8* %a0) {
 declare void @llvm.x86.sse.stmxcsr(i8*) nounwind
 
 
-define void @test_x86_sse_storeu_ps(i8* %a0, <4 x float> %a1) {
-; SSE-LABEL: test_x86_sse_storeu_ps:
-; SSE:       ## BB#0:
-; SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE-NEXT:    movups %xmm0, (%eax)
-; SSE-NEXT:    retl
-;
-; KNL-LABEL: test_x86_sse_storeu_ps:
-; KNL:       ## BB#0:
-; KNL-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; KNL-NEXT:    vmovups %xmm0, (%eax)
-; KNL-NEXT:    retl
-  call void @llvm.x86.sse.storeu.ps(i8* %a0, <4 x float> %a1)
-  ret void
-}
-declare void @llvm.x86.sse.storeu.ps(i8*, <4 x float>) nounwind
-
-
 define <4 x float> @test_x86_sse_sub_ss(<4 x float> %a0, <4 x float> %a1) {
 ; SSE-LABEL: test_x86_sse_sub_ss:
 ; SSE:       ## BB#0:
diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
index 1725e8f8c2b9..42d7c26d42b2 100644
--- a/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
+++ b/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
@@ -96,4 +96,35 @@ define void @test_x86_sse2_storel_dq(i8* %a0, <4 x i32> %a1) {
 declare void @llvm.x86.sse2.storel.dq(i8*, <4 x i32>) nounwind
 
 
+define void @test_x86_sse2_storeu_dq(i8* %a0, <16 x i8> %a1) {
+  ; add operation forces the execution domain.
+; CHECK-LABEL: test_x86_sse2_storeu_dq:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    paddb LCPI7_0, %xmm0
+; CHECK-NEXT:    movdqu %xmm0, (%eax)
+; CHECK-NEXT:    retl
+  %a2 = add <16 x i8> %a1,
+  call void @llvm.x86.sse2.storeu.dq(i8* %a0, <16 x i8> %a2)
+  ret void
+}
+declare void @llvm.x86.sse2.storeu.dq(i8*, <16 x i8>) nounwind
+
+
+define void @test_x86_sse2_storeu_pd(i8* %a0, <2 x double> %a1) {
+  ; fadd operation forces the execution domain.
+; CHECK-LABEL: test_x86_sse2_storeu_pd:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
+; CHECK-NEXT:    addpd %xmm0, %xmm1
+; CHECK-NEXT:    movupd %xmm1, (%eax)
+; CHECK-NEXT:    retl
+  %a2 = fadd <2 x double> %a1,
+  call void @llvm.x86.sse2.storeu.pd(i8* %a0, <2 x double> %a2)
+  ret void
+}
+declare void @llvm.x86.sse2.storeu.pd(i8*, <2 x double>) nounwind
+
diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll
index 25f73055091b..d06ef2807e9e 100644
--- a/llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll
+++ b/llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll
@@ -1125,54 +1125,6 @@ define <2 x double> @test_x86_sse2_sqrt_sd(<2 x double> %a0) {
 declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
 
 
-define void @test_x86_sse2_storeu_dq(i8* %a0, <16 x i8> %a1) {
-  ; add operation forces the execution domain.
-; SSE-LABEL: test_x86_sse2_storeu_dq:
-; SSE:       ## BB#0:
-; SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE-NEXT:    paddb LCPI68_0, %xmm0
-; SSE-NEXT:    movdqu %xmm0, (%eax)
-; SSE-NEXT:    retl
-;
-; KNL-LABEL: test_x86_sse2_storeu_dq:
-; KNL:       ## BB#0:
-; KNL-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; KNL-NEXT:    vpaddb LCPI68_0, %xmm0, %xmm0
-; KNL-NEXT:    vmovdqu %xmm0, (%eax)
-; KNL-NEXT:    retl
-  %a2 = add <16 x i8> %a1,
-  call void @llvm.x86.sse2.storeu.dq(i8* %a0, <16 x i8> %a2)
-  ret void
-}
-declare void @llvm.x86.sse2.storeu.dq(i8*, <16 x i8>) nounwind
-
-
-define void @test_x86_sse2_storeu_pd(i8* %a0, <2 x double> %a1) {
-  ; fadd operation forces the execution domain.
-; SSE-LABEL: test_x86_sse2_storeu_pd:
-; SSE:       ## BB#0:
-; SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; SSE-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
-; SSE-NEXT:    addpd %xmm0, %xmm1
-; SSE-NEXT:    movupd %xmm1, (%eax)
-; SSE-NEXT:    retl
-;
-; KNL-LABEL: test_x86_sse2_storeu_pd:
-; KNL:       ## BB#0:
-; KNL-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; KNL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; KNL-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
-; KNL-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
-; KNL-NEXT:    vmovupd %xmm0, (%eax)
-; KNL-NEXT:    retl
-  %a2 = fadd <2 x double> %a1,
-  call void @llvm.x86.sse2.storeu.pd(i8* %a0, <2 x double> %a2)
-  ret void
-}
-declare void @llvm.x86.sse2.storeu.pd(i8*, <2 x double>) nounwind
-
-
 define <2 x double> @test_x86_sse2_sub_sd(<2 x double> %a0, <2 x double> %a1) {
 ; SSE-LABEL: test_x86_sse2_sub_sd:
 ; SSE:       ## BB#0:
diff --git a/llvm/test/Instrumentation/MemorySanitizer/msan_basic.ll b/llvm/test/Instrumentation/MemorySanitizer/msan_basic.ll
index 91e2a9087a99..4b208d64427b 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/msan_basic.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/msan_basic.ll
@@ -631,7 +631,7 @@ declare void @llvm.x86.sse.storeu.ps(i8*, <4 x float>) nounwind
 ; CHECK-NOT: br
 ; CHECK-NOT: = or
 ; CHECK: store <4 x i32> {{.*}} align 1
-; CHECK: call void @llvm.x86.sse.storeu.ps
+; CHECK: store <4 x float> %{{.*}}, <4 x float>* %{{.*}}, align 1{{$}}
 ; CHECK: ret void
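
Reviewer note (not part of the patch): the upgrade path added in AutoUpgrade.cpp above replaces calls to the removed intrinsics with a bitcast of the pointer operand followed by an align-1 store. A minimal LLVM IR sketch of the before/after shape, using a hypothetical function @store_unaligned and values %p/%v purely for illustration:

  ; Old form (pre-upgrade), using the removed intrinsic
  ; (hypothetical example, not taken from the test suite):
  define void @store_unaligned(i8* %p, <4 x float> %v) {
    call void @llvm.x86.sse.storeu.ps(i8* %p, <4 x float> %v)
    ret void
  }
  declare void @llvm.x86.sse.storeu.ps(i8*, <4 x float>)

  ; Roughly what UpgradeIntrinsicCall now emits in its place:
  define void @store_unaligned(i8* %p, <4 x float> %v) {
    %cast = bitcast i8* %p to <4 x float>*
    store <4 x float> %v, <4 x float>* %cast, align 1
    ret void
  }

The unaligned store still lowers to movups/vmovups and friends, as the updated CHECK lines show, so the selection patterns in X86InstrSSE.td and the intrinsic-specific handling in InstCombineCalls.cpp and LoopStrengthReduce.cpp become unnecessary and are removed.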