From 20bc37c7db4ead75170d7e70145ea126a92f4a01 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 19 Jan 2015 22:40:45 +0000 Subject: [PATCH] [X86][AVX] Missing AVX1 memory folding float instructions Now that we can create much more exhaustive X86 memory folding tests, this patch adds the missing AVX1/F16C floating point instruction stack foldings we can easily test for including the scalar intrinsics (add, div, max, min, mul, sub), conversions float/int to double, half precision conversions, rounding, dot product and bit test. The patch also adds a couple of obviously missing SSE instructions (more to follow once we have full SSE testing). Now that scalar folding is working it broke a very old test (2006-10-07-ScalarSSEMiscompile.ll) - this test appears to make no sense as its trying to ensure that a scalar subtraction isn't folded as it 'would zero the top elts of the loaded vector' - this test just appears to be wrong to me. Differential Revision: http://reviews.llvm.org/D7055 llvm-svn: 226513 --- llvm/lib/Target/X86/X86InstrInfo.cpp | 56 +- .../X86/2006-10-07-ScalarSSEMiscompile.ll | 15 - .../CodeGen/X86/avx1-stack-reload-folding.ll | 636 +++++++++++++----- 3 files changed, 511 insertions(+), 196 deletions(-) delete mode 100644 llvm/test/CodeGen/X86/2006-10-07-ScalarSSEMiscompile.ll diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 461569345a11..dbbe3ad1fc4a 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -407,7 +407,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMOVDQU8Z128rr, X86::VMOVDQU8Z128mr, TB_FOLDED_STORE }, { X86::VMOVDQU16Z128rr, X86::VMOVDQU16Z128mr, TB_FOLDED_STORE }, { X86::VMOVDQU32Z128rr, X86::VMOVDQU32Z128mr, TB_FOLDED_STORE }, - { X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128mr, TB_FOLDED_STORE } + { X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128mr, TB_FOLDED_STORE }, + // F16C foldable instructions + { X86::VCVTPS2PHrr, X86::VCVTPS2PHmr, TB_FOLDED_STORE }, + { X86::VCVTPS2PHYrr, X86::VCVTPS2PHYmr, TB_FOLDED_STORE } }; for (unsigned i = 0, e = array_lengthof(OpTbl0); i != e; ++i) { @@ -445,10 +448,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::CVTSD2SIrr, X86::CVTSD2SIrm, 0 }, { X86::CVTSS2SI64rr, X86::CVTSS2SI64rm, 0 }, { X86::CVTSS2SIrr, X86::CVTSS2SIrm, 0 }, + { X86::CVTDQ2PDrr, X86::CVTDQ2PDrm, TB_ALIGN_16 }, { X86::CVTDQ2PSrr, X86::CVTDQ2PSrm, TB_ALIGN_16 }, { X86::CVTPD2DQrr, X86::CVTPD2DQrm, TB_ALIGN_16 }, { X86::CVTPD2PSrr, X86::CVTPD2PSrm, TB_ALIGN_16 }, { X86::CVTPS2DQrr, X86::CVTPS2DQrm, TB_ALIGN_16 }, + { X86::CVTPS2PDrr, X86::CVTPS2PDrm, TB_ALIGN_16 }, { X86::CVTTPD2DQrr, X86::CVTTPD2DQrm, TB_ALIGN_16 }, { X86::CVTTPS2DQrr, X86::CVTTPS2DQrm, TB_ALIGN_16 }, { X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm, 0 }, @@ -493,6 +498,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::PSHUFLWri, X86::PSHUFLWmi, TB_ALIGN_16 }, { X86::RCPPSr, X86::RCPPSm, TB_ALIGN_16 }, { X86::RCPPSr_Int, X86::RCPPSm_Int, TB_ALIGN_16 }, + { X86::ROUNDPDr, X86::ROUNDPDm, TB_ALIGN_16 }, + { X86::ROUNDPSr, X86::ROUNDPSm, TB_ALIGN_16 }, { X86::RSQRTPSr, X86::RSQRTPSm, TB_ALIGN_16 }, { X86::RSQRTPSr_Int, X86::RSQRTPSm_Int, TB_ALIGN_16 }, { X86::RSQRTSSr, X86::RSQRTSSm, 0 }, @@ -527,10 +534,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VCVTSD2SIrr, X86::VCVTSD2SIrm, 0 }, { X86::VCVTSS2SI64rr, X86::VCVTSS2SI64rm, 0 }, { X86::VCVTSS2SIrr, X86::VCVTSS2SIrm, 0 }, + { X86::VCVTDQ2PDrr, X86::VCVTDQ2PDrm, 0 }, { X86::VCVTDQ2PSrr, X86::VCVTDQ2PSrm, 0 }, { X86::VCVTPD2DQrr, X86::VCVTPD2DQXrm, 0 }, { X86::VCVTPD2PSrr, X86::VCVTPD2PSXrm, 0 }, { X86::VCVTPS2DQrr, X86::VCVTPS2DQrm, 0 }, + { X86::VCVTPS2PDrr, X86::VCVTPS2PDrm, 0 }, { X86::VCVTTPD2DQrr, X86::VCVTTPD2DQXrm, 0 }, { X86::VCVTTPS2DQrr, X86::VCVTTPS2DQrm, 0 }, { X86::VMOV64toPQIrr, X86::VMOVQI2PQIrm, 0 }, @@ -557,19 +566,25 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPSHUFLWri, X86::VPSHUFLWmi, 0 }, { X86::VRCPPSr, X86::VRCPPSm, 0 }, { X86::VRCPPSr_Int, X86::VRCPPSm_Int, 0 }, + { X86::VROUNDPDr, X86::VROUNDPDm, 0 }, + { X86::VROUNDPSr, X86::VROUNDPSm, 0 }, { X86::VRSQRTPSr, X86::VRSQRTPSm, 0 }, { X86::VRSQRTPSr_Int, X86::VRSQRTPSm_Int, 0 }, { X86::VSQRTPDr, X86::VSQRTPDm, 0 }, { X86::VSQRTPSr, X86::VSQRTPSm, 0 }, + { X86::VTESTPDrr, X86::VTESTPDrm, 0 }, + { X86::VTESTPSrr, X86::VTESTPSrm, 0 }, { X86::VUCOMISDrr, X86::VUCOMISDrm, 0 }, { X86::VUCOMISSrr, X86::VUCOMISSrm, 0 }, { X86::VBROADCASTSSrr, X86::VBROADCASTSSrm, TB_NO_REVERSE }, // AVX 256-bit foldable instructions + { X86::VCVTDQ2PDYrr, X86::VCVTDQ2PDYrm, 0 }, { X86::VCVTDQ2PSYrr, X86::VCVTDQ2PSYrm, 0 }, { X86::VCVTPD2DQYrr, X86::VCVTPD2DQYrm, 0 }, { X86::VCVTPD2PSYrr, X86::VCVTPD2PSYrm, 0 }, { X86::VCVTPS2DQYrr, X86::VCVTPS2DQYrm, 0 }, + { X86::VCVTPS2PDYrr, X86::VCVTPS2PDYrm, 0 }, { X86::VCVTTPD2DQYrr, X86::VCVTTPD2DQYrm, 0 }, { X86::VCVTTPS2DQYrr, X86::VCVTTPS2DQYrm, 0 }, { X86::VMOVAPDYrr, X86::VMOVAPDYrm, TB_ALIGN_32 }, @@ -581,9 +596,14 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPERMILPSYri, X86::VPERMILPSYmi, 0 }, { X86::VRCPPSYr, X86::VRCPPSYm, 0 }, { X86::VRCPPSYr_Int, X86::VRCPPSYm_Int, 0 }, + { X86::VROUNDYPDr, X86::VROUNDYPDm, 0 }, + { X86::VROUNDYPSr, X86::VROUNDYPSm, 0 }, { X86::VRSQRTPSYr, X86::VRSQRTPSYm, 0 }, + { X86::VRSQRTPSYr_Int, X86::VRSQRTPSYm_Int, 0 }, { X86::VSQRTPDYr, X86::VSQRTPDYm, 0 }, { X86::VSQRTPSYr, X86::VSQRTPSYm, 0 }, + { X86::VTESTPDYrr, X86::VTESTPDYrm, 0 }, + { X86::VTESTPSYrr, X86::VTESTPSYrm, 0 }, { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm, TB_NO_REVERSE }, { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm, TB_NO_REVERSE }, @@ -686,7 +706,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMOVUPDZ128rr, X86::VMOVUPDZ128rm, 0 }, { X86::VMOVUPSZ128rr, X86::VMOVUPSZ128rm, 0 }, { X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128m, TB_NO_REVERSE }, - + // F16C foldable instructions + { X86::VCVTPH2PSrr, X86::VCVTPH2PSrm, 0 }, + { X86::VCVTPH2PSYrr, X86::VCVTPH2PSYrm, 0 }, // AES foldable instructions { X86::AESIMCrr, X86::AESIMCrm, TB_ALIGN_16 }, { X86::AESKEYGENASSIST128rr, X86::AESKEYGENASSIST128rm, TB_ALIGN_16 }, @@ -717,7 +739,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::ADDPDrr, X86::ADDPDrm, TB_ALIGN_16 }, { X86::ADDPSrr, X86::ADDPSrm, TB_ALIGN_16 }, { X86::ADDSDrr, X86::ADDSDrm, 0 }, + { X86::ADDSDrr_Int, X86::ADDSDrm_Int, 0 }, { X86::ADDSSrr, X86::ADDSSrm, 0 }, + { X86::ADDSSrr_Int, X86::ADDSSrm_Int, 0 }, { X86::ADDSUBPDrr, X86::ADDSUBPDrm, TB_ALIGN_16 }, { X86::ADDSUBPSrr, X86::ADDSUBPSrm, TB_ALIGN_16 }, { X86::AND16rr, X86::AND16rm, 0 }, @@ -787,7 +811,11 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::DIVPDrr, X86::DIVPDrm, TB_ALIGN_16 }, { X86::DIVPSrr, X86::DIVPSrm, TB_ALIGN_16 }, { X86::DIVSDrr, X86::DIVSDrm, 0 }, + { X86::DIVSDrr_Int, X86::DIVSDrm_Int, 0 }, { X86::DIVSSrr, X86::DIVSSrm, 0 }, + { X86::DIVSSrr_Int, X86::DIVSSrm_Int, 0 }, + { X86::DPPDrri, X86::DPPDrmi, TB_ALIGN_16 }, + { X86::DPPSrri, X86::DPPSrmi, TB_ALIGN_16 }, { X86::FsANDNPDrr, X86::FsANDNPDrm, TB_ALIGN_16 }, { X86::FsANDNPSrr, X86::FsANDNPSrm, TB_ALIGN_16 }, { X86::FsANDPDrr, X86::FsANDPDrm, TB_ALIGN_16 }, @@ -814,16 +842,22 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::MAXPDrr, X86::MAXPDrm, TB_ALIGN_16 }, { X86::MAXPSrr, X86::MAXPSrm, TB_ALIGN_16 }, { X86::MAXSDrr, X86::MAXSDrm, 0 }, + { X86::MAXSDrr_Int, X86::MAXSDrm_Int, 0 }, { X86::MAXSSrr, X86::MAXSSrm, 0 }, + { X86::MAXSSrr_Int, X86::MAXSSrm_Int, 0 }, { X86::MINPDrr, X86::MINPDrm, TB_ALIGN_16 }, { X86::MINPSrr, X86::MINPSrm, TB_ALIGN_16 }, { X86::MINSDrr, X86::MINSDrm, 0 }, + { X86::MINSDrr_Int, X86::MINSDrm_Int, 0 }, { X86::MINSSrr, X86::MINSSrm, 0 }, + { X86::MINSSrr_Int, X86::MINSSrm_Int, 0 }, { X86::MPSADBWrri, X86::MPSADBWrmi, TB_ALIGN_16 }, { X86::MULPDrr, X86::MULPDrm, TB_ALIGN_16 }, { X86::MULPSrr, X86::MULPSrm, TB_ALIGN_16 }, { X86::MULSDrr, X86::MULSDrm, 0 }, + { X86::MULSDrr_Int, X86::MULSDrm_Int, 0 }, { X86::MULSSrr, X86::MULSSrm, 0 }, + { X86::MULSSrr_Int, X86::MULSSrm_Int, 0 }, { X86::OR16rr, X86::OR16rm, 0 }, { X86::OR32rr, X86::OR32rm, 0 }, { X86::OR64rr, X86::OR64rm, 0 }, @@ -923,7 +957,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::SUBPDrr, X86::SUBPDrm, TB_ALIGN_16 }, { X86::SUBPSrr, X86::SUBPSrm, TB_ALIGN_16 }, { X86::SUBSDrr, X86::SUBSDrm, 0 }, + { X86::SUBSDrr_Int, X86::SUBSDrm_Int, 0 }, { X86::SUBSSrr, X86::SUBSSrm, 0 }, + { X86::SUBSSrr_Int, X86::SUBSSrm_Int, 0 }, // FIXME: TEST*rr -> swapped operand of TEST*mr. { X86::UNPCKHPDrr, X86::UNPCKHPDrm, TB_ALIGN_16 }, { X86::UNPCKHPSrr, X86::UNPCKHPSrm, TB_ALIGN_16 }, @@ -948,13 +984,16 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::Int_VCVTSI2SSrr, X86::Int_VCVTSI2SSrm, 0 }, { X86::VCVTSS2SDrr, X86::VCVTSS2SDrm, 0 }, { X86::Int_VCVTSS2SDrr, X86::Int_VCVTSS2SDrm, 0 }, + { X86::VRCPSSr, X86::VRCPSSm, 0 }, { X86::VRSQRTSSr, X86::VRSQRTSSm, 0 }, { X86::VSQRTSDr, X86::VSQRTSDm, 0 }, { X86::VSQRTSSr, X86::VSQRTSSm, 0 }, { X86::VADDPDrr, X86::VADDPDrm, 0 }, { X86::VADDPSrr, X86::VADDPSrm, 0 }, { X86::VADDSDrr, X86::VADDSDrm, 0 }, + { X86::VADDSDrr_Int, X86::VADDSDrm_Int, 0 }, { X86::VADDSSrr, X86::VADDSSrm, 0 }, + { X86::VADDSSrr_Int, X86::VADDSSrm_Int, 0 }, { X86::VADDSUBPDrr, X86::VADDSUBPDrm, 0 }, { X86::VADDSUBPSrr, X86::VADDSUBPSrm, 0 }, { X86::VANDNPDrr, X86::VANDNPDrm, 0 }, @@ -972,7 +1011,11 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VDIVPDrr, X86::VDIVPDrm, 0 }, { X86::VDIVPSrr, X86::VDIVPSrm, 0 }, { X86::VDIVSDrr, X86::VDIVSDrm, 0 }, + { X86::VDIVSDrr_Int, X86::VDIVSDrm_Int, 0 }, { X86::VDIVSSrr, X86::VDIVSSrm, 0 }, + { X86::VDIVSSrr_Int, X86::VDIVSSrm_Int, 0 }, + { X86::VDPPDrri, X86::VDPPDrmi, 0 }, + { X86::VDPPSrri, X86::VDPPSrmi, 0 }, { X86::VFsANDNPDrr, X86::VFsANDNPDrm, TB_ALIGN_16 }, { X86::VFsANDNPSrr, X86::VFsANDNPSrm, TB_ALIGN_16 }, { X86::VFsANDPDrr, X86::VFsANDPDrm, TB_ALIGN_16 }, @@ -990,16 +1033,22 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMAXPDrr, X86::VMAXPDrm, 0 }, { X86::VMAXPSrr, X86::VMAXPSrm, 0 }, { X86::VMAXSDrr, X86::VMAXSDrm, 0 }, + { X86::VMAXSDrr_Int, X86::VMAXSDrm_Int, 0 }, { X86::VMAXSSrr, X86::VMAXSSrm, 0 }, + { X86::VMAXSSrr_Int, X86::VMAXSSrm_Int, 0 }, { X86::VMINPDrr, X86::VMINPDrm, 0 }, { X86::VMINPSrr, X86::VMINPSrm, 0 }, { X86::VMINSDrr, X86::VMINSDrm, 0 }, + { X86::VMINSDrr_Int, X86::VMINSDrm_Int, 0 }, { X86::VMINSSrr, X86::VMINSSrm, 0 }, + { X86::VMINSSrr_Int, X86::VMINSSrm_Int, 0 }, { X86::VMPSADBWrri, X86::VMPSADBWrmi, 0 }, { X86::VMULPDrr, X86::VMULPDrm, 0 }, { X86::VMULPSrr, X86::VMULPSrm, 0 }, { X86::VMULSDrr, X86::VMULSDrm, 0 }, + { X86::VMULSDrr_Int, X86::VMULSDrm_Int, 0 }, { X86::VMULSSrr, X86::VMULSSrm, 0 }, + { X86::VMULSSrr_Int, X86::VMULSSrm_Int, 0 }, { X86::VORPDrr, X86::VORPDrm, 0 }, { X86::VORPSrr, X86::VORPSrm, 0 }, { X86::VPACKSSDWrr, X86::VPACKSSDWrm, 0 }, @@ -1091,7 +1140,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VSUBPDrr, X86::VSUBPDrm, 0 }, { X86::VSUBPSrr, X86::VSUBPSrm, 0 }, { X86::VSUBSDrr, X86::VSUBSDrm, 0 }, + { X86::VSUBSDrr_Int, X86::VSUBSDrm_Int, 0 }, { X86::VSUBSSrr, X86::VSUBSSrm, 0 }, + { X86::VSUBSSrr_Int, X86::VSUBSSrm_Int, 0 }, { X86::VUNPCKHPDrr, X86::VUNPCKHPDrm, 0 }, { X86::VUNPCKHPSrr, X86::VUNPCKHPSrm, 0 }, { X86::VUNPCKLPDrr, X86::VUNPCKLPDrm, 0 }, @@ -1115,6 +1166,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VCMPPSYrri, X86::VCMPPSYrmi, 0 }, { X86::VDIVPDYrr, X86::VDIVPDYrm, 0 }, { X86::VDIVPSYrr, X86::VDIVPSYrm, 0 }, + { X86::VDPPSYrri, X86::VDPPSYrmi, 0 }, { X86::VHADDPDYrr, X86::VHADDPDYrm, 0 }, { X86::VHADDPSYrr, X86::VHADDPSYrm, 0 }, { X86::VHSUBPDYrr, X86::VHSUBPDYrm, 0 }, diff --git a/llvm/test/CodeGen/X86/2006-10-07-ScalarSSEMiscompile.ll b/llvm/test/CodeGen/X86/2006-10-07-ScalarSSEMiscompile.ll deleted file mode 100644 index d09d06147696..000000000000 --- a/llvm/test/CodeGen/X86/2006-10-07-ScalarSSEMiscompile.ll +++ /dev/null @@ -1,15 +0,0 @@ -; RUN: llc < %s -march=x86 -mattr=sse | grep movaps -; Test that the load is NOT folded into the intrinsic, which would zero the top -; elts of the loaded vector. - -target datalayout = "e-p:32:32" -target triple = "i686-apple-darwin8.7.2" - -define <4 x float> @test(<4 x float> %A, <4 x float>* %B) nounwind { - %BV = load <4 x float>* %B ; <<4 x float>> [#uses=1] - %tmp28 = tail call <4 x float> @llvm.x86.sse.sub.ss( <4 x float> %A, <4 x float> %BV ) ; <<4 x float>> [#uses=1] - ret <4 x float> %tmp28 -} - -declare <4 x float> @llvm.x86.sse.sub.ss(<4 x float>, <4 x float>) - diff --git a/llvm/test/CodeGen/X86/avx1-stack-reload-folding.ll b/llvm/test/CodeGen/X86/avx1-stack-reload-folding.ll index e40579144380..37dde9b7cdea 100644 --- a/llvm/test/CodeGen/X86/avx1-stack-reload-folding.ll +++ b/llvm/test/CodeGen/X86/avx1-stack-reload-folding.ll @@ -1,7 +1,7 @@ -; RUN: llc -O3 -disable-peephole -mcpu=corei7-avx -mattr=+avx < %s | FileCheck %s - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-unknown" +; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx,+f16c < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-unknown" ; Stack reload folding tests. ; @@ -45,24 +45,36 @@ define double @stack_fold_addsd(double %a0, double %a1) { ;CHECK: vaddsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() %2 = fadd double %a0, %a1 - ret double %2 -} - -; TODO stack_fold_addsd_int -declare <2 x double> @llvm.x86.sse2.add.sd(<2 x double>, <2 x double>) nounwind readnone - -define float @stack_fold_addss(float %a0, float %a1) { + ret double %2 +} + +define <2 x double> @stack_fold_addsd_int(<2 x double> %a0, <2 x double> %a1) { + ;CHECK-LABEL: stack_fold_addsd_int + ;CHECK: vaddsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <2 x double> @llvm.x86.sse2.add.sd(<2 x double> %a0, <2 x double> %a1) + ret <2 x double> %2 +} +declare <2 x double> @llvm.x86.sse2.add.sd(<2 x double>, <2 x double>) nounwind readnone + +define float @stack_fold_addss(float %a0, float %a1) { ;CHECK-LABEL: stack_fold_addss ;CHECK: vaddss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() %2 = fadd float %a0, %a1 - ret float %2 -} - -; TODO stack_fold_addss_int -declare <4 x float> @llvm.x86.sse.add.ss(<4 x float>, <4 x float>) nounwind readnone - -define <2 x double> @stack_fold_addsubpd(<2 x double> %a0, <2 x double> %a1) { + ret float %2 +} + +define <4 x float> @stack_fold_addss_int(<4 x float> %a0, <4 x float> %a1) { + ;CHECK-LABEL: stack_fold_addss_int + ;CHECK: vaddss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <4 x float> @llvm.x86.sse.add.ss(<4 x float> %a0, <4 x float> %a1) + ret <4 x float> %2 +} +declare <4 x float> @llvm.x86.sse.add.ss(<4 x float>, <4 x float>) nounwind readnone + +define <2 x double> @stack_fold_addsubpd(<2 x double> %a0, <2 x double> %a1) { ;CHECK-LABEL: stack_fold_addsubpd ;CHECK: vaddsubpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() @@ -357,16 +369,28 @@ define i32 @stack_fold_comiss_int(<4 x float> %a0, <4 x float> %a1) { %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() %2 = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1) ret i32 %2 -} -declare i32 @llvm.x86.sse.comieq.ss(<4 x float>, <4 x float>) nounwind readnone - -; TODO stack_fold_cvtdq2pd -declare <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32>) nounwind readnone - -; TODO stack_fold_cvtdq2pd_ymm -declare <4 x double> @llvm.x86.avx.cvtdq2.pd.256(<4 x i32>) nounwind readnone - -define <4 x float> @stack_fold_cvtdq2ps(<4 x i32> %a0) { +} +declare i32 @llvm.x86.sse.comieq.ss(<4 x float>, <4 x float>) nounwind readnone + +define <2 x double> @stack_fold_cvtdq2pd(<4 x i32> %a0) { + ;CHECK-LABEL: stack_fold_cvtdq2pd + ;CHECK: vcvtdq2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32> %a0) + ret <2 x double> %2 +} +declare <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32>) nounwind readnone + +define <4 x double> @stack_fold_cvtdq2pd_ymm(<4 x i32> %a0) { + ;CHECK-LABEL: stack_fold_cvtdq2pd_ymm + ;CHECK: vcvtdq2pd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <4 x double> @llvm.x86.avx.cvtdq2.pd.256(<4 x i32> %a0) + ret <4 x double> %2 +} +declare <4 x double> @llvm.x86.avx.cvtdq2.pd.256(<4 x i32>) nounwind readnone + +define <4 x float> @stack_fold_cvtdq2ps(<4 x i32> %a0) { ;CHECK-LABEL: stack_fold_cvtdq2ps ;CHECK: vcvtdq2ps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() @@ -413,16 +437,28 @@ define <4 x float> @stack_fold_cvtpd2ps_ymm(<4 x double> %a0) { ;CHECK: vcvtpd2psy {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() %2 = fptrunc <4 x double> %a0 to <4 x float> - ret <4 x float> %2 -} - -; TODO stack_fold_cvtph2ps -declare <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16>) nounwind readonly - -; TODO stack_fold_cvtph2ps_ymm -declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readonly - -define <4 x i32> @stack_fold_cvtps2dq(<4 x float> %a0) { + ret <4 x float> %2 +} + +define <4 x float> @stack_fold_cvtph2ps(<8 x i16> %a0) { + ;CHECK-LABEL: stack_fold_cvtph2ps + ;CHECK: vcvtph2ps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %a0) + ret <4 x float> %2 +} +declare <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16>) nounwind readonly + +define <8 x float> @stack_fold_cvtph2ps_ymm(<8 x i16> %a0) { + ;CHECK-LABEL: stack_fold_cvtph2ps_ymm + ;CHECK: vcvtph2ps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %a0) + ret <8 x float> %2 +} +declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readonly + +define <4 x i32> @stack_fold_cvtps2dq(<4 x float> %a0) { ;CHECK-LABEL: stack_fold_cvtps2dq ;CHECK: vcvtps2dq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() @@ -437,19 +473,46 @@ define <8 x i32> @stack_fold_cvtps2dq_ymm(<8 x float> %a0) { %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() %2 = call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> %a0) ret <8 x i32> %2 -} -declare <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float>) nounwind readnone - -; TODO stack_fold_cvtps2pd -; TODO stack_fold_cvtps2pd_ymm - -; TODO stack_fold_cvtps2ph -declare <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float>, i32) nounwind readonly - -; TODO stack_fold_cvtps2ph_ymm -declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readonly - -; TODO stack_fold_cvtsd2si +} +declare <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float>) nounwind readnone + +define <2 x double> @stack_fold_cvtps2pd(<4 x float> %a0) { + ;CHECK-LABEL: stack_fold_cvtps2pd + ;CHECK: vcvtps2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float> %a0) + ret <2 x double> %2 +} +declare <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float>) nounwind readnone + +define <4 x double> @stack_fold_cvtps2pd_ymm(<4 x float> %a0) { + ;CHECK-LABEL: stack_fold_cvtps2pd_ymm + ;CHECK: vcvtps2pd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <4 x double> @llvm.x86.avx.cvt.ps2.pd.256(<4 x float> %a0) + ret <4 x double> %2 +} +declare <4 x double> @llvm.x86.avx.cvt.ps2.pd.256(<4 x float>) nounwind readnone + +define <8 x i16> @stack_fold_cvtps2ph(<4 x float> %a0) { + ;CHECK-LABEL: stack_fold_cvtps2ph + ;CHECK: vcvtps2ph $0, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill + %1 = call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %a0, i32 0) + %2 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + ret <8 x i16> %1 +} +declare <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float>, i32) nounwind readonly + +define <8 x i16> @stack_fold_cvtps2ph_ymm(<8 x float> %a0) { + ;CHECK-LABEL: stack_fold_cvtps2ph_ymm + ;CHECK: vcvtps2ph $0, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill + %1 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %a0, i32 0) + %2 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + ret <8 x i16> %1 +} +declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readonly + +; TODO stack_fold_cvtsd2si define i32 @stack_fold_cvtsd2si_int(<2 x double> %a0) { ;CHECK-LABEL: stack_fold_cvtsd2si_int @@ -580,13 +643,20 @@ define i64 @stack_fold_cvtss2si64_int(<4 x float> %a0) { %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() %2 = call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %a0) ret i64 %2 -} -declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) nounwind readnone - -; TODO stack_fold_cvttpd2dq - -define <4 x i32> @stack_fold_cvttpd2dq_ymm(<4 x double> %a0) { - ;CHECK-LABEL: stack_fold_cvttpd2dq_ymm +} +declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) nounwind readnone + +define <4 x i32> @stack_fold_cvttpd2dq(<2 x double> %a0) { + ;CHECK-LABEL: stack_fold_cvttpd2dq + ;CHECK: vcvttpd2dqx {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %a0) + ret <4 x i32> %2 +} +declare <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double>) nounwind readnone + +define <4 x i32> @stack_fold_cvttpd2dq_ymm(<4 x double> %a0) { + ;CHECK-LABEL: stack_fold_cvttpd2dq_ymm ;CHECK: vcvttpd2dqy {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() %2 = fptosi <4 x double> %a0 to <4 x i32> @@ -714,30 +784,64 @@ define double @stack_fold_divsd(double %a0, double %a1) { ;CHECK: vdivsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() %2 = fdiv double %a0, %a1 - ret double %2 -} - -; TODO stack_fold_divsd_int -declare <2 x double> @llvm.x86.sse2.div.sd(<2 x double>, <2 x double>) nounwind readnone - -define float @stack_fold_divss(float %a0, float %a1) { + ret double %2 +} + +define <2 x double> @stack_fold_divsd_int(<2 x double> %a0, <2 x double> %a1) { + ;CHECK-LABEL: stack_fold_divsd_int + ;CHECK: vdivsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <2 x double> @llvm.x86.sse2.div.sd(<2 x double> %a0, <2 x double> %a1) + ret <2 x double> %2 +} +declare <2 x double> @llvm.x86.sse2.div.sd(<2 x double>, <2 x double>) nounwind readnone + +define float @stack_fold_divss(float %a0, float %a1) { ;CHECK-LABEL: stack_fold_divss ;CHECK: vdivss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() %2 = fdiv float %a0, %a1 - ret float %2 -} - -; TODO stack_fold_divss_int -declare <4 x float> @llvm.x86.sse.div.ss(<4 x float>, <4 x float>) nounwind readnone - -; TODO stack_fold_dppd -; TODO stack_fold_dppd_ymm -; TODO stack_fold_dpps -; TODO stack_fold_dpps_ymm - -define <4 x float> @stack_fold_extractf128(<8 x float> %a0, <8 x float> %a1) { - ;CHECK-LABEL: stack_fold_extractf128 + ret float %2 +} + +define <4 x float> @stack_fold_divss_int(<4 x float> %a0, <4 x float> %a1) { + ;CHECK-LABEL: stack_fold_divss_int + ;CHECK: vdivss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <4 x float> @llvm.x86.sse.div.ss(<4 x float> %a0, <4 x float> %a1) + ret <4 x float> %2 +} +declare <4 x float> @llvm.x86.sse.div.ss(<4 x float>, <4 x float>) nounwind readnone + +define <2 x double> @stack_fold_dppd(<2 x double> %a0, <2 x double> %a1) { + ;CHECK-LABEL: stack_fold_dppd + ;CHECK: vdppd $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i8 7) + ret <2 x double> %2 +} +declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i8) nounwind readnone + +define <4 x float> @stack_fold_dpps(<4 x float> %a0, <4 x float> %a1) { + ;CHECK-LABEL: stack_fold_dpps + ;CHECK: vdpps $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7) + ret <4 x float> %2 +} +declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i8) nounwind readnone + +define <8 x float> @stack_fold_dpps_ymm(<8 x float> %a0, <8 x float> %a1) { + ;CHECK-LABEL: stack_fold_dpps_ymm + ;CHECK: vdpps $7, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7) + ret <8 x float> %2 +} +declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone + +define <4 x float> @stack_fold_extractf128(<8 x float> %a0, <8 x float> %a1) { + ;CHECK-LABEL: stack_fold_extractf128 ;CHECK: vextractf128 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill ;CHECK: vmovaps {{-?[0-9]*}}(%rsp), %xmm0 {{.*#+}} 16-byte Reload %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <4 x i32> @@ -879,25 +983,37 @@ define double @stack_fold_maxsd(double %a0, double %a1) { %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() %2 = fcmp ogt double %a0, %a1 %3 = select i1 %2, double %a0, double %a1 - ret double %3 -} - -; TODO stack_fold_maxsd_int -declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone - -define float @stack_fold_maxss(float %a0, float %a1) { + ret double %3 +} + +define <2 x double> @stack_fold_maxsd_int(<2 x double> %a0, <2 x double> %a1) { + ;CHECK-LABEL: stack_fold_maxsd_int + ;CHECK: vmaxsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %a0, <2 x double> %a1) + ret <2 x double> %2 +} +declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone + +define float @stack_fold_maxss(float %a0, float %a1) { ;CHECK-LABEL: stack_fold_maxss ;CHECK: vmaxss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() %2 = fcmp ogt float %a0, %a1 %3 = select i1 %2, float %a0, float %a1 - ret float %3 -} - -; TODO stack_fold_maxss_int -declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone - -define <2 x double> @stack_fold_minpd(<2 x double> %a0, <2 x double> %a1) { + ret float %3 +} + +define <4 x float> @stack_fold_maxss_int(<4 x float> %a0, <4 x float> %a1) { + ;CHECK-LABEL: stack_fold_maxss_int + ;CHECK: vmaxss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1) + ret <4 x float> %2 +} +declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone + +define <2 x double> @stack_fold_minpd(<2 x double> %a0, <2 x double> %a1) { ;CHECK-LABEL: stack_fold_minpd ;CHECK: vminpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() @@ -939,25 +1055,37 @@ define double @stack_fold_minsd(double %a0, double %a1) { %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() %2 = fcmp olt double %a0, %a1 %3 = select i1 %2, double %a0, double %a1 - ret double %3 -} - -; TODO stack_fold_minsd_int -declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone - -define float @stack_fold_minss(float %a0, float %a1) { + ret double %3 +} + +define <2 x double> @stack_fold_minsd_int(<2 x double> %a0, <2 x double> %a1) { + ;CHECK-LABEL: stack_fold_minsd_int + ;CHECK: vminsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %a0, <2 x double> %a1) + ret <2 x double> %2 +} +declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone + +define float @stack_fold_minss(float %a0, float %a1) { ;CHECK-LABEL: stack_fold_minss ;CHECK: vminss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() %2 = fcmp olt float %a0, %a1 %3 = select i1 %2, float %a0, float %a1 - ret float %3 -} - -; TODO stack_fold_minss_int -declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone - -; TODO stack_fold_movd (load / store) + ret float %3 +} + +define <4 x float> @stack_fold_minss_int(<4 x float> %a0, <4 x float> %a1) { + ;CHECK-LABEL: stack_fold_minss_int + ;CHECK: vminss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1) + ret <4 x float> %2 +} +declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone + +; TODO stack_fold_movd (load / store) ; TODO stack_fold_movq (load / store) ; TODO stack_fold_movddup @@ -1029,24 +1157,36 @@ define double @stack_fold_mulsd(double %a0, double %a1) { ;CHECK: vmulsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() %2 = fmul double %a0, %a1 - ret double %2 -} - -; TODO stack_fold_mulsd_int -declare <2 x double> @llvm.x86.sse2.mul.sd(<2 x double>, <2 x double>) nounwind readnone - -define float @stack_fold_mulss(float %a0, float %a1) { + ret double %2 +} + +define <2 x double> @stack_fold_mulsd_int(<2 x double> %a0, <2 x double> %a1) { + ;CHECK-LABEL: stack_fold_mulsd_int + ;CHECK: vmulsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <2 x double> @llvm.x86.sse2.mul.sd(<2 x double> %a0, <2 x double> %a1) + ret <2 x double> %2 +} +declare <2 x double> @llvm.x86.sse2.mul.sd(<2 x double>, <2 x double>) nounwind readnone + +define float @stack_fold_mulss(float %a0, float %a1) { ;CHECK-LABEL: stack_fold_mulss ;CHECK: vmulss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() %2 = fmul float %a0, %a1 - ret float %2 -} - -; TODO stack_fold_mulss_int -declare <4 x float> @llvm.x86.sse.mul.ss(<4 x float>, <4 x float>) nounwind readnone - -define <2 x double> @stack_fold_orpd(<2 x double> %a0, <2 x double> %a1) { + ret float %2 +} + +define <4 x float> @stack_fold_mulss_int(<4 x float> %a0, <4 x float> %a1) { + ;CHECK-LABEL: stack_fold_mulss_int + ;CHECK: vmulss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <4 x float> @llvm.x86.sse.mul.ss(<4 x float> %a0, <4 x float> %a1) + ret <4 x float> %2 +} +declare <4 x float> @llvm.x86.sse.mul.ss(<4 x float>, <4 x float>) nounwind readnone + +define <2 x double> @stack_fold_orpd(<2 x double> %a0, <2 x double> %a1) { ;CHECK-LABEL: stack_fold_orpd ;CHECK: vorpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() @@ -1115,12 +1255,30 @@ define <4 x double> @stack_fold_permilpd_ymm(<4 x double> %a0) { ;CHECK: vpermilpd $5, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() %2 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> - ret <4 x double> %2 -} - -define <4 x float> @stack_fold_permilps(<4 x float> %a0) { - ;CHECK-LABEL: stack_fold_permilps - ;CHECK: vpermilps $27, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + ret <4 x double> %2 +} + +define <2 x double> @stack_fold_permilpdvar(<2 x double> %a0, <2 x i64> %a1) { + ;CHECK-LABEL: stack_fold_permilpdvar + ;CHECK: vpermilpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> %a1) + ret <2 x double> %2 +} +declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>) nounwind readnone + +define <4 x double> @stack_fold_permilpdvar_ymm(<4 x double> %a0, <4 x i64> %a1) { + ;CHECK-LABEL: stack_fold_permilpdvar_ymm + ;CHECK: vpermilpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> %a1) + ret <4 x double> %2 +} +declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>) nounwind readnone + +define <4 x float> @stack_fold_permilps(<4 x float> %a0) { + ;CHECK-LABEL: stack_fold_permilps + ;CHECK: vpermilps $27, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() %2 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> ret <4 x float> %2 @@ -1131,12 +1289,30 @@ define <8 x float> @stack_fold_permilps_ymm(<8 x float> %a0) { ;CHECK: vpermilps $27, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() %2 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> - ret <8 x float> %2 -} - -; TODO stack_fold_rcpps - -define <4 x float> @stack_fold_rcpps_int(<4 x float> %a0) { + ret <8 x float> %2 +} + +define <4 x float> @stack_fold_permilpsvar(<4 x float> %a0, <4 x i32> %a1) { + ;CHECK-LABEL: stack_fold_permilpsvar + ;CHECK: vpermilps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %a1) + ret <4 x float> %2 +} +declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>) nounwind readnone + +define <8 x float> @stack_fold_permilpsvar_ymm(<8 x float> %a0, <8 x i32> %a1) { + ;CHECK-LABEL: stack_fold_permilpsvar_ymm + ;CHECK: vpermilps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %a1) + ret <8 x float> %2 +} +declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>) nounwind readnone + +; TODO stack_fold_rcpps + +define <4 x float> @stack_fold_rcpps_int(<4 x float> %a0) { ;CHECK-LABEL: stack_fold_rcpps_int ;CHECK: vrcpps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() @@ -1154,18 +1330,67 @@ define <8 x float> @stack_fold_rcpps_ymm_int(<8 x float> %a0) { %2 = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %a0) ret <8 x float> %2 } -declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone - -; TODO stack_fold_rcpss -; TODO stack_fold_rcpss_int - -; TODO stack_fold_roundpd -; TODO stack_fold_roundps -; TODO stack_fold_roundsd (+ int) -; TODO stack_fold_roundss (+ int) - -; TODO stack_fold_rsqrtps - +declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone + +; TODO stack_fold_rcpss + +define <4 x float> @stack_fold_rcpss_int(<4 x float> %a0) { + ;CHECK-LABEL: stack_fold_rcpss_int + ;CHECK: vrcpss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a0) + ret <4 x float> %2 +} +declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone + +define <2 x double> @stack_fold_roundpd(<2 x double> %a0) { + ;CHECK-LABEL: stack_fold_roundpd + ;CHECK: vroundpd $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 7) + ret <2 x double> %2 +} +declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone + +define <4 x double> @stack_fold_roundpd_ymm(<4 x double> %a0) { + ;CHECK-LABEL: stack_fold_roundpd_ymm + ;CHECK: vroundpd $7, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 7) + ret <4 x double> %2 +} +declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone + +define <4 x float> @stack_fold_roundps(<4 x float> %a0) { + ;CHECK-LABEL: stack_fold_roundps + ;CHECK: vroundps $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 7) + ret <4 x float> %2 +} +declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone + +define <8 x float> @stack_fold_roundps_ymm(<8 x float> %a0) { + ;CHECK-LABEL: stack_fold_roundps_ymm + ;CHECK: vroundps $7, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 7) + ret <8 x float> %2 +} +declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone + +; TODO stack_fold_roundsd + +; TODO stack_fold_roundsd_int +declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone + +; TODO stack_fold_roundss + +; TODO stack_fold_roundss_int +declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone + +; TODO stack_fold_rsqrtps + define <4 x float> @stack_fold_rsqrtps_int(<4 x float> %a0) { ;CHECK-LABEL: stack_fold_rsqrtps_int ;CHECK: vrsqrtps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload @@ -1173,15 +1398,32 @@ define <4 x float> @stack_fold_rsqrtps_int(<4 x float> %a0) { %2 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0) ret <4 x float> %2 } -declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone - -; TODO stack_fold_rsqrtps_ymm -; TODO stack_fold_rsqrtps_ymm_int -; TODO stack_fold_rsqrtss -; TODO stack_fold_rsqrtss_int - -define <2 x double> @stack_fold_shufpd(<2 x double> %a0, <2 x double> %a1) { - ;CHECK-LABEL: stack_fold_shufpd +declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone + +; TODO stack_fold_rsqrtps_ymm + +define <8 x float> @stack_fold_rsqrtps_ymm_int(<8 x float> %a0) { + ;CHECK-LABEL: stack_fold_rsqrtps_ymm_int + ;CHECK: vrsqrtps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %a0) + ret <8 x float> %2 +} +declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone + +; TODO stack_fold_rsqrtss + +define <4 x float> @stack_fold_rsqrtss_int(<4 x float> %a0) { + ;CHECK-LABEL: stack_fold_rsqrtss_int + ;CHECK: vrsqrtss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a0) + ret <4 x float> %2 +} +declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone + +define <2 x double> @stack_fold_shufpd(<2 x double> %a0, <2 x double> %a1) { + ;CHECK-LABEL: stack_fold_shufpd ;CHECK: vshufpd $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() %2 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> @@ -1321,36 +1563,72 @@ define double @stack_fold_subsd(double %a0, double %a1) { ;CHECK: vsubsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() %2 = fsub double %a0, %a1 - ret double %2 -} - -; TODO stack_fold_subsd_int -declare <2 x double> @llvm.x86.sse2.sub.sd(<2 x double>, <2 x double>) nounwind readnone - -define float @stack_fold_subss(float %a0, float %a1) { + ret double %2 +} + +define <2 x double> @stack_fold_subsd_int(<2 x double> %a0, <2 x double> %a1) { + ;CHECK-LABEL: stack_fold_subsd_int + ;CHECK: vsubsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <2 x double> @llvm.x86.sse2.sub.sd(<2 x double> %a0, <2 x double> %a1) + ret <2 x double> %2 +} +declare <2 x double> @llvm.x86.sse2.sub.sd(<2 x double>, <2 x double>) nounwind readnone + +define float @stack_fold_subss(float %a0, float %a1) { ;CHECK-LABEL: stack_fold_subss ;CHECK: vsubss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() %2 = fsub float %a0, %a1 - ret float %2 -} - -; TODO stack_fold_subss_int -declare <4 x float> @llvm.x86.sse.sub.ss(<4 x float>, <4 x float>) nounwind readnone - -; TODO stack_fold_testpd -declare i32 @llvm.x86.avx.vtestc.pd(<2 x double>, <2 x double>) nounwind readnone - -; TODO stack_fold_testpd_ymm -declare i32 @llvm.x86.avx.vtestc.pd.256(<4 x double>, <4 x double>) nounwind readnone - -; TODO stack_fold_testps -declare i32 @llvm.x86.avx.vtestc.ps(<4 x float>, <4 x float>) nounwind readnone - -; TODO stack_fold_testps_ymm -declare i32 @llvm.x86.avx.vtestc.ps.256(<8 x float>, <8 x float>) nounwind readnone - -define i32 @stack_fold_ucomisd(double %a0, double %a1) { + ret float %2 +} + +define <4 x float> @stack_fold_subss_int(<4 x float> %a0, <4 x float> %a1) { + ;CHECK-LABEL: stack_fold_subss_int + ;CHECK: vsubss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <4 x float> @llvm.x86.sse.sub.ss(<4 x float> %a0, <4 x float> %a1) + ret <4 x float> %2 +} +declare <4 x float> @llvm.x86.sse.sub.ss(<4 x float>, <4 x float>) nounwind readnone + +define i32 @stack_fold_testpd(<2 x double> %a0, <2 x double> %a1) { + ;CHECK-LABEL: stack_fold_testpd + ;CHECK: vtestpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call i32 @llvm.x86.avx.vtestc.pd(<2 x double> %a0, <2 x double> %a1) + ret i32 %2 +} +declare i32 @llvm.x86.avx.vtestc.pd(<2 x double>, <2 x double>) nounwind readnone + +define i32 @stack_fold_testpd_ymm(<4 x double> %a0, <4 x double> %a1) { + ;CHECK-LABEL: stack_fold_testpd_ymm + ;CHECK: vtestpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %a0, <4 x double> %a1) + ret i32 %2 +} +declare i32 @llvm.x86.avx.vtestc.pd.256(<4 x double>, <4 x double>) nounwind readnone + +define i32 @stack_fold_testps(<4 x float> %a0, <4 x float> %a1) { + ;CHECK-LABEL: stack_fold_testps + ;CHECK: vtestps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %a0, <4 x float> %a1) + ret i32 %2 +} +declare i32 @llvm.x86.avx.vtestc.ps(<4 x float>, <4 x float>) nounwind readnone + +define i32 @stack_fold_testps_ymm(<8 x float> %a0, <8 x float> %a1) { + ;CHECK-LABEL: stack_fold_testps_ymm + ;CHECK: vtestps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %a0, <8 x float> %a1) + ret i32 %2 +} +declare i32 @llvm.x86.avx.vtestc.ps.256(<8 x float>, <8 x float>) nounwind readnone + +define i32 @stack_fold_ucomisd(double %a0, double %a1) { ;CHECK-LABEL: stack_fold_ucomisd ;CHECK: vucomisd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()