From 2aeb4564257e16617d718c91550ee28854fd37ce Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 9 Dec 2016 02:18:11 +0000 Subject: [PATCH] [AVX-512] Add vpermilps/pd to load folding tables. llvm-svn: 289173 --- llvm/lib/Target/X86/X86InstrInfo.cpp | 36 ++++++ .../CodeGen/X86/stack-folding-fp-avx512.ll | 122 ++++++++++++++++++ .../CodeGen/X86/stack-folding-fp-avx512vl.ll | 78 +++++++++++ 3 files changed, 236 insertions(+) diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 303bd1e7b496..7f88f8af4446 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -885,6 +885,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMOVUPSZrr, X86::VMOVUPSZrm, 0 }, { X86::VPABSDZrr, X86::VPABSDZrm, 0 }, { X86::VPABSQZrr, X86::VPABSQZrm, 0 }, + { X86::VPERMILPDZri, X86::VPERMILPDZmi, 0 }, + { X86::VPERMILPSZri, X86::VPERMILPSZmi, 0 }, { X86::VPERMPDZri, X86::VPERMPDZmi, 0 }, { X86::VPERMQZri, X86::VPERMQZmi, 0 }, { X86::VPMOVSXBDZrr, X86::VPMOVSXBDZrm, 0 }, @@ -918,6 +920,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256rm, 0 }, { X86::VMOVUPDZ256rr, X86::VMOVUPDZ256rm, 0 }, { X86::VMOVUPSZ256rr, X86::VMOVUPSZ256rm, 0 }, + { X86::VPERMILPDZ256ri, X86::VPERMILPDZ256mi, 0 }, + { X86::VPERMILPSZ256ri, X86::VPERMILPSZ256mi, 0 }, { X86::VPERMPDZ256ri, X86::VPERMPDZ256mi, 0 }, { X86::VPERMQZ256ri, X86::VPERMQZ256mi, 0 }, { X86::VPMOVSXBDZ256rr, X86::VPMOVSXBDZ256rm, TB_NO_REVERSE }, @@ -949,6 +953,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128rm, 0 }, { X86::VMOVUPDZ128rr, X86::VMOVUPDZ128rm, 0 }, { X86::VMOVUPSZ128rr, X86::VMOVUPSZ128rm, 0 }, + { X86::VPERMILPDZ128ri, X86::VPERMILPDZ128mi, 0 }, + { X86::VPERMILPSZ128ri, X86::VPERMILPSZ128mi, 0 }, { X86::VPMOVSXBDZ128rr, X86::VPMOVSXBDZ128rm, TB_NO_REVERSE }, { X86::VPMOVSXBQZ128rr, X86::VPMOVSXBQZ128rm, TB_NO_REVERSE }, { X86::VPMOVSXBWZ128rr, X86::VPMOVSXBWZ128rm, TB_NO_REVERSE }, @@ -1874,6 +1880,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPCMPWZrri, X86::VPCMPWZrmi, 0 }, { X86::VPERMBZrr, X86::VPERMBZrm, 0 }, { X86::VPERMDZrr, X86::VPERMDZrm, 0 }, + { X86::VPERMILPDZrr, X86::VPERMILPDZrm, 0 }, + { X86::VPERMILPSZrr, X86::VPERMILPSZrm, 0 }, { X86::VPERMPDZrr, X86::VPERMPDZrm, 0 }, { X86::VPERMPSZrr, X86::VPERMPSZrm, 0 }, { X86::VPERMQZrr, X86::VPERMQZrm, 0 }, @@ -2042,6 +2050,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPERMBZ128rr, X86::VPERMBZ128rm, 0 }, { X86::VPERMBZ256rr, X86::VPERMBZ256rm, 0 }, { X86::VPERMDZ256rr, X86::VPERMDZ256rm, 0 }, + { X86::VPERMILPDZ128rr, X86::VPERMILPDZ128rm, 0 }, + { X86::VPERMILPDZ256rr, X86::VPERMILPDZ256rm, 0 }, + { X86::VPERMILPSZ128rr, X86::VPERMILPSZ128rm, 0 }, + { X86::VPERMILPSZ256rr, X86::VPERMILPSZ256rm, 0 }, { X86::VPERMPDZ256rr, X86::VPERMPDZ256rm, 0 }, { X86::VPERMPSZ256rr, X86::VPERMPSZ256rm, 0 }, { X86::VPERMQZ256rr, X86::VPERMQZ256rm, 0 }, @@ -2111,6 +2123,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VXORPSZ256rr, X86::VXORPSZ256rm, 0 }, // AVX-512 masked foldable instructions + { X86::VPERMILPDZrikz, X86::VPERMILPDZmikz, 0 }, + { X86::VPERMILPSZrikz, X86::VPERMILPSZmikz, 0 }, { X86::VPERMPDZrikz, X86::VPERMPDZmikz, 0 }, { X86::VPERMQZrikz, X86::VPERMQZmikz, 0 }, { X86::VPMOVSXBDZrrkz, X86::VPMOVSXBDZrmkz, 0 }, @@ -2130,6 +2144,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPSHUFLWZrikz, X86::VPSHUFLWZmikz, 0 }, // AVX-512VL 256-bit masked foldable instructions + { 
X86::VPERMILPDZ256rikz, X86::VPERMILPDZ256mikz, 0 }, + { X86::VPERMILPSZ256rikz, X86::VPERMILPSZ256mikz, 0 }, { X86::VPERMPDZ256rikz, X86::VPERMPDZ256mikz, 0 }, { X86::VPERMQZ256rikz, X86::VPERMQZ256mikz, 0 }, { X86::VPMOVSXBDZ256rrkz, X86::VPMOVSXBDZ256rmkz, TB_NO_REVERSE }, @@ -2149,6 +2165,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPSHUFLWZ256rikz, X86::VPSHUFLWZ256mikz, 0 }, // AVX-512VL 128-bit masked foldable instructions + { X86::VPERMILPDZ128rikz, X86::VPERMILPDZ128mikz, 0 }, + { X86::VPERMILPSZ128rikz, X86::VPERMILPSZ128mikz, 0 }, { X86::VPMOVSXBDZ128rrkz, X86::VPMOVSXBDZ128rmkz, TB_NO_REVERSE }, { X86::VPMOVSXBQZ128rrkz, X86::VPMOVSXBQZ128rmkz, TB_NO_REVERSE }, { X86::VPMOVSXBWZ128rrkz, X86::VPMOVSXBWZ128rmkz, TB_NO_REVERSE }, @@ -2344,6 +2362,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPANDQZrrkz, X86::VPANDQZrmkz, 0 }, { X86::VPERMBZrrkz, X86::VPERMBZrmkz, 0 }, { X86::VPERMDZrrkz, X86::VPERMDZrmkz, 0 }, + { X86::VPERMILPDZrrkz, X86::VPERMILPDZrmkz, 0 }, + { X86::VPERMILPSZrrkz, X86::VPERMILPSZrmkz, 0 }, { X86::VPERMPDZrrkz, X86::VPERMPDZrmkz, 0 }, { X86::VPERMPSZrrkz, X86::VPERMPSZrmkz, 0 }, { X86::VPERMQZrrkz, X86::VPERMQZrmkz, 0 }, @@ -2419,6 +2439,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPANDQZ256rrkz, X86::VPANDQZ256rmkz, 0 }, { X86::VPERMBZ256rrkz, X86::VPERMBZ256rmkz, 0 }, { X86::VPERMDZ256rrkz, X86::VPERMDZ256rmkz, 0 }, + { X86::VPERMILPDZ256rrkz, X86::VPERMILPDZ256rmkz, 0 }, + { X86::VPERMILPSZ256rrkz, X86::VPERMILPSZ256rmkz, 0 }, { X86::VPERMPDZ256rrkz, X86::VPERMPDZ256rmkz, 0 }, { X86::VPERMPSZ256rrkz, X86::VPERMPSZ256rmkz, 0 }, { X86::VPERMQZ256rrkz, X86::VPERMQZ256rmkz, 0 }, @@ -2489,6 +2511,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPANDNQZ128rrkz, X86::VPANDNQZ128rmkz, 0 }, { X86::VPANDQZ128rrkz, X86::VPANDQZ128rmkz, 0 }, { X86::VPERMBZ128rrkz, X86::VPERMBZ128rmkz, 0 }, + { X86::VPERMILPDZ128rrkz, X86::VPERMILPDZ128rmkz, 0 }, + { X86::VPERMILPSZ128rrkz, X86::VPERMILPSZ128rmkz, 0 }, { X86::VPERMWZ128rrkz, X86::VPERMWZ128rmkz, 0 }, { X86::VPMADDUBSWZ128rrkz, X86::VPMADDUBSWZ128rmkz, 0 }, { X86::VPMADDWDZ128rrkz, X86::VPMADDWDZ128rmkz, 0 }, @@ -2523,6 +2547,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VXORPSZ128rrkz, X86::VXORPSZ128rmkz, 0 }, // AVX-512 masked foldable instructions + { X86::VPERMILPDZrik, X86::VPERMILPDZmik, 0 }, + { X86::VPERMILPSZrik, X86::VPERMILPSZmik, 0 }, { X86::VPERMPDZrik, X86::VPERMPDZmik, 0 }, { X86::VPERMQZrik, X86::VPERMQZmik, 0 }, { X86::VPMOVSXBDZrrk, X86::VPMOVSXBDZrmk, 0 }, @@ -2542,6 +2568,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPSHUFLWZrik, X86::VPSHUFLWZmik, 0 }, // AVX-512VL 256-bit masked foldable instructions + { X86::VPERMILPDZ256rik, X86::VPERMILPDZ256mik, 0 }, + { X86::VPERMILPSZ256rik, X86::VPERMILPSZ256mik, 0 }, { X86::VPERMPDZ256rik, X86::VPERMPDZ256mik, 0 }, { X86::VPERMQZ256rik, X86::VPERMQZ256mik, 0 }, { X86::VPMOVSXBDZ256rrk, X86::VPMOVSXBDZ256rmk, TB_NO_REVERSE }, @@ -2561,6 +2589,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPSHUFLWZ256rik, X86::VPSHUFLWZ256mik, 0 }, // AVX-512VL 128-bit masked foldable instructions + { X86::VPERMILPDZ128rik, X86::VPERMILPDZ128mik, 0 }, + { X86::VPERMILPSZ128rik, X86::VPERMILPSZ128mik, 0 }, { X86::VPMOVSXBDZ128rrk, X86::VPMOVSXBDZ128rmk, TB_NO_REVERSE }, { X86::VPMOVSXBQZ128rrk, X86::VPMOVSXBQZ128rmk, TB_NO_REVERSE }, { X86::VPMOVSXBWZ128rrk, X86::VPMOVSXBWZ128rmk, TB_NO_REVERSE }, @@ -2645,6 +2675,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPANDQZrrk, 
X86::VPANDQZrmk, 0 },
  { X86::VPERMBZrrk, X86::VPERMBZrmk, 0 },
  { X86::VPERMDZrrk, X86::VPERMDZrmk, 0 },
+ { X86::VPERMILPDZrrk, X86::VPERMILPDZrmk, 0 },
+ { X86::VPERMILPSZrrk, X86::VPERMILPSZrmk, 0 },
  { X86::VPERMPDZrrk, X86::VPERMPDZrmk, 0 },
  { X86::VPERMPSZrrk, X86::VPERMPSZrmk, 0 },
  { X86::VPERMQZrrk, X86::VPERMQZrmk, 0 },
@@ -2723,6 +2755,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
  { X86::VPANDQZ256rrk, X86::VPANDQZ256rmk, 0 },
  { X86::VPERMBZ256rrk, X86::VPERMBZ256rmk, 0 },
  { X86::VPERMDZ256rrk, X86::VPERMDZ256rmk, 0 },
+ { X86::VPERMILPDZ256rrk, X86::VPERMILPDZ256rmk, 0 },
+ { X86::VPERMILPSZ256rrk, X86::VPERMILPSZ256rmk, 0 },
  { X86::VPERMPDZ256rrk, X86::VPERMPDZ256rmk, 0 },
  { X86::VPERMPSZ256rrk, X86::VPERMPSZ256rmk, 0 },
  { X86::VPERMQZ256rrk, X86::VPERMQZ256rmk, 0 },
@@ -2797,6 +2831,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
  { X86::VPANDNQZ128rrk, X86::VPANDNQZ128rmk, 0 },
  { X86::VPANDQZ128rrk, X86::VPANDQZ128rmk, 0 },
  { X86::VPERMBZ128rrk, X86::VPERMBZ128rmk, 0 },
+ { X86::VPERMILPDZ128rrk, X86::VPERMILPDZ128rmk, 0 },
+ { X86::VPERMILPSZ128rrk, X86::VPERMILPSZ128rmk, 0 },
  { X86::VPERMWZ128rrk, X86::VPERMWZ128rmk, 0 },
  { X86::VPMADDUBSWZ128rrk, X86::VPMADDUBSWZ128rmk, 0 },
  { X86::VPMADDWDZ128rrk, X86::VPMADDWDZ128rmk, 0 },
diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512.ll
index 4214fb8915cd..4a1ad4b9158b 100644
--- a/llvm/test/CodeGen/X86/stack-folding-fp-avx512.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512.ll
@@ -606,5 +606,127 @@ define <16 x float> @stack_fold_permps(<16 x i32> %a0, <16 x float> %a1) {
 }
 declare <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float>, <16 x i32>, <16 x float>, i16) nounwind readonly
 
+define <8 x double> @stack_fold_permilpd_zmm(<8 x double> %a0) {
+  ;CHECK-LABEL: stack_fold_permilpd_zmm
+  ;CHECK: vpermilpd $85, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+  ret <8 x double> %2
+}
+
+define <8 x double> @stack_fold_permilpd_zmm_mask(<8 x double>* %passthru, <8 x double> %a0, i8 %mask) {
+  ;CHECK-LABEL: stack_fold_permilpd_zmm_mask
+  ;CHECK: vpermilpd $85, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+  %3 = bitcast i8 %mask to <8 x i1>
+  ; load needed to keep the operation from being scheduled above the asm block
+  %4 = load <8 x double>, <8 x double>* %passthru
+  %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4
+  ret <8 x double> %5
+}
+
+define <8 x double> @stack_fold_permilpd_zmm_maskz(<8 x double> %a0, i8 %mask) {
+  ;CHECK-LABEL: stack_fold_permilpd_zmm_maskz
+  ;CHECK: vpermilpd $85, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+  %3 = bitcast i8 %mask to <8 x i1>
+  %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
+  ret <8 x double> %4
+}
+
+define <8 x double> @stack_fold_permilpdvar_zmm(<8 x double> %a0, <8 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_permilpdvar_zmm
+  ;CHECK: vpermilpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double> %a0, <8 x i64> %a1, <8 x double> undef, i8 -1)
+  ret <8 x double> %2
+}
+declare <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double>, <8 x i64>, <8 x double>, i8) nounwind readnone
+
+define <8 x double> @stack_fold_permilpdvar_zmm_mask(<8 x double>* %passthru, <8 x double> %a0, <8 x i64> %a1, i8 %mask) {
+  ;CHECK-LABEL: stack_fold_permilpdvar_zmm_mask
+  ;CHECK: vpermilpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double> %a0, <8 x i64> %a1, <8 x double> undef, i8 -1)
+  %3 = bitcast i8 %mask to <8 x i1>
+  ; load needed to keep the operation from being scheduled above the asm block
+  %4 = load <8 x double>, <8 x double>* %passthru
+  %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4
+  ret <8 x double> %5
+}
+
+define <8 x double> @stack_fold_permilpdvar_zmm_maskz(<8 x double> %a0, <8 x i64> %a1, i8 %mask) {
+  ;CHECK-LABEL: stack_fold_permilpdvar_zmm_maskz
+  ;CHECK: vpermilpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double> %a0, <8 x i64> %a1, <8 x double> undef, i8 -1)
+  %3 = bitcast i8 %mask to <8 x i1>
+  %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
+  ret <8 x double> %4
+}
+
+define <16 x float> @stack_fold_permilps_zmm(<16 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_permilps_zmm
+  ;CHECK: vpermilps $27, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+  ret <16 x float> %2
+}
+
+define <16 x float> @stack_fold_permilps_zmm_mask(<16 x float>* %passthru, <16 x float> %a0, i16 %mask) {
+  ;CHECK-LABEL: stack_fold_permilps_zmm_mask
+  ;CHECK: vpermilps $27, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+  %3 = bitcast i16 %mask to <16 x i1>
+  ; load needed to keep the operation from being scheduled above the asm block
+  %4 = load <16 x float>, <16 x float>* %passthru
+  %5 = select <16 x i1> %3, <16 x float> %2, <16 x float> %4
+  ret <16 x float> %5
+}
+
+define <16 x float> @stack_fold_permilps_zmm_maskz(<16 x float> %a0, i16 %mask) {
+  ;CHECK-LABEL: stack_fold_permilps_zmm_maskz
+  ;CHECK: vpermilps $27, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+  %3 = bitcast i16 %mask to <16 x i1>
+  %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer
+  ret <16 x float> %4
+}
+
+define <16 x float> @stack_fold_permilpsvar_zmm(<16 x float> %a0, <16 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_permilpsvar_zmm
+  ;CHECK: vpermilps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %a0, <16 x i32> %a1, <16 x float> undef, i16 -1)
+  ret <16 x float> %2
+}
+declare <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float>, <16 x i32>, <16 x float>, i16) nounwind readnone
+
+define <16 x float> @stack_fold_permilpsvar_zmm_mask(<16 x float>* %passthru, <16 x float> %a0, <16 x i32> %a1, i16 %mask) {
+  ;CHECK-LABEL: stack_fold_permilpsvar_zmm_mask
+  ;CHECK: vpermilps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %a0, <16 x i32> %a1, <16 x float> undef, i16 -1)
+  %3 = bitcast i16 %mask to <16 x i1>
+  ; load needed to keep the operation from being scheduled above the asm block
+  %4 = load <16 x float>, <16 x float>* %passthru
+  %5 = select <16 x i1> %3, <16 x float> %2, <16 x float> %4
+  ret <16 x float> %5
+}
+
+define <16 x float> @stack_fold_permilpsvar_zmm_maskz(<16 x float> %a0, <16 x i32> %a1, i16 %mask) {
+  ;CHECK-LABEL: stack_fold_permilpsvar_zmm_maskz
+  ;CHECK: vpermilps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %a0, <16 x i32> %a1, <16 x float> undef, i16 -1)
+  %3 = bitcast i16 %mask to <16 x i1>
+  %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer
+  ret <16 x float> %4
+}
+
 attributes #0 = { "unsafe-fp-math"="false" }
 attributes #1 = { "unsafe-fp-math"="true" }
diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512vl.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512vl.ll
index dad7cee92f4a..198a96df6b1f 100644
--- a/llvm/test/CodeGen/X86/stack-folding-fp-avx512vl.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512vl.ll
@@ -620,5 +620,83 @@ define <8 x float> @stack_fold_permps(<8 x i32> %a0, <8 x float> %a1) {
 }
 declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind readonly
 
+define <2 x double> @stack_fold_permilpd(<2 x double> %a0) {
+  ;CHECK-LABEL: stack_fold_permilpd
+  ;CHECK: vpermilpd $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+  ret <2 x double> %2
+}
+
+define <4 x double> @stack_fold_permilpd_ymm(<4 x double> %a0) {
+  ;CHECK-LABEL: stack_fold_permilpd_ymm
+  ;CHECK: vpermilpd $5, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+  ret <4 x double> %2
+}
+
+define <2 x double> @stack_fold_permilpdvar(<2 x double> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_permilpdvar
+  ;CHECK: vpermilpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> %a1)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>) nounwind readnone
+
+define <4 x double> @stack_fold_permilpdvar_ymm(<4 x double> %a0, <4 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_permilpdvar_ymm
+  ;CHECK: vpermilpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> %a1)
+  ret <4 x double> %2
+}
+declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>) nounwind readnone
+
+define <4 x float> @stack_fold_permilps(<4 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_permilps
+  ;CHECK: vpermilps $27, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  ret <4 x float> %2
+}
+
+define <8 x float> @stack_fold_permilps_ymm(<8 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_permilps_ymm
+  ;CHECK: vpermilps $27, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+  ret <8 x float> %2
+}
+
+define <4 x float> @stack_fold_permilpsvar(<4 x float> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_permilpsvar
+  ;CHECK: vpermilps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %a1)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>) nounwind readnone
+
+define <8 x float> @stack_fold_permilpsvar_ymm(<8 x float> %a0, <8 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_permilpsvar_ymm
+  ;CHECK: vpermilps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded
Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %a1) + ret <8 x float> %2 +} +declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>) nounwind readnone + +define <8 x float> @stack_fold_permilpsvar_ymm_maskz(<8 x float> %a0, <8 x i32> %a1, i8 %mask) { + ;CHECK-LABEL: stack_fold_permilpsvar_ymm_maskz + ;CHECK: vpermilps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %a1) + %3 = bitcast i8 %mask to <8 x i1> + %4 = select <8 x i1> %3, <8 x float> %2, <8 x float> zeroinitializer + ret <8 x float> %4 +} + attributes #0 = { "unsafe-fp-math"="false" } attributes #1 = { "unsafe-fp-math"="true" }
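For reference, the practical effect of the new table entries: when the source register of a vpermilps/vpermilpd has been spilled, the compiler can now fold the reload directly into the permute instead of emitting a separate load first. A minimal before/after sketch (the register numbers and stack offset here are illustrative only, not taken from the tests above):

    # without a folding-table entry: separate reload, then permute
    vmovupd 64(%rsp), %zmm1
    vpermilpd $85, %zmm1, %zmm0

    # with the new table entry: 64-byte folded reload
    vpermilpd $85, 64(%rsp), %zmm0

The folded form is the pattern the stack-folding tests above check for via their "Folded Reload" CHECK lines.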