From ab13b33dedb60ecd8f19afb0d9d519a4cd15be16 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Fri, 22 Jul 2016 05:00:35 +0000
Subject: [PATCH] [AVX512] Update X86InstrInfo::foldMemoryOperandCustom to
 handle the EVEX encoded instructions too.

llvm-svn: 276390
---
 llvm/lib/Target/X86/X86InstrInfo.cpp               | 12 ++++++++----
 llvm/test/CodeGen/X86/stack-folding-fp-avx512vl.ll | 10 ++++++++++
 2 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index e64f4548031f..052191a3bff6 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -5780,6 +5780,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
   switch (MI.getOpcode()) {
   case X86::INSERTPSrr:
   case X86::VINSERTPSrr:
+  case X86::VINSERTPSZrr:
     // Attempt to convert the load of inserted vector into a fold load
     // of a single float.
     if (OpNum == 2) {
@@ -5793,8 +5794,9 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
         int PtrOffset = SrcIdx * 4;
         unsigned NewImm = (DstIdx << 4) | ZMask;
         unsigned NewOpCode =
-            (MI.getOpcode() == X86::VINSERTPSrr ? X86::VINSERTPSrm
-                                                : X86::INSERTPSrm);
+            (MI.getOpcode() == X86::VINSERTPSZrr) ? X86::VINSERTPSZrm :
+            (MI.getOpcode() == X86::VINSERTPSrr)  ? X86::VINSERTPSrm  :
+                                                    X86::INSERTPSrm;
         MachineInstr *NewMI =
             FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, PtrOffset);
         NewMI->getOperand(NewMI->getNumOperands() - 1).setImm(NewImm);
@@ -5804,6 +5806,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
     break;
   case X86::MOVHLPSrr:
   case X86::VMOVHLPSrr:
+  case X86::VMOVHLPSZrr:
     // Move the upper 64-bits of the second operand to the lower 64-bits.
     // To fold the load, adjust the pointer to the upper and use (V)MOVLPS.
     // TODO: In most cases AVX doesn't have a 8-byte alignment requirement.
@@ -5811,8 +5814,9 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
       unsigned RCSize = getRegClass(MI.getDesc(), OpNum, &RI, MF)->getSize();
       if (Size <= RCSize && 8 <= Align) {
         unsigned NewOpCode =
-            (MI.getOpcode() == X86::VMOVHLPSrr ? X86::VMOVLPSrm
-                                               : X86::MOVLPSrm);
+            (MI.getOpcode() == X86::VMOVHLPSZrr) ? X86::VMOVLPSZ128rm :
+            (MI.getOpcode() == X86::VMOVHLPSrr)  ? X86::VMOVLPSrm     :
+                                                   X86::MOVLPSrm;
         MachineInstr *NewMI =
             FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, 8);
         return NewMI;
diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512vl.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512vl.ll
index 3ab96e3f4629..d79babfc38ca 100644
--- a/llvm/test/CodeGen/X86/stack-folding-fp-avx512vl.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512vl.ll
@@ -68,6 +68,16 @@ define <4 x float> @stack_fold_divss_int(<4 x float> %a0, <4 x float> %a1) {
 }
 declare <4 x float> @llvm.x86.sse.div.ss(<4 x float>, <4 x float>) nounwind readnone
 
+define <4 x float> @stack_fold_insertps(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_insertps
+  ;CHECK: vinsertps $17, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  ;CHECK-NEXT: {{.*#+}} xmm0 = zero,mem[0],xmm0[2,3]
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 209)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
+
 define double @stack_fold_mulsd(double %a0, double %a1) {
   ;CHECK-LABEL: stack_fold_mulsd
   ;CHECK: vmulsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
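
For readers tracing the (V)INSERTPS fold above: the reg-reg imm8 is laid out as
[count_s:2][count_d:2][zmask:4], and the memory form always inserts element 0 of
the loaded float, so the fold turns count_s into a byte offset into the spill
slot and drops it from the immediate (PtrOffset = SrcIdx * 4, NewImm =
(DstIdx << 4) | ZMask, per the context lines of the hunk). The following is a
minimal standalone sketch of that rewrite, not LLVM code; the names
foldInsertPSImm and InsertPSFold are hypothetical.

    // Sketch of the imm8 rewrite performed when operand 2 of (V)INSERTPS is
    // folded into a load. Hypothetical helper; mirrors the hunk at @@ -5793.
    #include <cstdint>
    #include <cstdio>

    struct InsertPSFold {
      int PtrOffset;   // byte offset added to the stack-slot address
      uint8_t NewImm;  // immediate for the rm form (count_s dropped)
    };

    static InsertPSFold foldInsertPSImm(uint8_t Imm) {
      unsigned ZMask = Imm & 0xf;          // lanes forced to zero
      unsigned DstIdx = (Imm >> 4) & 0x3;  // count_d: destination lane
      unsigned SrcIdx = (Imm >> 6) & 0x3;  // count_s: source lane of operand 2
      return {static_cast<int>(SrcIdx * 4),
              static_cast<uint8_t>((DstIdx << 4) | ZMask)};
    }

    int main() {
      // Immediate 209 (0b11010001) from the stack_fold_insertps test above:
      // count_s=3, count_d=1, zmask=0001, so the fold loads from offset 12
      // and emits "vinsertps $17" ($17 = (1 << 4) | 1), matching the CHECK
      // line and its "zero,mem[0],xmm0[2,3]" shuffle comment.
      InsertPSFold F = foldInsertPSImm(209);
      std::printf("PtrOffset=%d NewImm=%u\n", F.PtrOffset, F.NewImm);
      return 0;
    }

The (V)MOVHLPS case needs no immediate rewrite: folding it just advances the
spill-slot pointer by 8 so the (V)MOVLPS memory form reads the upper 64 bits
directly, which is why FuseInst is called with a fixed offset of 8 there.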