From b00ffd8cb7a0f5fd51c9968e4f59b4b3eca52e47 Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Thu, 18 May 2017 18:50:05 +0000 Subject: [PATCH] Revert r302938 "Add LiveRangeShrink pass to shrink live range within BB." This also reverts follow-ups r303292 and r303298. It broke some Chromium tests under MSan, and apparently also internal tests at Google. llvm-svn: 303369 --- llvm/include/llvm/CodeGen/Passes.h | 4 - llvm/include/llvm/InitializePasses.h | 1 - llvm/lib/CodeGen/CMakeLists.txt | 1 - llvm/lib/CodeGen/CodeGen.cpp | 1 - llvm/lib/CodeGen/LiveRangeShrink.cpp | 211 ---- llvm/lib/Target/X86/X86TargetMachine.cpp | 1 - llvm/test/CodeGen/NVPTX/vec8.ll | 2 +- llvm/test/CodeGen/SPARC/LeonItinerariesUT.ll | 2 +- .../test/CodeGen/X86/2007-01-08-InstrSched.ll | 4 +- llvm/test/CodeGen/X86/avg.ll | 853 ++++++++-------- .../CodeGen/X86/avx-intrinsics-fast-isel.ll | 52 +- llvm/test/CodeGen/X86/avx.ll | 4 +- .../CodeGen/X86/avx512-cmp-kor-sequence.ll | 6 +- .../X86/avx512-gather-scatter-intrin.ll | 10 +- .../CodeGen/X86/avx512-intrinsics-upgrade.ll | 44 +- llvm/test/CodeGen/X86/avx512-intrinsics.ll | 215 ++-- llvm/test/CodeGen/X86/avx512-mask-spills.ll | 40 +- .../X86/avx512bw-intrinsics-upgrade.ll | 12 +- llvm/test/CodeGen/X86/avx512bw-intrinsics.ll | 16 +- .../X86/avx512bwvl-intrinsics-upgrade.ll | 24 +- .../X86/avx512cdvl-intrinsics-upgrade.ll | 2 +- .../test/CodeGen/X86/avx512cdvl-intrinsics.ll | 2 +- .../X86/avx512dq-intrinsics-upgrade.ll | 2 +- llvm/test/CodeGen/X86/avx512dq-intrinsics.ll | 4 +- .../X86/avx512dqvl-intrinsics-upgrade.ll | 10 +- .../test/CodeGen/X86/avx512dqvl-intrinsics.ll | 4 +- .../test/CodeGen/X86/avx512ifma-intrinsics.ll | 8 +- .../CodeGen/X86/avx512ifmavl-intrinsics.ll | 16 +- .../X86/avx512vl-intrinsics-upgrade.ll | 64 +- llvm/test/CodeGen/X86/avx512vl-intrinsics.ll | 28 +- llvm/test/CodeGen/X86/bswap_tree2.ll | 35 +- llvm/test/CodeGen/X86/fold-tied-op.ll | 7 +- llvm/test/CodeGen/X86/fp128-i128.ll | 2 +- llvm/test/CodeGen/X86/haddsub-2.ll | 12 +- llvm/test/CodeGen/X86/lrshrink.ll | 57 -- llvm/test/CodeGen/X86/madd.ll | 34 +- .../X86/merge-consecutive-loads-128.ll | 16 +- llvm/test/CodeGen/X86/misched-matrix.ll | 4 +- llvm/test/CodeGen/X86/oddshuffles.ll | 34 +- llvm/test/CodeGen/X86/pmul.ll | 55 +- llvm/test/CodeGen/X86/pr29112.ll | 8 +- llvm/test/CodeGen/X86/pr31088.ll | 2 +- llvm/test/CodeGen/X86/rotate.ll | 16 +- llvm/test/CodeGen/X86/sad.ll | 953 +++++++++--------- llvm/test/CodeGen/X86/select.ll | 30 +- llvm/test/CodeGen/X86/setcc-wide-types.ll | 56 +- llvm/test/CodeGen/X86/shrink_vmul_sse.ll | 2 +- .../CodeGen/X86/sse-intrinsics-fast-isel.ll | 10 +- llvm/test/CodeGen/X86/sse1.ll | 8 +- llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll | 14 +- llvm/test/CodeGen/X86/sse41.ll | 8 +- llvm/test/CodeGen/X86/vec_int_to_fp.ll | 84 +- llvm/test/CodeGen/X86/vector-bitreverse.ll | 6 +- llvm/test/CodeGen/X86/vector-blend.ll | 4 +- llvm/test/CodeGen/X86/vector-sqrt.ll | 8 +- .../CodeGen/X86/x86-interleaved-access.ll | 14 +- 56 files changed, 1433 insertions(+), 1689 deletions(-) delete mode 100644 llvm/lib/CodeGen/LiveRangeShrink.cpp delete mode 100644 llvm/test/CodeGen/X86/lrshrink.ll diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h index 3bcfc1c4254b..f3e04cffcda6 100644 --- a/llvm/include/llvm/CodeGen/Passes.h +++ b/llvm/include/llvm/CodeGen/Passes.h @@ -133,10 +133,6 @@ namespace llvm { // instruction and update the MachineFunctionInfo with that information. extern char &ShrinkWrapID; - /// LiveRangeShrink pass. 
Move instruction close to its definition to shrink - /// the definition's live range. - extern char &LiveRangeShrinkID; - /// Greedy register allocator. extern char &RAGreedyID; diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index cf314e19d1ca..3df5244a0bd6 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -187,7 +187,6 @@ void initializeLintPass(PassRegistry&); void initializeLiveDebugValuesPass(PassRegistry&); void initializeLiveDebugVariablesPass(PassRegistry&); void initializeLiveIntervalsPass(PassRegistry&); -void initializeLiveRangeShrinkPass(PassRegistry&); void initializeLiveRegMatrixPass(PassRegistry&); void initializeLiveStacksPass(PassRegistry&); void initializeLiveVariablesPass(PassRegistry&); diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt index 55a27e2fb79e..2b5863aa5800 100644 --- a/llvm/lib/CodeGen/CMakeLists.txt +++ b/llvm/lib/CodeGen/CMakeLists.txt @@ -49,7 +49,6 @@ add_llvm_library(LLVMCodeGen LivePhysRegs.cpp LiveRangeCalc.cpp LiveRangeEdit.cpp - LiveRangeShrink.cpp LiveRegMatrix.cpp LiveRegUnits.cpp LiveStackAnalysis.cpp diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp index 4d30c6574b12..2a2715beaadc 100644 --- a/llvm/lib/CodeGen/CodeGen.cpp +++ b/llvm/lib/CodeGen/CodeGen.cpp @@ -43,7 +43,6 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeLiveDebugValuesPass(Registry); initializeLiveDebugVariablesPass(Registry); initializeLiveIntervalsPass(Registry); - initializeLiveRangeShrinkPass(Registry); initializeLiveStacksPass(Registry); initializeLiveVariablesPass(Registry); initializeLocalStackSlotPassPass(Registry); diff --git a/llvm/lib/CodeGen/LiveRangeShrink.cpp b/llvm/lib/CodeGen/LiveRangeShrink.cpp deleted file mode 100644 index 00182e2c779f..000000000000 --- a/llvm/lib/CodeGen/LiveRangeShrink.cpp +++ /dev/null @@ -1,211 +0,0 @@ -//===-- LiveRangeShrink.cpp - Move instructions to shrink live range ------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -///===---------------------------------------------------------------------===// -/// -/// \file -/// This pass moves instructions close to the definition of its operands to -/// shrink live range of the def instruction. The code motion is limited within -/// the basic block. The moved instruction should have 1 def, and more than one -/// uses, all of which are the only use of the def. -/// -///===---------------------------------------------------------------------===// -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Support/Debug.h" - -#define DEBUG_TYPE "lrshrink" - -STATISTIC(NumInstrsHoistedToShrinkLiveRange, - "Number of insructions hoisted to shrink live range."); - -using namespace llvm; - -namespace { -class LiveRangeShrink : public MachineFunctionPass { -public: - static char ID; - - LiveRangeShrink() : MachineFunctionPass(ID) { - initializeLiveRangeShrinkPass(*PassRegistry::getPassRegistry()); - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - MachineFunctionPass::getAnalysisUsage(AU); - } - - StringRef getPassName() const override { return "Live Range Shrink"; } - - bool runOnMachineFunction(MachineFunction &MF) override; -}; -} // End anonymous namespace. 
-
-char LiveRangeShrink::ID = 0;
-char &llvm::LiveRangeShrinkID = LiveRangeShrink::ID;
-
-INITIALIZE_PASS(LiveRangeShrink, "lrshrink", "Live Range Shrink Pass", false,
-                false)
-namespace {
-typedef DenseMap<MachineInstr *, unsigned> InstOrderMap;
-
-/// Returns \p New if it's dominated by \p Old, otherwise return \p Old.
-/// \p M maintains a map from instruction to its dominating order that satisfies
-/// M[A] > M[B] guarantees that A is dominated by B.
-/// If \p New is not in \p M, return \p Old. Otherwise if \p Old is null, return
-/// \p New.
-MachineInstr *FindDominatedInstruction(MachineInstr &New, MachineInstr *Old,
-                                       const InstOrderMap &M) {
-  auto NewIter = M.find(&New);
-  if (NewIter == M.end())
-    return Old;
-  if (Old == nullptr)
-    return &New;
-  unsigned OrderOld = M.find(Old)->second;
-  unsigned OrderNew = NewIter->second;
-  if (OrderOld != OrderNew)
-    return OrderOld < OrderNew ? &New : Old;
-  // OrderOld == OrderNew, we need to iterate down from Old to see if it
-  // can reach New, if yes, New is dominated by Old.
-  for (MachineInstr *I = Old->getNextNode(); M.find(I)->second == OrderNew;
-       I = I->getNextNode())
-    if (I == &New)
-      return &New;
-  return Old;
-}
-
-/// Builds Instruction to its dominating order number map \p M by traversing
-/// from instruction \p Start.
-void BuildInstOrderMap(MachineBasicBlock::iterator Start, InstOrderMap &M) {
-  M.clear();
-  unsigned i = 0;
-  for (MachineInstr &I : make_range(Start, Start->getParent()->end()))
-    M[&I] = i++;
-}
-} // end anonymous namespace
-
-bool LiveRangeShrink::runOnMachineFunction(MachineFunction &MF) {
-  if (skipFunction(*MF.getFunction()))
-    return false;
-
-  MachineRegisterInfo &MRI = MF.getRegInfo();
-
-  DEBUG(dbgs() << "**** Analysing " << MF.getName() << '\n');
-
-  InstOrderMap IOM;
-  // Map from register to instruction order (value of IOM) where the
-  // register is used last. When moving instructions up, we need to
-  // make sure all its defs (including dead def) will not cross its
-  // last use when moving up.
-  DenseMap<unsigned, unsigned> UseMap;
-
-  for (MachineBasicBlock &MBB : MF) {
-    if (MBB.empty())
-      continue;
-    bool SawStore = false;
-    BuildInstOrderMap(MBB.begin(), IOM);
-    UseMap.clear();
-
-    for (MachineBasicBlock::iterator Next = MBB.begin(); Next != MBB.end();) {
-      MachineInstr &MI = *Next;
-      ++Next;
-      if (MI.isPHI() || MI.isDebugValue())
-        continue;
-      if (MI.mayStore())
-        SawStore = true;
-
-      unsigned CurrentOrder = IOM[&MI];
-      unsigned Barrier = 0;
-      for (const MachineOperand &MO : MI.operands()) {
-        if (!MO.isReg() || MO.isDebug())
-          continue;
-        if (MO.isUse())
-          UseMap[MO.getReg()] = CurrentOrder;
-        else if (MO.isDead() && UseMap.count(MO.getReg()))
-          // Barrier is the last instruction where MO get used. MI should not
-          // be moved above Barrier.
-          Barrier = std::max(Barrier, UseMap[MO.getReg()]);
-      }
-
-      if (!MI.isSafeToMove(nullptr, SawStore)) {
-        // If MI has side effects, it should become a barrier for code motion.
-        // IOM is rebuild from the next instruction to prevent later
-        // instructions from being moved before this MI.
-        if (MI.hasUnmodeledSideEffects() && Next != MBB.end()) {
-          BuildInstOrderMap(Next, IOM);
-          SawStore = false;
-        }
-        continue;
-      }
-
-      const MachineOperand *DefMO = nullptr;
-      MachineInstr *Insert = nullptr;
-
-      // Number of live-ranges that will be shortened. We do not count
-      // live-ranges that are defined by a COPY as it could be coalesced later.
- unsigned NumEligibleUse = 0; - - for (const MachineOperand &MO : MI.operands()) { - if (!MO.isReg() || MO.isDead() || MO.isDebug()) - continue; - unsigned Reg = MO.getReg(); - // Do not move the instruction if it def/uses a physical register, - // unless it is a constant physical register. - if (TargetRegisterInfo::isPhysicalRegister(Reg) && - !MRI.isConstantPhysReg(Reg)) { - Insert = nullptr; - break; - } - if (MO.isDef()) { - // Do not move if there is more than one def. - if (DefMO) { - Insert = nullptr; - break; - } - DefMO = &MO; - } else if (MRI.hasOneNonDBGUse(Reg) && MRI.hasOneDef(Reg)) { - MachineInstr &DefInstr = *MRI.def_instr_begin(Reg); - if (!DefInstr.isCopy()) - NumEligibleUse++; - Insert = FindDominatedInstruction(DefInstr, Insert, IOM); - } else { - Insert = nullptr; - break; - } - } - // Move the instruction when # of shrunk live range > 1. - if (DefMO && Insert && NumEligibleUse > 1 && Barrier <= IOM[Insert]) { - MachineBasicBlock::iterator I = std::next(Insert->getIterator()); - // Skip all the PHI and debug instructions. - while (I != MBB.end() && (I->isPHI() || I->isDebugValue())) - I = std::next(I); - if (I == MI.getIterator()) - continue; - - // Update the dominator order to be the same as the insertion point. - // We do this to maintain a non-decreasing order without need to update - // all instruction orders after the insertion point. - unsigned NewOrder = IOM[&*I]; - IOM[&MI] = NewOrder; - NumInstrsHoistedToShrinkLiveRange++; - - // Find MI's debug value following MI. - MachineBasicBlock::iterator EndIter = std::next(MI.getIterator()); - if (MI.getOperand(0).isReg()) - for (; EndIter != MBB.end() && EndIter->isDebugValue() && - EndIter->getOperand(0).isReg() && - EndIter->getOperand(0).getReg() == MI.getOperand(0).getReg(); - ++EndIter, ++Next) - IOM[&*EndIter] = NewOrder; - MBB.splice(I, &MBB, MI.getIterator(), EndIter); - } - } - } - return false; -} diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp index fdda866b5977..e717f84104c5 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -440,7 +440,6 @@ bool X86PassConfig::addPreISel() { void X86PassConfig::addPreRegAlloc() { if (getOptLevel() != CodeGenOpt::None) { - addPass(&LiveRangeShrinkID); addPass(createX86FixupSetCC()); addPass(createX86OptimizeLEAs()); addPass(createX86CallFrameOptimization()); diff --git a/llvm/test/CodeGen/NVPTX/vec8.ll b/llvm/test/CodeGen/NVPTX/vec8.ll index 93b39c1125f8..a86ba1e29d5c 100644 --- a/llvm/test/CodeGen/NVPTX/vec8.ll +++ b/llvm/test/CodeGen/NVPTX/vec8.ll @@ -7,7 +7,7 @@ define void @foo(<8 x i8> %a, i8* %b) { ; CHECK-DAG: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [foo_param_0] ; CHECK-DAG: ld.param.v4.u8 {[[E4:%rs[0-9]+]], [[E5:%rs[0-9]+]], [[E6:%rs[0-9]+]], [[E7:%rs[0-9]+]]}, [foo_param_0+4] ; CHECK-DAG: ld.param.u32 %[[B:r[0-9+]]], [foo_param_1] -; CHECK-DAG: add.s16 [[T:%rs[0-9+]]], [[E1]], [[E6]]; +; CHECK: add.s16 [[T:%rs[0-9+]]], [[E1]], [[E6]]; ; CHECK: st.u8 [%[[B]]], [[T]]; %t0 = extractelement <8 x i8> %a, i32 1 %t1 = extractelement <8 x i8> %a, i32 6 diff --git a/llvm/test/CodeGen/SPARC/LeonItinerariesUT.ll b/llvm/test/CodeGen/SPARC/LeonItinerariesUT.ll index 5a6be134686b..87e0c4621c08 100644 --- a/llvm/test/CodeGen/SPARC/LeonItinerariesUT.ll +++ b/llvm/test/CodeGen/SPARC/LeonItinerariesUT.ll @@ -47,4 +47,4 @@ entry: %6 = fmul float %5, %3 %7 = fdiv float %6, %4 ret float %7 -} +} \ No newline at end of file diff --git 
a/llvm/test/CodeGen/X86/2007-01-08-InstrSched.ll b/llvm/test/CodeGen/X86/2007-01-08-InstrSched.ll index 24aa5b98d0bb..4ec703921e29 100644 --- a/llvm/test/CodeGen/X86/2007-01-08-InstrSched.ll +++ b/llvm/test/CodeGen/X86/2007-01-08-InstrSched.ll @@ -13,10 +13,10 @@ define float @foo(float %x) nounwind { ; CHECK: mulss ; CHECK: mulss -; CHECK: addss +; CHECK: mulss ; CHECK: mulss ; CHECK: addss -; CHECK: mulss +; CHECK: addss ; CHECK: addss ; CHECK: ret } diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll index aa28ef5175ed..2aaf14001758 100644 --- a/llvm/test/CodeGen/X86/avg.ll +++ b/llvm/test/CodeGen/X86/avg.ll @@ -135,87 +135,88 @@ define void @avg_v16i8(<16 x i8>* %a, <16 x i8>* %b) { define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) { ; SSE2-LABEL: avg_v32i8: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm3 -; SSE2-NEXT: movdqa 16(%rdi), %xmm8 +; SSE2-NEXT: movdqa (%rdi), %xmm8 +; SSE2-NEXT: movdqa 16(%rdi), %xmm11 ; SSE2-NEXT: movdqa (%rsi), %xmm0 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1 ; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm3, %xmm12 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm8, %xmm7 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm7, %xmm11 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] ; SSE2-NEXT: movdqa %xmm8, %xmm10 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm10, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm8, %xmm12 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = 
xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm2, %xmm9 +; SSE2-NEXT: movdqa %xmm11, %xmm15 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm4[8],xmm15[9],xmm4[9],xmm15[10],xmm4[10],xmm15[11],xmm4[11],xmm15[12],xmm4[12],xmm15[13],xmm4[13],xmm15[14],xmm4[14],xmm15[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm15, %xmm14 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm11, %xmm9 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] -; SSE2-NEXT: paddd %xmm6, %xmm9 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SSE2-NEXT: paddd %xmm5, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; SSE2-NEXT: paddd %xmm12, %xmm5 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE2-NEXT: paddd %xmm3, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3] +; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm3, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; SSE2-NEXT: paddd %xmm11, %xmm6 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; SSE2-NEXT: paddd %xmm7, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: movdqa %xmm3, %xmm7 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; SSE2-NEXT: paddd %xmm10, %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm2, %xmm5 
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm1, %xmm13 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: paddd %xmm8, %xmm1 +; SSE2-NEXT: paddd %xmm11, %xmm1 +; SSE2-NEXT: paddd %xmm9, %xmm13 +; SSE2-NEXT: paddd %xmm15, %xmm2 +; SSE2-NEXT: paddd %xmm14, %xmm5 +; SSE2-NEXT: paddd %xmm8, %xmm0 +; SSE2-NEXT: paddd %xmm12, %xmm6 +; SSE2-NEXT: paddd %xmm10, %xmm3 +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1] -; SSE2-NEXT: paddd %xmm4, %xmm9 -; SSE2-NEXT: paddd %xmm4, %xmm2 -; SSE2-NEXT: paddd %xmm4, %xmm5 -; SSE2-NEXT: paddd %xmm4, %xmm0 -; SSE2-NEXT: paddd %xmm4, %xmm6 -; SSE2-NEXT: paddd %xmm4, %xmm3 ; SSE2-NEXT: paddd %xmm4, %xmm7 +; SSE2-NEXT: paddd %xmm4, %xmm3 +; SSE2-NEXT: paddd %xmm4, %xmm6 +; SSE2-NEXT: paddd %xmm4, %xmm0 +; SSE2-NEXT: paddd %xmm4, %xmm5 +; SSE2-NEXT: paddd %xmm4, %xmm2 +; SSE2-NEXT: paddd %xmm4, %xmm13 ; SSE2-NEXT: paddd %xmm4, %xmm1 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm7 ; SSE2-NEXT: psrld $1, %xmm3 -; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: psrld $1, %xmm0 -; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: psrld $1, %xmm9 +; SSE2-NEXT: psrld $1, %xmm7 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE2-NEXT: pand %xmm4, %xmm9 -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: packuswb %xmm9, %xmm2 -; SSE2-NEXT: pand %xmm4, %xmm5 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: packuswb %xmm5, %xmm0 -; SSE2-NEXT: packuswb %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm4, %xmm6 -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: packuswb %xmm6, %xmm3 ; SSE2-NEXT: pand %xmm4, %xmm7 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: packuswb %xmm7, %xmm3 +; SSE2-NEXT: psrld $1, %xmm0 +; SSE2-NEXT: psrld $1, %xmm6 +; SSE2-NEXT: pand %xmm4, %xmm6 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: packuswb %xmm6, %xmm0 +; SSE2-NEXT: packuswb %xmm3, %xmm0 +; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: psrld $1, %xmm5 +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: packuswb %xmm5, %xmm2 +; SSE2-NEXT: psrld $1, %xmm1 +; SSE2-NEXT: psrld $1, %xmm13 +; SSE2-NEXT: pand %xmm4, %xmm13 ; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: packuswb %xmm7, %xmm1 -; SSE2-NEXT: packuswb %xmm3, %xmm1 +; SSE2-NEXT: packuswb %xmm13, %xmm1 +; SSE2-NEXT: packuswb %xmm2, %xmm1 ; SSE2-NEXT: movdqu %xmm1, (%rax) ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq @@ -258,183 +259,198 @@ define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) { define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) { ; SSE2-LABEL: avg_v64i8: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm6 -; SSE2-NEXT: movdqa 16(%rdi), %xmm2 -; SSE2-NEXT: movdqa 32(%rdi), %xmm1 -; SSE2-NEXT: movdqa 48(%rdi), %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa (%rsi), %xmm5 -; SSE2-NEXT: movdqa 16(%rsi), %xmm13 -; SSE2-NEXT: movdqa 32(%rsi), %xmm11 +; SSE2-NEXT: subq $152, %rsp +; SSE2-NEXT: .Lcfi0: +; SSE2-NEXT: .cfi_def_cfa_offset 160 +; 
SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: movdqa 16(%rdi), %xmm4 +; SSE2-NEXT: movdqa 32(%rdi), %xmm5 +; SSE2-NEXT: movdqa 48(%rdi), %xmm6 ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm6, %xmm4 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm4, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm6, %xmm12 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm2, %xmm15 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm15, %xmm14 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, %xmm8 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm5, %xmm10 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm10, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: paddd %xmm7, %xmm3 -; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm1, %xmm7 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE2-NEXT: paddd %xmm4, %xmm10 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm5, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: paddd %xmm12, %xmm3 -; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE2-NEXT: paddd %xmm6, %xmm5 -; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm13, %xmm4 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = 
xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm4, %xmm12 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] -; SSE2-NEXT: paddd %xmm14, %xmm12 -; SSE2-NEXT: movdqa %xmm7, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE2-NEXT: paddd %xmm15, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3],xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm13, %xmm15 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] -; SSE2-NEXT: paddd %xmm8, %xmm15 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; SSE2-NEXT: paddd %xmm2, %xmm13 -; SSE2-NEXT: movdqa %xmm11, %xmm6 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm6, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] -; SSE2-NEXT: paddd %xmm5, %xmm9 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE2-NEXT: paddd %xmm7, %xmm6 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3],xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm11, %xmm14 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] -; SSE2-NEXT: paddd %xmm2, %xmm14 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] -; SSE2-NEXT: paddd %xmm1, %xmm11 -; 
SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movdqa 48(%rsi), %xmm7 -; SSE2-NEXT: movdqa %xmm7, %xmm3 +; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm4, %xmm3 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm3, %xmm8 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] -; SSE2-NEXT: paddd %xmm1, %xmm8 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE2-NEXT: paddd %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm4, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm5, %xmm3 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] ; SSE2-NEXT: movdqa %xmm5, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm6, %xmm8 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15] +; SSE2-NEXT: movdqa %xmm8, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm6, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = 
xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa (%rsi), %xmm14 +; SSE2-NEXT: movdqa %xmm14, %xmm7 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15] +; SSE2-NEXT: movdqa %xmm7, %xmm15 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3],xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm14, %xmm9 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; SSE2-NEXT: movdqa 16(%rsi), %xmm12 +; SSE2-NEXT: movdqa %xmm12, %xmm6 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15] +; SSE2-NEXT: movdqa %xmm6, %xmm13 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3],xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm12, %xmm10 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] +; SSE2-NEXT: movdqa 32(%rsi), %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] +; SSE2-NEXT: movdqa %xmm5, %xmm11 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm7, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; SSE2-NEXT: paddd %xmm1, %xmm5 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = 
xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE2-NEXT: paddd %xmm2, %xmm7 +; SSE2-NEXT: movdqa 48(%rsi), %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] +; SSE2-NEXT: movdqa %xmm4, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload +; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: paddd %xmm8, %xmm4 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Reload +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Folded Reload +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Folded Reload +; SSE2-NEXT: paddd (%rsp), %xmm11 # 16-byte Folded Reload +; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm12 # 16-byte Folded Reload +; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm10 # 16-byte Folded Reload +; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm6 # 16-byte Folded Reload +; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm13 # 16-byte Folded Reload +; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm14 # 16-byte Folded Reload +; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm9 # 16-byte Folded Reload +; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload +; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm15 # 16-byte Folded Reload ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1] -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: paddd %xmm0, %xmm10 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: paddd %xmm0, %xmm2 -; SSE2-NEXT: paddd %xmm0, %xmm12 -; SSE2-NEXT: paddd %xmm0, %xmm4 ; SSE2-NEXT: paddd %xmm0, %xmm15 -; SSE2-NEXT: paddd %xmm0, %xmm13 -; SSE2-NEXT: paddd %xmm0, %xmm9 -; SSE2-NEXT: paddd %xmm0, %xmm6 -; SSE2-NEXT: paddd %xmm0, %xmm14 -; SSE2-NEXT: paddd %xmm0, %xmm11 -; SSE2-NEXT: paddd %xmm0, %xmm8 -; SSE2-NEXT: paddd %xmm0, %xmm3 -; SSE2-NEXT: paddd %xmm0, %xmm5 ; SSE2-NEXT: paddd %xmm0, %xmm7 -; SSE2-NEXT: psrld $1, %xmm10 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm0, %xmm10 -; SSE2-NEXT: packuswb %xmm1, %xmm10 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: movdqa 
-{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: packuswb %xmm1, %xmm2 -; SSE2-NEXT: packuswb %xmm10, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: psrld $1, %xmm4 -; SSE2-NEXT: psrld $1, %xmm12 -; SSE2-NEXT: pand %xmm0, %xmm12 -; SSE2-NEXT: pand %xmm0, %xmm4 -; SSE2-NEXT: packuswb %xmm12, %xmm4 -; SSE2-NEXT: psrld $1, %xmm13 +; SSE2-NEXT: paddd %xmm0, %xmm9 +; SSE2-NEXT: paddd %xmm0, %xmm14 +; SSE2-NEXT: paddd %xmm0, %xmm13 +; SSE2-NEXT: paddd %xmm0, %xmm6 +; SSE2-NEXT: paddd %xmm0, %xmm10 +; SSE2-NEXT: paddd %xmm0, %xmm12 +; SSE2-NEXT: paddd %xmm0, %xmm11 +; SSE2-NEXT: paddd %xmm0, %xmm5 +; SSE2-NEXT: paddd %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: paddd %xmm0, %xmm2 +; SSE2-NEXT: paddd %xmm0, %xmm8 +; SSE2-NEXT: paddd %xmm0, %xmm4 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload +; SSE2-NEXT: paddd %xmm0, %xmm3 +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: psrld $1, %xmm7 ; SSE2-NEXT: psrld $1, %xmm15 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] ; SSE2-NEXT: pand %xmm0, %xmm15 -; SSE2-NEXT: pand %xmm0, %xmm13 -; SSE2-NEXT: packuswb %xmm15, %xmm13 -; SSE2-NEXT: packuswb %xmm4, %xmm13 -; SSE2-NEXT: psrld $1, %xmm6 +; SSE2-NEXT: pand %xmm0, %xmm7 +; SSE2-NEXT: packuswb %xmm15, %xmm7 +; SSE2-NEXT: psrld $1, %xmm14 ; SSE2-NEXT: psrld $1, %xmm9 ; SSE2-NEXT: pand %xmm0, %xmm9 -; SSE2-NEXT: pand %xmm0, %xmm6 -; SSE2-NEXT: packuswb %xmm9, %xmm6 -; SSE2-NEXT: psrld $1, %xmm11 -; SSE2-NEXT: psrld $1, %xmm14 ; SSE2-NEXT: pand %xmm0, %xmm14 +; SSE2-NEXT: packuswb %xmm9, %xmm14 +; SSE2-NEXT: packuswb %xmm7, %xmm14 +; SSE2-NEXT: psrld $1, %xmm6 +; SSE2-NEXT: psrld $1, %xmm13 +; SSE2-NEXT: pand %xmm0, %xmm13 +; SSE2-NEXT: pand %xmm0, %xmm6 +; SSE2-NEXT: packuswb %xmm13, %xmm6 +; SSE2-NEXT: psrld $1, %xmm12 +; SSE2-NEXT: psrld $1, %xmm10 +; SSE2-NEXT: pand %xmm0, %xmm10 +; SSE2-NEXT: pand %xmm0, %xmm12 +; SSE2-NEXT: packuswb %xmm10, %xmm12 +; SSE2-NEXT: packuswb %xmm6, %xmm12 +; SSE2-NEXT: psrld $1, %xmm5 +; SSE2-NEXT: psrld $1, %xmm11 ; SSE2-NEXT: pand %xmm0, %xmm11 -; SSE2-NEXT: packuswb %xmm14, %xmm11 -; SSE2-NEXT: packuswb %xmm6, %xmm11 -; SSE2-NEXT: psrld $1, %xmm3 -; SSE2-NEXT: psrld $1, %xmm8 -; SSE2-NEXT: pand %xmm0, %xmm8 -; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: packuswb %xmm8, %xmm3 -; SSE2-NEXT: psrld $1, %xmm7 +; SSE2-NEXT: pand %xmm0, %xmm5 +; SSE2-NEXT: packuswb %xmm11, %xmm5 +; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload +; SSE2-NEXT: psrld $1, %xmm6 +; SSE2-NEXT: pand %xmm0, %xmm6 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: packuswb %xmm6, %xmm2 +; SSE2-NEXT: packuswb %xmm5, %xmm2 +; SSE2-NEXT: psrld $1, %xmm4 +; SSE2-NEXT: movdqa %xmm8, %xmm5 ; SSE2-NEXT: psrld $1, %xmm5 ; SSE2-NEXT: pand %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm0, %xmm7 -; SSE2-NEXT: packuswb %xmm5, %xmm7 -; SSE2-NEXT: packuswb %xmm3, %xmm7 -; SSE2-NEXT: movdqu %xmm7, (%rax) -; SSE2-NEXT: movdqu %xmm11, (%rax) -; SSE2-NEXT: movdqu %xmm13, (%rax) +; SSE2-NEXT: pand %xmm0, %xmm4 +; SSE2-NEXT: packuswb %xmm5, %xmm4 +; SSE2-NEXT: psrld $1, %xmm1 +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: psrld $1, %xmm5 +; SSE2-NEXT: pand %xmm0, %xmm5 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: packuswb %xmm5, %xmm1 +; SSE2-NEXT: packuswb %xmm4, %xmm1 ; SSE2-NEXT: movdqu %xmm1, (%rax) +; SSE2-NEXT: movdqu %xmm2, (%rax) +; SSE2-NEXT: movdqu %xmm12, (%rax) +; SSE2-NEXT: movdqu %xmm14, 
(%rax) +; SSE2-NEXT: addq $152, %rsp ; SSE2-NEXT: retq ; ; AVX2-LABEL: avg_v64i8: @@ -448,21 +464,21 @@ define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) { ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm12 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm15, %ymm7, %ymm7 +; AVX2-NEXT: vpaddd %ymm14, %ymm6, %ymm6 +; AVX2-NEXT: vpaddd %ymm13, %ymm5, %ymm5 +; AVX2-NEXT: vpaddd %ymm12, %ymm4, %ymm4 +; AVX2-NEXT: vpaddd %ymm11, %ymm3, %ymm3 +; AVX2-NEXT: vpaddd %ymm10, %ymm2, %ymm2 +; AVX2-NEXT: vpaddd %ymm9, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm8, %ymm0, %ymm0 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm8, %ymm1, %ymm1 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm8, %ymm2, %ymm2 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm8, %ymm3, %ymm3 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm8, %ymm4, %ymm4 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm8, %ymm5, %ymm5 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm8, %ymm6, %ymm6 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm8, %ymm7, %ymm7 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm8 ; AVX2-NEXT: vpaddd %ymm8, %ymm0, %ymm9 ; AVX2-NEXT: vpaddd %ymm8, %ymm1, %ymm10 @@ -524,13 +540,13 @@ define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) { ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm7 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpaddd %zmm7, %zmm3, %zmm3 +; AVX512F-NEXT: vpaddd %zmm6, %zmm2, %zmm2 +; AVX512F-NEXT: vpaddd %zmm5, %zmm1, %zmm1 ; AVX512F-NEXT: vpaddd %zmm4, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpaddd %zmm4, %zmm1, %zmm1 -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpaddd %zmm4, %zmm2, %zmm2 -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpaddd %zmm4, %zmm3, %zmm3 ; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm4 ; AVX512F-NEXT: vpaddd %zmm4, %zmm0, %zmm0 ; AVX512F-NEXT: vpaddd %zmm4, %zmm1, %zmm1 @@ -657,27 +673,27 @@ define void @avg_v8i16(<8 x i16>* %a, <8 x i16>* %b) { define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) { ; SSE2-LABEL: avg_v16i16: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm2 -; SSE2-NEXT: movdqa 16(%rdi), %xmm4 +; SSE2-NEXT: movdqa (%rdi), %xmm4 +; SSE2-NEXT: movdqa 16(%rdi), %xmm5 ; SSE2-NEXT: movdqa (%rsi), %xmm0 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSE2-NEXT: movdqa %xmm4, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; SSE2-NEXT: pxor %xmm6, %xmm6 +; SSE2-NEXT: movdqa %xmm4, %xmm8 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; SSE2-NEXT: movdqa %xmm5, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = 
xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; SSE2-NEXT: paddd %xmm6, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE2-NEXT: paddd %xmm2, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] ; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; SSE2-NEXT: paddd %xmm5, %xmm1 ; SSE2-NEXT: paddd %xmm7, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; SSE2-NEXT: paddd %xmm4, %xmm1 +; SSE2-NEXT: paddd %xmm4, %xmm0 +; SSE2-NEXT: paddd %xmm8, %xmm3 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1] ; SSE2-NEXT: paddd %xmm4, %xmm3 ; SSE2-NEXT: paddd %xmm4, %xmm0 @@ -739,79 +755,80 @@ define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) { define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) { ; SSE2-LABEL: avg_v32i16: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm4 -; SSE2-NEXT: movdqa 16(%rdi), %xmm11 -; SSE2-NEXT: movdqa 32(%rdi), %xmm10 +; SSE2-NEXT: movdqa (%rdi), %xmm10 +; SSE2-NEXT: movdqa 16(%rdi), %xmm9 +; SSE2-NEXT: movdqa 32(%rdi), %xmm11 ; SSE2-NEXT: movdqa 48(%rdi), %xmm8 -; SSE2-NEXT: movdqa (%rsi), %xmm9 +; SSE2-NEXT: movdqa (%rsi), %xmm14 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1 ; SSE2-NEXT: movdqa 32(%rsi), %xmm2 ; SSE2-NEXT: movdqa 48(%rsi), %xmm3 ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm4, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm11, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm10, %xmm12 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm10, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm9, %xmm12 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm11, %xmm15 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] ; SSE2-NEXT: movdqa %xmm8, %xmm13 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm9, %xmm7 +; SSE2-NEXT: 
movdqa %xmm14, %xmm7 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; SSE2-NEXT: paddd %xmm6, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; SSE2-NEXT: paddd %xmm4, %xmm9 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] ; SSE2-NEXT: movdqa %xmm1, %xmm6 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; SSE2-NEXT: paddd %xmm5, %xmm6 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: paddd %xmm11, %xmm1 ; SSE2-NEXT: movdqa %xmm2, %xmm5 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; SSE2-NEXT: paddd %xmm12, %xmm5 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: paddd %xmm10, %xmm2 ; SSE2-NEXT: movdqa %xmm3, %xmm4 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; SSE2-NEXT: paddd %xmm13, %xmm4 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; SSE2-NEXT: paddd %xmm8, %xmm3 +; SSE2-NEXT: paddd %xmm13, %xmm4 +; SSE2-NEXT: paddd %xmm11, %xmm2 +; SSE2-NEXT: paddd %xmm15, %xmm5 +; SSE2-NEXT: paddd %xmm9, %xmm1 +; SSE2-NEXT: paddd %xmm12, %xmm6 +; SSE2-NEXT: paddd %xmm10, %xmm14 +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1] ; SSE2-NEXT: paddd %xmm0, %xmm7 -; SSE2-NEXT: paddd %xmm0, %xmm9 +; SSE2-NEXT: paddd %xmm0, %xmm14 ; SSE2-NEXT: paddd %xmm0, %xmm6 ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: paddd %xmm0, %xmm5 ; SSE2-NEXT: paddd %xmm0, %xmm2 ; SSE2-NEXT: paddd %xmm0, %xmm4 ; SSE2-NEXT: paddd %xmm0, %xmm3 -; SSE2-NEXT: psrld $1, %xmm3 -; SSE2-NEXT: psrld $1, %xmm4 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: psrld $1, %xmm9 +; SSE2-NEXT: psrld $1, %xmm14 ; SSE2-NEXT: psrld $1, %xmm7 ; SSE2-NEXT: pslld $16, %xmm7 ; SSE2-NEXT: psrad $16, %xmm7 -; SSE2-NEXT: pslld $16, %xmm9 -; SSE2-NEXT: psrad $16, %xmm9 -; SSE2-NEXT: packssdw %xmm7, %xmm9 +; SSE2-NEXT: pslld $16, %xmm14 +; SSE2-NEXT: psrad $16, %xmm14 +; SSE2-NEXT: packssdw %xmm7, %xmm14 +; SSE2-NEXT: psrld $1, %xmm1 +; SSE2-NEXT: psrld $1, %xmm6 ; SSE2-NEXT: pslld $16, %xmm6 ; SSE2-NEXT: psrad $16, %xmm6 ; SSE2-NEXT: pslld $16, %xmm1 ; SSE2-NEXT: psrad $16, %xmm1 ; SSE2-NEXT: packssdw %xmm6, %xmm1 +; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: psrld $1, %xmm5 ; SSE2-NEXT: pslld $16, %xmm5 ; SSE2-NEXT: psrad $16, %xmm5 ; SSE2-NEXT: pslld $16, %xmm2 ; SSE2-NEXT: psrad $16, %xmm2 ; SSE2-NEXT: packssdw %xmm5, %xmm2 +; SSE2-NEXT: psrld $1, %xmm3 +; SSE2-NEXT: psrld $1, %xmm4 ; SSE2-NEXT: pslld $16, %xmm4 ; SSE2-NEXT: psrad $16, %xmm4 ; SSE2-NEXT: pslld $16, %xmm3 @@ -820,7 +837,7 @@ define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) { ; SSE2-NEXT: movdqu %xmm3, (%rax) ; SSE2-NEXT: movdqu %xmm2, (%rax) ; SSE2-NEXT: movdqu %xmm1, (%rax) -; SSE2-NEXT: movdqu %xmm9, (%rax) +; SSE2-NEXT: movdqu %xmm14, (%rax) ; SSE2-NEXT: retq ; ; AVX2-LABEL: avg_v32i16: @@ -830,13 +847,13 @@ define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) { ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; 
AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpaddd %ymm7, %ymm3, %ymm3 +; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2 +; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpaddd %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm4 ; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1 @@ -867,9 +884,9 @@ define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) { ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpaddd %zmm3, %zmm1, %zmm1 ; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm2 ; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1 @@ -1030,87 +1047,88 @@ define void @avg_v16i8_2(<16 x i8>* %a, <16 x i8>* %b) { define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) { ; SSE2-LABEL: avg_v32i8_2: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm3 -; SSE2-NEXT: movdqa 16(%rdi), %xmm8 +; SSE2-NEXT: movdqa (%rdi), %xmm8 +; SSE2-NEXT: movdqa 16(%rdi), %xmm11 ; SSE2-NEXT: movdqa (%rsi), %xmm0 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1 ; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = 
xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm3, %xmm12 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm8, %xmm7 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm7, %xmm11 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] ; SSE2-NEXT: movdqa %xmm8, %xmm10 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm10, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm8, %xmm12 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm2, %xmm9 +; SSE2-NEXT: movdqa %xmm11, %xmm15 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm4[8],xmm15[9],xmm4[9],xmm15[10],xmm4[10],xmm15[11],xmm4[11],xmm15[12],xmm4[12],xmm15[13],xmm4[13],xmm15[14],xmm4[14],xmm15[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm15, %xmm14 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm11, %xmm9 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] -; SSE2-NEXT: paddd %xmm6, 
%xmm9 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SSE2-NEXT: paddd %xmm5, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; SSE2-NEXT: paddd %xmm12, %xmm5 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE2-NEXT: paddd %xmm3, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3] +; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm3, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; SSE2-NEXT: paddd %xmm11, %xmm6 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; SSE2-NEXT: paddd %xmm7, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: movdqa %xmm3, %xmm7 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; SSE2-NEXT: paddd %xmm10, %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm1, %xmm13 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: paddd %xmm8, %xmm1 +; SSE2-NEXT: paddd %xmm11, %xmm1 +; SSE2-NEXT: paddd %xmm9, %xmm13 +; SSE2-NEXT: paddd %xmm15, %xmm2 +; SSE2-NEXT: paddd %xmm14, %xmm5 +; SSE2-NEXT: paddd %xmm8, %xmm0 +; SSE2-NEXT: paddd %xmm12, %xmm6 +; SSE2-NEXT: paddd %xmm10, %xmm3 +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1] -; SSE2-NEXT: paddd %xmm4, %xmm9 -; SSE2-NEXT: paddd %xmm4, %xmm2 -; SSE2-NEXT: paddd %xmm4, %xmm5 -; SSE2-NEXT: paddd %xmm4, 
%xmm0 -; SSE2-NEXT: paddd %xmm4, %xmm6 -; SSE2-NEXT: paddd %xmm4, %xmm3 ; SSE2-NEXT: paddd %xmm4, %xmm7 +; SSE2-NEXT: paddd %xmm4, %xmm3 +; SSE2-NEXT: paddd %xmm4, %xmm6 +; SSE2-NEXT: paddd %xmm4, %xmm0 +; SSE2-NEXT: paddd %xmm4, %xmm5 +; SSE2-NEXT: paddd %xmm4, %xmm2 +; SSE2-NEXT: paddd %xmm4, %xmm13 ; SSE2-NEXT: paddd %xmm4, %xmm1 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm7 ; SSE2-NEXT: psrld $1, %xmm3 -; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: psrld $1, %xmm0 -; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: psrld $1, %xmm9 +; SSE2-NEXT: psrld $1, %xmm7 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE2-NEXT: pand %xmm4, %xmm9 -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: packuswb %xmm9, %xmm2 -; SSE2-NEXT: pand %xmm4, %xmm5 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: packuswb %xmm5, %xmm0 -; SSE2-NEXT: packuswb %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm4, %xmm6 -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: packuswb %xmm6, %xmm3 ; SSE2-NEXT: pand %xmm4, %xmm7 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: packuswb %xmm7, %xmm3 +; SSE2-NEXT: psrld $1, %xmm0 +; SSE2-NEXT: psrld $1, %xmm6 +; SSE2-NEXT: pand %xmm4, %xmm6 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: packuswb %xmm6, %xmm0 +; SSE2-NEXT: packuswb %xmm3, %xmm0 +; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: psrld $1, %xmm5 +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: packuswb %xmm5, %xmm2 +; SSE2-NEXT: psrld $1, %xmm1 +; SSE2-NEXT: psrld $1, %xmm13 +; SSE2-NEXT: pand %xmm4, %xmm13 ; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: packuswb %xmm7, %xmm1 -; SSE2-NEXT: packuswb %xmm3, %xmm1 +; SSE2-NEXT: packuswb %xmm13, %xmm1 +; SSE2-NEXT: packuswb %xmm2, %xmm1 ; SSE2-NEXT: movdqu %xmm1, (%rax) ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq @@ -1494,27 +1512,27 @@ define void @avg_v8i16_2(<8 x i16>* %a, <8 x i16>* %b) { define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) { ; SSE2-LABEL: avg_v16i16_2: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm2 -; SSE2-NEXT: movdqa 16(%rdi), %xmm4 +; SSE2-NEXT: movdqa (%rdi), %xmm4 +; SSE2-NEXT: movdqa 16(%rdi), %xmm5 ; SSE2-NEXT: movdqa (%rsi), %xmm0 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSE2-NEXT: movdqa %xmm4, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; SSE2-NEXT: pxor %xmm6, %xmm6 +; SSE2-NEXT: movdqa %xmm4, %xmm8 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; SSE2-NEXT: movdqa %xmm5, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; SSE2-NEXT: paddd %xmm6, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE2-NEXT: paddd 
%xmm2, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] ; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; SSE2-NEXT: paddd %xmm5, %xmm1 ; SSE2-NEXT: paddd %xmm7, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; SSE2-NEXT: paddd %xmm4, %xmm1 +; SSE2-NEXT: paddd %xmm4, %xmm0 +; SSE2-NEXT: paddd %xmm8, %xmm3 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1] ; SSE2-NEXT: paddd %xmm4, %xmm3 ; SSE2-NEXT: paddd %xmm4, %xmm0 @@ -1576,79 +1594,80 @@ define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) { define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) { ; SSE2-LABEL: avg_v32i16_2: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm4 -; SSE2-NEXT: movdqa 16(%rdi), %xmm11 -; SSE2-NEXT: movdqa 32(%rdi), %xmm10 +; SSE2-NEXT: movdqa (%rdi), %xmm10 +; SSE2-NEXT: movdqa 16(%rdi), %xmm9 +; SSE2-NEXT: movdqa 32(%rdi), %xmm11 ; SSE2-NEXT: movdqa 48(%rdi), %xmm8 -; SSE2-NEXT: movdqa (%rsi), %xmm9 +; SSE2-NEXT: movdqa (%rsi), %xmm14 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1 ; SSE2-NEXT: movdqa 32(%rsi), %xmm2 ; SSE2-NEXT: movdqa 48(%rsi), %xmm3 ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm4, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm11, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm10, %xmm12 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm10, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm9, %xmm12 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm11, %xmm15 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] ; SSE2-NEXT: movdqa %xmm8, %xmm13 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm9, %xmm7 +; SSE2-NEXT: movdqa %xmm14, %xmm7 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; SSE2-NEXT: paddd %xmm6, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = 
xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; SSE2-NEXT: paddd %xmm4, %xmm9 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] ; SSE2-NEXT: movdqa %xmm1, %xmm6 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; SSE2-NEXT: paddd %xmm5, %xmm6 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: paddd %xmm11, %xmm1 ; SSE2-NEXT: movdqa %xmm2, %xmm5 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; SSE2-NEXT: paddd %xmm12, %xmm5 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: paddd %xmm10, %xmm2 ; SSE2-NEXT: movdqa %xmm3, %xmm4 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; SSE2-NEXT: paddd %xmm13, %xmm4 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; SSE2-NEXT: paddd %xmm8, %xmm3 +; SSE2-NEXT: paddd %xmm13, %xmm4 +; SSE2-NEXT: paddd %xmm11, %xmm2 +; SSE2-NEXT: paddd %xmm15, %xmm5 +; SSE2-NEXT: paddd %xmm9, %xmm1 +; SSE2-NEXT: paddd %xmm12, %xmm6 +; SSE2-NEXT: paddd %xmm10, %xmm14 +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1] ; SSE2-NEXT: paddd %xmm0, %xmm7 -; SSE2-NEXT: paddd %xmm0, %xmm9 +; SSE2-NEXT: paddd %xmm0, %xmm14 ; SSE2-NEXT: paddd %xmm0, %xmm6 ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: paddd %xmm0, %xmm5 ; SSE2-NEXT: paddd %xmm0, %xmm2 ; SSE2-NEXT: paddd %xmm0, %xmm4 ; SSE2-NEXT: paddd %xmm0, %xmm3 -; SSE2-NEXT: psrld $1, %xmm3 -; SSE2-NEXT: psrld $1, %xmm4 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: psrld $1, %xmm9 +; SSE2-NEXT: psrld $1, %xmm14 ; SSE2-NEXT: psrld $1, %xmm7 ; SSE2-NEXT: pslld $16, %xmm7 ; SSE2-NEXT: psrad $16, %xmm7 -; SSE2-NEXT: pslld $16, %xmm9 -; SSE2-NEXT: psrad $16, %xmm9 -; SSE2-NEXT: packssdw %xmm7, %xmm9 +; SSE2-NEXT: pslld $16, %xmm14 +; SSE2-NEXT: psrad $16, %xmm14 +; SSE2-NEXT: packssdw %xmm7, %xmm14 +; SSE2-NEXT: psrld $1, %xmm1 +; SSE2-NEXT: psrld $1, %xmm6 ; SSE2-NEXT: pslld $16, %xmm6 ; SSE2-NEXT: psrad $16, %xmm6 ; SSE2-NEXT: pslld $16, %xmm1 ; SSE2-NEXT: psrad $16, %xmm1 ; SSE2-NEXT: packssdw %xmm6, %xmm1 +; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: psrld $1, %xmm5 ; SSE2-NEXT: pslld $16, %xmm5 ; SSE2-NEXT: psrad $16, %xmm5 ; SSE2-NEXT: pslld $16, %xmm2 ; SSE2-NEXT: psrad $16, %xmm2 ; SSE2-NEXT: packssdw %xmm5, %xmm2 +; SSE2-NEXT: psrld $1, %xmm3 +; SSE2-NEXT: psrld $1, %xmm4 ; SSE2-NEXT: pslld $16, %xmm4 ; SSE2-NEXT: psrad $16, %xmm4 ; SSE2-NEXT: pslld $16, %xmm3 @@ -1657,7 +1676,7 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) { ; SSE2-NEXT: movdqu %xmm3, (%rax) ; SSE2-NEXT: movdqu %xmm2, (%rax) ; SSE2-NEXT: movdqu %xmm1, (%rax) -; SSE2-NEXT: movdqu %xmm9, (%rax) +; SSE2-NEXT: movdqu %xmm14, (%rax) ; SSE2-NEXT: retq ; ; AVX2-LABEL: avg_v32i16_2: @@ -1667,13 +1686,13 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) { ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = 
mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpaddd %ymm7, %ymm3, %ymm3 +; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2 +; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpaddd %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm4 ; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1 @@ -1704,9 +1723,9 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) { ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpaddd %zmm3, %zmm1, %zmm1 ; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm2 ; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1 diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll index 3cadbe2a8db3..ff5a2371a145 100644 --- a/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll @@ -2244,11 +2244,11 @@ define <4 x double> @test_mm256_set_pd(double %a0, double %a1, double %a2, doubl ; X32: # BB#0: ; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; X32-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; X32-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; X32-NEXT: vunpcklpd {{.*#+}} xmm1 = 
xmm1[0],xmm2[0] -; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X32-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; X32-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; X32-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X32-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_set_pd: @@ -2269,19 +2269,19 @@ define <8 x float> @test_mm256_set_ps(float %a0, float %a1, float %a2, float %a3 ; X32: # BB#0: ; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X32-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; X32-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero +; X32-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero +; X32-NEXT: vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero +; X32-NEXT: vmovss {{.*#+}} xmm7 = mem[0],zero,zero,zero +; X32-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3] +; X32-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3] +; X32-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0] ; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; X32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; X32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] -; X32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] -; X32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; X32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] -; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] +; X32-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_set_ps: @@ -2881,10 +2881,10 @@ define <4 x double> @test_mm256_setr_pd(double %a0, double %a1, double %a2, doub ; X32: # BB#0: ; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; X32-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; X32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; X32-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; X32-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; X32-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; X32-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; X32-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm2[0] ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X32-NEXT: retl ; @@ -2908,16 +2908,16 @@ define <8 x float> @test_mm256_setr_ps(float %a0, float %a1, float %a2, float %a ; X32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; X32-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; X32-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero +; X32-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero +; X32-NEXT: vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero +; X32-NEXT: vmovss {{.*#+}} xmm7 = mem[0],zero,zero,zero ; X32-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] ; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] ; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; X32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; 
X32-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; X32-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero -; X32-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] -; X32-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0],xmm3[3] -; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] +; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[0],xmm6[0],xmm7[2,3] +; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm5[0],xmm1[3] +; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0] ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X32-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/avx.ll b/llvm/test/CodeGen/X86/avx.ll index 647b7a8f4dfc..341dd867e4ff 100644 --- a/llvm/test/CodeGen/X86/avx.ll +++ b/llvm/test/CodeGen/X86/avx.ll @@ -113,9 +113,9 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl ; CHECK-NOT: mov ; CHECK: insertps $48 ; CHECK: insertps $48 +; CHECK: insertps $48 +; CHECK: insertps $48 ; CHECK: vaddps -; CHECK: insertps $48 -; CHECK: insertps $48 ; CHECK: vaddps ; CHECK: vaddps ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/X86/avx512-cmp-kor-sequence.ll b/llvm/test/CodeGen/X86/avx512-cmp-kor-sequence.ll index e29cf09718ad..63b0281a7339 100644 --- a/llvm/test/CodeGen/X86/avx512-cmp-kor-sequence.ll +++ b/llvm/test/CodeGen/X86/avx512-cmp-kor-sequence.ll @@ -13,10 +13,10 @@ define zeroext i16 @cmp_kor_seq_16(<16 x float> %a, <16 x float> %b, <16 x float ; CHECK: # BB#0: # %entry ; CHECK-NEXT: vcmpgeps %zmm4, %zmm0, %k0 ; CHECK-NEXT: vcmpgeps %zmm4, %zmm1, %k1 +; CHECK-NEXT: vcmpgeps %zmm4, %zmm2, %k2 +; CHECK-NEXT: vcmpgeps %zmm4, %zmm3, %k3 ; CHECK-NEXT: korw %k1, %k0, %k0 -; CHECK-NEXT: vcmpgeps %zmm4, %zmm2, %k1 -; CHECK-NEXT: vcmpgeps %zmm4, %zmm3, %k2 -; CHECK-NEXT: korw %k2, %k1, %k1 +; CHECK-NEXT: korw %k3, %k2, %k1 ; CHECK-NEXT: korw %k1, %k0, %k0 ; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: # kill: %AX %AX %EAX diff --git a/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin.ll b/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin.ll index c03623a2f035..4890afec2164 100644 --- a/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin.ll +++ b/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin.ll @@ -852,16 +852,16 @@ define <16 x float> @gather_mask_test(<16 x i32> %ind, <16 x float> %src, i8* %b ; CHECK-NEXT: kxorw %k0, %k0, %k1 ; CHECK-NEXT: vmovaps %zmm1, %zmm3 ; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1} -; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm2 ; CHECK-NEXT: movw $1, %ax ; CHECK-NEXT: kmovd %eax, %k1 -; CHECK-NEXT: vmovaps %zmm1, %zmm3 -; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm4 +; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm4 {%k1} ; CHECK-NEXT: movw $220, %ax ; CHECK-NEXT: kmovd %eax, %k1 ; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} -; CHECK-NEXT: vaddps %zmm3, %zmm1, %zmm0 -; CHECK-NEXT: vaddps %zmm2, %zmm0, %zmm0 +; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm0 +; CHECK-NEXT: vaddps %zmm4, %zmm1, %zmm1 +; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 -1, i32 4) %res1 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 0, i32 4) diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll index 56962ca2671d..0e7a8d25c56f 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -9,8 +9,8 @@ 
define <16 x float> @test_x86_vbroadcast_ss_ps_512(<4 x float> %a0, <16 x float> ; CHECK-NEXT: vbroadcastss %xmm0, %zmm2 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vbroadcastss %xmm0, %zmm1 {%k1} -; CHECK-NEXT: vaddps %zmm1, %zmm2, %zmm1 ; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: vaddps %zmm1, %zmm2, %zmm1 ; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq @@ -30,8 +30,8 @@ define <8 x double> @test_x86_vbroadcast_sd_pd_512(<2 x double> %a0, <8 x double ; CHECK-NEXT: vbroadcastsd %xmm0, %zmm2 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vbroadcastsd %xmm0, %zmm1 {%k1} -; CHECK-NEXT: vaddpd %zmm1, %zmm2, %zmm1 ; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: vaddpd %zmm1, %zmm2, %zmm1 ; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq @@ -51,8 +51,8 @@ define <16 x i32>@test_int_x86_avx512_pbroadcastd_512(<4 x i32> %x0, <16 x i32> ; CHECK-NEXT: vpbroadcastd %xmm0, %zmm2 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpbroadcastd %xmm0, %zmm1 {%k1} -; CHECK-NEXT: vpaddd %zmm1, %zmm2, %zmm1 ; CHECK-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: vpaddd %zmm1, %zmm2, %zmm1 ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 -1) @@ -71,8 +71,8 @@ define <8 x i64>@test_int_x86_avx512_pbroadcastq_512(<2 x i64> %x0, <8 x i64> %x ; CHECK-NEXT: vpbroadcastq %xmm0, %zmm2 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpbroadcastq %xmm0, %zmm1 {%k1} -; CHECK-NEXT: vpaddq %zmm1, %zmm2, %zmm1 ; CHECK-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: vpaddq %zmm1, %zmm2, %zmm1 ; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1,i8 -1) @@ -91,8 +91,8 @@ define <16 x float>@test_int_x86_avx512_mask_movsldup_512(<16 x float> %x0, <16 ; CHECK-NEXT: vmovsldup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovsldup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] -; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1 ; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] +; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1 ; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2) @@ -111,8 +111,8 @@ define <16 x float>@test_int_x86_avx512_mask_movshdup_512(<16 x float> %x0, <16 ; CHECK-NEXT: vmovshdup {{.*#+}} zmm2 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovshdup {{.*#+}} zmm1 {%k1} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] -; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1 ; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1 ; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2) @@ -131,8 +131,8 @@ define <8 x double>@test_int_x86_avx512_mask_movddup_512(<8 x double> %x0, <8 x ; CHECK-NEXT: vmovddup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6] ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovddup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6] -; CHECK-NEXT: vaddpd %zmm2, %zmm1, %zmm1 ; CHECK-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6] +; CHECK-NEXT: vaddpd %zmm2, %zmm1, %zmm1 ; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0 ; 
CHECK-NEXT: retq %res = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 %x2) @@ -671,9 +671,9 @@ define <8 x i64>@test_int_x86_avx512_mask_punpcklqd_q_512(<8 x i64> %x0, <8 x i6 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm2 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm1 +; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) %res1 = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) @@ -1616,9 +1616,9 @@ define <8 x double>@test_int_x86_avx512_mask_shuf_pd_512(<8 x double> %x0, <8 x ; CHECK-NEXT: vshufpd {{.*#+}} zmm3 = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6] ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vshufpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6] -; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm2 ; CHECK-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6] -; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm1 +; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 %x4) %res1 = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 -1) @@ -2031,8 +2031,8 @@ define <8 x i64>@test_int_x86_avx512_mask_psrl_qi_512(<8 x i64> %x0, i32 %x1, <8 ; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm2 ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1 ; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i32 4, <8 x i64> %x2, i8 %x3) @@ -2051,8 +2051,8 @@ define <16 x i32>@test_int_x86_avx512_mask_psrl_di_512(<16 x i32> %x0, i32 %x1, ; CHECK-NEXT: vpsrld $4, %zmm0, %zmm2 ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpsrld $4, %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1 ; CHECK-NEXT: vpsrld $4, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1 ; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i32 4, <16 x i32> %x2, i16 %x3) @@ -2651,8 +2651,8 @@ define <16 x float>@test_int_x86_avx512_mask_vpermilvar_ps_512_constant_pool(<16 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpermilps {{.*#+}} zmm2 {%k1} = zmm0[2,3,0,1,7,6,5,4,9,8,11,10,12,13,14,15] ; CHECK-NEXT: vpermilps {{.*#+}} zmm1 {%k1} {z} = zmm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] -; CHECK-NEXT: vaddps %zmm1, %zmm2, %zmm1 ; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12] +; CHECK-NEXT: vaddps %zmm1, %zmm2, %zmm1 ; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> , <16 x float> 
%x2, i16 %x3) @@ -2989,9 +2989,9 @@ define <16 x float>@test_int_x86_avx512_mask_insertf32x4_512(<16 x float> %x0, < ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm3 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm2 ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} -; CHECK-NEXT: vaddps %zmm2, %zmm0, %zmm0 +; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm1 +; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 %x4) %res1 = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 -1) @@ -3010,9 +3010,9 @@ define <16 x i32>@test_int_x86_avx512_mask_inserti32x4_512(<16 x i32> %x0, <4 x ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm3 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm2 ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} -; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0 +; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm1 +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 %x4) %res1 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 -1) @@ -3030,9 +3030,9 @@ define <8 x double>@test_int_x86_avx512_mask_insertf64x4_512(<8 x double> %x0, < ; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm3 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm2 ; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} -; CHECK-NEXT: vaddpd %zmm2, %zmm0, %zmm0 +; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm1 +; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 %x4) %res1 = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 -1) @@ -3050,9 +3050,9 @@ define <8 x i64>@test_int_x86_avx512_mask_inserti64x4_512(<8 x i64> %x0, <4 x i6 ; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm2 ; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} -; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm1 +; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 %x4) %res1 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1) diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll index f800d01064ba..cc5e9e038e0b 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll @@ -274,11 +274,11 @@ define <4 x float> @test_sqrt_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> % ; CHECK-NEXT: vmovaps %xmm2, %xmm3 ; CHECK-NEXT: vsqrtss %xmm1, %xmm0, %xmm3 {%k1} ; CHECK-NEXT: vsqrtss {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1} -; CHECK-NEXT: vaddps %xmm2, %xmm3, %xmm2 -; CHECK-NEXT: vsqrtss {ru-sae}, %xmm1, 
%xmm0, %xmm3 {%k1} {z} +; CHECK-NEXT: vsqrtss {ru-sae}, %xmm1, %xmm0, %xmm4 {%k1} {z} ; CHECK-NEXT: vsqrtss {rz-sae}, %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 -; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: vaddps %xmm2, %xmm3, %xmm1 +; CHECK-NEXT: vaddps %xmm0, %xmm4, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq %res0 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4) %res1 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 1) @@ -301,11 +301,11 @@ define <2 x double> @test_sqrt_sd(<2 x double> %a0, <2 x double> %a1, <2 x doubl ; CHECK-NEXT: vmovapd %xmm2, %xmm3 ; CHECK-NEXT: vsqrtsd %xmm1, %xmm0, %xmm3 {%k1} ; CHECK-NEXT: vsqrtsd {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1} -; CHECK-NEXT: vaddpd %xmm2, %xmm3, %xmm2 -; CHECK-NEXT: vsqrtsd {ru-sae}, %xmm1, %xmm0, %xmm3 {%k1} {z} +; CHECK-NEXT: vsqrtsd {ru-sae}, %xmm1, %xmm0, %xmm4 {%k1} {z} ; CHECK-NEXT: vsqrtsd {rz-sae}, %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 -; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: vaddpd %xmm2, %xmm3, %xmm1 +; CHECK-NEXT: vaddpd %xmm0, %xmm4, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq %res0 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4) %res1 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 1) @@ -477,11 +477,11 @@ declare i64 @llvm.x86.avx512.cvttss2usi64(<4 x float>, i32) nounwind readnone define i64 @test_x86_avx512_cvtsd2usi64(<2 x double> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtsd2usi64: ; CHECK: ## BB#0: -; CHECK-NEXT: vcvtsd2usi %xmm0, %rax -; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %rcx -; CHECK-NEXT: addq %rax, %rcx -; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %rax +; CHECK-NEXT: vcvtsd2usi %xmm0, %rcx +; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %rax +; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %rdx ; CHECK-NEXT: addq %rcx, %rax +; CHECK-NEXT: addq %rdx, %rax ; CHECK-NEXT: retq %res = call i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double> %a0, i32 4) @@ -496,11 +496,11 @@ declare i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double>, i32) nounwind readnone define i64 @test_x86_avx512_cvtsd2si64(<2 x double> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtsd2si64: ; CHECK: ## BB#0: -; CHECK-NEXT: vcvtsd2si %xmm0, %rax -; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %rcx -; CHECK-NEXT: addq %rax, %rcx -; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %rax +; CHECK-NEXT: vcvtsd2si %xmm0, %rcx +; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %rax +; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %rdx ; CHECK-NEXT: addq %rcx, %rax +; CHECK-NEXT: addq %rdx, %rax ; CHECK-NEXT: retq %res = call i64 @llvm.x86.avx512.vcvtsd2si64(<2 x double> %a0, i32 4) @@ -515,11 +515,11 @@ declare i64 @llvm.x86.avx512.vcvtsd2si64(<2 x double>, i32) nounwind readnone define i64 @test_x86_avx512_cvtss2usi64(<4 x float> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtss2usi64: ; CHECK: ## BB#0: -; CHECK-NEXT: vcvtss2usi %xmm0, %rax -; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %rcx -; CHECK-NEXT: addq %rax, %rcx -; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %rax +; CHECK-NEXT: vcvtss2usi %xmm0, %rcx +; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %rax +; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %rdx ; CHECK-NEXT: addq %rcx, %rax +; CHECK-NEXT: addq %rdx, %rax ; CHECK-NEXT: retq %res = call i64 @llvm.x86.avx512.vcvtss2usi64(<4 x float> %a0, 
i32 4) @@ -534,11 +534,11 @@ declare i64 @llvm.x86.avx512.vcvtss2usi64(<4 x float>, i32) nounwind readnone define i64 @test_x86_avx512_cvtss2si64(<4 x float> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtss2si64: ; CHECK: ## BB#0: -; CHECK-NEXT: vcvtss2si %xmm0, %rax -; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %rcx -; CHECK-NEXT: addq %rax, %rcx -; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %rax +; CHECK-NEXT: vcvtss2si %xmm0, %rcx +; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %rax +; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %rdx ; CHECK-NEXT: addq %rcx, %rax +; CHECK-NEXT: addq %rdx, %rax ; CHECK-NEXT: retq %res = call i64 @llvm.x86.avx512.vcvtss2si64(<4 x float> %a0, i32 4) @@ -553,11 +553,11 @@ declare i64 @llvm.x86.avx512.vcvtss2si64(<4 x float>, i32) nounwind readnone define i32 @test_x86_avx512_cvtsd2usi32(<2 x double> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtsd2usi32: ; CHECK: ## BB#0: -; CHECK-NEXT: vcvtsd2usi %xmm0, %eax -; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %ecx -; CHECK-NEXT: addl %eax, %ecx -; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %eax +; CHECK-NEXT: vcvtsd2usi %xmm0, %ecx +; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %eax +; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %edx ; CHECK-NEXT: addl %ecx, %eax +; CHECK-NEXT: addl %edx, %eax ; CHECK-NEXT: retq %res = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %a0, i32 4) @@ -572,11 +572,11 @@ declare i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double>, i32) nounwind readnone define i32 @test_x86_avx512_cvtsd2si32(<2 x double> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtsd2si32: ; CHECK: ## BB#0: -; CHECK-NEXT: vcvtsd2si %xmm0, %eax -; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %ecx -; CHECK-NEXT: addl %eax, %ecx -; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %eax +; CHECK-NEXT: vcvtsd2si %xmm0, %ecx +; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %eax +; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %edx ; CHECK-NEXT: addl %ecx, %eax +; CHECK-NEXT: addl %edx, %eax ; CHECK-NEXT: retq %res = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %a0, i32 4) @@ -591,11 +591,11 @@ declare i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double>, i32) nounwind readnone define i32 @test_x86_avx512_cvtss2usi32(<4 x float> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtss2usi32: ; CHECK: ## BB#0: -; CHECK-NEXT: vcvtss2usi %xmm0, %eax -; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %ecx -; CHECK-NEXT: addl %eax, %ecx -; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %eax +; CHECK-NEXT: vcvtss2usi %xmm0, %ecx +; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %eax +; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %edx ; CHECK-NEXT: addl %ecx, %eax +; CHECK-NEXT: addl %edx, %eax ; CHECK-NEXT: retq %res = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %a0, i32 4) @@ -610,11 +610,11 @@ declare i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float>, i32) nounwind readnone define i32 @test_x86_avx512_cvtss2si32(<4 x float> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtss2si32: ; CHECK: ## BB#0: -; CHECK-NEXT: vcvtss2si %xmm0, %eax -; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %ecx -; CHECK-NEXT: addl %eax, %ecx -; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %eax +; CHECK-NEXT: vcvtss2si %xmm0, %ecx +; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %eax +; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %edx ; CHECK-NEXT: addl %ecx, %eax +; CHECK-NEXT: addl %edx, %eax ; CHECK-NEXT: retq %res = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %a0, i32 4) @@ -683,9 +683,8 @@ define <16 x i16> @test_x86_vcvtps2ph_256(<16 x float> %a0, <16 x i16> %src, i16 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vcvtps2ph $2, %zmm0, %ymm1 {%k1} ; CHECK-NEXT: vcvtps2ph $2, %zmm0, %ymm2 
{%k1} {z} -; CHECK-NEXT: vpaddw %ymm1, %ymm2, %ymm1 ; CHECK-NEXT: vcvtps2ph $2, %zmm0, (%rsi) -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: vpaddw %ymm1, %ymm2, %ymm0 ; CHECK-NEXT: retq %res1 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 -1) %res2 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 %mask) @@ -3657,11 +3656,11 @@ define <4 x float> @test_getexp_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> ; CHECK-NEXT: vmovaps %xmm2, %xmm3 ; CHECK-NEXT: vgetexpss %xmm1, %xmm0, %xmm3 {%k1} ; CHECK-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm2 {%k1} -; CHECK-NEXT: vaddps %xmm2, %xmm3, %xmm2 -; CHECK-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm3 {%k1} {z} +; CHECK-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm4 {%k1} {z} ; CHECK-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 -; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: vaddps %xmm2, %xmm3, %xmm1 +; CHECK-NEXT: vaddps %xmm0, %xmm4, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq %res0 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4) %res1 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 8) @@ -3685,10 +3684,10 @@ define <2 x double> @test_getexp_sd(<2 x double> %a0, <2 x double> %a1, <2 x dou ; CHECK-NEXT: vgetexpsd %xmm1, %xmm0, %xmm3 {%k1} ; CHECK-NEXT: vgetexpsd %xmm1, %xmm0, %xmm4 ; CHECK-NEXT: vgetexpsd {sae}, %xmm1, %xmm0, %xmm2 {%k1} -; CHECK-NEXT: vaddpd %xmm2, %xmm3, %xmm2 ; CHECK-NEXT: vgetexpsd {sae}, %xmm1, %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: vaddpd %xmm2, %xmm3, %xmm1 ; CHECK-NEXT: vaddpd %xmm4, %xmm0, %xmm0 -; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq %res0 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4) %res1 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 8) @@ -3904,11 +3903,11 @@ define <2 x double>@test_int_x86_avx512_mask_getmant_sd(<2 x double> %x0, <2 x d ; CHECK-NEXT: vmovapd %xmm2, %xmm3 ; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm3 {%k1} ; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm4 {%k1} {z} -; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm3 -; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm4 +; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm5 ; CHECK-NEXT: vgetmantsd $11, {sae}, %xmm1, %xmm0, %xmm2 {%k1} -; CHECK-NEXT: vaddpd %xmm4, %xmm2, %xmm0 -; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm0 +; CHECK-NEXT: vaddpd %xmm5, %xmm2, %xmm1 +; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %res = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> %x2, i8 %x3, i32 4) %res1 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> zeroinitializer, i8 %x3, i32 4) @@ -3929,11 +3928,11 @@ define <4 x float>@test_int_x86_avx512_mask_getmant_ss(<4 x float> %x0, <4 x flo ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm3 {%k1} {z} -; CHECK-NEXT: vaddps %xmm3, %xmm2, %xmm2 -; CHECK-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm3 +; CHECK-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm4 ; 
CHECK-NEXT: vgetmantss $11, {sae}, %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vaddps %xmm3, %xmm0, %xmm0 -; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: vaddps %xmm3, %xmm2, %xmm1 +; CHECK-NEXT: vaddps %xmm4, %xmm0, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> %x2, i8 %x3, i32 4) %res1 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> zeroinitializer, i8 %x3, i32 4) @@ -4435,8 +4434,8 @@ define <16 x i32>@test_int_x86_avx512_mask_prol_d_512(<16 x i32> %x0, i32 %x1, < ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vprold $3, %zmm0, %zmm1 {%k1} ; CHECK-NEXT: vprold $3, %zmm0, %zmm2 {%k1} {z} -; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1 ; CHECK-NEXT: vprold $3, %zmm0, %zmm0 +; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1 ; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3) @@ -4455,8 +4454,8 @@ define <8 x i64>@test_int_x86_avx512_mask_prol_q_512(<8 x i64> %x0, i32 %x1, <8 ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vprolq $3, %zmm0, %zmm1 {%k1} ; CHECK-NEXT: vprolq $3, %zmm0, %zmm2 {%k1} {z} -; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1 ; CHECK-NEXT: vprolq $3, %zmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3) @@ -4557,9 +4556,9 @@ define <8 x double>@test_int_x86_avx512_mask_fixupimm_pd_512(<8 x double> %x0, < ; CHECK-NEXT: vfixupimmpd $4, %zmm2, %zmm1, %zmm3 {%k1} ; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4 ; CHECK-NEXT: vfixupimmpd $5, %zmm2, %zmm1, %zmm4 {%k1} {z} -; CHECK-NEXT: vaddpd %zmm4, %zmm3, %zmm3 ; CHECK-NEXT: vfixupimmpd $3, {sae}, %zmm2, %zmm1, %zmm0 -; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0 +; CHECK-NEXT: vaddpd %zmm4, %zmm3, %zmm1 +; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 4, i8 %x4, i32 4) %res1 = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> zeroinitializer, <8 x double> %x1, <8 x i64> %x2, i32 5, i8 %x4, i32 4) @@ -4580,9 +4579,9 @@ define <8 x double>@test_int_x86_avx512_maskz_fixupimm_pd_512(<8 x double> %x0, ; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4 ; CHECK-NEXT: vmovapd %zmm0, %zmm5 ; CHECK-NEXT: vfixupimmpd $5, %zmm4, %zmm1, %zmm5 {%k1} {z} -; CHECK-NEXT: vaddpd %zmm5, %zmm3, %zmm3 ; CHECK-NEXT: vfixupimmpd $2, {sae}, %zmm2, %zmm1, %zmm0 -; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0 +; CHECK-NEXT: vaddpd %zmm5, %zmm3, %zmm1 +; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 3, i8 %x4, i32 4) %res1 = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> zeroinitializer, i32 5, i8 %x4, i32 4) @@ -4604,9 +4603,9 @@ define <4 x float>@test_int_x86_avx512_mask_fixupimm_ss(<4 x float> %x0, <4 x fl ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vmovaps %xmm0, %xmm5 ; CHECK-NEXT: vfixupimmss $5, %xmm4, %xmm1, %xmm5 {%k1} -; CHECK-NEXT: vaddps %xmm5, %xmm3, %xmm3 ; CHECK-NEXT: vfixupimmss $5, {sae}, %xmm2, %xmm1, %xmm0 -; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: vaddps %xmm5, %xmm3, %xmm1 +; 
CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 %x4, i32 4) %res1 = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> zeroinitializer, i32 5, i8 %x4, i32 4) @@ -4651,9 +4650,9 @@ define <16 x float>@test_int_x86_avx512_mask_fixupimm_ps_512(<16 x float> %x0, < ; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4 ; CHECK-NEXT: vmovaps %zmm0, %zmm5 ; CHECK-NEXT: vfixupimmps $5, %zmm4, %zmm1, %zmm5 {%k1} -; CHECK-NEXT: vaddps %zmm5, %zmm3, %zmm3 ; CHECK-NEXT: vfixupimmps $5, {sae}, %zmm2, %zmm1, %zmm0 -; CHECK-NEXT: vaddps %zmm0, %zmm3, %zmm0 +; CHECK-NEXT: vaddps %zmm5, %zmm3, %zmm1 +; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 %x4, i32 4) %res1 = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> zeroinitializer, i32 5, i16 %x4, i32 4) @@ -4722,9 +4721,9 @@ define <2 x double>@test_int_x86_avx512_maskz_fixupimm_sd(<2 x double> %x0, <2 x ; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vmovapd %xmm0, %xmm5 ; CHECK-NEXT: vfixupimmsd $5, {sae}, %xmm4, %xmm1, %xmm5 {%k1} {z} -; CHECK-NEXT: vaddpd %xmm5, %xmm3, %xmm3 ; CHECK-NEXT: vfixupimmsd $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} -; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: vaddpd %xmm5, %xmm3, %xmm1 +; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq %res = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 %x4, i32 4) %res1 = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> zeroinitializer, i32 5, i8 %x4, i32 8) @@ -4822,12 +4821,12 @@ define <2 x double>@test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %x0, <2 x do ; CHECK-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm3 {%k1} ; CHECK-NEXT: vmovapd %xmm0, %xmm4 ; CHECK-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm4 -; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm3 -; CHECK-NEXT: vmovapd %xmm0, %xmm4 -; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm4 {%k1} +; CHECK-NEXT: vmovapd %xmm0, %xmm5 +; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm5 {%k1} ; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm0 -; CHECK-NEXT: vaddpd %xmm4, %xmm0, %xmm0 -; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm1 +; CHECK-NEXT: vaddpd %xmm5, %xmm0, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4) %res1 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4) @@ -4850,12 +4849,12 @@ define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss(<4 x float> %x0, <4 x floa ; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm3 {%k1} ; CHECK-NEXT: vmovaps %xmm0, %xmm4 ; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm4 -; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm3 -; CHECK-NEXT: vmovaps %xmm0, %xmm4 -; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm4 {%k1} +; CHECK-NEXT: vmovaps %xmm0, %xmm5 +; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm5 {%k1} ; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm0 -; CHECK-NEXT: vaddps %xmm4, %xmm0, %xmm0 -; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm1 +; 
CHECK-NEXT: vaddps %xmm5, %xmm0, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4) %res1 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4) @@ -4910,12 +4909,12 @@ define <2 x double>@test_int_x86_avx512_mask3_vfmadd_sd(<2 x double> %x0, <2 x d ; CHECK-NEXT: vfmadd231sd %xmm1, %xmm0, %xmm3 {%k1} ; CHECK-NEXT: vmovapd %xmm2, %xmm4 ; CHECK-NEXT: vfmadd231sd %xmm1, %xmm0, %xmm4 -; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm3 -; CHECK-NEXT: vmovapd %xmm2, %xmm4 -; CHECK-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm4 {%k1} +; CHECK-NEXT: vmovapd %xmm2, %xmm5 +; CHECK-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1} ; CHECK-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm2 -; CHECK-NEXT: vaddpd %xmm4, %xmm2, %xmm0 -; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm0 +; CHECK-NEXT: vaddpd %xmm5, %xmm2, %xmm1 +; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %res = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4) %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4) @@ -4938,12 +4937,12 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss(<4 x float> %x0, <4 x flo ; CHECK-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm3 {%k1} ; CHECK-NEXT: vmovaps %xmm2, %xmm4 ; CHECK-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm4 -; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm3 -; CHECK-NEXT: vmovaps %xmm2, %xmm4 -; CHECK-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm4 {%k1} +; CHECK-NEXT: vmovaps %xmm2, %xmm5 +; CHECK-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1} ; CHECK-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm2 -; CHECK-NEXT: vaddps %xmm4, %xmm2, %xmm0 -; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm0 +; CHECK-NEXT: vaddps %xmm5, %xmm2, %xmm1 +; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4) %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4) @@ -5070,12 +5069,12 @@ define <2 x double>@test_int_x86_avx512_mask3_vfmsub_sd(<2 x double> %x0, <2 x d ; CHECK-NEXT: vfmsub231sd %xmm1, %xmm0, %xmm3 {%k1} ; CHECK-NEXT: vmovapd %xmm2, %xmm4 ; CHECK-NEXT: vfmsub231sd %xmm1, %xmm0, %xmm4 -; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm3 -; CHECK-NEXT: vmovapd %xmm2, %xmm4 -; CHECK-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm4 {%k1} +; CHECK-NEXT: vmovapd %xmm2, %xmm5 +; CHECK-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1} ; CHECK-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2 -; CHECK-NEXT: vaddpd %xmm4, %xmm2, %xmm0 -; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm0 +; CHECK-NEXT: vaddpd %xmm5, %xmm2, %xmm1 +; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %res = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4) %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4) @@ -5098,12 +5097,12 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmsub_ss(<4 x float> %x0, <4 x flo ; CHECK-NEXT: vfmsub231ss %xmm1, %xmm0, %xmm3 {%k1} ; 
CHECK-NEXT: vmovaps %xmm2, %xmm4 ; CHECK-NEXT: vfmsub231ss %xmm1, %xmm0, %xmm4 -; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm3 -; CHECK-NEXT: vmovaps %xmm2, %xmm4 -; CHECK-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm4 {%k1} +; CHECK-NEXT: vmovaps %xmm2, %xmm5 +; CHECK-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1} ; CHECK-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2 -; CHECK-NEXT: vaddps %xmm4, %xmm2, %xmm0 -; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm0 +; CHECK-NEXT: vaddps %xmm5, %xmm2, %xmm1 +; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4) %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4) @@ -5126,12 +5125,12 @@ define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_sd(<2 x double> %x0, <2 x ; CHECK-NEXT: vfnmsub231sd %xmm1, %xmm0, %xmm3 {%k1} ; CHECK-NEXT: vmovapd %xmm2, %xmm4 ; CHECK-NEXT: vfnmsub231sd %xmm1, %xmm0, %xmm4 -; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm3 -; CHECK-NEXT: vmovapd %xmm2, %xmm4 -; CHECK-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm4 {%k1} +; CHECK-NEXT: vmovapd %xmm2, %xmm5 +; CHECK-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1} ; CHECK-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2 -; CHECK-NEXT: vaddpd %xmm4, %xmm2, %xmm0 -; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm0 +; CHECK-NEXT: vaddpd %xmm5, %xmm2, %xmm1 +; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %res = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4) %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4) @@ -5154,12 +5153,12 @@ define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ss(<4 x float> %x0, <4 x fl ; CHECK-NEXT: vfnmsub231ss %xmm1, %xmm0, %xmm3 {%k1} ; CHECK-NEXT: vmovaps %xmm2, %xmm4 ; CHECK-NEXT: vfnmsub231ss %xmm1, %xmm0, %xmm4 -; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm3 -; CHECK-NEXT: vmovaps %xmm2, %xmm4 -; CHECK-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm4 {%k1} +; CHECK-NEXT: vmovaps %xmm2, %xmm5 +; CHECK-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1} ; CHECK-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2 -; CHECK-NEXT: vaddps %xmm4, %xmm2, %xmm0 -; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm0 +; CHECK-NEXT: vaddps %xmm5, %xmm2, %xmm1 +; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4) %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4) diff --git a/llvm/test/CodeGen/X86/avx512-mask-spills.ll b/llvm/test/CodeGen/X86/avx512-mask-spills.ll index 96aefdb10584..4ef88ac495c3 100644 --- a/llvm/test/CodeGen/X86/avx512-mask-spills.ll +++ b/llvm/test/CodeGen/X86/avx512-mask-spills.ll @@ -9,11 +9,13 @@ define <4 x i1> @test_4i1(<4 x i32> %a, <4 x i32> %b) { ; CHECK-NEXT: Lcfi0: ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: vpcmpnleud %xmm1, %xmm0, %k0 -; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 -; CHECK-NEXT: korw %k1, %k0, %k0 +; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill +; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 ; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill ; 
CHECK-NEXT: callq _f ; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload +; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload +; CHECK-NEXT: korw %k1, %k0, %k0 ; CHECK-NEXT: vpmovm2d %k0, %xmm0 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq @@ -32,12 +34,14 @@ define <8 x i1> @test_8i1(<8 x i32> %a, <8 x i32> %b) { ; CHECK-NEXT: Lcfi1: ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: vpcmpnleud %ymm1, %ymm0, %k0 -; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k1 -; CHECK-NEXT: korb %k1, %k0, %k0 +; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill +; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 ; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq _f ; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload +; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload +; CHECK-NEXT: korb %k1, %k0, %k0 ; CHECK-NEXT: vpmovm2w %k0, %xmm0 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq @@ -56,12 +60,14 @@ define <16 x i1> @test_16i1(<16 x i32> %a, <16 x i32> %b) { ; CHECK-NEXT: Lcfi2: ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 -; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 -; CHECK-NEXT: korw %k1, %k0, %k0 +; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill +; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 ; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq _f ; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload +; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload +; CHECK-NEXT: korw %k1, %k0, %k0 ; CHECK-NEXT: vpmovm2b %k0, %xmm0 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq @@ -79,12 +85,14 @@ define <32 x i1> @test_32i1(<32 x i16> %a, <32 x i16> %b) { ; CHECK-NEXT: Lcfi3: ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 -; CHECK-NEXT: vpcmpgtw %zmm1, %zmm0, %k1 -; CHECK-NEXT: kord %k1, %k0, %k0 ; CHECK-NEXT: kmovd %k0, {{[0-9]+}}(%rsp) ## 4-byte Spill +; CHECK-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovd %k0, (%rsp) ## 4-byte Spill ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq _f ; CHECK-NEXT: kmovd {{[0-9]+}}(%rsp), %k0 ## 4-byte Reload +; CHECK-NEXT: kmovd (%rsp), %k1 ## 4-byte Reload +; CHECK-NEXT: kord %k1, %k0, %k0 ; CHECK-NEXT: vpmovm2b %k0, %ymm0 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq @@ -98,18 +106,20 @@ define <32 x i1> @test_32i1(<32 x i16> %a, <32 x i16> %b) { define <64 x i1> @test_64i1(<64 x i8> %a, <64 x i8> %b) { ; CHECK-LABEL: test_64i1: ; CHECK: ## BB#0: -; CHECK-NEXT: pushq %rax +; CHECK-NEXT: subq $24, %rsp ; CHECK-NEXT: Lcfi4: -; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: vpcmpnleub %zmm1, %zmm0, %k0 -; CHECK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1 -; CHECK-NEXT: korq %k1, %k0, %k0 -; CHECK-NEXT: kmovq %k0, (%rsp) ## 8-byte Spill +; CHECK-NEXT: kmovq %k0, {{[0-9]+}}(%rsp) ## 8-byte Spill +; CHECK-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovq %k0, {{[0-9]+}}(%rsp) ## 8-byte Spill ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq _f -; CHECK-NEXT: kmovq (%rsp), %k0 ## 8-byte Reload +; CHECK-NEXT: kmovq {{[0-9]+}}(%rsp), %k0 ## 8-byte Reload +; CHECK-NEXT: kmovq {{[0-9]+}}(%rsp), %k1 ## 8-byte Reload +; CHECK-NEXT: korq %k1, %k0, %k0 ; CHECK-NEXT: vpmovm2b %k0, %zmm0 -; CHECK-NEXT: popq %rax +; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: retq %cmp_res = icmp ugt <64 x i8> %a, %b diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll index faa055dfbbf3..9b4e73a18fc2 100644 --- 
a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll @@ -796,9 +796,9 @@ define <32 x i16>@test_int_x86_avx512_mask_psrl_w_512(<32 x i16> %x0, <8 x i16> ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm2 ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1 +; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_int_x86_avx512_mask_psrl_w_512: @@ -806,9 +806,9 @@ define <32 x i16>@test_int_x86_avx512_mask_psrl_w_512(<32 x i16> %x0, <8 x i16> ; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm3 ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm2 {%k1} -; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm2 ; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 {%k1} {z} -; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0 +; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm1 +; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.psrl.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3) %res1 = call <32 x i16> @llvm.x86.avx512.mask.psrl.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 -1) @@ -826,8 +826,8 @@ define <32 x i16>@test_int_x86_avx512_mask_psrl_wi_512(<32 x i16> %x0, i32 %x1, ; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm2 ; AVX512BW-NEXT: kmovd %esi, %k1 ; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vpaddw %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpaddw %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: retq ; @@ -836,8 +836,8 @@ define <32 x i16>@test_int_x86_avx512_mask_psrl_wi_512(<32 x i16> %x0, i32 %x1, ; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm2 ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm1 {%k1} -; AVX512F-32-NEXT: vpaddw %zmm2, %zmm1, %zmm1 ; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: vpaddw %zmm2, %zmm1, %zmm1 ; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.psrl.wi.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3) diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll index 13b850ccc3b6..3337f42eb142 100644 --- a/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll @@ -2159,9 +2159,9 @@ define <32 x i16>@test_int_x86_avx512_mask_dbpsadbw_512(<64 x i8> %x0, <64 x i8> ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm2 {%k1} ; AVX512BW-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm3 {%k1} {z} -; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm2 ; AVX512BW-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1 +; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_int_x86_avx512_mask_dbpsadbw_512: @@ -2169,9 +2169,9 @@ define <32 x i16>@test_int_x86_avx512_mask_dbpsadbw_512(<64 x i8> %x0, <64 x i8> ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm2 {%k1} ; AVX512F-32-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm3 {%k1} {z} -; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm2 ; AVX512F-32-NEXT: vdbpsadbw $2, 
%zmm1, %zmm0, %zmm0 -; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0 +; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm1 +; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <32 x i16> %x3, i32 %x4) %res1 = call <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <32 x i16> zeroinitializer, i32 %x4) @@ -2411,9 +2411,9 @@ define <32 x i16>@test_int_x86_avx512_mask_permvar_hi_512(<32 x i16> %x0, <32 x ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm2 {%k1} ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm3 {%k1} {z} -; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm2 ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1 +; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_int_x86_avx512_mask_permvar_hi_512: @@ -2421,9 +2421,9 @@ define <32 x i16>@test_int_x86_avx512_mask_permvar_hi_512(<32 x i16> %x0, <32 x ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vpermw %zmm0, %zmm1, %zmm2 {%k1} ; AVX512F-32-NEXT: vpermw %zmm0, %zmm1, %zmm3 {%k1} {z} -; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm2 ; AVX512F-32-NEXT: vpermw %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0 +; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm1 +; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) %res1 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3) diff --git a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll index 571f345d4616..7df07b0413ed 100644 --- a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll @@ -9,8 +9,8 @@ define <32 x i8>@test_int_x86_avx512_pbroadcastb_256(<16 x i8> %x0, <32 x i8> %x ; CHECK-NEXT: vpbroadcastb %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x78,0xd0] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpbroadcastb %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x78,0xc8] -; CHECK-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc9] ; CHECK-NEXT: vpbroadcastb %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x78,0xc0] +; CHECK-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc9] ; CHECK-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> %x1, i32 -1) @@ -29,8 +29,8 @@ define <16 x i8>@test_int_x86_avx512_pbroadcastb_128(<16 x i8> %x0, <16 x i8> %x ; CHECK-NEXT: vpbroadcastb %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x78,0xd0] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpbroadcastb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x78,0xc8] -; CHECK-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc9] ; CHECK-NEXT: vpbroadcastb %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x78,0xc0] +; CHECK-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: 
[0xc5,0xe9,0xfc,0xc9] ; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> %x1, i16 -1) @@ -49,8 +49,8 @@ define <16 x i16>@test_int_x86_avx512_pbroadcastw_256(<8 x i16> %x0, <16 x i16> ; CHECK-NEXT: vpbroadcastw %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0xd0] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpbroadcastw %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x79,0xc8] -; CHECK-NEXT: vpaddw %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc9] ; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x79,0xc0] +; CHECK-NEXT: vpaddw %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc9] ; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> %x1, i16 -1) @@ -69,8 +69,8 @@ define <8 x i16>@test_int_x86_avx512_pbroadcastw_128(<8 x i16> %x0, <8 x i16> %x ; CHECK-NEXT: vpbroadcastw %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x79,0xd0] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpbroadcastw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x79,0xc8] -; CHECK-NEXT: vpaddw %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc9] ; CHECK-NEXT: vpbroadcastw %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x79,0xc0] +; CHECK-NEXT: vpaddw %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc9] ; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> %x1, i8 -1) @@ -89,8 +89,8 @@ define <64 x i8>@test_int_x86_avx512_pbroadcastb_512(<16 x i8> %x0, <64 x i8> %x ; CHECK-NEXT: vpbroadcastb %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x78,0xd0] ; CHECK-NEXT: kmovq %rdi, %k1 ## encoding: [0xc4,0xe1,0xfb,0x92,0xcf] ; CHECK-NEXT: vpbroadcastb %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x78,0xc8] -; CHECK-NEXT: vpaddb %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc9] ; CHECK-NEXT: vpbroadcastb %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x78,0xc0] +; CHECK-NEXT: vpaddb %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc9] ; CHECK-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> %x1, i64 -1) @@ -109,8 +109,8 @@ define <32 x i16>@test_int_x86_avx512_pbroadcastw_512(<8 x i16> %x0, <32 x i16> ; CHECK-NEXT: vpbroadcastw %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x79,0xd0] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpbroadcastw %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x79,0xc8] -; CHECK-NEXT: vpaddw %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc9] ; CHECK-NEXT: vpbroadcastw %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x79,0xc0] +; CHECK-NEXT: vpaddw %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc9] ; CHECK-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ## encoding: 
[0x62,0xf1,0x7d,0x48,0xfd,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> %x1, i32 -1) @@ -1476,9 +1476,9 @@ define <8 x i16>@test_int_x86_avx512_mask_psrl_w_128(<8 x i16> %x0, <8 x i16> %x ; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd1,0xd9] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd1,0xd1] -; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xd3] ; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd1,0xc1] -; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2] +; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xcb] +; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.mask.psrl.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) %res1 = call <8 x i16> @llvm.x86.avx512.mask.psrl.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) @@ -1496,9 +1496,9 @@ define <16 x i16>@test_int_x86_avx512_mask_psrl_w_256(<16 x i16> %x0, <8 x i16> ; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd1,0xd9] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd1,0xd1] -; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xd3] ; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd1,0xc1] -; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0] +; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xcb] +; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.mask.psrl.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 %x3) %res1 = call <16 x i16> @llvm.x86.avx512.mask.psrl.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 -1) @@ -1596,8 +1596,8 @@ define <8 x i16>@test_int_x86_avx512_mask_psrl_wi_128(<8 x i16> %x0, i32 %x1, <8 ; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x71,0xd0,0x03] ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x71,0xd0,0x03] -; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xca] ; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x71,0xd0,0x03] +; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xca] ; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.mask.psrl.wi.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3) @@ -1616,8 +1616,8 @@ define <16 x i16>@test_int_x86_avx512_mask_psrl_wi_256(<16 x i16> %x0, i32 %x1, ; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x71,0xd0,0x03] ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: 
[0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x71,0xd0,0x03] -; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xca] ; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x71,0xd0,0x03] +; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xca] ; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.mask.psrl.wi.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3) diff --git a/llvm/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll index f8f47c87100a..8f528394f5bd 100644 --- a/llvm/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll @@ -9,8 +9,8 @@ define <4 x i32>@test_int_x86_avx512_mask_vplzcnt_d_128(<4 x i32> %x0, <4 x i32> ; CHECK-NEXT: vplzcntd %xmm0, %xmm2 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vplzcntd %xmm0, %xmm1 {%k1} -; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; CHECK-NEXT: vplzcntd %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq %res = call <4 x i32> @llvm.x86.avx512.mask.lzcnt.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) diff --git a/llvm/test/CodeGen/X86/avx512cdvl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512cdvl-intrinsics.ll index 96254f7c95b0..37aea45e6107 100644 --- a/llvm/test/CodeGen/X86/avx512cdvl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512cdvl-intrinsics.ll @@ -7,8 +7,8 @@ define <4 x i32> @test_int_x86_avx512_mask_vplzcnt_d_128(<4 x i32> %x0, <4 x i32 ; CHECK-NEXT: vplzcntd %xmm0, %xmm2 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vplzcntd %xmm0, %xmm1 {%k1} -; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; CHECK-NEXT: vplzcntd %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq %1 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %x0, i1 false) diff --git a/llvm/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll index 1377733739fe..c5478dad4224 100644 --- a/llvm/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll @@ -40,8 +40,8 @@ define <8 x float>@test_int_x86_avx512_mask_vextractf32x8(<16 x float> %x0, <8 x ; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm2 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm1 {%k1} -; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: retq %res = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> %x2, i8 %x3) diff --git a/llvm/test/CodeGen/X86/avx512dq-intrinsics.ll b/llvm/test/CodeGen/X86/avx512dq-intrinsics.ll index 97ac0fde10ec..000390404b54 100644 --- a/llvm/test/CodeGen/X86/avx512dq-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512dq-intrinsics.ll @@ -414,8 +414,8 @@ define <16 x float>@test_int_x86_avx512_mask_broadcastf32x2_512(<4 x float> %x0, ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm2 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] -; CHECK-NEXT: vaddps %zmm2, 
%zmm1, %zmm1 ; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1 ; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x2.512(<4 x float> %x0, <16 x float> %x2, i16 %x3) @@ -434,8 +434,8 @@ define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x2_512(<4 x i32> %x0, <16 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm2 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] -; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1 ; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x2.512(<4 x i32> %x0, <16 x i32> %x2, i16 %x3) diff --git a/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll index 595b3e0ebb86..52a84deebf51 100644 --- a/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll @@ -1568,8 +1568,8 @@ define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_256(<4 x double> %x0, ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x19,0xc2,0x01] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x19,0xc1,0x01] -; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca] ; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x19,0xc0,0x01] +; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca] ; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> %x2, i8 %x3) @@ -1588,9 +1588,9 @@ define <4 x double>@test_int_x86_avx512_mask_insertf64x2_256(<4 x double> %x0, < ; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd9,0x01] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x18,0xd1,0x01] -; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xd3] ; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x18,0xc1,0x01] -; CHECK-NEXT: vaddpd %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc2] +; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xcb] +; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> %x3, i8 %x4) %res1 = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> %x3, i8 -1) @@ -1608,9 +1608,9 @@ define <4 x i64>@test_int_x86_avx512_mask_inserti64x2_256(<4 x i64> %x0, <2 x i6 ; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 ## 
EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd9,0x01] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x38,0xd1,0x01] -; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xd3] ; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x38,0xc1,0x01] -; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] +; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xcb] +; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> %x3, i8 %x4) %res1 = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> %x3, i8 -1) diff --git a/llvm/test/CodeGen/X86/avx512dqvl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512dqvl-intrinsics.ll index 1bfdfd0e634d..ad9ea93c2031 100644 --- a/llvm/test/CodeGen/X86/avx512dqvl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512dqvl-intrinsics.ll @@ -635,8 +635,8 @@ define <8 x float>@test_int_x86_avx512_mask_broadcastf32x2_256(<4 x float> %x0, ; CHECK-NEXT: ## ymm1 {%k1} = xmm0[0,1,0,1,0,1,0,1] ; CHECK-NEXT: vbroadcastf32x2 %xmm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x19,0xd0] ; CHECK-NEXT: ## ymm2 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1] -; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca] ; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0xc0] +; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca] ; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x2.256(<4 x float> %x0, <8 x float> %x2, i8 %x3) @@ -680,8 +680,8 @@ define <4 x i32>@test_int_x86_avx512_mask_broadcasti32x2_128(<4 x i32> %x0, <4 x ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vbroadcasti32x2 %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x59,0xc8] ; CHECK-NEXT: vbroadcasti32x2 %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x59,0xd0] -; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca] ; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xc0] +; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca] ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.broadcasti32x2.128(<4 x i32> %x0, <4 x i32> %x2, i8 %x3) diff --git a/llvm/test/CodeGen/X86/avx512ifma-intrinsics.ll b/llvm/test/CodeGen/X86/avx512ifma-intrinsics.ll index 9659dc6d455a..30ecc0d2e49e 100644 --- a/llvm/test/CodeGen/X86/avx512ifma-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512ifma-intrinsics.ll @@ -13,8 +13,8 @@ define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512(<8 x i64> %x0, <8 x i ; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm4 {%k1} ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1} -; 
CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0 ; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z} +; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0 ; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -41,8 +41,8 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512(<8 x i64> %x0, <8 x ; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm4 {%k1} {z} ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1} {z} -; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0 ; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z} +; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0 ; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -69,8 +69,8 @@ define <8 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_512(<8 x i64> %x0, <8 x i ; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm4 {%k1} ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1} -; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0 ; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z} +; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0 ; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -97,8 +97,8 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_512(<8 x i64> %x0, <8 x ; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm4 {%k1} {z} ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1} {z} -; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0 ; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z} +; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0 ; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx512ifmavl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512ifmavl-intrinsics.ll index b2fe6eba88ab..3ca686cef3bf 100644 --- a/llvm/test/CodeGen/X86/avx512ifmavl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512ifmavl-intrinsics.ll @@ -14,8 +14,8 @@ define <2 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_128(<2 x i64> %x0, <2 x i ; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm4 {%k1} ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0 ; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm2 {%k1} {z} +; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0 ; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1 ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -42,8 +42,8 @@ define <4 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_256(<4 x i64> %x0, <4 x i ; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm4 {%k1} ; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0 ; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z} +; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0 ; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1 ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -70,8 +70,8 @@ define <2 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_128(<2 x i64> %x0, <2 x ; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm4 {%k1} {z} ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm0 {%k1} {z} -; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0 ; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm2 {%k1} {z} +; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0 ; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1 ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -98,8 +98,8 @@ define <4 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_256(<4 x i64> %x0, <4 x ; CHECK-NEXT: vpmadd52huq 
%ymm2, %ymm1, %ymm4 {%k1} {z} ; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm0 {%k1} {z} -; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0 ; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z} +; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0 ; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1 ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -126,8 +126,8 @@ define <2 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_128(<2 x i64> %x0, <2 x i ; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm4 {%k1} ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0 ; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm2 {%k1} {z} +; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0 ; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1 ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -154,8 +154,8 @@ define <4 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_256(<4 x i64> %x0, <4 x i ; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm4 {%k1} ; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0 ; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z} +; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0 ; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1 ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -182,8 +182,8 @@ define <2 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_128(<2 x i64> %x0, <2 x ; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm4 {%k1} {z} ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0 {%k1} {z} -; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0 ; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm2 {%k1} {z} +; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0 ; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1 ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -210,8 +210,8 @@ define <4 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_256(<4 x i64> %x0, <4 x ; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm4 {%k1} {z} ; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm0 {%k1} {z} -; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0 ; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z} +; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0 ; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1 ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll index c2d8df6476b3..4d906a4fd29a 100644 --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll @@ -30,8 +30,8 @@ define <4 x i32>@test_int_x86_avx512_pbroadcastd_128(<4 x i32> %x0, <4 x i32> %x ; CHECK-NEXT: vpbroadcastd %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0xd0] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpbroadcastd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x58,0xc8] -; CHECK-NEXT: vpaddd %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc9] ; CHECK-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x58,0xc0] +; CHECK-NEXT: vpaddd %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc9] ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32> %x0, <4 x i32> %x1, i8 -1) @@ -50,8 +50,8 @@ define <4 x 
i64>@test_int_x86_avx512_pbroadcastq_256(<2 x i64> %x0, <4 x i64> %x ; CHECK-NEXT: vpbroadcastq %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0xd0] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpbroadcastq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x59,0xc8] -; CHECK-NEXT: vpaddq %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc9] ; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x59,0xc0] +; CHECK-NEXT: vpaddq %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc9] ; CHECK-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64> %x0, <4 x i64> %x1,i8 -1) @@ -70,8 +70,8 @@ define <2 x i64>@test_int_x86_avx512_pbroadcastq_128(<2 x i64> %x0, <2 x i64> %x ; CHECK-NEXT: vpbroadcastq %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xd0] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpbroadcastq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x59,0xc8] -; CHECK-NEXT: vpaddq %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc9] ; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x59,0xc0] +; CHECK-NEXT: vpaddq %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc9] ; CHECK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64> %x0, <2 x i64> %x1,i8 -1) @@ -90,8 +90,8 @@ define <4 x double> @test_x86_vbroadcast_sd_pd_256(<2 x double> %a0, <4 x double ; CHECK-NEXT: vbroadcastsd %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0xd0] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vbroadcastsd %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x19,0xc8] -; CHECK-NEXT: vaddpd %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc9] ; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x19,0xc0] +; CHECK-NEXT: vaddpd %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc9] ; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> zeroinitializer, i8 -1) @@ -110,8 +110,8 @@ define <8 x float> @test_x86_vbroadcast_ss_ps_256(<4 x float> %a0, <8 x float> % ; CHECK-NEXT: vbroadcastss %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x18,0xd0] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vbroadcastss %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x18,0xc8] -; CHECK-NEXT: vaddps %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc9] ; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x18,0xc0] +; CHECK-NEXT: vaddps %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc9] ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> 
zeroinitializer, i8 -1) @@ -130,8 +130,8 @@ define <4 x float> @test_x86_vbroadcast_ss_ps_128(<4 x float> %a0, <4 x float> % ; CHECK-NEXT: vbroadcastss %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xd0] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vbroadcastss %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x18,0xc8] -; CHECK-NEXT: vaddps %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc9] ; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x18,0xc0] +; CHECK-NEXT: vaddps %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc9] ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 -1) @@ -152,9 +152,9 @@ define <4 x float>@test_int_x86_avx512_mask_movsldup_128(<4 x float> %x0, <4 x f ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmovsldup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x12,0xc8] ; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0,0,2,2] -; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca] ; CHECK-NEXT: vmovsldup %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x12,0xc0] ; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[0,0,2,2] +; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca] ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float> %x0, <4 x float> %x1, i8 %x2) @@ -175,9 +175,9 @@ define <8 x float>@test_int_x86_avx512_mask_movsldup_256(<8 x float> %x0, <8 x f ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmovsldup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x12,0xc8] ; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,0,2,2,4,4,6,6] -; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca] ; CHECK-NEXT: vmovsldup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x12,0xc0] ; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6] +; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca] ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float> %x0, <8 x float> %x1, i8 %x2) @@ -198,9 +198,9 @@ define <4 x float>@test_int_x86_avx512_mask_movshdup_128(<4 x float> %x0, <4 x f ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmovshdup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x16,0xc8] ; CHECK-NEXT: ## xmm1 {%k1} = xmm0[1,1,3,3] -; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca] ; CHECK-NEXT: vmovshdup %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x16,0xc0] ; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[1,1,3,3] +; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca] ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x float> 
@llvm.x86.avx512.mask.movshdup.128(<4 x float> %x0, <4 x float> %x1, i8 %x2) @@ -221,9 +221,9 @@ define <8 x float>@test_int_x86_avx512_mask_movshdup_256(<8 x float> %x0, <8 x f ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmovshdup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x16,0xc8] ; CHECK-NEXT: ## ymm1 {%k1} = ymm0[1,1,3,3,5,5,7,7] -; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca] ; CHECK-NEXT: vmovshdup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x16,0xc0] ; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7] +; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca] ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float> %x0, <8 x float> %x1, i8 %x2) @@ -243,9 +243,9 @@ define <2 x double>@test_int_x86_avx512_mask_movddup_128(<2 x double> %x0, <2 x ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmovddup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x12,0xc8] ; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0,0] -; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca] ; CHECK-NEXT: vmovddup %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0x89,0x12,0xc0] ; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[0,0] +; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca] ; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> %x1, i8 %x2) @@ -266,9 +266,9 @@ define <4 x double>@test_int_x86_avx512_mask_movddup_256(<4 x double> %x0, <4 x ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmovddup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x29,0x12,0xc8] ; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,0,2,2] -; CHECK-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xca] ; CHECK-NEXT: vmovddup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0xa9,0x12,0xc0] ; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[0,0,2,2] +; CHECK-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xca] ; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> %x1, i8 %x2) @@ -3209,10 +3209,10 @@ define <2 x double>@test_int_x86_avx512_mask_shuf_pd_128(<2 x double> %x0, <2 x ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vshufpd $1, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xc6,0xd1,0x01] ; CHECK-NEXT: ## xmm2 {%k1} = xmm0[1],xmm1[0] -; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xd3] ; CHECK-NEXT: vshufpd $1, %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xc6,0xc1,0x01] ; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[1],xmm1[0] -; CHECK-NEXT: vaddpd %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc2] +; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xcb] +; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ## EVEX TO 
VEX Compression encoding: [0xc5,0xf9,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double> %x0, <2 x double> %x1, i32 1, <2 x double> %x3, i8 %x4) %res1 = call <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double> %x0, <2 x double> %x1, i32 1, <2 x double> %x3, i8 -1) @@ -3540,9 +3540,9 @@ define <2 x i64>@test_int_x86_avx512_mask_psrl_q_128(<2 x i64> %x0, <2 x i64> %x ; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd3,0xd9] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xd3,0xd1] -; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xd3] ; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xd3,0xc1] -; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc0] +; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xcb] +; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.psrl.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) %res1 = call <2 x i64> @llvm.x86.avx512.mask.psrl.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) @@ -3560,9 +3560,9 @@ define <4 x i64>@test_int_x86_avx512_mask_psrl_q_256(<4 x i64> %x0, <2 x i64> %x ; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd3,0xd9] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xd3,0xd1] -; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xd3] ; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0xd3,0xc1] -; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] +; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xcb] +; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.psrl.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3) %res1 = call <4 x i64> @llvm.x86.avx512.mask.psrl.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 -1) @@ -3580,9 +3580,9 @@ define <4 x i32>@test_int_x86_avx512_mask_psrl_d_128(<4 x i32> %x0, <4 x i32> %x ; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd2,0xd9] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd2,0xd1] -; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xd3] ; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd2,0xc1] -; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] +; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xcb] +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.psrl.d.128(<4 x i32> %x0, <4 x i32> 
%x1, <4 x i32> %x2, i8 %x3) %res1 = call <4 x i32> @llvm.x86.avx512.mask.psrl.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) @@ -3600,9 +3600,9 @@ define <8 x i32>@test_int_x86_avx512_mask_psrl_d_256(<8 x i32> %x0, <4 x i32> %x ; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd2,0xd9] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd2,0xd1] -; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xd3] ; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd2,0xc1] -; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2] +; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xcb] +; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.psrl.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3) %res1 = call <8 x i32> @llvm.x86.avx512.mask.psrl.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 -1) @@ -3720,8 +3720,8 @@ define <2 x i64>@test_int_x86_avx512_mask_psrl_qi_128(<2 x i64> %x0, i32 %x1, <2 ; CHECK-NEXT: vpsrlq $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x73,0xd0,0x03] ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpsrlq $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x73,0xd0,0x03] -; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xca] ; CHECK-NEXT: vpsrlq $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x73,0xd0,0x03] +; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xca] ; CHECK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.psrl.qi.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 %x3) @@ -3740,8 +3740,8 @@ define <4 x i64>@test_int_x86_avx512_mask_psrl_qi_256(<4 x i64> %x0, i32 %x1, <4 ; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x73,0xd0,0x03] ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x73,0xd0,0x03] -; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xca] ; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x73,0xd0,0x03] +; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xca] ; CHECK-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.psrl.qi.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3) @@ -3760,8 +3760,8 @@ define <4 x i32>@test_int_x86_avx512_mask_psrl_di_128(<4 x i32> %x0, i32 %x1, <4 ; CHECK-NEXT: vpsrld $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x72,0xd0,0x03] ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpsrld $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xd0,0x03] -; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca] ; CHECK-NEXT: vpsrld $3, 
%xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x72,0xd0,0x03] +; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca] ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.psrl.di.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3) @@ -3780,8 +3780,8 @@ define <8 x i32>@test_int_x86_avx512_mask_psrl_di_256(<8 x i32> %x0, i32 %x1, <8 ; CHECK-NEXT: vpsrld $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x72,0xd0,0x03] ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpsrld $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xd0,0x03] -; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xca] ; CHECK-NEXT: vpsrld $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x72,0xd0,0x03] +; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xca] ; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.psrl.di.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3) @@ -4642,10 +4642,10 @@ define <4 x i32>@test_int_x86_avx512_mask_valign_d_128(<4 x i32> %x0, <4 x i32> ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: valignd $2, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x03,0xd1,0x02] ; CHECK-NEXT: ## xmm2 {%k1} = xmm1[2,3],xmm0[0,1] -; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xd3] ; CHECK-NEXT: valignd $2, %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0x89,0x03,0xc1,0x02] ; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm1[2,3],xmm0[0,1] -; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] +; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xcb] +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 2, <4 x i32> %x3, i8 %x4) %res1 = call <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 2, <4 x i32> %x3, i8 -1) @@ -4817,9 +4817,9 @@ define <8 x float>@test_int_x86_avx512_mask_insertf32x4_256(<8 x float> %x0, <4 ; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd9,0x01] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x18,0xd1,0x01] -; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xd3] ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x18,0xc1,0x01] -; CHECK-NEXT: vaddps %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc2] +; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xcb] +; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> %x3, i8 %x4) 
%res1 = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> %x3, i8 -1) @@ -4837,9 +4837,9 @@ define <8 x i32>@test_int_x86_avx512_mask_inserti32x4_256(<8 x i32> %x0, <4 x i3 ; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd9,0x01] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x38,0xd1,0x01] -; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xd3] ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x38,0xc1,0x01] -; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2] +; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xcb] +; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> %x3, i8 %x4) diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll index 684b0468cf51..1f324d679564 100644 --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll @@ -4368,8 +4368,8 @@ define <4 x i32>@test_int_x86_avx512_mask_prol_d_128(<4 x i32> %x0, i32 %x1, <4 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vprold $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xc8,0x03] ; CHECK-NEXT: vprold $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0x89,0x72,0xc8,0x03] -; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca] ; CHECK-NEXT: vprold $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x72,0xc8,0x03] +; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca] ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.prol.d.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3) @@ -4388,8 +4388,8 @@ define <8 x i32>@test_int_x86_avx512_mask_prol_d_256(<8 x i32> %x0, i32 %x1, <8 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vprold $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xc8,0x03] ; CHECK-NEXT: vprold $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0xa9,0x72,0xc8,0x03] -; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xca] ; CHECK-NEXT: vprold $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xc8,0x03] +; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xca] ; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.prol.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3) @@ -4408,8 +4408,8 @@ define <2 x i64>@test_int_x86_avx512_mask_prol_q_128(<2 x i64> %x0, i32 %x1, <2 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vprolq $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x72,0xc8,0x03] ; CHECK-NEXT: vprolq $3, %xmm0, %xmm2 {%k1} {z} ## encoding: 
[0x62,0xf1,0xed,0x89,0x72,0xc8,0x03] -; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xca] ; CHECK-NEXT: vprolq $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x72,0xc8,0x03] +; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xca] ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.prol.q.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 %x3) @@ -4428,8 +4428,8 @@ define <4 x i64>@test_int_x86_avx512_mask_prol_q_256(<4 x i64> %x0, i32 %x1, <4 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vprolq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x72,0xc8,0x03] ; CHECK-NEXT: vprolq $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0xa9,0x72,0xc8,0x03] -; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xca] ; CHECK-NEXT: vprolq $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x72,0xc8,0x03] +; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xca] ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.prol.q.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3) @@ -4528,8 +4528,8 @@ define <4 x i32>@test_int_x86_avx512_mask_pror_d_128(<4 x i32> %x0, i32 %x1, <4 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vprord $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xc0,0x03] ; CHECK-NEXT: vprord $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0x89,0x72,0xc0,0x03] -; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca] ; CHECK-NEXT: vprord $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x72,0xc0,0x03] +; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca] ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.pror.d.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3) @@ -4548,8 +4548,8 @@ define <8 x i32>@test_int_x86_avx512_mask_pror_d_256(<8 x i32> %x0, i32 %x1, <8 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vprord $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xc0,0x03] ; CHECK-NEXT: vprord $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0xa9,0x72,0xc0,0x03] -; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xca] ; CHECK-NEXT: vprord $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xc0,0x03] +; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xca] ; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.pror.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3) @@ -4568,8 +4568,8 @@ define <2 x i64>@test_int_x86_avx512_mask_pror_q_128(<2 x i64> %x0, i32 %x1, <2 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vprorq $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x72,0xc0,0x03] ; CHECK-NEXT: vprorq $3, %xmm0, %xmm2 {%k1} {z} ## encoding: 
[0x62,0xf1,0xed,0x89,0x72,0xc0,0x03] -; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xca] ; CHECK-NEXT: vprorq $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x72,0xc0,0x03] +; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xca] ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.pror.q.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 %x3) @@ -4588,8 +4588,8 @@ define <4 x i64>@test_int_x86_avx512_mask_pror_q_256(<4 x i64> %x0, i32 %x1, <4 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vprorq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x72,0xc0,0x03] ; CHECK-NEXT: vprorq $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0xa9,0x72,0xc0,0x03] -; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xca] ; CHECK-NEXT: vprorq $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x72,0xc0,0x03] +; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xca] ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.pror.q.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3) @@ -4690,9 +4690,9 @@ define <2 x double>@test_int_x86_avx512_mask_fixupimm_pd_128(<2 x double> %x0, < ; CHECK-NEXT: vfixupimmpd $5, %xmm2, %xmm1, %xmm3 {%k1} ## encoding: [0x62,0xf3,0xf5,0x09,0x54,0xda,0x05] ; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] ; CHECK-NEXT: vfixupimmpd $4, %xmm2, %xmm1, %xmm4 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0x89,0x54,0xe2,0x04] -; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xdc] ; CHECK-NEXT: vfixupimmpd $3, %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf3,0xf5,0x08,0x54,0xc2,0x03] -; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xc0] +; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xcc] +; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double> %x0, <2 x double> %x1,<2 x i64> %x2, i32 5, i8 %x4) %res1 = call <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double> zeroinitializer, <2 x double> %x1, <2 x i64> %x2, i32 4, i8 %x4) @@ -4732,9 +4732,9 @@ define <4 x double>@test_int_x86_avx512_mask_fixupimm_pd_256(<4 x double> %x0, < ; CHECK-NEXT: vfixupimmpd $4, %ymm2, %ymm1, %ymm3 {%k1} ## encoding: [0x62,0xf3,0xf5,0x29,0x54,0xda,0x04] ; CHECK-NEXT: vpxor %ymm4, %ymm4, %ymm4 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xe4] ; CHECK-NEXT: vfixupimmpd $5, %ymm2, %ymm1, %ymm4 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0xa9,0x54,0xe2,0x05] -; CHECK-NEXT: vaddpd %ymm4, %ymm3, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xdc] ; CHECK-NEXT: vfixupimmpd $3, %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0xf5,0x28,0x54,0xc2,0x03] -; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc0] +; CHECK-NEXT: vaddpd %ymm4, %ymm3, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xcc] +; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## 
EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i32 4, i8 %x4) %res1 = call <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double> zeroinitializer, <4 x double> %x1, <4 x i64> %x2 , i32 5, i8 %x4) @@ -4755,9 +4755,9 @@ define <4 x double>@test_int_x86_avx512_maskz_fixupimm_pd_256(<4 x double> %x0, ; CHECK-NEXT: vpxor %ymm4, %ymm4, %ymm4 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xe4] ; CHECK-NEXT: vmovapd %ymm0, %ymm5 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xe8] ; CHECK-NEXT: vfixupimmpd $4, %ymm4, %ymm1, %ymm5 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0xa9,0x54,0xec,0x04] -; CHECK-NEXT: vaddpd %ymm5, %ymm3, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xdd] ; CHECK-NEXT: vfixupimmpd $3, %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0xf5,0x28,0x54,0xc2,0x03] -; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc0] +; CHECK-NEXT: vaddpd %ymm5, %ymm3, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xcd] +; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i32 5, i8 %x4) %res1 = call <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> zeroinitializer, i32 4, i8 %x4) diff --git a/llvm/test/CodeGen/X86/bswap_tree2.ll b/llvm/test/CodeGen/X86/bswap_tree2.ll index 1340b7662a7a..a9c74df9d0d9 100644 --- a/llvm/test/CodeGen/X86/bswap_tree2.ll +++ b/llvm/test/CodeGen/X86/bswap_tree2.ll @@ -9,32 +9,31 @@ define i32 @test1(i32 %x) nounwind { ; CHECK-LABEL: test1: ; CHECK: # BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl %eax, %ecx -; CHECK-NEXT: andl $16711680, %ecx # imm = 0xFF0000 -; CHECK-NEXT: movl %eax, %edx -; CHECK-NEXT: orl $-16777216, %edx # imm = 0xFF000000 -; CHECK-NEXT: shll $8, %ecx -; CHECK-NEXT: shrl $8, %edx -; CHECK-NEXT: orl %ecx, %edx -; CHECK-NEXT: bswapl %eax -; CHECK-NEXT: shrl $16, %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl %ecx, %edx +; CHECK-NEXT: andl $16711680, %edx # imm = 0xFF0000 +; CHECK-NEXT: movl %ecx, %eax +; CHECK-NEXT: orl $-16777216, %eax # imm = 0xFF000000 +; CHECK-NEXT: shll $8, %edx +; CHECK-NEXT: shrl $8, %eax +; CHECK-NEXT: bswapl %ecx +; CHECK-NEXT: shrl $16, %ecx ; CHECK-NEXT: orl %edx, %eax +; CHECK-NEXT: orl %ecx, %eax ; CHECK-NEXT: retl ; ; CHECK64-LABEL: test1: ; CHECK64: # BB#0: -; CHECK64-NEXT: movl %edi, %eax -; CHECK64-NEXT: andl $16711680, %eax # imm = 0xFF0000 ; CHECK64-NEXT: movl %edi, %ecx -; CHECK64-NEXT: orl $-16777216, %ecx # imm = 0xFF000000 -; CHECK64-NEXT: shll $8, %eax -; CHECK64-NEXT: shrl $8, %ecx -; CHECK64-NEXT: orl %eax, %ecx +; CHECK64-NEXT: andl $16711680, %ecx # imm = 0xFF0000 +; CHECK64-NEXT: movl %edi, %eax +; CHECK64-NEXT: orl $-16777216, %eax # imm = 0xFF000000 +; CHECK64-NEXT: shll $8, %ecx +; CHECK64-NEXT: shrl $8, %eax ; CHECK64-NEXT: bswapl %edi ; CHECK64-NEXT: shrl $16, %edi -; CHECK64-NEXT: orl %ecx, %edi -; CHECK64-NEXT: movl %edi, %eax +; CHECK64-NEXT: orl %ecx, %eax +; CHECK64-NEXT: orl %edi, %eax ; CHECK64-NEXT: retq %byte0 = and i32 %x, 255 ; 0x000000ff %byte1 = and i32 %x, 65280 ; 0x0000ff00 diff --git a/llvm/test/CodeGen/X86/fold-tied-op.ll 
b/llvm/test/CodeGen/X86/fold-tied-op.ll index eb06eb75a4d7..d68236e9d250 100644 --- a/llvm/test/CodeGen/X86/fold-tied-op.ll +++ b/llvm/test/CodeGen/X86/fold-tied-op.ll @@ -6,10 +6,9 @@ target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" target triple = "i386--netbsd" ; CHECK-LABEL: fn1 -; CHECK: orl {{.*#+}} 4-byte Folded Reload -; CHECK: addl {{.*#+}} 4-byte Folded Reload -; CHECK: xorl {{.*#+}} 4-byte Folded Reload -; CHECK: xorl {{.*#+}} 4-byte Folded Reload +; CHECK: addl {{.*#+}} 4-byte Folded Reload +; CHECK: imull {{.*#+}} 4-byte Folded Reload +; CHECK: orl {{.*#+}} 4-byte Folded Reload ; CHECK: retl %struct.XXH_state64_t = type { i32, i32, i64, i64, i64 } diff --git a/llvm/test/CodeGen/X86/fp128-i128.ll b/llvm/test/CodeGen/X86/fp128-i128.ll index 6c6bc8bdc1d1..98082ec611d4 100644 --- a/llvm/test/CodeGen/X86/fp128-i128.ll +++ b/llvm/test/CodeGen/X86/fp128-i128.ll @@ -50,8 +50,8 @@ define void @TestUnionLD1(fp128 %s, i64 %n) #0 { ; CHECK-NEXT: andq %rdi, %rcx ; CHECK-NEXT: movabsq $-281474976710656, %rdx # imm = 0xFFFF000000000000 ; CHECK-NEXT: andq -{{[0-9]+}}(%rsp), %rdx -; CHECK-NEXT: orq %rcx, %rdx ; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: orq %rcx, %rdx ; CHECK-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 ; CHECK-NEXT: jmp foo # TAILCALL diff --git a/llvm/test/CodeGen/X86/haddsub-2.ll b/llvm/test/CodeGen/X86/haddsub-2.ll index b5507523a75a..4596b83f7bc2 100644 --- a/llvm/test/CodeGen/X86/haddsub-2.ll +++ b/llvm/test/CodeGen/X86/haddsub-2.ll @@ -933,14 +933,14 @@ define <4 x float> @not_a_hsub_2(<4 x float> %A, <4 x float> %B) { ; AVX-NEXT: vsubss %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] ; AVX-NEXT: vsubss %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[3,1,2,3] +; AVX-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0] +; AVX-NEXT: vsubss %xmm4, %xmm3, %xmm3 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] -; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3] -; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] -; AVX-NEXT: vsubss %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] -; AVX-NEXT: vsubss %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX-NEXT: vsubss %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0] +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] ; AVX-NEXT: retq %vecext = extractelement <4 x float> %A, i32 2 %vecext1 = extractelement <4 x float> %A, i32 3 diff --git a/llvm/test/CodeGen/X86/lrshrink.ll b/llvm/test/CodeGen/X86/lrshrink.ll deleted file mode 100644 index a9cf086dbd90..000000000000 --- a/llvm/test/CodeGen/X86/lrshrink.ll +++ /dev/null @@ -1,57 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s - -; Checks if "%7 = add nuw nsw i64 %4, %2" is moved before the last call -; to minimize live-range. 
- -define i64 @test(i1 %a, i64 %r1, i64 %r2, i64 %s1, i64 %s2, i64 %t1, i64 %t2) { -entry: - br i1 %a, label %then, label %else - -then: - br label %else - -else: - %0 = phi i64 [ 4, %entry ], [ 10, %then ] - %r = phi i64 [ %r1, %entry ], [ %r2, %then ] - %s = phi i64 [ %s1, %entry ], [ %s2, %then ] - %t = phi i64 [ %t1, %entry ], [ %t2, %then ] -; CHECK-LABEL: test: -; CHECK: add -; CHECK: add -; CHECK: call -; CHECK: add -; CHECK: call -; CHECK: add -; CHECK: call -; CHECK: add - %1 = tail call i32 @_Z3foov() - %2 = zext i32 %1 to i64 - %3 = tail call i32 @_Z3foov() - %4 = zext i32 %3 to i64 - %5 = tail call i32 @_Z3foov() - %6 = zext i32 %5 to i64 - %7 = add nuw nsw i64 %0, %r - tail call void @llvm.dbg.value(metadata i64 %7, i64 0, metadata !5, metadata !DIExpression()), !dbg !6 - %8 = add nuw nsw i64 %2, %7 - %9 = add nuw nsw i64 %4, %8 - %10 = add nuw nsw i64 %6, %9 - %11 = add nuw nsw i64 %s, %t - tail call void @llvm.dbg.value(metadata i64 %11, i64 0, metadata !5, metadata !DIExpression()), !dbg !6 - %12 = add nuw nsw i64 %10, %11 - ret i64 %12 -} - -declare i32 @_Z3foov() -declare void @llvm.dbg.value(metadata, i64, metadata, metadata) - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!1, !2} - -!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !3, emissionKind: FullDebug) -!1 = !{i32 2, !"Dwarf Version", i32 4} -!2 = !{i32 2, !"Debug Info Version", i32 3} -!3 = !DIFile(filename: "a.c", directory: "./") -!4 = distinct !DISubprogram(name: "test", scope: !3, unit: !0) -!5 = !DILocalVariable(name: "x", scope: !4) -!6 = !DILocation(line: 4, scope: !4) diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll index af86df510016..d332b2f3169f 100644 --- a/llvm/test/CodeGen/X86/madd.ll +++ b/llvm/test/CodeGen/X86/madd.ll @@ -129,9 +129,9 @@ define i32 @test_unsigned_short(i16* nocapture readonly, i16* nocapture readonly ; SSE2-NEXT: pmullw %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: addq $16, %rsi ; SSE2-NEXT: addq $16, %rdi ; SSE2-NEXT: addq $-8, %rax @@ -246,23 +246,23 @@ define i32 @_Z9test_charPcS_i(i8* nocapture readonly, i8* nocapture readonly, i3 ; SSE2-NEXT: pmullw %xmm4, %xmm5 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; SSE2-NEXT: psrad $16, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $16, %xmm5 +; SSE2-NEXT: movq {{.*#+}} xmm6 = mem[0],zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: psraw $8, %xmm6 +; SSE2-NEXT: movq {{.*#+}} xmm7 = mem[0],zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: psraw $8, %xmm7 +; SSE2-NEXT: pmullw %xmm6, %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; SSE2-NEXT: psrad $16, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $16, %xmm7 +; SSE2-NEXT: paddd %xmm7, %xmm2 +; SSE2-NEXT: paddd %xmm6, %xmm3 +; SSE2-NEXT: paddd %xmm5, %xmm1 ; SSE2-NEXT: paddd %xmm4, %xmm0 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; SSE2-NEXT: psrad $16, %xmm4 -; SSE2-NEXT: 
paddd %xmm4, %xmm1 -; SSE2-NEXT: movq {{.*#+}} xmm4 = mem[0],zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: psraw $8, %xmm4 -; SSE2-NEXT: movq {{.*#+}} xmm5 = mem[0],zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: psraw $8, %xmm5 -; SSE2-NEXT: pmullw %xmm4, %xmm5 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; SSE2-NEXT: psrad $16, %xmm4 -; SSE2-NEXT: paddd %xmm4, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; SSE2-NEXT: psrad $16, %xmm4 -; SSE2-NEXT: paddd %xmm4, %xmm2 ; SSE2-NEXT: addq $16, %rsi ; SSE2-NEXT: addq $16, %rdi ; SSE2-NEXT: addq $-16, %rax diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll index 2f7714e63886..71417694b0d4 100644 --- a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll +++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll @@ -270,9 +270,9 @@ define <4 x float> @merge_4f32_f32_012u(float* %ptr) nounwind uwtable noinline s ; SSE2: # BB#0: ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE2-NEXT: retq ; ; SSE41-LABEL: merge_4f32_f32_012u: @@ -292,9 +292,9 @@ define <4 x float> @merge_4f32_f32_012u(float* %ptr) nounwind uwtable noinline s ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; X32-SSE1-NEXT: retl ; ; X32-SSE41-LABEL: merge_4f32_f32_012u: @@ -321,9 +321,9 @@ define <4 x float> @merge_4f32_f32_019u(float* %ptr) nounwind uwtable noinline s ; SSE2: # BB#0: ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE2-NEXT: retq ; ; SSE41-LABEL: merge_4f32_f32_019u: @@ -343,9 +343,9 @@ define <4 x float> @merge_4f32_f32_019u(float* %ptr) nounwind uwtable noinline s ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 
= xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; X32-SSE1-NEXT: retl ; ; X32-SSE41-LABEL: merge_4f32_f32_019u: diff --git a/llvm/test/CodeGen/X86/misched-matrix.ll b/llvm/test/CodeGen/X86/misched-matrix.ll index 94bbe75702cb..e62a1d04dad6 100644 --- a/llvm/test/CodeGen/X86/misched-matrix.ll +++ b/llvm/test/CodeGen/X86/misched-matrix.ll @@ -17,9 +17,9 @@ ; ; TOPDOWN-LABEL: %for.body ; TOPDOWN: movl %{{.*}}, ( -; TOPDOWN-NOT: imull {{[0-9]*}}( +; TOPDOWN: imull {{[0-9]*}}( ; TOPDOWN: movl %{{.*}}, 4( -; TOPDOWN-NOT: imull {{[0-9]*}}( +; TOPDOWN: imull {{[0-9]*}}( ; TOPDOWN: movl %{{.*}}, 8( ; TOPDOWN: movl %{{.*}}, 12( ; TOPDOWN-LABEL: %for.end diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll index 0bda41a30c69..d26cf02dd942 100644 --- a/llvm/test/CodeGen/X86/oddshuffles.ll +++ b/llvm/test/CodeGen/X86/oddshuffles.ll @@ -746,9 +746,9 @@ define void @interleave_24i8_in(<24 x i8>* %p, <8 x i8>* %q1, <8 x i8>* %q2, <8 ; SSE2-LABEL: interleave_24i8_in: ; SSE2: # BB#0: ; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] @@ -791,17 +791,17 @@ define void @interleave_24i8_in(<24 x i8>* %p, <8 x i8>* %q1, <8 x i8>* %q2, <8 ; SSE42: # BB#0: ; SSE42-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE42-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE42-NEXT: movq {{.*#+}} xmm2 = mem[0],zero ; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE42-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE42-NEXT: movdqa %xmm0, %xmm2 -; SSE42-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,8],zero,xmm2[1,9],zero,xmm2[2,10],zero,xmm2[3,11],zero,xmm2[4,12],zero,xmm2[5] -; SSE42-NEXT: movdqa %xmm1, %xmm3 +; SSE42-NEXT: movdqa %xmm0, %xmm1 +; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,8],zero,xmm1[1,9],zero,xmm1[2,10],zero,xmm1[3,11],zero,xmm1[4,12],zero,xmm1[5] +; SSE42-NEXT: movdqa %xmm2, %xmm3 ; SSE42-NEXT: pshufb {{.*#+}} xmm3 = zero,zero,xmm3[0],zero,zero,xmm3[1],zero,zero,xmm3[2],zero,zero,xmm3[3],zero,zero,xmm3[4],zero -; SSE42-NEXT: por %xmm2, %xmm3 +; SSE42-NEXT: por %xmm1, %xmm3 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u] -; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,xmm1[5],zero,zero,xmm1[6],zero,zero,xmm1[7,u,u,u,u,u,u,u,u] -; SSE42-NEXT: por %xmm0, %xmm1 -; SSE42-NEXT: movq %xmm1, 16(%rdi) +; SSE42-NEXT: pshufb {{.*#+}} xmm2 = zero,xmm2[5],zero,zero,xmm2[6],zero,zero,xmm2[7,u,u,u,u,u,u,u,u] +; SSE42-NEXT: por %xmm0, %xmm2 +; SSE42-NEXT: movq %xmm2, 16(%rdi) ; SSE42-NEXT: movdqu %xmm3, (%rdi) ; SSE42-NEXT: retq ; @@ -809,16 +809,16 @@ define void @interleave_24i8_in(<24 x i8>* %p, <8 x i8>* %q1, <8 x i8>* %q2, <8 ; AVX: # BB#0: ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5] -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = 
zero,zero,xmm1[0],zero,zero,xmm1[1],zero,zero,xmm1[2],zero,zero,xmm1[3],zero,zero,xmm1[4],zero -; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm2[0],zero,zero,xmm2[1],zero,zero,xmm2[2],zero,zero,xmm2[3],zero,zero,xmm2[4],zero +; AVX-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[5],zero,zero,xmm1[6],zero,zero,xmm1[7,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[5],zero,zero,xmm2[6],zero,zero,xmm2[7,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vmovq %xmm0, 16(%rdi) -; AVX-NEXT: vmovdqu %xmm2, (%rdi) +; AVX-NEXT: vmovdqu %xmm1, (%rdi) ; AVX-NEXT: retq %s1 = load <8 x i8>, <8 x i8>* %q1, align 4 %s2 = load <8 x i8>, <8 x i8>* %q2, align 4 diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll index 50a661fcca11..88cb7a6d5825 100644 --- a/llvm/test/CodeGen/X86/pmul.ll +++ b/llvm/test/CodeGen/X86/pmul.ll @@ -1152,9 +1152,9 @@ define <4 x i32> @mul_v4i64_zero_upper(<4 x i32> %val1, <4 x i32> %val2) { ; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE2-NEXT: pmuludq %xmm4, %xmm2 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; SSE2-NEXT: pmuludq %xmm0, %xmm1 +; SSE2-NEXT: pmuludq %xmm4, %xmm2 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm1[1,3] ; SSE2-NEXT: movaps %xmm2, %xmm0 ; SSE2-NEXT: retq @@ -1166,9 +1166,9 @@ define <4 x i32> @mul_v4i64_zero_upper(<4 x i32> %val1, <4 x i32> %val2) { ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero -; SSE41-NEXT: pmuludq %xmm2, %xmm4 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero ; SSE41-NEXT: pmuludq %xmm3, %xmm0 +; SSE41-NEXT: pmuludq %xmm2, %xmm4 ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm4[1,3] ; SSE41-NEXT: retq ; @@ -1312,17 +1312,17 @@ define <8 x i32> @mul_v8i64_zero_upper(<8 x i32> %val1, <8 x i32> %val2) { ; SSE2-NEXT: movdqa %xmm1, %xmm5 ; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] ; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE2-NEXT: movdqa %xmm2, %xmm7 -; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; SSE2-NEXT: pmuludq %xmm7, %xmm4 +; SSE2-NEXT: movdqa %xmm2, %xmm8 +; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] ; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; SSE2-NEXT: pmuludq %xmm0, %xmm2 -; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm2[1,3] -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE2-NEXT: pmuludq %xmm0, %xmm5 +; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm6[2],xmm3[3],xmm6[3] ; SSE2-NEXT: pmuludq %xmm1, %xmm3 +; SSE2-NEXT: pmuludq %xmm7, %xmm5 +; SSE2-NEXT: pmuludq %xmm0, %xmm2 +; SSE2-NEXT: pmuludq %xmm8, %xmm4 +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm2[1,3] ; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm3[1,3] ; SSE2-NEXT: 
movaps %xmm4, %xmm0 ; SSE2-NEXT: movaps %xmm5, %xmm1 @@ -1331,22 +1331,22 @@ define <8 x i32> @mul_v8i64_zero_upper(<8 x i32> %val1, <8 x i32> %val2) { ; SSE41-LABEL: mul_v8i64_zero_upper: ; SSE41: # BB#0: # %entry ; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm8 = xmm4[0],zero,xmm4[1],zero ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm7 = xmm1[0],zero,xmm1[1],zero ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero -; SSE41-NEXT: pmuludq %xmm4, %xmm1 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero -; SSE41-NEXT: pmuludq %xmm5, %xmm0 -; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero -; SSE41-NEXT: pmuludq %xmm6, %xmm2 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero ; SSE41-NEXT: pmuludq %xmm7, %xmm1 +; SSE41-NEXT: pmuludq %xmm6, %xmm2 +; SSE41-NEXT: pmuludq %xmm5, %xmm0 +; SSE41-NEXT: pmuludq %xmm8, %xmm4 +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm4[1,3] ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3] ; SSE41-NEXT: retq ; @@ -1356,11 +1356,11 @@ define <8 x i32> @mul_v8i64_zero_upper(<8 x i32> %val1, <8 x i32> %val2) { ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX2-NEXT: vpmuludq %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,3],ymm0[1,3],ymm2[5,7],ymm0[5,7] +; AVX2-NEXT: vpmuludq %ymm3, %ymm2, %ymm1 +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,3],ymm0[1,3],ymm1[5,7],ymm0[5,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: retq ; @@ -1467,22 +1467,22 @@ define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) { ; SSE41-LABEL: mul_v8i64_sext: ; SSE41: # BB#0: ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,1,2,3] -; SSE41-NEXT: pmovsxwq %xmm3, %xmm4 +; SSE41-NEXT: pmovsxwq %xmm3, %xmm8 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovsxwq %xmm3, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3] ; SSE41-NEXT: pmovsxwq %xmm3, %xmm6 -; SSE41-NEXT: pmovsxwq %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3] +; SSE41-NEXT: pmovsxwq %xmm3, %xmm7 +; SSE41-NEXT: pmovsxwq %xmm0, %xmm5 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] ; SSE41-NEXT: pmovsxdq %xmm0, %xmm3 -; SSE41-NEXT: pmuldq %xmm4, %xmm3 ; SSE41-NEXT: pmovsxdq %xmm2, %xmm2 -; SSE41-NEXT: pmuldq %xmm5, %xmm2 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] ; SSE41-NEXT: pmovsxdq %xmm0, %xmm4 -; SSE41-NEXT: pmuldq %xmm6, %xmm4 ; SSE41-NEXT: pmovsxdq %xmm1, %xmm0 -; SSE41-NEXT: pmuldq %xmm7, %xmm0 +; SSE41-NEXT: pmuldq %xmm5, %xmm0 +; SSE41-NEXT: pmuldq %xmm7, %xmm4 +; SSE41-NEXT: pmuldq %xmm6, %xmm2 +; SSE41-NEXT: pmuldq %xmm8, %xmm3 ; SSE41-NEXT: movdqa %xmm4, %xmm1 ; SSE41-NEXT: retq ; @@ -1493,10 +1493,9 @@ define <8 x 
i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) { ; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX2-NEXT: vpmovsxdq %xmm3, %ymm3 -; AVX2-NEXT: vpmuldq %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1 ; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm2, %ymm1 +; AVX2-NEXT: vpmuldq %ymm3, %ymm2, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: mul_v8i64_sext: diff --git a/llvm/test/CodeGen/X86/pr29112.ll b/llvm/test/CodeGen/X86/pr29112.ll index 94904018872b..8c970b3d4771 100644 --- a/llvm/test/CodeGen/X86/pr29112.ll +++ b/llvm/test/CodeGen/X86/pr29112.ll @@ -38,8 +38,7 @@ define <4 x float> @bar(<4 x float>* %a1p, <4 x float>* %a2p, <4 x float> %a3, < ; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm8[0],xmm0[0],xmm8[2,3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[1],xmm1[3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm14 = xmm1[0,1,2],xmm3[1] -; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm10[0,1,2],xmm3[1] -; CHECK-NEXT: vaddps %xmm14, %xmm1, %xmm10 +; CHECK-NEXT: vinsertps {{.*#+}} xmm10 = xmm10[0,1,2],xmm3[1] ; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[2,3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[1],xmm0[3] @@ -53,9 +52,10 @@ define <4 x float> @bar(<4 x float>* %a1p, <4 x float>* %a2p, <4 x float> %a3, < ; CHECK-NEXT: vmovaps %xmm15, %xmm1 ; CHECK-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill ; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm9 +; CHECK-NEXT: vaddps %xmm14, %xmm10, %xmm0 ; CHECK-NEXT: vaddps %xmm1, %xmm1, %xmm8 -; CHECK-NEXT: vaddps %xmm11, %xmm3, %xmm0 -; CHECK-NEXT: vaddps %xmm10, %xmm0, %xmm0 +; CHECK-NEXT: vaddps %xmm11, %xmm3, %xmm3 +; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 ; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: vmovaps %xmm8, {{[0-9]+}}(%rsp) ; CHECK-NEXT: vmovaps %xmm9, (%rsp) diff --git a/llvm/test/CodeGen/X86/pr31088.ll b/llvm/test/CodeGen/X86/pr31088.ll index d7a546c7396d..0dd8eb0ece85 100644 --- a/llvm/test/CodeGen/X86/pr31088.ll +++ b/llvm/test/CodeGen/X86/pr31088.ll @@ -150,12 +150,12 @@ define <2 x half> @ir_fadd_v2f16(<2 x half> %arg0, <2 x half> %arg1) nounwind { ; F16C-NEXT: vcvtph2ps %xmm3, %xmm3 ; F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1 ; F16C-NEXT: vcvtph2ps %xmm1, %xmm1 -; F16C-NEXT: vaddss %xmm3, %xmm1, %xmm1 ; F16C-NEXT: vcvtps2ph $4, %xmm2, %xmm2 ; F16C-NEXT: vcvtph2ps %xmm2, %xmm2 ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-NEXT: vaddss %xmm2, %xmm0, %xmm0 +; F16C-NEXT: vaddss %xmm3, %xmm1, %xmm1 ; F16C-NEXT: retq %retval = fadd <2 x half> %arg0, %arg1 ret <2 x half> %retval diff --git a/llvm/test/CodeGen/X86/rotate.ll b/llvm/test/CodeGen/X86/rotate.ll index 4be3a4c2391b..5d5150ad62d6 100644 --- a/llvm/test/CodeGen/X86/rotate.ll +++ b/llvm/test/CodeGen/X86/rotate.ll @@ -33,8 +33,8 @@ define i64 @rotl64(i64 %A, i8 %Amt) nounwind { ; 32-NEXT: movl %ebx, %esi ; 32-NEXT: xorl %ebx, %ebx ; 32-NEXT: .LBB0_4: -; 32-NEXT: orl %ebx, %edx ; 32-NEXT: orl %esi, %eax +; 32-NEXT: orl %ebx, %edx ; 32-NEXT: popl %esi ; 32-NEXT: popl %edi ; 32-NEXT: popl %ebx @@ -86,8 +86,8 @@ define i64 @rotr64(i64 %A, i8 %Amt) nounwind { ; 32-NEXT: movl %ebx, %esi ; 32-NEXT: xorl %ebx, %ebx ; 32-NEXT: .LBB1_4: -; 32-NEXT: orl %esi, %edx ; 32-NEXT: orl %ebx, %eax +; 32-NEXT: orl %esi, %edx ; 32-NEXT: popl %esi ; 32-NEXT: popl %edi ; 32-NEXT: popl %ebx @@ -546,7 +546,7 @@ define void @rotr1_64_mem(i64* %Aptr) nounwind { ; 32-LABEL: rotr1_64_mem: ; 32: # BB#0: ; 32-NEXT: pushl %esi -; 32-NEXT: movl 
{{[0-9]+}}(%esp), %eax +; 32-NEXT: movl 8(%esp), %eax ; 32-NEXT: movl (%eax), %ecx ; 32-NEXT: movl 4(%eax), %edx ; 32-NEXT: movl %edx, %esi @@ -555,13 +555,11 @@ define void @rotr1_64_mem(i64* %Aptr) nounwind { ; 32-NEXT: movl %ecx, 4(%eax) ; 32-NEXT: movl %esi, (%eax) ; 32-NEXT: popl %esi -; 32-NEXT: retl -; + ; 64-LABEL: rotr1_64_mem: ; 64: # BB#0: ; 64-NEXT: rorq (%rdi) ; 64-NEXT: retq - %A = load i64, i64 *%Aptr %B = shl i64 %A, 63 %C = lshr i64 %A, 1 @@ -573,7 +571,7 @@ define void @rotr1_64_mem(i64* %Aptr) nounwind { define void @rotr1_32_mem(i32* %Aptr) nounwind { ; 32-LABEL: rotr1_32_mem: ; 32: # BB#0: -; 32-NEXT: movl {{[0-9]+}}(%esp), %eax +; 32-NEXT: movl 4(%esp), %eax ; 32-NEXT: rorl (%eax) ; 32-NEXT: retl ; @@ -592,7 +590,7 @@ define void @rotr1_32_mem(i32* %Aptr) nounwind { define void @rotr1_16_mem(i16* %Aptr) nounwind { ; 32-LABEL: rotr1_16_mem: ; 32: # BB#0: -; 32-NEXT: movl {{[0-9]+}}(%esp), %eax +; 32-NEXT: movl 4(%esp), %eax ; 32-NEXT: rorw (%eax) ; 32-NEXT: retl ; @@ -611,7 +609,7 @@ define void @rotr1_16_mem(i16* %Aptr) nounwind { define void @rotr1_8_mem(i8* %Aptr) nounwind { ; 32-LABEL: rotr1_8_mem: ; 32: # BB#0: -; 32-NEXT: movl {{[0-9]+}}(%esp), %eax +; 32-NEXT: movl 4(%esp), %eax ; 32-NEXT: rorb (%eax) ; 32-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/sad.ll b/llvm/test/CodeGen/X86/sad.ll index 6a565a5c76f0..b8a8b8afd14f 100644 --- a/llvm/test/CodeGen/X86/sad.ll +++ b/llvm/test/CodeGen/X86/sad.ll @@ -149,131 +149,127 @@ middle.block: define i32 @sad_32i8() nounwind { ; SSE2-LABEL: sad_32i8: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: pxor %xmm12, %xmm12 +; SSE2-NEXT: pxor %xmm11, %xmm11 ; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00 -; SSE2-NEXT: pxor %xmm13, %xmm13 -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pxor %xmm14, %xmm14 -; SSE2-NEXT: pxor %xmm15, %xmm15 -; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm12, %xmm12 +; SSE2-NEXT: pxor %xmm15, %xmm15 +; SSE2-NEXT: pxor %xmm13, %xmm13 +; SSE2-NEXT: pxor %xmm14, %xmm14 ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB1_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa a+1040(%rax), %xmm8 -; SSE2-NEXT: movdqa a+1024(%rax), %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3],xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] -; SSE2-NEXT: movdqa %xmm4, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm12[8],xmm3[9],xmm12[9],xmm3[10],xmm12[10],xmm3[11],xmm12[11],xmm3[12],xmm12[12],xmm3[13],xmm12[13],xmm3[14],xmm12[14],xmm3[15],xmm12[15] -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7] -; SSE2-NEXT: movdqa %xmm8, %xmm0 
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] -; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm12[8],xmm8[9],xmm12[9],xmm8[10],xmm12[10],xmm8[11],xmm12[11],xmm8[12],xmm12[12],xmm8[13],xmm12[13],xmm8[14],xmm12[14],xmm8[15],xmm12[15] -; SSE2-NEXT: movdqa b+1024(%rax), %xmm11 -; SSE2-NEXT: movdqa %xmm11, %xmm10 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] -; SSE2-NEXT: movdqa %xmm10, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] -; SSE2-NEXT: psubd %xmm2, %xmm7 -; SSE2-NEXT: movdqa b+1040(%rax), %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] -; SSE2-NEXT: psubd %xmm10, %xmm4 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] -; SSE2-NEXT: movdqa %xmm11, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] -; SSE2-NEXT: psubd %xmm2, %xmm1 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] -; SSE2-NEXT: psubd %xmm11, %xmm3 -; SSE2-NEXT: movdqa %xmm6, %xmm10 -; SSE2-NEXT: movdqa %xmm9, %xmm6 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3],xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] -; SSE2-NEXT: movdqa %xmm6, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] -; SSE2-NEXT: psubd %xmm2, %xmm5 -; SSE2-NEXT: movdqa %xmm8, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] -; SSE2-NEXT: psubd %xmm6, %xmm0 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15] -; SSE2-NEXT: movdqa %xmm9, %xmm6 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3] -; SSE2-NEXT: psubd %xmm6, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7] -; SSE2-NEXT: psubd %xmm9, %xmm8 -; SSE2-NEXT: movdqa %xmm7, %xmm6 -; SSE2-NEXT: psrad $31, %xmm6 -; SSE2-NEXT: paddd %xmm6, %xmm7 -; SSE2-NEXT: pxor %xmm6, %xmm7 -; SSE2-NEXT: paddd %xmm7, %xmm13 -; SSE2-NEXT: movdqa %xmm4, %xmm6 -; SSE2-NEXT: psrad $31, %xmm6 -; SSE2-NEXT: paddd %xmm6, %xmm4 -; SSE2-NEXT: pxor %xmm6, %xmm4 -; SSE2-NEXT: movdqa %xmm10, %xmm6 -; SSE2-NEXT: paddd %xmm4, %xmm6 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: paddd %xmm4, %xmm1 -; SSE2-NEXT: pxor 
%xmm4, %xmm1 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload -; SSE2-NEXT: paddd %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm3 -; SSE2-NEXT: pxor %xmm1, %xmm3 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload -; SSE2-NEXT: movdqa %xmm5, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm5 -; SSE2-NEXT: pxor %xmm1, %xmm5 -; SSE2-NEXT: paddd %xmm5, %xmm14 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd %xmm0, %xmm15 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa a+1040(%rax), %xmm6 +; SSE2-NEXT: movdqa a+1024(%rax), %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm8 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3],xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm11[8],xmm3[9],xmm11[9],xmm3[10],xmm11[10],xmm3[11],xmm11[11],xmm3[12],xmm11[12],xmm3[13],xmm11[13],xmm3[14],xmm11[14],xmm3[15],xmm11[15] +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] +; SSE2-NEXT: movdqa %xmm6, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3],xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm11[8],xmm6[9],xmm11[9],xmm6[10],xmm11[10],xmm6[11],xmm11[11],xmm6[12],xmm11[12],xmm6[13],xmm11[13],xmm6[14],xmm11[14],xmm6[15],xmm11[15] +; SSE2-NEXT: movdqa %xmm6, %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] +; SSE2-NEXT: movdqa b+1040(%rax), %xmm9 +; SSE2-NEXT: movdqa %xmm9, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm11[8],xmm9[9],xmm11[9],xmm9[10],xmm11[10],xmm9[11],xmm11[11],xmm9[12],xmm11[12],xmm9[13],xmm11[13],xmm9[14],xmm11[14],xmm9[15],xmm11[15] +; SSE2-NEXT: movdqa %xmm9, %xmm10 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] +; SSE2-NEXT: psubd %xmm9, %xmm6 +; SSE2-NEXT: movdqa b+1024(%rax), %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3],xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; SSE2-NEXT: psubd %xmm10, %xmm7 +; SSE2-NEXT: movdqa %xmm2, %xmm9 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = 
xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] +; SSE2-NEXT: psubd %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm4, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm11[8],xmm4[9],xmm11[9],xmm4[10],xmm11[10],xmm4[11],xmm11[11],xmm4[12],xmm11[12],xmm4[13],xmm11[13],xmm4[14],xmm11[14],xmm4[15],xmm11[15] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] +; SSE2-NEXT: psubd %xmm9, %xmm0 +; SSE2-NEXT: movdqa %xmm4, %xmm9 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7] +; SSE2-NEXT: psubd %xmm4, %xmm3 +; SSE2-NEXT: movdqa %xmm8, %xmm10 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3],xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] +; SSE2-NEXT: psubd %xmm9, %xmm5 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] +; SSE2-NEXT: psubd %xmm2, %xmm8 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3] +; SSE2-NEXT: psubd %xmm4, %xmm10 +; SSE2-NEXT: movdqa %xmm10, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: paddd %xmm2, %xmm10 +; SSE2-NEXT: pxor %xmm2, %xmm10 +; SSE2-NEXT: movdqa %xmm8, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: paddd %xmm2, %xmm8 +; SSE2-NEXT: pxor %xmm2, %xmm8 +; SSE2-NEXT: movdqa %xmm5, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: paddd %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm2, %xmm5 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: paddd %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: paddd %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 ; SSE2-NEXT: paddd %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm8, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm8 -; SSE2-NEXT: pxor %xmm0, %xmm8 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm7, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: paddd %xmm2, %xmm7 +; SSE2-NEXT: pxor %xmm2, %xmm7 +; SSE2-NEXT: movdqa %xmm6, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: paddd %xmm2, %xmm6 +; SSE2-NEXT: pxor %xmm2, %xmm6 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload +; SSE2-NEXT: paddd %xmm6, %xmm14 +; SSE2-NEXT: paddd %xmm7, %xmm13 +; SSE2-NEXT: paddd %xmm1, %xmm15 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm0, %xmm12 ; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload -; SSE2-NEXT: paddd %xmm8, %xmm0 +; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload +; SSE2-NEXT: paddd %xmm5, %xmm2 +; SSE2-NEXT: paddd %xmm8, %xmm3 +; SSE2-NEXT: paddd %xmm10, %xmm0 ; SSE2-NEXT: addq $4, %rax ; SSE2-NEXT: jne .LBB1_1 ; SSE2-NEXT: # BB#2: # %middle.block -; SSE2-NEXT: paddd %xmm15, %xmm6 -; SSE2-NEXT: paddd %xmm0, %xmm3 -; SSE2-NEXT: paddd %xmm6, %xmm3 -; 
SSE2-NEXT: paddd %xmm14, %xmm13 -; SSE2-NEXT: paddd %xmm1, %xmm4 -; SSE2-NEXT: paddd %xmm3, %xmm4 -; SSE2-NEXT: paddd %xmm13, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] -; SSE2-NEXT: paddd %xmm4, %xmm0 +; SSE2-NEXT: paddd %xmm15, %xmm3 +; SSE2-NEXT: paddd %xmm14, %xmm1 +; SSE2-NEXT: paddd %xmm12, %xmm0 +; SSE2-NEXT: paddd %xmm13, %xmm2 +; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: paddd %xmm2, %xmm1 +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax @@ -402,284 +398,288 @@ middle.block: define i32 @sad_avx64i8() nounwind { ; SSE2-LABEL: sad_avx64i8: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: subq $200, %rsp -; SSE2-NEXT: pxor %xmm14, %xmm14 -; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00 +; SSE2-NEXT: subq $184, %rsp ; SSE2-NEXT: pxor %xmm15, %xmm15 -; SSE2-NEXT: pxor %xmm10, %xmm10 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pxor %xmm13, %xmm13 -; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00 +; SSE2-NEXT: pxor %xmm12, %xmm12 ; SSE2-NEXT: pxor %xmm8, %xmm8 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: pxor %xmm14, %xmm14 +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: pxor %xmm6, %xmm6 +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: pxor %xmm11, %xmm11 ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm11, %xmm11 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: pxor %xmm13, %xmm13 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pxor %xmm5, %xmm5 ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB2_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 -; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm8, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm11, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm5, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm13, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm10, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm13, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm11, (%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm7, {{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm15, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movaps a+1040(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) # 16-byte 
Spill -; SSE2-NEXT: movdqa a+1024(%rax), %xmm12 -; SSE2-NEXT: movdqa a+1056(%rax), %xmm15 -; SSE2-NEXT: movdqa a+1072(%rax), %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm6 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15] -; SSE2-NEXT: movdqa %xmm6, %xmm1 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7] -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3] -; SSE2-NEXT: movdqa %xmm15, %xmm11 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm14[8],xmm11[9],xmm14[9],xmm11[10],xmm14[10],xmm11[11],xmm14[11],xmm11[12],xmm14[12],xmm11[13],xmm14[13],xmm11[14],xmm14[14],xmm11[15],xmm14[15] -; SSE2-NEXT: movdqa %xmm11, %xmm8 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] -; SSE2-NEXT: movdqa %xmm15, %xmm0 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; SSE2-NEXT: movdqa %xmm12, %xmm10 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3],xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7] -; SSE2-NEXT: movdqa %xmm10, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; SSE2-NEXT: movdqa %xmm0, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15] -; SSE2-NEXT: movdqa %xmm12, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; SSE2-NEXT: movdqa %xmm0, %xmm13 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] -; SSE2-NEXT: movdqa b+1072(%rax), %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm7 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm14[8],xmm7[9],xmm14[9],xmm7[10],xmm14[10],xmm7[11],xmm14[11],xmm7[12],xmm14[12],xmm7[13],xmm14[13],xmm7[14],xmm14[14],xmm7[15],xmm14[15] -; SSE2-NEXT: movdqa %xmm7, %xmm0 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] -; SSE2-NEXT: psubd %xmm0, %xmm1 -; SSE2-NEXT: movdqa b+1056(%rax), %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = 
xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3] -; SSE2-NEXT: psubd %xmm7, %xmm6 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] -; SSE2-NEXT: movdqa %xmm3, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7] -; SSE2-NEXT: psubd %xmm7, %xmm5 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] -; SSE2-NEXT: psubd %xmm3, %xmm4 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm14[8],xmm3[9],xmm14[9],xmm3[10],xmm14[10],xmm3[11],xmm14[11],xmm3[12],xmm14[12],xmm3[13],xmm14[13],xmm3[14],xmm14[14],xmm3[15],xmm14[15] -; SSE2-NEXT: movdqa %xmm3, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7] -; SSE2-NEXT: psubd %xmm7, %xmm8 -; SSE2-NEXT: movdqa b+1024(%rax), %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] -; SSE2-NEXT: psubd %xmm3, %xmm11 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] -; SSE2-NEXT: psubd %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; SSE2-NEXT: psubd %xmm0, %xmm15 -; SSE2-NEXT: movdqa %xmm7, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] -; SSE2-NEXT: psubd %xmm3, %xmm9 -; SSE2-NEXT: movdqa %xmm9, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: movdqa %xmm2, %xmm9 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3],xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] -; SSE2-NEXT: psubd %xmm0, %xmm10 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm14[8],xmm7[9],xmm14[9],xmm7[10],xmm14[10],xmm7[11],xmm14[11],xmm7[12],xmm14[12],xmm7[13],xmm14[13],xmm7[14],xmm14[14],xmm7[15],xmm14[15] -; SSE2-NEXT: movdqa %xmm7, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; SSE2-NEXT: psubd %xmm0, %xmm13 -; SSE2-NEXT: movdqa %xmm13, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm9, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7] -; SSE2-NEXT: psubd %xmm7, %xmm12 -; SSE2-NEXT: movdqa b+1040(%rax), %xmm13 -; SSE2-NEXT: movdqa %xmm13, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] -; SSE2-NEXT: 
movdqa %xmm3, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3] -; SSE2-NEXT: psubd %xmm7, %xmm0 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] -; SSE2-NEXT: psubd %xmm3, %xmm9 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm14[8],xmm2[9],xmm14[9],xmm2[10],xmm14[10],xmm2[11],xmm14[11],xmm2[12],xmm14[12],xmm2[13],xmm14[13],xmm2[14],xmm14[14],xmm2[15],xmm14[15] -; SSE2-NEXT: movdqa %xmm2, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm14[8],xmm13[9],xmm14[9],xmm13[10],xmm14[10],xmm13[11],xmm14[11],xmm13[12],xmm14[12],xmm13[13],xmm14[13],xmm13[14],xmm14[14],xmm13[15],xmm14[15] -; SSE2-NEXT: movdqa %xmm13, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] -; SSE2-NEXT: psubd %xmm3, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] -; SSE2-NEXT: psubd %xmm13, %xmm2 -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: psrad $31, %xmm3 -; SSE2-NEXT: paddd %xmm3, %xmm1 -; SSE2-NEXT: pxor %xmm3, %xmm1 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload -; SSE2-NEXT: paddd %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm6, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm6 -; SSE2-NEXT: pxor %xmm1, %xmm6 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd %xmm6, %xmm1 -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload -; SSE2-NEXT: movdqa %xmm5, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm5 -; SSE2-NEXT: pxor %xmm1, %xmm5 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm4 -; SSE2-NEXT: pxor %xmm1, %xmm4 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm13 # 16-byte Reload -; SSE2-NEXT: movdqa %xmm8, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm8 -; SSE2-NEXT: pxor %xmm1, %xmm8 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd %xmm8, %xmm1 -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm14, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm6, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm8, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm12, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa a+1040(%rax), %xmm6 +; SSE2-NEXT: movdqa 
a+1024(%rax), %xmm4 +; SSE2-NEXT: movdqa a+1056(%rax), %xmm11 +; SSE2-NEXT: movdqa a+1072(%rax), %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7] ; SSE2-NEXT: movdqa %xmm11, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm11 -; SSE2-NEXT: pxor %xmm1, %xmm11 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd %xmm11, %xmm1 -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: movdqa (%rsp), %xmm4 # 16-byte Reload -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm4 -; SSE2-NEXT: pxor %xmm1, %xmm4 -; SSE2-NEXT: paddd %xmm4, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm11 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload -; SSE2-NEXT: movdqa %xmm15, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm15 -; SSE2-NEXT: pxor %xmm1, %xmm15 -; SSE2-NEXT: paddd %xmm15, %xmm2 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm4 -; SSE2-NEXT: pxor %xmm1, %xmm4 -; SSE2-NEXT: paddd %xmm4, %xmm6 -; SSE2-NEXT: movdqa %xmm6, %xmm15 -; SSE2-NEXT: movdqa %xmm10, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm10 -; SSE2-NEXT: pxor %xmm1, %xmm10 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd %xmm10, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm10 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload -; SSE2-NEXT: movdqa %xmm6, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm6 -; SSE2-NEXT: pxor %xmm1, %xmm6 -; SSE2-NEXT: paddd %xmm6, %xmm3 -; SSE2-NEXT: movdqa %xmm12, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm12 -; SSE2-NEXT: pxor %xmm1, %xmm12 -; SSE2-NEXT: paddd %xmm12, %xmm5 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd %xmm0, %xmm13 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm15[8],xmm1[9],xmm15[9],xmm1[10],xmm15[10],xmm1[11],xmm15[11],xmm1[12],xmm15[12],xmm1[13],xmm15[13],xmm1[14],xmm15[14],xmm1[15],xmm15[15] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1],xmm11[2],xmm15[2],xmm11[3],xmm15[3],xmm11[4],xmm15[4],xmm11[5],xmm15[5],xmm11[6],xmm15[6],xmm11[7],xmm15[7] +; SSE2-NEXT: movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1],xmm11[2],xmm15[2],xmm11[3],xmm15[3] +; SSE2-NEXT: movdqa %xmm4, %xmm12 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3],xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] +; SSE2-NEXT: movdqa %xmm12, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = 
xmm4[8],xmm15[8],xmm4[9],xmm15[9],xmm4[10],xmm15[10],xmm4[11],xmm15[11],xmm4[12],xmm15[12],xmm4[13],xmm15[13],xmm4[14],xmm15[14],xmm4[15],xmm15[15] +; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm15[4],xmm4[5],xmm15[5],xmm4[6],xmm15[6],xmm4[7],xmm15[7] +; SSE2-NEXT: movdqa %xmm6, %xmm14 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3],xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] +; SSE2-NEXT: movdqa %xmm14, %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm15[8],xmm6[9],xmm15[9],xmm6[10],xmm15[10],xmm6[11],xmm15[11],xmm6[12],xmm15[12],xmm6[13],xmm15[13],xmm6[14],xmm15[14],xmm6[15],xmm15[15] +; SSE2-NEXT: movdqa %xmm6, %xmm8 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7] +; SSE2-NEXT: movdqa b+1040(%rax), %xmm9 +; SSE2-NEXT: movdqa %xmm9, %xmm13 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm15[8],xmm9[9],xmm15[9],xmm9[10],xmm15[10],xmm9[11],xmm15[11],xmm9[12],xmm15[12],xmm9[13],xmm15[13],xmm9[14],xmm15[14],xmm9[15],xmm15[15] +; SSE2-NEXT: movdqa %xmm9, %xmm10 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7] +; SSE2-NEXT: psubd %xmm9, %xmm6 +; SSE2-NEXT: movdqa b+1024(%rax), %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3],xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3] +; SSE2-NEXT: psubd %xmm10, %xmm8 +; SSE2-NEXT: movdqa %xmm13, %xmm9 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] +; SSE2-NEXT: psubd %xmm13, %xmm14 +; SSE2-NEXT: movdqa %xmm2, %xmm10 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3] +; SSE2-NEXT: psubd %xmm9, %xmm7 +; SSE2-NEXT: movdqa %xmm2, %xmm9 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] +; SSE2-NEXT: psubd %xmm2, %xmm4 +; SSE2-NEXT: movdqa b+1056(%rax), %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3],xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3] +; SSE2-NEXT: psubd %xmm9, %xmm0 +; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm10, %xmm9 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7] +; SSE2-NEXT: psubd %xmm10, %xmm12 +; SSE2-NEXT: movdqa %xmm2, %xmm10 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3] +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: psubd %xmm9, %xmm0 +; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm2, %xmm9 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3] +; SSE2-NEXT: psubd %xmm2, %xmm11 +; SSE2-NEXT: movdqa %xmm1, %xmm13 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm15[8],xmm10[9],xmm15[9],xmm10[10],xmm15[10],xmm10[11],xmm15[11],xmm10[12],xmm15[12],xmm10[13],xmm15[13],xmm10[14],xmm15[14],xmm10[15],xmm15[15] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7] +; SSE2-NEXT: psubd %xmm9, %xmm0 +; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm10, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3] +; SSE2-NEXT: psubd %xmm10, %xmm1 +; SSE2-NEXT: movdqa %xmm3, %xmm10 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] +; SSE2-NEXT: psubd %xmm2, %xmm13 +; SSE2-NEXT: movdqa b+1072(%rax), %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] +; SSE2-NEXT: movdqa %xmm2, %xmm9 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3] +; SSE2-NEXT: psubd %xmm2, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7] +; SSE2-NEXT: psubd %xmm9, %xmm10 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm15[8],xmm5[9],xmm15[9],xmm5[10],xmm15[10],xmm5[11],xmm15[11],xmm5[12],xmm15[12],xmm5[13],xmm15[13],xmm5[14],xmm15[14],xmm5[15],xmm15[15] +; SSE2-NEXT: movdqa %xmm5, %xmm9 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; SSE2-NEXT: psubd %xmm0, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] +; SSE2-NEXT: psubd %xmm2, %xmm9 ; SSE2-NEXT: movdqa %xmm9, %xmm0 ; SSE2-NEXT: psrad $31, 
%xmm0 ; SSE2-NEXT: paddd %xmm0, %xmm9 ; SSE2-NEXT: pxor %xmm0, %xmm9 -; SSE2-NEXT: paddd %xmm9, %xmm1 +; SSE2-NEXT: movdqa %xmm5, %xmm0 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm5 +; SSE2-NEXT: pxor %xmm0, %xmm5 +; SSE2-NEXT: movdqa %xmm10, %xmm0 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm10 +; SSE2-NEXT: pxor %xmm0, %xmm10 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm13, %xmm0 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm13 +; SSE2-NEXT: pxor %xmm0, %xmm13 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm1 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm2 +; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm11, %xmm0 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm11 +; SSE2-NEXT: pxor %xmm0, %xmm11 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm2 +; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm12, %xmm0 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm12 +; SSE2-NEXT: pxor %xmm0, %xmm12 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm2 +; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm4 +; SSE2-NEXT: pxor %xmm0, %xmm4 ; SSE2-NEXT: movdqa %xmm7, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: paddd %xmm0, %xmm7 ; SSE2-NEXT: pxor %xmm0, %xmm7 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload -; SSE2-NEXT: paddd %xmm7, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Reload -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload -; SSE2-NEXT: movdqa %xmm7, %xmm0 +; SSE2-NEXT: movdqa %xmm14, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm7 -; SSE2-NEXT: pxor %xmm0, %xmm7 +; SSE2-NEXT: paddd %xmm0, %xmm14 +; SSE2-NEXT: pxor %xmm0, %xmm14 +; SSE2-NEXT: movdqa %xmm8, %xmm0 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm8 +; SSE2-NEXT: pxor %xmm0, %xmm8 +; SSE2-NEXT: movdqa %xmm6, %xmm0 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm6 +; SSE2-NEXT: pxor %xmm0, %xmm6 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload -; SSE2-NEXT: paddd %xmm7, %xmm0 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload +; SSE2-NEXT: paddd %xmm6, %xmm2 +; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload +; SSE2-NEXT: paddd %xmm8, %xmm6 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 # 16-byte Reload +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload +; SSE2-NEXT: paddd %xmm14, %xmm2 +; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload +; SSE2-NEXT: paddd %xmm7, %xmm2 +; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill +; 
SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm14 # 16-byte Reload +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload +; SSE2-NEXT: paddd %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Folded Reload +; SSE2-NEXT: paddd %xmm12, %xmm8 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload +; SSE2-NEXT: movdqa %xmm0, %xmm12 +; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: paddd %xmm11, %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa (%rsp), %xmm11 # 16-byte Reload +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload +; SSE2-NEXT: paddd %xmm1, %xmm2 +; SSE2-NEXT: paddd %xmm13, %xmm7 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm13 # 16-byte Reload +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm10, %xmm1 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload +; SSE2-NEXT: paddd %xmm5, %xmm3 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload +; SSE2-NEXT: paddd %xmm9, %xmm5 +; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload ; SSE2-NEXT: addq $4, %rax ; SSE2-NEXT: jne .LBB2_1 ; SSE2-NEXT: # BB#2: # %middle.block -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Folded Reload -; SSE2-NEXT: paddd %xmm3, %xmm8 -; SSE2-NEXT: paddd %xmm2, %xmm15 -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm13 # 16-byte Folded Reload -; SSE2-NEXT: paddd %xmm8, %xmm13 -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Folded Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload -; SSE2-NEXT: paddd %xmm5, %xmm0 -; SSE2-NEXT: paddd %xmm11, %xmm10 -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: paddd %xmm10, %xmm1 -; SSE2-NEXT: paddd %xmm13, %xmm1 -; SSE2-NEXT: paddd %xmm15, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: paddd %xmm2, %xmm4 +; SSE2-NEXT: paddd %xmm3, %xmm6 +; SSE2-NEXT: movdqa %xmm12, %xmm2 +; SSE2-NEXT: paddd %xmm11, %xmm2 +; SSE2-NEXT: paddd %xmm13, %xmm14 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload +; SSE2-NEXT: paddd %xmm7, %xmm3 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload +; SSE2-NEXT: paddd %xmm5, %xmm7 +; SSE2-NEXT: paddd %xmm0, %xmm8 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload ; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: paddd %xmm3, %xmm7 +; SSE2-NEXT: paddd %xmm4, %xmm6 +; SSE2-NEXT: paddd %xmm14, %xmm6 +; SSE2-NEXT: paddd %xmm0, %xmm7 +; SSE2-NEXT: paddd %xmm8, %xmm7 +; SSE2-NEXT: paddd %xmm6, %xmm7 +; SSE2-NEXT: paddd %xmm2, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,0,1] +; SSE2-NEXT: paddd %xmm7, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: addq $200, %rsp +; SSE2-NEXT: addq $184, %rsp ; SSE2-NEXT: retq ; ; AVX2-LABEL: sad_avx64i8: @@ -688,8 +688,8 @@ define i32 
@sad_avx64i8() nounwind { ; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00 ; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4 ; AVX2-NEXT: vpxor %ymm3, %ymm3, %ymm3 +; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4 ; AVX2-NEXT: vpxor %ymm6, %ymm6, %ymm6 ; AVX2-NEXT: vpxor %ymm5, %ymm5, %ymm5 ; AVX2-NEXT: vpxor %ymm7, %ymm7, %ymm7 @@ -697,6 +697,7 @@ define i32 @sad_avx64i8() nounwind { ; AVX2-NEXT: .LBB2_1: # %vector.body ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vmovdqu %ymm8, -{{[0-9]+}}(%rsp) # 32-byte Spill ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero @@ -704,49 +705,48 @@ define i32 @sad_avx64i8() nounwind { ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vmovdqu %ymm15, -{{[0-9]+}}(%rsp) # 32-byte Spill -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm15, %ymm8, %ymm8 -; AVX2-NEXT: vmovdqu %ymm8, -{{[0-9]+}}(%rsp) # 32-byte Spill -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm15, %ymm9, %ymm9 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm15, %ymm10, %ymm10 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm15, %ymm11, %ymm11 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; 
AVX2-NEXT: vpsubd %ymm15, %ymm12, %ymm12 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm15, %ymm13, %ymm13 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpsubd %ymm8, %ymm15, %ymm8 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpsubd %ymm15, %ymm14, %ymm14 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vmovdqu -{{[0-9]+}}(%rsp), %ymm8 # 32-byte Reload -; AVX2-NEXT: vpsubd %ymm15, %ymm8, %ymm15 -; AVX2-NEXT: vpabsd -{{[0-9]+}}(%rsp), %ymm8 # 32-byte Folded Reload -; AVX2-NEXT: vpaddd %ymm7, %ymm8, %ymm7 -; AVX2-NEXT: vpabsd %ymm9, %ymm8 -; AVX2-NEXT: vpaddd %ymm5, %ymm8, %ymm5 -; AVX2-NEXT: vpabsd %ymm10, %ymm8 -; AVX2-NEXT: vpaddd %ymm6, %ymm8, %ymm6 -; AVX2-NEXT: vpabsd %ymm11, %ymm8 +; AVX2-NEXT: vpsubd %ymm15, %ymm13, %ymm13 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpsubd %ymm15, %ymm12, %ymm12 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpsubd %ymm15, %ymm11, %ymm11 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpsubd %ymm15, %ymm10, %ymm10 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpsubd %ymm15, %ymm9, %ymm9 +; AVX2-NEXT: vmovdqu %ymm9, -{{[0-9]+}}(%rsp) # 32-byte Spill +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vmovdqu -{{[0-9]+}}(%rsp), %ymm9 # 32-byte Reload +; AVX2-NEXT: vpsubd %ymm15, %ymm9, %ymm15 +; AVX2-NEXT: vpabsd %ymm8, %ymm8 ; AVX2-NEXT: vpaddd %ymm3, %ymm8, %ymm3 -; AVX2-NEXT: vpabsd %ymm12, %ymm8 -; AVX2-NEXT: vpaddd %ymm0, %ymm8, %ymm0 -; AVX2-NEXT: vpabsd %ymm13, %ymm8 -; AVX2-NEXT: vpaddd %ymm2, %ymm8, %ymm2 ; AVX2-NEXT: vpabsd %ymm14, %ymm8 ; AVX2-NEXT: vpaddd %ymm1, %ymm8, %ymm1 -; AVX2-NEXT: vpabsd %ymm15, %ymm8 +; AVX2-NEXT: vpabsd %ymm13, %ymm8 +; AVX2-NEXT: vpaddd %ymm2, %ymm8, %ymm2 +; AVX2-NEXT: vpabsd %ymm12, %ymm8 +; AVX2-NEXT: vpaddd %ymm0, %ymm8, %ymm0 +; AVX2-NEXT: vpabsd %ymm11, %ymm8 ; AVX2-NEXT: vpaddd %ymm4, %ymm8, %ymm4 +; AVX2-NEXT: vpabsd %ymm10, %ymm8 +; AVX2-NEXT: vpaddd %ymm6, %ymm8, %ymm6 +; 
AVX2-NEXT: vpabsd -{{[0-9]+}}(%rsp), %ymm8 # 32-byte Folded Reload +; AVX2-NEXT: vpaddd %ymm5, %ymm8, %ymm5 +; AVX2-NEXT: vpabsd %ymm15, %ymm8 +; AVX2-NEXT: vpaddd %ymm7, %ymm8, %ymm7 ; AVX2-NEXT: addq $4, %rax ; AVX2-NEXT: jne .LBB2_1 ; AVX2-NEXT: # BB#2: # %middle.block ; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vpaddd %ymm7, %ymm4, %ymm4 -; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vpaddd %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm7, %ymm3, %ymm3 +; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1 +; AVX2-NEXT: vpaddd %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -773,21 +773,21 @@ define i32 @sad_avx64i8() nounwind { ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpsubd %zmm11, %zmm7, %zmm7 +; AVX512F-NEXT: vpsubd %zmm10, %zmm6, %zmm6 +; AVX512F-NEXT: vpsubd %zmm9, %zmm5, %zmm5 ; AVX512F-NEXT: vpsubd %zmm8, %zmm4, %zmm4 -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpsubd %zmm8, %zmm5, %zmm5 -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpsubd %zmm8, %zmm6, %zmm6 -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpsubd %zmm8, %zmm7, %zmm7 ; AVX512F-NEXT: vpabsd %zmm4, %zmm4 +; AVX512F-NEXT: vpabsd %zmm5, %zmm5 +; AVX512F-NEXT: vpabsd %zmm6, %zmm6 +; AVX512F-NEXT: vpabsd %zmm7, %zmm7 +; AVX512F-NEXT: vpaddd %zmm3, %zmm7, %zmm3 +; AVX512F-NEXT: vpaddd %zmm2, %zmm6, %zmm2 +; AVX512F-NEXT: vpaddd %zmm1, %zmm5, %zmm1 ; AVX512F-NEXT: vpaddd %zmm0, %zmm4, %zmm0 -; AVX512F-NEXT: vpabsd %zmm5, %zmm4 -; AVX512F-NEXT: vpaddd %zmm1, %zmm4, %zmm1 -; AVX512F-NEXT: vpabsd %zmm6, %zmm4 -; AVX512F-NEXT: vpaddd %zmm2, %zmm4, %zmm2 -; AVX512F-NEXT: vpabsd %zmm7, %zmm4 -; AVX512F-NEXT: vpaddd %zmm3, %zmm4, %zmm3 ; AVX512F-NEXT: addq $4, %rax ; AVX512F-NEXT: jne .LBB2_1 ; AVX512F-NEXT: # BB#2: # %middle.block @@ -1154,54 +1154,59 @@ define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* n ; SSE2-LABEL: sad_nonloop_32i8: ; SSE2: # BB#0: ; SSE2-NEXT: movdqu (%rdi), %xmm0 -; SSE2-NEXT: movdqu 16(%rdi), %xmm12 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: movdqa %xmm12, %xmm8 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] -; SSE2-NEXT: movdqa %xmm8, %xmm10 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] -; SSE2-NEXT: movdqa %xmm0, %xmm9 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3],xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] -; SSE2-NEXT: movdqa %xmm9, %xmm11 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm1[8],xmm12[9],xmm1[9],xmm12[10],xmm1[10],xmm12[11],xmm1[11],xmm12[12],xmm1[12],xmm12[13],xmm1[13],xmm12[14],xmm1[14],xmm12[15],xmm1[15] -; SSE2-NEXT: movdqa %xmm12, %xmm13 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = 
xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: movdqu (%rdx), %xmm7 -; SSE2-NEXT: movdqu 16(%rdx), %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm6 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3],xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] -; SSE2-NEXT: movdqa %xmm6, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; SSE2-NEXT: psubd %xmm5, %xmm10 -; SSE2-NEXT: movdqa %xmm7, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; SSE2-NEXT: psubd %xmm5, %xmm11 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; SSE2-NEXT: psubd %xmm5, %xmm13 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15] -; SSE2-NEXT: movdqa %xmm7, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; SSE2-NEXT: psubd %xmm5, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; SSE2-NEXT: psubd %xmm6, %xmm8 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE2-NEXT: psubd %xmm2, %xmm9 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE2-NEXT: psubd %xmm3, %xmm12 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] -; SSE2-NEXT: psubd %xmm7, %xmm0 +; SSE2-NEXT: movdqu 16(%rdi), %xmm3 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: movdqa %xmm3, %xmm12 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3],xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm12, %xmm9 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm0, %xmm13 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3],xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm13, %xmm10 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm3, %xmm11 +; 
SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE2-NEXT: movdqu (%rdx), %xmm5 +; SSE2-NEXT: movdqu 16(%rdx), %xmm7 +; SSE2-NEXT: movdqa %xmm7, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm5, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm2, %xmm14 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm7, %xmm15 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm4[4],xmm15[5],xmm4[5],xmm15[6],xmm4[6],xmm15[7],xmm4[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm5, %xmm8 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE2-NEXT: psubd %xmm5, %xmm0 +; SSE2-NEXT: psubd %xmm7, %xmm3 +; SSE2-NEXT: psubd %xmm2, %xmm13 +; SSE2-NEXT: psubd %xmm1, %xmm12 +; SSE2-NEXT: psubd %xmm8, %xmm6 +; SSE2-NEXT: psubd %xmm15, %xmm11 +; SSE2-NEXT: psubd %xmm14, %xmm10 +; SSE2-NEXT: psubd -{{[0-9]+}}(%rsp), %xmm9 # 16-byte Folded Reload +; SSE2-NEXT: movdqa %xmm9, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm9 +; SSE2-NEXT: pxor %xmm1, %xmm9 ; SSE2-NEXT: movdqa %xmm10, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm10 @@ -1210,37 +1215,33 @@ define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* n ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm11 ; SSE2-NEXT: pxor %xmm1, %xmm11 -; SSE2-NEXT: movdqa %xmm13, %xmm1 +; SSE2-NEXT: movdqa %xmm6, %xmm1 ; SSE2-NEXT: psrad $31, 
%xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm13 -; SSE2-NEXT: pxor %xmm1, %xmm13 -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm4 -; SSE2-NEXT: pxor %xmm1, %xmm4 -; SSE2-NEXT: paddd %xmm13, %xmm4 -; SSE2-NEXT: paddd %xmm10, %xmm4 -; SSE2-NEXT: paddd %xmm11, %xmm4 -; SSE2-NEXT: movdqa %xmm8, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm8 -; SSE2-NEXT: pxor %xmm1, %xmm8 -; SSE2-NEXT: movdqa %xmm9, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm9 -; SSE2-NEXT: pxor %xmm1, %xmm9 +; SSE2-NEXT: paddd %xmm1, %xmm6 +; SSE2-NEXT: pxor %xmm1, %xmm6 ; SSE2-NEXT: movdqa %xmm12, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm12 ; SSE2-NEXT: pxor %xmm1, %xmm12 +; SSE2-NEXT: movdqa %xmm13, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm13 +; SSE2-NEXT: pxor %xmm1, %xmm13 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm3 +; SSE2-NEXT: pxor %xmm1, %xmm3 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: paddd %xmm3, %xmm0 +; SSE2-NEXT: paddd %xmm11, %xmm6 +; SSE2-NEXT: paddd %xmm9, %xmm6 +; SSE2-NEXT: paddd %xmm10, %xmm6 ; SSE2-NEXT: paddd %xmm12, %xmm0 -; SSE2-NEXT: paddd %xmm8, %xmm0 -; SSE2-NEXT: paddd %xmm4, %xmm0 -; SSE2-NEXT: paddd %xmm9, %xmm0 +; SSE2-NEXT: paddd %xmm6, %xmm0 +; SSE2-NEXT: paddd %xmm13, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] diff --git a/llvm/test/CodeGen/X86/select.ll b/llvm/test/CodeGen/X86/select.ll index 1afef86a5f11..ce42d0d643e8 100644 --- a/llvm/test/CodeGen/X86/select.ll +++ b/llvm/test/CodeGen/X86/select.ll @@ -299,21 +299,20 @@ define void @test8(i1 %c, <6 x i32>* %dst.addr, <6 x i32> %src1,<6 x i32> %src2) ; GENERIC-NEXT: testb %dil, %dil ; GENERIC-NEXT: jne LBB7_4 ; GENERIC-NEXT: ## BB#5: -; GENERIC-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; GENERIC-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; GENERIC-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; GENERIC-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; GENERIC-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero ; GENERIC-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; GENERIC-NEXT: jmp LBB7_6 ; GENERIC-NEXT: LBB7_4: -; GENERIC-NEXT: movd %r9d, %xmm1 -; GENERIC-NEXT: movd %ecx, %xmm2 -; GENERIC-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; GENERIC-NEXT: movd %r8d, %xmm3 +; GENERIC-NEXT: movd %r9d, %xmm2 +; GENERIC-NEXT: movd %ecx, %xmm3 +; GENERIC-NEXT: movd %r8d, %xmm4 ; GENERIC-NEXT: movd %edx, %xmm1 ; GENERIC-NEXT: LBB7_6: +; GENERIC-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; GENERIC-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; GENERIC-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; GENERIC-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; GENERIC-NEXT: psubd {{.*}}(%rip), %xmm1 ; GENERIC-NEXT: psubd {{.*}}(%rip), %xmm0 ; GENERIC-NEXT: movq %xmm0, 16(%rsi) @@ -340,19 +339,16 @@ define void @test8(i1 %c, <6 x i32>* %dst.addr, <6 x i32> %src1,<6 x i32> %src2) ; ATOM-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero ; ATOM-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero ; ATOM-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; ATOM-NEXT: jmp LBB7_6 +; ATOM-NEXT: LBB7_4: +; ATOM-NEXT: movd %r9d, %xmm2 +; ATOM-NEXT: movd %ecx, %xmm3 
+; ATOM-NEXT: movd %r8d, %xmm4 +; ATOM-NEXT: movd %edx, %xmm1 +; ATOM-NEXT: LBB7_6: ; ATOM-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; ATOM-NEXT: jmp LBB7_6 -; ATOM-NEXT: LBB7_4: -; ATOM-NEXT: movd %r9d, %xmm1 -; ATOM-NEXT: movd %ecx, %xmm2 -; ATOM-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; ATOM-NEXT: movd %r8d, %xmm3 -; ATOM-NEXT: movd %edx, %xmm1 -; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; ATOM-NEXT: LBB7_6: ; ATOM-NEXT: psubd {{.*}}(%rip), %xmm0 ; ATOM-NEXT: psubd {{.*}}(%rip), %xmm1 ; ATOM-NEXT: movq %xmm0, 16(%rsi) diff --git a/llvm/test/CodeGen/X86/setcc-wide-types.ll b/llvm/test/CodeGen/X86/setcc-wide-types.ll index 332bf2887fb0..2996edaec3e0 100644 --- a/llvm/test/CodeGen/X86/setcc-wide-types.ll +++ b/llvm/test/CodeGen/X86/setcc-wide-types.ll @@ -58,25 +58,25 @@ define i32 @ne_i256(<4 x i64> %x, <4 x i64> %y) { ; SSE2-LABEL: ne_i256: ; SSE2: # BB#0: ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] -; SSE2-NEXT: movq %xmm4, %rax +; SSE2-NEXT: movq %xmm4, %r8 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] -; SSE2-NEXT: movq %xmm4, %rcx -; SSE2-NEXT: movq %xmm0, %rdx -; SSE2-NEXT: movq %xmm1, %r8 +; SSE2-NEXT: movq %xmm4, %r9 +; SSE2-NEXT: movq %xmm0, %r10 +; SSE2-NEXT: movq %xmm1, %rsi ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] ; SSE2-NEXT: movq %xmm0, %rdi -; SSE2-NEXT: xorq %rax, %rdi ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1] -; SSE2-NEXT: movq %xmm0, %rsi -; SSE2-NEXT: xorq %rcx, %rsi -; SSE2-NEXT: orq %rdi, %rsi -; SSE2-NEXT: movq %xmm2, %rax -; SSE2-NEXT: xorq %rdx, %rax -; SSE2-NEXT: movq %xmm3, %rcx -; SSE2-NEXT: xorq %r8, %rcx -; SSE2-NEXT: orq %rax, %rcx +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: movq %xmm2, %rcx +; SSE2-NEXT: movq %xmm3, %rdx +; SSE2-NEXT: xorq %rsi, %rdx +; SSE2-NEXT: xorq %r10, %rcx +; SSE2-NEXT: orq %rdx, %rcx +; SSE2-NEXT: xorq %r9, %rax +; SSE2-NEXT: xorq %r8, %rdi +; SSE2-NEXT: orq %rax, %rdi ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: orq %rsi, %rcx +; SSE2-NEXT: orq %rcx, %rdi ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq ; @@ -100,25 +100,25 @@ define i32 @eq_i256(<4 x i64> %x, <4 x i64> %y) { ; SSE2-LABEL: eq_i256: ; SSE2: # BB#0: ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] -; SSE2-NEXT: movq %xmm4, %rax +; SSE2-NEXT: movq %xmm4, %r8 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] -; SSE2-NEXT: movq %xmm4, %rcx -; SSE2-NEXT: movq %xmm0, %rdx -; SSE2-NEXT: movq %xmm1, %r8 +; SSE2-NEXT: movq %xmm4, %r9 +; SSE2-NEXT: movq %xmm0, %r10 +; SSE2-NEXT: movq %xmm1, %rsi ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] ; SSE2-NEXT: movq %xmm0, %rdi -; SSE2-NEXT: xorq %rax, %rdi ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1] -; SSE2-NEXT: movq %xmm0, %rsi -; SSE2-NEXT: xorq %rcx, %rsi -; SSE2-NEXT: orq %rdi, %rsi -; SSE2-NEXT: movq %xmm2, %rax -; SSE2-NEXT: xorq %rdx, %rax -; SSE2-NEXT: movq %xmm3, %rcx -; SSE2-NEXT: xorq %r8, %rcx -; SSE2-NEXT: orq %rax, %rcx +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: movq %xmm2, %rcx +; SSE2-NEXT: movq %xmm3, %rdx +; SSE2-NEXT: xorq %rsi, %rdx +; SSE2-NEXT: xorq %r10, %rcx +; SSE2-NEXT: orq %rdx, %rcx +; SSE2-NEXT: xorq %r9, %rax +; SSE2-NEXT: xorq %r8, %rdi +; SSE2-NEXT: orq %rax, %rdi ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: orq %rsi, %rcx +; SSE2-NEXT: orq %rcx, %rdi ; SSE2-NEXT: sete %al ; SSE2-NEXT: 
retq ; diff --git a/llvm/test/CodeGen/X86/shrink_vmul_sse.ll b/llvm/test/CodeGen/X86/shrink_vmul_sse.ll index 6701c247e6fc..c869dff9e642 100644 --- a/llvm/test/CodeGen/X86/shrink_vmul_sse.ll +++ b/llvm/test/CodeGen/X86/shrink_vmul_sse.ll @@ -20,9 +20,9 @@ define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 ; CHECK-NEXT: movzbl 1(%edx,%ecx), %edi ; CHECK-NEXT: movzbl (%edx,%ecx), %edx ; CHECK-NEXT: movzbl 1(%eax,%ecx), %ebx -; CHECK-NEXT: imull %edi, %ebx ; CHECK-NEXT: movzbl (%eax,%ecx), %eax ; CHECK-NEXT: imull %edx, %eax +; CHECK-NEXT: imull %edi, %ebx ; CHECK-NEXT: movl %ebx, 4(%esi,%ecx,4) ; CHECK-NEXT: movl %eax, (%esi,%ecx,4) ; CHECK-NEXT: popl %esi diff --git a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll index d99cfaf535de..0b03dffe99b5 100644 --- a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll @@ -1537,9 +1537,9 @@ define <4 x float> @test_mm_set_ps(float %a0, float %a1, float %a2, float %a3) n ; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X32-NEXT: retl ; @@ -1673,13 +1673,13 @@ define void @test_mm_setcsr(i32 %a0) nounwind { define <4 x float> @test_mm_setr_ps(float %a0, float %a1, float %a2, float %a3) nounwind { ; X32-LABEL: test_mm_setr_ps: ; X32: # BB#0: -; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero ; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; X32-NEXT: retl ; ; X64-LABEL: test_mm_setr_ps: diff --git a/llvm/test/CodeGen/X86/sse1.ll b/llvm/test/CodeGen/X86/sse1.ll index 68ab3f9f3205..f4964b5a6f66 100644 --- a/llvm/test/CodeGen/X86/sse1.ll +++ b/llvm/test/CodeGen/X86/sse1.ll @@ -66,8 +66,8 @@ define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) { ; X32-NEXT: jne .LBB1_8 ; X32-NEXT: .LBB1_7: ; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; X32-NEXT: je .LBB1_10 ; X32-NEXT: jmp .LBB1_11 ; X32-NEXT: .LBB1_1: @@ -80,8 +80,8 @@ define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) { ; X32-NEXT: je .LBB1_7 ; X32-NEXT: .LBB1_8: # %entry ; X32-NEXT: xorps %xmm3, %xmm3 -; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; X32-NEXT: jne .LBB1_11 ; X32-NEXT: .LBB1_10: ; X32-NEXT: movss 
{{.*#+}} xmm0 = mem[0],zero,zero,zero @@ -105,8 +105,8 @@ define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) { ; X64-NEXT: jne .LBB1_8 ; X64-NEXT: .LBB1_7: ; X64-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; X64-NEXT: testl %esi, %esi +; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; X64-NEXT: je .LBB1_10 ; X64-NEXT: jmp .LBB1_11 ; X64-NEXT: .LBB1_1: @@ -119,8 +119,8 @@ define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) { ; X64-NEXT: je .LBB1_7 ; X64-NEXT: .LBB1_8: # %entry ; X64-NEXT: xorps %xmm3, %xmm3 -; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; X64-NEXT: testl %esi, %esi +; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; X64-NEXT: jne .LBB1_11 ; X64-NEXT: .LBB1_10: ; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero diff --git a/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll b/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll index aed5e0d1c32e..4d895ea264c5 100644 --- a/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll +++ b/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll @@ -412,14 +412,14 @@ define <4 x float> @test16(<4 x float> %A, <4 x float> %B) { ; SSE-NEXT: movaps %xmm1, %xmm4 ; SSE-NEXT: movhlps {{.*#+}} xmm4 = xmm4[1,1] ; SSE-NEXT: subss %xmm4, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE-NEXT: addss %xmm0, %xmm3 +; SSE-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE-NEXT: addss %xmm0, %xmm4 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3] ; SSE-NEXT: addss %xmm0, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] ; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: retq ; @@ -431,12 +431,12 @@ define <4 x float> @test16(<4 x float> %A, <4 x float> %B) { ; AVX-NEXT: vsubss %xmm4, %xmm3, %xmm3 ; AVX-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] ; AVX-NEXT: vaddss %xmm0, %xmm4, %xmm4 -; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3] -; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] ; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] +; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm4[0],xmm2[2,3] +; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX-NEXT: retq %1 = extractelement <4 x float> %A, i32 0 %2 = extractelement <4 x float> %B, i32 0 diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll index 4a0dc9c1eb17..503b9416c8d3 100644 --- a/llvm/test/CodeGen/X86/sse41.ll +++ b/llvm/test/CodeGen/X86/sse41.ll @@ -273,8 +273,8 @@ define <2 x float> @buildvector(<2 x float> %A, <2 x float> %B) nounwind { ; X32: ## BB#0: ## %entry ; X32-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; X32-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] -; X32-NEXT: addss %xmm2, %xmm3 ; X32-NEXT: addss %xmm1, %xmm0 +; X32-NEXT: addss %xmm2, %xmm3 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] ; X32-NEXT: retl ; @@ -282,8 +282,8 @@ define <2 x float> @buildvector(<2 x float> %A, <2 x float> %B) nounwind { ; X64: ## 
BB#0: ## %entry ; X64-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; X64-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] -; X64-NEXT: addss %xmm2, %xmm3 ; X64-NEXT: addss %xmm1, %xmm0 +; X64-NEXT: addss %xmm2, %xmm3 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] ; X64-NEXT: retq entry: @@ -896,9 +896,9 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl ; X32-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0] ; X32-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0] -; X32-NEXT: addps %xmm1, %xmm0 ; X32-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0] ; X32-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0] +; X32-NEXT: addps %xmm1, %xmm0 ; X32-NEXT: addps %xmm2, %xmm3 ; X32-NEXT: addps %xmm3, %xmm0 ; X32-NEXT: retl @@ -908,9 +908,9 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl ; X64-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0] ; X64-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0] -; X64-NEXT: addps %xmm1, %xmm0 ; X64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0] ; X64-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0] +; X64-NEXT: addps %xmm1, %xmm0 ; X64-NEXT: addps %xmm2, %xmm3 ; X64-NEXT: addps %xmm3, %xmm0 ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll index 1eef67764ab9..a42b3c96c3ae 100644 --- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll +++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll @@ -4344,7 +4344,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB80_4 ; AVX1-NEXT: # BB#5: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm4 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 ; AVX1-NEXT: jmp .LBB80_6 ; AVX1-NEXT: .LBB80_4: ; AVX1-NEXT: movq %rax, %rcx @@ -4352,22 +4352,22 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: orq %rcx, %rax ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 -; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm4 +; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: .LBB80_6: ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-NEXT: vmovq %xmm2, %rax ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB80_7 ; AVX1-NEXT: # BB#8: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4 ; AVX1-NEXT: jmp .LBB80_9 ; AVX1-NEXT: .LBB80_7: ; AVX1-NEXT: movq %rax, %rcx ; AVX1-NEXT: shrq %rcx ; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 -; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4 +; AVX1-NEXT: vaddss %xmm4, %xmm4, %xmm4 ; AVX1-NEXT: .LBB80_9: ; AVX1-NEXT: vpextrq $1, %xmm2, %rax ; AVX1-NEXT: testq %rax, %rax @@ -4397,29 +4397,29 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5 ; AVX1-NEXT: vaddss %xmm5, %xmm5, %xmm5 ; AVX1-NEXT: .LBB80_15: -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[2,3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3] ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB80_16 ; AVX1-NEXT: # BB#17: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm4 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3 ; AVX1-NEXT: jmp .LBB80_18 ; AVX1-NEXT: .LBB80_16: ; AVX1-NEXT: movq %rax, %rcx ; AVX1-NEXT: shrq %rcx ; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ssq 
%rax, %xmm6, %xmm4 -; AVX1-NEXT: vaddss %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3 +; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: .LBB80_18: -; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vmovq %xmm3, %rax +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[2,3] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vmovq %xmm4, %rax ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB80_19 ; AVX1-NEXT: # BB#20: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm0 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm5 ; AVX1-NEXT: jmp .LBB80_21 ; AVX1-NEXT: .LBB80_19: ; AVX1-NEXT: movq %rax, %rcx @@ -4427,25 +4427,25 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: orq %rcx, %rax ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm0 -; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm5 ; AVX1-NEXT: .LBB80_21: -; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0],xmm4[3] ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0] -; AVX1-NEXT: vpextrq $1, %xmm3, %rax +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1],xmm5[0],xmm3[3] +; AVX1-NEXT: vpextrq $1, %xmm4, %rax ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB80_22 ; AVX1-NEXT: # BB#23: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm1 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2 ; AVX1-NEXT: jmp .LBB80_24 ; AVX1-NEXT: .LBB80_22: ; AVX1-NEXT: movq %rax, %rcx ; AVX1-NEXT: shrq %rcx ; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm1 -; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2 +; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: .LBB80_24: -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[0] +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -4471,7 +4471,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB80_4 ; AVX2-NEXT: # BB#5: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm4 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 ; AVX2-NEXT: jmp .LBB80_6 ; AVX2-NEXT: .LBB80_4: ; AVX2-NEXT: movq %rax, %rcx @@ -4479,22 +4479,22 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: orq %rcx, %rax ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 -; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm4 +; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: .LBB80_6: ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX2-NEXT: vmovq %xmm2, %rax ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB80_7 ; AVX2-NEXT: # BB#8: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4 ; AVX2-NEXT: jmp .LBB80_9 ; AVX2-NEXT: .LBB80_7: ; AVX2-NEXT: movq %rax, %rcx ; AVX2-NEXT: shrq %rcx ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 -; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4 +; AVX2-NEXT: vaddss %xmm4, %xmm4, %xmm4 ; AVX2-NEXT: .LBB80_9: ; AVX2-NEXT: vpextrq $1, %xmm2, %rax ; AVX2-NEXT: testq %rax, %rax @@ -4524,29 +4524,29 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5 ; AVX2-NEXT: vaddss %xmm5, %xmm5, %xmm5 ; AVX2-NEXT: 
.LBB80_15: -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[2,3] +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3] ; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB80_16 ; AVX2-NEXT: # BB#17: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm4 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3 ; AVX2-NEXT: jmp .LBB80_18 ; AVX2-NEXT: .LBB80_16: ; AVX2-NEXT: movq %rax, %rcx ; AVX2-NEXT: shrq %rcx ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm4 -; AVX2-NEXT: vaddss %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3 +; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: .LBB80_18: -; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-NEXT: vmovq %xmm3, %rax +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3] +; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[2,3] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-NEXT: vmovq %xmm4, %rax ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB80_19 ; AVX2-NEXT: # BB#20: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm0 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm5 ; AVX2-NEXT: jmp .LBB80_21 ; AVX2-NEXT: .LBB80_19: ; AVX2-NEXT: movq %rax, %rcx @@ -4554,25 +4554,25 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: orq %rcx, %rax ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm0 -; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm5 ; AVX2-NEXT: .LBB80_21: -; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0],xmm4[3] ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0] -; AVX2-NEXT: vpextrq $1, %xmm3, %rax +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1],xmm5[0],xmm3[3] +; AVX2-NEXT: vpextrq $1, %xmm4, %rax ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB80_22 ; AVX2-NEXT: # BB#23: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm1 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2 ; AVX2-NEXT: jmp .LBB80_24 ; AVX2-NEXT: .LBB80_22: ; AVX2-NEXT: movq %rax, %rcx ; AVX2-NEXT: shrq %rcx ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm1 -; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2 +; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: .LBB80_24: -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[0] +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] ; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-bitreverse.ll b/llvm/test/CodeGen/X86/vector-bitreverse.ll index 2fb821555dba..226c0adbaf3c 100644 --- a/llvm/test/CodeGen/X86/vector-bitreverse.ll +++ b/llvm/test/CodeGen/X86/vector-bitreverse.ll @@ -2372,10 +2372,10 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind { ; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1 ; AVX512F-NEXT: vpsrlq $24, %zmm0, %zmm2 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 +; AVX512F-NEXT: vpsrlq $8, %zmm0, %zmm3 +; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3 ; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1 -; AVX512F-NEXT: vpsrlq $8, %zmm0, %zmm2 -; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 -; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1 +; AVX512F-NEXT: vporq %zmm1, %zmm3, %zmm1 ; AVX512F-NEXT: vpsllq $8, %zmm0, %zmm2 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 ; AVX512F-NEXT: vpsllq $24, %zmm0, %zmm3 diff 
--git a/llvm/test/CodeGen/X86/vector-blend.ll b/llvm/test/CodeGen/X86/vector-blend.ll index f0a5fe1dbfff..a05a981daa1f 100644 --- a/llvm/test/CodeGen/X86/vector-blend.ll +++ b/llvm/test/CodeGen/X86/vector-blend.ll @@ -848,10 +848,10 @@ define <8 x i32> @blend_logic_v8i32(<8 x i32> %b, <8 x i32> %a, <8 x i32> %c) { ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm3 ; SSE2-NEXT: pandn %xmm5, %xmm1 -; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: pand %xmm0, %xmm2 ; SSE2-NEXT: pandn %xmm4, %xmm0 ; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: blend_logic_v8i32: @@ -860,10 +860,10 @@ define <8 x i32> @blend_logic_v8i32(<8 x i32> %b, <8 x i32> %a, <8 x i32> %c) { ; SSSE3-NEXT: psrad $31, %xmm1 ; SSSE3-NEXT: pand %xmm1, %xmm3 ; SSSE3-NEXT: pandn %xmm5, %xmm1 -; SSSE3-NEXT: por %xmm3, %xmm1 ; SSSE3-NEXT: pand %xmm0, %xmm2 ; SSSE3-NEXT: pandn %xmm4, %xmm0 ; SSSE3-NEXT: por %xmm2, %xmm0 +; SSSE3-NEXT: por %xmm3, %xmm1 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: blend_logic_v8i32: diff --git a/llvm/test/CodeGen/X86/vector-sqrt.ll b/llvm/test/CodeGen/X86/vector-sqrt.ll index 8081e9482d67..c5ac4466b5fa 100644 --- a/llvm/test/CodeGen/X86/vector-sqrt.ll +++ b/llvm/test/CodeGen/X86/vector-sqrt.ll @@ -29,11 +29,11 @@ define <4 x float> @sqrtf4(float* nocapture readonly %v) local_unnamed_addr #0 { ; CHECK: # BB#0: # %entry ; CHECK-NEXT: vsqrtss (%rdi), %xmm0, %xmm0 ; CHECK-NEXT: vsqrtss 4(%rdi), %xmm1, %xmm1 +; CHECK-NEXT: vsqrtss 8(%rdi), %xmm2, %xmm2 +; CHECK-NEXT: vsqrtss 12(%rdi), %xmm3, %xmm3 ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; CHECK-NEXT: vsqrtss 8(%rdi), %xmm2, %xmm1 -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; CHECK-NEXT: vsqrtss 12(%rdi), %xmm2, %xmm1 -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] ; CHECK-NEXT: retq entry: %0 = load float, float* %v, align 4 diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll index 450e255313b3..6fbec91e77a3 100644 --- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll +++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll @@ -11,13 +11,13 @@ define <4 x double> @load_factorf64_4(<16 x double>* %ptr) { ; AVX-NEXT: vmovupd 96(%rdi), %ymm3 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5 -; AVX-NEXT: vhaddpd %ymm5, %ymm4, %ymm4 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] ; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX-NEXT: vaddpd %ymm2, %ymm4, %ymm2 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX-NEXT: vaddpd %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vhaddpd %ymm5, %ymm4, %ymm1 +; AVX-NEXT: vaddpd %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ; AVX-NEXT: retq %wide.vec = load <16 x double>, <16 x double>* %ptr, align 16 %strided.v0 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> @@ -39,11 +39,11 @@ define <4 x double> @load_factorf64_2(<16 x double>* %ptr) { ; AVX-NEXT: vmovupd 96(%rdi), %ymm3 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = 
ymm1[2,3],ymm3[2,3] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX-NEXT: vmulpd %ymm0, %ymm4, %ymm0 +; AVX-NEXT: vmulpd %ymm0, %ymm2, %ymm0 ; AVX-NEXT: retq %wide.vec = load <16 x double>, <16 x double>* %ptr, align 16 %strided.v0 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> @@ -124,9 +124,9 @@ define <4 x i64> @load_factori64_4(<16 x i64>* %ptr) { ; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] ; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] -; AVX2-NEXT: vpaddq %ymm3, %ymm4, %ymm3 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-NEXT: vpaddq %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vpaddq %ymm3, %ymm4, %ymm1 +; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: retq %wide.vec = load <16 x i64>, <16 x i64>* %ptr, align 16