From 9aa45f047f303b6484afce6716472b3b1f510c7e Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Thu, 6 Jul 2017 20:57:05 +0000
Subject: [PATCH] AMDGPU: Add macro fusion schedule DAG mutation

Try to increase opportunities to shrink vcc uses.

llvm-svn: 307313
---
 llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp  |  64 ++++
 llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.h    |  19 ++
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |   2 +
 llvm/lib/Target/AMDGPU/CMakeLists.txt         |   1 +
 llvm/test/CodeGen/AMDGPU/addrspacecast.ll     |  33 +-
 llvm/test/CodeGen/AMDGPU/ctlz.ll              |  25 +-
 llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll   |   4 +-
 llvm/test/CodeGen/AMDGPU/fcmp.f16.ll          | 312 +++++-------
 llvm/test/CodeGen/AMDGPU/inline-asm.ll        |   4 +-
 .../AMDGPU/macro-fusion-cluster-vcc-uses.mir  | 227 +++++++
 .../AMDGPU/multi-divergent-exit-region.ll     |   4 +-
 llvm/test/CodeGen/AMDGPU/sad.ll               |   4 +-
 llvm/test/CodeGen/AMDGPU/select.f16.ll        |  59 ++--
 llvm/test/CodeGen/AMDGPU/setcc.ll             |  10 +-
 llvm/test/CodeGen/AMDGPU/uaddo.ll             |   4 +-
 llvm/test/CodeGen/AMDGPU/usubo.ll             |   4 +-
 llvm/test/CodeGen/AMDGPU/v_cndmask.ll         |  12 +-
 llvm/test/CodeGen/AMDGPU/vselect.ll           |  25 +-
 18 files changed, 508 insertions(+), 305 deletions(-)
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.h
 create mode 100644 llvm/test/CodeGen/AMDGPU/macro-fusion-cluster-vcc-uses.mir

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp
new file mode 100644
index 000000000000..7263ba73d155
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp
@@ -0,0 +1,64 @@
+//===--- AMDGPUMacroFusion.cpp - AMDGPU Macro Fusion ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file contains the AMDGPU implementation of the DAG scheduling
+/// mutation to pair instructions back to back.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMacroFusion.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+
+#include "llvm/CodeGen/MacroFusion.h"
+
+using namespace llvm;
+
+namespace {
+
+/// \brief Check if the instr pair, FirstMI and SecondMI, should be fused
+/// together. Given SecondMI, when FirstMI is unspecified, then check if
+/// SecondMI may be part of a fused pair at all.
+static bool shouldScheduleAdjacent(const TargetInstrInfo &TII_,
+                                   const TargetSubtargetInfo &TSI,
+                                   const MachineInstr *FirstMI,
+                                   const MachineInstr &SecondMI) {
+  const SIInstrInfo &TII = static_cast<const SIInstrInfo &>(TII_);
+
+  switch (SecondMI.getOpcode()) {
+  case AMDGPU::V_ADDC_U32_e64:
+  case AMDGPU::V_SUBB_U32_e64:
+  case AMDGPU::V_CNDMASK_B32_e64: {
+    // Try to cluster defs of condition registers to their uses. This improves
+    // the chance VCC will be available which will allow shrinking to VOP2
+    // encodings.
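+    //
+    // A null FirstMI means the scheduler is only probing whether SecondMI
+    // may ever be the tail of a fused pair; a concrete FirstMI asks whether
+    // this particular pair should be kept back to back.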
+    if (!FirstMI)
+      return true;
+
+    const MachineOperand *Src2 = TII.getNamedOperand(SecondMI,
+                                                     AMDGPU::OpName::src2);
+    return FirstMI->definesRegister(Src2->getReg());
+  }
+  default:
+    return false;
+  }
+
+  return false;
+}
+
+} // end namespace
+
+
+namespace llvm {
+
+std::unique_ptr<ScheduleDAGMutation> createAMDGPUMacroFusionDAGMutation() {
+  return createMacroFusionDAGMutation(shouldScheduleAdjacent);
+}
+
+} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.h b/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.h
new file mode 100644
index 000000000000..844958580a65
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.h
@@ -0,0 +1,19 @@
+//===- AMDGPUMacroFusion.h - AMDGPU Macro Fusion ----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineScheduler.h"
+
+namespace llvm {
+
+/// Note that you have to add:
+///   DAG.addMutation(createAMDGPUMacroFusionDAGMutation());
+/// to AMDGPUPassConfig::createMachineScheduler() to have an effect.
+std::unique_ptr<ScheduleDAGMutation> createAMDGPUMacroFusionDAGMutation();
+
+} // llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 87d8e714d660..dc868f010d85 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -19,6 +19,7 @@
 #include "AMDGPUCallLowering.h"
 #include "AMDGPUInstructionSelector.h"
 #include "AMDGPULegalizerInfo.h"
+#include "AMDGPUMacroFusion.h"
 #include "AMDGPUTargetObjectFile.h"
 #include "AMDGPUTargetTransformInfo.h"
 #include "GCNIterativeScheduler.h"
@@ -173,6 +174,7 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
     new GCNScheduleDAGMILive(C, make_unique<GCNMaxOccupancySchedStrategy>(C));
   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
   DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
+  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
   return DAG;
 }
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 917d9cfa6905..971208c5db84 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -47,6 +47,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUIntrinsicInfo.cpp
   AMDGPUISelDAGToDAG.cpp
   AMDGPULowerIntrinsics.cpp
+  AMDGPUMacroFusion.cpp
   AMDGPUMCInstLower.cpp
   AMDGPUMachineCFGStructurizer.cpp
   AMDGPUMachineFunction.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
index b1e71722d80c..a6aa9e795151 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
@@ -10,20 +10,22 @@
 ; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
 ; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}}
 ; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
+; CI-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], -1
+; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
+; CI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
+; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7

 ; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}}
 ; GFX9-DAG: s_getreg_b32 [[SSRC_SHARED:s[0-9]+]], hwreg(15, 16, 16)
 ; GFX9-DAG: s_lshl_b32 [[SSRC_SHARED_BASE:s[0-9]+]], [[SSRC_SHARED]], 16
 ; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[SSRC_SHARED_BASE]]
; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_shared_base - -; HSA-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] - -; HSA-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], -1 -; HSA-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]] -; HSA-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]] -; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7 +; GFX9: v_cmp_ne_u32_e64 vcc, [[PTR]], -1 +; GFX9: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc +; GFX9-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] +; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]] ; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]] @@ -48,6 +50,12 @@ define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %pt ; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11{{$}} ; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]] +; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7 +; CI-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], 0 +; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc +; CI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] +; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]] + ; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}} ; GFX9-DAG: s_getreg_b32 [[SSRC_PRIVATE:s[0-9]+]], hwreg(15, 0, 16) ; GFX9-DAG: s_lshl_b32 [[SSRC_PRIVATE_BASE:s[0-9]+]], [[SSRC_PRIVATE]], 16 @@ -55,12 +63,11 @@ define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %pt ; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_private_base -; HSA-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] - -; HSA-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], 0 -; HSA-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]] -; HSA-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]] -; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7 +; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7 +; GFX9: v_cmp_ne_u32_e64 vcc, [[PTR]], 0 +; GFX9: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc +; GFX9: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] +; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]] ; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]] diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll index 3031ee8bbecd..e7382a894832 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=EG -check-prefix=FUNC %s declare i7 @llvm.ctlz.i7(i7, i1) nounwind readnone declare i8 @llvm.ctlz.i8(i8, i1) nounwind readnone @@ -35,8 +35,8 @@ define amdgpu_kernel void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val) ; FUNC-LABEL: {{^}}v_ctlz_i32: ; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]], -; GCN-DAG: v_ffbh_u32_e32 [[CTLZ:v[0-9]+]], [[VAL]] -; GCN-DAG: v_cmp_ne_u32_e32 vcc, 0, [[CTLZ]] +; GCN: v_ffbh_u32_e32 [[CTLZ:v[0-9]+]], [[VAL]] +; GCN: 
v_cmp_ne_u32_e32 vcc, 0, [[VAL]] ; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], 32, [[CTLZ]], vcc ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm @@ -104,8 +104,15 @@ define amdgpu_kernel void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 ; FUNC-LABEL: {{^}}v_ctlz_i8: ; GCN: {{buffer|flat}}_load_ubyte [[VAL:v[0-9]+]], -; SI-DAG: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]] -; VI-DAG: v_ffbh_u32_sdwa [[RESULT:v[0-9]+]], [[VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; SI-DAG: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]] +; VI-DAG: v_ffbh_u32_sdwa [[FFBH:v[0-9]+]], [[VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; SI: v_cmp_ne_u32_e32 vcc, 0, [[VAL]] +; VI: v_cmp_ne_u16_e32 vcc, 0, [[VAL]] + +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 32, [[FFBH]], vcc + +; SI: v_subrev_i32_e32 [[RESULT:v[0-9]+]], vcc, 24, [[SELECT]] +; VI: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, -16, [[SELECT]] ; GCN: buffer_store_byte [[RESULT]], ; GCN: s_endpgm define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind { @@ -142,11 +149,11 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 ; FUNC-LABEL: {{^}}v_ctlz_i64: ; GCN-DAG: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} -; GCN-DAG: v_cmp_eq_u32_e64 [[CMPHI:s\[[0-9]+:[0-9]+\]]], 0, v[[HI]] +; GCN-DAG: v_cmp_eq_u32_e32 vcc, 0, v[[HI]] ; GCN-DAG: v_ffbh_u32_e32 [[FFBH_LO:v[0-9]+]], v[[LO]] ; GCN-DAG: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 32, [[FFBH_LO]] ; GCN-DAG: v_ffbh_u32_e32 [[FFBH_HI:v[0-9]+]], v[[HI]] -; GCN-DAG: v_cndmask_b32_e64 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[ADD]], [[CMPHI]] +; GCN-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[ADD]], vcc ; GCN-DAG: v_or_b32_e32 [[OR:v[0-9]+]], v[[HI]], v[[LO]] ; GCN-DAG: v_cmp_ne_u32_e32 vcc, 0, [[OR]] ; GCN-DAG: v_cndmask_b32_e32 v[[CLTZ_LO:[0-9]+]], 64, v[[CTLZ:[0-9]+]], vcc diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll index c99ce8659e77..7500da536307 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -124,11 +124,11 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i64: ; GCN-DAG: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} -; GCN-DAG: v_cmp_eq_u32_e64 [[CMPHI:s\[[0-9]+:[0-9]+\]]], 0, v[[HI]] +; GCN-DAG: v_cmp_eq_u32_e32 vcc, 0, v[[HI]] ; GCN-DAG: v_ffbh_u32_e32 [[FFBH_LO:v[0-9]+]], v[[LO]] ; GCN-DAG: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 32, [[FFBH_LO]] ; GCN-DAG: v_ffbh_u32_e32 [[FFBH_HI:v[0-9]+]], v[[HI]] -; GCN-DAG: v_cndmask_b32_e64 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[FFBH_LO]] +; GCN-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[FFBH_LO]] ; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CTLZ]]:[[CTLZ_HI:[0-9]+]]{{\]}} define amdgpu_kernel void @v_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { %tid = call i32 @llvm.r600.read.tidig.x() diff --git a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll index 13d5f7e91c7e..aef898b1a8ee 100644 --- a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global 
-verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}fcmp_f16_lt ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] @@ -351,23 +351,12 @@ entry: ret void } -; GCN-LABEL: {{^}}fcmp_v2f16_lt -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_cmp_lt_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]] -; SI: v_cmp_lt_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]] -; VI: v_cmp_lt_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]] -; VI: v_cmp_lt_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]] -; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]] -; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] -; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} -; GCN: s_endpgm +; GCN-LABEL: {{^}}fcmp_v2f16_lt: +; SI: v_cmp_lt_f32_e32 vcc, +; SI: v_cmp_lt_f32_e32 vcc, + +; VI: v_cmp_lt_f16_e32 vcc, +; VI: v_cmp_lt_f16_e32 vcc, define amdgpu_kernel void @fcmp_v2f16_lt( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, @@ -382,22 +371,11 @@ entry: } ; GCN-LABEL: {{^}}fcmp_v2f16_eq -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_cmp_eq_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]] -; SI: v_cmp_eq_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]] -; VI: v_cmp_eq_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]] -; VI: v_cmp_eq_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]] -; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]] -; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] -; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} -; GCN: s_endpgm +; SI: v_cmp_eq_f32_e32 vcc, +; SI: v_cmp_eq_f32_e32 vcc, + +; VI: v_cmp_eq_f16_e32 vcc, +; VI: v_cmp_eq_f16_e32 vcc, define amdgpu_kernel void @fcmp_v2f16_eq( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, @@ -411,23 +389,11 @@ entry: ret void } -; GCN-LABEL: {{^}}fcmp_v2f16_le -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_cmp_le_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]] -; SI: v_cmp_le_f32_e64 
s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]] -; VI: v_cmp_le_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]] -; VI: v_cmp_le_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]] -; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]] -; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] -; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} -; GCN: s_endpgm +; GCN-LABEL: {{^}}fcmp_v2f16_le: +; SI: v_cmp_le_f32_e32 vcc +; SI: v_cmp_le_f32_e32 vcc +; VI: v_cmp_le_f16_e32 vcc +; VI: v_cmp_le_f16_e32 vcc define amdgpu_kernel void @fcmp_v2f16_le( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, @@ -441,23 +407,12 @@ entry: ret void } -; GCN-LABEL: {{^}}fcmp_v2f16_gt -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_cmp_gt_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]] -; SI: v_cmp_gt_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]] -; VI: v_cmp_gt_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]] -; VI: v_cmp_gt_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]] -; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]] -; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] -; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} -; GCN: s_endpgm +; GCN-LABEL: {{^}}fcmp_v2f16_gt: +; SI: v_cmp_gt_f32_e32 vcc, +; SI: v_cmp_gt_f32_e32 vcc, + +; VI: v_cmp_gt_f16_e32 vcc, +; VI: v_cmp_gt_f16_e32 vcc, define amdgpu_kernel void @fcmp_v2f16_gt( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, @@ -471,23 +426,12 @@ entry: ret void } -; GCN-LABEL: {{^}}fcmp_v2f16_lg -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_cmp_lg_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]] -; SI: v_cmp_lg_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]] -; VI: v_cmp_lg_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]] -; VI: v_cmp_lg_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]] -; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]] -; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] -; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} -; GCN: s_endpgm +; GCN-LABEL: {{^}}fcmp_v2f16_lg: +; SI: v_cmp_lg_f32_e32 vcc, +; SI: v_cmp_lg_f32_e32 vcc, + +; VI: v_cmp_lg_f16_e32 vcc, +; VI: v_cmp_lg_f16_e32 vcc, define amdgpu_kernel void @fcmp_v2f16_lg( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, @@ -501,23 +445,12 @@ entry: ret void } -; GCN-LABEL: {{^}}fcmp_v2f16_ge -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: 
v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_cmp_ge_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]] -; SI: v_cmp_ge_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]] -; VI: v_cmp_ge_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]] -; VI: v_cmp_ge_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]] -; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]] -; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] -; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} -; GCN: s_endpgm +; GCN-LABEL: {{^}}fcmp_v2f16_ge: +; SI: v_cmp_ge_f32_e32 vcc, +; SI: v_cmp_ge_f32_e32 vcc, + +; VI: v_cmp_ge_f16_e32 vcc, +; VI: v_cmp_ge_f16_e32 vcc, define amdgpu_kernel void @fcmp_v2f16_ge( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, @@ -531,23 +464,12 @@ entry: ret void } -; GCN-LABEL: {{^}}fcmp_v2f16_o -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_cmp_o_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]] -; SI: v_cmp_o_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]] -; VI: v_cmp_o_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]] -; VI: v_cmp_o_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]] -; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]] -; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] -; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} -; GCN: s_endpgm +; GCN-LABEL: {{^}}fcmp_v2f16_o: +; SI: v_cmp_o_f32_e32 vcc, +; SI: v_cmp_o_f32_e32 vcc, + +; VI: v_cmp_o_f16_e32 vcc, +; VI: v_cmp_o_f16_e32 vcc, define amdgpu_kernel void @fcmp_v2f16_o( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, @@ -561,23 +483,12 @@ entry: ret void } -; GCN-LABEL: {{^}}fcmp_v2f16_u -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_cmp_u_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]] -; SI: v_cmp_u_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]] -; VI: v_cmp_u_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]] -; VI: v_cmp_u_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]] -; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]] -; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] -; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} -; GCN: s_endpgm +; GCN-LABEL: {{^}}fcmp_v2f16_u: +; SI: v_cmp_u_f32_e32 vcc, +; SI: v_cmp_u_f32_e32 vcc, + +; VI: v_cmp_u_f16_e32 vcc, +; VI: v_cmp_u_f16_e32 vcc, define amdgpu_kernel void @fcmp_v2f16_u( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, @@ -592,22 +503,11 @@ entry: } ; GCN-LABEL: {{^}}fcmp_v2f16_nge -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; GCN: v_lshrrev_b32_e32 
v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_cmp_nge_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]] -; SI: v_cmp_nge_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]] -; VI: v_cmp_nge_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]] -; VI: v_cmp_nge_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]] -; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]] -; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] -; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} -; GCN: s_endpgm +; SI: v_cmp_nge_f32_e32 vcc, +; SI: v_cmp_nge_f32_e32 vcc, + +; VI: v_cmp_nge_f16_e32 vcc, +; VI: v_cmp_nge_f16_e32 vcc, define amdgpu_kernel void @fcmp_v2f16_nge( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, @@ -622,22 +522,11 @@ entry: } ; GCN-LABEL: {{^}}fcmp_v2f16_nlg -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_cmp_nlg_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]] -; SI: v_cmp_nlg_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]] -; VI: v_cmp_nlg_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]] -; VI: v_cmp_nlg_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]] -; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]] -; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] -; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} -; GCN: s_endpgm +; SI: v_cmp_nlg_f32_e32 vcc +; SI: v_cmp_nlg_f32_e32 vcc + +; VI: v_cmp_nlg_f16_e32 vcc +; VI: v_cmp_nlg_f16_e32 vcc define amdgpu_kernel void @fcmp_v2f16_nlg( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, @@ -652,22 +541,11 @@ entry: } ; GCN-LABEL: {{^}}fcmp_v2f16_ngt -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_cmp_ngt_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]] -; SI: v_cmp_ngt_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]] -; VI: v_cmp_ngt_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]] -; VI: v_cmp_ngt_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]] -; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]] -; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] -; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} -; GCN: s_endpgm +; SI: v_cmp_ngt_f32_e32 vcc, +; SI: v_cmp_ngt_f32_e32 vcc, + +; VI: v_cmp_ngt_f16_e32 vcc, +; VI: v_cmp_ngt_f16_e32 vcc, define amdgpu_kernel void @fcmp_v2f16_ngt( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, @@ -682,22 +560,11 @@ entry: } ; GCN-LABEL: {{^}}fcmp_v2f16_nle -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] 
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_cmp_nle_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]] -; SI: v_cmp_nle_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]] -; VI: v_cmp_nle_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]] -; VI: v_cmp_nle_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]] -; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]] -; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] -; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} -; GCN: s_endpgm +; SI: v_cmp_nle_f32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} +; SI: v_cmp_nle_f32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} + +; VI: v_cmp_nle_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_cmp_nle_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @fcmp_v2f16_nle( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, @@ -712,22 +579,11 @@ entry: } ; GCN-LABEL: {{^}}fcmp_v2f16_neq -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_cmp_neq_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]] -; SI: v_cmp_neq_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]] -; VI: v_cmp_neq_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]] -; VI: v_cmp_neq_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]] -; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]] -; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] -; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} -; GCN: s_endpgm +; SI: v_cmp_neq_f32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} +; SI: v_cmp_neq_f32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} + +; VI: v_cmp_neq_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_cmp_neq_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @fcmp_v2f16_neq( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, @@ -744,17 +600,19 @@ entry: ; GCN-LABEL: {{^}}fcmp_v2f16_nlt ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_cmp_nlt_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]] -; SI: v_cmp_nlt_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]] -; VI: v_cmp_nlt_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]] -; VI: v_cmp_nlt_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; GCN-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; SI-DAG: v_cmp_nlt_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]] + +; GCN-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI-DAG: v_cmp_nlt_f32_e32 vcc, v[[A_F32_1]], 
v[[B_F32_1]] +; VI-DAG: v_cmp_nlt_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]] ; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]] + +; VI: v_cmp_nlt_f16_e32 vcc, v[[A_F16_1]], v[[B_F16_1]] ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} ; GCN: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.ll index c0f5218efc16..75826d530cb0 100644 --- a/llvm/test/CodeGen/AMDGPU/inline-asm.ll +++ b/llvm/test/CodeGen/AMDGPU/inline-asm.ll @@ -222,9 +222,9 @@ entry: ; FIXME: Should be scheduled to shrink vcc ; CHECK-LABEL: {{^}}i1_input_phys_vgpr_x2: ; CHECK: v_cmp_eq_u32_e32 vcc, 1, v0 -; CHECK: v_cmp_eq_u32_e64 s[0:1], 1, v1 ; CHECK: v_cndmask_b32_e64 v0, 0, -1, vcc -; CHECK: v_cndmask_b32_e64 v1, 0, -1, s[0:1] +; CHECK: v_cmp_eq_u32_e32 vcc, 1, v1 +; CHECK: v_cndmask_b32_e64 v1, 0, -1, vcc define amdgpu_kernel void @i1_input_phys_vgpr_x2() { entry: %val0 = load volatile i1, i1 addrspace(1)* undef diff --git a/llvm/test/CodeGen/AMDGPU/macro-fusion-cluster-vcc-uses.mir b/llvm/test/CodeGen/AMDGPU/macro-fusion-cluster-vcc-uses.mir new file mode 100644 index 000000000000..768acf35eeae --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/macro-fusion-cluster-vcc-uses.mir @@ -0,0 +1,227 @@ +# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass machine-scheduler -o - %s | FileCheck -check-prefix=GCN %s + +# GCN-LABEL: name: cluster_add_addc +# GCN: S_NOP 0, implicit-def %vcc +# GCN: dead %2, %3 = V_ADD_I32_e64 %0, %1, implicit %exec +# GCN: dead %4, dead %5 = V_ADDC_U32_e64 %6, %7, %3, implicit %exec +name: cluster_add_addc +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: sreg_64 } + - { id: 4, class: vgpr_32 } + - { id: 5, class: sreg_64 } + - { id: 6, class: vgpr_32 } + - { id: 7, class: vgpr_32 } + +body: | + bb.0: + %0 = V_MOV_B32_e32 0, implicit %exec + %1 = V_MOV_B32_e32 0, implicit %exec + %2, %3 = V_ADD_I32_e64 %0, %1, implicit %exec + %6 = V_MOV_B32_e32 0, implicit %exec + %7 = V_MOV_B32_e32 0, implicit %exec + S_NOP 0, implicit def %vcc + %4, %5 = V_ADDC_U32_e64 %6, %7, %3, implicit %exec +... 
+ +# GCN-LABEL: name: interleave_add64s +# GCN: dead %8, %9 = V_ADD_I32_e64 %0, %1, implicit %exec +# GCN-NEXT: dead %12, dead %13 = V_ADDC_U32_e64 %4, %5, %9, implicit %exec +# GCN-NEXT: dead %10, %11 = V_ADD_I32_e64 %2, %3, implicit %exec +# GCN-NEXT: dead %14, dead %15 = V_ADDC_U32_e64 %6, %7, %11, implicit %exec +name: interleave_add64s +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: vgpr_32 } + - { id: 5, class: vgpr_32 } + - { id: 6, class: vgpr_32 } + - { id: 7, class: vgpr_32 } + - { id: 8, class: vgpr_32 } + - { id: 9, class: sreg_64 } + - { id: 10, class: vgpr_32 } + - { id: 11, class: sreg_64 } + - { id: 12, class: vgpr_32 } + - { id: 13, class: sreg_64 } + - { id: 14, class: vgpr_32 } + - { id: 15, class: sreg_64 } + +body: | + bb.0: + %0 = V_MOV_B32_e32 0, implicit %exec + %1 = V_MOV_B32_e32 0, implicit %exec + %2 = V_MOV_B32_e32 0, implicit %exec + %3 = V_MOV_B32_e32 0, implicit %exec + %4 = V_MOV_B32_e32 0, implicit %exec + %5 = V_MOV_B32_e32 0, implicit %exec + %6 = V_MOV_B32_e32 0, implicit %exec + %7 = V_MOV_B32_e32 0, implicit %exec + + %8, %9 = V_ADD_I32_e64 %0, %1, implicit %exec + %10, %11 = V_ADD_I32_e64 %2, %3, implicit %exec + + + %12, %13 = V_ADDC_U32_e64 %4, %5, %9, implicit %exec + %14, %15 = V_ADDC_U32_e64 %6, %7, %11, implicit %exec +... + +# GCN-LABEL: name: cluster_mov_addc +# GCN: S_NOP 0, implicit-def %vcc +# GCN-NEXT: %2 = S_MOV_B64 0 +# GCN-NEXT: dead %3, dead %4 = V_ADDC_U32_e64 %0, %1, %2, implicit %exec +name: cluster_mov_addc +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: sreg_64 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: sreg_64 } + - { id: 6, class: vgpr_32 } + - { id: 7, class: vgpr_32 } + +body: | + bb.0: + %0 = V_MOV_B32_e32 0, implicit %exec + %1 = V_MOV_B32_e32 0, implicit %exec + %2 = S_MOV_B64 0 + S_NOP 0, implicit def %vcc + %3, %4 = V_ADDC_U32_e64 %0, %1, %2, implicit %exec +... + +# GCN-LABEL: name: no_cluster_add_addc_diff_sgpr +# GCN: dead %2, dead %3 = V_ADD_I32_e64 %0, %1, implicit %exec +# GCN-NEXT: %6 = V_MOV_B32_e32 0, implicit %exec +# GCN-NEXT: %7 = V_MOV_B32_e32 0, implicit %exec +# GCN-NEXT: S_NOP 0, implicit-def %vcc +# GCN-NEXT: %8 = S_MOV_B64 0 +# GCN-NEXT: dead %4, dead %5 = V_ADDC_U32_e64 %6, %7, %8, implicit %exec +name: no_cluster_add_addc_diff_sgpr +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: sreg_64 } + - { id: 4, class: vgpr_32 } + - { id: 5, class: sreg_64 } + - { id: 6, class: vgpr_32 } + - { id: 7, class: vgpr_32 } + - { id: 8, class: sreg_64 } +body: | + bb.0: + %0 = V_MOV_B32_e32 0, implicit %exec + %1 = V_MOV_B32_e32 0, implicit %exec + %8 = S_MOV_B64 0 + %2, %3 = V_ADD_I32_e64 %0, %1, implicit %exec + %6 = V_MOV_B32_e32 0, implicit %exec + %7 = V_MOV_B32_e32 0, implicit %exec + S_NOP 0, implicit def %vcc + %4, %5 = V_ADDC_U32_e64 %6, %7, %8, implicit %exec +... 
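+# (Note on no_cluster_add_addc_diff_sgpr above: the V_ADDC's carry-in is %8,
+# defined by the S_MOV_B64 rather than by the V_ADD's carry def %3, so the
+# add/addc pair is not clustered; instead the S_MOV_B64 def of %8 is pulled
+# adjacent to the V_ADDC, as the GCN-NEXT lines check.)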
+# GCN-LABEL: name: cluster_sub_subb +# GCN: S_NOP 0, implicit-def %vcc +# GCN: dead %2, %3 = V_SUB_I32_e64 %0, %1, implicit %exec +# GCN: dead %4, dead %5 = V_SUBB_U32_e64 %6, %7, %3, implicit %exec +name: cluster_sub_subb +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: sreg_64 } + - { id: 4, class: vgpr_32 } + - { id: 5, class: sreg_64 } + - { id: 6, class: vgpr_32 } + - { id: 7, class: vgpr_32 } + +body: | + bb.0: + %0 = V_MOV_B32_e32 0, implicit %exec + %1 = V_MOV_B32_e32 0, implicit %exec + %2, %3 = V_SUB_I32_e64 %0, %1, implicit %exec + %6 = V_MOV_B32_e32 0, implicit %exec + %7 = V_MOV_B32_e32 0, implicit %exec + S_NOP 0, implicit def %vcc + %4, %5 = V_SUBB_U32_e64 %6, %7, %3, implicit %exec +... + +# GCN-LABEL: name: cluster_cmp_cndmask +# GCN: S_NOP 0, implicit-def %vcc +# GCN-NEXT: %3 = V_CMP_EQ_I32_e64 %0, %1, implicit %exec +# GCN-NEXT: dead %4 = V_CNDMASK_B32_e64 %0, %1, %3, implicit %exec +name: cluster_cmp_cndmask +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: sreg_64 } + - { id: 4, class: vgpr_32 } + - { id: 5, class: sreg_64 } + - { id: 6, class: vgpr_32 } + - { id: 7, class: vgpr_32 } + +body: | + bb.0: + %0 = V_MOV_B32_e32 0, implicit %exec + %1 = V_MOV_B32_e32 0, implicit %exec + %3 = V_CMP_EQ_I32_e64 %0, %1, implicit %exec + S_NOP 0, implicit def %vcc + %4 = V_CNDMASK_B32_e64 %0, %1, %3, implicit %exec +... + +# GCN-LABEL: name: cluster_multi_use_cmp_cndmask +# GCN: %4 = V_CMP_EQ_I32_e64 %0, %1, implicit %exec +# GCN-NEXT: dead %5 = V_CNDMASK_B32_e64 %2, %1, %4, implicit %exec +# GCN-NEXT: dead %6 = V_CNDMASK_B32_e64 %1, %3, %4, implicit %exec +name: cluster_multi_use_cmp_cndmask +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: sreg_64 } + - { id: 5, class: vgpr_32 } + - { id: 6, class: vgpr_32 } + - { id: 7, class: vgpr_32 } + +body: | + bb.0: + %0 = V_MOV_B32_e32 0, implicit %exec + %1 = V_MOV_B32_e32 0, implicit %exec + %2 = V_MOV_B32_e32 0, implicit %exec + %3 = V_MOV_B32_e32 0, implicit %exec + + %4 = V_CMP_EQ_I32_e64 %0, %1, implicit %exec + S_NOP 0, implicit def %vcc + %5 = V_CNDMASK_B32_e64 %2, %1, %4, implicit %exec + %6 = V_CNDMASK_B32_e64 %1, %3, %4, implicit %exec +... + +# GCN-LABEL: name: cluster_multi_use_cmp_cndmask2 +# GCN: %4 = V_CMP_EQ_I32_e64 %0, %1, implicit %exec +# GCN-NEXT: dead %5 = V_CNDMASK_B32_e64 %2, %1, %4, implicit %exec +# GCN-NEXT: %3 = V_MOV_B32_e32 0, implicit %exec +# GCN-NEXT: dead %6 = V_CNDMASK_B32_e64 %1, %3, %4, implicit %exec +name: cluster_multi_use_cmp_cndmask2 +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: sreg_64 } + - { id: 5, class: vgpr_32 } + - { id: 6, class: vgpr_32 } + - { id: 7, class: vgpr_32 } + +body: | + bb.0: + %0 = V_MOV_B32_e32 0, implicit %exec + %1 = V_MOV_B32_e32 0, implicit %exec + %4 = V_CMP_EQ_I32_e64 %0, %1, implicit %exec + %2 = V_MOV_B32_e32 0, implicit %exec + %5 = V_CNDMASK_B32_e64 %2, %1, %4, implicit %exec + %3 = V_MOV_B32_e32 0, implicit %exec + %6 = V_CNDMASK_B32_e64 %1, %3, %4, implicit %exec +... 
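The MIR cases above all exercise one wiring pattern: a fusion predicate is
handed to createMacroFusionDAGMutation() and the returned mutation is attached
to the scheduling DAG, after which the scheduler keeps matched pairs adjacent.
As a reading aid, here is a minimal self-contained sketch of that pattern for
a hypothetical target. The MacroFusion and MachineScheduler calls are the same
API this patch uses; the shouldFuse pairing policy and the createFusedScheduler
name are illustrative placeholders, not part of the change:

    #include "llvm/ADT/STLExtras.h"
    #include "llvm/CodeGen/MachineScheduler.h"
    #include "llvm/CodeGen/MacroFusion.h"

    using namespace llvm;

    // Called first with FirstMI == nullptr to ask whether SecondMI can ever
    // be the tail of a fused pair, then with a concrete FirstMI to decide
    // whether this particular pair should be scheduled back to back.
    static bool shouldFuse(const TargetInstrInfo &TII,
                           const TargetSubtargetInfo &STI,
                           const MachineInstr *FirstMI,
                           const MachineInstr &SecondMI) {
      if (!FirstMI)
        return true; // Placeholder: treat any SecondMI as a candidate.

      // Placeholder policy mirroring the vcc case above: fuse when FirstMI
      // defines a register that SecondMI reads.
      for (const MachineOperand &MO : SecondMI.uses())
        if (MO.isReg() && MO.getReg() && FirstMI->definesRegister(MO.getReg()))
          return true;
      return false;
    }

    // A target's createMachineScheduler() hook would return this DAG so the
    // mutation runs on every scheduling region.
    static ScheduleDAGInstrs *createFusedScheduler(MachineSchedContext *C) {
      ScheduleDAGMILive *DAG =
          new ScheduleDAGMILive(C, make_unique<GenericScheduler>(C));
      DAG->addMutation(createMacroFusionDAGMutation(shouldFuse));
      return DAG;
    }

The AMDGPU predicate instead keys on the concrete VOP3 opcodes shown earlier,
so only carry/condition consumers are pulled toward their condition defs.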
diff --git a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll index 82c27f204a47..ba3ff0b08bc9 100644 --- a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll +++ b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll @@ -66,9 +66,9 @@ ; FIXME: Why is this compare essentially repeated? ; GCN: v_cmp_eq_u32_e32 vcc, 1, [[REG:v[0-9]+]] -; GCN-NEXT: v_cmp_ne_u32_e64 s{{\[[0-9]+:[0-9]+\]}}, 1, [[REG]] ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc -; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1 +; GCN: v_cmp_ne_u32_e32 vcc, 1, [[REG]] +; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc ; GCN: ; %Flow1 ; GCN-NEXT: s_or_b64 exec, exec diff --git a/llvm/test/CodeGen/AMDGPU/sad.ll b/llvm/test/CodeGen/AMDGPU/sad.ll index f7a1c65881d0..ee56e9053fd3 100644 --- a/llvm/test/CodeGen/AMDGPU/sad.ll +++ b/llvm/test/CodeGen/AMDGPU/sad.ll @@ -134,8 +134,8 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(i32 addrspace(1)* %out, ; GCN-LABEL: {{^}}v_sad_u32_multi_use_select_pat2: ; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} -; GCN: v_cmp_gt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} -; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} +; GCN-DAG: v_cmp_gt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { %icmp0 = icmp ugt i32 %a, %b %sub0 = sub i32 %a, %b diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll index 62627e56aba4..e79ce3af0cf9 100644 --- a/llvm/test/CodeGen/AMDGPU/select.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll @@ -104,8 +104,8 @@ entry: ; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], 0.5, v[[D_F32]], vcc ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; VI: v_cmp_nlt_f16_e32 vcc, v[[A_F16]], v[[B_F16]] ; VI: v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x3800{{$}} +; VI: v_cmp_nlt_f16_e32 vcc, v[[A_F16]], v[[B_F16]] ; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[C_F16]], v[[D_F16]], vcc ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm @@ -134,8 +134,8 @@ entry: ; SI: v_cmp_lt_f32_e32 vcc, v[[A_F32]], v[[B_F32]] ; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], 0.5, v[[C_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; VI: v_cmp_lt_f16_e32 vcc, v[[A_F16]], v[[B_F16]] ; VI: v_mov_b32_e32 v[[D_F16:[0-9]+]], 0x3800{{$}} +; VI: v_cmp_lt_f16_e32 vcc, v[[A_F16]], v[[B_F16]] ; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm @@ -159,16 +159,16 @@ entry: ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 -; SI: v_cmp_lt_f32_e64 ; SI: v_cmp_lt_f32_e32 ; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e64 +; SI: v_cmp_lt_f32_e32 +; SI: v_cndmask_b32_e32 ; SI: v_cvt_f16_f32_e32 ; SI: v_cvt_f16_f32_e32 -; VI: v_cmp_lt_f16_e64 ; VI: v_cmp_lt_f16_e32 -; VI: v_cndmask_b32_e64 +; VI: v_cndmask_b32_e32 +; VI: v_cmp_lt_f16_e32 ; VI: v_cndmask_b32_e32 ; GCN: s_endpgm @@ -196,13 +196,17 @@ entry: ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 -; SI-DAG: v_cmp_gt_f32_e64 -; SI-DAG: v_cmp_lt_f32_e32 vcc, 0.5 -; VI: v_cmp_lt_f16_e32 -; VI: v_cmp_gt_f16_e64 -; GCN: v_cndmask_b32_e32 -; GCN: v_cndmask_b32_e64 +; SI: v_cmp_lt_f32_e32 vcc, 0.5 +; SI: v_cndmask_b32_e32 +; SI: v_cmp_gt_f32_e32 +; SI: v_cndmask_b32_e32 + +; VI: v_cmp_lt_f16_e32 +; VI: v_cndmask_b32_e32 +; VI: v_cmp_gt_f16_e32 +; VI: v_cndmask_b32_e32 + ; SI: 
v_cvt_f16_f32_e32 ; SI: v_cvt_f16_f32_e32 ; GCN: s_endpgm @@ -228,13 +232,16 @@ entry: ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 -; SI-DAG: v_cmp_lt_f32_e64 -; SI-DAG: v_cmp_gt_f32_e32 vcc, 0.5 -; VI: v_cmp_gt_f16_e32 -; VI: v_cmp_lt_f16_e64 -; GCN: v_cndmask_b32_e32 -; GCN: v_cndmask_b32_e64 +; SI: v_cmp_gt_f32_e32 vcc, 0.5 +; SI: v_cndmask_b32_e32 +; SI: v_cmp_lt_f32_e32 +; SI: v_cndmask_b32_e32 + +; VI: v_cmp_gt_f16_e32 +; VI: v_cndmask_b32_e32 +; VI: v_cmp_lt_f16_e32 +; VI: v_cndmask_b32_e32 ; SI: v_cvt_f16_f32_e32 ; SI: v_cvt_f16_f32_e32 @@ -263,8 +270,8 @@ entry: ; SI: v_cvt_f32_f16_e32 ; SI: v_cmp_nlt_f32_e32 -; SI: v_cmp_nlt_f32_e64 -; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e32 +; SI: v_cmp_nlt_f32_e32 ; SI: v_cndmask_b32_e32 ; VI: v_cmp_nlt_f16_e32 @@ -298,13 +305,17 @@ entry: ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 -; SI: v_cmp_lt_f32_e64 + ; SI: v_cmp_lt_f32_e32 +; SI: v_cndmask_b32 +; SI: v_cmp_lt_f32_e32 +; SI: v_cndmask_b32 ; VI: v_cmp_lt_f16_e32 -; VI: v_cmp_lt_f16_e64 -; GCN: v_cndmask_b32 -; GCN: v_cndmask_b32 +; VI: v_cndmask_b32 +; VI: v_cmp_lt_f16_e32 +; VI: v_cndmask_b32 + ; SI: v_cvt_f16_f32_e32 ; SI: v_cvt_f16_f32_e32 ; GCN: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/setcc.ll b/llvm/test/CodeGen/AMDGPU/setcc.ll index f63719d62a84..a3bf167e756a 100644 --- a/llvm/test/CodeGen/AMDGPU/setcc.ll +++ b/llvm/test/CodeGen/AMDGPU/setcc.ll @@ -7,8 +7,8 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone ; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW]}}, KC0[3].X, KC0[3].Z ; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW]}}, KC0[2].W, KC0[3].Y -; GCN-DAG: v_cmp_eq_u32_e32 -; GCN-DAG: v_cmp_eq_u32_e64 +; GCN: v_cmp_eq_u32_e32 +; GCN: v_cmp_eq_u32_e32 define amdgpu_kernel void @setcc_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 { %result = icmp eq <2 x i32> %a, %b %sext = sext <2 x i1> %result to <2 x i32> @@ -23,9 +23,9 @@ define amdgpu_kernel void @setcc_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> % ; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; GCN: v_cmp_eq_u32_e32 -; GCN: v_cmp_eq_u32_e64 -; GCN: v_cmp_eq_u32_e64 -; GCN: v_cmp_eq_u32_e64 +; GCN: v_cmp_eq_u32_e32 +; GCN: v_cmp_eq_u32_e32 +; GCN: v_cmp_eq_u32_e32 define amdgpu_kernel void @setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 %a = load <4 x i32>, <4 x i32> addrspace(1)* %in diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll index f6a72633d418..5754bd9bb913 100644 --- a/llvm/test/CodeGen/AMDGPU/uaddo.ll +++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll @@ -58,8 +58,8 @@ define amdgpu_kernel void @v_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* } ; FUNC-LABEL: {{^}}v_uaddo_i32_novcc: -; GCN: v_add_i32_e64 v{{[0-9]+}}, [[COND:s\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, [[COND]] +; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc ; EG: ADDC_UINT ; EG: ADD_INT diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll index 99c759a17875..a8b35a22fb1e 100644 --- a/llvm/test/CodeGen/AMDGPU/usubo.ll +++ b/llvm/test/CodeGen/AMDGPU/usubo.ll @@ -58,8 +58,8 @@ define amdgpu_kernel void @v_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* } ; FUNC-LABEL: {{^}}v_usubo_i32_novcc: -; GCN: v_sub_i32_e64 v{{[0-9]+}}, [[COND:s\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, 
v{{[0-9]+}} -; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, [[COND]] +; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc ; EG-DAG: SUBB_UINT ; EG-DAG: SUB_INT diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll index d4a68a418ee4..5cbfae34e1bb 100644 --- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll +++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll @@ -200,9 +200,9 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(i32 addrspace(1)* % ; SI-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v[[Z_HI]], vcc ; SI-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2, v[[Z_LO]], vcc -; VI-DAG: v_cmp_lt_i64_e64 s{{\[[0-9]+:[0-9]+\]}}, -1, v{{\[}}[[X_LO]]:[[X_HI]]{{\]}} -; VI-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v[[Z_HI]], s -; VI-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 2, v[[Z_LO]], s +; VI-DAG: v_cmp_lt_i64_e32 vcc, -1, v{{\[}}[[X_LO]]:[[X_HI]]{{\]}} +; VI-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v[[Z_HI]], vcc +; VI-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2, v[[Z_LO]], vcc define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %x.ptr, i64 addrspace(1)* %z.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -292,10 +292,10 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(<4 x float> addrs ; GCN-LABEL: {{^}}icmp_vgprX_k0_select_k1_vgprZ_i1: ; GCN: load_dword ; GCN: load_ubyte -; GCN-DAG: v_cmp_gt_i32_e64 s{{\[[0-9]+:[0-9]+\]}}, 0, v +; GCN-DAG: v_cmp_gt_i32_e32 vcc, 0, v ; DCN-DAG: v_and_b32_e32 v{{[0-9]+}}, 1, -; GCN-DAG: v_cmp_eq_u32_e32 vcc, 1, v -; GCN-DAG: s_or_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, vcc +; GCN-DAG: v_cmp_eq_u32_e64 s{{\[[0-9]+:[0-9]+\]}}, 1, v +; GCN-DAG: s_or_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc, s{{\[[0-9]+:[0-9]+\]}} ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, s ; GCN: store_byte define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %x.ptr, i1 addrspace(1)* %z.ptr) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/vselect.ll b/llvm/test/CodeGen/AMDGPU/vselect.ll index bb6234729f90..02ffd30be5fd 100644 --- a/llvm/test/CodeGen/AMDGPU/vselect.ll +++ b/llvm/test/CodeGen/AMDGPU/vselect.ll @@ -7,7 +7,9 @@ ; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].Z ; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].Y -; SI: v_cndmask_b32_e64 +; SI: v_cmp_gt_i32_e32 vcc +; SI: v_cndmask_b32_e32 +; SI: v_cmp_gt_i32_e32 vcc ; SI: v_cndmask_b32_e32 define amdgpu_kernel void @test_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1, <2 x i32> %val) { @@ -25,8 +27,11 @@ entry: ; EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;SI: v_cndmask_b32_e64 -;SI: v_cndmask_b32_e32 + +; SI: v_cmp_neq_f32_e32 vcc +; SI: v_cndmask_b32_e32 +; SI: v_cmp_neq_f32_e32 vcc +; SI: v_cndmask_b32_e32 define amdgpu_kernel void @test_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in0, <2 x float> addrspace(1)* %in1) { entry: @@ -45,12 +50,10 @@ entry: ; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].Z ; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].Y -; FIXME: The shrinking does not happen on tonga - -; SI: v_cndmask_b32 -; SI: v_cndmask_b32 -; SI: 
v_cndmask_b32 -; SI: v_cndmask_b32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 define amdgpu_kernel void @test_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1, <4 x i32> %val) { entry: @@ -68,6 +71,10 @@ entry: ;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 define amdgpu_kernel void @test_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in0, <4 x float> addrspace(1)* %in1) { entry: %0 = load <4 x float>, <4 x float> addrspace(1)* %in0