AMDGPU/GlobalISel Check for NoNaNsFPMath in isKnownNeverSNaN

Check for NoNaNsFPMath function attribute in isKnownNeverSNaN.
Function attributes are held in 'TargetMachine.Options'.
Among other things, this allows selection of some patterns imported
in D87351 since G_FCANONICALIZE is not generated when isKnownNeverSNaN
returns true in lowerFMinNumMaxNum.

However, we notice some incorrect results since function attributes are
not correctly written in TargetMachine.Options when the next function is
processed. Take a look at @v_test_no_global_nnans_med3_f32_pat0_srcmod0:
it has "no-nans-fp-math"="false", but TargetMachine.Options still has it
set to true since the first function in the test file had this attribute
set to true. This will be fixed in D87511.

Differential Revision: https://reviews.llvm.org/D87456
This commit is contained in:
Petar Avramovic 2020-09-14 12:03:36 +02:00
parent 416346d1ca
commit 6e2a86ed5a
3 changed files with 593 additions and 2 deletions

View File

@ -26,6 +26,7 @@
#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/Constants.h" #include "llvm/IR/Constants.h"
#include "llvm/Target/TargetMachine.h"
#define DEBUG_TYPE "globalisel-utils" #define DEBUG_TYPE "globalisel-utils"
@ -470,7 +471,8 @@ bool llvm::isKnownNeverNaN(Register Val, const MachineRegisterInfo &MRI,
if (!DefMI) if (!DefMI)
return false; return false;
if (DefMI->getFlag(MachineInstr::FmNoNans)) const TargetMachine& TM = DefMI->getMF()->getTarget();
if (DefMI->getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath)
return true; return true;
if (SNaN) { if (SNaN) {

View File

@ -3167,7 +3167,7 @@ AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const {
Register Src; Register Src;
unsigned Mods; unsigned Mods;
std::tie(Src, Mods) = selectVOP3ModsImpl(Root); std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
if (!TM.Options.NoNaNsFPMath && !isKnownNeverNaN(Src, *MRI)) if (!isKnownNeverNaN(Src, *MRI))
return None; return None;
return {{ return {{

View File

@ -0,0 +1,589 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s
; Positive test: the IR computes maxnum(minnum(x, b), minnum(maxnum(x, b), c))
; where x is %a negated with 'fsub float -0.0, %a'. The kernel uses attribute
; group #2 ("no-nans-fp-math"="true"), and the checks for all three run lines
; expect the min/max chain to be selected as a single v_med3_f32.
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
; SI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b64 s[8:9], s[4:5]
; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_sub_f32_e32 v2, 0x80000000, v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_med3_f32 v2, v2, v3, v4
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v8, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v8
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v8
; VI-NEXT: v_mov_b32_e32 v4, s6
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v8
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v1, v[2:3]
; VI-NEXT: flat_load_dword v2, v[4:5]
; VI-NEXT: v_mov_b32_e32 v7, s1
; VI-NEXT: v_mov_b32_e32 v6, s0
; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v8
; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
; VI-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
; VI-NEXT: v_sub_f32_e32 v0, 0x80000000, v0
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_med3_f32 v0, v0, v1, v2
; VI-NEXT: flat_store_dword v[6:7], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: global_load_dword v2, v0, s[4:5]
; GFX9-NEXT: global_load_dword v3, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_sub_f32_e32 v1, 0x80000000, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
%gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
%gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
%outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
%a = load volatile float, float addrspace(1)* %gep0
%b = load volatile float, float addrspace(1)* %gep1
%c = load volatile float, float addrspace(1)* %gep2
%a.fneg = fsub float -0.0, %a
%tmp0 = call float @llvm.minnum.f32(float %a.fneg, float %b)
%tmp1 = call float @llvm.maxnum.f32(float %a.fneg, float %b)
%tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
%med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
store float %med3, float addrspace(1)* %outgep
ret void
}
; Same IR body as @v_test_global_nnans_med3_f32_pat0_srcmod0, but this kernel
; uses attribute group #1 ("no-nans-fp-math"="false") and none of its
; instructions carry the nnan flag.
; FIXME: The checks below still expect v_med3_f32. This is a known-incorrect
; result: TargetMachine.Options.NoNaNsFPMath is not rewritten when this
; function is processed, so it still holds the 'true' value taken from the
; preceding "no-nans-fp-math"="true" kernel. Regenerate these checks once the
; per-function option reset (D87511) lands.
define amdgpu_kernel void @v_test_no_global_nnans_med3_f32_pat0_srcmod0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
; SI-LABEL: v_test_no_global_nnans_med3_f32_pat0_srcmod0:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b64 s[8:9], s[4:5]
; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_sub_f32_e32 v2, 0x80000000, v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_med3_f32 v2, v2, v3, v4
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_no_global_nnans_med3_f32_pat0_srcmod0:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v8, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v8
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v8
; VI-NEXT: v_mov_b32_e32 v4, s6
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v8
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v1, v[2:3]
; VI-NEXT: flat_load_dword v2, v[4:5]
; VI-NEXT: v_mov_b32_e32 v7, s1
; VI-NEXT: v_mov_b32_e32 v6, s0
; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v8
; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
; VI-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
; VI-NEXT: v_sub_f32_e32 v0, 0x80000000, v0
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_med3_f32 v0, v0, v1, v2
; VI-NEXT: flat_store_dword v[6:7], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_no_global_nnans_med3_f32_pat0_srcmod0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: global_load_dword v2, v0, s[4:5]
; GFX9-NEXT: global_load_dword v3, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_sub_f32_e32 v1, 0x80000000, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
%gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
%gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
%outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
%a = load volatile float, float addrspace(1)* %gep0
%b = load volatile float, float addrspace(1)* %gep1
%c = load volatile float, float addrspace(1)* %gep2
%a.fneg = fsub float -0.0, %a
%tmp0 = call float @llvm.minnum.f32(float %a.fneg, float %b)
%tmp1 = call float @llvm.maxnum.f32(float %a.fneg, float %b)
%tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
%med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
store float %med3, float addrspace(1)* %outgep
ret void
}
; Positive test with a modifier on every med3 input: %a is negated, %b goes
; through fabs, and %c goes through fabs followed by a negation. The kernel
; uses attribute group #2 ("no-nans-fp-math"="true"). The checks expect a
; single v_med3_f32 whose second operand carries the |...| modifier for
; fabs(%b); the negations show up as separate v_sub_f32 instructions from
; 0x80000000.
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
; SI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b64 s[8:9], s[4:5]
; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s2, 0x80000000
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_sub_f32_e32 v2, s2, v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_sub_f32_e64 v4, s2, |v4|
; SI-NEXT: v_med3_f32 v2, v2, |v3|, v4
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-NEXT: v_mov_b32_e32 v4, s6
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dword v7, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: flat_load_dword v3, v[4:5]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_mov_b32 s2, 0x80000000
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
; VI-NEXT: v_sub_f32_e32 v4, s2, v7
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_sub_f32_e64 v3, s2, |v3|
; VI-NEXT: v_med3_f32 v2, v4, |v2|, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: global_load_dword v2, v0, s[4:5]
; GFX9-NEXT: global_load_dword v3, v0, s[6:7]
; GFX9-NEXT: s_mov_b32 s2, 0x80000000
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_sub_f32_e32 v1, s2, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_f32_e64 v3, s2, |v3|
; GFX9-NEXT: v_med3_f32 v1, v1, |v2|, v3
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
%gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
%gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
%outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
%a = load volatile float, float addrspace(1)* %gep0
%b = load volatile float, float addrspace(1)* %gep1
%c = load volatile float, float addrspace(1)* %gep2
%a.fneg = fsub float -0.0, %a
%b.fabs = call float @llvm.fabs.f32(float %b)
%c.fabs = call float @llvm.fabs.f32(float %c)
%c.fabs.fneg = fsub float -0.0, %c.fabs
%tmp0 = call float @llvm.minnum.f32(float %a.fneg, float %b.fabs)
%tmp1 = call float @llvm.maxnum.f32(float %a.fneg, float %b.fabs)
%tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.fabs.fneg)
%med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
store float %med3, float addrspace(1)* %outgep
ret void
}
; Positive test where all three med3 inputs are -fabs(x): each loaded value
; goes through llvm.fabs.f32 and is then negated with 'fsub float -0.0'. The
; kernel uses attribute group #2 ("no-nans-fp-math"="true"). The checks expect
; three v_sub_f32_e64 instructions with a |...| source operand, followed by a
; single v_med3_f32 on their results.
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
; SI-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b64 s[8:9], s[4:5]
; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s2, 0x80000000
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_sub_f32_e64 v2, s2, |v2|
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_sub_f32_e64 v3, s2, |v3|
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_sub_f32_e64 v4, s2, |v4|
; SI-NEXT: v_med3_f32 v2, v2, v3, v4
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-NEXT: v_mov_b32_e32 v4, s6
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dword v7, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: flat_load_dword v3, v[4:5]
; VI-NEXT: s_mov_b32 s2, 0x80000000
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
; VI-NEXT: v_sub_f32_e64 v4, s2, |v7|
; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
; VI-NEXT: v_sub_f32_e64 v2, s2, |v2|
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_sub_f32_e64 v3, s2, |v3|
; VI-NEXT: v_med3_f32 v2, v4, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: global_load_dword v2, v0, s[4:5]
; GFX9-NEXT: global_load_dword v3, v0, s[6:7]
; GFX9-NEXT: s_mov_b32 s2, 0x80000000
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_sub_f32_e64 v1, s2, |v1|
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_sub_f32_e64 v2, s2, |v2|
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_f32_e64 v3, s2, |v3|
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
%gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
%gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
%outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
%a = load volatile float, float addrspace(1)* %gep0
%b = load volatile float, float addrspace(1)* %gep1
%c = load volatile float, float addrspace(1)* %gep2
%a.fabs = call float @llvm.fabs.f32(float %a)
%a.fabs.fneg = fsub float -0.0, %a.fabs
%b.fabs = call float @llvm.fabs.f32(float %b)
%b.fabs.fneg = fsub float -0.0, %b.fabs
%c.fabs = call float @llvm.fabs.f32(float %c)
%c.fabs.fneg = fsub float -0.0, %c.fabs
%tmp0 = call float @llvm.minnum.f32(float %a.fabs.fneg, float %b.fabs.fneg)
%tmp1 = call float @llvm.maxnum.f32(float %a.fabs.fneg, float %b.fabs.fneg)
%tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.fabs.fneg)
%med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
store float %med3, float addrspace(1)* %outgep
ret void
}
; med3 pattern whose three inputs are each produced by an 'fadd nnan'. The
; kernel uses attribute group #1 ("no-nans-fp-math"="false"), so only the
; per-instruction nnan flags mark the inputs as NaN-free. The checks expect
; three v_add_f32 instructions followed by a single v_med3_f32.
; NOTE(review): presumably med3 is selected here because of the nnan flags
; alone. Since TargetMachine.Options.NoNaNsFPMath may still be stale from an
; earlier "no-nans-fp-math"="true" kernel in this file, confirm the output is
; unchanged once D87511 lands.
define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
; SI-LABEL: v_nnan_inputs_med3_f32_pat0:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b64 s[8:9], s[4:5]
; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_add_f32_e32 v2, 1.0, v2
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_add_f32_e32 v3, 2.0, v3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_f32_e32 v4, 4.0, v4
; SI-NEXT: v_med3_f32 v2, v2, v3, v4
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_nnan_inputs_med3_f32_pat0:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-NEXT: v_mov_b32_e32 v4, s6
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dword v7, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: flat_load_dword v3, v[4:5]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
; VI-NEXT: v_add_f32_e32 v4, 1.0, v7
; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
; VI-NEXT: v_add_f32_e32 v2, 2.0, v2
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_f32_e32 v3, 4.0, v3
; VI-NEXT: v_med3_f32 v2, v4, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_nnan_inputs_med3_f32_pat0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: global_load_dword v2, v0, s[4:5]
; GFX9-NEXT: global_load_dword v3, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_add_f32_e32 v2, 2.0, v2
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v3, 4.0, v3
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
%gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
%gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
%outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
%a = load volatile float, float addrspace(1)* %gep0
%b = load volatile float, float addrspace(1)* %gep1
%c = load volatile float, float addrspace(1)* %gep2
%a.nnan = fadd nnan float %a, 1.0
%b.nnan = fadd nnan float %b, 2.0
%c.nnan = fadd nnan float %c, 4.0
%tmp0 = call float @llvm.minnum.f32(float %a.nnan, float %b.nnan)
%tmp1 = call float @llvm.maxnum.f32(float %a.nnan, float %b.nnan)
%tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.nnan)
%med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
store float %med3, float addrspace(1)* %outgep
ret void
}
; ---------------------------------------------------------------------
; Negative patterns
; ---------------------------------------------------------------------
; Negative test: the inner minnum result %tmp0 has a second use (a volatile
; store to an undef pointer) in addition to feeding the final maxnum. The
; kernel uses attribute group #1 ("no-nans-fp-math"="false"). The checks
; expect separate v_min_f32 / v_max_f32 instructions and no v_med3_f32.
define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
; SI-LABEL: v_test_safe_med3_f32_pat0_multi_use0:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b64 s[8:9], s[4:5]
; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s3, s11
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_min_f32_e32 v5, v2, v3
; SI-NEXT: v_max_f32_e32 v2, v2, v3
; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_min_f32_e32 v2, v2, v4
; SI-NEXT: v_max_f32_e32 v2, v5, v2
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_safe_med3_f32_pat0_multi_use0:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-NEXT: v_mov_b32_e32 v4, s6
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dword v7, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: flat_load_dword v3, v[4:5]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
; VI-NEXT: v_min_f32_e32 v4, v7, v2
; VI-NEXT: v_max_f32_e32 v2, v7, v2
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_min_f32_e32 v2, v2, v3
; VI-NEXT: v_max_f32_e32 v2, v4, v2
; VI-NEXT: flat_store_dword v[0:1], v4
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_safe_med3_f32_pat0_multi_use0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: global_load_dword v2, v0, s[4:5]
; GFX9-NEXT: global_load_dword v3, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_min_f32_e32 v4, v1, v2
; GFX9-NEXT: v_max_f32_e32 v1, v1, v2
; GFX9-NEXT: global_store_dword v[0:1], v4, off
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_min_f32_e32 v1, v1, v3
; GFX9-NEXT: v_max_f32_e32 v1, v4, v1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
%gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
%gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
%outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
%a = load volatile float, float addrspace(1)* %gep0
%b = load volatile float, float addrspace(1)* %gep1
%c = load volatile float, float addrspace(1)* %gep2
%tmp0 = call float @llvm.minnum.f32(float %a, float %b)
store volatile float %tmp0, float addrspace(1)* undef
%tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
%tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
%med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
store float %med3, float addrspace(1)* %outgep
ret void
}
; Intrinsic declarations; all of them use attribute group #0.
declare i32 @llvm.amdgcn.workitem.id.x() #0
declare float @llvm.fabs.f32(float) #0
declare float @llvm.minnum.f32(float, float) #0
declare float @llvm.maxnum.f32(float, float) #0
declare double @llvm.minnum.f64(double, double) #0
declare double @llvm.maxnum.f64(double, double) #0
declare half @llvm.fabs.f16(half) #0
declare half @llvm.minnum.f16(half, half) #0
declare half @llvm.maxnum.f16(half, half) #0
; Attribute groups:
;   #0 - applied to the intrinsic declarations above.
;   #1 - kernels compiled with "no-nans-fp-math"="false".
;   #2 - kernels compiled with "no-nans-fp-math"="true".
; The two kernel groups differ only in the "no-nans-fp-math" value.
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="false" }
attributes #2 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="true" }