forked from OSchip/llvm-project
R600/SI: Use V_FRACT_F64 for faster 64-bit floor on SI
Other f64 opcodes not supported on SI can be lowered in a similar way.

v2: use complex VOP3 patterns

llvm-svn: 233076
This commit is contained in:
parent
43650e45c3
commit
7d77728c97
|
@ -202,10 +202,10 @@ SITargetLowering::SITargetLowering(TargetMachine &TM,
|
||||||
if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
|
if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
|
||||||
setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
|
setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
|
||||||
setOperationAction(ISD::FCEIL, MVT::f64, Legal);
|
setOperationAction(ISD::FCEIL, MVT::f64, Legal);
|
||||||
setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
|
|
||||||
setOperationAction(ISD::FRINT, MVT::f64, Legal);
|
setOperationAction(ISD::FRINT, MVT::f64, Legal);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
|
||||||
setOperationAction(ISD::FDIV, MVT::f32, Custom);
|
setOperationAction(ISD::FDIV, MVT::f32, Custom);
|
||||||
setOperationAction(ISD::FDIV, MVT::f64, Custom);
|
setOperationAction(ISD::FDIV, MVT::f64, Custom);
|
||||||
|
|
||||||
|
|
|
@ -727,6 +727,26 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
|
||||||
MI->eraseFromParent();
|
MI->eraseFromParent();
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
case AMDGPU::V_CNDMASK_B64_PSEUDO: {
|
||||||
|
unsigned Dst = MI->getOperand(0).getReg();
|
||||||
|
unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
|
||||||
|
unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
|
||||||
|
unsigned Src0 = MI->getOperand(1).getReg();
|
||||||
|
unsigned Src1 = MI->getOperand(2).getReg();
|
||||||
|
const MachineOperand &SrcCond = MI->getOperand(3);
|
||||||
|
|
||||||
|
BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
|
||||||
|
.addReg(RI.getSubReg(Src0, AMDGPU::sub0))
|
||||||
|
.addReg(RI.getSubReg(Src1, AMDGPU::sub0))
|
||||||
|
.addOperand(SrcCond);
|
||||||
|
BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
|
||||||
|
.addReg(RI.getSubReg(Src0, AMDGPU::sub1))
|
||||||
|
.addReg(RI.getSubReg(Src1, AMDGPU::sub1))
|
||||||
|
.addOperand(SrcCond);
|
||||||
|
MI->eraseFromParent();
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
|
@ -320,6 +320,7 @@ def SIOperand {
|
||||||
|
|
||||||
def SRCMODS {
|
def SRCMODS {
|
||||||
int NONE = 0;
|
int NONE = 0;
|
||||||
|
int NEG = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
def DSTCLAMP {
|
def DSTCLAMP {
|
||||||
|
|
|
@ -28,6 +28,8 @@ def SendMsgImm : Operand<i32> {
|
||||||
|
|
||||||
def isGCN : Predicate<"Subtarget->getGeneration() "
|
def isGCN : Predicate<"Subtarget->getGeneration() "
|
||||||
">= AMDGPUSubtarget::SOUTHERN_ISLANDS">;
|
">= AMDGPUSubtarget::SOUTHERN_ISLANDS">;
|
||||||
|
def isSI : Predicate<"Subtarget->getGeneration() "
|
||||||
|
"== AMDGPUSubtarget::SOUTHERN_ISLANDS">;
|
||||||
def isSICI : Predicate<
|
def isSICI : Predicate<
|
||||||
"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||"
|
"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||"
|
||||||
"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS"
|
"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS"
|
||||||
|
@ -1874,6 +1876,11 @@ defm V_ASHRREV_I64 : VOP3Inst <vop3<0, 0x291>, "v_ashrrev_i64",
|
||||||
//===----------------------------------------------------------------------===//
|
//===----------------------------------------------------------------------===//
|
||||||
let isCodeGenOnly = 1, isPseudo = 1 in {
|
let isCodeGenOnly = 1, isPseudo = 1 in {
|
||||||
|
|
||||||
|
// For use in patterns
|
||||||
|
def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$dst),
|
||||||
|
(ins VSrc_64:$src0, VSrc_64:$src1, SSrc_64:$src2), "", []
|
||||||
|
>;
|
||||||
|
|
||||||
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
|
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
|
||||||
// 64-bit vector move instruction. This is mainly used by the SIFoldOperands
|
// 64-bit vector move instruction. This is mainly used by the SIFoldOperands
|
||||||
// pass to enable folding of inline immediates.
|
// pass to enable folding of inline immediates.
|
||||||
|
@ -3320,6 +3327,50 @@ def : Pat <
|
||||||
// Fract Patterns
|
// Fract Patterns
|
||||||
//===----------------------------------------------------------------------===//
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
let Predicates = [isSI] in {
|
||||||
|
|
||||||
|
// V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) is
|
||||||
|
// used instead. However, SI doesn't have V_FLOOR_F64, so the most efficient
|
||||||
|
// way to implement it is using V_FRACT_F64.
|
||||||
|
// The workaround for the V_FRACT bug is:
|
||||||
|
// fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
|
||||||
|
//
|
||||||
|
// V_CNDMASK_B64_PSEUDO is expanded into two V_CNDMASK_B32_e64, which return
|
||||||
|
// src1 where the condition bit is set and src0 otherwise. The condition used
|
||||||
|
// below is the NaN class test, so the clamped V_FRACT result must be src0 and
|
||||||
|
// the pass-through $x must be src1.
|
||||||
|
|
||||||
|
// Convert (x + (-floor(x))) to fract(x)
|
||||||
|
def : Pat <
|
||||||
|
(f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)),
|
||||||
|
(f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))),
|
||||||
|
(V_CNDMASK_B64_PSEUDO
|
||||||
|
(V_MIN_F64
|
||||||
|
SRCMODS.NONE,
|
||||||
|
(V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE),
|
||||||
|
SRCMODS.NONE,
|
||||||
|
(V_MOV_B64_PSEUDO 0x3fefffffffffffff),
|
||||||
|
DSTCLAMP.NONE, DSTOMOD.NONE),
|
||||||
|
$x,
|
||||||
|
(V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, 3/*NaN*/))
|
||||||
|
>;
|
||||||
|
|
||||||
|
// Convert floor(x) to (x - fract(x))
|
||||||
|
def : Pat <
|
||||||
|
(f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))),
|
||||||
|
(V_ADD_F64
|
||||||
|
$mods,
|
||||||
|
$x,
|
||||||
|
SRCMODS.NEG,
|
||||||
|
(V_CNDMASK_B64_PSEUDO
|
||||||
|
(V_MIN_F64
|
||||||
|
SRCMODS.NONE,
|
||||||
|
(V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE),
|
||||||
|
SRCMODS.NONE,
|
||||||
|
(V_MOV_B64_PSEUDO 0x3fefffffffffffff),
|
||||||
|
DSTCLAMP.NONE, DSTOMOD.NONE),
|
||||||
|
$x,
|
||||||
|
(V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, 3/*NaN*/)),
|
||||||
|
DSTCLAMP.NONE, DSTOMOD.NONE)
|
||||||
|
>;
|
||||||
|
|
||||||
|
} // End Predicates = [isSI]
|
||||||
|
|
||||||
let Predicates = [isCI] in {
|
let Predicates = [isCI] in {
|
||||||
|
|
||||||
// Convert (x - floor(x)) to fract(x)
|
// Convert (x - floor(x)) to fract(x)
|
||||||
|
|
|
@ -1,7 +1,8 @@
|
||||||
; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
|
; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
|
||||||
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
|
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
|
||||||
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
|
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
|
||||||
|
|
||||||
|
declare double @llvm.fabs.f64(double %Val)
|
||||||
declare double @llvm.floor.f64(double) nounwind readnone
|
declare double @llvm.floor.f64(double) nounwind readnone
|
||||||
declare <2 x double> @llvm.floor.v2f64(<2 x double>) nounwind readnone
|
declare <2 x double> @llvm.floor.v2f64(<2 x double>) nounwind readnone
|
||||||
declare <3 x double> @llvm.floor.v3f64(<3 x double>) nounwind readnone
|
declare <3 x double> @llvm.floor.v3f64(<3 x double>) nounwind readnone
|
||||||
|
@ -11,24 +12,11 @@ declare <16 x double> @llvm.floor.v16f64(<16 x double>) nounwind readnone
|
||||||
|
|
||||||
; FUNC-LABEL: {{^}}ffloor_f64:
|
; FUNC-LABEL: {{^}}ffloor_f64:
|
||||||
; CI: v_floor_f64_e32
|
; CI: v_floor_f64_e32
|
||||||
|
; SI: v_fract_f64_e32
|
||||||
; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014
|
; SI: v_min_f64
|
||||||
; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
|
; SI: v_cmp_class_f64_e64
|
||||||
; SI: s_add_i32 s{{[0-9]+}}, [[SEXP]], 0xfffffc01
|
; SI: v_cndmask_b32_e64
|
||||||
; SI: s_lshr_b64
|
; SI: v_cndmask_b32_e64
|
||||||
; SI: s_not_b64
|
|
||||||
; SI: s_and_b64
|
|
||||||
; SI: cmp_gt_i32
|
|
||||||
; SI: cndmask_b32
|
|
||||||
; SI: cndmask_b32
|
|
||||||
; SI: cmp_lt_i32
|
|
||||||
; SI: cndmask_b32
|
|
||||||
; SI: cndmask_b32
|
|
||||||
; SI-DAG: v_cmp_gt_f64
|
|
||||||
; SI-DAG: v_cmp_lg_f64
|
|
||||||
; SI-DAG: s_and_b64
|
|
||||||
; SI-DAG: v_cndmask_b32
|
|
||||||
; SI-DAG: v_cndmask_b32
|
|
||||||
; SI: v_add_f64
|
; SI: v_add_f64
|
||||||
; SI: s_endpgm
|
; SI: s_endpgm
|
||||||
define void @ffloor_f64(double addrspace(1)* %out, double %x) {
|
define void @ffloor_f64(double addrspace(1)* %out, double %x) {
|
||||||
|
@ -37,6 +25,39 @@ define void @ffloor_f64(double addrspace(1)* %out, double %x) {
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
|
; FUNC-LABEL: {{^}}ffloor_f64_neg:
|
||||||
|
; CI: v_floor_f64_e64
|
||||||
|
; SI: v_fract_f64_e64 {{v[[0-9]+:[0-9]+]}}, -[[INPUT:s[[0-9]+:[0-9]+]]]
|
||||||
|
; SI: v_min_f64
|
||||||
|
; SI: v_cmp_class_f64_e64
|
||||||
|
; SI: v_cndmask_b32_e64
|
||||||
|
; SI: v_cndmask_b32_e64
|
||||||
|
; SI: v_add_f64 {{v[[0-9]+:[0-9]+]}}, -[[INPUT]]
|
||||||
|
; SI: s_endpgm
|
||||||
|
define void @ffloor_f64_neg(double addrspace(1)* %out, double %x) {
|
||||||
|
%neg = fsub double 0.0, %x
|
||||||
|
%y = call double @llvm.floor.f64(double %neg) nounwind readnone
|
||||||
|
store double %y, double addrspace(1)* %out
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; FUNC-LABEL: {{^}}ffloor_f64_neg_abs:
|
||||||
|
; CI: v_floor_f64_e64
|
||||||
|
; SI: v_fract_f64_e64 {{v[[0-9]+:[0-9]+]}}, -|[[INPUT:s[[0-9]+:[0-9]+]]]|
|
||||||
|
; SI: v_min_f64
|
||||||
|
; SI: v_cmp_class_f64_e64
|
||||||
|
; SI: v_cndmask_b32_e64
|
||||||
|
; SI: v_cndmask_b32_e64
|
||||||
|
; SI: v_add_f64 {{v[[0-9]+:[0-9]+]}}, -|[[INPUT]]|
|
||||||
|
; SI: s_endpgm
|
||||||
|
define void @ffloor_f64_neg_abs(double addrspace(1)* %out, double %x) {
|
||||||
|
%abs = call double @llvm.fabs.f64(double %x)
|
||||||
|
%neg = fsub double 0.0, %abs
|
||||||
|
%y = call double @llvm.floor.f64(double %neg) nounwind readnone
|
||||||
|
store double %y, double addrspace(1)* %out
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
; FUNC-LABEL: {{^}}ffloor_v2f64:
|
; FUNC-LABEL: {{^}}ffloor_v2f64:
|
||||||
; CI: v_floor_f64_e32
|
; CI: v_floor_f64_e32
|
||||||
; CI: v_floor_f64_e32
|
; CI: v_floor_f64_e32
|
||||||
|
|
|
@ -0,0 +1,60 @@
|
||||||
|
; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
|
||||||
|
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
|
||||||
|
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
|
||||||
|
|
||||||
|
declare double @llvm.fabs.f64(double %Val)
|
||||||
|
declare double @llvm.AMDGPU.fract.f64(double) nounwind readnone
|
||||||
|
|
||||||
|
; v_cndmask_b32 yields its second source where the NaN mask bit is set, so the
|
||||||
|
; clamped fract value is the first source and the unmodified input the second.
|
||||||
|
; FUNC-LABEL: {{^}}fract_f64:
|
||||||
|
; GCN: v_fract_f64_e32 [[FRC:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]]
|
||||||
|
; SI: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1
|
||||||
|
; SI: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff
|
||||||
|
; SI: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]]
|
||||||
|
; SI: v_cmp_class_f64_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]], 3
|
||||||
|
; SI: v_cndmask_b32_e64 v[[RESLO:[0-9]+]], v[[MINLO]], v[[LO]], [[COND]]
|
||||||
|
; SI: v_cndmask_b32_e64 v[[RESHI:[0-9]+]], v[[MINHI]], v[[HI]], [[COND]]
|
||||||
|
; SI: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]]
|
||||||
|
; CI: buffer_store_dwordx2 [[FRC]]
|
||||||
|
define void @fract_f64(double addrspace(1)* %out, double addrspace(1)* %src) nounwind {
|
||||||
|
%val = load double, double addrspace(1)* %src, align 4
|
||||||
|
%fract = call double @llvm.AMDGPU.fract.f64(double %val) nounwind readnone
|
||||||
|
store double %fract, double addrspace(1)* %out, align 4
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; v_cndmask_b32 yields its second source where the NaN mask bit is set, so the
|
||||||
|
; clamped fract value is the first source and the unmodified input the second.
|
||||||
|
; FUNC-LABEL: {{^}}fract_f64_neg:
|
||||||
|
; GCN: v_fract_f64_e64 [[FRC:v\[[0-9]+:[0-9]+\]]], -v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]]
|
||||||
|
; SI: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1
|
||||||
|
; SI: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff
|
||||||
|
; SI: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]]
|
||||||
|
; SI: v_cmp_class_f64_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]], 3
|
||||||
|
; SI: v_cndmask_b32_e64 v[[RESLO:[0-9]+]], v[[MINLO]], v[[LO]], [[COND]]
|
||||||
|
; SI: v_cndmask_b32_e64 v[[RESHI:[0-9]+]], v[[MINHI]], v[[HI]], [[COND]]
|
||||||
|
; SI: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]]
|
||||||
|
; CI: buffer_store_dwordx2 [[FRC]]
|
||||||
|
define void @fract_f64_neg(double addrspace(1)* %out, double addrspace(1)* %src) nounwind {
|
||||||
|
%val = load double, double addrspace(1)* %src, align 4
|
||||||
|
%neg = fsub double 0.0, %val
|
||||||
|
%fract = call double @llvm.AMDGPU.fract.f64(double %neg) nounwind readnone
|
||||||
|
store double %fract, double addrspace(1)* %out, align 4
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; v_cndmask_b32 yields its second source where the NaN mask bit is set, so the
|
||||||
|
; clamped fract value is the first source and the unmodified input the second.
|
||||||
|
; FUNC-LABEL: {{^}}fract_f64_neg_abs:
|
||||||
|
; GCN: v_fract_f64_e64 [[FRC:v\[[0-9]+:[0-9]+\]]], -|v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]]|
|
||||||
|
; SI: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1
|
||||||
|
; SI: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff
|
||||||
|
; SI: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]]
|
||||||
|
; SI: v_cmp_class_f64_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]], 3
|
||||||
|
; SI: v_cndmask_b32_e64 v[[RESLO:[0-9]+]], v[[MINLO]], v[[LO]], [[COND]]
|
||||||
|
; SI: v_cndmask_b32_e64 v[[RESHI:[0-9]+]], v[[MINHI]], v[[HI]], [[COND]]
|
||||||
|
; SI: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]]
|
||||||
|
; CI: buffer_store_dwordx2 [[FRC]]
|
||||||
|
define void @fract_f64_neg_abs(double addrspace(1)* %out, double addrspace(1)* %src) nounwind {
|
||||||
|
%val = load double, double addrspace(1)* %src, align 4
|
||||||
|
%abs = call double @llvm.fabs.f64(double %val)
|
||||||
|
%neg = fsub double 0.0, %abs
|
||||||
|
%fract = call double @llvm.AMDGPU.fract.f64(double %neg) nounwind readnone
|
||||||
|
store double %fract, double addrspace(1)* %out, align 4
|
||||||
|
ret void
|
||||||
|
}
|
Loading…
Reference in New Issue