2012-12-12 05:25:42 +08:00
|
|
|
//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
|
|
|
|
//
|
|
|
|
// The LLVM Compiler Infrastructure
|
|
|
|
//
|
|
|
|
// This file is distributed under the University of Illinois Open Source
|
|
|
|
// License. See LICENSE.TXT for details.
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
//
|
|
|
|
/// \file
|
|
|
|
/// \brief Custom DAG lowering for SI
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
2014-07-20 19:15:07 +08:00
|
|
|
#ifdef _MSC_VER
|
|
|
|
// Provide M_PI.
|
|
|
|
#define _USE_MATH_DEFINES
|
|
|
|
#include <cmath>
|
|
|
|
#endif
|
|
|
|
|
2013-03-07 17:04:14 +08:00
|
|
|
#include "AMDGPU.h"
|
2014-06-24 02:00:31 +08:00
|
|
|
#include "AMDGPUIntrinsicInfo.h"
|
2014-02-25 05:01:28 +08:00
|
|
|
#include "AMDGPUSubtarget.h"
|
2016-04-18 17:17:29 +08:00
|
|
|
#include "SIISelLowering.h"
|
2012-12-12 05:25:42 +08:00
|
|
|
#include "SIInstrInfo.h"
|
|
|
|
#include "SIMachineFunctionInfo.h"
|
|
|
|
#include "SIRegisterInfo.h"
|
2014-08-28 03:36:53 +08:00
|
|
|
#include "llvm/ADT/BitVector.h"
|
2016-01-26 12:29:24 +08:00
|
|
|
#include "llvm/ADT/StringSwitch.h"
|
2013-03-07 17:03:52 +08:00
|
|
|
#include "llvm/CodeGen/CallingConvLower.h"
|
2012-12-12 05:25:42 +08:00
|
|
|
#include "llvm/CodeGen/MachineInstrBuilder.h"
|
|
|
|
#include "llvm/CodeGen/MachineRegisterInfo.h"
|
|
|
|
#include "llvm/CodeGen/SelectionDAG.h"
|
2016-07-29 00:42:13 +08:00
|
|
|
#include "llvm/CodeGen/Analysis.h"
|
2016-02-02 21:52:43 +08:00
|
|
|
#include "llvm/IR/DiagnosticInfo.h"
|
2013-05-24 01:10:37 +08:00
|
|
|
#include "llvm/IR/Function.h"
|
2012-12-12 05:25:42 +08:00
|
|
|
|
|
|
|
using namespace llvm;
|
|
|
|
|
2016-10-13 02:49:05 +08:00
|
|
|
// Command-line override: lower dynamic vector indexing with the GPR indexing
// mode instead of movrel instructions. Off by default.
static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));
|
|
|
|
|
|
|
|
|
2016-04-15 00:27:03 +08:00
|
|
|
/// Return the first SGPR that has not yet been allocated by \p CCInfo.
/// Aborts (llvm_unreachable) if every SGPR in SGPR_32 is already taken.
static unsigned findFirstFreeSGPR(CCState &CCInfo) {
  const unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Idx = 0; Idx != NumSGPRs; ++Idx) {
    unsigned Reg = AMDGPU::SGPR0 + Idx;
    if (!CCInfo.isAllocated(Reg))
      return Reg;
  }
  llvm_unreachable("Cannot allocate sgpr");
}
|
|
|
|
|
2016-06-24 14:30:11 +08:00
|
|
|
/// \brief Construct the SI lowering: registers legal register classes for
/// each MVT, sets per-opcode legalization actions, requests target DAG
/// combines, and picks the scheduling preference.
SITargetLowering::SITargetLowering(const TargetMachine &TM,
                                   const SISubtarget &STI)
  : AMDGPUTargetLowering(TM, STI) {
  // Register classes for the legal types. SReg_* classes are scalar (SGPR)
  // registers; VReg_* / VGPR_* are vector (VGPR) registers.
  addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
  addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);

  addRegisterClass(MVT::i32, &AMDGPU::SReg_32_XM0RegClass);
  addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);

  addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
  addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
  addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);

  addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass);
  addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass);

  addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
  addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);

  addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
  addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);

  addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
  addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);

  // 16-bit types are only legal when the subtarget has 16-bit instructions.
  if (Subtarget->has16BitInsts()) {
    addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass);
    addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass);
  }

  computeRegisterProperties(STI.getRegisterInfo());

  // We need to custom lower vector stores from local memory
  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
  setOperationAction(ISD::LOAD, MVT::i1, Custom);

  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
  setOperationAction(ISD::STORE, MVT::v8i32, Custom);
  setOperationAction(ISD::STORE, MVT::v16i32, Custom);
  setOperationAction(ISD::STORE, MVT::i1, Custom);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
  setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand);

  // Select on i1/f64 has no native form; promote to a type that does.
  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  setOperationAction(ISD::SELECT, MVT::i64, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Promote);
  AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);

  setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);

  setOperationAction(ISD::SETCC, MVT::i1, Promote);
  setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
  setOperationAction(ISD::SETCC, MVT::v4i1, Expand);

  setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
  setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);

  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);

  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC, MVT::i1, Expand);
  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  setOperationAction(ISD::BR_CC, MVT::i64, Expand);
  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  setOperationAction(ISD::BR_CC, MVT::f64, Expand);

  // We only support LOAD/STORE and vector manipulation ops for vectors
  // with > 4 elements.
  for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64}) {
    for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
      switch (Op) {
      case ISD::LOAD:
      case ISD::STORE:
      case ISD::BUILD_VECTOR:
      case ISD::BITCAST:
      case ISD::EXTRACT_VECTOR_ELT:
      case ISD::INSERT_VECTOR_ELT:
      case ISD::INSERT_SUBVECTOR:
      case ISD::EXTRACT_SUBVECTOR:
      case ISD::SCALAR_TO_VECTOR:
        break;
      case ISD::CONCAT_VECTORS:
        setOperationAction(Op, VT, Custom);
        break;
      default:
        setOperationAction(Op, VT, Expand);
        break;
      }
    }
  }

  // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
  // is expanded to avoid having two separate loops in case the index is a VGPR.

  // Most operations are naturally 32-bit vector operations. We only support
  // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
  for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);

    setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);

    setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
  }

  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
  // and output demarshalling
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);

  // We can't return success/failure, only the old value,
  // let LLVM add the comparison
  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);
  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand);

  // Address space casts are only meaningful when flat addressing exists.
  if (getSubtarget()->hasFlatAddressSpace()) {
    setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
    setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
  }

  setOperationAction(ISD::BSWAP, MVT::i32, Legal);
  setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);

  // On SI this is s_memtime and s_memrealtime on VI.
  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
  setOperationAction(ISD::TRAP, MVT::Other, Custom);

  setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
  setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);

  // f64 rounding instructions only exist from Sea Islands onward.
  if (Subtarget->getGeneration() >= SISubtarget::SEA_ISLANDS) {
    setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::f64, Legal);
    setOperationAction(ISD::FRINT, MVT::f64, Legal);
  }

  setOperationAction(ISD::FFLOOR, MVT::f64, Legal);

  setOperationAction(ISD::FSIN, MVT::f32, Custom);
  setOperationAction(ISD::FCOS, MVT::f32, Custom);
  setOperationAction(ISD::FDIV, MVT::f32, Custom);
  setOperationAction(ISD::FDIV, MVT::f64, Custom);

  // 16-bit operation actions; only reached when i16/f16 were registered above.
  if (Subtarget->has16BitInsts()) {
    setOperationAction(ISD::Constant, MVT::i16, Legal);

    setOperationAction(ISD::SMIN, MVT::i16, Legal);
    setOperationAction(ISD::SMAX, MVT::i16, Legal);

    setOperationAction(ISD::UMIN, MVT::i16, Legal);
    setOperationAction(ISD::UMAX, MVT::i16, Legal);

    setOperationAction(ISD::SETCC, MVT::i16, Promote);
    AddPromotedToType(ISD::SETCC, MVT::i16, MVT::i32);

    setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Promote);
    AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);

    setOperationAction(ISD::ROTR, MVT::i16, Promote);
    setOperationAction(ISD::ROTL, MVT::i16, Promote);

    setOperationAction(ISD::SDIV, MVT::i16, Promote);
    setOperationAction(ISD::UDIV, MVT::i16, Promote);
    setOperationAction(ISD::SREM, MVT::i16, Promote);
    setOperationAction(ISD::UREM, MVT::i16, Promote);

    setOperationAction(ISD::BSWAP, MVT::i16, Promote);
    setOperationAction(ISD::BITREVERSE, MVT::i16, Promote);

    setOperationAction(ISD::CTTZ, MVT::i16, Promote);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Promote);
    setOperationAction(ISD::CTLZ, MVT::i16, Promote);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Promote);

    setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);

    setOperationAction(ISD::BR_CC, MVT::i16, Expand);

    setOperationAction(ISD::LOAD, MVT::i16, Custom);

    setTruncStoreAction(MVT::i64, MVT::i16, Expand);

    setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
    AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
    setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
    AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);

    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
    setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);

    // F16 - Constant Actions.
    setOperationAction(ISD::ConstantFP, MVT::f16, Custom);

    // F16 - Load/Store Actions.
    setOperationAction(ISD::LOAD, MVT::f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
    setOperationAction(ISD::STORE, MVT::f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);

    // F16 - VOP1 Actions.
    setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
    setOperationAction(ISD::FCOS, MVT::f16, Promote);
    setOperationAction(ISD::FSIN, MVT::f16, Promote);
    setOperationAction(ISD::FP_TO_SINT, MVT::f16, Promote);
    setOperationAction(ISD::FP_TO_UINT, MVT::f16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::f16, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::f16, Promote);

    // F16 - VOP2 Actions.
    setOperationAction(ISD::BR_CC, MVT::f16, Expand);
    setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
    setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
    setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
    setOperationAction(ISD::FDIV, MVT::f16, Promote);

    // F16 - VOP3 Actions.
    setOperationAction(ISD::FMA, MVT::f16, Legal);
    if (!Subtarget->hasFP16Denormals())
      setOperationAction(ISD::FMAD, MVT::f16, Legal);
  }

  // Opcodes for which the target combine callback should be invoked.
  setTargetDAGCombine(ISD::FADD);
  setTargetDAGCombine(ISD::FSUB);
  setTargetDAGCombine(ISD::FMINNUM);
  setTargetDAGCombine(ISD::FMAXNUM);
  setTargetDAGCombine(ISD::SMIN);
  setTargetDAGCombine(ISD::SMAX);
  setTargetDAGCombine(ISD::UMIN);
  setTargetDAGCombine(ISD::UMAX);
  setTargetDAGCombine(ISD::SETCC);
  setTargetDAGCombine(ISD::AND);
  setTargetDAGCombine(ISD::OR);
  setTargetDAGCombine(ISD::XOR);
  setTargetDAGCombine(ISD::SINT_TO_FP);
  setTargetDAGCombine(ISD::UINT_TO_FP);
  setTargetDAGCombine(ISD::FCANONICALIZE);

  // All memory operations. Some folding on the pointer operand is done to help
  // matching the constant offsets in the addressing modes.
  setTargetDAGCombine(ISD::LOAD);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::ATOMIC_LOAD);
  setTargetDAGCombine(ISD::ATOMIC_STORE);
  setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP);
  setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
  setTargetDAGCombine(ISD::ATOMIC_SWAP);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_AND);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_OR);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);

  setSchedulingPreference(Sched::RegPressure);
}
|
|
|
|
|
2016-06-24 14:30:11 +08:00
|
|
|
/// Return the subtarget as its SI-specific subclass. The base class stores it
/// as a generic AMDGPU subtarget pointer.
const SISubtarget *SITargetLowering::getSubtarget() const {
  auto *ST = static_cast<const SISubtarget *>(Subtarget);
  return ST;
}
|
|
|
|
|
2013-06-25 10:39:35 +08:00
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
// TargetLowering queries
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
2016-04-12 22:05:04 +08:00
|
|
|
/// Describe the memory behavior of target intrinsics that touch memory.
/// Returns true (and fills \p Info) only for amdgcn.atomic.inc/dec; all other
/// intrinsics get no target-specific memory operand info.
bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                          const CallInst &CI,
                                          unsigned IntrID) const {
  if (IntrID != Intrinsic::amdgcn_atomic_inc &&
      IntrID != Intrinsic::amdgcn_atomic_dec)
    return false;

  // These are chained read-modify-write atomics on the first operand's
  // pointer: non-volatile, both reading and writing memory.
  Info.opc = ISD::INTRINSIC_W_CHAIN;
  Info.memVT = MVT::getVT(CI.getType());
  Info.ptrVal = CI.getOperand(0);
  Info.align = 0;
  Info.vol = false;
  Info.readMem = true;
  Info.writeMem = true;
  return true;
}
|
|
|
|
|
2014-10-22 00:25:08 +08:00
|
|
|
/// Report that no shuffle masks are legal on this target.
bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &,
                                          EVT) const {
  // SI has some legal vector types, but no legal vector operations. Say no
  // shuffles are legal in order to prefer scalarizing some vector operations.
  return false;
}
|
|
|
|
|
2015-07-20 22:28:41 +08:00
|
|
|
bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
|
|
|
|
// Flat instructions do not have offsets, and only have the register
|
|
|
|
// address.
|
|
|
|
return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1);
|
|
|
|
}
|
|
|
|
|
2015-08-08 04:18:34 +08:00
|
|
|
bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
|
|
|
|
// MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
|
|
|
|
// additionally can do r + r + i with addr64. 32-bit has more addressing
|
|
|
|
// mode options. Depending on the resource constant, it can also do
|
|
|
|
// (i64 r0) + (i32 r1) * (i14 i).
|
|
|
|
//
|
|
|
|
// Private arrays end up using a scratch buffer most of the time, so also
|
|
|
|
// assume those use MUBUF instructions. Scratch loads / stores are currently
|
|
|
|
// implemented as mubuf instructions with offen bit set, so slightly
|
|
|
|
// different than the normal addr64.
|
|
|
|
if (!isUInt<12>(AM.BaseOffs))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
// FIXME: Since we can split immediate into soffset and immediate offset,
|
|
|
|
// would it make sense to allow any immediate?
|
|
|
|
|
|
|
|
switch (AM.Scale) {
|
|
|
|
case 0: // r + i or just i, depending on HasBaseReg.
|
|
|
|
return true;
|
|
|
|
case 1:
|
|
|
|
return true; // We have r + r or r + i.
|
|
|
|
case 2:
|
|
|
|
if (AM.HasBaseReg) {
|
|
|
|
// Reject 2 * r + r.
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Allow 2 * r as r + r
|
|
|
|
// Or 2 * r + i is allowed as r + r + i.
|
|
|
|
return true;
|
|
|
|
default: // Don't allow n * r
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-07-09 10:09:40 +08:00
|
|
|
/// \brief Decide whether addressing mode \p AM is legal for an access of type
/// \p Ty in address space \p AS. The answer is delegated to the per-
/// instruction-class helpers (FLAT, MUBUF, SMRD/SMEM, DS), chosen by address
/// space and subtarget generation.
bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                             const AddrMode &AM, Type *Ty,
                                             unsigned AS) const {
  // No global is ever allowed as a base.
  if (AM.BaseGV)
    return false;

  switch (AS) {
  case AMDGPUAS::GLOBAL_ADDRESS: {
    if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
      // Assume the we will use FLAT for all global memory accesses
      // on VI.
      // FIXME: This assumption is currently wrong.  On VI we still use
      // MUBUF instructions for the r + i addressing mode.  As currently
      // implemented, the MUBUF instructions only work on buffer < 4GB.
      // It may be possible to support > 4GB buffers with MUBUF instructions,
      // by setting the stride value in the resource descriptor which would
      // increase the size limit to (stride * 4GB).  However, this is risky,
      // because it has never been validated.
      return isLegalFlatAddressingMode(AM);
    }

    return isLegalMUBUFAddressingMode(AM);
  }
  case AMDGPUAS::CONSTANT_ADDRESS: {
    // If the offset isn't a multiple of 4, it probably isn't going to be
    // correctly aligned.
    // FIXME: Can we get the real alignment here?
    if (AM.BaseOffs % 4 != 0)
      return isLegalMUBUFAddressingMode(AM);

    // There are no SMRD extloads, so if we have to do a small type access we
    // will use a MUBUF load.
    // FIXME?: We also need to do this if unaligned, but we don't know the
    // alignment here.
    if (DL.getTypeStoreSize(Ty) < 4)
      return isLegalMUBUFAddressingMode(AM);

    if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) {
      // SMRD instructions have an 8-bit, dword offset on SI.
      if (!isUInt<8>(AM.BaseOffs / 4))
        return false;
    } else if (Subtarget->getGeneration() == SISubtarget::SEA_ISLANDS) {
      // On CI+, this can also be a 32-bit literal constant offset. If it fits
      // in 8-bits, it can use a smaller encoding.
      if (!isUInt<32>(AM.BaseOffs / 4))
        return false;
    } else if (Subtarget->getGeneration() == SISubtarget::VOLCANIC_ISLANDS) {
      // On VI, these use the SMEM format and the offset is 20-bit in bytes.
      if (!isUInt<20>(AM.BaseOffs))
        return false;
    } else
      llvm_unreachable("unhandled generation");

    if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
      return true;

    if (AM.Scale == 1 && AM.HasBaseReg)
      return true;

    return false;
  }

  case AMDGPUAS::PRIVATE_ADDRESS:
    return isLegalMUBUFAddressingMode(AM);

  case AMDGPUAS::LOCAL_ADDRESS:
  case AMDGPUAS::REGION_ADDRESS: {
    // Basic, single offset DS instructions allow a 16-bit unsigned immediate
    // field.
    // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
    // an 8-bit dword offset but we don't know the alignment here.
    if (!isUInt<16>(AM.BaseOffs))
      return false;

    if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
      return true;

    if (AM.Scale == 1 && AM.HasBaseReg)
      return true;

    return false;
  }
  case AMDGPUAS::FLAT_ADDRESS:
  case AMDGPUAS::UNKNOWN_ADDRESS_SPACE:
    // For an unknown address space, this usually means that this is for some
    // reason being used for pure arithmetic, and not based on some addressing
    // computation. We don't have instructions that compute pointers with any
    // addressing modes, so treat them as having no offset like flat
    // instructions.
    return isLegalFlatAddressingMode(AM);

  default:
    llvm_unreachable("unhandled address space");
  }
}
|
|
|
|
|
2015-01-14 09:35:22 +08:00
|
|
|
/// \brief Whether a misaligned access of type \p VT in address space
/// \p AddrSpace with alignment \p Align is allowed; when \p IsFast is
/// non-null, it is set to whether the allowed access is also fast.
bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
                                                      unsigned AddrSpace,
                                                      unsigned Align,
                                                      bool *IsFast) const {
  if (IsFast)
    *IsFast = false;

  // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
  // which isn't a simple VT.
  // Until MVT is extended to handle this, simply check for the size and
  // rely on the condition below: allow accesses if the size is a multiple of 4.
  // (The `VT != MVT::Other` term the original carried inside the second
  // operand of the || was redundant and has been dropped.)
  if (VT == MVT::Other ||
      (VT.getSizeInBits() > 1024 && VT.getStoreSize() > 16)) {
    return false;
  }

  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
      AddrSpace == AMDGPUAS::REGION_ADDRESS) {
    // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
    // aligned, 8 byte access in a single operation using ds_read2/write2_b32
    // with adjacent offsets.
    bool AlignedBy4 = (Align % 4 == 0);
    if (IsFast)
      *IsFast = AlignedBy4;

    return AlignedBy4;
  }

  // FIXME: We have to be conservative here and assume that flat operations
  // will access scratch.  If we had access to the IR function, then we
  // could determine if any private memory was used in the function.
  if (!Subtarget->hasUnalignedScratchAccess() &&
      (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
       AddrSpace == AMDGPUAS::FLAT_ADDRESS)) {
    return false;
  }

  if (Subtarget->hasUnalignedBufferAccess()) {
    // If we have an uniform constant load, it still requires using a slow
    // buffer instruction if unaligned.
    if (IsFast) {
      *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS) ?
        (Align % 4 == 0) : true;
    }

    return true;
  }

  // Smaller than dword value must be aligned.
  if (VT.bitsLT(MVT::i32))
    return false;

  // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
  // byte-address are ignored, thus forcing Dword alignment.
  // This applies to private, global, and constant memory.
  if (IsFast)
    *IsFast = true;

  return VT.bitsGT(MVT::i32) && Align % 4 == 0;
}
|
|
|
|
|
2014-07-29 01:49:26 +08:00
|
|
|
/// Pick a wide vector type for lowering memset/memcpy when the destination is
/// dword aligned; otherwise defer to the generic choice.
EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
                                          unsigned SrcAlign, bool IsMemset,
                                          bool ZeroMemset,
                                          bool MemcpyStrSrc,
                                          MachineFunction &MF) const {
  // FIXME: Should account for address space here.

  // The default fallback uses the private pointer size as a guess for a type to
  // use. Make sure we switch these to 64-bit accesses.
  const bool DwordAligned = DstAlign >= 4;

  if (DwordAligned && Size >= 16) // XXX: Should only do for global
    return MVT::v4i32;

  if (DwordAligned && Size >= 8)
    return MVT::v2i32;

  // Use the default.
  return MVT::Other;
}
|
|
|
|
|
2015-12-02 07:04:00 +08:00
|
|
|
static bool isFlatGlobalAddrSpace(unsigned AS) {
|
|
|
|
return AS == AMDGPUAS::GLOBAL_ADDRESS ||
|
|
|
|
AS == AMDGPUAS::FLAT_ADDRESS ||
|
|
|
|
AS == AMDGPUAS::CONSTANT_ADDRESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
|
|
|
|
unsigned DestAS) const {
|
2016-06-10 10:18:02 +08:00
|
|
|
return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS);
|
2015-12-02 07:04:00 +08:00
|
|
|
}
|
|
|
|
|
2015-12-16 04:55:55 +08:00
|
|
|
bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
|
|
|
|
const MemSDNode *MemNode = cast<MemSDNode>(N);
|
|
|
|
const Value *Ptr = MemNode->getMemOperand()->getValue();
|
|
|
|
|
|
|
|
// UndefValue means this is a load of a kernel input. These are uniform.
|
2016-07-13 22:23:33 +08:00
|
|
|
// Sometimes LDS instructions have constant pointers.
|
|
|
|
// If Ptr is null, then that means this mem operand contains a
|
|
|
|
// PseudoSourceValue like GOT.
|
|
|
|
if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
|
|
|
|
isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
|
2015-12-16 04:55:55 +08:00
|
|
|
return true;
|
|
|
|
|
2016-07-13 22:23:33 +08:00
|
|
|
const Instruction *I = dyn_cast<Instruction>(Ptr);
|
2015-12-16 04:55:55 +08:00
|
|
|
return I && I->getMetadata("amdgpu.uniform");
|
|
|
|
}
|
|
|
|
|
2014-07-03 08:23:43 +08:00
|
|
|
// Prefer splitting small-element vectors into scalars during type
// legalization; everything else uses the generic policy.
TargetLoweringBase::LegalizeTypeAction
SITargetLowering::getPreferredVectorAction(EVT VT) const {
  const bool HasSmallElts = VT.getScalarType().bitsLE(MVT::i16);
  if (HasSmallElts && VT.getVectorNumElements() != 1)
    return TypeSplitVector;

  return TargetLoweringBase::getPreferredVectorAction(VT);
}
|
2013-06-25 10:39:35 +08:00
|
|
|
|
2014-04-01 03:54:27 +08:00
|
|
|
// Always prefer turning a constant-pool load of \p Imm (of type \p Ty) into
// an integer immediate materialization.
bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                         Type *Ty) const {
  // FIXME: Could be smarter if called for vector constants.
  return true;
}
|
|
|
|
|
2016-01-20 08:13:22 +08:00
|
|
|
bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
|
|
|
|
|
2016-09-29 04:05:39 +08:00
|
|
|
// i16 is not desirable unless it is a load or a store.
|
|
|
|
if (VT == MVT::i16 && Op != ISD::LOAD && Op != ISD::STORE)
|
|
|
|
return false;
|
|
|
|
|
2016-01-20 08:13:22 +08:00
|
|
|
// SimplifySetCC uses this function to determine whether or not it should
|
|
|
|
// create setcc with i1 operands. We don't have instructions for i1 setcc.
|
|
|
|
if (VT == MVT::i1 && Op == ISD::SETCC)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
return TargetLowering::isTypeDesirableForOp(Op, VT);
|
|
|
|
}
|
|
|
|
|
2016-06-22 04:46:20 +08:00
|
|
|
// Build a pointer into the kernarg segment: preloaded kernarg base + Offset.
SDValue SITargetLowering::LowerParameterPtr(SelectionDAG &DAG,
                                            const SDLoc &SL, SDValue Chain,
                                            unsigned Offset) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();

  MVT PtrVT = getPointerTy(DAG.getDataLayout(), AMDGPUAS::CONSTANT_ADDRESS);

  // The kernarg segment pointer arrives in a preloaded SGPR whose live-in
  // virtual register we copy from.
  unsigned KernArgReg =
      TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
  SDValue Base = DAG.getCopyFromReg(Chain, SL,
                                    MRI.getLiveInVirtReg(KernArgReg), PtrVT);

  SDValue Off = DAG.getConstant(Offset, SL, PtrVT);
  return DAG.getNode(ISD::ADD, SL, PtrVT, Base, Off);
}
|
2016-11-13 15:01:11 +08:00
|
|
|
|
2016-06-22 04:46:20 +08:00
|
|
|
// Load one kernel argument of in-memory type \p MemVT from the kernarg
// segment at \p Offset and extend/truncate it to \p VT. Returns the merged
// {value, chain} pair.
SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
                                         const SDLoc &SL, SDValue Chain,
                                         unsigned Offset, bool Signed) const {
  Type *ArgTy = MemVT.getTypeForEVT(*DAG.getContext());
  PointerType *ArgPtrTy = PointerType::get(ArgTy, AMDGPUAS::CONSTANT_ADDRESS);
  MachinePointerInfo PtrInfo(UndefValue::get(ArgPtrTy));

  unsigned Align = DAG.getDataLayout().getABITypeAlignment(ArgTy);

  SDValue Ptr = LowerParameterPtr(DAG, SL, Chain, Offset);

  // Kernel arguments are never written, so the load can be marked
  // dereferenceable and invariant.
  SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
                             MachineMemOperand::MONonTemporal |
                                 MachineMemOperand::MODereferenceable |
                                 MachineMemOperand::MOInvariant);

  // Convert the loaded value to the requested type.
  SDValue Converted;
  if (MemVT.isFloatingPoint()) {
    Converted = getFPExtOrFPTrunc(DAG, Load, SL, VT);
  } else {
    Converted = Signed ? DAG.getSExtOrTrunc(Load, SL, VT)
                       : DAG.getZExtOrTrunc(Load, SL, VT);
  }

  SDValue Parts[] = {
    Converted,
    Load.getValue(1)
  };

  return DAG.getMergeValues(Parts, SL);
}
|
|
|
|
|
2013-03-07 17:03:52 +08:00
|
|
|
/// \brief Lower incoming formal arguments for SI.
///
/// Shader calling conventions receive arguments in registers (vectors are
/// split into scalar pieces); compute conventions load kernel arguments from
/// the kernarg segment. This also registers the preloaded user SGPRs
/// (private segment buffer, dispatch/queue/kernarg pointers, dispatch id,
/// flat scratch init), the system SGPRs (workgroup IDs/info, scratch wave
/// offset), the workitem ID VGPRs, and selects the scratch rsrc/wave-offset
/// registers.
SDValue SITargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();

  MachineFunction &MF = DAG.getMachineFunction();
  FunctionType *FType = MF.getFunction()->getFunctionType();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();

  // Graphics shaders are not supported on the HSA OS; diagnose and bail out
  // with just the entry node.
  if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
    const Function *Fn = MF.getFunction();
    DiagnosticInfoUnsupported NoGraphicsHSA(
        *Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
    DAG.getContext()->diagnose(NoGraphicsHSA);
    return DAG.getEntryNode();
  }

  // Create stack objects that are used for emitting debugger prologue if
  // "amdgpu-debugger-emit-prologue" attribute was specified.
  if (ST.debuggerEmitPrologue())
    createDebuggerPrologueStackObjects(MF);

  // Splits holds the (possibly element-wise split) shader arguments;
  // Skipped marks unused PS inputs we can drop entirely.
  SmallVector<ISD::InputArg, 16> Splits;
  BitVector Skipped(Ins.size());

  for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) {
    const ISD::InputArg &Arg = Ins[i];

    // First check if it's a PS input addr
    if (CallConv == CallingConv::AMDGPU_PS && !Arg.Flags.isInReg() &&
        !Arg.Flags.isByVal() && PSInputNum <= 15) {

      if (!Arg.Used && !Info->isPSInputAllocated(PSInputNum)) {
        // We can safely skip PS inputs
        Skipped.set(i);
        ++PSInputNum;
        continue;
      }

      Info->markPSInputAllocated(PSInputNum);
      if (Arg.Used)
        Info->PSInputEna |= 1 << PSInputNum;

      ++PSInputNum;
    }

    if (AMDGPU::isShader(CallConv)) {
      // Second split vertices into their elements
      if (Arg.VT.isVector()) {
        ISD::InputArg NewArg = Arg;
        NewArg.Flags.setSplit();
        NewArg.VT = Arg.VT.getVectorElementType();

        // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
        // three or five element vertex only needs three or five registers,
        // NOT four or eight.
        Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
        unsigned NumElements = ParamType->getVectorNumElements();

        for (unsigned j = 0; j != NumElements; ++j) {
          Splits.push_back(NewArg);
          NewArg.PartOffset += NewArg.VT.getStoreSize();
        }
      } else {
        Splits.push_back(Arg);
      }
    }
  }

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
                 *DAG.getContext());

  // At least one interpolation mode must be enabled or else the GPU will hang.
  //
  // Check PSInputAddr instead of PSInputEna. The idea is that if the user set
  // PSInputAddr, the user wants to enable some bits after the compilation
  // based on run-time states. Since we can't know what the final PSInputEna
  // will look like, so we shouldn't do anything here and the user should take
  // responsibility for the correct programming.
  //
  // Otherwise, the following restrictions apply:
  // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
  // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
  //   enabled too.
  if (CallConv == CallingConv::AMDGPU_PS &&
      ((Info->getPSInputAddr() & 0x7F) == 0 ||
       ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11)))) {
    CCInfo.AllocateReg(AMDGPU::VGPR0);
    CCInfo.AllocateReg(AMDGPU::VGPR1);
    Info->markPSInputAllocated(0);
    Info->PSInputEna |= 1;
  }

  // Sanity-check which special inputs the two kinds of calling conventions
  // are allowed to request.
  if (!AMDGPU::isShader(CallConv)) {
    assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
  } else {
    assert(!Info->hasPrivateSegmentBuffer() && !Info->hasDispatchPtr() &&
           !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
           !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
           !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
           !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
           !Info->hasWorkItemIDZ());
  }

  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
  if (Info->hasPrivateSegmentBuffer()) {
    unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI);
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SReg_128RegClass);
    CCInfo.AllocateReg(PrivateSegmentBufferReg);
  }

  if (Info->hasDispatchPtr()) {
    unsigned DispatchPtrReg = Info->addDispatchPtr(*TRI);
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SReg_64RegClass);
    CCInfo.AllocateReg(DispatchPtrReg);
  }

  if (Info->hasQueuePtr()) {
    unsigned QueuePtrReg = Info->addQueuePtr(*TRI);
    MF.addLiveIn(QueuePtrReg, &AMDGPU::SReg_64RegClass);
    CCInfo.AllocateReg(QueuePtrReg);
  }

  if (Info->hasKernargSegmentPtr()) {
    unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI);
    MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass);
    CCInfo.AllocateReg(InputPtrReg);
  }

  if (Info->hasDispatchID()) {
    unsigned DispatchIDReg = Info->addDispatchID(*TRI);
    MF.addLiveIn(DispatchIDReg, &AMDGPU::SReg_64RegClass);
    CCInfo.AllocateReg(DispatchIDReg);
  }

  if (Info->hasFlatScratchInit()) {
    unsigned FlatScratchInitReg = Info->addFlatScratchInit(*TRI);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SReg_64RegClass);
    CCInfo.AllocateReg(FlatScratchInitReg);
  }

  // Compute conventions analyze the original argument list; shaders analyze
  // the element-wise split list built above.
  if (!AMDGPU::isShader(CallConv))
    analyzeFormalArgumentsCompute(CCInfo, Ins);
  else
    AnalyzeFormalArguments(CCInfo, Splits);

  SmallVector<SDValue, 16> Chains;

  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {

    const ISD::InputArg &Arg = Ins[i];
    if (Skipped[i]) {
      InVals.push_back(DAG.getUNDEF(Arg.VT));
      continue;
    }

    CCValAssign &VA = ArgLocs[ArgIdx++];
    MVT VT = VA.getLocVT();

    if (VA.isMemLoc()) {
      VT = Ins[i].VT;
      EVT MemVT = VA.getLocVT();
      const unsigned Offset = Subtarget->getExplicitKernelArgOffset() +
                              VA.getLocMemOffset();
      // The first 36 bytes of the input buffer contains information about
      // thread group and global sizes.
      SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, Chain,
                                   Offset, Ins[i].Flags.isSExt());
      Chains.push_back(Arg.getValue(1));

      auto *ParamTy =
        dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
      if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
          ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
        // On SI local pointers are just offsets into LDS, so they are always
        // less than 16-bits. On CI and newer they could potentially be
        // real pointers, so we can't guarantee their size.
        Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
                          DAG.getValueType(MVT::i16));
      }

      InVals.push_back(Arg);
      Info->setABIArgOffset(Offset + MemVT.getStoreSize());
      continue;
    }
    assert(VA.isRegLoc() && "Parameter must be in a register!");

    unsigned Reg = VA.getLocReg();

    if (VT == MVT::i64) {
      // For now assume it is a pointer
      Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0,
                                     &AMDGPU::SReg_64RegClass);
      Reg = MF.addLiveIn(Reg, &AMDGPU::SReg_64RegClass);
      SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
      InVals.push_back(Copy);
      continue;
    }

    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);

    Reg = MF.addLiveIn(Reg, RC);
    SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);

    if (Arg.VT.isVector()) {

      // Build a vector from the registers
      Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
      unsigned NumElements = ParamType->getVectorNumElements();

      SmallVector<SDValue, 4> Regs;
      Regs.push_back(Val);
      for (unsigned j = 1; j != NumElements; ++j) {
        Reg = ArgLocs[ArgIdx++].getLocReg();
        Reg = MF.addLiveIn(Reg, RC);

        SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
        Regs.push_back(Copy);
      }

      // Fill up the missing vector elements
      NumElements = Arg.VT.getVectorNumElements() - NumElements;
      Regs.append(NumElements, DAG.getUNDEF(VT));

      InVals.push_back(DAG.getBuildVector(Arg.VT, DL, Regs));
      continue;
    }

    InVals.push_back(Val);
  }

  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
  // these from the dispatch pointer.

  // Start adding system SGPRs.
  if (Info->hasWorkGroupIDX()) {
    unsigned Reg = Info->addWorkGroupIDX();
    MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info->hasWorkGroupIDY()) {
    unsigned Reg = Info->addWorkGroupIDY();
    MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info->hasWorkGroupIDZ()) {
    unsigned Reg = Info->addWorkGroupIDZ();
    MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info->hasWorkGroupInfo()) {
    unsigned Reg = Info->addWorkGroupInfo();
    MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info->hasPrivateSegmentWaveByteOffset()) {
    // Scratch wave offset passed in system SGPR.
    unsigned PrivateSegmentWaveByteOffsetReg;

    if (AMDGPU::isShader(CallConv)) {
      PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
      Info->setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
    } else
      PrivateSegmentWaveByteOffsetReg = Info->addPrivateSegmentWaveByteOffset();

    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
  }

  // Now that we've figured out where the scratch register inputs are, see if
  // should reserve the arguments and use them directly.
  bool HasStackObjects = MF.getFrameInfo().hasStackObjects();
  // Record that we know we have non-spill stack objects so we don't need to
  // check all stack objects later.
  if (HasStackObjects)
    Info->setHasNonSpillStackObjects(true);

  // Everything live out of a block is spilled with fast regalloc, so it's
  // almost certain that spilling will be required.
  if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
    HasStackObjects = true;

  if (ST.isAmdCodeObjectV2()) {
    if (HasStackObjects) {
      // If we have stack objects, we unquestionably need the private buffer
      // resource. For the Code Object V2 ABI, this will be the first 4 user
      // SGPR inputs. We can reserve those and use them directly.

      unsigned PrivateSegmentBufferReg = TRI->getPreloadedValue(
        MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
      Info->setScratchRSrcReg(PrivateSegmentBufferReg);

      unsigned PrivateSegmentWaveByteOffsetReg = TRI->getPreloadedValue(
        MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
      Info->setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
    } else {
      unsigned ReservedBufferReg
        = TRI->reservedPrivateSegmentBufferReg(MF);
      unsigned ReservedOffsetReg
        = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);

      // We tentatively reserve the last registers (skipping the last two
      // which may contain VCC). After register allocation, we'll replace
      // these with the ones immediately after those which were really
      // allocated. In the prologue copies will be inserted from the argument
      // to these reserved registers.
      Info->setScratchRSrcReg(ReservedBufferReg);
      Info->setScratchWaveOffsetReg(ReservedOffsetReg);
    }
  } else {
    unsigned ReservedBufferReg = TRI->reservedPrivateSegmentBufferReg(MF);

    // Without HSA, relocations are used for the scratch pointer and the
    // buffer resource setup is always inserted in the prologue. Scratch wave
    // offset is still in an input SGPR.
    Info->setScratchRSrcReg(ReservedBufferReg);

    if (HasStackObjects) {
      unsigned ScratchWaveOffsetReg = TRI->getPreloadedValue(
        MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
      Info->setScratchWaveOffsetReg(ScratchWaveOffsetReg);
    } else {
      unsigned ReservedOffsetReg
        = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
      Info->setScratchWaveOffsetReg(ReservedOffsetReg);
    }
  }

  // Workitem IDs arrive in VGPRs.
  if (Info->hasWorkItemIDX()) {
    unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X);
    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info->hasWorkItemIDY()) {
    unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y);
    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info->hasWorkItemIDZ()) {
    unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z);
    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Chains.empty())
    return Chain;

  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
}
|
|
|
|
|
2016-06-12 23:39:02 +08:00
|
|
|
/// \brief Lower outgoing return values.
///
/// Compute calling conventions are delegated to the AMDGPU base class.
/// Shader returns split vector values into scalar pieces, copy each piece
/// into its assigned physical register, and terminate with ENDPGM (void
/// return) or RETURN.
SDValue
SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                              bool isVarArg,
                              const SmallVectorImpl<ISD::OutputArg> &Outs,
                              const SmallVectorImpl<SDValue> &OutVals,
                              const SDLoc &DL, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  if (!AMDGPU::isShader(CallConv))
    return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
                                             OutVals, DL, DAG);

  // Remembered so the terminator opcode can be chosen below.
  Info->setIfReturnsVoid(Outs.size() == 0);

  SmallVector<ISD::OutputArg, 48> Splits;
  SmallVector<SDValue, 48> SplitVals;

  // Split vectors into their elements.
  for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
    const ISD::OutputArg &Out = Outs[i];

    if (Out.VT.isVector()) {
      MVT VT = Out.VT.getVectorElementType();
      ISD::OutputArg NewOut = Out;
      NewOut.Flags.setSplit();
      NewOut.VT = VT;

      // We want the original number of vector elements here, e.g.
      // three or five, not four or eight.
      unsigned NumElements = Out.ArgVT.getVectorNumElements();

      for (unsigned j = 0; j != NumElements; ++j) {
        SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, OutVals[i],
                                   DAG.getConstant(j, DL, MVT::i32));
        SplitVals.push_back(Elem);
        Splits.push_back(NewOut);
        NewOut.PartOffset += NewOut.VT.getStoreSize();
      }
    } else {
      SplitVals.push_back(OutVals[i]);
      Splits.push_back(Out);
    }
  }

  // CCValAssign - represent the assignment of the return value to a location.
  SmallVector<CCValAssign, 48> RVLocs;

  // CCState - Info about the registers and stack slots.
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());

  // Analyze outgoing return values.
  AnalyzeReturn(CCInfo, Splits);

  SDValue Flag;
  SmallVector<SDValue, 48> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)

  // Copy the result values into the output registers.
  for (unsigned i = 0, realRVLocIdx = 0;
       i != RVLocs.size();
       ++i, ++realRVLocIdx) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");

    SDValue Arg = SplitVals[realRVLocIdx];

    // Copied from other backends.
    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full:
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
      break;
    }

    // Glue the copies together so they are emitted back-to-back.
    Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
    Flag = Chain.getValue(1);
    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
  }

  // Update chain and glue.
  RetOps[0] = Chain;
  if (Flag.getNode())
    RetOps.push_back(Flag);

  unsigned Opc = Info->returnsVoid() ? AMDGPUISD::ENDPGM : AMDGPUISD::RETURN;
  return DAG.getNode(Opc, DL, MVT::Other, RetOps);
}
|
|
|
|
|
2016-01-26 12:29:24 +08:00
|
|
|
/// Resolve a named register (as used by llvm.read_register and friends) to a
/// physical register. Reports a fatal error for unknown names, for registers
/// unavailable on this subtarget, and for a requested type whose width does
/// not match the register.
unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
                                             SelectionDAG &DAG) const {
  unsigned Reg = StringSwitch<unsigned>(RegName)
    .Case("m0", AMDGPU::M0)
    .Case("exec", AMDGPU::EXEC)
    .Case("exec_lo", AMDGPU::EXEC_LO)
    .Case("exec_hi", AMDGPU::EXEC_HI)
    .Case("flat_scratch", AMDGPU::FLAT_SCR)
    .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
    .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
    .Default(AMDGPU::NoRegister);

  if (Reg == AMDGPU::NoRegister) {
    report_fatal_error(Twine("invalid register name \""
                             + StringRef(RegName) + "\"."));

  }

  // flat_scratch (and its halves) do not exist on Southern Islands.
  if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
      Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
    report_fatal_error(Twine("invalid register \""
                             + StringRef(RegName) + "\" for subtarget."));
  }

  // Check that the requested type width matches the register width:
  // 32 bits for the scalar/half registers, 64 bits for the full pairs.
  switch (Reg) {
  case AMDGPU::M0:
  case AMDGPU::EXEC_LO:
  case AMDGPU::EXEC_HI:
  case AMDGPU::FLAT_SCR_LO:
  case AMDGPU::FLAT_SCR_HI:
    if (VT.getSizeInBits() == 32)
      return Reg;
    break;
  case AMDGPU::EXEC:
  case AMDGPU::FLAT_SCR:
    if (VT.getSizeInBits() == 64)
      return Reg;
    break;
  default:
    llvm_unreachable("missing register type checking");
  }

  report_fatal_error(Twine("invalid type for register \""
                           + StringRef(RegName) + "\"."));
}
|
|
|
|
|
2016-07-13 05:41:32 +08:00
|
|
|
// If kill is not the last instruction, split the block so kill is always a
// proper terminator.
MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
                                                    MachineBasicBlock *BB) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

  // Everything after MI has to move into the new block.
  MachineBasicBlock::iterator SplitPoint(&MI);
  ++SplitPoint;

  if (SplitPoint == BB->end()) {
    // Don't bother with a new block.
    MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR));
    return BB;
  }

  MachineFunction *MF = BB->getParent();
  MachineBasicBlock *SplitBB
    = MF->CreateMachineBasicBlock(BB->getBasicBlock());

  // Place the new block immediately after BB and move the trailing
  // instructions into it.
  MF->insert(++MachineFunction::iterator(BB), SplitBB);
  SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end());

  // The new block inherits BB's successors (and PHI references); BB now
  // falls through to it.
  SplitBB->transferSuccessorsAndUpdatePHIs(BB);
  BB->addSuccessor(SplitBB);

  MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR));
  return SplitBB;
}
|
|
|
|
|
2016-07-19 08:35:03 +08:00
|
|
|
// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
|
|
|
|
// wavefront. If the value is uniform and just happens to be in a VGPR, this
|
|
|
|
// will only do one iteration. In the worst case, this will loop 64 times.
|
|
|
|
//
|
|
|
|
// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
|
2016-10-04 09:41:05 +08:00
|
|
|
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
|
|
|
|
const SIInstrInfo *TII,
|
|
|
|
MachineRegisterInfo &MRI,
|
|
|
|
MachineBasicBlock &OrigBB,
|
|
|
|
MachineBasicBlock &LoopBB,
|
|
|
|
const DebugLoc &DL,
|
|
|
|
const MachineOperand &IdxReg,
|
|
|
|
unsigned InitReg,
|
|
|
|
unsigned ResultReg,
|
|
|
|
unsigned PhiReg,
|
|
|
|
unsigned InitSaveExecReg,
|
2016-10-13 02:49:05 +08:00
|
|
|
int Offset,
|
|
|
|
bool UseGPRIdxMode) {
|
2016-07-19 08:35:03 +08:00
|
|
|
MachineBasicBlock::iterator I = LoopBB.begin();
|
|
|
|
|
|
|
|
unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
|
|
|
|
unsigned NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
|
|
|
|
unsigned CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
|
|
|
|
unsigned CondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
|
|
|
|
|
|
|
|
BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
|
|
|
|
.addReg(InitReg)
|
|
|
|
.addMBB(&OrigBB)
|
|
|
|
.addReg(ResultReg)
|
|
|
|
.addMBB(&LoopBB);
|
|
|
|
|
|
|
|
BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
|
|
|
|
.addReg(InitSaveExecReg)
|
|
|
|
.addMBB(&OrigBB)
|
|
|
|
.addReg(NewExec)
|
|
|
|
.addMBB(&LoopBB);
|
|
|
|
|
|
|
|
// Read the next variant <- also loop target.
|
|
|
|
BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
|
|
|
|
.addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));
|
|
|
|
|
|
|
|
// Compare the just read M0 value to all possible Idx values.
|
|
|
|
BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
|
|
|
|
.addReg(CurrentIdxReg)
|
2016-07-21 17:40:57 +08:00
|
|
|
.addReg(IdxReg.getReg(), 0, IdxReg.getSubReg());
|
2016-07-19 08:35:03 +08:00
|
|
|
|
2016-10-13 02:49:05 +08:00
|
|
|
if (UseGPRIdxMode) {
|
|
|
|
unsigned IdxReg;
|
|
|
|
if (Offset == 0) {
|
|
|
|
IdxReg = CurrentIdxReg;
|
|
|
|
} else {
|
|
|
|
IdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
|
|
|
|
BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), IdxReg)
|
|
|
|
.addReg(CurrentIdxReg, RegState::Kill)
|
|
|
|
.addImm(Offset);
|
|
|
|
}
|
|
|
|
|
|
|
|
MachineInstr *SetIdx =
|
|
|
|
BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_IDX))
|
|
|
|
.addReg(IdxReg, RegState::Kill);
|
2016-10-13 20:45:16 +08:00
|
|
|
SetIdx->getOperand(2).setIsUndef();
|
2016-07-19 08:35:03 +08:00
|
|
|
} else {
|
2016-10-13 02:49:05 +08:00
|
|
|
// Move index from VCC into M0
|
|
|
|
if (Offset == 0) {
|
|
|
|
BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
|
|
|
|
.addReg(CurrentIdxReg, RegState::Kill);
|
|
|
|
} else {
|
|
|
|
BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
|
|
|
|
.addReg(CurrentIdxReg, RegState::Kill)
|
|
|
|
.addImm(Offset);
|
|
|
|
}
|
2016-07-19 08:35:03 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// Update EXEC, save the original EXEC value to VCC.
|
|
|
|
BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
|
|
|
|
.addReg(CondReg, RegState::Kill);
|
|
|
|
|
|
|
|
MRI.setSimpleHint(NewExec, CondReg);
|
|
|
|
|
|
|
|
// Update EXEC, switch all done bits to 0 and all todo bits to 1.
|
2016-10-04 09:41:05 +08:00
|
|
|
MachineInstr *InsertPt =
|
|
|
|
BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
|
2016-07-19 08:35:03 +08:00
|
|
|
.addReg(AMDGPU::EXEC)
|
|
|
|
.addReg(NewExec);
|
|
|
|
|
|
|
|
// XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
|
|
|
|
// s_cbranch_scc0?
|
|
|
|
|
|
|
|
// Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
|
|
|
|
BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
|
|
|
|
.addMBB(&LoopBB);
|
2016-10-04 09:41:05 +08:00
|
|
|
|
|
|
|
return InsertPt->getIterator();
|
2016-07-19 08:35:03 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// This has slightly sub-optimal regalloc when the source vector is killed by
// the read. The register allocator does not understand that the kill is
// per-workitem, so is kept alive for the whole loop so we end up not re-using a
// subregister from it, using 1 more VGPR than necessary. This was saved when
// this was expanded after register allocation.
//
// Expands an indirect-access pseudo whose index register is a VGPR: splits
// MBB around MI, inserts a self-looping block (filled in by
// emitLoadM0FromVGPRLoop) that iterates over the distinct index values in the
// wave, and restores EXEC in the fall-through block. Returns the insert point
// inside the loop block at which the caller must emit the actual indexed
// access instruction.
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
                                                  MachineBasicBlock &MBB,
                                                  MachineInstr &MI,
                                                  unsigned InitResultReg,
                                                  unsigned PhiReg,
                                                  int Offset,
                                                  bool UseGPRIdxMode) {
  MachineFunction *MF = MBB.getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock::iterator I(&MI);

  unsigned DstReg = MI.getOperand(0).getReg();
  unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
  unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);

  // TmpExec seeds the loop's EXEC phi; its entry value is never read as data.
  BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);

  // Save the EXEC mask
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), SaveExec)
    .addReg(AMDGPU::EXEC);

  // To insert the loop we need to split the block. Move everything after this
  // point to a new block, and insert a new empty block between the two.
  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;

  MF->insert(MBBI, LoopBB);
  MF->insert(MBBI, RemainderBB);

  // The loop either branches back to itself (more index values to cover) or
  // falls through to the remainder block.
  LoopBB->addSuccessor(LoopBB);
  LoopBB->addSuccessor(RemainderBB);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());

  MBB.addSuccessor(LoopBB);

  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);

  auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
                                      InitResultReg, DstReg, PhiReg, TmpExec,
                                      Offset, UseGPRIdxMode);

  // Restore the original EXEC mask once the loop has covered every lane.
  MachineBasicBlock::iterator First = RemainderBB->begin();
  BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
    .addReg(SaveExec);

  return InsPt;
}
|
|
|
|
|
|
|
|
// Returns subreg index, offset
|
|
|
|
static std::pair<unsigned, int>
|
|
|
|
computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
|
|
|
|
const TargetRegisterClass *SuperRC,
|
|
|
|
unsigned VecReg,
|
|
|
|
int Offset) {
|
|
|
|
int NumElts = SuperRC->getSize() / 4;
|
|
|
|
|
|
|
|
// Skip out of bounds offsets, or else we would end up using an undefined
|
|
|
|
// register.
|
|
|
|
if (Offset >= NumElts || Offset < 0)
|
|
|
|
return std::make_pair(AMDGPU::sub0, Offset);
|
|
|
|
|
|
|
|
return std::make_pair(AMDGPU::sub0 + Offset, 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Return true if the index is an SGPR and was set.
//
// If MI's index operand is uniform (an SGPR), emit the indexing setup inline
// before MI — either S_SET_GPR_IDX_ON (GPR-index mode) or a write to M0 — and
// return true. Returns false when the index is a VGPR, in which case the
// caller must build a waterfall loop instead.
static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII,
                                 MachineRegisterInfo &MRI,
                                 MachineInstr &MI,
                                 int Offset,
                                 bool UseGPRIdxMode,
                                 bool IsIndirectSrc) {
  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock::iterator I(&MI);

  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
  const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());

  assert(Idx->getReg() != AMDGPU::NoRegister);

  // A VGPR index cannot be handled here; the caller emits control flow.
  if (!TII->getRegisterInfo().isSGPRClass(IdxRC))
    return false;

  if (UseGPRIdxMode) {
    // Choose whether the indexed register is read (src) or written (dst).
    unsigned IdxMode = IsIndirectSrc ?
      VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE;
    if (Offset == 0) {
      MachineInstr *SetOn =
          BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
          .addOperand(*Idx)
          .addImm(IdxMode);

      // Operand 2 (presumably the implicit M0 use on S_SET_GPR_IDX_ON) is
      // marked undef so no prior M0 value is required — matches the handling
      // of S_SET_GPR_IDX_IDX elsewhere in this file; confirm operand index.
      SetOn->getOperand(3).setIsUndef();
    } else {
      // Fold the constant offset into the index with a scalar add first.
      unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
        .addOperand(*Idx)
        .addImm(Offset);
      MachineInstr *SetOn =
          BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
          .addReg(Tmp, RegState::Kill)
          .addImm(IdxMode);

      SetOn->getOperand(3).setIsUndef();
    }

    return true;
  }

  // Legacy path: movrel indexing reads the index from M0.
  if (Offset == 0) {
    BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
      .addOperand(*Idx);
  } else {
    BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
      .addOperand(*Idx)
      .addImm(Offset);
  }

  return true;
}
|
|
|
|
|
|
|
|
// Control flow needs to be inserted if indexing with a VGPR.
//
// Expands an SI_INDIRECT_SRC_* pseudo: reads one 32-bit element of a vector
// register at a dynamic index. Uniform (SGPR) indices are handled inline;
// divergent (VGPR) indices require the loadM0FromVGPR waterfall loop.
static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
                                          MachineBasicBlock &MBB,
                                          const SISubtarget &ST) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  MachineFunction *MF = MBB.getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();

  const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);

  // Fold the constant offset into the subregister index if it is in bounds.
  unsigned SubReg;
  std::tie(SubReg, Offset)
    = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);

  bool UseGPRIdxMode = ST.hasVGPRIndexMode() && EnableVGPRIndexMode;

  // Fast path: the index is uniform and has already been written to
  // M0 / the GPR-index register by setM0ToIndexFromSGPR.
  if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, true)) {
    MachineBasicBlock::iterator I(&MI);
    const DebugLoc &DL = MI.getDebugLoc();

    if (UseGPRIdxMode) {
      // TODO: Look at the uses to avoid the copy. This may require rescheduling
      // to avoid interfering with other uses, so probably requires a new
      // optimization pass.
      BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
        .addReg(SrcReg, RegState::Undef, SubReg)
        .addReg(SrcReg, RegState::Implicit)
        .addReg(AMDGPU::M0, RegState::Implicit);
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
    } else {
      // movrels reads the subregister selected by M0.
      BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
        .addReg(SrcReg, RegState::Undef, SubReg)
        .addReg(SrcReg, RegState::Implicit);
    }

    MI.eraseFromParent();

    return &MBB;
  }

  // Slow path: divergent index — build the waterfall loop.
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock::iterator I(&MI);

  unsigned PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  unsigned InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  // InitReg seeds the loop's result phi; no defined value is needed on entry.
  BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);

  if (UseGPRIdxMode) {
    MachineInstr *SetOn = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
      .addImm(0) // Reset inside loop.
      .addImm(VGPRIndexMode::SRC0_ENABLE);
    SetOn->getOperand(3).setIsUndef();

    // Disable again after the loop.
    BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
  }

  auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset, UseGPRIdxMode);
  MachineBasicBlock *LoopBB = InsPt->getParent();

  // Emit the per-iteration indexed read inside the loop block.
  if (UseGPRIdxMode) {
    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
      .addReg(SrcReg, RegState::Undef, SubReg)
      .addReg(SrcReg, RegState::Implicit)
      .addReg(AMDGPU::M0, RegState::Implicit);
  } else {
    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
      .addReg(SrcReg, RegState::Undef, SubReg)
      .addReg(SrcReg, RegState::Implicit);
  }

  MI.eraseFromParent();

  return LoopBB;
}
|
|
|
|
|
AMDGPU: Fix Two Address problems with v_movreld
Summary:
The v_movreld machine instruction is used with three operands that are
in a sense tied to each other (the explicit VGPR_32 def and the implicit
VGPR_NN def and use). There is no way to express that using the currently
available operand bits, and indeed there are cases where the Two Address
instructions pass does the wrong thing.
This patch introduces a new set of pseudo instructions that are identical
in intended semantics as v_movreld, but they only have two tied operands.
Having to add a new set of pseudo instructions is admittedly annoying, but
it's a fairly straightforward and solid approach. The only alternative I
see is to try to teach the Two Address instructions pass about Three Address
instructions, and I'm afraid that's trickier and is going to end up more
fragile.
Note that v_movrels does not suffer from this problem, and so this patch
does not touch it.
This fixes several GL45-CTS.shaders.indexing.* tests.
Reviewers: tstellarAMD, arsenm
Subscribers: kzhuravl, wdng, yaxunl, llvm-commits, tony-tye
Differential Revision: https://reviews.llvm.org/D25633
llvm-svn: 284980
2016-10-24 22:56:02 +08:00
|
|
|
static unsigned getMOVRELDPseudo(const TargetRegisterClass *VecRC) {
|
|
|
|
switch (VecRC->getSize()) {
|
|
|
|
case 4:
|
|
|
|
return AMDGPU::V_MOVRELD_B32_V1;
|
|
|
|
case 8:
|
|
|
|
return AMDGPU::V_MOVRELD_B32_V2;
|
|
|
|
case 16:
|
|
|
|
return AMDGPU::V_MOVRELD_B32_V4;
|
|
|
|
case 32:
|
|
|
|
return AMDGPU::V_MOVRELD_B32_V8;
|
|
|
|
case 64:
|
|
|
|
return AMDGPU::V_MOVRELD_B32_V16;
|
|
|
|
default:
|
|
|
|
llvm_unreachable("unsupported size for MOVRELD pseudos");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-07-19 08:35:03 +08:00
|
|
|
// Expands an SI_INDIRECT_DST_* pseudo: writes one 32-bit element of a vector
// register at a dynamic index. Handles three cases: no index register at all
// (plain INSERT_SUBREG), a uniform SGPR index (inline movreld / GPR-index
// write), and a divergent VGPR index (waterfall loop via loadM0FromVGPR).
static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
                                          MachineBasicBlock &MBB,
                                          const SISubtarget &ST) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  MachineFunction *MF = MBB.getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  unsigned Dst = MI.getOperand(0).getReg();
  const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
  const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
  const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());

  // This can be an immediate, but will be folded later.
  assert(Val->getReg());

  // Fold the constant offset into the subregister index if it is in bounds.
  unsigned SubReg;
  std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
                                                         SrcVec->getReg(),
                                                         Offset);
  bool UseGPRIdxMode = ST.hasVGPRIndexMode() && EnableVGPRIndexMode;

  // No dynamic index at all: the access is a static subregister insert.
  if (Idx->getReg() == AMDGPU::NoRegister) {
    MachineBasicBlock::iterator I(&MI);
    const DebugLoc &DL = MI.getDebugLoc();

    assert(Offset == 0);

    BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
      .addOperand(*SrcVec)
      .addOperand(*Val)
      .addImm(SubReg);

    MI.eraseFromParent();
    return &MBB;
  }

  // Uniform SGPR index: the setup was emitted inline, write directly.
  if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, false)) {
    MachineBasicBlock::iterator I(&MI);
    const DebugLoc &DL = MI.getDebugLoc();

    if (UseGPRIdxMode) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
        .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst
        .addOperand(*Val)
        .addReg(Dst, RegState::ImplicitDefine)
        .addReg(SrcVec->getReg(), RegState::Implicit)
        .addReg(AMDGPU::M0, RegState::Implicit);

      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
    } else {
      // Use the width-matched movreld pseudo so operand tying is correct.
      const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(VecRC));

      BuildMI(MBB, I, DL, MovRelDesc)
          .addReg(Dst, RegState::Define)
          .addReg(SrcVec->getReg())
          .addOperand(*Val)
          .addImm(SubReg - AMDGPU::sub0);
    }

    MI.eraseFromParent();
    return &MBB;
  }

  // Divergent VGPR index: the value register is re-read on every loop
  // iteration, so it must not carry a kill flag.
  if (Val->isReg())
    MRI.clearKillFlags(Val->getReg());

  const DebugLoc &DL = MI.getDebugLoc();

  if (UseGPRIdxMode) {
    MachineBasicBlock::iterator I(&MI);

    MachineInstr *SetOn = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
      .addImm(0) // Reset inside loop.
      .addImm(VGPRIndexMode::DST_ENABLE);
    SetOn->getOperand(3).setIsUndef();

    // Disable again after the loop.
    BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
  }

  // Carries the partially-updated vector across loop iterations.
  unsigned PhiReg = MRI.createVirtualRegister(VecRC);

  auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg,
                              Offset, UseGPRIdxMode);
  MachineBasicBlock *LoopBB = InsPt->getParent();

  // Emit the per-iteration indexed write inside the loop block.
  if (UseGPRIdxMode) {
    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
      .addReg(PhiReg, RegState::Undef, SubReg) // vdst
      .addOperand(*Val) // src0
      .addReg(Dst, RegState::ImplicitDefine)
      .addReg(PhiReg, RegState::Implicit)
      .addReg(AMDGPU::M0, RegState::Implicit);
  } else {
    const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(VecRC));

    BuildMI(*LoopBB, InsPt, DL, MovRelDesc)
        .addReg(Dst, RegState::Define)
        .addReg(PhiReg)
        .addOperand(*Val)
        .addImm(SubReg - AMDGPU::sub0);
  }

  MI.eraseFromParent();

  return LoopBB;
}
|
|
|
|
|
2016-07-13 05:41:32 +08:00
|
|
|
/// Expand pseudo instructions that were marked for custom insertion during
/// instruction selection. Returns the block in which subsequent instructions
/// should be inserted (expansion may split or create blocks).
MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
  MachineInstr &MI, MachineBasicBlock *BB) const {
  switch (MI.getOpcode()) {
  case AMDGPU::SI_INIT_M0: {
    // Copy the pseudo's operand into M0 with a scalar move.
    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
            TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
      .addOperand(MI.getOperand(0));
    MI.eraseFromParent();
    return BB;
  }
  case AMDGPU::GET_GROUPSTATICSIZE: {
    // Materialize the function's LDS size as an immediate move.
    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

    MachineFunction *MF = BB->getParent();
    SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
    DebugLoc DL = MI.getDebugLoc();
    BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
      .addOperand(MI.getOperand(0))
      .addImm(MFI->getLDSSize());
    MI.eraseFromParent();
    return BB;
  }
  // Dynamic vector-element reads; may insert control flow.
  case AMDGPU::SI_INDIRECT_SRC_V1:
  case AMDGPU::SI_INDIRECT_SRC_V2:
  case AMDGPU::SI_INDIRECT_SRC_V4:
  case AMDGPU::SI_INDIRECT_SRC_V8:
  case AMDGPU::SI_INDIRECT_SRC_V16:
    return emitIndirectSrc(MI, *BB, *getSubtarget());
  // Dynamic vector-element writes; may insert control flow.
  case AMDGPU::SI_INDIRECT_DST_V1:
  case AMDGPU::SI_INDIRECT_DST_V2:
  case AMDGPU::SI_INDIRECT_DST_V4:
  case AMDGPU::SI_INDIRECT_DST_V8:
  case AMDGPU::SI_INDIRECT_DST_V16:
    return emitIndirectDst(MI, *BB, *getSubtarget());
  case AMDGPU::SI_KILL:
    return splitKillBlock(MI, BB);
  case AMDGPU::V_CNDMASK_B64_PSEUDO: {
    // Expand a 64-bit conditional select into two 32-bit V_CNDMASKs on the
    // low and high halves, recombined with a REG_SEQUENCE.
    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

    unsigned Dst = MI.getOperand(0).getReg();
    unsigned Src0 = MI.getOperand(1).getReg();
    unsigned Src1 = MI.getOperand(2).getReg();
    const DebugLoc &DL = MI.getDebugLoc();
    unsigned SrcCond = MI.getOperand(3).getReg();

    unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
      .addReg(Src0, 0, AMDGPU::sub0)
      .addReg(Src1, 0, AMDGPU::sub0)
      .addReg(SrcCond);
    BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
      .addReg(Src0, 0, AMDGPU::sub1)
      .addReg(Src1, 0, AMDGPU::sub1)
      .addReg(SrcCond);

    BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
      .addReg(DstLo)
      .addImm(AMDGPU::sub0)
      .addReg(DstHi)
      .addImm(AMDGPU::sub1);
    MI.eraseFromParent();
    return BB;
  }
  default:
    return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
  }
}
|
|
|
|
|
2015-01-30 03:34:32 +08:00
|
|
|
bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
  // Enabling this forces unfolding of various fsub combinations into fma
  // with free fneg'd operands. While we have fast FMA (controlled by
  // isFMAFasterThanFMulAndFAdd), these are worth performing.
  //
  // Even where fma is quarter rate — f64, where add/sub are at best half
  // rate — most of these combines appear cycle neutral and still save
  // instruction count / code size, so answer yes unconditionally.
  return true;
}
|
|
|
|
|
2015-07-09 10:09:04 +08:00
|
|
|
EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
                                         EVT VT) const {
  // Comparisons of vectors yield a vector of i1 with one lane per element;
  // scalar comparisons yield a plain i1.
  if (VT.isVector())
    return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
  return MVT::i1;
}
|
|
|
|
|
2015-07-09 23:12:23 +08:00
|
|
|
// Shift amounts are always represented as i32, regardless of the type being
// shifted or the data layout.
MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT) const {
  return MVT::i32;
}
|
|
|
|
|
2015-01-30 03:34:32 +08:00
|
|
|
// Answering this is somewhat tricky and depends on the specific device which
|
|
|
|
// have different rates for fma or all f64 operations.
|
|
|
|
//
|
|
|
|
// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
|
|
|
|
// regardless of which device (although the number of cycles differs between
|
|
|
|
// devices), so it is always profitable for f64.
|
|
|
|
//
|
|
|
|
// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
|
|
|
|
// only on full rate devices. Normally, we should prefer selecting v_mad_f32
|
|
|
|
// which we can always do even without fused FP ops since it returns the same
|
|
|
|
// result as the separate operations and since it is always full
|
|
|
|
// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
|
|
|
|
// however does not support denormals, so we do report fma as faster if we have
|
|
|
|
// a fast fma device and require denormals.
|
|
|
|
//
|
2013-08-10 18:38:54 +08:00
|
|
|
bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
  VT = VT.getScalarType();

  // Non-simple types never map onto a single fast fma instruction.
  if (!VT.isSimple())
    return false;

  const MVT::SimpleValueType ScalarTy = VT.getSimpleVT().SimpleTy;

  // v_fma_f64 costs the same as v_mul_f64 on every device, so fusing f64 is
  // always profitable.
  if (ScalarTy == MVT::f64)
    return true;

  // For f32 we normally prefer v_mad_f32, which is full rate everywhere and
  // returns the same result as the separate operations. v_mad_f32 does not
  // support denormals though, so only report fma as faster on devices with
  // fast fma when denormal support is required.
  if (ScalarTy == MVT::f32)
    return Subtarget->hasFP32Denormals() && Subtarget->hasFastFMAF32();

  return false;
}
|
|
|
|
|
2012-12-12 05:25:42 +08:00
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
// Custom DAG Lowering Operations
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
/// Dispatch custom-lowered operations to their dedicated lowering routines;
/// anything unhandled falls through to the AMDGPU common lowering.
SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  case ISD::BRCOND:
    return LowerBRCOND(Op, DAG);
  case ISD::LOAD: {
    SDValue Lowered = LowerLOAD(Op, DAG);
    assert((!Lowered.getNode() ||
            Lowered.getNode()->getNumValues() == 2) &&
           "Load should return a value and a chain");
    return Lowered;
  }
  case ISD::FSIN:
  case ISD::FCOS:
    return LowerTrig(Op, DAG);
  case ISD::SELECT:
    return LowerSELECT(Op, DAG);
  case ISD::FDIV:
    return LowerFDIV(Op, DAG);
  case ISD::ATOMIC_CMP_SWAP:
    return LowerATOMIC_CMP_SWAP(Op, DAG);
  case ISD::STORE:
    return LowerSTORE(Op, DAG);
  case ISD::GlobalAddress: {
    MachineFunction &Fn = DAG.getMachineFunction();
    return LowerGlobalAddress(Fn.getInfo<SIMachineFunctionInfo>(), Op, DAG);
  }
  case ISD::INTRINSIC_WO_CHAIN:
    return LowerINTRINSIC_WO_CHAIN(Op, DAG);
  case ISD::INTRINSIC_W_CHAIN:
    return LowerINTRINSIC_W_CHAIN(Op, DAG);
  case ISD::INTRINSIC_VOID:
    return LowerINTRINSIC_VOID(Op, DAG);
  case ISD::ADDRSPACECAST:
    return lowerADDRSPACECAST(Op, DAG);
  case ISD::TRAP:
    return lowerTRAP(Op, DAG);
  case ISD::ConstantFP:
    return lowerConstantFP(Op, DAG);
  case ISD::FP_ROUND:
    return lowerFP_ROUND(Op, DAG);
  default:
    return AMDGPUTargetLowering::LowerOperation(Op, DAG);
  }
  return SDValue();
}
|
|
|
|
|
2012-12-20 06:10:31 +08:00
|
|
|
/// \brief Helper function for LowerBRCOND
|
|
|
|
static SDNode *findUser(SDValue Value, unsigned Opcode) {
|
|
|
|
|
|
|
|
SDNode *Parent = Value.getNode();
|
|
|
|
for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
|
|
|
|
I != E; ++I) {
|
|
|
|
|
|
|
|
if (I.getUse().get() != Value)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (I->getOpcode() == Opcode)
|
|
|
|
return *I;
|
|
|
|
}
|
2014-04-25 13:30:21 +08:00
|
|
|
return nullptr;
|
2012-12-20 06:10:31 +08:00
|
|
|
}
|
|
|
|
|
2016-02-13 07:45:29 +08:00
|
|
|
bool SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
|
2016-09-17 06:11:18 +08:00
|
|
|
if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
|
|
|
|
switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
|
|
|
|
case AMDGPUIntrinsic::amdgcn_if:
|
|
|
|
case AMDGPUIntrinsic::amdgcn_else:
|
|
|
|
case AMDGPUIntrinsic::amdgcn_end_cf:
|
|
|
|
case AMDGPUIntrinsic::amdgcn_loop:
|
|
|
|
return true;
|
|
|
|
default:
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
2016-02-13 07:45:29 +08:00
|
|
|
|
2016-09-17 06:11:18 +08:00
|
|
|
if (Intr->getOpcode() == ISD::INTRINSIC_WO_CHAIN) {
|
|
|
|
switch (cast<ConstantSDNode>(Intr->getOperand(0))->getZExtValue()) {
|
|
|
|
case AMDGPUIntrinsic::amdgcn_break:
|
|
|
|
case AMDGPUIntrinsic::amdgcn_if_break:
|
|
|
|
case AMDGPUIntrinsic::amdgcn_else_break:
|
|
|
|
return true;
|
|
|
|
default:
|
|
|
|
return false;
|
|
|
|
}
|
2016-02-13 07:45:29 +08:00
|
|
|
}
|
2016-09-17 06:11:18 +08:00
|
|
|
|
|
|
|
return false;
|
2016-02-13 07:45:29 +08:00
|
|
|
}
|
|
|
|
|
2016-06-25 11:11:28 +08:00
|
|
|
void SITargetLowering::createDebuggerPrologueStackObjects(
    MachineFunction &MF) const {
  // Reserve the fixed scratch slots the debugger prologue writes to. The
  // layout is fixed:
  //   bytes  0 /  4 /  8 : work group ID x / y / z
  //   bytes 16 / 20 / 24 : work item ID x / y / z
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();

  for (unsigned Dim = 0; Dim < 3; ++Dim) {
    // Fixed 4-byte slot for this dimension's work group ID.
    int WorkGroupIDIdx = FrameInfo.CreateFixedObject(4, Dim * 4, true);
    Info->setDebuggerWorkGroupIDStackObjectIndex(Dim, WorkGroupIDIdx);

    // Fixed 4-byte slot for this dimension's work item ID, 16 bytes in.
    int WorkItemIDIdx = FrameInfo.CreateFixedObject(4, Dim * 4 + 16, true);
    Info->setDebuggerWorkItemIDStackObjectIndex(Dim, WorkItemIDIdx);
  }
}
|
|
|
|
|
2016-10-21 02:12:38 +08:00
|
|
|
bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
|
|
|
|
const Triple &TT = getTargetMachine().getTargetTriple();
|
|
|
|
return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
|
|
|
|
AMDGPU::shouldEmitConstantsToTextSection(TT);
|
|
|
|
}
|
|
|
|
|
|
|
|
bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
|
|
|
|
return (GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
|
|
|
|
GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS) &&
|
|
|
|
!shouldEmitFixup(GV) &&
|
|
|
|
!getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
|
|
|
|
}
|
|
|
|
|
|
|
|
bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
|
|
|
|
return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
|
|
|
|
}
|
|
|
|
|
2012-12-20 06:10:31 +08:00
|
|
|
/// This transforms the control flow intrinsics to get the branch destination as
/// last parameter, also switches branch target with BR if the need arise
SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
                                      SelectionDAG &DAG) const {
  SDLoc DL(BRCOND);

  // BRCOND operands: (chain, condition, destination basic block).
  SDNode *Intr = BRCOND.getOperand(1).getNode();
  SDValue Target = BRCOND.getOperand(2);
  SDNode *BR = nullptr;
  SDNode *SetCC = nullptr;

  if (Intr->getOpcode() == ISD::SETCC) {
    // As long as we negate the condition everything is fine
    // The condition is (setcc (intrinsic), 1, ne); look through the setcc to
    // the underlying intrinsic (the inversion is asserted below).
    SetCC = Intr;
    Intr = SetCC->getOperand(0).getNode();

  } else {
    // Get the target from BR if we don't negate the condition
    // NOTE(review): findUser may return null if no ISD::BR user exists;
    // this dereferences it unconditionally — presumably a BR always follows
    // an un-negated BRCOND here. TODO confirm and/or add a null check.
    BR = findUser(BRCOND, ISD::BR);
    Target = BR->getOperand(1);
  }

  // FIXME: This changes the types of the intrinsics instead of introducing new
  // nodes with the correct types.
  // e.g. llvm.amdgcn.loop

  // eg: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3
  // => t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch<bb1 0x7fee5286d088>

  if (!isCFIntrinsic(Intr)) {
    // This is a uniform branch so we don't need to legalize.
    return BRCOND;
  }

  // Chainless intrinsics (the break variants) need the chain spliced in when
  // the node is rebuilt; chained ones already carry it as operand 0.
  bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
                   Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;

  // If we looked through a setcc, it must be the (x == 1) != 0 form, i.e. a
  // pure negation — otherwise silently dropping it would be wrong.
  assert(!SetCC ||
        (SetCC->getConstantOperandVal(1) == 1 &&
         cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
                                                             ISD::SETNE));

  // operands of the new intrinsic call
  SmallVector<SDValue, 4> Ops;
  if (HaveChain)
    Ops.push_back(BRCOND.getOperand(0));

  // Copy the original intrinsic operands (skipping its own chain if it had
  // one), then append the branch target as the new last operand.
  Ops.append(Intr->op_begin() + (HaveChain ? 1 : 0), Intr->op_end());
  Ops.push_back(Target);

  // Result types of the rebuilt node: drop the first result (the i1
  // condition), keep the rest (ending in the chain).
  ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());

  // build the new intrinsic call
  SDNode *Result = DAG.getNode(
    Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL,
    DAG.getVTList(Res), Ops).getNode();

  if (!HaveChain) {
    // Give the chainless rebuild a chain result by merging in BRCOND's
    // incoming chain, so the uses below can be rewritten uniformly.
    SDValue Ops[] = {
      SDValue(Result, 0),
      BRCOND.getOperand(0)
    };

    Result = DAG.getMergeValues(Ops, DL).getNode();
  }

  if (BR) {
    // Give the branch instruction our target
    SDValue Ops[] = {
      BR->getOperand(0),
      BRCOND.getOperand(2)
    };
    SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
    DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
    BR = NewBR.getNode();
  }

  // The chain is always the last result of the rebuilt node.
  SDValue Chain = SDValue(Result, Result->getNumValues() - 1);

  // Copy the intrinsic results to registers
  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
    SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
    if (!CopyToReg)
      continue;

    Chain = DAG.getCopyToReg(
      Chain, DL,
      CopyToReg->getOperand(1),
      SDValue(Result, i - 1),
      SDValue());

    // Route users of the old CopyToReg's chain around it.
    DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
  }

  // Remove the old intrinsic from the chain
  DAG.ReplaceAllUsesOfValueWith(
    SDValue(Intr, Intr->getNumValues() - 1),
    Intr->getOperand(0));

  return Chain;
}
|
|
|
|
|
2016-11-13 15:01:11 +08:00
|
|
|
/// Widen or narrow the floating-point value \p Op to type \p VT.
///
/// Narrowing must use ISD::FP_ROUND: ISD::FTRUNC is the unary
/// round-toward-integer operation whose result type must equal its operand
/// type, so it is invalid for a precision conversion. FP_ROUND takes an
/// extra flag operand; 0 means the value is not known to be exact when
/// narrowed.
SDValue SITargetLowering::getFPExtOrFPTrunc(SelectionDAG &DAG,
                                            SDValue Op,
                                            const SDLoc &DL,
                                            EVT VT) const {
  if (Op.getValueType().bitsLE(VT))
    return DAG.getNode(ISD::FP_EXTEND, DL, VT, Op);

  return DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
                     DAG.getTargetConstant(0, DL, MVT::i32));
}
|
|
|
|
|
|
|
|
// Fold an FP constant into the i32 constant holding the same bit pattern.
// Returns the empty SDValue when Op is not a ConstantFP node.
SDValue SITargetLowering::lowerConstantFP(SDValue Op, SelectionDAG &DAG) const {
  const auto *FP = dyn_cast<ConstantFPSDNode>(Op);
  if (!FP)
    return SDValue();

  uint64_t Bits = FP->getValueAPF().bitcastToAPInt().getZExtValue();
  return DAG.getConstant(Bits, SDLoc(Op), MVT::i32);
}
|
|
|
|
|
2016-11-17 12:28:37 +08:00
|
|
|
/// Custom lowering for FP_ROUND to f16. Only the f64 source needs work:
/// it is lowered as f64 -> i32 (FP_TO_FP16) -> truncate to i16 -> bitcast
/// to f16. Any other source type is returned unchanged for default handling.
SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
  assert(Op.getValueType() == MVT::f16 &&
         "Do not know how to custom lower FP_ROUND for non-f16 type");

  SDValue Src = Op.getOperand(0);
  EVT SrcVT = Src.getValueType();
  if (SrcVT != MVT::f64)
    return Op;

  SDLoc DL(Op);

  // FP_TO_FP16 produces the half bits in the low 16 bits of an i32.
  SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
  // (Fixed: stray second semicolon removed from the return below.)
  return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
}
|
|
|
|
|
2016-04-26 03:27:24 +08:00
|
|
|
// Load the aperture base for the LOCAL or PRIVATE segment (the high 32 bits
// of the corresponding flat address range) out of the amd_queue_t structure
// reachable through the preloaded queue-pointer SGPR pair.
SDValue SITargetLowering::getSegmentAperture(unsigned AS,
                                             SelectionDAG &DAG) const {
  SDLoc SL;
  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  unsigned UserSGPR = Info->getQueuePtrUserSGPR();
  assert(UserSGPR != AMDGPU::NoRegister);

  SDValue QueuePtr = CreateLiveInRegister(
    DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  SDValue ApertureAddr =
    DAG.getNode(ISD::ADD, SL, MVT::i64, QueuePtr,
                DAG.getConstant(StructOffset, SL, MVT::i64));

  // TODO: Use custom target PseudoSourceValue.
  // TODO: We should use the value from the IR intrinsic call, but it might not
  // be available and how do we get it?
  Value *V = UndefValue::get(
    PointerType::get(Type::getInt8Ty(*DAG.getContext()),
                     AMDGPUAS::CONSTANT_ADDRESS));

  MachinePointerInfo PtrInfo(V, StructOffset);
  return DAG.getLoad(MVT::i32, SL, QueuePtr.getValue(1), ApertureAddr, PtrInfo,
                     MinAlign(64, StructOffset),
                     MachineMemOperand::MODereferenceable |
                         MachineMemOperand::MOInvariant);
}
|
|
|
|
|
|
|
|
SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
                                             SelectionDAG &DAG) const {
  SDLoc SL(Op);
  const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);

  SDValue Src = ASC->getOperand(0);
  const unsigned SrcAS = ASC->getSrcAddressSpace();
  const unsigned DestAS = ASC->getDestAddressSpace();

  // FIXME: Really support non-0 null pointers.
  SDValue SegmentNullPtr = DAG.getConstant(-1, SL, MVT::i32);
  SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);

  auto IsSegment = [](unsigned AS) {
    return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS;
  };

  // flat -> local/private: truncate to the 32-bit segment offset, mapping a
  // null flat pointer to the segment null value.
  if (SrcAS == AMDGPUAS::FLAT_ADDRESS && IsSegment(DestAS)) {
    SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
    SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);

    return DAG.getNode(ISD::SELECT, SL, MVT::i32,
                       NonNull, Ptr, SegmentNullPtr);
  }

  // local/private -> flat: pair the 32-bit offset with the aperture base,
  // mapping the segment null value to the null flat pointer.
  if (DestAS == AMDGPUAS::FLAT_ADDRESS && IsSegment(SrcAS)) {
    SDValue NonNull
      = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);

    SDValue Aperture = getSegmentAperture(SrcAS, DAG);
    SDValue CvtPtr
      = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);

    return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull,
                       DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr),
                       FlatNullPtr);
  }

  // global <-> flat are no-ops and never emitted.

  const MachineFunction &MF = DAG.getMachineFunction();
  DiagnosticInfoUnsupported InvalidAddrSpaceCast(
    *MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
  DAG.getContext()->diagnose(InvalidAddrSpaceCast);

  return DAG.getUNDEF(ASC->getValueType(0));
}
|
|
|
|
|
2016-06-25 09:59:16 +08:00
|
|
|
bool
|
|
|
|
SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
|
2016-07-13 22:23:33 +08:00
|
|
|
// We can fold offsets for anything that doesn't require a GOT relocation.
|
2016-10-21 02:12:38 +08:00
|
|
|
return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
|
|
|
|
GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS) &&
|
|
|
|
!shouldEmitGOTReloc(GA->getGlobal());
|
2016-06-25 09:59:16 +08:00
|
|
|
}
|
|
|
|
|
2016-07-13 22:23:33 +08:00
|
|
|
// Build a PC_ADD_REL_OFFSET node addressing \p GV pc-relatively.
//
// The node is lowered to:
//
//   s_getpc_b64 s[0:1]
//   s_add_u32  s0, s0, $symbol            (or $symbol@{gotpc}rel32@lo)
//   s_addc_u32 s1, s1, 0                  (or $symbol@{gotpc}rel32@hi)
//
// With MO_NONE a single 32-bit fixup replaces $symbol with a literal
// pc-relative offset; with a relocation flag, lo/hi fixups carry the low and
// high halves of a 64-bit pc-relative offset (the hi flag is GAFlags + 1 by
// convention).
//
// s_getpc_b64 returns the address of the s_add_u32 instruction, but the
// encoding of $symbol begins 4 bytes after the start of that instruction, so
// the offset would come out 4 bytes short; adding 4 to the global's offset
// compensates.
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
                                       SDLoc DL, unsigned Offset, EVT PtrVT,
                                       unsigned GAFlags = SIInstrInfo::MO_NONE) {
  SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
                                             GAFlags);
  unsigned HiFlags = GAFlags == SIInstrInfo::MO_NONE ? GAFlags : GAFlags + 1;
  SDValue PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
                                             HiFlags);

  return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
}
|
|
|
|
|
2016-07-13 22:23:33 +08:00
|
|
|
SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
                                             SDValue Op,
                                             SelectionDAG &DAG) const {
  GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
  const unsigned AS = GSD->getAddressSpace();

  // Only global/constant address space globals get the pc-relative or GOT
  // treatment; everything else goes through the generic lowering.
  if (AS != AMDGPUAS::CONSTANT_ADDRESS && AS != AMDGPUAS::GLOBAL_ADDRESS)
    return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);

  SDLoc DL(GSD);
  const GlobalValue *GV = GSD->getGlobal();
  EVT PtrVT = Op.getValueType();

  if (shouldEmitFixup(GV))
    return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);

  if (shouldEmitPCReloc(GV))
    return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
                                   SIInstrInfo::MO_REL32);

  // Otherwise, the address is loaded out of the GOT, which is itself
  // addressed pc-relatively.
  SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
                                            SIInstrInfo::MO_GOTPCREL32);

  Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
  PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
  const DataLayout &DataLayout = DAG.getDataLayout();
  unsigned Align = DataLayout.getABITypeAlignment(PtrTy);
  // FIXME: Use a PseudoSourceValue once those can be assigned an address space.
  MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));

  return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align,
                     MachineMemOperand::MODereferenceable |
                         MachineMemOperand::MOInvariant);
}
|
|
|
|
|
2016-06-18 06:27:03 +08:00
|
|
|
// Lower ISD::TRAP. No trap handler is set up, so warn and emit s_endpgm so
// the program at least terminates.
SDValue SITargetLowering::lowerTRAP(SDValue Op,
                                    SelectionDAG &DAG) const {
  const MachineFunction &MF = DAG.getMachineFunction();
  DiagnosticInfoUnsupported NoTrap(*MF.getFunction(),
                                   "trap handler not supported",
                                   Op.getDebugLoc(),
                                   DS_Warning);
  DAG.getContext()->diagnose(NoTrap);

  // FIXME: This should really be selected to s_trap, but that requires
  // setting up the trap handler for it to do anything.
  return DAG.getNode(AMDGPUISD::ENDPGM, SDLoc(Op), MVT::Other,
                     Op.getOperand(0));
}
|
|
|
|
|
2016-06-12 23:39:02 +08:00
|
|
|
SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
                                   const SDLoc &DL, SDValue V) const {
  // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
  // the destination register.
  //
  // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
  // so we will end up with redundant moves to m0.
  //
  // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result;
  // passing MVT::Glue as a result type gives the node a glue result.
  SDValue InitM0(DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other,
                                    MVT::Glue, V, Chain),
                 0);
  return InitM0;
}
|
|
|
|
|
2015-12-01 05:15:45 +08:00
|
|
|
// Load a 32-bit kernel argument at \p Offset whose meaningful bits fit in
// \p VT, asserting that the bits above VT are zero.
SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
                                                 SDValue Op,
                                                 MVT VT,
                                                 unsigned Offset) const {
  SDLoc SL(Op);
  SDValue Load = LowerParameter(DAG, MVT::i32, MVT::i32, SL,
                                DAG.getEntryNode(), Offset, false);
  // The local size values will have the hi 16-bits as zero.
  return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Load,
                     DAG.getValueType(VT));
}
|
|
|
|
|
2016-06-21 02:33:56 +08:00
|
|
|
// Diagnose use of an HSA-only intrinsic on the wrong target and fold the
// result to undef so lowering can proceed.
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, SDLoc DL, EVT VT) {
  DiagnosticInfoUnsupported Diag(*DAG.getMachineFunction().getFunction(),
                                 "non-hsa intrinsic with hsa target",
                                 DL.getDebugLoc());
  DAG.getContext()->diagnose(Diag);
  return DAG.getUNDEF(VT);
}
|
|
|
|
|
|
|
|
// Diagnose use of an intrinsic that was removed on this subtarget and fold
// the result to undef so lowering can proceed.
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, SDLoc DL, EVT VT) {
  DiagnosticInfoUnsupported Diag(*DAG.getMachineFunction().getFunction(),
                                 "intrinsic not supported on subtarget",
                                 DL.getDebugLoc());
  DAG.getContext()->diagnose(Diag);
  return DAG.getUNDEF(VT);
}
|
|
|
|
|
2014-07-26 14:23:37 +08:00
|
|
|
SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
|
|
|
|
SelectionDAG &DAG) const {
|
|
|
|
MachineFunction &MF = DAG.getMachineFunction();
|
2015-07-10 05:20:37 +08:00
|
|
|
auto MFI = MF.getInfo<SIMachineFunctionInfo>();
|
2016-06-24 14:30:11 +08:00
|
|
|
const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
|
2014-07-26 14:23:37 +08:00
|
|
|
|
|
|
|
EVT VT = Op.getValueType();
|
|
|
|
SDLoc DL(Op);
|
|
|
|
unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
|
|
|
|
|
2015-09-17 00:31:21 +08:00
|
|
|
// TODO: Should this propagate fast-math-flags?
|
|
|
|
|
2014-07-26 14:23:37 +08:00
|
|
|
switch (IntrinsicID) {
|
2015-11-26 08:43:29 +08:00
|
|
|
case Intrinsic::amdgcn_dispatch_ptr:
|
2016-04-26 03:27:18 +08:00
|
|
|
case Intrinsic::amdgcn_queue_ptr: {
|
2016-09-17 05:34:26 +08:00
|
|
|
if (!Subtarget->isAmdCodeObjectV2()) {
|
2016-02-02 21:52:43 +08:00
|
|
|
DiagnosticInfoUnsupported BadIntrin(
|
|
|
|
*MF.getFunction(), "unsupported hsa intrinsic without hsa target",
|
|
|
|
DL.getDebugLoc());
|
2016-01-12 05:18:33 +08:00
|
|
|
DAG.getContext()->diagnose(BadIntrin);
|
|
|
|
return DAG.getUNDEF(VT);
|
|
|
|
}
|
|
|
|
|
2016-04-26 03:27:18 +08:00
|
|
|
auto Reg = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
|
|
|
|
SIRegisterInfo::DISPATCH_PTR : SIRegisterInfo::QUEUE_PTR;
|
2015-11-26 08:43:29 +08:00
|
|
|
return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass,
|
2016-04-26 03:27:18 +08:00
|
|
|
TRI->getPreloadedValue(MF, Reg), VT);
|
|
|
|
}
|
2016-06-22 04:46:20 +08:00
|
|
|
case Intrinsic::amdgcn_implicitarg_ptr: {
|
|
|
|
unsigned offset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT);
|
|
|
|
return LowerParameterPtr(DAG, DL, DAG.getEntryNode(), offset);
|
|
|
|
}
|
2016-04-30 05:16:52 +08:00
|
|
|
case Intrinsic::amdgcn_kernarg_segment_ptr: {
|
|
|
|
unsigned Reg
|
|
|
|
= TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
|
|
|
|
return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT);
|
|
|
|
}
|
2016-07-23 01:01:30 +08:00
|
|
|
case Intrinsic::amdgcn_dispatch_id: {
|
|
|
|
unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_ID);
|
|
|
|
return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT);
|
|
|
|
}
|
2016-01-23 13:32:20 +08:00
|
|
|
case Intrinsic::amdgcn_rcp:
|
|
|
|
return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
|
|
|
|
case Intrinsic::amdgcn_rsq:
|
2016-01-26 12:14:16 +08:00
|
|
|
case AMDGPUIntrinsic::AMDGPU_rsq: // Legacy name
|
2016-01-23 13:32:20 +08:00
|
|
|
return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
|
2016-06-21 02:33:56 +08:00
|
|
|
case Intrinsic::amdgcn_rsq_legacy: {
|
2016-06-24 14:30:11 +08:00
|
|
|
if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
|
2016-06-21 02:33:56 +08:00
|
|
|
return emitRemovedIntrinsicError(DAG, DL, VT);
|
|
|
|
|
|
|
|
return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
|
|
|
|
}
|
2016-07-27 00:45:45 +08:00
|
|
|
case Intrinsic::amdgcn_rcp_legacy: {
|
|
|
|
if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
|
|
|
|
return emitRemovedIntrinsicError(DAG, DL, VT);
|
|
|
|
return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
|
|
|
|
}
|
2016-07-16 05:26:52 +08:00
|
|
|
case Intrinsic::amdgcn_rsq_clamp: {
|
2016-06-24 14:30:11 +08:00
|
|
|
if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
|
2016-02-13 09:03:00 +08:00
|
|
|
return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
|
2016-01-23 13:32:20 +08:00
|
|
|
|
|
|
|
Type *Type = VT.getTypeForEVT(*DAG.getContext());
|
|
|
|
APFloat Max = APFloat::getLargest(Type->getFltSemantics());
|
|
|
|
APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
|
|
|
|
|
|
|
|
SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
|
|
|
|
SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
|
|
|
|
DAG.getConstantFP(Max, DL, VT));
|
|
|
|
return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
|
|
|
|
DAG.getConstantFP(Min, DL, VT));
|
|
|
|
}
|
2014-07-26 14:23:37 +08:00
|
|
|
case Intrinsic::r600_read_ngroups_x:
|
2016-01-30 13:19:45 +08:00
|
|
|
if (Subtarget->isAmdHsaOS())
|
2016-06-21 02:33:56 +08:00
|
|
|
return emitNonHSAIntrinsicError(DAG, DL, VT);
|
2016-01-30 13:19:45 +08:00
|
|
|
|
2014-09-22 23:35:29 +08:00
|
|
|
return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
|
|
|
|
SI::KernelInputOffsets::NGROUPS_X, false);
|
2014-07-26 14:23:37 +08:00
|
|
|
case Intrinsic::r600_read_ngroups_y:
|
2016-01-30 13:19:45 +08:00
|
|
|
if (Subtarget->isAmdHsaOS())
|
2016-06-21 02:33:56 +08:00
|
|
|
return emitNonHSAIntrinsicError(DAG, DL, VT);
|
2016-01-30 13:19:45 +08:00
|
|
|
|
2014-09-22 23:35:29 +08:00
|
|
|
return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
|
|
|
|
SI::KernelInputOffsets::NGROUPS_Y, false);
|
2014-07-26 14:23:37 +08:00
|
|
|
case Intrinsic::r600_read_ngroups_z:
|
2016-01-30 13:19:45 +08:00
|
|
|
if (Subtarget->isAmdHsaOS())
|
2016-06-21 02:33:56 +08:00
|
|
|
return emitNonHSAIntrinsicError(DAG, DL, VT);
|
2016-01-30 13:19:45 +08:00
|
|
|
|
2014-09-22 23:35:29 +08:00
|
|
|
return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
|
|
|
|
SI::KernelInputOffsets::NGROUPS_Z, false);
|
2014-07-26 14:23:37 +08:00
|
|
|
case Intrinsic::r600_read_global_size_x:
|
2016-01-30 13:19:45 +08:00
|
|
|
if (Subtarget->isAmdHsaOS())
|
2016-06-21 02:33:56 +08:00
|
|
|
return emitNonHSAIntrinsicError(DAG, DL, VT);
|
2016-01-30 13:19:45 +08:00
|
|
|
|
2014-09-22 23:35:29 +08:00
|
|
|
return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
|
|
|
|
SI::KernelInputOffsets::GLOBAL_SIZE_X, false);
|
2014-07-26 14:23:37 +08:00
|
|
|
case Intrinsic::r600_read_global_size_y:
|
2016-01-30 13:19:45 +08:00
|
|
|
if (Subtarget->isAmdHsaOS())
|
2016-06-21 02:33:56 +08:00
|
|
|
return emitNonHSAIntrinsicError(DAG, DL, VT);
|
2016-01-30 13:19:45 +08:00
|
|
|
|
2014-09-22 23:35:29 +08:00
|
|
|
return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
|
|
|
|
SI::KernelInputOffsets::GLOBAL_SIZE_Y, false);
|
2014-07-26 14:23:37 +08:00
|
|
|
case Intrinsic::r600_read_global_size_z:
|
2016-01-30 13:19:45 +08:00
|
|
|
if (Subtarget->isAmdHsaOS())
|
2016-06-21 02:33:56 +08:00
|
|
|
return emitNonHSAIntrinsicError(DAG, DL, VT);
|
2016-01-30 13:19:45 +08:00
|
|
|
|
2014-09-22 23:35:29 +08:00
|
|
|
return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
|
|
|
|
SI::KernelInputOffsets::GLOBAL_SIZE_Z, false);
|
2014-07-26 14:23:37 +08:00
|
|
|
case Intrinsic::r600_read_local_size_x:
|
2016-01-30 13:19:45 +08:00
|
|
|
if (Subtarget->isAmdHsaOS())
|
2016-06-21 02:33:56 +08:00
|
|
|
return emitNonHSAIntrinsicError(DAG, DL, VT);
|
2016-01-30 13:19:45 +08:00
|
|
|
|
2015-12-01 05:15:45 +08:00
|
|
|
return lowerImplicitZextParam(DAG, Op, MVT::i16,
|
|
|
|
SI::KernelInputOffsets::LOCAL_SIZE_X);
|
2014-07-26 14:23:37 +08:00
|
|
|
case Intrinsic::r600_read_local_size_y:
|
2016-01-30 13:19:45 +08:00
|
|
|
if (Subtarget->isAmdHsaOS())
|
2016-06-21 02:33:56 +08:00
|
|
|
return emitNonHSAIntrinsicError(DAG, DL, VT);
|
2016-01-30 13:19:45 +08:00
|
|
|
|
2015-12-01 05:15:45 +08:00
|
|
|
return lowerImplicitZextParam(DAG, Op, MVT::i16,
|
|
|
|
SI::KernelInputOffsets::LOCAL_SIZE_Y);
|
2014-07-26 14:23:37 +08:00
|
|
|
case Intrinsic::r600_read_local_size_z:
|
2016-01-30 13:19:45 +08:00
|
|
|
if (Subtarget->isAmdHsaOS())
|
2016-06-21 02:33:56 +08:00
|
|
|
return emitNonHSAIntrinsicError(DAG, DL, VT);
|
2016-01-30 13:19:45 +08:00
|
|
|
|
2015-12-01 05:15:45 +08:00
|
|
|
return lowerImplicitZextParam(DAG, Op, MVT::i16,
|
|
|
|
SI::KernelInputOffsets::LOCAL_SIZE_Z);
|
2016-01-30 12:25:19 +08:00
|
|
|
case Intrinsic::amdgcn_workgroup_id_x:
|
2014-07-26 14:23:37 +08:00
|
|
|
case Intrinsic::r600_read_tgid_x:
|
2016-11-26 01:37:09 +08:00
|
|
|
return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass,
|
2015-12-01 05:15:57 +08:00
|
|
|
TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X), VT);
|
2016-01-30 12:25:19 +08:00
|
|
|
case Intrinsic::amdgcn_workgroup_id_y:
|
2014-07-26 14:23:37 +08:00
|
|
|
case Intrinsic::r600_read_tgid_y:
|
2016-11-26 01:37:09 +08:00
|
|
|
return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass,
|
2015-12-01 05:15:57 +08:00
|
|
|
TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Y), VT);
|
2016-01-30 12:25:19 +08:00
|
|
|
case Intrinsic::amdgcn_workgroup_id_z:
|
2014-07-26 14:23:37 +08:00
|
|
|
case Intrinsic::r600_read_tgid_z:
|
2016-11-26 01:37:09 +08:00
|
|
|
return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass,
|
2015-12-01 05:15:57 +08:00
|
|
|
TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Z), VT);
|
2016-01-30 12:25:19 +08:00
|
|
|
case Intrinsic::amdgcn_workitem_id_x:
|
2014-07-26 14:23:37 +08:00
|
|
|
case Intrinsic::r600_read_tidig_x:
|
2015-01-08 04:59:25 +08:00
|
|
|
return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
|
2015-12-01 05:15:57 +08:00
|
|
|
TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X), VT);
|
2016-01-30 12:25:19 +08:00
|
|
|
case Intrinsic::amdgcn_workitem_id_y:
|
2014-07-26 14:23:37 +08:00
|
|
|
case Intrinsic::r600_read_tidig_y:
|
2015-01-08 04:59:25 +08:00
|
|
|
return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
|
2015-12-01 05:15:57 +08:00
|
|
|
TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y), VT);
|
2016-01-30 12:25:19 +08:00
|
|
|
case Intrinsic::amdgcn_workitem_id_z:
|
2014-07-26 14:23:37 +08:00
|
|
|
case Intrinsic::r600_read_tidig_z:
|
2015-01-08 04:59:25 +08:00
|
|
|
return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
|
2015-12-01 05:15:57 +08:00
|
|
|
TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z), VT);
|
2014-07-26 14:23:37 +08:00
|
|
|
case AMDGPUIntrinsic::SI_load_const: {
|
|
|
|
SDValue Ops[] = {
|
|
|
|
Op.getOperand(1),
|
|
|
|
Op.getOperand(2)
|
|
|
|
};
|
|
|
|
|
|
|
|
MachineMemOperand *MMO = MF.getMachineMemOperand(
|
[CodeGen] Split out the notions of MI invariance and MI dereferenceability.
Summary:
An IR load can be invariant, dereferenceable, neither, or both. But
currently, MI's notion of invariance is IR-invariant &&
IR-dereferenceable.
This patch splits up the notions of invariance and dereferenceability at
the MI level. It's NFC, so adds some probably-unnecessary
"is-dereferenceable" checks, which we can remove later if desired.
Reviewers: chandlerc, tstellarAMD
Subscribers: jholewinski, arsenm, nemanjai, llvm-commits
Differential Revision: https://reviews.llvm.org/D23371
llvm-svn: 281151
2016-09-11 09:38:58 +08:00
|
|
|
MachinePointerInfo(),
|
|
|
|
MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
|
|
|
|
MachineMemOperand::MOInvariant,
|
|
|
|
VT.getStoreSize(), 4);
|
2014-07-26 14:23:37 +08:00
|
|
|
return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
|
|
|
|
Op->getVTList(), Ops, VT, MMO);
|
|
|
|
}
|
2016-07-20 07:16:53 +08:00
|
|
|
case AMDGPUIntrinsic::amdgcn_fdiv_fast: {
|
|
|
|
return lowerFDIV_FAST(Op, DAG);
|
|
|
|
}
|
2014-07-26 14:23:37 +08:00
|
|
|
case AMDGPUIntrinsic::SI_vs_load_input:
|
|
|
|
return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT,
|
|
|
|
Op.getOperand(1),
|
|
|
|
Op.getOperand(2),
|
|
|
|
Op.getOperand(3));
|
2015-03-24 21:40:08 +08:00
|
|
|
|
2015-05-12 23:00:46 +08:00
|
|
|
case AMDGPUIntrinsic::SI_fs_constant: {
|
|
|
|
SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3));
|
|
|
|
SDValue Glue = M0.getValue(1);
|
|
|
|
return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32,
|
|
|
|
DAG.getConstant(2, DL, MVT::i32), // P0
|
|
|
|
Op.getOperand(1), Op.getOperand(2), Glue);
|
|
|
|
}
|
2015-10-29 23:29:09 +08:00
|
|
|
case AMDGPUIntrinsic::SI_packf16:
|
|
|
|
if (Op.getOperand(1).isUndef() && Op.getOperand(2).isUndef())
|
|
|
|
return DAG.getUNDEF(MVT::i32);
|
|
|
|
return Op;
|
2015-05-12 23:00:46 +08:00
|
|
|
case AMDGPUIntrinsic::SI_fs_interp: {
|
|
|
|
SDValue IJ = Op.getOperand(4);
|
|
|
|
SDValue I = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ,
|
|
|
|
DAG.getConstant(0, DL, MVT::i32));
|
|
|
|
SDValue J = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ,
|
|
|
|
DAG.getConstant(1, DL, MVT::i32));
|
|
|
|
SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3));
|
|
|
|
SDValue Glue = M0.getValue(1);
|
|
|
|
SDValue P1 = DAG.getNode(AMDGPUISD::INTERP_P1, DL,
|
|
|
|
DAG.getVTList(MVT::f32, MVT::Glue),
|
|
|
|
I, Op.getOperand(1), Op.getOperand(2), Glue);
|
|
|
|
Glue = SDValue(P1.getNode(), 1);
|
|
|
|
return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, P1, J,
|
|
|
|
Op.getOperand(1), Op.getOperand(2), Glue);
|
|
|
|
}
|
2015-12-16 01:02:49 +08:00
|
|
|
case Intrinsic::amdgcn_interp_p1: {
|
|
|
|
SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
|
|
|
|
SDValue Glue = M0.getValue(1);
|
|
|
|
return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1),
|
|
|
|
Op.getOperand(2), Op.getOperand(3), Glue);
|
|
|
|
}
|
|
|
|
case Intrinsic::amdgcn_interp_p2: {
|
|
|
|
SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5));
|
|
|
|
SDValue Glue = SDValue(M0.getNode(), 1);
|
|
|
|
return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1),
|
|
|
|
Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
|
|
|
|
Glue);
|
|
|
|
}
|
2016-02-13 09:19:56 +08:00
|
|
|
case Intrinsic::amdgcn_sin:
|
|
|
|
return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
|
|
|
|
|
|
|
|
case Intrinsic::amdgcn_cos:
|
|
|
|
return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
|
|
|
|
|
|
|
|
case Intrinsic::amdgcn_log_clamp: {
|
2016-06-24 14:30:11 +08:00
|
|
|
if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
|
2016-02-13 09:19:56 +08:00
|
|
|
return SDValue();
|
|
|
|
|
|
|
|
DiagnosticInfoUnsupported BadIntrin(
|
|
|
|
*MF.getFunction(), "intrinsic not supported on subtarget",
|
|
|
|
DL.getDebugLoc());
|
|
|
|
DAG.getContext()->diagnose(BadIntrin);
|
|
|
|
return DAG.getUNDEF(VT);
|
|
|
|
}
|
2016-01-23 13:32:20 +08:00
|
|
|
case Intrinsic::amdgcn_ldexp:
|
|
|
|
return DAG.getNode(AMDGPUISD::LDEXP, DL, VT,
|
|
|
|
Op.getOperand(1), Op.getOperand(2));
|
2016-05-28 08:19:52 +08:00
|
|
|
|
|
|
|
case Intrinsic::amdgcn_fract:
|
|
|
|
return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
|
|
|
|
|
2016-01-23 13:32:20 +08:00
|
|
|
case Intrinsic::amdgcn_class:
|
|
|
|
return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
|
|
|
|
Op.getOperand(1), Op.getOperand(2));
|
|
|
|
case Intrinsic::amdgcn_div_fmas:
|
|
|
|
return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
|
|
|
|
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
|
|
|
|
Op.getOperand(4));
|
|
|
|
|
|
|
|
case Intrinsic::amdgcn_div_fixup:
|
|
|
|
return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
|
|
|
|
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
|
|
|
|
|
|
|
|
case Intrinsic::amdgcn_trig_preop:
|
|
|
|
return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
|
|
|
|
Op.getOperand(1), Op.getOperand(2));
|
|
|
|
case Intrinsic::amdgcn_div_scale: {
|
|
|
|
// 3rd parameter required to be a constant.
|
|
|
|
const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3));
|
|
|
|
if (!Param)
|
|
|
|
return DAG.getUNDEF(VT);
|
|
|
|
|
|
|
|
// Translate to the operands expected by the machine instruction. The
|
|
|
|
// first parameter must be the same as the first instruction.
|
|
|
|
SDValue Numerator = Op.getOperand(1);
|
|
|
|
SDValue Denominator = Op.getOperand(2);
|
|
|
|
|
|
|
|
// Note this order is opposite of the machine instruction's operations,
|
|
|
|
// which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
|
|
|
|
// intrinsic has the numerator as the first operand to match a normal
|
|
|
|
// division operation.
|
|
|
|
|
|
|
|
SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;
|
|
|
|
|
|
|
|
return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
|
|
|
|
Denominator, Numerator);
|
|
|
|
}
|
2016-07-29 00:42:13 +08:00
|
|
|
case Intrinsic::amdgcn_icmp: {
|
|
|
|
const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3));
|
|
|
|
int CondCode = CD->getSExtValue();
|
|
|
|
|
|
|
|
if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE ||
|
2016-08-22 08:58:04 +08:00
|
|
|
CondCode >= ICmpInst::Predicate::BAD_ICMP_PREDICATE)
|
2016-07-29 00:42:13 +08:00
|
|
|
return DAG.getUNDEF(VT);
|
|
|
|
|
2016-08-22 08:58:04 +08:00
|
|
|
ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
|
2016-07-29 00:42:13 +08:00
|
|
|
ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
|
|
|
|
return DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1),
|
|
|
|
Op.getOperand(2), DAG.getCondCode(CCOpcode));
|
|
|
|
}
|
|
|
|
case Intrinsic::amdgcn_fcmp: {
|
|
|
|
const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3));
|
|
|
|
int CondCode = CD->getSExtValue();
|
|
|
|
|
|
|
|
if (CondCode <= FCmpInst::Predicate::FCMP_FALSE ||
|
2016-08-22 08:58:04 +08:00
|
|
|
CondCode >= FCmpInst::Predicate::FCMP_TRUE)
|
2016-07-29 00:42:13 +08:00
|
|
|
return DAG.getUNDEF(VT);
|
|
|
|
|
2016-08-22 08:58:04 +08:00
|
|
|
FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
|
2016-07-29 00:42:13 +08:00
|
|
|
ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
|
|
|
|
return DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1),
|
|
|
|
Op.getOperand(2), DAG.getCondCode(CCOpcode));
|
|
|
|
}
|
2016-07-27 00:45:45 +08:00
|
|
|
case Intrinsic::amdgcn_fmul_legacy:
|
|
|
|
return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
|
|
|
|
Op.getOperand(1), Op.getOperand(2));
|
2016-07-19 02:35:05 +08:00
|
|
|
case Intrinsic::amdgcn_sffbh:
|
|
|
|
case AMDGPUIntrinsic::AMDGPU_flbit_i32: // Legacy name.
|
|
|
|
return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
|
2014-07-26 14:23:37 +08:00
|
|
|
default:
|
|
|
|
return AMDGPUTargetLowering::LowerOperation(Op, DAG);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-04-12 22:05:04 +08:00
|
|
|
// Custom lowering for chained intrinsics (INTRINSIC_W_CHAIN). Operand 1
// carries the intrinsic ID. Only the atomic increment/decrement intrinsics
// need custom handling; everything else gets default lowering.
SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
                                                 SelectionDAG &DAG) const {
  unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();

  if (IntrID != Intrinsic::amdgcn_atomic_inc &&
      IntrID != Intrinsic::amdgcn_atomic_dec)
    return SDValue();

  // Rebuild the node as a target memory intrinsic, keeping the original
  // chain, pointer and value operands together with the memory operand.
  MemSDNode *Mem = cast<MemSDNode>(Op);
  unsigned NewOpc = IntrID == Intrinsic::amdgcn_atomic_inc
                        ? AMDGPUISD::ATOMIC_INC
                        : AMDGPUISD::ATOMIC_DEC;

  SDValue NewOps[] = {
    Mem->getOperand(0), // Chain
    Mem->getOperand(2), // Ptr
    Mem->getOperand(3)  // Value
  };

  return DAG.getMemIntrinsicNode(NewOpc, SDLoc(Op), Mem->getVTList(), NewOps,
                                 Mem->getMemoryVT(), Mem->getMemOperand());
}
|
|
|
|
|
2014-07-26 14:23:37 +08:00
|
|
|
// Custom lowering for side-effecting target intrinsics (INTRINSIC_VOID).
// Operand 0 is the incoming chain, operand 1 the intrinsic ID; the remaining
// operands are intrinsic-specific.
SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
                                              SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  SDLoc DL(Op);
  SDValue Chain = Op.getOperand(0);
  unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();

  switch (IntrinsicID) {
  case AMDGPUIntrinsic::SI_sendmsg: {
    // The message operand (operand 3) is routed through m0 (copyToM0); the
    // resulting glue ties the copy to the SENDMSG node so nothing can
    // clobber m0 in between.
    Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3));
    SDValue Glue = Chain.getValue(1);
    return DAG.getNode(AMDGPUISD::SENDMSG, DL, MVT::Other, Chain,
                       Op.getOperand(2), Glue);
  }
  case AMDGPUIntrinsic::SI_tbuffer_store: {
    // Forward all intrinsic operands (2..14) verbatim to a
    // TBUFFER_STORE_FORMAT memory intrinsic node. Operand 3 is the value
    // being stored and determines the store size.
    SDValue Ops[] = {
      Chain,
      Op.getOperand(2),
      Op.getOperand(3),
      Op.getOperand(4),
      Op.getOperand(5),
      Op.getOperand(6),
      Op.getOperand(7),
      Op.getOperand(8),
      Op.getOperand(9),
      Op.getOperand(10),
      Op.getOperand(11),
      Op.getOperand(12),
      Op.getOperand(13),
      Op.getOperand(14)
    };

    EVT VT = Op.getOperand(3).getValueType();

    // Synthesize a store memory operand (unknown pointer, align 4); the
    // actual address is computed by the instruction from its operands.
    MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOStore,
      VT.getStoreSize(), 4);
    return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL,
                                   Op->getVTList(), Ops, VT, MMO);
  }
  case AMDGPUIntrinsic::AMDGPU_kill: {
    SDValue Src = Op.getOperand(2);
    if (const ConstantFPSDNode *K = dyn_cast<ConstantFPSDNode>(Src)) {
      // A known non-negative constant can never trigger the kill, so the
      // intrinsic folds away entirely.
      if (!K->isNegative())
        return Chain;

      // A known negative constant always kills: emit KILL with -1.0 bits.
      SDValue NegOne = DAG.getTargetConstant(FloatToBits(-1.0f), DL, MVT::i32);
      return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, NegOne);
    }

    // Non-constant source: pass the raw f32 bits through to the KILL node.
    SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Src);
    return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, Cast);
  }
  default:
    return SDValue();
  }
}
|
|
|
|
|
2013-11-14 07:36:50 +08:00
|
|
|
// Custom lowering for loads. Widens sub-32-bit non-extending loads, expands
// unaligned accesses, and splits/scalarizes vector loads according to the
// address space and the subtarget's private element size.
SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  ISD::LoadExtType ExtType = Load->getExtensionType();
  EVT MemVT = Load->getMemoryVT();

  if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
    // FIXME: Copied from PPC
    // First, load into 32 bits, then truncate back to the original width.

    SDValue Chain = Load->getChain();
    SDValue BasePtr = Load->getBasePtr();
    MachineMemOperand *MMO = Load->getMemOperand();

    // i1 loads are widened through an i8 memory access; everything else
    // sub-32-bit goes through i16.
    EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;

    SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
                                   BasePtr, RealMemVT, MMO);

    // Return both the truncated value and the new chain.
    SDValue Ops[] = {
      DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
      NewLD.getValue(1)
    };

    return DAG.getMergeValues(Ops, DL);
  }

  if (!MemVT.isVector())
    return SDValue();

  assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
         "Custom lowering for non-i32 vectors hasn't been implemented.");

  unsigned AS = Load->getAddressSpace();
  if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
                          AS, Load->getAlignment())) {
    SDValue Ops[2];
    std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
    return DAG.getMergeValues(Ops, DL);
  }

  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  // If there is a possibility that flat instructions access scratch memory
  // then we need to use the same legalization rules we use for private.
  if (AS == AMDGPUAS::FLAT_ADDRESS)
    AS = MFI->hasFlatScratchInit() ?
         AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;

  unsigned NumElements = MemVT.getVectorNumElements();
  switch (AS) {
  case AMDGPUAS::CONSTANT_ADDRESS:
    if (isMemOpUniform(Load))
      return SDValue();
    // Non-uniform loads will be selected to MUBUF instructions, so they
    // have the same legalization requirements as global and private
    // loads.
    //
    LLVM_FALLTHROUGH;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::FLAT_ADDRESS:
    if (NumElements > 4)
      return SplitVectorLoad(Op, DAG);
    // v4 loads are supported for private and global memory.
    return SDValue();
  case AMDGPUAS::PRIVATE_ADDRESS: {
    // Depending on the setting of the private_element_size field in the
    // resource descriptor, we can only make private accesses up to a certain
    // size.
    switch (Subtarget->getMaxPrivateElementSize()) {
    case 4:
      return scalarizeVectorLoad(Load, DAG);
    case 8:
      if (NumElements > 2)
        return SplitVectorLoad(Op, DAG);
      return SDValue();
    case 16:
      // Same as global/flat
      if (NumElements > 4)
        return SplitVectorLoad(Op, DAG);
      return SDValue();
    default:
      llvm_unreachable("unsupported private_element_size");
    }
  }
  case AMDGPUAS::LOCAL_ADDRESS: {
    if (NumElements > 2)
      return SplitVectorLoad(Op, DAG);

    if (NumElements == 2)
      return SDValue();

    // If properly aligned, if we split we might be able to use ds_read_b64.
    return SplitVectorLoad(Op, DAG);
  }
  default:
    return SDValue();
  }
}
|
|
|
|
|
2014-02-05 01:18:40 +08:00
|
|
|
// Lower a 64-bit integer select by bitcasting both inputs to v2i32 and
// performing two independent 32-bit selects on the low and high halves.
SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
  if (Op.getValueType() != MVT::i64)
    return SDValue();

  SDLoc DL(Op);
  SDValue Cond = Op.getOperand(0);

  SDValue IdxLo = DAG.getConstant(0, DL, MVT::i32);
  SDValue IdxHi = DAG.getConstant(1, DL, MVT::i32);

  // View both 64-bit inputs as pairs of 32-bit elements.
  SDValue TrueVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
  SDValue FalseVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));

  // Select the low halves under the common condition.
  SDValue TrueLo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, TrueVec, IdxLo);
  SDValue FalseLo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, FalseVec, IdxLo);

  SDValue SelLo = DAG.getSelect(DL, MVT::i32, Cond, TrueLo, FalseLo);

  // And likewise for the high halves.
  SDValue TrueHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, TrueVec, IdxHi);
  SDValue FalseHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, FalseVec, IdxHi);

  SDValue SelHi = DAG.getSelect(DL, MVT::i32, Cond, TrueHi, FalseHi);

  // Reassemble the two selected halves into a single i64 result.
  SDValue Packed = DAG.getBuildVector(MVT::v2i32, DL, {SelLo, SelHi});
  return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Packed);
}
|
|
|
|
|
2014-07-16 07:50:10 +08:00
|
|
|
// Catch division cases where we can use shortcuts with rcp and rsq
|
|
|
|
// instructions.
|
2016-07-20 07:16:53 +08:00
|
|
|
SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
|
|
|
|
SelectionDAG &DAG) const {
|
2014-07-16 04:18:31 +08:00
|
|
|
SDLoc SL(Op);
|
|
|
|
SDValue LHS = Op.getOperand(0);
|
|
|
|
SDValue RHS = Op.getOperand(1);
|
|
|
|
EVT VT = Op.getValueType();
|
2014-07-16 07:50:10 +08:00
|
|
|
bool Unsafe = DAG.getTarget().Options.UnsafeFPMath;
|
2014-07-16 04:18:31 +08:00
|
|
|
|
|
|
|
if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
|
2016-08-03 06:25:04 +08:00
|
|
|
if ((Unsafe || (VT == MVT::f32 && !Subtarget->hasFP32Denormals()))) {
|
|
|
|
|
|
|
|
if (CLHS->isExactlyValue(1.0)) {
|
|
|
|
// v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
|
|
|
|
// the CI documentation has a worst case error of 1 ulp.
|
|
|
|
// OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
|
|
|
|
// use it as long as we aren't trying to use denormals.
|
|
|
|
|
|
|
|
// 1.0 / sqrt(x) -> rsq(x)
|
|
|
|
//
|
|
|
|
// XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
|
|
|
|
// error seems really high at 2^29 ULP.
|
|
|
|
if (RHS.getOpcode() == ISD::FSQRT)
|
|
|
|
return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));
|
|
|
|
|
|
|
|
// 1.0 / x -> rcp(x)
|
|
|
|
return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Same as for 1.0, but expand the sign out of the constant.
|
|
|
|
if (CLHS->isExactlyValue(-1.0)) {
|
|
|
|
// -1.0 / x -> rcp (fneg x)
|
|
|
|
SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
|
|
|
|
return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
|
|
|
|
}
|
2014-07-16 04:18:31 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-06-10 03:17:15 +08:00
|
|
|
const SDNodeFlags *Flags = Op->getFlags();
|
|
|
|
|
|
|
|
if (Unsafe || Flags->hasAllowReciprocal()) {
|
2014-07-16 07:50:10 +08:00
|
|
|
// Turn into multiply by the reciprocal.
|
|
|
|
// x / y -> x * (1.0 / y)
|
2015-09-17 00:31:21 +08:00
|
|
|
SDNodeFlags Flags;
|
|
|
|
Flags.setUnsafeAlgebra(true);
|
2014-07-16 07:50:10 +08:00
|
|
|
SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
|
2015-09-17 00:31:21 +08:00
|
|
|
return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, &Flags);
|
2014-07-16 07:50:10 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
return SDValue();
|
2014-07-16 04:18:31 +08:00
|
|
|
}
|
|
|
|
|
2016-07-20 07:16:53 +08:00
|
|
|
// Faster 2.5 ULP division that does not support denormals.
|
|
|
|
SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
|
2014-07-16 04:18:31 +08:00
|
|
|
SDLoc SL(Op);
|
2016-07-20 07:16:53 +08:00
|
|
|
SDValue LHS = Op.getOperand(1);
|
|
|
|
SDValue RHS = Op.getOperand(2);
|
2014-07-16 04:18:31 +08:00
|
|
|
|
2016-07-20 07:16:53 +08:00
|
|
|
SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
|
2014-07-16 04:18:31 +08:00
|
|
|
|
2016-07-20 07:16:53 +08:00
|
|
|
const APFloat K0Val(BitsToFloat(0x6f800000));
|
|
|
|
const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
|
2014-07-16 04:18:31 +08:00
|
|
|
|
2016-07-20 07:16:53 +08:00
|
|
|
const APFloat K1Val(BitsToFloat(0x2f800000));
|
|
|
|
const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
|
2014-07-16 04:18:31 +08:00
|
|
|
|
2016-07-20 07:16:53 +08:00
|
|
|
const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
|
2014-07-16 04:18:31 +08:00
|
|
|
|
2016-07-20 07:16:53 +08:00
|
|
|
EVT SetCCVT =
|
|
|
|
getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
|
2014-07-16 04:18:31 +08:00
|
|
|
|
2016-07-20 07:16:53 +08:00
|
|
|
SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
|
2014-07-16 04:18:31 +08:00
|
|
|
|
2016-07-20 07:16:53 +08:00
|
|
|
SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
|
2014-07-16 04:18:31 +08:00
|
|
|
|
2016-07-20 07:16:53 +08:00
|
|
|
// TODO: Should this propagate fast-math-flags?
|
|
|
|
r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
|
2015-09-17 00:31:21 +08:00
|
|
|
|
2016-07-20 07:16:53 +08:00
|
|
|
// rcp does not support denormals.
|
|
|
|
SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
|
2014-07-16 04:18:31 +08:00
|
|
|
|
2016-07-20 07:16:53 +08:00
|
|
|
SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
|
2014-07-16 04:18:31 +08:00
|
|
|
|
2016-07-20 07:16:53 +08:00
|
|
|
return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
|
|
|
|
}
|
2014-07-16 04:18:31 +08:00
|
|
|
|
2016-07-20 07:16:53 +08:00
|
|
|
// Full-precision f32 division lowering using the div_scale / div_fmas /
// div_fixup instruction sequence with FMA-based reciprocal refinement.
SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
  // Try the cheap rcp/rsq shortcuts first (only fire under unsafe math or
  // when denormals are disabled).
  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
    return FastLowered;

  SDLoc SL(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);

  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);

  // DIV_SCALE produces the scaled operand plus an i1 flag that DIV_FMAS
  // consumes later.
  SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);

  SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, RHS, RHS, LHS);
  SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, LHS, RHS, LHS);

  // Denominator is scaled to not be denormal, so using rcp is ok.
  SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled);

  SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled);

  // Refine the reciprocal estimate and the quotient with a chain of fused
  // multiply-adds (Newton-Raphson-style error correction); the statement
  // order here mirrors the required instruction sequence.
  SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, ApproxRcp, One);
  SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp, ApproxRcp);

  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1);

  SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, Mul, NumeratorScaled);
  SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul);
  SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3, NumeratorScaled);

  // DIV_FMAS applies the scale flag produced by the numerator DIV_SCALE;
  // DIV_FIXUP handles the special cases (infinities, NaNs, zero divisors).
  SDValue Scale = NumeratorScaled.getValue(1);
  SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32, Fma4, Fma1, Fma3, Scale);

  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS);
}
|
|
|
|
|
|
|
|
// Full-precision f64 division lowering: div_scale / FMA refinement /
// div_fmas / div_fixup, with an extra SI-only workaround for the unusable
// div_scale condition output.
SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
  if (DAG.getTarget().Options.UnsafeFPMath)
    return lowerFastUnsafeFDIV(Op, DAG);

  SDLoc SL(Op);
  SDValue X = Op.getOperand(0);  // numerator
  SDValue Y = Op.getOperand(1);  // denominator

  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);

  // DIV_SCALE yields the scaled value plus an i1 flag for DIV_FMAS.
  SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);

  SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);

  SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);

  SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);

  // FMA chain refining the reciprocal estimate and the quotient; the
  // statement order mirrors the required instruction sequence.
  SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);

  SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);

  SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);

  SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);

  SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);

  SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
                             NegDivScale0, Mul, DivScale1);

  SDValue Scale;

  if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) {
    // Workaround a hardware bug on SI where the condition output from div_scale
    // is not usable.

    const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);

    // Figure out the scale to use for div_fmas by recomputing the predicate:
    // compare the high 32 bits of each operand against the high 32 bits of
    // its div_scale result to detect whether scaling actually occurred.
    SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
    SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
    SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
    SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);

    SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
    SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);

    SDValue Scale0Hi
      = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
    SDValue Scale1Hi
      = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);

    SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
    SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
    Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
  } else {
    // On CI+ the div_scale condition output works; use it directly.
    Scale = DivScale1.getValue(1);
  }

  SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
                             Fma4, Fma3, Mul, Scale);

  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
}
|
|
|
|
|
|
|
|
// Dispatch fdiv lowering by result type; only f32 and f64 divisions are
// custom-lowered, so anything else reaching this point is a lowering bug.
SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
  const EVT ResultVT = Op.getValueType();

  if (ResultVT == MVT::f32)
    return LowerFDIV32(Op, DAG);
  if (ResultVT == MVT::f64)
    return LowerFDIV64(Op, DAG);

  llvm_unreachable("Unexpected type for fdiv");
}
|
|
|
|
|
2013-11-14 07:36:50 +08:00
|
|
|
// Custom lowering for stores. i1 stores become i32 truncating stores;
// unaligned accesses are expanded; vector stores are split/scalarized
// according to the address space and private element size.
SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  StoreSDNode *Store = cast<StoreSDNode>(Op);
  EVT VT = Store->getMemoryVT();

  if (VT == MVT::i1) {
    // Widen the value to i32, then store only the low bit.
    return DAG.getTruncStore(Store->getChain(), DL,
                             DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
                             Store->getBasePtr(), MVT::i1, Store->getMemOperand());
  }

  assert(VT.isVector() &&
         Store->getValue().getValueType().getScalarType() == MVT::i32);

  unsigned AS = Store->getAddressSpace();
  if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
                          AS, Store->getAlignment())) {
    return expandUnalignedStore(Store, DAG);
  }

  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  // If there is a possibility that flat instructions access scratch memory
  // then we need to use the same legalization rules we use for private.
  if (AS == AMDGPUAS::FLAT_ADDRESS)
    AS = MFI->hasFlatScratchInit() ?
         AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;

  unsigned NumElements = VT.getVectorNumElements();
  switch (AS) {
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::FLAT_ADDRESS:
    // Up to 4 elements are handled directly; larger vectors are split.
    if (NumElements > 4)
      return SplitVectorStore(Op, DAG);
    return SDValue();
  case AMDGPUAS::PRIVATE_ADDRESS: {
    // The private_element_size resource descriptor field limits the maximum
    // access size.
    switch (Subtarget->getMaxPrivateElementSize()) {
    case 4:
      return scalarizeVectorStore(Store, DAG);
    case 8:
      if (NumElements > 2)
        return SplitVectorStore(Op, DAG);
      return SDValue();
    case 16:
      if (NumElements > 4)
        return SplitVectorStore(Op, DAG);
      return SDValue();
    default:
      llvm_unreachable("unsupported private_element_size");
    }
  }
  case AMDGPUAS::LOCAL_ADDRESS: {
    if (NumElements > 2)
      return SplitVectorStore(Op, DAG);

    if (NumElements == 2)
      return Op;

    // If properly aligned, if we split we might be able to use ds_write_b64.
    return SplitVectorStore(Op, DAG);
  }
  default:
    llvm_unreachable("unhandled address space");
  }
}
|
|
|
|
|
2014-07-20 02:44:39 +08:00
|
|
|
// Lower FSIN/FCOS to the hardware sin/cos nodes: scale the radian argument
// by 1/(2*pi) and take the fractional part before feeding the hardware op.
SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue Arg = Op.getOperand(0);

  // TODO: Should this propagate fast-math-flags?
  SDValue OneOverTwoPi = DAG.getConstantFP(0.5 / M_PI, DL, VT);
  SDValue Scaled = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOverTwoPi);
  SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT, Scaled);

  switch (Op.getOpcode()) {
  case ISD::FCOS:
    return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, FractPart);
  case ISD::FSIN:
    return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, FractPart);
  default:
    llvm_unreachable("Wrong trig opcode");
  }
}
|
|
|
|
|
AMDGPU: Implement {BUFFER,FLAT}_ATOMIC_CMPSWAP{,_X2}
Summary:
Implement BUFFER_ATOMIC_CMPSWAP{,_X2} instructions on all GCN targets, and FLAT_ATOMIC_CMPSWAP{,_X2} on CI+.
32-bit instruction variants tested manually on Kabini and Bonaire. Tests and parts of code provided by Jan Veselý.
Patch by: Vedran Miletić
Reviewers: arsenm, tstellarAMD, nhaehnle
Subscribers: jvesely, scchan, kanarayan, arsenm
Differential Revision: http://reviews.llvm.org/D17280
llvm-svn: 265170
2016-04-02 02:27:37 +08:00
|
|
|
// Lower atomic compare-and-swap for flat/global address spaces to a target
// memory intrinsic node; other address spaces keep the default lowering.
SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
  AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
  assert(AtomicNode->isCompareAndSwap());
  unsigned AS = AtomicNode->getAddressSpace();

  // No custom lowering required for local address space
  if (!isFlatGlobalAddrSpace(AS))
    return Op;

  // Non-local address space requires custom lowering for atomic compare
  // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
  SDLoc DL(Op);
  SDValue ChainIn = Op.getOperand(0);
  SDValue Addr = Op.getOperand(1);
  SDValue Old = Op.getOperand(2);   // compare (expected) value
  SDValue New = Op.getOperand(3);   // swap (replacement) value
  EVT VT = Op.getValueType();
  MVT SimpleVT = VT.getSimpleVT();
  MVT VecType = MVT::getVectorVT(SimpleVT, 2);

  // Pack the two data operands into one vector: element 0 is the new value,
  // element 1 the compare value.
  SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
  SDValue Ops[] = { ChainIn, Addr, NewOld };

  return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(),
                                 Ops, VT, AtomicNode->getMemOperand());
}
|
|
|
|
|
2012-12-12 05:25:42 +08:00
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
// Custom DAG optimizations
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
2014-06-12 01:50:44 +08:00
|
|
|
// Combine an i32->f32 conversion whose source is provably an unsigned byte
// (top 24 bits known zero) into the dedicated CVT_F32_UBYTE0 node.
SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
                                                     DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  if (VT.getScalarType() != MVT::f32)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;

  SDValue Src = N->getOperand(0);

  // TODO: We could try to match extracting the higher bytes, which would be
  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
  // about in practice.
  if (!DCI.isAfterLegalizeVectorOps() || Src.getValueType() != MVT::i32)
    return SDValue();

  // Only fold when the top 24 bits are known to be zero, i.e. the source is
  // really an unsigned byte.
  if (!DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24)))
    return SDValue();

  SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, SDLoc(N), VT, Src);
  DCI.AddToWorklist(Cvt.getNode());
  return Cvt;
}
|
|
|
|
|
2015-03-12 02:43:21 +08:00
|
|
|
/// \brief Return true if the given offset Size in bytes can be folded into
|
|
|
|
/// the immediate offsets of a memory instruction for the given address space.
|
|
|
|
static bool canFoldOffset(unsigned OffsetSize, unsigned AS,
|
2016-06-24 14:30:11 +08:00
|
|
|
const SISubtarget &STI) {
|
2015-03-12 02:43:21 +08:00
|
|
|
switch (AS) {
|
|
|
|
case AMDGPUAS::GLOBAL_ADDRESS: {
|
|
|
|
// MUBUF instructions a 12-bit offset in bytes.
|
|
|
|
return isUInt<12>(OffsetSize);
|
|
|
|
}
|
|
|
|
case AMDGPUAS::CONSTANT_ADDRESS: {
|
|
|
|
// SMRD instructions have an 8-bit offset in dwords on SI and
|
|
|
|
// a 20-bit offset in bytes on VI.
|
2016-06-24 14:30:11 +08:00
|
|
|
if (STI.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
|
2015-03-12 02:43:21 +08:00
|
|
|
return isUInt<20>(OffsetSize);
|
|
|
|
else
|
|
|
|
return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4);
|
|
|
|
}
|
|
|
|
case AMDGPUAS::LOCAL_ADDRESS:
|
|
|
|
case AMDGPUAS::REGION_ADDRESS: {
|
|
|
|
// The single offset versions have a 16-bit offset in bytes.
|
|
|
|
return isUInt<16>(OffsetSize);
|
|
|
|
}
|
|
|
|
case AMDGPUAS::PRIVATE_ADDRESS:
|
|
|
|
// Indirect register addressing does not use any offsets.
|
|
|
|
default:
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-08-16 01:49:05 +08:00
|
|
|
// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)

// This is a variant of
// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
//
// The normal DAG combiner will do this, but only if the add has one use since
// that would increase the number of instructions.
//
// This prevents us from seeing a constant offset that can be folded into a
// memory instruction's addressing mode. If we know the resulting add offset of
// a pointer can be folded into an addressing offset, we can replace the pointer
// operand with the add of new constant offset. This eliminates one of the uses,
// and may allow the remaining use to also be simplified.
//
SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
                                               unsigned AddrSpace,
                                               DAGCombinerInfo &DCI) const {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // Only handle the (shl (add x, c1), c2) shape.
  if (N0.getOpcode() != ISD::ADD)
    return SDValue();

  // The shift amount must be a constant (c2).
  const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
  if (!CN1)
    return SDValue();

  // The add's second operand must be a constant (c1).
  const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
  if (!CAdd)
    return SDValue();

  // If the resulting offset is too large, we can't fold it into the addressing
  // mode offset.
  APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
  if (!canFoldOffset(Offset.getZExtValue(), AddrSpace, *getSubtarget()))
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);
  EVT VT = N->getValueType(0);

  // Rebuild as (add (shl x, c2), (c1 << c2)) so the constant part is exposed
  // as a foldable addressing-mode offset.
  SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
  SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32);

  return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset);
}
|
|
|
|
|
2016-09-14 23:19:03 +08:00
|
|
|
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
|
|
|
|
return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
|
|
|
|
(Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
|
|
|
|
(Opc == ISD::XOR && Val == 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
|
|
|
|
// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
|
|
|
|
// integer combine opportunities since most 64-bit operations are decomposed
|
|
|
|
// this way. TODO: We won't want this for SALU especially if it is an inline
|
|
|
|
// immediate.
|
|
|
|
SDValue SITargetLowering::splitBinaryBitConstantOp(
|
|
|
|
DAGCombinerInfo &DCI,
|
|
|
|
const SDLoc &SL,
|
|
|
|
unsigned Opc, SDValue LHS,
|
|
|
|
const ConstantSDNode *CRHS) const {
|
|
|
|
uint64_t Val = CRHS->getZExtValue();
|
|
|
|
uint32_t ValLo = Lo_32(Val);
|
|
|
|
uint32_t ValHi = Hi_32(Val);
|
|
|
|
const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
|
|
|
|
|
|
|
|
if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
|
|
|
|
bitOpWithConstantIsReducible(Opc, ValHi)) ||
|
|
|
|
(CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
|
|
|
|
// If we need to materialize a 64-bit immediate, it will be split up later
|
|
|
|
// anyway. Avoid creating the harder to understand 64-bit immediate
|
|
|
|
// materialization.
|
|
|
|
return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
|
|
|
|
}
|
|
|
|
|
|
|
|
return SDValue();
|
|
|
|
}
|
|
|
|
|
2015-01-07 07:00:46 +08:00
|
|
|
// Combine ISD::AND nodes: split 64-bit and-with-constant into 32-bit halves,
// and fold the ordered/not-infinity compare pair into a single fp_class.
SDValue SITargetLowering::performAndCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  if (DCI.isBeforeLegalize())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);


  // 64-bit and with a constant: try splitting into two 32-bit ops.
  if (VT == MVT::i64) {
    const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
    if (CRHS) {
      if (SDValue Split
          = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
        return Split;
    }
  }

  // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
  // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
  if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
    ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
    ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();

    // Both compares must test the same value; the RHS compare must look at
    // (fabs x).
    SDValue X = LHS.getOperand(0);
    SDValue Y = RHS.getOperand(0);
    if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X)
      return SDValue();

    // (fcmp ord x, x) is the "is not NaN" test.
    if (LCC == ISD::SETO) {
      if (X != LHS.getOperand(1))
        return SDValue();

      // (fcmp une (fabs x), +inf) is the "is not infinity" test.
      if (RCC == ISD::SETUNE) {
        const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
        if (!C1 || !C1->isInfinity() || C1->isNegative())
          return SDValue();

        // Everything except the NaN and infinity classes.
        const uint32_t Mask = SIInstrFlags::N_NORMAL |
                              SIInstrFlags::N_SUBNORMAL |
                              SIInstrFlags::N_ZERO |
                              SIInstrFlags::P_ZERO |
                              SIInstrFlags::P_SUBNORMAL |
                              SIInstrFlags::P_NORMAL;

        static_assert(((~(SIInstrFlags::S_NAN |
                          SIInstrFlags::Q_NAN |
                          SIInstrFlags::N_INFINITY |
                          SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
                      "mask not equal");

        SDLoc DL(N);
        return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
                           X, DAG.getConstant(Mask, DL, MVT::i32));
      }
    }
  }

  return SDValue();
}
|
|
|
|
|
2015-01-07 07:00:39 +08:00
|
|
|
// Combine ISD::OR nodes: merge fp_class masks for i1, and for i64 fold an or
// with a zero-extended i32 into the low half, or split an or-with-constant.
SDValue SITargetLowering::performOrCombine(SDNode *N,
                                           DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  EVT VT = N->getValueType(0);
  if (VT == MVT::i1) {
    // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
    if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
        RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
      SDValue Src = LHS.getOperand(0);
      // Both classifications must test the same source value.
      if (Src != RHS.getOperand(0))
        return SDValue();

      const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
      const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
      if (!CLHS || !CRHS)
        return SDValue();

      // Only 10 bits are used.
      static const uint32_t MaxMask = 0x3ff;

      uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
      SDLoc DL(N);
      return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
                         Src, DAG.getConstant(NewMask, DL, MVT::i32));
    }

    return SDValue();
  }

  if (VT != MVT::i64)
    return SDValue();

  // TODO: This could be a generic combine with a predicate for extracting the
  // high half of an integer being free.

  // (or i64:x, (zero_extend i32:y)) ->
  //   i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
  // Canonicalize the zero_extend (if any) to the RHS.
  if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
      RHS.getOpcode() != ISD::ZERO_EXTEND)
    std::swap(LHS, RHS);

  if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
    SDValue ExtSrc = RHS.getOperand(0);
    EVT SrcVT = ExtSrc.getValueType();
    if (SrcVT == MVT::i32) {
      SDLoc SL(N);
      // The extended value only touches the low 32 bits, so or it into the
      // low half and pass the high half through unchanged.
      SDValue LowLHS, HiBits;
      std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
      SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);

      DCI.AddToWorklist(LowOr.getNode());
      DCI.AddToWorklist(HiBits.getNode());

      SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
                                LowOr, HiBits);
      return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
    }
  }

  // 64-bit or with a constant: try splitting into two 32-bit ops. Note this
  // intentionally re-reads the original operand, not the possibly-swapped RHS.
  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (CRHS) {
    if (SDValue Split
          = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR, LHS, CRHS))
      return Split;
  }

  return SDValue();
}
|
|
|
|
|
|
|
|
// Combine ISD::XOR: for i64 xor with a constant, try splitting the operation
// into two 32-bit halves so trivial halves fold away.
SDValue SITargetLowering::performXorCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  // Only 64-bit xor is worth decomposing here.
  if (N->getValueType(0) != MVT::i64)
    return SDValue();

  if (const ConstantSDNode *RHSImm =
          dyn_cast<ConstantSDNode>(N->getOperand(1))) {
    if (SDValue Split = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR,
                                                 N->getOperand(0), RHSImm))
      return Split;
  }

  return SDValue();
}
|
|
|
|
|
|
|
|
// Combine AMDGPUISD::FP_CLASS nodes with degenerate operands.
SDValue SITargetLowering::performClassCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  // fp_class x, 0 -> false. An empty class mask can never match anything.
  // Checked before the undef operand so the constant-mask fold wins.
  const ConstantSDNode *ClassMask = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (ClassMask && ClassMask->isNullValue())
    return DAG.getConstant(0, SDLoc(N), MVT::i1);

  // fp_class undef, mask -> undef.
  if (N->getOperand(0).isUndef())
    return DAG.getUNDEF(MVT::i1);

  return SDValue();
}
|
|
|
|
|
2016-04-14 09:42:16 +08:00
|
|
|
// Constant fold canonicalize.
//
// Folds fcanonicalize of a floating-point constant: flushes denormals to zero
// when the subtarget does not enable denormal support, and replaces any NaN
// with the canonical quiet NaN bit pattern.
SDValue SITargetLowering::performFCanonicalizeCombine(
  SDNode *N,
  DAGCombinerInfo &DCI) const {
  // Only constants are folded here.
  ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
  if (!CFP)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  const APFloat &C = CFP->getValueAPF();

  // Flush denormals to 0 if not enabled.
  if (C.isDenormal()) {
    EVT VT = N->getValueType(0);
    if (VT == MVT::f32 && !Subtarget->hasFP32Denormals())
      return DAG.getConstantFP(0.0, SDLoc(N), VT);

    if (VT == MVT::f64 && !Subtarget->hasFP64Denormals())
      return DAG.getConstantFP(0.0, SDLoc(N), VT);
  }

  if (C.isNaN()) {
    EVT VT = N->getValueType(0);
    APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
    if (C.isSignaling()) {
      // Quiet a signaling NaN.
      return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT);
    }

    // Make sure it is the canonical NaN bitpattern.
    //
    // TODO: Can we use -1 as the canonical NaN value since it's an inline
    // immediate?
    if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
      return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT);
  }

  // Constant is already canonical; the canonicalize is a no-op.
  return SDValue(CFP, 0);
}
|
|
|
|
|
2014-11-15 04:08:52 +08:00
|
|
|
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
|
|
|
|
switch (Opc) {
|
|
|
|
case ISD::FMAXNUM:
|
|
|
|
return AMDGPUISD::FMAX3;
|
2015-06-09 08:52:37 +08:00
|
|
|
case ISD::SMAX:
|
2014-11-15 04:08:52 +08:00
|
|
|
return AMDGPUISD::SMAX3;
|
2015-06-09 08:52:37 +08:00
|
|
|
case ISD::UMAX:
|
2014-11-15 04:08:52 +08:00
|
|
|
return AMDGPUISD::UMAX3;
|
|
|
|
case ISD::FMINNUM:
|
|
|
|
return AMDGPUISD::FMIN3;
|
2015-06-09 08:52:37 +08:00
|
|
|
case ISD::SMIN:
|
2014-11-15 04:08:52 +08:00
|
|
|
return AMDGPUISD::SMIN3;
|
2015-06-09 08:52:37 +08:00
|
|
|
case ISD::UMIN:
|
2014-11-15 04:08:52 +08:00
|
|
|
return AMDGPUISD::UMIN3;
|
|
|
|
default:
|
|
|
|
llvm_unreachable("Not a min/max opcode");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-06-12 23:39:02 +08:00
|
|
|
// Fold min(max(x, K0), K1) with constant K0 < K1 into a single med3
// instruction. \p Op0 is the inner max node, \p Op1 the outer min's constant.
// For i16 the operation is widened to i32 and truncated back.
static SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
                                        SDValue Op0, SDValue Op1, bool Signed) {
  // Outer bound of the min must be a constant.
  ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1);
  if (!K1)
    return SDValue();

  // Inner bound of the max must be a constant.
  ConstantSDNode *K0 = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
  if (!K0)
    return SDValue();

  // med3 only matches when K0 < K1 (in the appropriate signedness).
  if (Signed) {
    if (K0->getAPIntValue().sge(K1->getAPIntValue()))
      return SDValue();
  } else {
    if (K0->getAPIntValue().uge(K1->getAPIntValue()))
      return SDValue();
  }

  EVT VT = K0->getValueType(0);

  MVT NVT = MVT::i32;
  unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;

  // Widen all three inputs to i32.
  // NOTE(review): these extension nodes are only consumed on the i16 path
  // below; for other types they appear to be left as dead nodes for the DAG
  // to clean up — confirm this is intended.
  SDValue Tmp1, Tmp2, Tmp3;
  Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
  Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
  Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);

  if (VT == MVT::i16) {
    // No 16-bit med3: do it in 32 bits and truncate the result.
    Tmp1 = DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, NVT,
                       Tmp1, Tmp2, Tmp3);

    return DAG.getNode(ISD::TRUNCATE, SL, VT, Tmp1);
  } else
    return DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, VT,
                       Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
}
|
|
|
|
|
|
|
|
// Return true if \p Op is known to never be a signaling NaN. When
// floating-point exceptions are disabled, signaling NaNs are irrelevant, so
// everything qualifies; otherwise fall back to proving the value is never any
// kind of NaN.
static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
  return !DAG.getTargetLoweringInfo().hasFloatingPointExceptions() ||
         DAG.isKnownNeverNaN(Op);
}
|
|
|
|
|
2016-06-12 23:39:02 +08:00
|
|
|
// Fold fminnum(fmaxnum(x, K0), K1) with constant K0 <= K1 into fmed3.
// \p Op0 is the inner fmaxnum node, \p Op1 the outer fminnum's constant.
static SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
                                       SDValue Op0, SDValue Op1) {
  // Outer bound must be a floating-point constant.
  ConstantFPSDNode *K1 = dyn_cast<ConstantFPSDNode>(Op1);
  if (!K1)
    return SDValue();

  // Inner bound must be a floating-point constant.
  ConstantFPSDNode *K0 = dyn_cast<ConstantFPSDNode>(Op0.getOperand(1));
  if (!K0)
    return SDValue();

  // Ordered >= (although NaN inputs should have folded away by now).
  APFloat::cmpResult Cmp = K0->getValueAPF().compare(K1->getValueAPF());
  if (Cmp == APFloat::cmpGreaterThan)
    return SDValue();

  // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
  // signaling NaN gives a quiet NaN. The quiet NaN input to the min would then
  // give the other result, which is different from med3 with a NaN input.
  SDValue Var = Op0.getOperand(0);
  if (!isKnownNeverSNan(DAG, Var))
    return SDValue();

  return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
                     Var, SDValue(K0, 0), SDValue(K1, 0));
}
|
|
|
|
|
|
|
|
// Combine min/max nodes: fuse nested min/min or max/max pairs into the
// three-operand min3/max3 forms, and match clamp-style min(max(x,K0),K1)
// patterns into med3.
SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  unsigned Opc = N->getOpcode();
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // Only do this if the inner op has one use since this will just increases
  // register pressure for no benefit.

  // The legacy min/max opcodes have no 3-operand forms, so skip them here.
  if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY) {
    // max(max(a, b), c) -> max3(a, b, c)
    // min(min(a, b), c) -> min3(a, b, c)
    if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
      SDLoc DL(N);
      return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
                         DL,
                         N->getValueType(0),
                         Op0.getOperand(0),
                         Op0.getOperand(1),
                         Op1);
    }

    // Try commuted.
    // max(a, max(b, c)) -> max3(a, b, c)
    // min(a, min(b, c)) -> min3(a, b, c)
    if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
      SDLoc DL(N);
      return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
                         DL,
                         N->getValueType(0),
                         Op0,
                         Op1.getOperand(0),
                         Op1.getOperand(1));
    }
  }

  // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
  if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
    if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, true))
      return Med3;
  }

  if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
    if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, false))
      return Med3;
  }

  // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
  if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
       (Opc == AMDGPUISD::FMIN_LEGACY &&
        Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
      N->getValueType(0) == MVT::f32 && Op0.hasOneUse()) {
    if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
      return Res;
  }

  return SDValue();
}
|
|
|
|
|
2015-01-07 07:00:41 +08:00
|
|
|
// Combine ISD::SETCC: recognize the isinf pattern and lower it to fp_class.
SDValue SITargetLowering::performSetCCCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);

  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  EVT VT = LHS.getValueType();

  // Only handle f32, f64, and f16 when the subtarget has 16-bit instructions.
  //
  // The previous guard, (Subtarget->has16BitInsts() && VT != MVT::f16), only
  // rejected unsupported types when 16-bit instructions were available; on
  // subtargets without them it let every non-f32/f64 type through. Reject
  // f16 unless the subtarget actually supports it.
  if (VT != MVT::f32 && VT != MVT::f64 &&
      (VT != MVT::f16 || !Subtarget->has16BitInsts()))
    return SDValue();

  // Match isinf pattern
  // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
  if (CC == ISD::SETOEQ && LHS.getOpcode() == ISD::FABS) {
    const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
    if (!CRHS)
      return SDValue();

    const APFloat &APF = CRHS->getValueAPF();
    if (APF.isInfinity() && !APF.isNegative()) {
      unsigned Mask = SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
      return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
                         DAG.getConstant(Mask, SL, MVT::i32));
    }
  }

  return SDValue();
}
|
|
|
|
|
2012-12-12 05:25:42 +08:00
|
|
|
// Top-level DAG-combine dispatcher for the SI target. Cases that `break` fall
// through to the generic AMDGPU combines at the bottom; cases that `return`
// either produced a replacement or deliberately suppress the generic path.
SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  switch (N->getOpcode()) {
  default:
    return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
  case ISD::SETCC:
    return performSetCCCombine(N, DCI);
  case ISD::FMAXNUM:
  case ISD::FMINNUM:
  case ISD::SMAX:
  case ISD::SMIN:
  case ISD::UMAX:
  case ISD::UMIN:
  case AMDGPUISD::FMIN_LEGACY:
  case AMDGPUISD::FMAX_LEGACY: {
    // min3/max3/med3 matching is only done late, on supported types, and when
    // optimizing.
    if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
        N->getValueType(0) != MVT::f64 &&
        getTargetMachine().getOptLevel() > CodeGenOpt::None)
      return performMinMaxCombine(N, DCI);
    break;
  }

  case AMDGPUISD::CVT_F32_UBYTE0:
  case AMDGPUISD::CVT_F32_UBYTE1:
  case AMDGPUISD::CVT_F32_UBYTE2:
  case AMDGPUISD::CVT_F32_UBYTE3: {
    // Which byte of the source this conversion reads.
    unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;

    SDValue Src = N->getOperand(0);
    // Look through a zero_extend to find a possible srl feeding the convert.
    SDValue Srl = N->getOperand(0);
    if (Srl.getOpcode() == ISD::ZERO_EXTEND)
      Srl = Srl.getOperand(0);

    // TODO: Handle (or x, (srl y, 8)) pattern when known bits are zero.
    if (Srl.getOpcode() == ISD::SRL) {
      // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
      // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
      // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x

      if (const ConstantSDNode *C =
          dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
        Srl = DAG.getZExtOrTrunc(Srl.getOperand(0), SDLoc(Srl.getOperand(0)),
                                 EVT(MVT::i32));

        // Fold the shift amount into the byte selector when it stays in range
        // and remains byte-aligned.
        unsigned SrcOffset = C->getZExtValue() + 8 * Offset;
        if (SrcOffset < 32 && SrcOffset % 8 == 0) {
          return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + SrcOffset / 8, DL,
                             MVT::f32, Srl);
        }
      }
    }

    // Only one byte of the source is demanded; let the generic demanded-bits
    // machinery simplify the operand.
    APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);

    APInt KnownZero, KnownOne;
    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                          !DCI.isBeforeLegalizeOps());
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    if (TLO.ShrinkDemandedConstant(Src, Demanded) ||
        TLI.SimplifyDemandedBits(Src, Demanded, KnownZero, KnownOne, TLO)) {
      DCI.CommitTargetLoweringOpt(TLO);
    }

    break;
  }
  case ISD::SINT_TO_FP:
  case ISD::UINT_TO_FP: {
    return performUCharToFloatCombine(N, DCI);
  }
  case ISD::FADD: {
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
      break;

    EVT VT = N->getValueType(0);
    if (VT != MVT::f32)
      break;

    // Only do this if we are not trying to support denormals. v_mad_f32 does
    // not support denormals ever.
    if (Subtarget->hasFP32Denormals())
      break;

    SDValue LHS = N->getOperand(0);
    SDValue RHS = N->getOperand(1);

    // These should really be instruction patterns, but writing patterns with
    // source modifiers is a pain.

    // fadd (fadd (a, a), b) -> mad 2.0, a, b
    if (LHS.getOpcode() == ISD::FADD) {
      SDValue A = LHS.getOperand(0);
      if (A == LHS.getOperand(1)) {
        const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32);
        return DAG.getNode(ISD::FMAD, DL, VT, Two, A, RHS);
      }
    }

    // fadd (b, fadd (a, a)) -> mad 2.0, a, b
    if (RHS.getOpcode() == ISD::FADD) {
      SDValue A = RHS.getOperand(0);
      if (A == RHS.getOperand(1)) {
        const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32);
        return DAG.getNode(ISD::FMAD, DL, VT, Two, A, LHS);
      }
    }

    return SDValue();
  }
  case ISD::FSUB: {
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
      break;

    EVT VT = N->getValueType(0);

    // Try to get the fneg to fold into the source modifier. This undoes generic
    // DAG combines and folds them into the mad.
    //
    // Only do this if we are not trying to support denormals. v_mad_f32 does
    // not support denormals ever.
    if (VT == MVT::f32 && !Subtarget->hasFP32Denormals()) {
      SDValue LHS = N->getOperand(0);
      SDValue RHS = N->getOperand(1);
      if (LHS.getOpcode() == ISD::FADD) {
        // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)

        SDValue A = LHS.getOperand(0);
        if (A == LHS.getOperand(1)) {
          const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32);
          SDValue NegRHS = DAG.getNode(ISD::FNEG, DL, VT, RHS);

          return DAG.getNode(ISD::FMAD, DL, VT, Two, A, NegRHS);
        }
      }

      if (RHS.getOpcode() == ISD::FADD) {
        // (fsub c, (fadd a, a)) -> mad -2.0, a, c

        SDValue A = RHS.getOperand(0);
        if (A == RHS.getOperand(1)) {
          const SDValue NegTwo = DAG.getConstantFP(-2.0, DL, MVT::f32);
          return DAG.getNode(ISD::FMAD, DL, VT, NegTwo, A, LHS);
        }
      }

      return SDValue();
    }

    break;
  }
  case ISD::LOAD:
  case ISD::STORE:
  case ISD::ATOMIC_LOAD:
  case ISD::ATOMIC_STORE:
  case ISD::ATOMIC_CMP_SWAP:
  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
  case ISD::ATOMIC_SWAP:
  case ISD::ATOMIC_LOAD_ADD:
  case ISD::ATOMIC_LOAD_SUB:
  case ISD::ATOMIC_LOAD_AND:
  case ISD::ATOMIC_LOAD_OR:
  case ISD::ATOMIC_LOAD_XOR:
  case ISD::ATOMIC_LOAD_NAND:
  case ISD::ATOMIC_LOAD_MIN:
  case ISD::ATOMIC_LOAD_MAX:
  case ISD::ATOMIC_LOAD_UMIN:
  case ISD::ATOMIC_LOAD_UMAX:
  case AMDGPUISD::ATOMIC_INC:
  case AMDGPUISD::ATOMIC_DEC: { // TODO: Target mem intrinsics.
    if (DCI.isBeforeLegalize())
      break;

    MemSDNode *MemNode = cast<MemSDNode>(N);
    SDValue Ptr = MemNode->getBasePtr();

    // TODO: We could also do this for multiplies.
    // Try to fold a (shl (add x, c1), c2) pointer into the instruction's
    // immediate offset (private addressing takes no offsets).
    unsigned AS = MemNode->getAddressSpace();
    if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUAS::PRIVATE_ADDRESS) {
      SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI);
      if (NewPtr) {
        SmallVector<SDValue, 8> NewOps(MemNode->op_begin(), MemNode->op_end());

        // Stores carry the pointer at operand 2, everything else at 1.
        NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr;
        return SDValue(DAG.UpdateNodeOperands(MemNode, NewOps), 0);
      }
    }
    break;
  }
  case ISD::AND:
    return performAndCombine(N, DCI);
  case ISD::OR:
    return performOrCombine(N, DCI);
  case ISD::XOR:
    return performXorCombine(N, DCI);
  case AMDGPUISD::FP_CLASS:
    return performClassCombine(N, DCI);
  case ISD::FCANONICALIZE:
    return performFCanonicalizeCombine(N, DCI);
  case AMDGPUISD::FRACT:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RSQ_LEGACY:
  case AMDGPUISD::RSQ_CLAMP:
  case AMDGPUISD::LDEXP: {
    // These unary ops propagate undef.
    SDValue Src = N->getOperand(0);
    if (Src.isUndef())
      return Src;
    break;
  }
  }
  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}
|
2013-02-27 01:52:16 +08:00
|
|
|
|
2013-04-10 16:39:08 +08:00
|
|
|
/// \brief Helper function for adjustWritemask
|
2013-05-23 23:43:05 +08:00
|
|
|
static unsigned SubIdx2Lane(unsigned Idx) {
|
2013-04-10 16:39:08 +08:00
|
|
|
switch (Idx) {
|
|
|
|
default: return 0;
|
|
|
|
case AMDGPU::sub0: return 0;
|
|
|
|
case AMDGPU::sub1: return 1;
|
|
|
|
case AMDGPU::sub2: return 2;
|
|
|
|
case AMDGPU::sub3: return 3;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// \brief Adjust the writemask of MIMG instructions
///
/// Shrinks the instruction's dmask to cover only the result components that
/// actually have users (found as EXTRACT_SUBREG uses), then rewrites those
/// users to the repacked subregister indices. Bails out (changing nothing) if
/// any use is not understood or a component has multiple users.
void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
                                       SelectionDAG &DAG) const {
  SDNode *Users[4] = { };
  unsigned Lane = 0;
  // The dmask operand position depends on the operand/value layout of this
  // MIMG node variant.
  unsigned DmaskIdx = (Node->getNumOperands() - Node->getNumValues() == 9) ? 2 : 3;
  unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
  unsigned NewDmask = 0;

  // Try to figure out the used register components
  for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
       I != E; ++I) {

    // Abort if we can't understand the usage
    if (!I->isMachineOpcode() ||
        I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
      return;

    // Lane means which subreg of %VGPRa_VGPRb_VGPRc_VGPRd is used.
    // Note that subregs are packed, i.e. Lane==0 is the first bit set
    // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
    // set, etc.
    Lane = SubIdx2Lane(I->getConstantOperandVal(1));

    // Set which texture component corresponds to the lane.
    // Walk the set bits of OldDmask: the (Lane+1)-th set bit is the
    // component this lane reads.
    unsigned Comp;
    for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {
      assert(Dmask);
      Comp = countTrailingZeros(Dmask);
      Dmask &= ~(1 << Comp);
    }

    // Abort if we have more than one user per component
    if (Users[Lane])
      return;

    Users[Lane] = *I;
    NewDmask |= 1 << Comp;
  }

  // Abort if there's no change
  if (NewDmask == OldDmask)
    return;

  // Adjust the writemask in the node
  std::vector<SDValue> Ops;
  Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
  Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
  Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
  Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops);

  // If we only got one lane, replace it with a copy
  // (if NewDmask has only one bit set...)
  if (NewDmask && (NewDmask & (NewDmask-1)) == 0) {
    SDValue RC = DAG.getTargetConstant(AMDGPU::VGPR_32RegClassID, SDLoc(),
                                       MVT::i32);
    SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
                                      SDLoc(), Users[Lane]->getValueType(0),
                                      SDValue(Node, 0), RC);
    DAG.ReplaceAllUsesWith(Users[Lane], Copy);
    return;
  }

  // Update the users of the node with the new indices
  // Idx advances through sub0..sub3 only when the current lane had a user,
  // matching the repacked dmask layout.
  for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {

    SDNode *User = Users[i];
    if (!User)
      continue;

    SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
    DAG.UpdateNodeOperands(User, User->getOperand(0), Op);

    switch (Idx) {
    default: break;
    case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
    case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
    case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
    }
  }
}
|
|
|
|
|
2015-07-17 03:40:07 +08:00
|
|
|
/// \brief Return true if \p Op is a frame index, looking through an
/// AssertZext wrapper if one is present.
static bool isFrameIndexOp(SDValue Op) {
  SDValue Inner = (Op.getOpcode() == ISD::AssertZext) ? Op.getOperand(0) : Op;
  return isa<FrameIndexSDNode>(Inner);
}
|
|
|
|
|
2014-10-10 03:06:00 +08:00
|
|
|
/// \brief Legalize target independent instructions (e.g. INSERT_SUBREG)
|
|
|
|
/// with frame index operands.
|
|
|
|
/// LLVM assumes that inputs are to these instructions are registers.
|
|
|
|
void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
|
|
|
|
SelectionDAG &DAG) const {
|
2014-10-10 02:09:15 +08:00
|
|
|
|
|
|
|
SmallVector<SDValue, 8> Ops;
|
2014-10-10 03:06:00 +08:00
|
|
|
for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
|
2015-07-17 03:40:07 +08:00
|
|
|
if (!isFrameIndexOp(Node->getOperand(i))) {
|
2014-10-10 03:06:00 +08:00
|
|
|
Ops.push_back(Node->getOperand(i));
|
2014-10-10 02:09:15 +08:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2014-10-10 03:06:00 +08:00
|
|
|
SDLoc DL(Node);
|
2014-10-10 02:09:15 +08:00
|
|
|
Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
|
2014-10-10 03:06:00 +08:00
|
|
|
Node->getOperand(i).getValueType(),
|
|
|
|
Node->getOperand(i)), 0));
|
2014-10-10 02:09:15 +08:00
|
|
|
}
|
|
|
|
|
2014-10-10 03:06:00 +08:00
|
|
|
DAG.UpdateNodeOperands(Node, Ops);
|
2014-10-10 02:09:15 +08:00
|
|
|
}
|
|
|
|
|
2014-06-04 07:06:13 +08:00
|
|
|
/// \brief Fold the instructions after selecting them.
|
2013-04-10 16:39:08 +08:00
|
|
|
SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
|
|
|
|
SelectionDAG &DAG) const {
|
2016-06-24 14:30:11 +08:00
|
|
|
const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
|
2016-02-19 00:44:18 +08:00
|
|
|
unsigned Opcode = Node->getMachineOpcode();
|
2013-04-10 16:39:08 +08:00
|
|
|
|
2016-07-12 05:59:43 +08:00
|
|
|
if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() &&
|
|
|
|
!TII->isGather4(Opcode))
|
2013-04-10 16:39:08 +08:00
|
|
|
adjustWritemask(Node, DAG);
|
|
|
|
|
2016-02-19 00:44:18 +08:00
|
|
|
if (Opcode == AMDGPU::INSERT_SUBREG ||
|
|
|
|
Opcode == AMDGPU::REG_SEQUENCE) {
|
2014-10-10 02:09:15 +08:00
|
|
|
legalizeTargetIndependentNode(Node, DAG);
|
|
|
|
return Node;
|
|
|
|
}
|
2015-01-08 23:08:17 +08:00
|
|
|
return Node;
|
2013-04-10 16:39:08 +08:00
|
|
|
}
|
2013-04-10 16:39:16 +08:00
|
|
|
|
|
|
|
/// \brief Assign the register class depending on the number of
/// bits set in the writemask
///
/// Post-instruction-selection fixups on the selected MachineInstr:
/// VOP3 constant-bus legalization, MIMG result-class shrinking, and
/// conversion of atomics with unused results to their no-return forms.
void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
                                                     SDNode *Node) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

  MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();

  if (TII->isVOP3(MI.getOpcode())) {
    // Make sure constant bus requirements are respected.
    TII->legalizeOperandsVOP3(MRI, MI);
    return;
  }

  if (TII->isMIMG(MI)) {
    unsigned VReg = MI.getOperand(0).getReg();
    const TargetRegisterClass *RC = MRI.getRegClass(VReg);
    // TODO: Need mapping tables to handle other cases (register classes).
    if (RC != &AMDGPU::VReg_128RegClass)
      return;

    // NOTE(review): the operand count appears to distinguish two MIMG
    // operand layouts, placing dmask at index 3 or 4 — confirm against the
    // MIMG instruction definitions.
    unsigned DmaskIdx = MI.getNumOperands() == 12 ? 3 : 4;
    unsigned Writemask = MI.getOperand(DmaskIdx).getImm();
    // Count the components written (population count of the 4-bit dmask).
    unsigned BitsSet = 0;
    for (unsigned i = 0; i < 4; ++i)
      BitsSet += Writemask & (1 << i) ? 1 : 0;
    // Shrink the destination register class to match the component count.
    // Four components (and zero, via default) need no change.
    switch (BitsSet) {
    default: return;
    case 1: RC = &AMDGPU::VGPR_32RegClass; break;
    case 2: RC = &AMDGPU::VReg_64RegClass; break;
    case 3: RC = &AMDGPU::VReg_96RegClass; break;
    }

    unsigned NewOpcode = TII->getMaskedMIMGOp(MI.getOpcode(), BitsSet);
    MI.setDesc(TII->get(NewOpcode));
    MRI.setRegClass(VReg, RC);
    return;
  }

  // Replace unused atomics with the no return version.
  int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
  if (NoRetAtomicOp != -1) {
    if (!Node->hasAnyUseOfValue(0)) {
      // Result is dead: switch to the no-return opcode and drop the def.
      MI.setDesc(TII->get(NoRetAtomicOp));
      MI.RemoveOperand(0);
      return;
    }

    // For mubuf_atomic_cmpswap, we need to have tablegen use an extract_subreg
    // instruction, because the return type of these instructions is a vec2 of
    // the memory type, so it can be tied to the input operand.
    // This means these instructions always have a use, so we need to add a
    // special case to check if the atomic has only one extract_subreg use,
    // which itself has no uses.
    if ((Node->hasNUsesOfValue(1, 0) &&
         Node->use_begin()->isMachineOpcode() &&
         Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG &&
         !Node->use_begin()->hasAnyUseOfValue(0))) {
      unsigned Def = MI.getOperand(0).getReg();

      // Change this into a noret atomic.
      MI.setDesc(TII->get(NoRetAtomicOp));
      MI.RemoveOperand(0);

      // If we only remove the def operand from the atomic instruction, the
      // extract_subreg will be left with a use of a vreg without a def.
      // So we need to insert an implicit_def to avoid machine verifier
      // errors.
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
              TII->get(AMDGPU::IMPLICIT_DEF), Def);
    }
    return;
  }
}
|
2013-06-04 01:39:58 +08:00
|
|
|
|
2016-06-12 23:39:02 +08:00
|
|
|
/// \brief Materialize the 32-bit immediate \p Val via an S_MOV_B32 node.
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
                              uint64_t Val) {
  SDValue Imm = DAG.getTargetConstant(Val, DL, MVT::i32);
  MachineSDNode *Mov =
      DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, Imm);
  return SDValue(Mov, 0);
}
|
|
|
|
|
|
|
|
/// \brief Wrap a 64-bit pointer \p Ptr into a full 128-bit resource
/// descriptor, using the default data format for the high half.
MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
                                                const SDLoc &DL,
                                                SDValue Ptr) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

  // Build the half of the subregister with the constants before building the
  // full 128-bit register. If we are building multiple resource descriptors,
  // this will allow CSEing of the 2-component register.
  const SDValue HiHalfOps[] = {
    DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
    buildSMovImm32(DAG, DL, 0),
    DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
    buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
    DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
  };

  SDValue RsrcHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
                                              MVT::v2i32, HiHalfOps), 0);

  // Combine the constants and the pointer.
  const SDValue RsrcOps[] = {
    DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
    Ptr,
    DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
    RsrcHi,
    DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
  };

  return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, RsrcOps);
}
|
|
|
|
|
2014-11-06 03:01:19 +08:00
|
|
|
/// \brief Return a resource descriptor with the 'Add TID' bit enabled
|
2015-08-09 02:27:36 +08:00
|
|
|
/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
|
|
|
|
/// of the resource descriptor) to create an offset, which is added to
|
|
|
|
/// the resource pointer.
|
2016-06-12 23:39:02 +08:00
|
|
|
MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
|
|
|
|
SDValue Ptr, uint32_t RsrcDword1,
|
2014-11-06 03:01:19 +08:00
|
|
|
uint64_t RsrcDword2And3) const {
|
|
|
|
SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
|
|
|
|
SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
|
|
|
|
if (RsrcDword1) {
|
|
|
|
PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
|
2015-04-28 22:05:47 +08:00
|
|
|
DAG.getConstant(RsrcDword1, DL, MVT::i32)),
|
|
|
|
0);
|
2014-11-06 03:01:19 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
SDValue DataLo = buildSMovImm32(DAG, DL,
|
|
|
|
RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
|
|
|
|
SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
|
|
|
|
|
|
|
|
const SDValue Ops[] = {
|
2015-04-28 22:05:47 +08:00
|
|
|
DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
|
2014-11-06 03:01:19 +08:00
|
|
|
PtrLo,
|
2015-04-28 22:05:47 +08:00
|
|
|
DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
|
2014-11-06 03:01:19 +08:00
|
|
|
PtrHi,
|
2015-04-28 22:05:47 +08:00
|
|
|
DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
|
2014-11-06 03:01:19 +08:00
|
|
|
DataLo,
|
2015-04-28 22:05:47 +08:00
|
|
|
DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
|
2014-11-06 03:01:19 +08:00
|
|
|
DataHi,
|
2015-04-28 22:05:47 +08:00
|
|
|
DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)
|
2014-11-06 03:01:19 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
|
|
|
|
}
|
|
|
|
|
2013-06-04 01:40:18 +08:00
|
|
|
SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
                                               const TargetRegisterClass *RC,
                                               unsigned Reg, EVT VT) const {
  // Let the generic implementation create the live-in virtual register,
  // then read it back with an explicit CopyFromReg at the function entry.
  SDValue VReg = AMDGPUTargetLowering::CreateLiveInRegister(DAG, RC, Reg, VT);
  SDValue Entry = DAG.getEntryNode();
  unsigned VRegNum = cast<RegisterSDNode>(VReg)->getReg();
  return DAG.getCopyFromReg(Entry, SDLoc(Entry), VRegNum, VT);
}
|
2015-04-08 09:09:26 +08:00
|
|
|
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
// SI Inline Assembly Support
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
std::pair<unsigned, const TargetRegisterClass *>
|
|
|
|
SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
|
2015-07-06 03:29:18 +08:00
|
|
|
StringRef Constraint,
|
2015-04-08 09:09:26 +08:00
|
|
|
MVT VT) const {
|
2016-11-18 12:42:57 +08:00
|
|
|
if (!isTypeLegal(VT))
|
|
|
|
return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
|
2015-12-10 10:12:53 +08:00
|
|
|
|
|
|
|
if (Constraint.size() == 1) {
|
|
|
|
switch (Constraint[0]) {
|
|
|
|
case 's':
|
|
|
|
case 'r':
|
|
|
|
switch (VT.getSizeInBits()) {
|
|
|
|
default:
|
|
|
|
return std::make_pair(0U, nullptr);
|
|
|
|
case 32:
|
2016-11-26 01:37:09 +08:00
|
|
|
return std::make_pair(0U, &AMDGPU::SReg_32_XM0RegClass);
|
2015-12-10 10:12:53 +08:00
|
|
|
case 64:
|
|
|
|
return std::make_pair(0U, &AMDGPU::SGPR_64RegClass);
|
|
|
|
case 128:
|
|
|
|
return std::make_pair(0U, &AMDGPU::SReg_128RegClass);
|
|
|
|
case 256:
|
|
|
|
return std::make_pair(0U, &AMDGPU::SReg_256RegClass);
|
|
|
|
}
|
|
|
|
|
|
|
|
case 'v':
|
|
|
|
switch (VT.getSizeInBits()) {
|
|
|
|
default:
|
|
|
|
return std::make_pair(0U, nullptr);
|
|
|
|
case 32:
|
|
|
|
return std::make_pair(0U, &AMDGPU::VGPR_32RegClass);
|
|
|
|
case 64:
|
|
|
|
return std::make_pair(0U, &AMDGPU::VReg_64RegClass);
|
|
|
|
case 96:
|
|
|
|
return std::make_pair(0U, &AMDGPU::VReg_96RegClass);
|
|
|
|
case 128:
|
|
|
|
return std::make_pair(0U, &AMDGPU::VReg_128RegClass);
|
|
|
|
case 256:
|
|
|
|
return std::make_pair(0U, &AMDGPU::VReg_256RegClass);
|
|
|
|
case 512:
|
|
|
|
return std::make_pair(0U, &AMDGPU::VReg_512RegClass);
|
|
|
|
}
|
2015-04-08 09:09:26 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (Constraint.size() > 1) {
|
|
|
|
const TargetRegisterClass *RC = nullptr;
|
|
|
|
if (Constraint[1] == 'v') {
|
|
|
|
RC = &AMDGPU::VGPR_32RegClass;
|
|
|
|
} else if (Constraint[1] == 's') {
|
|
|
|
RC = &AMDGPU::SGPR_32RegClass;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (RC) {
|
2015-06-23 10:05:55 +08:00
|
|
|
uint32_t Idx;
|
|
|
|
bool Failed = Constraint.substr(2).getAsInteger(10, Idx);
|
|
|
|
if (!Failed && Idx < RC->getNumRegs())
|
2015-04-08 09:09:26 +08:00
|
|
|
return std::make_pair(RC->getRegister(Idx), RC);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
|
|
|
|
}
|
2015-12-10 10:12:53 +08:00
|
|
|
|
|
|
|
SITargetLowering::ConstraintType
SITargetLowering::getConstraintType(StringRef Constraint) const {
  // 's' (SGPR) and 'v' (VGPR) are register-class constraints; defer
  // everything else to the generic implementation.
  if (Constraint.size() == 1 &&
      (Constraint[0] == 's' || Constraint[0] == 'v'))
    return C_RegisterClass;
  return TargetLowering::getConstraintType(Constraint);
}
|