2012-12-12 05:25:42 +08:00
|
|
|
//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
|
|
|
|
//
|
|
|
|
// The LLVM Compiler Infrastructure
|
|
|
|
//
|
|
|
|
// This file is distributed under the University of Illinois Open Source
|
|
|
|
// License. See LICENSE.TXT for details.
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
//
|
|
|
|
/// \file
|
|
|
|
/// \brief Custom DAG lowering for SI
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
2014-07-20 19:15:07 +08:00
|
|
|
#ifdef _MSC_VER
|
|
|
|
// Provide M_PI.
|
|
|
|
#define _USE_MATH_DEFINES
|
|
|
|
#endif
|
|
|
|
|
2017-06-06 19:49:48 +08:00
|
|
|
#include "SIISelLowering.h"
|
2013-03-07 17:04:14 +08:00
|
|
|
#include "AMDGPU.h"
|
2014-06-24 02:00:31 +08:00
|
|
|
#include "AMDGPUIntrinsicInfo.h"
|
2014-02-25 05:01:28 +08:00
|
|
|
#include "AMDGPUSubtarget.h"
|
2017-06-06 19:49:48 +08:00
|
|
|
#include "AMDGPUTargetMachine.h"
|
2016-12-07 10:42:15 +08:00
|
|
|
#include "SIDefines.h"
|
2012-12-12 05:25:42 +08:00
|
|
|
#include "SIInstrInfo.h"
|
|
|
|
#include "SIMachineFunctionInfo.h"
|
|
|
|
#include "SIRegisterInfo.h"
|
2017-01-21 08:53:49 +08:00
|
|
|
#include "Utils/AMDGPUBaseInfo.h"
|
|
|
|
#include "llvm/ADT/APFloat.h"
|
|
|
|
#include "llvm/ADT/APInt.h"
|
|
|
|
#include "llvm/ADT/ArrayRef.h"
|
2014-08-28 03:36:53 +08:00
|
|
|
#include "llvm/ADT/BitVector.h"
|
2017-01-21 08:53:49 +08:00
|
|
|
#include "llvm/ADT/SmallVector.h"
|
2017-08-12 04:42:08 +08:00
|
|
|
#include "llvm/ADT/Statistic.h"
|
2017-01-21 08:53:49 +08:00
|
|
|
#include "llvm/ADT/StringRef.h"
|
2016-01-26 12:29:24 +08:00
|
|
|
#include "llvm/ADT/StringSwitch.h"
|
2017-01-21 08:53:49 +08:00
|
|
|
#include "llvm/ADT/Twine.h"
|
|
|
|
#include "llvm/CodeGen/Analysis.h"
|
2013-03-07 17:03:52 +08:00
|
|
|
#include "llvm/CodeGen/CallingConvLower.h"
|
2017-01-21 08:53:49 +08:00
|
|
|
#include "llvm/CodeGen/DAGCombine.h"
|
|
|
|
#include "llvm/CodeGen/ISDOpcodes.h"
|
|
|
|
#include "llvm/CodeGen/MachineBasicBlock.h"
|
|
|
|
#include "llvm/CodeGen/MachineFrameInfo.h"
|
|
|
|
#include "llvm/CodeGen/MachineFunction.h"
|
|
|
|
#include "llvm/CodeGen/MachineInstr.h"
|
2012-12-12 05:25:42 +08:00
|
|
|
#include "llvm/CodeGen/MachineInstrBuilder.h"
|
2017-01-21 08:53:49 +08:00
|
|
|
#include "llvm/CodeGen/MachineMemOperand.h"
|
2017-08-04 07:00:29 +08:00
|
|
|
#include "llvm/CodeGen/MachineModuleInfo.h"
|
2017-01-21 08:53:49 +08:00
|
|
|
#include "llvm/CodeGen/MachineOperand.h"
|
2012-12-12 05:25:42 +08:00
|
|
|
#include "llvm/CodeGen/MachineRegisterInfo.h"
|
2017-01-21 08:53:49 +08:00
|
|
|
#include "llvm/CodeGen/MachineValueType.h"
|
2012-12-12 05:25:42 +08:00
|
|
|
#include "llvm/CodeGen/SelectionDAG.h"
|
2017-01-21 08:53:49 +08:00
|
|
|
#include "llvm/CodeGen/SelectionDAGNodes.h"
|
|
|
|
#include "llvm/CodeGen/ValueTypes.h"
|
|
|
|
#include "llvm/IR/Constants.h"
|
|
|
|
#include "llvm/IR/DataLayout.h"
|
|
|
|
#include "llvm/IR/DebugLoc.h"
|
|
|
|
#include "llvm/IR/DerivedTypes.h"
|
2016-02-02 21:52:43 +08:00
|
|
|
#include "llvm/IR/DiagnosticInfo.h"
|
2013-05-24 01:10:37 +08:00
|
|
|
#include "llvm/IR/Function.h"
|
2017-01-21 08:53:49 +08:00
|
|
|
#include "llvm/IR/GlobalValue.h"
|
|
|
|
#include "llvm/IR/InstrTypes.h"
|
|
|
|
#include "llvm/IR/Instruction.h"
|
|
|
|
#include "llvm/IR/Instructions.h"
|
2017-03-16 07:15:12 +08:00
|
|
|
#include "llvm/IR/IntrinsicInst.h"
|
2017-01-21 08:53:49 +08:00
|
|
|
#include "llvm/IR/Type.h"
|
|
|
|
#include "llvm/Support/Casting.h"
|
|
|
|
#include "llvm/Support/CodeGen.h"
|
|
|
|
#include "llvm/Support/CommandLine.h"
|
|
|
|
#include "llvm/Support/Compiler.h"
|
|
|
|
#include "llvm/Support/ErrorHandling.h"
|
2017-04-28 13:31:46 +08:00
|
|
|
#include "llvm/Support/KnownBits.h"
|
2017-01-21 08:53:49 +08:00
|
|
|
#include "llvm/Support/MathExtras.h"
|
|
|
|
#include "llvm/Target/TargetCallingConv.h"
|
|
|
|
#include "llvm/Target/TargetOptions.h"
|
|
|
|
#include "llvm/Target/TargetRegisterInfo.h"
|
|
|
|
#include <cassert>
|
|
|
|
#include <cmath>
|
|
|
|
#include <cstdint>
|
|
|
|
#include <iterator>
|
|
|
|
#include <tuple>
|
|
|
|
#include <utility>
|
|
|
|
#include <vector>
|
2012-12-12 05:25:42 +08:00
|
|
|
|
|
|
|
using namespace llvm;
|
|
|
|
|
2017-08-12 04:42:08 +08:00
|
|
|
#define DEBUG_TYPE "si-lower"
|
|
|
|
|
|
|
|
STATISTIC(NumTailCalls, "Number of tail calls");
|
|
|
|
|
2016-10-13 02:49:05 +08:00
|
|
|
// Command-line flag "amdgpu-vgpr-index-mode"; defaults to false.
static cl::opt<bool> EnableVGPRIndexMode(
    "amdgpu-vgpr-index-mode", cl::init(false),
    cl::desc("Use GPR indexing mode instead of movrel for vector indexing"));
|
|
|
|
|
2016-04-15 00:27:03 +08:00
|
|
|
/// Return the lowest-numbered register of the form SGPR0 + N that \p CCInfo
/// does not report as allocated, where N ranges over the size of
/// SGPR_32RegClass. Hitting the end of the range without a free register is
/// treated as unreachable.
static unsigned findFirstFreeSGPR(CCState &CCInfo) {
  const unsigned SGPRCount = AMDGPU::SGPR_32RegClass.getNumRegs();

  // Skip over every register that has already been handed out.
  unsigned Idx = 0;
  while (Idx < SGPRCount && CCInfo.isAllocated(AMDGPU::SGPR0 + Idx))
    ++Idx;

  if (Idx < SGPRCount)
    return AMDGPU::SGPR0 + Idx;

  llvm_unreachable("Cannot allocate sgpr");
}
|
|
|
|
|
2016-06-24 14:30:11 +08:00
|
|
|
SITargetLowering::SITargetLowering(const TargetMachine &TM,
|
|
|
|
const SISubtarget &STI)
|
2015-01-31 07:24:40 +08:00
|
|
|
: AMDGPUTargetLowering(TM, STI) {
|
2014-04-30 23:31:33 +08:00
|
|
|
addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
|
2014-05-15 22:41:57 +08:00
|
|
|
addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
|
2013-03-07 17:03:38 +08:00
|
|
|
|
2016-11-26 01:37:09 +08:00
|
|
|
addRegisterClass(MVT::i32, &AMDGPU::SReg_32_XM0RegClass);
|
2015-01-08 04:59:25 +08:00
|
|
|
addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
|
2012-12-12 05:25:42 +08:00
|
|
|
|
2014-05-15 22:41:57 +08:00
|
|
|
addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
|
|
|
|
addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
|
|
|
|
addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
|
2013-03-07 17:03:38 +08:00
|
|
|
|
2015-11-26 03:58:34 +08:00
|
|
|
addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass);
|
|
|
|
addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass);
|
|
|
|
|
2014-05-15 22:41:57 +08:00
|
|
|
addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
|
|
|
|
addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
|
2013-03-07 17:03:38 +08:00
|
|
|
|
2014-11-19 04:39:39 +08:00
|
|
|
addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
|
2013-03-07 17:03:38 +08:00
|
|
|
addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
|
|
|
|
|
2014-11-19 04:39:39 +08:00
|
|
|
addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
|
2013-03-07 17:03:38 +08:00
|
|
|
addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
|
2012-12-12 05:25:42 +08:00
|
|
|
|
2016-11-13 15:01:11 +08:00
|
|
|
if (Subtarget->has16BitInsts()) {
|
2016-11-26 01:37:09 +08:00
|
|
|
addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass);
|
|
|
|
addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass);
|
2016-11-13 15:01:11 +08:00
|
|
|
}
|
2016-11-11 00:02:37 +08:00
|
|
|
|
2017-02-28 04:52:10 +08:00
|
|
|
if (Subtarget->hasVOP3PInsts()) {
|
|
|
|
addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass);
|
|
|
|
addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass);
|
|
|
|
}
|
|
|
|
|
2015-02-26 08:00:24 +08:00
|
|
|
computeRegisterProperties(STI.getRegisterInfo());
|
2012-12-12 05:25:42 +08:00
|
|
|
|
2013-08-26 23:06:04 +08:00
|
|
|
// We need to custom lower vector stores from local memory
|
2016-05-21 10:27:49 +08:00
|
|
|
setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
|
2013-08-26 23:06:04 +08:00
|
|
|
setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
|
2013-10-23 08:44:32 +08:00
|
|
|
setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
|
|
|
|
setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
|
2016-05-21 10:27:49 +08:00
|
|
|
setOperationAction(ISD::LOAD, MVT::i1, Custom);
|
2013-10-23 08:44:32 +08:00
|
|
|
|
2016-05-21 10:27:49 +08:00
|
|
|
setOperationAction(ISD::STORE, MVT::v2i32, Custom);
|
|
|
|
setOperationAction(ISD::STORE, MVT::v4i32, Custom);
|
2013-10-23 08:44:32 +08:00
|
|
|
setOperationAction(ISD::STORE, MVT::v8i32, Custom);
|
|
|
|
setOperationAction(ISD::STORE, MVT::v16i32, Custom);
|
2014-03-08 04:12:33 +08:00
|
|
|
setOperationAction(ISD::STORE, MVT::i1, Custom);
|
2016-05-03 04:07:26 +08:00
|
|
|
|
2017-01-07 05:00:46 +08:00
|
|
|
setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
|
|
|
|
setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
|
|
|
|
setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
|
|
|
|
setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
|
|
|
|
setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
|
|
|
|
setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
|
|
|
|
setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
|
|
|
|
setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
|
|
|
|
setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
|
|
|
|
setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
|
|
|
|
|
2016-05-21 10:27:49 +08:00
|
|
|
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
|
|
|
|
setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
|
|
|
|
setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand);
|
2016-05-03 04:13:51 +08:00
|
|
|
|
2016-05-21 10:27:49 +08:00
|
|
|
setOperationAction(ISD::SELECT, MVT::i1, Promote);
|
2014-02-05 01:18:40 +08:00
|
|
|
setOperationAction(ISD::SELECT, MVT::i64, Custom);
|
2014-03-25 00:07:30 +08:00
|
|
|
setOperationAction(ISD::SELECT, MVT::f64, Promote);
|
|
|
|
AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
|
2013-11-14 07:36:50 +08:00
|
|
|
|
2014-06-11 00:01:22 +08:00
|
|
|
setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
|
|
|
|
setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
|
|
|
|
setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
|
|
|
|
setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
|
2016-05-21 10:27:49 +08:00
|
|
|
setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
|
2013-04-06 07:31:51 +08:00
|
|
|
|
2016-01-21 05:48:24 +08:00
|
|
|
setOperationAction(ISD::SETCC, MVT::i1, Promote);
|
2013-07-19 05:43:53 +08:00
|
|
|
setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
|
|
|
|
setOperationAction(ISD::SETCC, MVT::v4i1, Expand);
|
2016-12-23 00:27:11 +08:00
|
|
|
AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
|
2013-07-19 05:43:53 +08:00
|
|
|
|
2016-05-21 10:27:49 +08:00
|
|
|
setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
|
|
|
|
setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
|
2014-10-22 00:25:08 +08:00
|
|
|
|
2014-04-16 09:41:30 +08:00
|
|
|
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
|
|
|
|
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
|
|
|
|
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
|
|
|
|
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
|
|
|
|
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
|
|
|
|
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
|
|
|
|
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);
|
|
|
|
|
2017-04-04 02:08:08 +08:00
|
|
|
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
|
2013-08-15 07:24:45 +08:00
|
|
|
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
|
|
|
|
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
|
2017-04-04 02:08:08 +08:00
|
|
|
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);
|
|
|
|
|
2016-04-12 22:05:04 +08:00
|
|
|
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
|
2017-04-04 02:08:08 +08:00
|
|
|
|
|
|
|
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
|
2017-01-17 15:26:53 +08:00
|
|
|
setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
|
|
|
|
setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);
|
2016-04-12 22:05:04 +08:00
|
|
|
|
2014-06-24 02:00:44 +08:00
|
|
|
setOperationAction(ISD::BRCOND, MVT::Other, Custom);
|
2016-05-21 10:27:49 +08:00
|
|
|
setOperationAction(ISD::BR_CC, MVT::i1, Expand);
|
2016-02-13 07:45:29 +08:00
|
|
|
setOperationAction(ISD::BR_CC, MVT::i32, Expand);
|
|
|
|
setOperationAction(ISD::BR_CC, MVT::i64, Expand);
|
|
|
|
setOperationAction(ISD::BR_CC, MVT::f32, Expand);
|
|
|
|
setOperationAction(ISD::BR_CC, MVT::f64, Expand);
|
2013-09-12 10:55:14 +08:00
|
|
|
|
2017-01-31 02:11:38 +08:00
|
|
|
setOperationAction(ISD::UADDO, MVT::i32, Legal);
|
|
|
|
setOperationAction(ISD::USUBO, MVT::i32, Legal);
|
|
|
|
|
2017-06-22 06:05:06 +08:00
|
|
|
setOperationAction(ISD::ADDCARRY, MVT::i32, Legal);
|
|
|
|
setOperationAction(ISD::SUBCARRY, MVT::i32, Legal);
|
|
|
|
|
2015-03-08 01:41:00 +08:00
|
|
|
// We only support LOAD/STORE and vector manipulation ops for vectors
|
|
|
|
// with > 4 elements.
|
2017-02-28 04:52:10 +08:00
|
|
|
for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
|
|
|
|
MVT::v2i64, MVT::v2f64}) {
|
2014-02-14 07:34:15 +08:00
|
|
|
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
|
2016-05-21 10:27:49 +08:00
|
|
|
switch (Op) {
|
2014-02-14 07:34:15 +08:00
|
|
|
case ISD::LOAD:
|
|
|
|
case ISD::STORE:
|
|
|
|
case ISD::BUILD_VECTOR:
|
|
|
|
case ISD::BITCAST:
|
|
|
|
case ISD::EXTRACT_VECTOR_ELT:
|
|
|
|
case ISD::INSERT_VECTOR_ELT:
|
|
|
|
case ISD::INSERT_SUBVECTOR:
|
|
|
|
case ISD::EXTRACT_SUBVECTOR:
|
2015-11-26 03:58:34 +08:00
|
|
|
case ISD::SCALAR_TO_VECTOR:
|
2014-02-14 07:34:15 +08:00
|
|
|
break;
|
2014-08-09 09:06:56 +08:00
|
|
|
case ISD::CONCAT_VECTORS:
|
|
|
|
setOperationAction(Op, VT, Custom);
|
|
|
|
break;
|
2014-02-14 07:34:15 +08:00
|
|
|
default:
|
2014-05-16 05:44:05 +08:00
|
|
|
setOperationAction(Op, VT, Expand);
|
2014-02-14 07:34:15 +08:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-07-19 08:35:03 +08:00
|
|
|
// TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
|
|
|
|
// is expanded to avoid having two separate loops in case the index is a VGPR.
|
|
|
|
|
2015-11-26 03:58:34 +08:00
|
|
|
// Most operations are naturally 32-bit vector operations. We only support
|
|
|
|
// load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
|
|
|
|
for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
|
|
|
|
setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
|
|
|
|
AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
|
|
|
|
|
|
|
|
setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
|
|
|
|
AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
|
|
|
|
|
|
|
|
setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
|
|
|
|
AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
|
|
|
|
|
|
|
|
setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
|
|
|
|
AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
|
|
|
|
}
|
|
|
|
|
2016-05-21 10:27:49 +08:00
|
|
|
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
|
|
|
|
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
|
|
|
|
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
|
|
|
|
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);
|
2014-07-16 04:18:31 +08:00
|
|
|
|
2017-01-24 07:09:58 +08:00
|
|
|
// Avoid stack access for these.
|
|
|
|
// TODO: Generalize to more vector types.
|
|
|
|
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom);
|
|
|
|
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom);
|
|
|
|
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
|
|
|
|
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
|
|
|
|
|
AMDGPU: Implement {BUFFER,FLAT}_ATOMIC_CMPSWAP{,_X2}
Summary:
Implement BUFFER_ATOMIC_CMPSWAP{,_X2} instructions on all GCN targets, and FLAT_ATOMIC_CMPSWAP{,_X2} on CI+.
32-bit instruction variants tested manually on Kabini and Bonaire. Tests and parts of code provided by Jan Veselý.
Patch by: Vedran Miletić
Reviewers: arsenm, tstellarAMD, nhaehnle
Subscribers: jvesely, scchan, kanarayan, arsenm
Differential Revision: http://reviews.llvm.org/D17280
llvm-svn: 265170
2016-04-02 02:27:37 +08:00
|
|
|
// BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
|
|
|
|
// and output demarshalling
|
|
|
|
setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
|
|
|
|
setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
|
|
|
|
|
|
|
|
// We can't return success/failure, only the old value,
|
|
|
|
// let LLVM add the comparison
|
|
|
|
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);
|
|
|
|
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand);
|
|
|
|
|
2016-06-24 14:30:11 +08:00
|
|
|
if (getSubtarget()->hasFlatAddressSpace()) {
|
2016-04-26 03:27:24 +08:00
|
|
|
setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
|
|
|
|
setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
|
|
|
|
}
|
|
|
|
|
2016-05-21 10:27:49 +08:00
|
|
|
setOperationAction(ISD::BSWAP, MVT::i32, Legal);
|
|
|
|
setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
|
|
|
|
|
|
|
|
// On SI this is s_memtime and s_memrealtime on VI.
|
|
|
|
setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
|
2017-04-25 01:49:13 +08:00
|
|
|
setOperationAction(ISD::TRAP, MVT::Other, Custom);
|
|
|
|
setOperationAction(ISD::DEBUGTRAP, MVT::Other, Custom);
|
2016-05-21 10:27:49 +08:00
|
|
|
|
|
|
|
setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
|
|
|
|
setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
|
|
|
|
|
2016-06-24 14:30:11 +08:00
|
|
|
if (Subtarget->getGeneration() >= SISubtarget::SEA_ISLANDS) {
|
2016-05-21 10:27:49 +08:00
|
|
|
setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
|
|
|
|
setOperationAction(ISD::FCEIL, MVT::f64, Legal);
|
|
|
|
setOperationAction(ISD::FRINT, MVT::f64, Legal);
|
|
|
|
}
|
|
|
|
|
|
|
|
setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
|
|
|
|
|
|
|
|
setOperationAction(ISD::FSIN, MVT::f32, Custom);
|
|
|
|
setOperationAction(ISD::FCOS, MVT::f32, Custom);
|
|
|
|
setOperationAction(ISD::FDIV, MVT::f32, Custom);
|
|
|
|
setOperationAction(ISD::FDIV, MVT::f64, Custom);
|
|
|
|
|
2016-11-11 00:02:37 +08:00
|
|
|
if (Subtarget->has16BitInsts()) {
|
|
|
|
setOperationAction(ISD::Constant, MVT::i16, Legal);
|
|
|
|
|
|
|
|
setOperationAction(ISD::SMIN, MVT::i16, Legal);
|
|
|
|
setOperationAction(ISD::SMAX, MVT::i16, Legal);
|
|
|
|
|
|
|
|
setOperationAction(ISD::UMIN, MVT::i16, Legal);
|
|
|
|
setOperationAction(ISD::UMAX, MVT::i16, Legal);
|
|
|
|
|
|
|
|
setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Promote);
|
|
|
|
AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
|
|
|
|
|
|
|
|
setOperationAction(ISD::ROTR, MVT::i16, Promote);
|
|
|
|
setOperationAction(ISD::ROTL, MVT::i16, Promote);
|
|
|
|
|
|
|
|
setOperationAction(ISD::SDIV, MVT::i16, Promote);
|
|
|
|
setOperationAction(ISD::UDIV, MVT::i16, Promote);
|
|
|
|
setOperationAction(ISD::SREM, MVT::i16, Promote);
|
|
|
|
setOperationAction(ISD::UREM, MVT::i16, Promote);
|
|
|
|
|
|
|
|
setOperationAction(ISD::BSWAP, MVT::i16, Promote);
|
|
|
|
setOperationAction(ISD::BITREVERSE, MVT::i16, Promote);
|
|
|
|
|
|
|
|
setOperationAction(ISD::CTTZ, MVT::i16, Promote);
|
|
|
|
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Promote);
|
|
|
|
setOperationAction(ISD::CTLZ, MVT::i16, Promote);
|
|
|
|
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Promote);
|
|
|
|
|
|
|
|
setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);
|
|
|
|
|
|
|
|
setOperationAction(ISD::BR_CC, MVT::i16, Expand);
|
|
|
|
|
|
|
|
setOperationAction(ISD::LOAD, MVT::i16, Custom);
|
|
|
|
|
|
|
|
setTruncStoreAction(MVT::i64, MVT::i16, Expand);
|
|
|
|
|
|
|
|
setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
|
|
|
|
AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
|
|
|
|
setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
|
|
|
|
AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
|
2016-11-12 08:19:11 +08:00
|
|
|
|
2016-11-17 12:00:46 +08:00
|
|
|
setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
|
|
|
|
setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
|
|
|
|
setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
|
|
|
|
setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
|
2016-11-13 15:01:11 +08:00
|
|
|
|
|
|
|
// F16 - Constant Actions.
|
2016-12-09 04:14:46 +08:00
|
|
|
setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
|
2016-11-13 15:01:11 +08:00
|
|
|
|
|
|
|
// F16 - Load/Store Actions.
|
|
|
|
setOperationAction(ISD::LOAD, MVT::f16, Promote);
|
|
|
|
AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
|
|
|
|
setOperationAction(ISD::STORE, MVT::f16, Promote);
|
|
|
|
AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
|
2016-11-12 08:19:11 +08:00
|
|
|
|
2016-11-13 15:01:11 +08:00
|
|
|
// F16 - VOP1 Actions.
|
2016-11-17 12:28:37 +08:00
|
|
|
setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
|
2016-11-13 15:01:11 +08:00
|
|
|
setOperationAction(ISD::FCOS, MVT::f16, Promote);
|
|
|
|
setOperationAction(ISD::FSIN, MVT::f16, Promote);
|
2016-11-17 12:00:46 +08:00
|
|
|
setOperationAction(ISD::FP_TO_SINT, MVT::f16, Promote);
|
|
|
|
setOperationAction(ISD::FP_TO_UINT, MVT::f16, Promote);
|
|
|
|
setOperationAction(ISD::SINT_TO_FP, MVT::f16, Promote);
|
|
|
|
setOperationAction(ISD::UINT_TO_FP, MVT::f16, Promote);
|
2017-03-25 04:04:18 +08:00
|
|
|
setOperationAction(ISD::FROUND, MVT::f16, Custom);
|
2016-11-13 15:01:11 +08:00
|
|
|
|
|
|
|
// F16 - VOP2 Actions.
|
2016-11-17 11:49:01 +08:00
|
|
|
setOperationAction(ISD::BR_CC, MVT::f16, Expand);
|
2016-11-16 11:16:26 +08:00
|
|
|
setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
|
2016-11-13 15:01:11 +08:00
|
|
|
setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
|
|
|
|
setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
|
2016-12-22 11:05:41 +08:00
|
|
|
setOperationAction(ISD::FDIV, MVT::f16, Custom);
|
2016-11-13 15:01:11 +08:00
|
|
|
|
|
|
|
// F16 - VOP3 Actions.
|
|
|
|
setOperationAction(ISD::FMA, MVT::f16, Legal);
|
|
|
|
if (!Subtarget->hasFP16Denormals())
|
|
|
|
setOperationAction(ISD::FMAD, MVT::f16, Legal);
|
2016-11-11 00:02:37 +08:00
|
|
|
}
|
|
|
|
|
2017-02-28 04:52:10 +08:00
|
|
|
if (Subtarget->hasVOP3PInsts()) {
|
|
|
|
for (MVT VT : {MVT::v2i16, MVT::v2f16}) {
|
|
|
|
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
|
|
|
|
switch (Op) {
|
|
|
|
case ISD::LOAD:
|
|
|
|
case ISD::STORE:
|
|
|
|
case ISD::BUILD_VECTOR:
|
|
|
|
case ISD::BITCAST:
|
|
|
|
case ISD::EXTRACT_VECTOR_ELT:
|
|
|
|
case ISD::INSERT_VECTOR_ELT:
|
|
|
|
case ISD::INSERT_SUBVECTOR:
|
|
|
|
case ISD::EXTRACT_SUBVECTOR:
|
|
|
|
case ISD::SCALAR_TO_VECTOR:
|
|
|
|
break;
|
|
|
|
case ISD::CONCAT_VECTORS:
|
|
|
|
setOperationAction(Op, VT, Custom);
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
setOperationAction(Op, VT, Expand);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-02-28 06:15:25 +08:00
|
|
|
// XXX - Do these do anything? Vector constants turn into build_vector.
|
|
|
|
setOperationAction(ISD::Constant, MVT::v2i16, Legal);
|
|
|
|
setOperationAction(ISD::ConstantFP, MVT::v2f16, Legal);
|
|
|
|
|
2017-02-28 04:52:10 +08:00
|
|
|
setOperationAction(ISD::STORE, MVT::v2i16, Promote);
|
|
|
|
AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
|
|
|
|
setOperationAction(ISD::STORE, MVT::v2f16, Promote);
|
|
|
|
AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
|
|
|
|
|
|
|
|
setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
|
|
|
|
AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
|
|
|
|
setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
|
|
|
|
AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
|
2017-02-28 06:15:25 +08:00
|
|
|
|
|
|
|
setOperationAction(ISD::AND, MVT::v2i16, Promote);
|
|
|
|
AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
|
|
|
|
setOperationAction(ISD::OR, MVT::v2i16, Promote);
|
|
|
|
AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
|
|
|
|
setOperationAction(ISD::XOR, MVT::v2i16, Promote);
|
|
|
|
AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
|
|
|
|
setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
|
|
|
|
AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
|
|
|
|
setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
|
|
|
|
AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
|
|
|
|
|
|
|
|
setOperationAction(ISD::ADD, MVT::v2i16, Legal);
|
|
|
|
setOperationAction(ISD::SUB, MVT::v2i16, Legal);
|
|
|
|
setOperationAction(ISD::MUL, MVT::v2i16, Legal);
|
|
|
|
setOperationAction(ISD::SHL, MVT::v2i16, Legal);
|
|
|
|
setOperationAction(ISD::SRL, MVT::v2i16, Legal);
|
|
|
|
setOperationAction(ISD::SRA, MVT::v2i16, Legal);
|
|
|
|
setOperationAction(ISD::SMIN, MVT::v2i16, Legal);
|
|
|
|
setOperationAction(ISD::UMIN, MVT::v2i16, Legal);
|
|
|
|
setOperationAction(ISD::SMAX, MVT::v2i16, Legal);
|
|
|
|
setOperationAction(ISD::UMAX, MVT::v2i16, Legal);
|
|
|
|
|
|
|
|
setOperationAction(ISD::FADD, MVT::v2f16, Legal);
|
|
|
|
setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
|
|
|
|
setOperationAction(ISD::FMUL, MVT::v2f16, Legal);
|
|
|
|
setOperationAction(ISD::FMA, MVT::v2f16, Legal);
|
|
|
|
setOperationAction(ISD::FMINNUM, MVT::v2f16, Legal);
|
|
|
|
setOperationAction(ISD::FMAXNUM, MVT::v2f16, Legal);
|
|
|
|
|
|
|
|
// This isn't really legal, but this avoids the legalizer unrolling it (and
|
|
|
|
// allows matching fneg (fabs x) patterns)
|
|
|
|
setOperationAction(ISD::FABS, MVT::v2f16, Legal);
|
|
|
|
|
|
|
|
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
|
|
|
|
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
|
|
|
|
|
|
|
|
setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
|
|
|
|
setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
|
|
|
|
setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
|
2017-04-20 04:53:07 +08:00
|
|
|
} else {
|
|
|
|
setOperationAction(ISD::SELECT, MVT::v2i16, Custom);
|
|
|
|
setOperationAction(ISD::SELECT, MVT::v2f16, Custom);
|
|
|
|
}
|
|
|
|
|
|
|
|
for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8 }) {
|
|
|
|
setOperationAction(ISD::SELECT, VT, Custom);
|
2017-02-28 04:52:10 +08:00
|
|
|
}
|
|
|
|
|
2017-06-22 06:05:06 +08:00
|
|
|
setTargetDAGCombine(ISD::ADD);
|
2017-06-22 06:30:01 +08:00
|
|
|
setTargetDAGCombine(ISD::ADDCARRY);
|
|
|
|
setTargetDAGCombine(ISD::SUB);
|
|
|
|
setTargetDAGCombine(ISD::SUBCARRY);
|
2014-09-29 22:59:34 +08:00
|
|
|
setTargetDAGCombine(ISD::FADD);
|
2014-08-30 00:01:14 +08:00
|
|
|
setTargetDAGCombine(ISD::FSUB);
|
2014-11-15 04:08:52 +08:00
|
|
|
setTargetDAGCombine(ISD::FMINNUM);
|
|
|
|
setTargetDAGCombine(ISD::FMAXNUM);
|
2015-06-09 08:52:37 +08:00
|
|
|
setTargetDAGCombine(ISD::SMIN);
|
|
|
|
setTargetDAGCombine(ISD::SMAX);
|
|
|
|
setTargetDAGCombine(ISD::UMIN);
|
|
|
|
setTargetDAGCombine(ISD::UMAX);
|
2012-12-12 05:25:42 +08:00
|
|
|
setTargetDAGCombine(ISD::SETCC);
|
2015-01-07 07:00:46 +08:00
|
|
|
setTargetDAGCombine(ISD::AND);
|
2015-01-07 07:00:39 +08:00
|
|
|
setTargetDAGCombine(ISD::OR);
|
2016-09-14 23:19:03 +08:00
|
|
|
setTargetDAGCombine(ISD::XOR);
|
2016-10-22 06:10:03 +08:00
|
|
|
setTargetDAGCombine(ISD::SINT_TO_FP);
|
2014-06-12 01:50:44 +08:00
|
|
|
setTargetDAGCombine(ISD::UINT_TO_FP);
|
2016-04-14 09:42:16 +08:00
|
|
|
setTargetDAGCombine(ISD::FCANONICALIZE);
|
2017-02-28 06:15:25 +08:00
|
|
|
setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
|
2017-04-01 03:53:03 +08:00
|
|
|
setTargetDAGCombine(ISD::ZERO_EXTEND);
|
2017-05-12 01:26:25 +08:00
|
|
|
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
|
2017-09-21 05:01:24 +08:00
|
|
|
setTargetDAGCombine(ISD::BUILD_VECTOR);
|
2014-06-12 01:50:44 +08:00
|
|
|
|
2014-08-16 01:49:05 +08:00
|
|
|
// All memory operations. Some folding on the pointer operand is done to help
|
|
|
|
// matching the constant offsets in the addressing modes.
|
|
|
|
setTargetDAGCombine(ISD::LOAD);
|
|
|
|
setTargetDAGCombine(ISD::STORE);
|
|
|
|
setTargetDAGCombine(ISD::ATOMIC_LOAD);
|
|
|
|
setTargetDAGCombine(ISD::ATOMIC_STORE);
|
|
|
|
setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP);
|
|
|
|
setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
|
|
|
|
setTargetDAGCombine(ISD::ATOMIC_SWAP);
|
|
|
|
setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD);
|
|
|
|
setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB);
|
|
|
|
setTargetDAGCombine(ISD::ATOMIC_LOAD_AND);
|
|
|
|
setTargetDAGCombine(ISD::ATOMIC_LOAD_OR);
|
|
|
|
setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR);
|
|
|
|
setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND);
|
|
|
|
setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN);
|
|
|
|
setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX);
|
|
|
|
setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN);
|
|
|
|
setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);
|
|
|
|
|
2013-03-26 22:04:02 +08:00
|
|
|
setSchedulingPreference(Sched::RegPressure);
|
2012-12-12 05:25:42 +08:00
|
|
|
}
|
|
|
|
|
2016-06-24 14:30:11 +08:00
|
|
|
/// Return the stored Subtarget pointer viewed as an SISubtarget.
const SISubtarget *SITargetLowering::getSubtarget() const {
  const auto *ST = static_cast<const SISubtarget *>(Subtarget);
  return ST;
}
|
|
|
|
|
2013-06-25 10:39:35 +08:00
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
// TargetLowering queries
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
2017-07-26 16:06:58 +08:00
|
|
|
/// Report every shuffle mask as illegal, regardless of the mask or type.
///
/// SI has some legal vector types, but no legal vector operations. Say no
/// shuffles are legal in order to prefer scalarizing some vector operations.
bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int> /*Mask*/,
                                          EVT /*VT*/) const {
  return false;
}
|
|
|
|
|
2016-04-12 22:05:04 +08:00
|
|
|
/// Fill in \p Info for the memory-touching intrinsics handled here.
///
/// Only amdgcn_atomic_inc and amdgcn_atomic_dec are recognized; for any other
/// \p IntrID the function returns false and leaves \p Info untouched.
///
/// \param Info   Receives the opcode, memory type, pointer, alignment,
///               volatility and read/write flags.
/// \param CI     The intrinsic call; operand 0 is used as the pointer and
///               operand 4 as the volatile flag.
/// \param IntrID The intrinsic ID of \p CI.
/// \returns true if \p Info was populated.
bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                          const CallInst &CI,
                                          unsigned IntrID) const {
  const bool IsAtomicIncDec = IntrID == Intrinsic::amdgcn_atomic_inc ||
                              IntrID == Intrinsic::amdgcn_atomic_dec;
  if (!IsAtomicIncDec)
    return false;

  Info.opc = ISD::INTRINSIC_W_CHAIN;
  Info.memVT = MVT::getVT(CI.getType());
  Info.ptrVal = CI.getOperand(0);
  Info.align = 0;

  // The access is non-volatile only when operand 4 is a constant zero; a
  // non-constant operand is conservatively treated as volatile.
  const auto *VolatileArg = dyn_cast<ConstantInt>(CI.getOperand(4));
  const bool KnownNonVolatile = VolatileArg && VolatileArg->isZero();
  Info.vol = !KnownNonVolatile;

  Info.readMem = true;
  Info.writeMem = true;
  return true;
}
|
|
|
|
|
2017-03-16 07:15:12 +08:00
|
|
|
/// Report the pointer operand and accessed type of \p II.
///
/// Only amdgcn_atomic_inc and amdgcn_atomic_dec are handled: argument 0 is
/// appended to \p Ops and \p AccessTy is set to the intrinsic's result type.
/// For every other intrinsic nothing is modified.
///
/// \returns true if \p Ops and \p AccessTy were updated.
bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
                                            SmallVectorImpl<Value*> &Ops,
                                            Type *&AccessTy) const {
  const auto IID = II->getIntrinsicID();
  if (IID != Intrinsic::amdgcn_atomic_inc &&
      IID != Intrinsic::amdgcn_atomic_dec)
    return false;

  Value *PtrArg = II->getArgOperand(0);
  AccessTy = II->getType();
  Ops.push_back(PtrArg);
  return true;
}
|
|
|
|
|
2015-07-20 22:28:41 +08:00
|
|
|
bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
|
2017-06-13 01:06:35 +08:00
|
|
|
if (!Subtarget->hasFlatInstOffsets()) {
|
|
|
|
// Flat instructions do not have offsets, and only have the register
|
|
|
|
// address.
|
|
|
|
return AM.BaseOffs == 0 && AM.Scale == 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
// GFX9 added a 13-bit signed offset. When using regular flat instructions,
|
|
|
|
// the sign bit is ignored and is treated as a 12-bit unsigned offset.
|
|
|
|
|
|
|
|
// Just r + i
|
|
|
|
return isUInt<12>(AM.BaseOffs) && AM.Scale == 0;
|
2015-07-20 22:28:41 +08:00
|
|
|
}
|
|
|
|
|
2017-07-29 09:12:31 +08:00
|
|
|
bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
|
|
|
|
if (Subtarget->hasFlatGlobalInsts())
|
|
|
|
return isInt<13>(AM.BaseOffs) && AM.Scale == 0;
|
|
|
|
|
|
|
|
if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
|
|
|
|
// Assume the we will use FLAT for all global memory accesses
|
|
|
|
// on VI.
|
|
|
|
// FIXME: This assumption is currently wrong. On VI we still use
|
|
|
|
// MUBUF instructions for the r + i addressing mode. As currently
|
|
|
|
// implemented, the MUBUF instructions only work on buffer < 4GB.
|
|
|
|
// It may be possible to support > 4GB buffers with MUBUF instructions,
|
|
|
|
// by setting the stride value in the resource descriptor which would
|
|
|
|
// increase the size limit to (stride * 4GB). However, this is risky,
|
|
|
|
// because it has never been validated.
|
|
|
|
return isLegalFlatAddressingMode(AM);
|
|
|
|
}
|
|
|
|
|
|
|
|
return isLegalMUBUFAddressingMode(AM);
|
|
|
|
}
|
|
|
|
|
2015-08-08 04:18:34 +08:00
|
|
|
bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
|
|
|
|
// MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
|
|
|
|
// additionally can do r + r + i with addr64. 32-bit has more addressing
|
|
|
|
// mode options. Depending on the resource constant, it can also do
|
|
|
|
// (i64 r0) + (i32 r1) * (i14 i).
|
|
|
|
//
|
|
|
|
// Private arrays end up using a scratch buffer most of the time, so also
|
|
|
|
// assume those use MUBUF instructions. Scratch loads / stores are currently
|
|
|
|
// implemented as mubuf instructions with offen bit set, so slightly
|
|
|
|
// different than the normal addr64.
|
|
|
|
if (!isUInt<12>(AM.BaseOffs))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
// FIXME: Since we can split immediate into soffset and immediate offset,
|
|
|
|
// would it make sense to allow any immediate?
|
|
|
|
|
|
|
|
switch (AM.Scale) {
|
|
|
|
case 0: // r + i or just i, depending on HasBaseReg.
|
|
|
|
return true;
|
|
|
|
case 1:
|
|
|
|
return true; // We have r + r or r + i.
|
|
|
|
case 2:
|
|
|
|
if (AM.HasBaseReg) {
|
|
|
|
// Reject 2 * r + r.
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Allow 2 * r as r + r
|
|
|
|
// Or 2 * r + i is allowed as r + r + i.
|
|
|
|
return true;
|
|
|
|
default: // Don't allow n * r
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-07-09 10:09:40 +08:00
|
|
|
bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
|
|
|
|
const AddrMode &AM, Type *Ty,
|
2017-07-21 19:59:37 +08:00
|
|
|
unsigned AS, Instruction *I) const {
|
2014-08-16 01:17:07 +08:00
|
|
|
// No global is ever allowed as a base.
|
|
|
|
if (AM.BaseGV)
|
|
|
|
return false;
|
|
|
|
|
2017-07-29 09:12:31 +08:00
|
|
|
if (AS == AMDGPUASI.GLOBAL_ADDRESS)
|
|
|
|
return isLegalGlobalAddressingMode(AM);
|
2015-06-05 00:17:42 +08:00
|
|
|
|
2017-07-29 09:12:31 +08:00
|
|
|
if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
|
2015-08-08 04:18:34 +08:00
|
|
|
// If the offset isn't a multiple of 4, it probably isn't going to be
|
|
|
|
// correctly aligned.
|
2016-08-13 09:43:51 +08:00
|
|
|
// FIXME: Can we get the real alignment here?
|
2015-08-08 04:18:34 +08:00
|
|
|
if (AM.BaseOffs % 4 != 0)
|
|
|
|
return isLegalMUBUFAddressingMode(AM);
|
|
|
|
|
|
|
|
// There are no SMRD extloads, so if we have to do a small type access we
|
|
|
|
// will use a MUBUF load.
|
|
|
|
// FIXME?: We also need to do this if unaligned, but we don't know the
|
|
|
|
// alignment here.
|
|
|
|
if (DL.getTypeStoreSize(Ty) < 4)
|
2017-07-29 09:12:31 +08:00
|
|
|
return isLegalGlobalAddressingMode(AM);
|
2015-08-08 04:18:34 +08:00
|
|
|
|
2016-06-24 14:30:11 +08:00
|
|
|
if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) {
|
2015-08-08 04:18:34 +08:00
|
|
|
// SMRD instructions have an 8-bit, dword offset on SI.
|
|
|
|
if (!isUInt<8>(AM.BaseOffs / 4))
|
|
|
|
return false;
|
2016-06-24 14:30:11 +08:00
|
|
|
} else if (Subtarget->getGeneration() == SISubtarget::SEA_ISLANDS) {
|
2015-08-08 04:18:34 +08:00
|
|
|
// On CI+, this can also be a 32-bit literal constant offset. If it fits
|
|
|
|
// in 8-bits, it can use a smaller encoding.
|
|
|
|
if (!isUInt<32>(AM.BaseOffs / 4))
|
|
|
|
return false;
|
2017-02-19 02:29:53 +08:00
|
|
|
} else if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
|
2015-08-08 04:18:34 +08:00
|
|
|
// On VI, these use the SMEM format and the offset is 20-bit in bytes.
|
|
|
|
if (!isUInt<20>(AM.BaseOffs))
|
|
|
|
return false;
|
|
|
|
} else
|
|
|
|
llvm_unreachable("unhandled generation");
|
2015-06-05 00:17:42 +08:00
|
|
|
|
2015-08-08 04:18:34 +08:00
|
|
|
if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
|
2015-06-05 00:17:42 +08:00
|
|
|
return true;
|
2014-08-16 01:17:07 +08:00
|
|
|
|
2015-08-08 04:18:34 +08:00
|
|
|
if (AM.Scale == 1 && AM.HasBaseReg)
|
2015-06-05 00:17:42 +08:00
|
|
|
return true;
|
2015-08-08 04:18:34 +08:00
|
|
|
|
|
|
|
return false;
|
|
|
|
|
2017-03-27 22:04:01 +08:00
|
|
|
} else if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
|
2015-08-08 04:18:34 +08:00
|
|
|
return isLegalMUBUFAddressingMode(AM);
|
2017-03-27 22:04:01 +08:00
|
|
|
} else if (AS == AMDGPUASI.LOCAL_ADDRESS ||
|
|
|
|
AS == AMDGPUASI.REGION_ADDRESS) {
|
2015-06-05 00:17:42 +08:00
|
|
|
// Basic, single offset DS instructions allow a 16-bit unsigned immediate
|
|
|
|
// field.
|
|
|
|
// XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
|
|
|
|
// an 8-bit dword offset but we don't know the alignment here.
|
|
|
|
if (!isUInt<16>(AM.BaseOffs))
|
2014-08-16 01:17:07 +08:00
|
|
|
return false;
|
2015-06-05 00:17:42 +08:00
|
|
|
|
|
|
|
if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
|
|
|
|
return true;
|
|
|
|
|
|
|
|
if (AM.Scale == 1 && AM.HasBaseReg)
|
|
|
|
return true;
|
|
|
|
|
2014-08-16 01:17:07 +08:00
|
|
|
return false;
|
2017-03-27 22:04:01 +08:00
|
|
|
} else if (AS == AMDGPUASI.FLAT_ADDRESS ||
|
|
|
|
AS == AMDGPUASI.UNKNOWN_ADDRESS_SPACE) {
|
2016-04-29 14:25:10 +08:00
|
|
|
// For an unknown address space, this usually means that this is for some
|
|
|
|
// reason being used for pure arithmetic, and not based on some addressing
|
|
|
|
// computation. We don't have instructions that compute pointers with any
|
|
|
|
// addressing modes, so treat them as having no offset like flat
|
|
|
|
// instructions.
|
2015-07-20 22:28:41 +08:00
|
|
|
return isLegalFlatAddressingMode(AM);
|
2017-03-27 22:04:01 +08:00
|
|
|
} else {
|
2015-06-05 00:17:42 +08:00
|
|
|
llvm_unreachable("unhandled address space");
|
|
|
|
}
|
2014-08-16 01:17:07 +08:00
|
|
|
}
|
|
|
|
|
2017-07-11 04:25:54 +08:00
|
|
|
bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
|
|
|
|
const SelectionDAG &DAG) const {
|
2017-05-24 23:59:09 +08:00
|
|
|
if (AS == AMDGPUASI.GLOBAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS) {
|
|
|
|
return (MemVT.getSizeInBits() <= 4 * 32);
|
|
|
|
} else if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
|
|
|
|
unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
|
|
|
|
return (MemVT.getSizeInBits() <= MaxPrivateBits);
|
|
|
|
} else if (AS == AMDGPUASI.LOCAL_ADDRESS) {
|
|
|
|
return (MemVT.getSizeInBits() <= 2 * 32);
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2015-01-14 09:35:22 +08:00
|
|
|
/// Decide whether an access of type \p VT in \p AddrSpace with alignment
/// \p Align is permitted even though it may be under-aligned.
///
/// \param IsFast optional out-parameter; when non-null it is always written
///        (false on the early rejections) and tells whether the permitted
///        access is also expected to be fast.
/// \returns true if the access is allowed.
bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
                                                      unsigned AddrSpace,
                                                      unsigned Align,
                                                      bool *IsFast) const {
  // Writes the speed hint only when the caller asked for it.
  const auto SetFast = [IsFast](bool Fast) {
    if (IsFast)
      *IsFast = Fast;
  };
  const bool DwordAligned = Align % 4 == 0;

  SetFast(false);

  // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
  // which isn't a simple VT.
  // Until MVT is extended to handle this, simply check for the size and
  // rely on the condition below: allow accesses if the size is a multiple of 4.
  if (VT == MVT::Other ||
      (VT.getSizeInBits() > 1024 && VT.getStoreSize() > 16))
    return false;

  if (AddrSpace == AMDGPUASI.LOCAL_ADDRESS ||
      AddrSpace == AMDGPUASI.REGION_ADDRESS) {
    // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
    // aligned, 8 byte access in a single operation using ds_read2/write2_b32
    // with adjacent offsets.
    SetFast(DwordAligned);
    return DwordAligned;
  }

  // FIXME: We have to be conservative here and assume that flat operations
  // will access scratch. If we had access to the IR function, then we
  // could determine if any private memory was used in the function.
  const bool MayTouchScratch = AddrSpace == AMDGPUASI.PRIVATE_ADDRESS ||
                               AddrSpace == AMDGPUASI.FLAT_ADDRESS;
  if (!Subtarget->hasUnalignedScratchAccess() && MayTouchScratch)
    return false;

  if (Subtarget->hasUnalignedBufferAccess()) {
    // If we have an uniform constant load, it still requires using a slow
    // buffer instruction if unaligned.
    SetFast(AddrSpace != AMDGPUASI.CONSTANT_ADDRESS || DwordAligned);
    return true;
  }

  // Smaller than dword value must be aligned.
  if (VT.bitsLT(MVT::i32))
    return false;

  // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
  // byte-address are ignored, thus forcing Dword alignment.
  // This applies to private, global, and constant memory.
  SetFast(true);

  return VT.bitsGT(MVT::i32) && DwordAligned;
}
|
|
|
|
|
2014-07-29 01:49:26 +08:00
|
|
|
/// Pick the value type used to expand memcpy / memset style operations.
///
/// With a destination alignment of at least 4 bytes, v4i32 is chosen for
/// sizes of 16 bytes or more and v2i32 for sizes of 8 bytes or more. In every
/// other case MVT::Other is returned so the generic default applies. Only
/// \p Size and \p DstAlign are inspected.
EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
                                          unsigned SrcAlign, bool IsMemset,
                                          bool ZeroMemset,
                                          bool MemcpyStrSrc,
                                          MachineFunction &MF) const {
  // FIXME: Should account for address space here.

  // The default fallback uses the private pointer size as a guess for a type to
  // use. Make sure we switch these to 64-bit accesses.
  if (DstAlign >= 4) {
    if (Size >= 16) // XXX: Should only do for global
      return MVT::v4i32;

    if (Size >= 8)
      return MVT::v2i32;
  }

  // Use the default.
  return MVT::Other;
}
|
|
|
|
|
2017-03-27 22:04:01 +08:00
|
|
|
static bool isFlatGlobalAddrSpace(unsigned AS, AMDGPUAS AMDGPUASI) {
|
|
|
|
return AS == AMDGPUASI.GLOBAL_ADDRESS ||
|
|
|
|
AS == AMDGPUASI.FLAT_ADDRESS ||
|
|
|
|
AS == AMDGPUASI.CONSTANT_ADDRESS;
|
2015-12-02 07:04:00 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
|
|
|
|
unsigned DestAS) const {
|
2017-03-27 22:04:01 +08:00
|
|
|
return isFlatGlobalAddrSpace(SrcAS, AMDGPUASI) &&
|
|
|
|
isFlatGlobalAddrSpace(DestAS, AMDGPUASI);
|
2015-12-02 07:04:00 +08:00
|
|
|
}
|
|
|
|
|
2016-12-09 01:28:47 +08:00
|
|
|
bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
|
|
|
|
const MemSDNode *MemNode = cast<MemSDNode>(N);
|
|
|
|
const Value *Ptr = MemNode->getMemOperand()->getValue();
|
|
|
|
const Instruction *I = dyn_cast<Instruction>(Ptr);
|
|
|
|
return I && I->getMetadata("amdgpu.noclobber");
|
|
|
|
}
|
|
|
|
|
2016-12-03 02:12:53 +08:00
|
|
|
bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS,
|
|
|
|
unsigned DestAS) const {
|
|
|
|
// Flat -> private/local is a simple truncate.
|
|
|
|
// Flat -> global is no-op
|
2017-03-27 22:04:01 +08:00
|
|
|
if (SrcAS == AMDGPUASI.FLAT_ADDRESS)
|
2016-12-03 02:12:53 +08:00
|
|
|
return true;
|
|
|
|
|
|
|
|
return isNoopAddrSpaceCast(SrcAS, DestAS);
|
|
|
|
}
|
|
|
|
|
2015-12-16 04:55:55 +08:00
|
|
|
bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
|
|
|
|
const MemSDNode *MemNode = cast<MemSDNode>(N);
|
|
|
|
|
2017-01-28 02:41:14 +08:00
|
|
|
return AMDGPU::isUniformMMO(MemNode->getMemOperand());
|
2015-12-16 04:55:55 +08:00
|
|
|
}
|
|
|
|
|
2014-07-03 08:23:43 +08:00
|
|
|
/// Choose how an illegal vector type should be legalized.
///
/// Vectors with more than one element whose scalar type is at most 16 bits
/// wide are split; everything else follows the generic TargetLoweringBase
/// policy.
TargetLoweringBase::LegalizeTypeAction
SITargetLowering::getPreferredVectorAction(EVT VT) const {
  const bool HasSeveralElts = VT.getVectorNumElements() != 1;
  if (HasSeveralElts && VT.getScalarType().bitsLE(MVT::i16))
    return TypeSplitVector;

  return TargetLoweringBase::getPreferredVectorAction(VT);
}
|
2013-06-25 10:39:35 +08:00
|
|
|
|
2014-04-01 03:54:27 +08:00
|
|
|
bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
|
|
|
|
Type *Ty) const {
|
2016-07-30 09:40:36 +08:00
|
|
|
// FIXME: Could be smarter if called for vector constants.
|
|
|
|
return true;
|
2014-04-01 03:54:27 +08:00
|
|
|
}
|
|
|
|
|
2016-01-20 08:13:22 +08:00
|
|
|
bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
|
2016-12-10 01:57:43 +08:00
|
|
|
if (Subtarget->has16BitInsts() && VT == MVT::i16) {
|
|
|
|
switch (Op) {
|
|
|
|
case ISD::LOAD:
|
|
|
|
case ISD::STORE:
|
|
|
|
|
|
|
|
// These operations are done with 32-bit instructions anyway.
|
|
|
|
case ISD::AND:
|
|
|
|
case ISD::OR:
|
|
|
|
case ISD::XOR:
|
|
|
|
case ISD::SELECT:
|
|
|
|
// TODO: Extensions?
|
|
|
|
return true;
|
|
|
|
default:
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
2016-09-29 04:05:39 +08:00
|
|
|
|
2016-01-20 08:13:22 +08:00
|
|
|
// SimplifySetCC uses this function to determine whether or not it should
|
|
|
|
// create setcc with i1 operands. We don't have instructions for i1 setcc.
|
|
|
|
if (VT == MVT::i1 && Op == ISD::SETCC)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
return TargetLowering::isTypeDesirableForOp(Op, VT);
|
|
|
|
}
|
|
|
|
|
2017-04-12 06:29:24 +08:00
|
|
|
/// Build a constant-address-space pointer to byte \p Offset within the
/// kernel argument segment.
///
/// The segment base is read from the live-in virtual register associated
/// with the preloaded KERNARG_SEGMENT_PTR argument (copied on \p Chain), and
/// \p Offset is added to it with an ISD::ADD.
SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
                                                   const SDLoc &SL,
                                                   SDValue Chain,
                                                   uint64_t Offset) const {
  MachineFunction &MF = DAG.getMachineFunction();
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  // Only the argument descriptor of the (descriptor, register class) pair is
  // needed here.
  const ArgDescriptor *KernargPtrArg =
      FuncInfo->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)
          .first;

  const MVT PtrVT =
      getPointerTy(DAG.getDataLayout(), AMDGPUASI.CONSTANT_ADDRESS);
  const unsigned LiveInVReg =
      MF.getRegInfo().getLiveInVirtReg(KernargPtrArg->getRegister());

  SDValue SegmentBase = DAG.getCopyFromReg(Chain, SL, LiveInVReg, PtrVT);
  SDValue ByteOffset = DAG.getConstant(Offset, SL, PtrVT);
  return DAG.getNode(ISD::ADD, SL, PtrVT, SegmentBase, ByteOffset);
}
|
2016-11-13 15:01:11 +08:00
|
|
|
|
2017-07-28 23:52:08 +08:00
|
|
|
/// Build a pointer to the first implicit kernel argument: the kernarg
/// segment pointer advanced by the offset that getImplicitParameterOffset
/// reports for FIRST_IMPLICIT, chained on the DAG entry node.
SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
                                            const SDLoc &SL) const {
  auto *FuncInfo = DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
  const uint64_t FirstImplicitOffset =
      getImplicitParameterOffset(FuncInfo, FIRST_IMPLICIT);
  return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(),
                                  FirstImplicitOffset);
}
|
|
|
|
|
2017-04-12 06:29:24 +08:00
|
|
|
/// Convert an argument value \p Val, loaded as \p MemVT, to the type \p VT.
///
/// When \p Arg carries a sext/zext flag and \p VT is narrower than \p MemVT,
/// the value is first wrapped in the matching AssertSext / AssertZext node.
/// Floating-point memory types are then converted with an FP extend or
/// truncate; integers use a sign or zero extend-or-truncate selected by
/// \p Signed.
SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
                                         const SDLoc &SL, SDValue Val,
                                         bool Signed,
                                         const ISD::InputArg *Arg) const {
  const bool HasExtFlag =
      Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt());
  if (HasExtFlag && VT.bitsLT(MemVT)) {
    // Record what is known about the bits above VT.
    const unsigned AssertOpc =
        Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
    Val = DAG.getNode(AssertOpc, SL, MemVT, Val, DAG.getValueType(VT));
  }

  if (MemVT.isFloatingPoint())
    return getFPExtOrFPTrunc(DAG, Val, SL, VT);

  if (Signed)
    return DAG.getSExtOrTrunc(Val, SL, VT);

  return DAG.getZExtOrTrunc(Val, SL, VT);
}
|
|
|
|
|
2017-04-12 06:29:24 +08:00
|
|
|
/// Load a kernel argument of memory type \p MemVT located \p Offset bytes
/// into the kernarg segment and convert it to \p VT.
///
/// The load uses the ABI alignment of the memory type and is marked
/// non-temporal, dereferenceable and invariant.
///
/// \returns a merge of the converted value and the load's output chain.
SDValue SITargetLowering::lowerKernargMemParameter(
  SelectionDAG &DAG, EVT VT, EVT MemVT,
  const SDLoc &SL, SDValue Chain,
  uint64_t Offset, bool Signed,
  const ISD::InputArg *Arg) const {
  Type *MemTy = MemVT.getTypeForEVT(*DAG.getContext());

  // There is no IR value for the kernarg segment; describe the access with an
  // undef pointer of the right type in the constant address space.
  PointerType *MemPtrTy =
      PointerType::get(MemTy, AMDGPUASI.CONSTANT_ADDRESS);
  MachinePointerInfo PtrInfo(UndefValue::get(MemPtrTy));

  const unsigned ABIAlign = DAG.getDataLayout().getABITypeAlignment(MemTy);
  const auto LoadFlags = MachineMemOperand::MONonTemporal |
                         MachineMemOperand::MODereferenceable |
                         MachineMemOperand::MOInvariant;

  SDValue ArgPtr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
  SDValue ArgLoad =
      DAG.getLoad(MemVT, SL, Chain, ArgPtr, PtrInfo, ABIAlign, LoadFlags);

  SDValue Converted = convertArgType(DAG, VT, MemVT, SL, ArgLoad, Signed, Arg);
  return DAG.getMergeValues({ Converted, ArgLoad.getValue(1) }, SL);
}
|
2013-03-07 17:04:14 +08:00
|
|
|
|
2017-05-18 05:56:25 +08:00
|
|
|
/// Lower an incoming argument that was assigned to a stack location.
///
/// byval arguments yield the frame index of a fixed object covering the
/// byval size. Any other argument gets an immutable fixed object and is
/// loaded from it, using the extension kind implied by the location info of
/// \p VA.
SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
                                              const SDLoc &SL, SDValue Chain,
                                              const ISD::InputArg &Arg) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();

  if (Arg.Flags.isByVal()) {
    // The object is created mutable; the caller receives its address only.
    const unsigned ByValSize = Arg.Flags.getByValSize();
    const int ByValFI =
        FrameInfo.CreateFixedObject(ByValSize, VA.getLocMemOffset(), false);
    return DAG.getFrameIndex(ByValFI, MVT::i32);
  }

  const unsigned StackOffset = VA.getLocMemOffset();
  const unsigned StoreSize = VA.getValVT().getStoreSize();
  const int FI = FrameInfo.CreateFixedObject(StoreSize, StackOffset, true);

  // Create load nodes to retrieve arguments from the stack.
  SDValue FrameAddr = DAG.getFrameIndex(FI, MVT::i32);

  // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
  ISD::LoadExtType ExtKind = ISD::NON_EXTLOAD;
  MVT MemVT = VA.getValVT();

  const CCValAssign::LocInfo LocKind = VA.getLocInfo();
  if (LocKind == CCValAssign::BCvt)
    MemVT = VA.getLocVT();
  else if (LocKind == CCValAssign::SExt)
    ExtKind = ISD::SEXTLOAD;
  else if (LocKind == CCValAssign::ZExt)
    ExtKind = ISD::ZEXTLOAD;
  else if (LocKind == CCValAssign::AExt)
    ExtKind = ISD::EXTLOAD;
  // Every other location kind keeps the plain, non-extending load.

  return DAG.getExtLoad(ExtKind, SL, VA.getLocVT(), Chain, FrameAddr,
                        MachinePointerInfo::getFixedStack(MF, FI), MemVT);
}
|
|
|
|
|
2017-08-04 07:00:29 +08:00
|
|
|
/// Return a live-in register value of type \p VT for the preloaded argument
/// identified by \p PVID, using the argument descriptor and register class
/// that \p MFI reports for it.
SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
                                            const SIMachineFunctionInfo &MFI,
                                            EVT VT,
                                            AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
  // (argument descriptor, register class) for the requested value.
  const auto Preloaded = MFI.getPreloadedValue(PVID);
  const ArgDescriptor *ArgDesc = Preloaded.first;
  const TargetRegisterClass *RegClass = Preloaded.second;

  return CreateLiveInRegister(DAG, RegClass, ArgDesc->getRegister(), VT);
}
|
|
|
|
|
2017-04-12 06:29:24 +08:00
|
|
|
/// Pre-process the incoming arguments of a shader.
///
/// For AMDGPU_PS, the first 16 arguments that are neither inreg nor byval are
/// treated as PS inputs: unused inputs that are not already allocated are
/// recorded in \p Skipped and dropped, the others are marked allocated (and
/// enabled when used) in \p Info. Every surviving argument is appended to
/// \p Splits, with vector arguments expanded into one entry per element of
/// the original IR parameter type taken from \p FType.
static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
                                   CallingConv::ID CallConv,
                                   ArrayRef<ISD::InputArg> Ins,
                                   BitVector &Skipped,
                                   FunctionType *FType,
                                   SIMachineFunctionInfo *Info) {
  unsigned PSInputNum = 0;
  for (unsigned Idx = 0, NumIns = Ins.size(); Idx != NumIns; ++Idx) {
    const ISD::InputArg &In = Ins[Idx];

    // First check if it's a PS input addr.
    const bool IsPSInput = CallConv == CallingConv::AMDGPU_PS &&
                           !In.Flags.isInReg() && !In.Flags.isByVal() &&
                           PSInputNum <= 15;
    if (IsPSInput) {
      // Every PS input consumes one slot, whether it is kept or skipped.
      const unsigned Slot = PSInputNum++;

      if (!In.Used && !Info->isPSInputAllocated(Slot)) {
        // We can safely skip PS inputs.
        Skipped.set(Idx);
        continue;
      }

      Info->markPSInputAllocated(Slot);
      if (In.Used)
        Info->markPSInputEnabled(Slot);
    }

    // Second split vertices into their elements.
    if (!In.VT.isVector()) {
      Splits.push_back(In);
      continue;
    }

    ISD::InputArg Piece = In;
    Piece.Flags.setSplit();
    Piece.VT = In.VT.getVectorElementType();

    // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
    // three or five element vertex only needs three or five registers,
    // NOT four or eight.
    Type *OrigParamTy = FType->getParamType(In.getOrigArgIndex());
    for (unsigned Remaining = OrigParamTy->getVectorNumElements();
         Remaining != 0; --Remaining) {
      Splits.push_back(Piece);
      Piece.PartOffset += Piece.VT.getStoreSize();
    }
  }
}
|
2013-03-07 17:03:52 +08:00
|
|
|
|
2017-04-12 06:29:24 +08:00
|
|
|
// Allocate special inputs passed in VGPRs.
|
2017-08-04 07:00:29 +08:00
|
|
|
static void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
|
|
|
|
MachineFunction &MF,
|
|
|
|
const SIRegisterInfo &TRI,
|
|
|
|
SIMachineFunctionInfo &Info) {
|
2017-04-12 06:29:24 +08:00
|
|
|
if (Info.hasWorkItemIDX()) {
|
2017-08-04 07:00:29 +08:00
|
|
|
unsigned Reg = AMDGPU::VGPR0;
|
2017-04-12 06:29:24 +08:00
|
|
|
MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
|
2017-08-04 07:00:29 +08:00
|
|
|
|
2017-04-12 06:29:24 +08:00
|
|
|
CCInfo.AllocateReg(Reg);
|
2017-08-04 07:00:29 +08:00
|
|
|
Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg));
|
2017-04-12 06:29:24 +08:00
|
|
|
}
|
2013-03-07 17:03:52 +08:00
|
|
|
|
2017-04-12 06:29:24 +08:00
|
|
|
if (Info.hasWorkItemIDY()) {
|
2017-08-04 07:00:29 +08:00
|
|
|
unsigned Reg = AMDGPU::VGPR1;
|
2017-04-12 06:29:24 +08:00
|
|
|
MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
|
2017-08-04 07:00:29 +08:00
|
|
|
|
2017-04-12 06:29:24 +08:00
|
|
|
CCInfo.AllocateReg(Reg);
|
2017-08-04 07:00:29 +08:00
|
|
|
Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
|
2015-12-01 05:16:03 +08:00
|
|
|
}
|
2015-11-30 23:46:47 +08:00
|
|
|
|
2017-04-12 06:29:24 +08:00
|
|
|
if (Info.hasWorkItemIDZ()) {
|
2017-08-04 07:00:29 +08:00
|
|
|
unsigned Reg = AMDGPU::VGPR2;
|
2017-04-12 06:29:24 +08:00
|
|
|
MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
|
2017-08-04 07:00:29 +08:00
|
|
|
|
2017-04-12 06:29:24 +08:00
|
|
|
CCInfo.AllocateReg(Reg);
|
2017-08-04 07:00:29 +08:00
|
|
|
Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Try to allocate a VGPR at the end of the argument list, or if no argument
|
|
|
|
// VGPRs are left allocating a stack slot.
|
|
|
|
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) {
|
|
|
|
ArrayRef<MCPhysReg> ArgVGPRs
|
|
|
|
= makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
|
|
|
|
unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
|
|
|
|
if (RegIdx == ArgVGPRs.size()) {
|
|
|
|
// Spill to stack required.
|
|
|
|
int64_t Offset = CCInfo.AllocateStack(4, 4);
|
|
|
|
|
|
|
|
return ArgDescriptor::createStack(Offset);
|
2017-04-12 06:29:24 +08:00
|
|
|
}
|
2017-08-04 07:00:29 +08:00
|
|
|
|
|
|
|
unsigned Reg = ArgVGPRs[RegIdx];
|
|
|
|
Reg = CCInfo.AllocateReg(Reg);
|
|
|
|
assert(Reg != AMDGPU::NoRegister);
|
|
|
|
|
|
|
|
MachineFunction &MF = CCInfo.getMachineFunction();
|
|
|
|
MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
|
|
|
|
return ArgDescriptor::createRegister(Reg);
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Allocate the first free register among the first 32 entries of \p RC, add
/// it as a live-in of the function and return a register descriptor for it.
/// Aborts with a fatal error if none of the candidates is free.
///
/// NOTE(review): \p NumArgRegs is never read; the candidate count is the
/// hard-coded 32 regardless of what the caller passes (the 64-bit wrapper
/// passes 16). Confirm whether the parameter was meant to bound the search.
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
                                             const TargetRegisterClass *RC,
                                             unsigned NumArgRegs) {
  ArrayRef<MCPhysReg> Candidates = makeArrayRef(RC->begin(), 32);

  const unsigned FirstFree = CCInfo.getFirstUnallocated(Candidates);
  if (FirstFree == Candidates.size())
    report_fatal_error("ran out of SGPRs for arguments");

  unsigned Reg = CCInfo.AllocateReg(Candidates[FirstFree]);
  assert(Reg != AMDGPU::NoRegister);

  CCInfo.getMachineFunction().addLiveIn(Reg, RC);
  return ArgDescriptor::createRegister(Reg);
}
|
|
|
|
|
|
|
|
/// Allocate an input register from SGPR_32RegClass via
/// allocateSGPR32InputImpl, passing 32 as the register count.
static ArgDescriptor allocateSGPR32Input(CCState &CCInfo) {
  const TargetRegisterClass *RC = &AMDGPU::SGPR_32RegClass;
  return allocateSGPR32InputImpl(CCInfo, RC, 32);
}
|
|
|
|
|
|
|
|
/// Allocate an input register from SGPR_64RegClass via
/// allocateSGPR32InputImpl, passing 16 as the register count.
static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) {
  const TargetRegisterClass *RC = &AMDGPU::SGPR_64RegClass;
  return allocateSGPR32InputImpl(CCInfo, RC, 16);
}
|
|
|
|
|
|
|
|
/// Assign a location to each work-item ID (X, then Y, then Z) that \p Info
/// reports as present. Each one takes whatever allocateVGPR32Input hands out,
/// i.e. a free argument VGPR or a stack slot. \p MF and \p TRI are not used.
static void allocateSpecialInputVGPRs(CCState &CCInfo,
                                      MachineFunction &MF,
                                      const SIRegisterInfo &TRI,
                                      SIMachineFunctionInfo &Info) {
  if (Info.hasWorkItemIDX()) {
    Info.setWorkItemIDX(allocateVGPR32Input(CCInfo));
  }

  if (Info.hasWorkItemIDY()) {
    Info.setWorkItemIDY(allocateVGPR32Input(CCInfo));
  }

  if (Info.hasWorkItemIDZ()) {
    Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo));
  }
}
|
|
|
|
|
|
|
|
/// Assign SGPRs to the special inputs that \p Info reports as present and
/// store the resulting descriptors in the function's argument info.
///
/// Registers are handed out first-free, so the sequence below determines
/// which register each input receives: the 64-bit dispatch, queue and kernarg
/// segment pointers and the dispatch ID, then the 32-bit work-group IDs
/// X/Y/Z, then the 64-bit implicit argument pointer. \p MF and \p TRI are
/// not used.
static void allocateSpecialInputSGPRs(CCState &CCInfo,
                                      MachineFunction &MF,
                                      const SIRegisterInfo &TRI,
                                      SIMachineFunctionInfo &Info) {
  auto &Args = Info.getArgInfo();

  // TODO: Unify handling with private memory pointers.

  if (Info.hasDispatchPtr()) {
    Args.DispatchPtr = allocateSGPR64Input(CCInfo);
  }

  if (Info.hasQueuePtr()) {
    Args.QueuePtr = allocateSGPR64Input(CCInfo);
  }

  if (Info.hasKernargSegmentPtr()) {
    Args.KernargSegmentPtr = allocateSGPR64Input(CCInfo);
  }

  if (Info.hasDispatchID()) {
    Args.DispatchID = allocateSGPR64Input(CCInfo);
  }

  // flat_scratch_init is not applicable for non-kernel functions.

  if (Info.hasWorkGroupIDX()) {
    Args.WorkGroupIDX = allocateSGPR32Input(CCInfo);
  }

  if (Info.hasWorkGroupIDY()) {
    Args.WorkGroupIDY = allocateSGPR32Input(CCInfo);
  }

  if (Info.hasWorkGroupIDZ()) {
    Args.WorkGroupIDZ = allocateSGPR32Input(CCInfo);
  }

  if (Info.hasImplicitArgPtr()) {
    Args.ImplicitArgPtr = allocateSGPR64Input(CCInfo);
  }
}
|
|
|
|
|
|
|
|
// Allocate special inputs passed in user SGPRs.
|
|
|
|
static void allocateHSAUserSGPRs(CCState &CCInfo,
|
|
|
|
MachineFunction &MF,
|
|
|
|
const SIRegisterInfo &TRI,
|
|
|
|
SIMachineFunctionInfo &Info) {
|
2017-06-26 11:01:31 +08:00
|
|
|
if (Info.hasImplicitBufferPtr()) {
|
|
|
|
unsigned ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
|
|
|
|
MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
|
|
|
|
CCInfo.AllocateReg(ImplicitBufferPtrReg);
|
2017-01-25 09:25:13 +08:00
|
|
|
}
|
|
|
|
|
2015-12-01 05:16:03 +08:00
|
|
|
// FIXME: How should these inputs interact with inreg / custom SGPR inputs?
|
2017-04-12 06:29:24 +08:00
|
|
|
if (Info.hasPrivateSegmentBuffer()) {
|
|
|
|
unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
|
|
|
|
MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
|
2015-12-01 05:16:03 +08:00
|
|
|
CCInfo.AllocateReg(PrivateSegmentBufferReg);
|
|
|
|
}
|
2015-11-30 23:46:47 +08:00
|
|
|
|
2017-04-12 06:29:24 +08:00
|
|
|
if (Info.hasDispatchPtr()) {
|
|
|
|
unsigned DispatchPtrReg = Info.addDispatchPtr(TRI);
|
2016-11-30 03:39:48 +08:00
|
|
|
MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
|
2015-12-01 05:16:03 +08:00
|
|
|
CCInfo.AllocateReg(DispatchPtrReg);
|
2013-06-04 01:40:11 +08:00
|
|
|
}
|
|
|
|
|
2017-04-12 06:29:24 +08:00
|
|
|
if (Info.hasQueuePtr()) {
|
|
|
|
unsigned QueuePtrReg = Info.addQueuePtr(TRI);
|
2016-11-30 03:39:48 +08:00
|
|
|
MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
|
2016-04-26 03:27:18 +08:00
|
|
|
CCInfo.AllocateReg(QueuePtrReg);
|
|
|
|
}
|
|
|
|
|
2017-04-12 06:29:24 +08:00
|
|
|
if (Info.hasKernargSegmentPtr()) {
|
|
|
|
unsigned InputPtrReg = Info.addKernargSegmentPtr(TRI);
|
2016-11-30 03:39:48 +08:00
|
|
|
MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
|
2015-12-01 05:16:03 +08:00
|
|
|
CCInfo.AllocateReg(InputPtrReg);
|
2013-10-23 08:44:32 +08:00
|
|
|
}
|
|
|
|
|
2017-04-12 06:29:24 +08:00
|
|
|
if (Info.hasDispatchID()) {
|
|
|
|
unsigned DispatchIDReg = Info.addDispatchID(TRI);
|
2016-11-30 03:39:48 +08:00
|
|
|
MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
|
2016-07-23 01:01:30 +08:00
|
|
|
CCInfo.AllocateReg(DispatchIDReg);
|
|
|
|
}
|
|
|
|
|
2017-04-12 06:29:24 +08:00
|
|
|
if (Info.hasFlatScratchInit()) {
|
|
|
|
unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI);
|
2016-11-30 03:39:48 +08:00
|
|
|
MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
|
2016-02-12 14:31:30 +08:00
|
|
|
CCInfo.AllocateReg(FlatScratchInitReg);
|
|
|
|
}
|
|
|
|
|
2015-12-01 05:16:03 +08:00
|
|
|
// TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
|
|
|
|
// these from the dispatch pointer.
|
2017-04-12 06:29:24 +08:00
|
|
|
}
|
2015-12-01 05:16:03 +08:00
|
|
|
|
2017-04-12 06:29:24 +08:00
|
|
|
// Allocate special input registers that are initialized per-wave.
|
|
|
|
static void allocateSystemSGPRs(CCState &CCInfo,
|
|
|
|
MachineFunction &MF,
|
|
|
|
SIMachineFunctionInfo &Info,
|
2017-05-05 06:25:20 +08:00
|
|
|
CallingConv::ID CallConv,
|
2017-04-12 06:29:24 +08:00
|
|
|
bool IsShader) {
|
|
|
|
if (Info.hasWorkGroupIDX()) {
|
|
|
|
unsigned Reg = Info.addWorkGroupIDX();
|
2016-11-26 01:37:09 +08:00
|
|
|
MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
|
2015-12-01 05:16:03 +08:00
|
|
|
CCInfo.AllocateReg(Reg);
|
2016-04-15 00:27:03 +08:00
|
|
|
}
|
2015-12-01 05:16:03 +08:00
|
|
|
|
2017-04-12 06:29:24 +08:00
|
|
|
if (Info.hasWorkGroupIDY()) {
|
|
|
|
unsigned Reg = Info.addWorkGroupIDY();
|
2016-11-26 01:37:09 +08:00
|
|
|
MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
|
2015-12-01 05:16:03 +08:00
|
|
|
CCInfo.AllocateReg(Reg);
|
|
|
|
}
|
|
|
|
|
2017-04-12 06:29:24 +08:00
|
|
|
if (Info.hasWorkGroupIDZ()) {
|
|
|
|
unsigned Reg = Info.addWorkGroupIDZ();
|
2016-11-26 01:37:09 +08:00
|
|
|
MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
|
2015-12-01 05:16:03 +08:00
|
|
|
CCInfo.AllocateReg(Reg);
|
|
|
|
}
|
|
|
|
|
2017-04-12 06:29:24 +08:00
|
|
|
if (Info.hasWorkGroupInfo()) {
|
|
|
|
unsigned Reg = Info.addWorkGroupInfo();
|
2016-11-26 01:37:09 +08:00
|
|
|
MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
|
2015-12-01 05:16:03 +08:00
|
|
|
CCInfo.AllocateReg(Reg);
|
2015-01-21 03:33:04 +08:00
|
|
|
}
|
2015-07-11 06:51:36 +08:00
|
|
|
|
2017-04-12 06:29:24 +08:00
|
|
|
if (Info.hasPrivateSegmentWaveByteOffset()) {
|
2015-12-01 05:16:03 +08:00
|
|
|
// Scratch wave offset passed in system SGPR.
|
2016-04-15 00:27:03 +08:00
|
|
|
unsigned PrivateSegmentWaveByteOffsetReg;
|
|
|
|
|
2017-04-12 06:29:24 +08:00
|
|
|
if (IsShader) {
|
2017-05-05 06:25:20 +08:00
|
|
|
PrivateSegmentWaveByteOffsetReg =
|
|
|
|
Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
|
|
|
|
|
|
|
|
// This is true if the scratch wave byte offset doesn't have a fixed
|
|
|
|
// location.
|
|
|
|
if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
|
|
|
|
PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
|
|
|
|
Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
|
|
|
|
}
|
2016-04-15 00:27:03 +08:00
|
|
|
} else
|
2017-04-12 06:29:24 +08:00
|
|
|
PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
|
2015-12-01 05:16:03 +08:00
|
|
|
|
|
|
|
MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
|
|
|
|
CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
|
|
|
|
}
|
2017-04-12 06:29:24 +08:00
|
|
|
}
|
2015-12-01 05:16:03 +08:00
|
|
|
|
2017-04-12 06:29:24 +08:00
|
|
|
static void reservePrivateMemoryRegs(const TargetMachine &TM,
|
|
|
|
MachineFunction &MF,
|
|
|
|
const SIRegisterInfo &TRI,
|
2017-07-19 00:44:56 +08:00
|
|
|
SIMachineFunctionInfo &Info) {
|
2015-12-01 05:16:03 +08:00
|
|
|
// Now that we've figured out where the scratch register inputs are, see if
|
|
|
|
// should reserve the arguments and use them directly.
|
2017-05-18 05:56:25 +08:00
|
|
|
MachineFrameInfo &MFI = MF.getFrameInfo();
|
|
|
|
bool HasStackObjects = MFI.hasStackObjects();
|
2017-04-12 06:29:24 +08:00
|
|
|
|
2016-02-12 14:31:30 +08:00
|
|
|
// Record that we know we have non-spill stack objects so we don't need to
|
|
|
|
// check all stack objects later.
|
|
|
|
if (HasStackObjects)
|
2017-04-12 06:29:24 +08:00
|
|
|
Info.setHasNonSpillStackObjects(true);
|
2015-12-01 05:16:03 +08:00
|
|
|
|
2016-10-13 21:10:00 +08:00
|
|
|
// Everything live out of a block is spilled with fast regalloc, so it's
|
|
|
|
// almost certain that spilling will be required.
|
2017-04-12 06:29:24 +08:00
|
|
|
if (TM.getOptLevel() == CodeGenOpt::None)
|
2016-10-13 21:10:00 +08:00
|
|
|
HasStackObjects = true;
|
|
|
|
|
2017-08-02 03:54:18 +08:00
|
|
|
// For now assume stack access is needed in any callee functions, so we need
|
|
|
|
// the scratch registers to pass in.
|
|
|
|
bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
|
|
|
|
|
2017-04-12 06:29:24 +08:00
|
|
|
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
|
2017-01-25 09:25:13 +08:00
|
|
|
if (ST.isAmdCodeObjectV2(MF)) {
|
2017-08-02 03:54:18 +08:00
|
|
|
if (RequiresStackAccess) {
|
2015-12-01 05:16:03 +08:00
|
|
|
// If we have stack objects, we unquestionably need the private buffer
|
2016-09-17 05:34:26 +08:00
|
|
|
// resource. For the Code Object V2 ABI, this will be the first 4 user
|
|
|
|
// SGPR inputs. We can reserve those and use them directly.
|
2015-12-01 05:16:03 +08:00
|
|
|
|
2017-08-04 07:00:29 +08:00
|
|
|
unsigned PrivateSegmentBufferReg = Info.getPreloadedReg(
|
|
|
|
AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
|
2017-04-12 06:29:24 +08:00
|
|
|
Info.setScratchRSrcReg(PrivateSegmentBufferReg);
|
2015-12-01 05:16:03 +08:00
|
|
|
|
2017-08-02 03:54:18 +08:00
|
|
|
if (MFI.hasCalls()) {
|
|
|
|
// If we have calls, we need to keep the frame register in a register
|
|
|
|
// that won't be clobbered by a call, so ensure it is copied somewhere.
|
|
|
|
|
|
|
|
// This is not a problem for the scratch wave offset, because the same
|
|
|
|
// registers are reserved in all functions.
|
|
|
|
|
|
|
|
// FIXME: Nothing is really ensuring this is a call preserved register,
|
|
|
|
// it's just selected from the end so it happens to be.
|
|
|
|
unsigned ReservedOffsetReg
|
|
|
|
= TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
|
|
|
|
Info.setScratchWaveOffsetReg(ReservedOffsetReg);
|
|
|
|
} else {
|
2017-08-04 07:00:29 +08:00
|
|
|
unsigned PrivateSegmentWaveByteOffsetReg = Info.getPreloadedReg(
|
|
|
|
AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
|
2017-08-02 03:54:18 +08:00
|
|
|
Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
|
|
|
|
}
|
2015-12-01 05:16:03 +08:00
|
|
|
} else {
|
|
|
|
unsigned ReservedBufferReg
|
2017-04-12 06:29:24 +08:00
|
|
|
= TRI.reservedPrivateSegmentBufferReg(MF);
|
2015-12-01 05:16:03 +08:00
|
|
|
unsigned ReservedOffsetReg
|
2017-04-12 06:29:24 +08:00
|
|
|
= TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
|
2015-12-01 05:16:03 +08:00
|
|
|
|
|
|
|
// We tentatively reserve the last registers (skipping the last two
|
|
|
|
// which may contain VCC). After register allocation, we'll replace
|
|
|
|
// these with the ones immediately after those which were really
|
|
|
|
// allocated. In the prologue copies will be inserted from the argument
|
|
|
|
// to these reserved registers.
|
2017-04-12 06:29:24 +08:00
|
|
|
Info.setScratchRSrcReg(ReservedBufferReg);
|
|
|
|
Info.setScratchWaveOffsetReg(ReservedOffsetReg);
|
2015-12-01 05:16:03 +08:00
|
|
|
}
|
|
|
|
} else {
|
2017-04-12 06:29:24 +08:00
|
|
|
unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
|
2015-12-01 05:16:03 +08:00
|
|
|
|
|
|
|
// Without HSA, relocations are used for the scratch pointer and the
|
|
|
|
// buffer resource setup is always inserted in the prologue. Scratch wave
|
|
|
|
// offset is still in an input SGPR.
|
2017-04-12 06:29:24 +08:00
|
|
|
Info.setScratchRSrcReg(ReservedBufferReg);
|
2015-12-01 05:16:03 +08:00
|
|
|
|
2017-08-02 03:54:18 +08:00
|
|
|
if (HasStackObjects && !MFI.hasCalls()) {
|
2017-08-04 07:00:29 +08:00
|
|
|
unsigned ScratchWaveOffsetReg = Info.getPreloadedReg(
|
|
|
|
AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
|
2017-04-12 06:29:24 +08:00
|
|
|
Info.setScratchWaveOffsetReg(ScratchWaveOffsetReg);
|
2015-12-01 05:16:03 +08:00
|
|
|
} else {
|
|
|
|
unsigned ReservedOffsetReg
|
2017-04-12 06:29:24 +08:00
|
|
|
= TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
|
|
|
|
Info.setScratchWaveOffsetReg(ReservedOffsetReg);
|
2015-12-01 05:16:03 +08:00
|
|
|
}
|
|
|
|
}
|
2017-04-12 06:29:24 +08:00
|
|
|
}
|
2015-12-01 05:16:03 +08:00
|
|
|
|
2017-08-02 03:54:18 +08:00
|
|
|
/// Split callee-saved-register handling is reported as supported for every
/// function except entry functions.
bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
  return !MF->getInfo<SIMachineFunctionInfo>()->isEntryFunction();
}
|
|
|
|
|
|
|
|
/// Intentionally a no-op: this target performs no per-function setup here.
/// The copies themselves are created in insertCopiesSplitCSR().
void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
}
|
|
|
|
|
|
|
|
/// Preserve the callee-saved registers that are handled via copies.
///
/// For every register in the null-terminated list returned by
/// getCalleeSavedRegsViaCopy(), a COPY into a fresh virtual register is
/// inserted at the start of \p Entry, and a COPY back into the physical
/// register is inserted before the first terminator of every block in
/// \p Exits. Does nothing when the list is null.
void SITargetLowering::insertCopiesSplitCSR(
    MachineBasicBlock *Entry,
    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  auto *ParentFn = Entry->getParent();
  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();

  const MCPhysReg *CSRList = TRI->getCalleeSavedRegsViaCopy(ParentFn);
  if (!CSRList)
    return;

  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  MachineRegisterInfo &RegInfo = ParentFn->getRegInfo();
  MachineBasicBlock::iterator EntryInsertPt = Entry->begin();

  for (const MCPhysReg *Cursor = CSRList; *Cursor != 0; ++Cursor) {
    const MCPhysReg PhysReg = *Cursor;

    // Only 64-bit and 32-bit scalar registers are expected in the list.
    const TargetRegisterClass *CopyRC = nullptr;
    if (AMDGPU::SReg_64RegClass.contains(PhysReg)) {
      CopyRC = &AMDGPU::SGPR_64RegClass;
    } else if (AMDGPU::SReg_32RegClass.contains(PhysReg)) {
      CopyRC = &AMDGPU::SGPR_32RegClass;
    } else {
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");
    }

    const unsigned SavedVReg = RegInfo.createVirtualRegister(CopyRC);

    // Create copy from CSR to a virtual register.
    Entry->addLiveIn(PhysReg);
    BuildMI(*Entry, EntryInsertPt, DebugLoc(), TII->get(TargetOpcode::COPY),
            SavedVReg)
        .addReg(PhysReg);

    // Insert the copy-back instructions right before the terminator.
    for (MachineBasicBlock *ExitMBB : Exits) {
      BuildMI(*ExitMBB, ExitMBB->getFirstTerminator(), DebugLoc(),
              TII->get(TargetOpcode::COPY), PhysReg)
          .addReg(SavedVReg);
    }
  }
}
|
|
|
|
|
2017-04-12 06:29:24 +08:00
|
|
|
/// Lower the incoming formal arguments of the current function.
///
/// One value is appended to \p InVals for every entry of \p Ins: UNDEF for
/// arguments marked as skipped, a kernarg load for memory-located arguments
/// of entry functions, a stack load for memory-located arguments of other
/// functions, and a copy from a live-in register otherwise. Afterwards the
/// special input registers are allocated, and the argument info and the size
/// of the stack argument area are recorded.
///
/// With a shader calling convention on an HSA OS, an "unsupported"
/// diagnostic is emitted and the DAG entry node is returned immediately.
///
/// \returns \p Chain when no chained loads were produced, otherwise a
///          TokenFactor over the chains of those loads.
SDValue SITargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();

  MachineFunction &MF = DAG.getMachineFunction();
  FunctionType *FnTy = MF.getFunction()->getFunctionType();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();

  const bool IsShader = AMDGPU::isShader(CallConv);

  if (Subtarget->isAmdHsaOS() && IsShader) {
    DiagnosticInfoUnsupported NoGraphicsHSA(
        *MF.getFunction(), "unsupported non-compute shaders with HSA",
        DL.getDebugLoc());
    DAG.getContext()->diagnose(NoGraphicsHSA);
    return DAG.getEntryNode();
  }

  // Create stack objects that are used for emitting debugger prologue if
  // "amdgpu-debugger-emit-prologue" attribute was specified.
  if (ST.debuggerEmitPrologue())
    createDebuggerPrologueStackObjects(MF);

  SmallVector<ISD::InputArg, 16> Splits;
  SmallVector<CCValAssign, 16> ArgLocs;
  BitVector Skipped(Ins.size());
  CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());

  const bool IsKernel = AMDGPU::isKernel(CallConv);
  const bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);

  // 4 bytes are reserved at offset 0 for the emergency stack slot. Skip over
  // this when allocating argument fixed offsets.
  if (!IsEntryFunc)
    CCInfo.AllocateStack(4, 4);

  if (IsShader) {
    processShaderInputArgs(Splits, CallConv, Ins, Skipped, FnTy, Info);

    // At least one interpolation mode must be enabled or else the GPU will
    // hang.
    //
    // Check PSInputAddr instead of PSInputEnable. The idea is that if the
    // user set PSInputAddr, the user wants to enable some bits after the
    // compilation based on run-time states. Since we can't know what the
    // final PSInputEna will look like, we shouldn't do anything here and the
    // user should take responsibility for the correct programming.
    //
    // Otherwise, the following restrictions apply:
    // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
    // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
    //   enabled too.
    if (CallConv == CallingConv::AMDGPU_PS) {
      const bool NoInterpMode = (Info->getPSInputAddr() & 0x7F) == 0;
      if (NoInterpMode || ((Info->getPSInputAddr() & 0xF) == 0 &&
                           Info->isPSInputAllocated(11))) {
        CCInfo.AllocateReg(AMDGPU::VGPR0);
        CCInfo.AllocateReg(AMDGPU::VGPR1);
        Info->markPSInputAllocated(0);
        Info->markPSInputEnabled(0);
      }
    }

    assert(!Info->hasDispatchPtr() &&
           !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
           !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
           !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
           !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
           !Info->hasWorkItemIDZ());
  } else if (IsKernel) {
    assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
  } else {
    Splits.append(Ins.begin(), Ins.end());
  }

  if (IsEntryFunc) {
    allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
    allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
  }

  if (IsKernel)
    analyzeFormalArgumentsCompute(CCInfo, Ins);
  else
    CCInfo.AnalyzeFormalArguments(Splits,
                                  CCAssignFnForCall(CallConv, isVarArg));

  SmallVector<SDValue, 16> Chains;

  // LocIdx walks ArgLocs; it advances only for arguments that were not
  // skipped, and additionally once per extra element of a shader vector.
  unsigned LocIdx = 0;
  for (unsigned ArgNo = 0, NumArgs = Ins.size(); ArgNo != NumArgs; ++ArgNo) {
    const ISD::InputArg &Arg = Ins[ArgNo];
    if (Skipped[ArgNo]) {
      InVals.push_back(DAG.getUNDEF(Arg.VT));
      continue;
    }

    CCValAssign &VA = ArgLocs[LocIdx++];

    if (VA.isMemLoc()) {
      if (!IsEntryFunc) {
        // Argument of a callable function passed on the stack.
        SDValue StackVal = lowerStackParameter(DAG, VA, DL, Chain, Arg);
        InVals.push_back(StackVal);
        if (!Arg.Flags.isByVal())
          Chains.push_back(StackVal.getValue(1));
        continue;
      }

      // Entry function argument loaded from the kernarg segment.
      const EVT MemVT = VA.getLocVT();
      const uint64_t Offset = Subtarget->getExplicitKernelArgOffset(MF) +
                              VA.getLocMemOffset();
      Info->setABIArgOffset(Offset + MemVT.getStoreSize());

      // The first 36 bytes of the input buffer contains information about
      // thread group and global sizes.
      SDValue KernArg = lowerKernargMemParameter(
          DAG, Arg.VT, MemVT, DL, Chain, Offset, Arg.Flags.isSExt(), &Arg);
      Chains.push_back(KernArg.getValue(1));

      auto *PtrTy =
          dyn_cast<PointerType>(FnTy->getParamType(Arg.getOrigArgIndex()));
      const bool IsSILocalPtr =
          Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
          PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
      if (IsSILocalPtr) {
        // On SI local pointers are just offsets into LDS, so they are always
        // less than 16-bits. On CI and newer they could potentially be
        // real pointers, so we can't guarantee their size.
        KernArg = DAG.getNode(ISD::AssertZext, DL, KernArg.getValueType(),
                              KernArg, DAG.getValueType(MVT::i16));
      }

      InVals.push_back(KernArg);
      continue;
    }

    assert(VA.isRegLoc() && "Parameter must be in a register!");

    const MVT LocVT = VA.getLocVT();
    const EVT ValVT = VA.getValVT();
    const TargetRegisterClass *RC =
        TRI->getMinimalPhysRegClass(VA.getLocReg(), LocVT);

    const unsigned LiveInVReg = MF.addLiveIn(VA.getLocReg(), RC);
    SDValue Val = DAG.getCopyFromReg(Chain, DL, LiveInVReg, LocVT);

    // If this is an 8 or 16-bit value, it is really passed promoted
    // to 32 bits. Insert an assert[sz]ext to capture this, then
    // truncate to the right size.
    const CCValAssign::LocInfo LocInfo = VA.getLocInfo();
    switch (LocInfo) {
    case CCValAssign::Full:
      break;
    case CCValAssign::BCvt:
      Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
      break;
    case CCValAssign::SExt:
    case CCValAssign::ZExt: {
      const unsigned AssertOpc = LocInfo == CCValAssign::SExt
                                     ? ISD::AssertSext
                                     : ISD::AssertZext;
      Val = DAG.getNode(AssertOpc, DL, LocVT, Val, DAG.getValueType(ValVT));
      Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
      break;
    }
    case CCValAssign::AExt:
      Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
      break;
    default:
      llvm_unreachable("Unknown loc info!");
    }

    if (!IsShader || !Arg.VT.isVector()) {
      InVals.push_back(Val);
      continue;
    }

    // Build a vector from the registers. The first element is the value
    // copied above; each remaining element consumes one more ArgLocs entry.
    Type *ParamType = FnTy->getParamType(Arg.getOrigArgIndex());
    const unsigned NumParamElts = ParamType->getVectorNumElements();

    SmallVector<SDValue, 4> Elts;
    Elts.push_back(Val);
    for (unsigned Elt = 1; Elt != NumParamElts; ++Elt) {
      const unsigned EltVReg =
          MF.addLiveIn(ArgLocs[LocIdx++].getLocReg(), RC);
      Elts.push_back(DAG.getCopyFromReg(Chain, DL, EltVReg, LocVT));
    }

    // Fill up the missing vector elements
    const unsigned NumPadElts = Arg.VT.getVectorNumElements() - NumParamElts;
    Elts.append(NumPadElts, DAG.getUNDEF(LocVT));

    InVals.push_back(DAG.getBuildVector(Arg.VT, DL, Elts));
  }

  if (IsEntryFunc) {
    // Start adding system SGPRs.
    allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
  } else {
    // Special inputs come after user arguments.
    allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);

    CCInfo.AllocateReg(Info->getScratchRSrcReg());
    CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
    CCInfo.AllocateReg(Info->getFrameOffsetReg());
    allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
  }

  auto &ArgUsageInfo =
      DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
  ArgUsageInfo.setFuncArgInfo(*MF.getFunction(), Info->getArgInfo());

  Info->setBytesInStackArgArea(CCInfo.getNextStackOffset());

  if (Chains.empty())
    return Chain;
  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
}
|
|
|
|
|
2017-05-18 05:56:25 +08:00
|
|
|
// TODO: If return values can't fit in registers, we should return as many as
|
|
|
|
// possible in registers before passing on stack.
|
|
|
|
bool SITargetLowering::CanLowerReturn(
|
|
|
|
CallingConv::ID CallConv,
|
|
|
|
MachineFunction &MF, bool IsVarArg,
|
|
|
|
const SmallVectorImpl<ISD::OutputArg> &Outs,
|
|
|
|
LLVMContext &Context) const {
|
|
|
|
// Replacing returns with sret/stack usage doesn't make sense for shaders.
|
|
|
|
// FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
|
|
|
|
// for shaders. Vector types should be explicitly handled by CC.
|
|
|
|
if (AMDGPU::isEntryFunctionCC(CallConv))
|
|
|
|
return true;
|
|
|
|
|
|
|
|
SmallVector<CCValAssign, 16> RVLocs;
|
|
|
|
CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
|
|
|
|
return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg));
|
|
|
|
}
|
|
|
|
|
2016-06-12 23:39:02 +08:00
|
|
|
/// Lower a return of \p OutVals under \p CallConv.
///
/// Kernel calling conventions are forwarded to
/// AMDGPUTargetLowering::LowerReturn. Otherwise shader vector outputs are
/// split into their elements, each value is copied into the register chosen
/// by the return calling convention, and the node built is ENDPGM (shader
/// returning nothing), RETURN_TO_EPILOG (other shaders) or RET_FLAG
/// (non-shaders). For non-entry functions the return address register and
/// the callee-saved registers handled via copy are added as operands.
SDValue
SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                              bool isVarArg,
                              const SmallVectorImpl<ISD::OutputArg> &Outs,
                              const SmallVectorImpl<SDValue> &OutVals,
                              const SDLoc &DL, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  if (AMDGPU::isKernel(CallConv))
    return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
                                             OutVals, DL, DAG);

  const bool IsShader = AMDGPU::isShader(CallConv);

  Info->setIfReturnsVoid(Outs.empty());
  const bool IsWaveEnd = Info->returnsVoid() && IsShader;

  SmallVector<ISD::OutputArg, 48> Splits;
  SmallVector<SDValue, 48> SplitVals;

  // Split vectors into their elements.
  for (unsigned OutIdx = 0, NumOuts = Outs.size(); OutIdx != NumOuts;
       ++OutIdx) {
    const ISD::OutputArg &Out = Outs[OutIdx];

    if (!IsShader || !Out.VT.isVector()) {
      SplitVals.push_back(OutVals[OutIdx]);
      Splits.push_back(Out);
      continue;
    }

    const MVT EltVT = Out.VT.getVectorElementType();
    ISD::OutputArg EltOut = Out;
    EltOut.Flags.setSplit();
    EltOut.VT = EltVT;

    // We want the original number of vector elements here, e.g.
    // three or five, not four or eight.
    const unsigned NumElts = Out.ArgVT.getVectorNumElements();

    for (unsigned Elt = 0; Elt != NumElts; ++Elt) {
      SplitVals.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
                                      OutVals[OutIdx],
                                      DAG.getConstant(Elt, DL, MVT::i32)));
      Splits.push_back(EltOut);
      EltOut.PartOffset += EltOut.VT.getStoreSize();
    }
  }

  // CCValAssign - represent the assignment of the return value to a location.
  SmallVector<CCValAssign, 48> RVLocs;

  // CCState - Info about the registers and stack slots.
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());

  // Analyze outgoing return values.
  CCInfo.AnalyzeReturn(Splits, CCAssignFnForReturn(CallConv, isVarArg));

  SDValue Glue;
  SmallVector<SDValue, 48> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)

  const bool IsCallable = !Info->isEntryFunction();
  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();

  // Add return address for callable functions.
  if (IsCallable) {
    const unsigned RetAddrPhysReg = TRI->getReturnAddressReg(MF);
    SDValue RetAddrLiveIn = CreateLiveInRegister(
        DAG, &AMDGPU::SReg_64RegClass, RetAddrPhysReg, MVT::i64);

    // FIXME: Should be able to use a vreg here, but need a way to prevent it
    // from being allocated to a CSR.
    SDValue RetAddrRegNode = DAG.getRegister(RetAddrPhysReg, MVT::i64);

    Chain = DAG.getCopyToReg(Chain, DL, RetAddrRegNode, RetAddrLiveIn, Glue);
    Glue = Chain.getValue(1);

    RetOps.push_back(RetAddrRegNode);
  }

  // Copy the result values into the output registers. RVLocs and SplitVals
  // are walked in lock step.
  for (unsigned LocIdx = 0, NumLocs = RVLocs.size(); LocIdx != NumLocs;
       ++LocIdx) {
    CCValAssign &VA = RVLocs[LocIdx];
    assert(VA.isRegLoc() && "Can only return in registers!");
    // TODO: Partially return in registers if return values don't fit.

    SDValue RetVal = SplitVals[LocIdx];

    // Copied from other backends: convert the value to its location type.
    bool NeedsConversion = true;
    unsigned ConvOpc = ISD::BITCAST;
    switch (VA.getLocInfo()) {
    case CCValAssign::Full:
      NeedsConversion = false;
      break;
    case CCValAssign::BCvt:
      ConvOpc = ISD::BITCAST;
      break;
    case CCValAssign::SExt:
      ConvOpc = ISD::SIGN_EXTEND;
      break;
    case CCValAssign::ZExt:
      ConvOpc = ISD::ZERO_EXTEND;
      break;
    case CCValAssign::AExt:
      ConvOpc = ISD::ANY_EXTEND;
      break;
    default:
      llvm_unreachable("Unknown loc info!");
    }
    if (NeedsConversion)
      RetVal = DAG.getNode(ConvOpc, DL, VA.getLocVT(), RetVal);

    Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), RetVal, Glue);
    Glue = Chain.getValue(1);
    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
  }

  // FIXME: Does sret work properly?
  if (IsCallable) {
    // Append the callee-saved registers that are preserved via copies.
    if (const MCPhysReg *CSR = TRI->getCalleeSavedRegsViaCopy(&MF)) {
      for (; *CSR; ++CSR) {
        MVT RegVT;
        if (AMDGPU::SReg_64RegClass.contains(*CSR))
          RegVT = MVT::i64;
        else if (AMDGPU::SReg_32RegClass.contains(*CSR))
          RegVT = MVT::i32;
        else
          llvm_unreachable("Unexpected register class in CSRsViaCopy!");

        RetOps.push_back(DAG.getRegister(*CSR, RegVT));
      }
    }
  }

  // Update chain and glue.
  RetOps[0] = Chain;
  if (Glue.getNode())
    RetOps.push_back(Glue);

  unsigned Opc;
  if (IsWaveEnd)
    Opc = AMDGPUISD::ENDPGM;
  else if (IsShader)
    Opc = AMDGPUISD::RETURN_TO_EPILOG;
  else
    Opc = AMDGPUISD::RET_FLAG;

  return DAG.getNode(Opc, DL, MVT::Other, RetOps);
}
|
|
|
|
|
2017-08-02 03:54:18 +08:00
|
|
|
/// Copy the values returned by a call out of their assigned physical
/// registers, appending one value per return location to \p InVals.
///
/// Each copy is chained and glued to the previous one, starting from
/// \p Chain and \p InFlag. Values that were extended or bitcast for their
/// location are converted back to their value type. Return values located
/// in memory are not implemented and cause a fatal error.
///
/// \p IsThisReturn and \p ThisVal are not read by this function.
///
/// \returns the chain produced by the last register copy.
SDValue SITargetLowering::LowerCallResult(
    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
    SDValue ThisVal) const {
  CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);

  // Assign locations to each value returned by this call.
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeCallResult(Ins, RetCC);

  // Copy all of the result registers out of their specified physreg.
  for (const CCValAssign &VA : RVLocs) {
    if (!VA.isRegLoc()) {
      if (VA.isMemLoc())
        report_fatal_error("TODO: return values in memory");
      llvm_unreachable("unknown argument location type");
    }

    SDValue Val =
        DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
    Chain = Val.getValue(1);
    InFlag = Val.getValue(2);

    const CCValAssign::LocInfo LocInfo = VA.getLocInfo();
    switch (LocInfo) {
    case CCValAssign::Full:
      break;
    case CCValAssign::BCvt:
      Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
      break;
    case CCValAssign::ZExt:
    case CCValAssign::SExt: {
      // Record which extension was applied, then narrow to the value type.
      const unsigned AssertOpc = LocInfo == CCValAssign::ZExt
                                     ? ISD::AssertZext
                                     : ISD::AssertSext;
      Val = DAG.getNode(AssertOpc, DL, VA.getLocVT(), Val,
                        DAG.getValueType(VA.getValVT()));
      Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
      break;
    }
    case CCValAssign::AExt:
      Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
      break;
    default:
      llvm_unreachable("Unknown loc info!");
    }

    InVals.push_back(Val);
  }

  return Chain;
}
|
|
|
|
|
2017-08-04 07:00:29 +08:00
|
|
|
// Add code to pass special inputs required depending on used features separate
|
|
|
|
// from the explicit user arguments present in the IR.
|
|
|
|
void SITargetLowering::passSpecialInputs(
|
|
|
|
CallLoweringInfo &CLI,
|
|
|
|
const SIMachineFunctionInfo &Info,
|
|
|
|
SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
|
|
|
|
SmallVectorImpl<SDValue> &MemOpChains,
|
|
|
|
SDValue Chain,
|
|
|
|
SDValue StackPtr) const {
|
|
|
|
// If we don't have a call site, this was a call inserted by
|
|
|
|
// legalization. These can never use special inputs.
|
|
|
|
if (!CLI.CS)
|
|
|
|
return;
|
|
|
|
|
|
|
|
const Function *CalleeFunc = CLI.CS.getCalledFunction();
|
2017-08-04 07:32:41 +08:00
|
|
|
assert(CalleeFunc);
|
2017-08-04 07:00:29 +08:00
|
|
|
|
|
|
|
SelectionDAG &DAG = CLI.DAG;
|
|
|
|
const SDLoc &DL = CLI.DL;
|
|
|
|
|
|
|
|
const SISubtarget *ST = getSubtarget();
|
|
|
|
const SIRegisterInfo *TRI = ST->getRegisterInfo();
|
|
|
|
|
|
|
|
auto &ArgUsageInfo =
|
|
|
|
DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
|
|
|
|
const AMDGPUFunctionArgInfo &CalleeArgInfo
|
|
|
|
= ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
|
|
|
|
|
|
|
|
const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
|
|
|
|
|
|
|
|
// TODO: Unify with private memory register handling. This is complicated by
|
|
|
|
// the fact that at least in kernels, the input argument is not necessarily
|
|
|
|
// in the same location as the input.
|
|
|
|
AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
|
|
|
|
AMDGPUFunctionArgInfo::DISPATCH_PTR,
|
|
|
|
AMDGPUFunctionArgInfo::QUEUE_PTR,
|
|
|
|
AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR,
|
|
|
|
AMDGPUFunctionArgInfo::DISPATCH_ID,
|
|
|
|
AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
|
|
|
|
AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
|
|
|
|
AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
|
|
|
|
AMDGPUFunctionArgInfo::WORKITEM_ID_X,
|
|
|
|
AMDGPUFunctionArgInfo::WORKITEM_ID_Y,
|
2017-08-04 07:12:44 +08:00
|
|
|
AMDGPUFunctionArgInfo::WORKITEM_ID_Z,
|
|
|
|
AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR
|
2017-08-04 07:00:29 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
for (auto InputID : InputRegs) {
|
|
|
|
const ArgDescriptor *OutgoingArg;
|
|
|
|
const TargetRegisterClass *ArgRC;
|
|
|
|
|
|
|
|
std::tie(OutgoingArg, ArgRC) = CalleeArgInfo.getPreloadedValue(InputID);
|
|
|
|
if (!OutgoingArg)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
const ArgDescriptor *IncomingArg;
|
|
|
|
const TargetRegisterClass *IncomingArgRC;
|
|
|
|
std::tie(IncomingArg, IncomingArgRC)
|
|
|
|
= CallerArgInfo.getPreloadedValue(InputID);
|
|
|
|
assert(IncomingArgRC == ArgRC);
|
|
|
|
|
|
|
|
// All special arguments are ints for now.
|
|
|
|
EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
|
2017-08-04 07:12:44 +08:00
|
|
|
SDValue InputReg;
|
|
|
|
|
|
|
|
if (IncomingArg) {
|
|
|
|
InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
|
|
|
|
} else {
|
|
|
|
// The implicit arg ptr is special because it doesn't have a corresponding
|
|
|
|
// input for kernels, and is computed from the kernarg segment pointer.
|
|
|
|
assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
|
|
|
|
InputReg = getImplicitArgPtr(DAG, DL);
|
|
|
|
}
|
|
|
|
|
2017-08-04 07:00:29 +08:00
|
|
|
if (OutgoingArg->isRegister()) {
|
|
|
|
RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
|
|
|
|
} else {
|
|
|
|
SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, StackPtr,
|
|
|
|
InputReg,
|
|
|
|
OutgoingArg->getStackOffset());
|
|
|
|
MemOpChains.push_back(ArgStore);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-08-12 04:42:08 +08:00
|
|
|
/// Return true only for CallingConv::Fast. Consulted by
/// isEligibleForTailCallOptimization() when GuaranteedTailCallOpt is set.
static bool canGuaranteeTCO(CallingConv::ID CC) {
  switch (CC) {
  case CallingConv::Fast:
    return true;
  default:
    return false;
  }
}
|
|
|
|
|
|
|
|
/// Return true if we might ever do TCO for calls with this calling convention.
|
|
|
|
static bool mayTailCallThisCC(CallingConv::ID CC) {
|
|
|
|
switch (CC) {
|
|
|
|
case CallingConv::C:
|
|
|
|
return true;
|
|
|
|
default:
|
|
|
|
return canGuaranteeTCO(CC);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Decide whether a call to \p Callee with convention \p CalleeCC may be
/// emitted as a tail call from the current function.
///
/// The checks run in order: the callee convention must be one
/// mayTailCallThisCC() accepts; the caller must have a call-preserved
/// register mask; under GuaranteedTailCallOpt the answer is simply whether
/// canGuaranteeTCO() holds and both conventions match. Otherwise vararg
/// calls and callers with byval arguments are rejected, results must be
/// passed compatibly, the callee must preserve everything the caller does,
/// stack arguments must fit in the caller's stack argument area, and
/// arguments in callee-saved registers must match.
///
/// \p Callee is not read by this function.
bool SITargetLowering::isEligibleForTailCallOptimization(
    SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
  if (!mayTailCallThisCC(CalleeCC))
    return false;

  MachineFunction &MF = DAG.getMachineFunction();
  const Function *Caller = MF.getFunction();
  const CallingConv::ID CallerCC = Caller->getCallingConv();
  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
  const uint32_t *CallerMask = TRI->getCallPreservedMask(MF, CallerCC);

  // Kernels aren't callable, and don't have a live in return address so it
  // doesn't make sense to do a tail call with entry functions.
  if (!CallerMask)
    return false;

  const bool SameCC = CallerCC == CalleeCC;

  if (DAG.getTarget().Options.GuaranteedTailCallOpt)
    return canGuaranteeTCO(CalleeCC) && SameCC;

  // TODO: Can we handle var args?
  if (IsVarArg)
    return false;

  // Any byval argument in the caller rules out the tail call.
  for (const Argument &CallerArg : Caller->args())
    if (CallerArg.hasByValAttr())
      return false;

  LLVMContext &Ctx = *DAG.getContext();

  // Check that the call results are passed in the same way.
  // NOTE(review): both sides are analysed with the call-argument assignment
  // functions rather than the return ones -- confirm this is intended.
  const bool ResultsMatch = CCState::resultsCompatible(
      CalleeCC, CallerCC, MF, Ctx, Ins,
      CCAssignFnForCall(CalleeCC, IsVarArg),
      CCAssignFnForCall(CallerCC, IsVarArg));
  if (!ResultsMatch)
    return false;

  // The callee has to preserve all registers the caller needs to preserve.
  if (!SameCC &&
      !TRI->regmaskSubsetEqual(CallerMask,
                               TRI->getCallPreservedMask(MF, CalleeCC)))
    return false;

  // Nothing more to check if the callee is taking no arguments.
  if (Outs.empty())
    return true;

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));

  // If the stack arguments for this call do not fit into our own save area
  // then the call cannot be made tail.
  // TODO: Is this really necessary?
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
    return false;

  return parametersInCSRMatch(MF.getRegInfo(), CallerMask, ArgLocs, OutVals);
}
|
|
|
|
|
|
|
|
bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
|
|
|
|
if (!CI->isTailCall())
|
|
|
|
return false;
|
|
|
|
|
|
|
|
const Function *ParentFn = CI->getParent()->getParent();
|
|
|
|
if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
auto Attr = ParentFn->getFnAttribute("disable-tail-calls");
|
|
|
|
return (Attr.getValueAsString() != "true");
|
|
|
|
}
|
|
|
|
|
2017-08-02 03:54:18 +08:00
|
|
|
// The wave scratch offset register is used as the global base pointer.
|
|
|
|
SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
|
|
|
|
SmallVectorImpl<SDValue> &InVals) const {
|
|
|
|
SelectionDAG &DAG = CLI.DAG;
|
|
|
|
const SDLoc &DL = CLI.DL;
|
|
|
|
SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
|
|
|
|
SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
|
|
|
|
SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
|
|
|
|
SDValue Chain = CLI.Chain;
|
|
|
|
SDValue Callee = CLI.Callee;
|
|
|
|
bool &IsTailCall = CLI.IsTailCall;
|
|
|
|
CallingConv::ID CallConv = CLI.CallConv;
|
|
|
|
bool IsVarArg = CLI.IsVarArg;
|
|
|
|
bool IsSibCall = false;
|
|
|
|
bool IsThisReturn = false;
|
|
|
|
MachineFunction &MF = DAG.getMachineFunction();
|
|
|
|
|
2017-08-04 07:32:41 +08:00
|
|
|
if (IsVarArg) {
|
|
|
|
return lowerUnhandledCall(CLI, InVals,
|
|
|
|
"unsupported call to variadic function ");
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!CLI.CS.getCalledFunction()) {
|
|
|
|
return lowerUnhandledCall(CLI, InVals,
|
|
|
|
"unsupported indirect call to function ");
|
|
|
|
}
|
2017-08-02 03:54:18 +08:00
|
|
|
|
2017-08-04 07:32:41 +08:00
|
|
|
if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
|
|
|
|
return lowerUnhandledCall(CLI, InVals,
|
|
|
|
"unsupported required tail call to function ");
|
2017-08-02 03:54:18 +08:00
|
|
|
}
|
|
|
|
|
2017-08-12 04:42:08 +08:00
|
|
|
// The first 4 bytes are reserved for the callee's emergency stack slot.
|
|
|
|
const unsigned CalleeUsableStackOffset = 4;
|
|
|
|
|
|
|
|
if (IsTailCall) {
|
|
|
|
IsTailCall = isEligibleForTailCallOptimization(
|
|
|
|
Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
|
|
|
|
if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall()) {
|
|
|
|
report_fatal_error("failed to perform tail call elimination on a call "
|
|
|
|
"site marked musttail");
|
|
|
|
}
|
|
|
|
|
|
|
|
bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
|
|
|
|
|
|
|
|
// A sibling call is one where we're under the usual C ABI and not planning
|
|
|
|
// to change that but can still do a tail call:
|
|
|
|
if (!TailCallOpt && IsTailCall)
|
|
|
|
IsSibCall = true;
|
|
|
|
|
|
|
|
if (IsTailCall)
|
|
|
|
++NumTailCalls;
|
|
|
|
}
|
2017-08-04 07:32:41 +08:00
|
|
|
|
2017-08-02 03:54:18 +08:00
|
|
|
if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Callee)) {
|
|
|
|
// FIXME: Remove this hack for function pointer types.
|
|
|
|
const GlobalValue *GV = GA->getGlobal();
|
|
|
|
assert(Callee.getValueType() == MVT::i32);
|
|
|
|
Callee = DAG.getGlobalAddress(GV, DL, MVT::i64, GA->getOffset(),
|
|
|
|
false, GA->getTargetFlags());
|
|
|
|
}
|
|
|
|
|
|
|
|
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
|
|
|
|
|
|
|
|
// Analyze operands of the call, assigning locations to each operand.
|
|
|
|
SmallVector<CCValAssign, 16> ArgLocs;
|
|
|
|
CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
|
|
|
|
CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
|
|
|
|
CCInfo.AnalyzeCallOperands(Outs, AssignFn);
|
|
|
|
|
|
|
|
// Get a count of how many bytes are to be pushed on the stack.
|
|
|
|
unsigned NumBytes = CCInfo.getNextStackOffset();
|
|
|
|
|
|
|
|
if (IsSibCall) {
|
|
|
|
// Since we're not changing the ABI to make this a tail call, the memory
|
|
|
|
// operands are already available in the caller's incoming argument space.
|
|
|
|
NumBytes = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
// FPDiff is the byte offset of the call's argument area from the callee's.
|
|
|
|
// Stores to callee stack arguments will be placed in FixedStackSlots offset
|
|
|
|
// by this amount for a tail call. In a sibling call it must be 0 because the
|
|
|
|
// caller will deallocate the entire stack and the callee still expects its
|
|
|
|
// arguments to begin at SP+0. Completely unused for non-tail calls.
|
2017-08-12 04:42:08 +08:00
|
|
|
int32_t FPDiff = 0;
|
|
|
|
MachineFrameInfo &MFI = MF.getFrameInfo();
|
2017-08-02 03:54:18 +08:00
|
|
|
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
|
|
|
|
|
2017-09-15 01:14:57 +08:00
|
|
|
SDValue CallerSavedFP;
|
|
|
|
|
2017-08-02 03:54:18 +08:00
|
|
|
// Adjust the stack pointer for the new arguments...
|
|
|
|
// These operations are automatically eliminated by the prolog/epilog pass
|
|
|
|
if (!IsSibCall) {
|
2017-09-15 01:37:40 +08:00
|
|
|
Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
|
2017-08-02 03:54:18 +08:00
|
|
|
|
|
|
|
unsigned OffsetReg = Info->getScratchWaveOffsetReg();
|
|
|
|
|
|
|
|
// In the HSA case, this should be an identity copy.
|
|
|
|
SDValue ScratchRSrcReg
|
|
|
|
= DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
|
|
|
|
RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
|
|
|
|
|
|
|
|
// TODO: Don't hardcode these registers and get from the callee function.
|
|
|
|
SDValue ScratchWaveOffsetReg
|
|
|
|
= DAG.getCopyFromReg(Chain, DL, OffsetReg, MVT::i32);
|
|
|
|
RegsToPass.emplace_back(AMDGPU::SGPR4, ScratchWaveOffsetReg);
|
2017-09-15 01:14:57 +08:00
|
|
|
|
|
|
|
if (!Info->isEntryFunction()) {
|
|
|
|
// Avoid clobbering this function's FP value. In the current convention
|
|
|
|
// callee will overwrite this, so do save/restore around the call site.
|
|
|
|
CallerSavedFP = DAG.getCopyFromReg(Chain, DL,
|
|
|
|
Info->getFrameOffsetReg(), MVT::i32);
|
|
|
|
}
|
2017-08-02 03:54:18 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// Stack pointer relative accesses are done by changing the offset SGPR. This
|
|
|
|
// is just the VGPR offset component.
|
2017-08-12 04:42:08 +08:00
|
|
|
SDValue StackPtr = DAG.getConstant(CalleeUsableStackOffset, DL, MVT::i32);
|
2017-08-02 03:54:18 +08:00
|
|
|
|
|
|
|
SmallVector<SDValue, 8> MemOpChains;
|
|
|
|
MVT PtrVT = MVT::i32;
|
|
|
|
|
|
|
|
// Walk the register/memloc assignments, inserting copies/loads.
|
|
|
|
for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
|
|
|
|
++i, ++realArgIdx) {
|
|
|
|
CCValAssign &VA = ArgLocs[i];
|
|
|
|
SDValue Arg = OutVals[realArgIdx];
|
|
|
|
|
|
|
|
// Promote the value if needed.
|
|
|
|
switch (VA.getLocInfo()) {
|
|
|
|
case CCValAssign::Full:
|
|
|
|
break;
|
|
|
|
case CCValAssign::BCvt:
|
|
|
|
Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
|
|
|
|
break;
|
|
|
|
case CCValAssign::ZExt:
|
|
|
|
Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
|
|
|
|
break;
|
|
|
|
case CCValAssign::SExt:
|
|
|
|
Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
|
|
|
|
break;
|
|
|
|
case CCValAssign::AExt:
|
|
|
|
Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
|
|
|
|
break;
|
|
|
|
case CCValAssign::FPExt:
|
|
|
|
Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
llvm_unreachable("Unknown loc info!");
|
|
|
|
}
|
|
|
|
|
|
|
|
if (VA.isRegLoc()) {
|
|
|
|
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
|
|
|
|
} else {
|
|
|
|
assert(VA.isMemLoc());
|
|
|
|
|
|
|
|
SDValue DstAddr;
|
|
|
|
MachinePointerInfo DstInfo;
|
|
|
|
|
|
|
|
unsigned LocMemOffset = VA.getLocMemOffset();
|
|
|
|
int32_t Offset = LocMemOffset;
|
|
|
|
SDValue PtrOff = DAG.getConstant(Offset, DL, MVT::i32);
|
|
|
|
PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
|
|
|
|
|
2017-08-12 04:42:08 +08:00
|
|
|
if (IsTailCall) {
|
|
|
|
ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
|
|
|
|
unsigned OpSize = Flags.isByVal() ?
|
|
|
|
Flags.getByValSize() : VA.getValVT().getStoreSize();
|
|
|
|
|
|
|
|
Offset = Offset + FPDiff;
|
|
|
|
int FI = MFI.CreateFixedObject(OpSize, Offset, true);
|
|
|
|
|
|
|
|
DstAddr = DAG.getFrameIndex(FI, PtrVT);
|
|
|
|
DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, DstAddr, StackPtr);
|
|
|
|
DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
|
2017-08-02 03:54:18 +08:00
|
|
|
|
2017-08-12 04:42:08 +08:00
|
|
|
// Make sure any stack arguments overlapping with where we're storing
|
|
|
|
// are loaded before this eventual operation. Otherwise they'll be
|
|
|
|
// clobbered.
|
|
|
|
|
|
|
|
// FIXME: Why is this really necessary? This seems to just result in a
|
|
|
|
// lot of code to copy the stack and write them back to the same
|
|
|
|
// locations, which are supposed to be immutable?
|
|
|
|
Chain = addTokenForArgument(Chain, DAG, MFI, FI);
|
|
|
|
} else {
|
|
|
|
DstAddr = PtrOff;
|
2017-08-02 03:54:18 +08:00
|
|
|
DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (Outs[i].Flags.isByVal()) {
|
|
|
|
SDValue SizeNode =
|
|
|
|
DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
|
|
|
|
SDValue Cpy = DAG.getMemcpy(
|
|
|
|
Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
|
|
|
|
/*isVol = */ false, /*AlwaysInline = */ true,
|
|
|
|
/*isTailCall = */ false,
|
|
|
|
DstInfo, MachinePointerInfo());
|
|
|
|
|
|
|
|
MemOpChains.push_back(Cpy);
|
|
|
|
} else {
|
|
|
|
SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
|
|
|
|
MemOpChains.push_back(Store);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-08-04 07:00:29 +08:00
|
|
|
// Copy special input registers after user input arguments.
|
|
|
|
passSpecialInputs(CLI, *Info, RegsToPass, MemOpChains, Chain, StackPtr);
|
|
|
|
|
2017-08-02 03:54:18 +08:00
|
|
|
if (!MemOpChains.empty())
|
|
|
|
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
|
|
|
|
|
|
|
|
// Build a sequence of copy-to-reg nodes chained together with token chain
|
|
|
|
// and flag operands which copy the outgoing args into the appropriate regs.
|
|
|
|
SDValue InFlag;
|
|
|
|
for (auto &RegToPass : RegsToPass) {
|
|
|
|
Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
|
|
|
|
RegToPass.second, InFlag);
|
|
|
|
InFlag = Chain.getValue(1);
|
|
|
|
}
|
|
|
|
|
2017-08-12 04:42:08 +08:00
|
|
|
|
|
|
|
SDValue PhysReturnAddrReg;
|
|
|
|
if (IsTailCall) {
|
|
|
|
// Since the return is being combined with the call, we need to pass on the
|
|
|
|
// return address.
|
|
|
|
|
|
|
|
const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
|
|
|
|
SDValue ReturnAddrReg = CreateLiveInRegister(
|
|
|
|
DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
|
|
|
|
|
|
|
|
PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
|
|
|
|
MVT::i64);
|
|
|
|
Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, InFlag);
|
|
|
|
InFlag = Chain.getValue(1);
|
|
|
|
}
|
|
|
|
|
2017-08-02 03:54:18 +08:00
|
|
|
// We don't usually want to end the call-sequence here because we would tidy
|
|
|
|
// the frame up *after* the call, however in the ABI-changing tail-call case
|
|
|
|
// we've carefully laid out the parameters so that when sp is reset they'll be
|
|
|
|
// in the correct location.
|
|
|
|
if (IsTailCall && !IsSibCall) {
|
|
|
|
Chain = DAG.getCALLSEQ_END(Chain,
|
|
|
|
DAG.getTargetConstant(NumBytes, DL, MVT::i32),
|
|
|
|
DAG.getTargetConstant(0, DL, MVT::i32),
|
|
|
|
InFlag, DL);
|
|
|
|
InFlag = Chain.getValue(1);
|
|
|
|
}
|
|
|
|
|
|
|
|
std::vector<SDValue> Ops;
|
|
|
|
Ops.push_back(Chain);
|
|
|
|
Ops.push_back(Callee);
|
|
|
|
|
|
|
|
if (IsTailCall) {
|
|
|
|
// Each tail call may have to adjust the stack by a different amount, so
|
|
|
|
// this information must travel along with the operation for eventual
|
|
|
|
// consumption by emitEpilogue.
|
|
|
|
Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
|
2017-08-12 04:42:08 +08:00
|
|
|
|
|
|
|
Ops.push_back(PhysReturnAddrReg);
|
2017-08-02 03:54:18 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// Add argument registers to the end of the list so that they are known live
|
|
|
|
// into the call.
|
|
|
|
for (auto &RegToPass : RegsToPass) {
|
|
|
|
Ops.push_back(DAG.getRegister(RegToPass.first,
|
|
|
|
RegToPass.second.getValueType()));
|
|
|
|
}
|
|
|
|
|
|
|
|
// Add a register mask operand representing the call-preserved registers.
|
|
|
|
|
|
|
|
const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo();
|
|
|
|
const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
|
|
|
|
assert(Mask && "Missing call preserved mask for calling convention");
|
|
|
|
Ops.push_back(DAG.getRegisterMask(Mask));
|
|
|
|
|
|
|
|
if (InFlag.getNode())
|
|
|
|
Ops.push_back(InFlag);
|
|
|
|
|
|
|
|
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
|
|
|
|
|
|
|
|
// If we're doing a tall call, use a TC_RETURN here rather than an
|
|
|
|
// actual call instruction.
|
|
|
|
if (IsTailCall) {
|
2017-08-12 04:42:08 +08:00
|
|
|
MFI.setHasTailCall();
|
|
|
|
return DAG.getNode(AMDGPUISD::TC_RETURN, DL, NodeTys, Ops);
|
2017-08-02 03:54:18 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// Returns a chain and a flag for retval copy to use.
|
|
|
|
SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
|
|
|
|
Chain = Call.getValue(0);
|
|
|
|
InFlag = Call.getValue(1);
|
|
|
|
|
2017-09-15 01:14:57 +08:00
|
|
|
if (CallerSavedFP) {
|
|
|
|
SDValue FPReg = DAG.getRegister(Info->getFrameOffsetReg(), MVT::i32);
|
|
|
|
Chain = DAG.getCopyToReg(Chain, DL, FPReg, CallerSavedFP, InFlag);
|
|
|
|
InFlag = Chain.getValue(1);
|
|
|
|
}
|
|
|
|
|
2017-09-15 01:37:40 +08:00
|
|
|
uint64_t CalleePopBytes = NumBytes;
|
|
|
|
Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(0, DL, MVT::i32),
|
2017-08-02 03:54:18 +08:00
|
|
|
DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32),
|
|
|
|
InFlag, DL);
|
|
|
|
if (!Ins.empty())
|
|
|
|
InFlag = Chain.getValue(1);
|
|
|
|
|
|
|
|
// Handle result values, copying them out of physregs into vregs that we
|
|
|
|
// return.
|
|
|
|
return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
|
|
|
|
InVals, IsThisReturn,
|
|
|
|
IsThisReturn ? OutVals[0] : SDValue());
|
|
|
|
}
|
|
|
|
|
2016-01-26 12:29:24 +08:00
|
|
|
unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
|
|
|
|
SelectionDAG &DAG) const {
|
|
|
|
unsigned Reg = StringSwitch<unsigned>(RegName)
|
|
|
|
.Case("m0", AMDGPU::M0)
|
|
|
|
.Case("exec", AMDGPU::EXEC)
|
|
|
|
.Case("exec_lo", AMDGPU::EXEC_LO)
|
|
|
|
.Case("exec_hi", AMDGPU::EXEC_HI)
|
|
|
|
.Case("flat_scratch", AMDGPU::FLAT_SCR)
|
|
|
|
.Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
|
|
|
|
.Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
|
|
|
|
.Default(AMDGPU::NoRegister);
|
|
|
|
|
|
|
|
if (Reg == AMDGPU::NoRegister) {
|
|
|
|
report_fatal_error(Twine("invalid register name \""
|
|
|
|
+ StringRef(RegName) + "\"."));
|
|
|
|
|
|
|
|
}
|
|
|
|
|
2016-06-24 14:30:11 +08:00
|
|
|
if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
|
2016-01-26 12:29:24 +08:00
|
|
|
Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
|
|
|
|
report_fatal_error(Twine("invalid register \""
|
|
|
|
+ StringRef(RegName) + "\" for subtarget."));
|
|
|
|
}
|
|
|
|
|
|
|
|
switch (Reg) {
|
|
|
|
case AMDGPU::M0:
|
|
|
|
case AMDGPU::EXEC_LO:
|
|
|
|
case AMDGPU::EXEC_HI:
|
|
|
|
case AMDGPU::FLAT_SCR_LO:
|
|
|
|
case AMDGPU::FLAT_SCR_HI:
|
|
|
|
if (VT.getSizeInBits() == 32)
|
|
|
|
return Reg;
|
|
|
|
break;
|
|
|
|
case AMDGPU::EXEC:
|
|
|
|
case AMDGPU::FLAT_SCR:
|
|
|
|
if (VT.getSizeInBits() == 64)
|
|
|
|
return Reg;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
llvm_unreachable("missing register type checking");
|
|
|
|
}
|
|
|
|
|
|
|
|
report_fatal_error(Twine("invalid type for register \""
|
|
|
|
+ StringRef(RegName) + "\"."));
|
|
|
|
}
|
|
|
|
|
2016-07-13 05:41:32 +08:00
|
|
|
// If kill is not the last instruction, split the block so kill is always a
|
|
|
|
// proper terminator.
|
|
|
|
MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
|
|
|
|
MachineBasicBlock *BB) const {
|
|
|
|
const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
|
|
|
|
|
|
|
|
MachineBasicBlock::iterator SplitPoint(&MI);
|
|
|
|
++SplitPoint;
|
|
|
|
|
|
|
|
if (SplitPoint == BB->end()) {
|
|
|
|
// Don't bother with a new block.
|
|
|
|
MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR));
|
|
|
|
return BB;
|
|
|
|
}
|
|
|
|
|
|
|
|
MachineFunction *MF = BB->getParent();
|
|
|
|
MachineBasicBlock *SplitBB
|
|
|
|
= MF->CreateMachineBasicBlock(BB->getBasicBlock());
|
|
|
|
|
|
|
|
MF->insert(++MachineFunction::iterator(BB), SplitBB);
|
|
|
|
SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end());
|
|
|
|
|
2016-07-23 01:01:15 +08:00
|
|
|
SplitBB->transferSuccessorsAndUpdatePHIs(BB);
|
2016-07-13 05:41:32 +08:00
|
|
|
BB->addSuccessor(SplitBB);
|
|
|
|
|
|
|
|
MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR));
|
|
|
|
return SplitBB;
|
|
|
|
}
|
|
|
|
|
2016-07-19 08:35:03 +08:00
|
|
|
// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
|
|
|
|
// wavefront. If the value is uniform and just happens to be in a VGPR, this
|
|
|
|
// will only do one iteration. In the worst case, this will loop 64 times.
|
|
|
|
//
|
|
|
|
// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
|
2016-10-04 09:41:05 +08:00
|
|
|
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
|
|
|
|
const SIInstrInfo *TII,
|
|
|
|
MachineRegisterInfo &MRI,
|
|
|
|
MachineBasicBlock &OrigBB,
|
|
|
|
MachineBasicBlock &LoopBB,
|
|
|
|
const DebugLoc &DL,
|
|
|
|
const MachineOperand &IdxReg,
|
|
|
|
unsigned InitReg,
|
|
|
|
unsigned ResultReg,
|
|
|
|
unsigned PhiReg,
|
|
|
|
unsigned InitSaveExecReg,
|
2016-10-13 02:49:05 +08:00
|
|
|
int Offset,
|
|
|
|
bool UseGPRIdxMode) {
|
2016-07-19 08:35:03 +08:00
|
|
|
MachineBasicBlock::iterator I = LoopBB.begin();
|
|
|
|
|
|
|
|
unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
|
|
|
|
unsigned NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
|
|
|
|
unsigned CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
|
|
|
|
unsigned CondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
|
|
|
|
|
|
|
|
BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
|
|
|
|
.addReg(InitReg)
|
|
|
|
.addMBB(&OrigBB)
|
|
|
|
.addReg(ResultReg)
|
|
|
|
.addMBB(&LoopBB);
|
|
|
|
|
|
|
|
BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
|
|
|
|
.addReg(InitSaveExecReg)
|
|
|
|
.addMBB(&OrigBB)
|
|
|
|
.addReg(NewExec)
|
|
|
|
.addMBB(&LoopBB);
|
|
|
|
|
|
|
|
// Read the next variant <- also loop target.
|
|
|
|
BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
|
|
|
|
.addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));
|
|
|
|
|
|
|
|
// Compare the just read M0 value to all possible Idx values.
|
|
|
|
BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
|
|
|
|
.addReg(CurrentIdxReg)
|
2016-07-21 17:40:57 +08:00
|
|
|
.addReg(IdxReg.getReg(), 0, IdxReg.getSubReg());
|
2016-07-19 08:35:03 +08:00
|
|
|
|
2016-10-13 02:49:05 +08:00
|
|
|
if (UseGPRIdxMode) {
|
|
|
|
unsigned IdxReg;
|
|
|
|
if (Offset == 0) {
|
|
|
|
IdxReg = CurrentIdxReg;
|
|
|
|
} else {
|
|
|
|
IdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
|
|
|
|
BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), IdxReg)
|
|
|
|
.addReg(CurrentIdxReg, RegState::Kill)
|
|
|
|
.addImm(Offset);
|
|
|
|
}
|
|
|
|
|
|
|
|
MachineInstr *SetIdx =
|
|
|
|
BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_IDX))
|
|
|
|
.addReg(IdxReg, RegState::Kill);
|
2016-10-13 20:45:16 +08:00
|
|
|
SetIdx->getOperand(2).setIsUndef();
|
2016-07-19 08:35:03 +08:00
|
|
|
} else {
|
2016-10-13 02:49:05 +08:00
|
|
|
// Move index from VCC into M0
|
|
|
|
if (Offset == 0) {
|
|
|
|
BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
|
|
|
|
.addReg(CurrentIdxReg, RegState::Kill);
|
|
|
|
} else {
|
|
|
|
BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
|
|
|
|
.addReg(CurrentIdxReg, RegState::Kill)
|
|
|
|
.addImm(Offset);
|
|
|
|
}
|
2016-07-19 08:35:03 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// Update EXEC, save the original EXEC value to VCC.
|
|
|
|
BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
|
|
|
|
.addReg(CondReg, RegState::Kill);
|
|
|
|
|
|
|
|
MRI.setSimpleHint(NewExec, CondReg);
|
|
|
|
|
|
|
|
// Update EXEC, switch all done bits to 0 and all todo bits to 1.
|
2016-10-04 09:41:05 +08:00
|
|
|
MachineInstr *InsertPt =
|
|
|
|
BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
|
2016-07-19 08:35:03 +08:00
|
|
|
.addReg(AMDGPU::EXEC)
|
|
|
|
.addReg(NewExec);
|
|
|
|
|
|
|
|
// XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
|
|
|
|
// s_cbranch_scc0?
|
|
|
|
|
|
|
|
// Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
|
|
|
|
BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
|
|
|
|
.addMBB(&LoopBB);
|
2016-10-04 09:41:05 +08:00
|
|
|
|
|
|
|
return InsertPt->getIterator();
|
2016-07-19 08:35:03 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// This has slightly sub-optimal regalloc when the source vector is killed by
|
|
|
|
// the read. The register allocator does not understand that the kill is
|
|
|
|
// per-workitem, so is kept alive for the whole loop so we end up not re-using a
|
|
|
|
// subregister from it, using 1 more VGPR than necessary. This was saved when
|
|
|
|
// this was expanded after register allocation.
|
2016-10-04 09:41:05 +08:00
|
|
|
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
|
|
|
|
MachineBasicBlock &MBB,
|
|
|
|
MachineInstr &MI,
|
|
|
|
unsigned InitResultReg,
|
|
|
|
unsigned PhiReg,
|
2016-10-13 02:49:05 +08:00
|
|
|
int Offset,
|
|
|
|
bool UseGPRIdxMode) {
|
2016-07-19 08:35:03 +08:00
|
|
|
MachineFunction *MF = MBB.getParent();
|
|
|
|
MachineRegisterInfo &MRI = MF->getRegInfo();
|
|
|
|
const DebugLoc &DL = MI.getDebugLoc();
|
|
|
|
MachineBasicBlock::iterator I(&MI);
|
|
|
|
|
|
|
|
unsigned DstReg = MI.getOperand(0).getReg();
|
|
|
|
unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
|
|
|
|
unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
|
|
|
|
|
|
|
|
BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
|
|
|
|
|
|
|
|
// Save the EXEC mask
|
|
|
|
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), SaveExec)
|
|
|
|
.addReg(AMDGPU::EXEC);
|
|
|
|
|
|
|
|
// To insert the loop we need to split the block. Move everything after this
|
|
|
|
// point to a new block, and insert a new empty block between the two.
|
|
|
|
MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
|
|
|
|
MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
|
|
|
|
MachineFunction::iterator MBBI(MBB);
|
|
|
|
++MBBI;
|
|
|
|
|
|
|
|
MF->insert(MBBI, LoopBB);
|
|
|
|
MF->insert(MBBI, RemainderBB);
|
|
|
|
|
|
|
|
LoopBB->addSuccessor(LoopBB);
|
|
|
|
LoopBB->addSuccessor(RemainderBB);
|
|
|
|
|
|
|
|
// Move the rest of the block into a new block.
|
2016-07-23 01:01:15 +08:00
|
|
|
RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
|
2016-07-19 08:35:03 +08:00
|
|
|
RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
|
|
|
|
|
|
|
|
MBB.addSuccessor(LoopBB);
|
|
|
|
|
|
|
|
const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
|
|
|
|
|
2016-10-04 09:41:05 +08:00
|
|
|
auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
|
|
|
|
InitResultReg, DstReg, PhiReg, TmpExec,
|
2016-10-13 02:49:05 +08:00
|
|
|
Offset, UseGPRIdxMode);
|
2016-07-19 08:35:03 +08:00
|
|
|
|
|
|
|
MachineBasicBlock::iterator First = RemainderBB->begin();
|
|
|
|
BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
|
|
|
|
.addReg(SaveExec);
|
|
|
|
|
2016-10-04 09:41:05 +08:00
|
|
|
return InsPt;
|
2016-07-19 08:35:03 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// Returns subreg index, offset
|
|
|
|
static std::pair<unsigned, int>
|
|
|
|
computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
|
|
|
|
const TargetRegisterClass *SuperRC,
|
|
|
|
unsigned VecReg,
|
|
|
|
int Offset) {
|
2017-04-25 02:55:33 +08:00
|
|
|
int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
|
2016-07-19 08:35:03 +08:00
|
|
|
|
|
|
|
// Skip out of bounds offsets, or else we would end up using an undefined
|
|
|
|
// register.
|
|
|
|
if (Offset >= NumElts || Offset < 0)
|
|
|
|
return std::make_pair(AMDGPU::sub0, Offset);
|
|
|
|
|
|
|
|
return std::make_pair(AMDGPU::sub0 + Offset, 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Return true if the index is an SGPR and was set.
|
|
|
|
static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII,
|
|
|
|
MachineRegisterInfo &MRI,
|
|
|
|
MachineInstr &MI,
|
2016-10-13 02:49:05 +08:00
|
|
|
int Offset,
|
|
|
|
bool UseGPRIdxMode,
|
|
|
|
bool IsIndirectSrc) {
|
2016-07-19 08:35:03 +08:00
|
|
|
MachineBasicBlock *MBB = MI.getParent();
|
|
|
|
const DebugLoc &DL = MI.getDebugLoc();
|
|
|
|
MachineBasicBlock::iterator I(&MI);
|
|
|
|
|
|
|
|
const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
|
|
|
|
const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
|
|
|
|
|
|
|
|
assert(Idx->getReg() != AMDGPU::NoRegister);
|
|
|
|
|
|
|
|
if (!TII->getRegisterInfo().isSGPRClass(IdxRC))
|
|
|
|
return false;
|
|
|
|
|
2016-10-13 02:49:05 +08:00
|
|
|
if (UseGPRIdxMode) {
|
|
|
|
unsigned IdxMode = IsIndirectSrc ?
|
|
|
|
VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE;
|
|
|
|
if (Offset == 0) {
|
|
|
|
MachineInstr *SetOn =
|
2017-01-13 17:58:52 +08:00
|
|
|
BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
|
|
|
|
.add(*Idx)
|
|
|
|
.addImm(IdxMode);
|
2016-10-13 02:49:05 +08:00
|
|
|
|
2016-10-13 20:45:16 +08:00
|
|
|
SetOn->getOperand(3).setIsUndef();
|
2016-10-13 02:49:05 +08:00
|
|
|
} else {
|
|
|
|
unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
|
|
|
|
BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
|
2017-01-13 17:58:52 +08:00
|
|
|
.add(*Idx)
|
|
|
|
.addImm(Offset);
|
2016-10-13 02:49:05 +08:00
|
|
|
MachineInstr *SetOn =
|
|
|
|
BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
|
|
|
|
.addReg(Tmp, RegState::Kill)
|
|
|
|
.addImm(IdxMode);
|
|
|
|
|
2016-10-13 20:45:16 +08:00
|
|
|
SetOn->getOperand(3).setIsUndef();
|
2016-10-13 02:49:05 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2016-07-19 08:35:03 +08:00
|
|
|
if (Offset == 0) {
|
2017-02-22 06:50:41 +08:00
|
|
|
BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
|
|
|
|
.add(*Idx);
|
2016-07-19 08:35:03 +08:00
|
|
|
} else {
|
|
|
|
BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
|
2017-02-22 06:50:41 +08:00
|
|
|
.add(*Idx)
|
|
|
|
.addImm(Offset);
|
2016-07-19 08:35:03 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Control flow needs to be inserted if indexing with a VGPR.
|
|
|
|
static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
|
|
|
|
MachineBasicBlock &MBB,
|
2016-10-04 09:41:05 +08:00
|
|
|
const SISubtarget &ST) {
|
|
|
|
const SIInstrInfo *TII = ST.getInstrInfo();
|
2016-07-19 08:35:03 +08:00
|
|
|
const SIRegisterInfo &TRI = TII->getRegisterInfo();
|
|
|
|
MachineFunction *MF = MBB.getParent();
|
|
|
|
MachineRegisterInfo &MRI = MF->getRegInfo();
|
|
|
|
|
|
|
|
unsigned Dst = MI.getOperand(0).getReg();
|
2016-10-14 17:03:04 +08:00
|
|
|
unsigned SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
|
2016-07-19 08:35:03 +08:00
|
|
|
int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
|
|
|
|
|
2016-10-14 17:03:04 +08:00
|
|
|
const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
|
2016-07-19 08:35:03 +08:00
|
|
|
|
|
|
|
unsigned SubReg;
|
|
|
|
std::tie(SubReg, Offset)
|
2016-10-14 17:03:04 +08:00
|
|
|
= computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
|
2016-07-19 08:35:03 +08:00
|
|
|
|
2017-03-22 01:00:32 +08:00
|
|
|
bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
|
2016-10-13 02:49:05 +08:00
|
|
|
|
|
|
|
if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, true)) {
|
2016-07-19 08:35:03 +08:00
|
|
|
MachineBasicBlock::iterator I(&MI);
|
|
|
|
const DebugLoc &DL = MI.getDebugLoc();
|
|
|
|
|
2016-10-13 02:49:05 +08:00
|
|
|
if (UseGPRIdxMode) {
|
|
|
|
// TODO: Look at the uses to avoid the copy. This may require rescheduling
|
|
|
|
// to avoid interfering with other uses, so probably requires a new
|
|
|
|
// optimization pass.
|
|
|
|
BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
|
2016-10-14 17:03:04 +08:00
|
|
|
.addReg(SrcReg, RegState::Undef, SubReg)
|
|
|
|
.addReg(SrcReg, RegState::Implicit)
|
2016-10-13 02:49:05 +08:00
|
|
|
.addReg(AMDGPU::M0, RegState::Implicit);
|
|
|
|
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
|
|
|
|
} else {
|
|
|
|
BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
|
2016-10-14 17:03:04 +08:00
|
|
|
.addReg(SrcReg, RegState::Undef, SubReg)
|
|
|
|
.addReg(SrcReg, RegState::Implicit);
|
2016-10-13 02:49:05 +08:00
|
|
|
}
|
|
|
|
|
2016-07-19 08:35:03 +08:00
|
|
|
MI.eraseFromParent();
|
|
|
|
|
|
|
|
return &MBB;
|
|
|
|
}
|
|
|
|
|
|
|
|
const DebugLoc &DL = MI.getDebugLoc();
|
|
|
|
MachineBasicBlock::iterator I(&MI);
|
|
|
|
|
|
|
|
unsigned PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
|
|
|
|
unsigned InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
|
|
|
|
|
|
|
|
BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
|
|
|
|
|
2016-10-13 02:49:05 +08:00
|
|
|
if (UseGPRIdxMode) {
|
|
|
|
MachineInstr *SetOn = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
|
|
|
|
.addImm(0) // Reset inside loop.
|
|
|
|
.addImm(VGPRIndexMode::SRC0_ENABLE);
|
2016-10-13 20:45:16 +08:00
|
|
|
SetOn->getOperand(3).setIsUndef();
|
2016-10-04 09:41:05 +08:00
|
|
|
|
2016-10-13 02:49:05 +08:00
|
|
|
// Disable again after the loop.
|
|
|
|
BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
|
|
|
|
}
|
|
|
|
|
|
|
|
auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset, UseGPRIdxMode);
|
|
|
|
MachineBasicBlock *LoopBB = InsPt->getParent();
|
2016-07-19 08:35:03 +08:00
|
|
|
|
2016-10-13 02:49:05 +08:00
|
|
|
if (UseGPRIdxMode) {
|
|
|
|
BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
|
2016-10-14 17:03:04 +08:00
|
|
|
.addReg(SrcReg, RegState::Undef, SubReg)
|
|
|
|
.addReg(SrcReg, RegState::Implicit)
|
2016-10-13 02:49:05 +08:00
|
|
|
.addReg(AMDGPU::M0, RegState::Implicit);
|
|
|
|
} else {
|
|
|
|
BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
|
2016-10-14 17:03:04 +08:00
|
|
|
.addReg(SrcReg, RegState::Undef, SubReg)
|
|
|
|
.addReg(SrcReg, RegState::Implicit);
|
2016-10-13 02:49:05 +08:00
|
|
|
}
|
|
|
|
|
2016-10-14 17:03:04 +08:00
|
|
|
MI.eraseFromParent();
|
|
|
|
|
2016-10-13 02:49:05 +08:00
|
|
|
return LoopBB;
|
2016-07-19 08:35:03 +08:00
|
|
|
}
|
|
|
|
|
2017-04-25 02:55:33 +08:00
|
|
|
static unsigned getMOVRELDPseudo(const SIRegisterInfo &TRI,
|
|
|
|
const TargetRegisterClass *VecRC) {
|
|
|
|
switch (TRI.getRegSizeInBits(*VecRC)) {
|
|
|
|
case 32: // 4 bytes
|
AMDGPU: Fix Two Address problems with v_movreld
Summary:
The v_movreld machine instruction is used with three operands that are
in a sense tied to each other (the explicit VGPR_32 def and the implicit
VGPR_NN def and use). There is no way to express that using the currently
available operand bits, and indeed there are cases where the Two Address
instructions pass does the wrong thing.
This patch introduces a new set of pseudo instructions that are identical
in intended semantics as v_movreld, but they only have two tied operands.
Having to add a new set of pseudo instructions is admittedly annoying, but
it's a fairly straightforward and solid approach. The only alternative I
see is to try to teach the Two Address instructions pass about Three Address
instructions, and I'm afraid that's trickier and is going to end up more
fragile.
Note that v_movrels does not suffer from this problem, and so this patch
does not touch it.
This fixes several GL45-CTS.shaders.indexing.* tests.
Reviewers: tstellarAMD, arsenm
Subscribers: kzhuravl, wdng, yaxunl, llvm-commits, tony-tye
Differential Revision: https://reviews.llvm.org/D25633
llvm-svn: 284980
2016-10-24 22:56:02 +08:00
|
|
|
return AMDGPU::V_MOVRELD_B32_V1;
|
2017-04-25 02:55:33 +08:00
|
|
|
case 64: // 8 bytes
|
AMDGPU: Fix Two Address problems with v_movreld
Summary:
The v_movreld machine instruction is used with three operands that are
in a sense tied to each other (the explicit VGPR_32 def and the implicit
VGPR_NN def and use). There is no way to express that using the currently
available operand bits, and indeed there are cases where the Two Address
instructions pass does the wrong thing.
This patch introduces a new set of pseudo instructions that are identical
in intended semantics as v_movreld, but they only have two tied operands.
Having to add a new set of pseudo instructions is admittedly annoying, but
it's a fairly straightforward and solid approach. The only alternative I
see is to try to teach the Two Address instructions pass about Three Address
instructions, and I'm afraid that's trickier and is going to end up more
fragile.
Note that v_movrels does not suffer from this problem, and so this patch
does not touch it.
This fixes several GL45-CTS.shaders.indexing.* tests.
Reviewers: tstellarAMD, arsenm
Subscribers: kzhuravl, wdng, yaxunl, llvm-commits, tony-tye
Differential Revision: https://reviews.llvm.org/D25633
llvm-svn: 284980
2016-10-24 22:56:02 +08:00
|
|
|
return AMDGPU::V_MOVRELD_B32_V2;
|
2017-04-25 02:55:33 +08:00
|
|
|
case 128: // 16 bytes
|
AMDGPU: Fix Two Address problems with v_movreld
Summary:
The v_movreld machine instruction is used with three operands that are
in a sense tied to each other (the explicit VGPR_32 def and the implicit
VGPR_NN def and use). There is no way to express that using the currently
available operand bits, and indeed there are cases where the Two Address
instructions pass does the wrong thing.
This patch introduces a new set of pseudo instructions that are identical
in intended semantics as v_movreld, but they only have two tied operands.
Having to add a new set of pseudo instructions is admittedly annoying, but
it's a fairly straightforward and solid approach. The only alternative I
see is to try to teach the Two Address instructions pass about Three Address
instructions, and I'm afraid that's trickier and is going to end up more
fragile.
Note that v_movrels does not suffer from this problem, and so this patch
does not touch it.
This fixes several GL45-CTS.shaders.indexing.* tests.
Reviewers: tstellarAMD, arsenm
Subscribers: kzhuravl, wdng, yaxunl, llvm-commits, tony-tye
Differential Revision: https://reviews.llvm.org/D25633
llvm-svn: 284980
2016-10-24 22:56:02 +08:00
|
|
|
return AMDGPU::V_MOVRELD_B32_V4;
|
2017-04-25 02:55:33 +08:00
|
|
|
case 256: // 32 bytes
|
AMDGPU: Fix Two Address problems with v_movreld
Summary:
The v_movreld machine instruction is used with three operands that are
in a sense tied to each other (the explicit VGPR_32 def and the implicit
VGPR_NN def and use). There is no way to express that using the currently
available operand bits, and indeed there are cases where the Two Address
instructions pass does the wrong thing.
This patch introduces a new set of pseudo instructions that are identical
in intended semantics as v_movreld, but they only have two tied operands.
Having to add a new set of pseudo instructions is admittedly annoying, but
it's a fairly straightforward and solid approach. The only alternative I
see is to try to teach the Two Address instructions pass about Three Address
instructions, and I'm afraid that's trickier and is going to end up more
fragile.
Note that v_movrels does not suffer from this problem, and so this patch
does not touch it.
This fixes several GL45-CTS.shaders.indexing.* tests.
Reviewers: tstellarAMD, arsenm
Subscribers: kzhuravl, wdng, yaxunl, llvm-commits, tony-tye
Differential Revision: https://reviews.llvm.org/D25633
llvm-svn: 284980
2016-10-24 22:56:02 +08:00
|
|
|
return AMDGPU::V_MOVRELD_B32_V8;
|
2017-04-25 02:55:33 +08:00
|
|
|
case 512: // 64 bytes
|
AMDGPU: Fix Two Address problems with v_movreld
Summary:
The v_movreld machine instruction is used with three operands that are
in a sense tied to each other (the explicit VGPR_32 def and the implicit
VGPR_NN def and use). There is no way to express that using the currently
available operand bits, and indeed there are cases where the Two Address
instructions pass does the wrong thing.
This patch introduces a new set of pseudo instructions that are identical
in intended semantics as v_movreld, but they only have two tied operands.
Having to add a new set of pseudo instructions is admittedly annoying, but
it's a fairly straightforward and solid approach. The only alternative I
see is to try to teach the Two Address instructions pass about Three Address
instructions, and I'm afraid that's trickier and is going to end up more
fragile.
Note that v_movrels does not suffer from this problem, and so this patch
does not touch it.
This fixes several GL45-CTS.shaders.indexing.* tests.
Reviewers: tstellarAMD, arsenm
Subscribers: kzhuravl, wdng, yaxunl, llvm-commits, tony-tye
Differential Revision: https://reviews.llvm.org/D25633
llvm-svn: 284980
2016-10-24 22:56:02 +08:00
|
|
|
return AMDGPU::V_MOVRELD_B32_V16;
|
|
|
|
default:
|
|
|
|
llvm_unreachable("unsupported size for MOVRELD pseudos");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-07-19 08:35:03 +08:00
|
|
|
static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
|
|
|
|
MachineBasicBlock &MBB,
|
2016-10-04 09:41:05 +08:00
|
|
|
const SISubtarget &ST) {
|
|
|
|
const SIInstrInfo *TII = ST.getInstrInfo();
|
2016-07-19 08:35:03 +08:00
|
|
|
const SIRegisterInfo &TRI = TII->getRegisterInfo();
|
|
|
|
MachineFunction *MF = MBB.getParent();
|
|
|
|
MachineRegisterInfo &MRI = MF->getRegInfo();
|
|
|
|
|
|
|
|
unsigned Dst = MI.getOperand(0).getReg();
|
|
|
|
const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
|
|
|
|
const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
|
|
|
|
const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
|
|
|
|
int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
|
|
|
|
const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
|
|
|
|
|
|
|
|
// This can be an immediate, but will be folded later.
|
|
|
|
assert(Val->getReg());
|
|
|
|
|
|
|
|
unsigned SubReg;
|
|
|
|
std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
|
|
|
|
SrcVec->getReg(),
|
|
|
|
Offset);
|
2017-03-22 01:00:32 +08:00
|
|
|
bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
|
2016-10-13 02:49:05 +08:00
|
|
|
|
2016-07-19 08:35:03 +08:00
|
|
|
if (Idx->getReg() == AMDGPU::NoRegister) {
|
|
|
|
MachineBasicBlock::iterator I(&MI);
|
|
|
|
const DebugLoc &DL = MI.getDebugLoc();
|
|
|
|
|
|
|
|
assert(Offset == 0);
|
|
|
|
|
|
|
|
BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
|
2017-01-13 17:58:52 +08:00
|
|
|
.add(*SrcVec)
|
|
|
|
.add(*Val)
|
|
|
|
.addImm(SubReg);
|
2016-07-19 08:35:03 +08:00
|
|
|
|
|
|
|
MI.eraseFromParent();
|
|
|
|
return &MBB;
|
|
|
|
}
|
|
|
|
|
2016-10-13 02:49:05 +08:00
|
|
|
if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, false)) {
|
2016-07-19 08:35:03 +08:00
|
|
|
MachineBasicBlock::iterator I(&MI);
|
|
|
|
const DebugLoc &DL = MI.getDebugLoc();
|
|
|
|
|
2016-10-13 02:49:05 +08:00
|
|
|
if (UseGPRIdxMode) {
|
|
|
|
BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
|
2017-01-13 17:58:52 +08:00
|
|
|
.addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst
|
|
|
|
.add(*Val)
|
|
|
|
.addReg(Dst, RegState::ImplicitDefine)
|
|
|
|
.addReg(SrcVec->getReg(), RegState::Implicit)
|
|
|
|
.addReg(AMDGPU::M0, RegState::Implicit);
|
2016-07-19 08:35:03 +08:00
|
|
|
|
2016-10-13 02:49:05 +08:00
|
|
|
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
|
|
|
|
} else {
|
2017-04-25 02:55:33 +08:00
|
|
|
const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
|
2016-10-13 02:49:05 +08:00
|
|
|
|
AMDGPU: Fix Two Address problems with v_movreld
Summary:
The v_movreld machine instruction is used with three operands that are
in a sense tied to each other (the explicit VGPR_32 def and the implicit
VGPR_NN def and use). There is no way to express that using the currently
available operand bits, and indeed there are cases where the Two Address
instructions pass does the wrong thing.
This patch introduces a new set of pseudo instructions that are identical
in intended semantics as v_movreld, but they only have two tied operands.
Having to add a new set of pseudo instructions is admittedly annoying, but
it's a fairly straightforward and solid approach. The only alternative I
see is to try to teach the Two Address instructions pass about Three Address
instructions, and I'm afraid that's trickier and is going to end up more
fragile.
Note that v_movrels does not suffer from this problem, and so this patch
does not touch it.
This fixes several GL45-CTS.shaders.indexing.* tests.
Reviewers: tstellarAMD, arsenm
Subscribers: kzhuravl, wdng, yaxunl, llvm-commits, tony-tye
Differential Revision: https://reviews.llvm.org/D25633
llvm-svn: 284980
2016-10-24 22:56:02 +08:00
|
|
|
BuildMI(MBB, I, DL, MovRelDesc)
|
|
|
|
.addReg(Dst, RegState::Define)
|
|
|
|
.addReg(SrcVec->getReg())
|
2017-01-13 17:58:52 +08:00
|
|
|
.add(*Val)
|
AMDGPU: Fix Two Address problems with v_movreld
Summary:
The v_movreld machine instruction is used with three operands that are
in a sense tied to each other (the explicit VGPR_32 def and the implicit
VGPR_NN def and use). There is no way to express that using the currently
available operand bits, and indeed there are cases where the Two Address
instructions pass does the wrong thing.
This patch introduces a new set of pseudo instructions that are identical
in intended semantics as v_movreld, but they only have two tied operands.
Having to add a new set of pseudo instructions is admittedly annoying, but
it's a fairly straightforward and solid approach. The only alternative I
see is to try to teach the Two Address instructions pass about Three Address
instructions, and I'm afraid that's trickier and is going to end up more
fragile.
Note that v_movrels does not suffer from this problem, and so this patch
does not touch it.
This fixes several GL45-CTS.shaders.indexing.* tests.
Reviewers: tstellarAMD, arsenm
Subscribers: kzhuravl, wdng, yaxunl, llvm-commits, tony-tye
Differential Revision: https://reviews.llvm.org/D25633
llvm-svn: 284980
2016-10-24 22:56:02 +08:00
|
|
|
.addImm(SubReg - AMDGPU::sub0);
|
2016-10-13 02:49:05 +08:00
|
|
|
}
|
2016-07-19 08:35:03 +08:00
|
|
|
|
|
|
|
MI.eraseFromParent();
|
|
|
|
return &MBB;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (Val->isReg())
|
|
|
|
MRI.clearKillFlags(Val->getReg());
|
|
|
|
|
|
|
|
const DebugLoc &DL = MI.getDebugLoc();
|
2016-10-13 02:49:05 +08:00
|
|
|
|
|
|
|
if (UseGPRIdxMode) {
|
|
|
|
MachineBasicBlock::iterator I(&MI);
|
|
|
|
|
|
|
|
MachineInstr *SetOn = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
|
|
|
|
.addImm(0) // Reset inside loop.
|
|
|
|
.addImm(VGPRIndexMode::DST_ENABLE);
|
2016-10-13 20:45:16 +08:00
|
|
|
SetOn->getOperand(3).setIsUndef();
|
2016-10-13 02:49:05 +08:00
|
|
|
|
|
|
|
// Disable again after the loop.
|
|
|
|
BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
|
|
|
|
}
|
|
|
|
|
2016-07-19 08:35:03 +08:00
|
|
|
unsigned PhiReg = MRI.createVirtualRegister(VecRC);
|
|
|
|
|
2016-10-13 02:49:05 +08:00
|
|
|
auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg,
|
|
|
|
Offset, UseGPRIdxMode);
|
|
|
|
MachineBasicBlock *LoopBB = InsPt->getParent();
|
2016-10-04 09:41:05 +08:00
|
|
|
|
2016-10-13 02:49:05 +08:00
|
|
|
if (UseGPRIdxMode) {
|
|
|
|
BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
|
2017-01-13 17:58:52 +08:00
|
|
|
.addReg(PhiReg, RegState::Undef, SubReg) // vdst
|
|
|
|
.add(*Val) // src0
|
|
|
|
.addReg(Dst, RegState::ImplicitDefine)
|
|
|
|
.addReg(PhiReg, RegState::Implicit)
|
|
|
|
.addReg(AMDGPU::M0, RegState::Implicit);
|
2016-10-13 02:49:05 +08:00
|
|
|
} else {
|
2017-04-25 02:55:33 +08:00
|
|
|
const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
|
AMDGPU: Fix Two Address problems with v_movreld
Summary:
The v_movreld machine instruction is used with three operands that are
in a sense tied to each other (the explicit VGPR_32 def and the implicit
VGPR_NN def and use). There is no way to express that using the currently
available operand bits, and indeed there are cases where the Two Address
instructions pass does the wrong thing.
This patch introduces a new set of pseudo instructions that are identical
in intended semantics as v_movreld, but they only have two tied operands.
Having to add a new set of pseudo instructions is admittedly annoying, but
it's a fairly straightforward and solid approach. The only alternative I
see is to try to teach the Two Address instructions pass about Three Address
instructions, and I'm afraid that's trickier and is going to end up more
fragile.
Note that v_movrels does not suffer from this problem, and so this patch
does not touch it.
This fixes several GL45-CTS.shaders.indexing.* tests.
Reviewers: tstellarAMD, arsenm
Subscribers: kzhuravl, wdng, yaxunl, llvm-commits, tony-tye
Differential Revision: https://reviews.llvm.org/D25633
llvm-svn: 284980
2016-10-24 22:56:02 +08:00
|
|
|
|
|
|
|
BuildMI(*LoopBB, InsPt, DL, MovRelDesc)
|
|
|
|
.addReg(Dst, RegState::Define)
|
|
|
|
.addReg(PhiReg)
|
2017-01-13 17:58:52 +08:00
|
|
|
.add(*Val)
|
AMDGPU: Fix Two Address problems with v_movreld
Summary:
The v_movreld machine instruction is used with three operands that are
in a sense tied to each other (the explicit VGPR_32 def and the implicit
VGPR_NN def and use). There is no way to express that using the currently
available operand bits, and indeed there are cases where the Two Address
instructions pass does the wrong thing.
This patch introduces a new set of pseudo instructions that are identical
in intended semantics as v_movreld, but they only have two tied operands.
Having to add a new set of pseudo instructions is admittedly annoying, but
it's a fairly straightforward and solid approach. The only alternative I
see is to try to teach the Two Address instructions pass about Three Address
instructions, and I'm afraid that's trickier and is going to end up more
fragile.
Note that v_movrels does not suffer from this problem, and so this patch
does not touch it.
This fixes several GL45-CTS.shaders.indexing.* tests.
Reviewers: tstellarAMD, arsenm
Subscribers: kzhuravl, wdng, yaxunl, llvm-commits, tony-tye
Differential Revision: https://reviews.llvm.org/D25633
llvm-svn: 284980
2016-10-24 22:56:02 +08:00
|
|
|
.addImm(SubReg - AMDGPU::sub0);
|
2016-10-13 02:49:05 +08:00
|
|
|
}
|
2016-07-19 08:35:03 +08:00
|
|
|
|
2016-10-14 17:03:04 +08:00
|
|
|
MI.eraseFromParent();
|
|
|
|
|
2016-10-13 02:49:05 +08:00
|
|
|
return LoopBB;
|
2016-07-19 08:35:03 +08:00
|
|
|
}
|
|
|
|
|
2016-07-13 05:41:32 +08:00
|
|
|
/// Expand instructions that need custom insertion after instruction
/// selection.
///
/// MIMG instructions get a memory operand attached if they have none. The
/// pseudos listed in the switch below are replaced by real instruction
/// sequences; every other opcode is forwarded to
/// AMDGPUTargetLowering::EmitInstrWithCustomInserter.
///
/// \returns the block in which insertion should continue: \p BB itself for
/// the cases handled inline, or whatever the delegated helper returns.
MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
    MachineInstr &MI, MachineBasicBlock *BB) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  MachineFunction *MF = BB->getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();

  if (TII->isMIMG(MI)) {
    // Add a memoperand for mimg instructions so that they aren't assumed to
    // be ordered memory instructions. Instructions that already have one are
    // left untouched.
    if (MI.memoperands_empty()) {
      MachinePointerInfo PtrInfo(MFI->getImagePSV());
      MachineMemOperand::Flags Flags = MachineMemOperand::MODereferenceable;
      if (MI.mayStore())
        Flags |= MachineMemOperand::MOStore;
      if (MI.mayLoad())
        Flags |= MachineMemOperand::MOLoad;

      MI.addMemOperand(*MF, MF->getMachineMemOperand(PtrInfo, Flags, 0, 0));
    }
    return BB;
  }

  const DebugLoc &DL = MI.getDebugLoc();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  switch (MI.getOpcode()) {
  case AMDGPU::SI_INIT_M0: {
    BuildMI(*BB, MI.getIterator(), DL, TII->get(AMDGPU::S_MOV_B32),
            AMDGPU::M0)
        .add(MI.getOperand(0));
    MI.eraseFromParent();
    return BB;
  }

  case AMDGPU::SI_INIT_EXEC: {
    // This should be before all vector instructions.
    BuildMI(*BB, &*BB->begin(), DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
        .addImm(MI.getOperand(0).getImm());
    MI.eraseFromParent();
    return BB;
  }

  case AMDGPU::SI_INIT_EXEC_FROM_INPUT: {
    // Extract the thread count from an SGPR input and set EXEC accordingly.
    // Since BFM can't shift by 64, handle that case with CMP + CMOV.
    //
    // S_BFE_U32 count, input, {shift, 7}
    // S_BFM_B64 exec, count, 0
    // S_CMP_EQ_U32 count, 64
    // S_CMOV_B64 exec, -1
    MachineInstr *InsertBefore = &*BB->begin();
    const unsigned InputReg = MI.getOperand(0).getReg();
    const unsigned CountReg =
        MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    bool Found = false;

    // Move the COPY of the input reg to the beginning, so that we can use it.
    for (auto I = BB->begin(); I != &MI; ++I) {
      const bool DefinesInput = I->getOpcode() == TargetOpcode::COPY &&
                                I->getOperand(0).getReg() == InputReg;
      if (!DefinesInput)
        continue;

      if (I == InsertBefore) {
        // Already first in the block; insert right after it instead.
        InsertBefore = &*++BB->begin();
      } else {
        I->removeFromParent();
        BB->insert(InsertBefore, &*I);
      }
      Found = true;
      break;
    }
    assert(Found);
    (void)Found;

    // This should be before all vector instructions.
    BuildMI(*BB, InsertBefore, DebugLoc(), TII->get(AMDGPU::S_BFE_U32),
            CountReg)
        .addReg(InputReg)
        .addImm((MI.getOperand(1).getImm() & 0x7f) | 0x70000);
    BuildMI(*BB, InsertBefore, DebugLoc(), TII->get(AMDGPU::S_BFM_B64),
            AMDGPU::EXEC)
        .addReg(CountReg)
        .addImm(0);
    BuildMI(*BB, InsertBefore, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32))
        .addReg(CountReg, RegState::Kill)
        .addImm(64);
    BuildMI(*BB, InsertBefore, DebugLoc(), TII->get(AMDGPU::S_CMOV_B64),
            AMDGPU::EXEC)
        .addImm(-1);
    MI.eraseFromParent();
    return BB;
  }

  case AMDGPU::GET_GROUPSTATICSIZE: {
    BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
        .add(MI.getOperand(0))
        .addImm(MFI->getLDSSize());
    MI.eraseFromParent();
    return BB;
  }

  case AMDGPU::SI_INDIRECT_SRC_V1:
  case AMDGPU::SI_INDIRECT_SRC_V2:
  case AMDGPU::SI_INDIRECT_SRC_V4:
  case AMDGPU::SI_INDIRECT_SRC_V8:
  case AMDGPU::SI_INDIRECT_SRC_V16:
    return emitIndirectSrc(MI, *BB, *getSubtarget());

  case AMDGPU::SI_INDIRECT_DST_V1:
  case AMDGPU::SI_INDIRECT_DST_V2:
  case AMDGPU::SI_INDIRECT_DST_V4:
  case AMDGPU::SI_INDIRECT_DST_V8:
  case AMDGPU::SI_INDIRECT_DST_V16:
    return emitIndirectDst(MI, *BB, *getSubtarget());

  case AMDGPU::SI_KILL:
    return splitKillBlock(MI, BB);

  case AMDGPU::V_CNDMASK_B64_PSEUDO: {
    // Split the 64-bit select into two 32-bit selects on sub0/sub1 and glue
    // the halves back together with a REG_SEQUENCE.
    const unsigned Dst = MI.getOperand(0).getReg();
    const unsigned Src0 = MI.getOperand(1).getReg();
    const unsigned Src1 = MI.getOperand(2).getReg();
    const unsigned SrcCond = MI.getOperand(3).getReg();

    const unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    const unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    auto BuildHalf = [&](unsigned HalfDst, unsigned SubIdx) {
      BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), HalfDst)
          .addReg(Src0, 0, SubIdx)
          .addReg(Src1, 0, SubIdx)
          .addReg(SrcCond);
    };
    BuildHalf(DstLo, AMDGPU::sub0);
    BuildHalf(DstHi, AMDGPU::sub1);

    BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
        .addReg(DstLo)
        .addImm(AMDGPU::sub0)
        .addReg(DstHi)
        .addImm(AMDGPU::sub1);
    MI.eraseFromParent();
    return BB;
  }

  case AMDGPU::SI_BR_UNDEF: {
    MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
                           .add(MI.getOperand(0));
    Br->getOperand(1).setIsUndef(true); // read undef SCC
    MI.eraseFromParent();
    return BB;
  }

  case AMDGPU::ADJCALLSTACKUP:
  case AMDGPU::ADJCALLSTACKDOWN: {
    // Append an implicit def and an implicit use of the stack pointer offset
    // register to the existing instruction; it is not replaced.
    const unsigned SPOffsetReg = MFI->getStackPtrOffsetReg();
    MachineInstrBuilder(*MF, &MI)
        .addReg(SPOffsetReg, RegState::ImplicitDefine)
        .addReg(SPOffsetReg, RegState::Implicit);
    return BB;
  }

  case AMDGPU::SI_CALL_ISEL:
  case AMDGPU::SI_TCRETURN_ISEL: {
    const unsigned ReturnAddrReg =
        TII->getRegisterInfo().getReturnAddressReg(*MF);

    // The callee address must come from an SI_PC_ADD_REL_OFFSET; its global
    // is added as an extra operand on the final call instruction.
    const unsigned GlobalAddrReg = MI.getOperand(0).getReg();
    MachineInstr *PCRel = MRI.getVRegDef(GlobalAddrReg);
    assert(PCRel->getOpcode() == AMDGPU::SI_PC_ADD_REL_OFFSET);

    const GlobalValue *G = PCRel->getOperand(1).getGlobal();

    // There is an additional imm operand for tcreturn, but it should be in the
    // right place already.
    MachineInstrBuilder MIB =
        MI.getOpcode() == AMDGPU::SI_CALL_ISEL
            ? BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg)
            : BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_TCRETURN));
    MIB.add(MI.getOperand(0)).addGlobalAddress(G);

    for (unsigned OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx != NumOps;
         ++OpIdx)
      MIB.add(MI.getOperand(OpIdx));

    MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
    MI.eraseFromParent();
    return BB;
  }

  default:
    return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
  }
}
|
|
|
|
|
2015-01-30 03:34:32 +08:00
|
|
|
/// Always allow aggressive FMA fusion, for every value type.
bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
  // Returning true currently forces the various fsub combinations to be
  // unfolded into fma with free fneg'd operands. That is worthwhile whenever
  // fma is fast, which is governed by isFMAFasterThanFMulAndFAdd.
  //
  // For f64 with quarter-rate fma (add / sub being at best half rate), most of
  // these combines appear to be cycle neutral, but they still reduce
  // instruction count / code size.
  return true;
}
|
|
|
|
|
2015-07-09 10:09:04 +08:00
|
|
|
/// Result type of a setcc on \p VT: i1 for scalars, and a vector of i1 with
/// the same element count for vectors.
EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
                                         EVT VT) const {
  if (VT.isVector())
    return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());

  return MVT::i1;
}
|
|
|
|
|
2016-12-23 00:36:25 +08:00
|
|
|
/// Type used for scalar shift amounts: i16 when shifting an i16 value, i32
/// for everything else.
MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
  // TODO: Should i16 be used always if legal? For now it would force VALU
  // shifts.
  if (VT == MVT::i16)
    return MVT::i16;

  return MVT::i32;
}
|
|
|
|
|
2015-01-30 03:34:32 +08:00
|
|
|
// Answering this is somewhat tricky and depends on the specific device which
|
|
|
|
// have different rates for fma or all f64 operations.
|
|
|
|
//
|
|
|
|
// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
|
|
|
|
// regardless of which device (although the number of cycles differs between
|
|
|
|
// devices), so it is always profitable for f64.
|
|
|
|
//
|
|
|
|
// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
|
|
|
|
// only on full rate devices. Normally, we should prefer selecting v_mad_f32
|
|
|
|
// which we can always do even without fused FP ops since it returns the same
|
|
|
|
// result as the separate operations and since it is always full
|
|
|
|
// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
|
|
|
|
// however does not support denormals, so we do report fma as faster if we have
|
|
|
|
// a fast fma device and require denormals.
|
|
|
|
//
|
2013-08-10 18:38:54 +08:00
|
|
|
/// Report whether fma should be treated as faster than a separate fmul and
/// fadd for the scalar element type of \p VT.
bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
  const MVT::SimpleValueType ScalarTy =
      VT.getScalarType().getSimpleVT().SimpleTy;

  if (ScalarTy == MVT::f64)
    return true;

  if (ScalarTy == MVT::f32) {
    // fma is as fast on some subtargets, but a full rate f32 mad is always
    // available and gives the same result as the separate operations, so it
    // is preferred over fma. mad cannot be used when denormals must be
    // supported, so fma is only reported as faster in that situation.
    return Subtarget->hasFP32Denormals() && Subtarget->hasFastFMAF32();
  }

  if (ScalarTy == MVT::f16)
    return Subtarget->has16BitInsts() && Subtarget->hasFP16Denormals();

  return false;
}
|
|
|
|
|
2012-12-12 05:25:42 +08:00
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
// Custom DAG Lowering Operations
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
/// Dispatch custom lowering of \p Op to the SI-specific handler for its
/// opcode. Opcodes without a handler here are forwarded to
/// AMDGPUTargetLowering::LowerOperation.
SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  case ISD::BRCOND:
    return LowerBRCOND(Op, DAG);

  case ISD::LOAD: {
    SDValue Lowered = LowerLOAD(Op, DAG);
    assert((!Lowered.getNode() ||
            Lowered.getNode()->getNumValues() == 2) &&
           "Load should return a value and a chain");
    return Lowered;
  }

  case ISD::FSIN:
  case ISD::FCOS:
    return LowerTrig(Op, DAG);

  case ISD::SELECT:
    return LowerSELECT(Op, DAG);
  case ISD::FDIV:
    return LowerFDIV(Op, DAG);
  case ISD::ATOMIC_CMP_SWAP:
    return LowerATOMIC_CMP_SWAP(Op, DAG);
  case ISD::STORE:
    return LowerSTORE(Op, DAG);

  case ISD::GlobalAddress: {
    SIMachineFunctionInfo *MFI =
        DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
    return LowerGlobalAddress(MFI, Op, DAG);
  }

  case ISD::INTRINSIC_WO_CHAIN:
    return LowerINTRINSIC_WO_CHAIN(Op, DAG);
  case ISD::INTRINSIC_W_CHAIN:
    return LowerINTRINSIC_W_CHAIN(Op, DAG);
  case ISD::INTRINSIC_VOID:
    return LowerINTRINSIC_VOID(Op, DAG);
  case ISD::ADDRSPACECAST:
    return lowerADDRSPACECAST(Op, DAG);
  case ISD::INSERT_VECTOR_ELT:
    return lowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT:
    return lowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::FP_ROUND:
    return lowerFP_ROUND(Op, DAG);

  case ISD::TRAP:
  case ISD::DEBUGTRAP:
    return lowerTRAP(Op, DAG);

  default:
    break;
  }

  // Everything not handled above is left to the common AMDGPU lowering.
  return AMDGPUTargetLowering::LowerOperation(Op, DAG);
}
|
|
|
|
|
2017-01-24 07:09:58 +08:00
|
|
|
/// Produce replacement values for the results of \p N and append them to
/// \p Results. When no custom replacement applies, \p Results is left
/// untouched.
void SITargetLowering::ReplaceNodeResults(SDNode *N,
                                          SmallVectorImpl<SDValue> &Results,
                                          SelectionDAG &DAG) const {
  const unsigned Opc = N->getOpcode();

  if (Opc == ISD::INSERT_VECTOR_ELT) {
    // The helper returns a null SDValue when it declines to lower the node.
    SDValue Lowered = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG);
    if (Lowered)
      Results.push_back(Lowered);
    return;
  }

  if (Opc == ISD::EXTRACT_VECTOR_ELT) {
    SDValue Lowered = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG);
    if (Lowered)
      Results.push_back(Lowered);
    return;
  }

  if (Opc == ISD::INTRINSIC_WO_CHAIN) {
    // Operand 0 of a chainless intrinsic node carries the intrinsic ID.
    const unsigned IntrID =
        cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
    if (IntrID != Intrinsic::amdgcn_cvt_pkrtz)
      return;

    // Emit the conversion as an i32-typed node and reinterpret the bits as
    // v2f16.
    SDLoc SL(N);
    SDValue Src0 = N->getOperand(1);
    SDValue Src1 = N->getOperand(2);
    SDValue Packed =
        DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
    Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Packed));
    return;
  }

  if (Opc == ISD::SELECT) {
    SDLoc SL(N);
    EVT VT = N->getValueType(0);
    EVT MemVT = getEquivalentMemType(*DAG.getContext(), VT);

    // Perform the select on the equivalent memory type.
    SDValue TrueVal = DAG.getNode(ISD::BITCAST, SL, MemVT, N->getOperand(1));
    SDValue FalseVal = DAG.getNode(ISD::BITCAST, SL, MemVT, N->getOperand(2));

    // Types narrower than 32 bits are widened to i32 for the select and
    // truncated back afterwards.
    const bool NeedsWiden = MemVT.bitsLT(MVT::i32);
    EVT SelVT = MemVT;
    if (NeedsWiden) {
      TrueVal = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, TrueVal);
      FalseVal = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, FalseVal);
      SelVT = MVT::i32;
    }

    SDValue Sel = DAG.getNode(ISD::SELECT, SL, SelVT, N->getOperand(0),
                              TrueVal, FalseVal);
    if (MemVT != SelVT)
      Sel = DAG.getNode(ISD::TRUNCATE, SL, MemVT, Sel);

    Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, Sel));
    return;
  }

  // Every other opcode: nothing to replace.
}
|
|
|
|
|
2012-12-20 06:10:31 +08:00
|
|
|
/// \brief Helper function for LowerBRCOND
|
|
|
|
static SDNode *findUser(SDValue Value, unsigned Opcode) {
|
|
|
|
|
|
|
|
SDNode *Parent = Value.getNode();
|
|
|
|
for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
|
|
|
|
I != E; ++I) {
|
|
|
|
|
|
|
|
if (I.getUse().get() != Value)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (I->getOpcode() == Opcode)
|
|
|
|
return *I;
|
|
|
|
}
|
2014-04-25 13:30:21 +08:00
|
|
|
return nullptr;
|
2012-12-20 06:10:31 +08:00
|
|
|
}
|
|
|
|
|
2017-03-18 04:41:45 +08:00
|
|
|
unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
|
2016-09-17 06:11:18 +08:00
|
|
|
if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
|
|
|
|
switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
|
2017-03-18 04:41:45 +08:00
|
|
|
case Intrinsic::amdgcn_if:
|
|
|
|
return AMDGPUISD::IF;
|
|
|
|
case Intrinsic::amdgcn_else:
|
|
|
|
return AMDGPUISD::ELSE;
|
|
|
|
case Intrinsic::amdgcn_loop:
|
|
|
|
return AMDGPUISD::LOOP;
|
|
|
|
case Intrinsic::amdgcn_end_cf:
|
|
|
|
llvm_unreachable("should not occur");
|
2016-09-17 06:11:18 +08:00
|
|
|
default:
|
2017-03-18 04:41:45 +08:00
|
|
|
return 0;
|
2016-09-17 06:11:18 +08:00
|
|
|
}
|
|
|
|
}
|
2016-02-13 07:45:29 +08:00
|
|
|
|
2017-03-18 04:41:45 +08:00
|
|
|
// break, if_break, else_break are all only used as inputs to loop, not
|
|
|
|
// directly as branch conditions.
|
|
|
|
return 0;
|
2016-02-13 07:45:29 +08:00
|
|
|
}
|
|
|
|
|
2016-06-25 11:11:28 +08:00
|
|
|
/// Create the fixed stack objects used when emitting the debugger prologue.
///
/// The debugger prologue writes work group IDs and work item IDs to scratch
/// memory at fixed locations in the following format:
///   offset 0:  work group ID x
///   offset 4:  work group ID y
///   offset 8:  work group ID z
///   offset 16: work item ID x
///   offset 20: work item ID y
///   offset 24: work item ID z
void SITargetLowering::createDebuggerPrologueStackObjects(
    MachineFunction &MF) const {
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  // One 4-byte slot per dimension (x, y, z) for each kind of ID.
  for (unsigned Dim = 0; Dim != 3; ++Dim) {
    // Fixed stack object for the work group ID of this dimension.
    const int GroupIDIdx =
        MF.getFrameInfo().CreateFixedObject(4, Dim * 4, true);
    FuncInfo->setDebuggerWorkGroupIDStackObjectIndex(Dim, GroupIDIdx);

    // Fixed stack object for the work item ID of this dimension.
    const int ItemIDIdx =
        MF.getFrameInfo().CreateFixedObject(4, Dim * 4 + 16, true);
    FuncInfo->setDebuggerWorkItemIDStackObjectIndex(Dim, ItemIDIdx);
  }
}
|
|
|
|
|
2016-10-21 02:12:38 +08:00
|
|
|
bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
|
|
|
|
const Triple &TT = getTargetMachine().getTargetTriple();
|
2017-03-27 22:04:01 +08:00
|
|
|
return GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
|
2016-10-21 02:12:38 +08:00
|
|
|
AMDGPU::shouldEmitConstantsToTextSection(TT);
|
|
|
|
}
|
|
|
|
|
|
|
|
bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
|
2017-03-27 22:04:01 +08:00
|
|
|
return (GV->getType()->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS ||
|
|
|
|
GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS) &&
|
2016-10-21 02:12:38 +08:00
|
|
|
!shouldEmitFixup(GV) &&
|
|
|
|
!getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
|
|
|
|
}
|
|
|
|
|
|
|
|
bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
|
|
|
|
return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
|
|
|
|
}
|
|
|
|
|
2012-12-20 06:10:31 +08:00
|
|
|
/// This transforms the control flow intrinsics to get the branch destination as
|
|
|
|
/// last parameter, also switches branch target with BR if the need arise
|
|
|
|
SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
|
|
|
|
SelectionDAG &DAG) const {
|
2013-05-25 10:42:55 +08:00
|
|
|
SDLoc DL(BRCOND);
|
2012-12-20 06:10:31 +08:00
|
|
|
|
|
|
|
SDNode *Intr = BRCOND.getOperand(1).getNode();
|
|
|
|
SDValue Target = BRCOND.getOperand(2);
|
2014-04-25 13:30:21 +08:00
|
|
|
SDNode *BR = nullptr;
|
2016-02-13 07:45:29 +08:00
|
|
|
SDNode *SetCC = nullptr;
|
2012-12-20 06:10:31 +08:00
|
|
|
|
|
|
|
if (Intr->getOpcode() == ISD::SETCC) {
|
|
|
|
// As long as we negate the condition everything is fine
|
2016-02-13 07:45:29 +08:00
|
|
|
SetCC = Intr;
|
2012-12-20 06:10:31 +08:00
|
|
|
Intr = SetCC->getOperand(0).getNode();
|
|
|
|
|
|
|
|
} else {
|
|
|
|
// Get the target from BR if we don't negate the condition
|
|
|
|
BR = findUser(BRCOND, ISD::BR);
|
|
|
|
Target = BR->getOperand(1);
|
|
|
|
}
|
|
|
|
|
2016-09-17 06:11:18 +08:00
|
|
|
// FIXME: This changes the types of the intrinsics instead of introducing new
|
|
|
|
// nodes with the correct types.
|
|
|
|
// e.g. llvm.amdgcn.loop
|
|
|
|
|
|
|
|
// eg: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3
|
|
|
|
// => t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch<bb1 0x7fee5286d088>
|
|
|
|
|
2017-03-18 04:41:45 +08:00
|
|
|
unsigned CFNode = isCFIntrinsic(Intr);
|
|
|
|
if (CFNode == 0) {
|
2016-02-13 07:45:29 +08:00
|
|
|
// This is a uniform branch so we don't need to legalize.
|
|
|
|
return BRCOND;
|
|
|
|
}
|
|
|
|
|
2016-09-17 06:11:18 +08:00
|
|
|
bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
|
|
|
|
Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
|
|
|
|
|
2016-02-13 07:45:29 +08:00
|
|
|
assert(!SetCC ||
|
|
|
|
(SetCC->getConstantOperandVal(1) == 1 &&
|
|
|
|
cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
|
|
|
|
ISD::SETNE));
|
2012-12-20 06:10:31 +08:00
|
|
|
|
|
|
|
// operands of the new intrinsic call
|
|
|
|
SmallVector<SDValue, 4> Ops;
|
2016-09-17 06:11:18 +08:00
|
|
|
if (HaveChain)
|
|
|
|
Ops.push_back(BRCOND.getOperand(0));
|
|
|
|
|
2017-03-18 04:41:45 +08:00
|
|
|
Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
|
2012-12-20 06:10:31 +08:00
|
|
|
Ops.push_back(Target);
|
|
|
|
|
2016-09-17 06:11:18 +08:00
|
|
|
ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
|
|
|
|
|
2012-12-20 06:10:31 +08:00
|
|
|
// build the new intrinsic call
|
2017-03-18 04:41:45 +08:00
|
|
|
SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
|
2012-12-20 06:10:31 +08:00
|
|
|
|
2016-09-17 06:11:18 +08:00
|
|
|
if (!HaveChain) {
|
|
|
|
SDValue Ops[] = {
|
|
|
|
SDValue(Result, 0),
|
|
|
|
BRCOND.getOperand(0)
|
|
|
|
};
|
|
|
|
|
|
|
|
Result = DAG.getMergeValues(Ops, DL).getNode();
|
|
|
|
}
|
|
|
|
|
2012-12-20 06:10:31 +08:00
|
|
|
if (BR) {
|
|
|
|
// Give the branch instruction our target
|
|
|
|
SDValue Ops[] = {
|
|
|
|
BR->getOperand(0),
|
|
|
|
BRCOND.getOperand(2)
|
|
|
|
};
|
2014-08-02 06:09:43 +08:00
|
|
|
SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
|
|
|
|
DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
|
|
|
|
BR = NewBR.getNode();
|
2012-12-20 06:10:31 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
|
|
|
|
|
|
|
|
// Copy the intrinsic results to registers
|
|
|
|
for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
|
|
|
|
SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
|
|
|
|
if (!CopyToReg)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
Chain = DAG.getCopyToReg(
|
|
|
|
Chain, DL,
|
|
|
|
CopyToReg->getOperand(1),
|
|
|
|
SDValue(Result, i - 1),
|
|
|
|
SDValue());
|
|
|
|
|
|
|
|
DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
|
|
|
|
}
|
|
|
|
|
|
|
|
// Remove the old intrinsic from the chain
|
|
|
|
DAG.ReplaceAllUsesOfValueWith(
|
|
|
|
SDValue(Intr, Intr->getNumValues() - 1),
|
|
|
|
Intr->getOperand(0));
|
|
|
|
|
|
|
|
return Chain;
|
2012-12-12 05:25:42 +08:00
|
|
|
}
|
|
|
|
|
2016-11-13 15:01:11 +08:00
|
|
|
/// Convert the floating-point value \p Op to the floating-point type \p VT.
///
/// When the source type is no wider than \p VT an ISD::FP_EXTEND is emitted;
/// otherwise the value is narrowed with ISD::FP_ROUND.
///
/// \param DAG the DAG in which the new node is created.
/// \param Op  the floating-point value to convert.
/// \param DL  debug location for the new node.
/// \param VT  the destination floating-point type.
SDValue SITargetLowering::getFPExtOrFPTrunc(SelectionDAG &DAG,
                                            SDValue Op,
                                            const SDLoc &DL,
                                            EVT VT) const {
  if (Op.getValueType().bitsLE(VT))
    return DAG.getNode(ISD::FP_EXTEND, DL, VT, Op);

  // ISD::FTRUNC rounds toward zero to an integral value *of the same type*;
  // it does not change precision. Narrowing to a smaller FP type is
  // ISD::FP_ROUND, whose second operand is the "truncation is exact" flag
  // (0 = the conversion may lose information).
  return DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
                     DAG.getTargetConstant(0, DL, MVT::i32));
}
|
|
|
|
|
2016-11-17 12:28:37 +08:00
|
|
|
/// Custom lowering of FP_ROUND to f16. Only an f64 source is rewritten; any
/// other source type returns \p Op unchanged.
SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
  assert(Op.getValueType() == MVT::f16 &&
         "Do not know how to custom lower FP_ROUND for non-f16 type");

  SDValue Src = Op.getOperand(0);
  if (Src.getValueType() != MVT::f64)
    return Op;

  // f64 -> f16: convert to half bits held in an i32, narrow to i16 and
  // reinterpret those bits as f16.
  SDLoc DL(Op);
  SDValue HalfBits32 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
  SDValue HalfBits16 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, HalfBits32);
  return DAG.getNode(ISD::BITCAST, DL, MVT::f16, HalfBits16);
}
|
|
|
|
|
2017-04-25 01:49:13 +08:00
|
|
|
/// Lower ISD::TRAP and ISD::DEBUGTRAP.
///
/// With the HSA trap handler ABI and an enabled trap handler, the queue
/// pointer is copied into SGPR0_SGPR1 and an AMDGPUISD::TRAP node is emitted.
/// Otherwise a trap becomes AMDGPUISD::ENDPGM, while a debugtrap emits a
/// warning diagnostic and is dropped (the incoming chain is returned).
SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  MachineFunction &MF = DAG.getMachineFunction();
  SDValue Chain = Op.getOperand(0);

  const bool IsDebugTrap = Op.getOpcode() == ISD::DEBUGTRAP;
  unsigned TrapID = IsDebugTrap ? SISubtarget::TrapIDLLVMDebugTrap
                                : SISubtarget::TrapIDLLVMTrap;

  const bool UseHsaTrapHandler =
      Subtarget->getTrapHandlerAbi() == SISubtarget::TrapHandlerAbiHsa &&
      Subtarget->isTrapHandlerEnabled();

  if (UseHsaTrapHandler) {
    SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    unsigned QueuePtrReg = Info->getQueuePtrUserSGPR();
    assert(QueuePtrReg != AMDGPU::NoRegister);

    SDValue QueuePtr = CreateLiveInRegister(
      DAG, &AMDGPU::SReg_64RegClass, QueuePtrReg, MVT::i64);

    // The queue pointer is handed over in SGPR0_SGPR1.
    SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
    SDValue Copy = DAG.getCopyToReg(Chain, SL, SGPR01, QueuePtr, SDValue());

    // Operands: chain, trap ID, the register, and the glue from the copy.
    SDValue TrapOps[] = {
      Copy,
      DAG.getTargetConstant(TrapID, SL, MVT::i16),
      SGPR01,
      Copy.getValue(1)
    };

    return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, TrapOps);
  }

  if (TrapID == SISubtarget::TrapIDLLVMTrap)
    return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);

  if (TrapID == SISubtarget::TrapIDLLVMDebugTrap) {
    DiagnosticInfoUnsupported NoTrap(*MF.getFunction(),
                                     "debugtrap handler not supported",
                                     Op.getDebugLoc(),
                                     DS_Warning);
    LLVMContext &Ctx = MF.getFunction()->getContext();
    Ctx.diagnose(NoTrap);
    return Chain;
  }

  llvm_unreachable("unsupported trap handler type!");
}
|
|
|
|
|
2017-04-07 07:02:33 +08:00
|
|
|
/// Build an i32 value holding the aperture for address space \p AS.
///
/// \p AS is compared only against LOCAL_ADDRESS; every other value takes the
/// "private" path. With aperture registers the value is read via
/// S_GETREG_B32 and shifted into place; otherwise it is loaded from a fixed
/// offset relative to the queue pointer.
SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
                                             SelectionDAG &DAG) const {
  const bool IsLocal = AS == AMDGPUASI.LOCAL_ADDRESS;

  // FIXME: Use inline constants (src_{shared, private}_base) instead.
  if (Subtarget->hasApertureRegs()) {
    unsigned Offset = AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    if (IsLocal) {
      Offset = AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE;
      WidthM1 = AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE;
    }

    // Pack the register ID, field offset and field width into the
    // immediate operand of S_GETREG_B32.
    unsigned Encoding =
        (AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_) |
        (Offset << AMDGPU::Hwreg::OFFSET_SHIFT_) |
        (WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);

    SDValue EncodingImm = DAG.getTargetConstant(Encoding, DL, MVT::i16);
    SDNode *GetReg =
        DAG.getMachineNode(AMDGPU::S_GETREG_B32, DL, MVT::i32, EncodingImm);
    SDValue ApertureReg = SDValue(GetReg, 0);
    SDValue ShiftAmount = DAG.getTargetConstant(WidthM1 + 1, DL, MVT::i32);
    return DAG.getNode(ISD::SHL, DL, MVT::i32, ApertureReg, ShiftAmount);
  }

  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  unsigned QueuePtrReg = Info->getQueuePtrUserSGPR();
  assert(QueuePtrReg != AMDGPU::NoRegister);

  SDValue QueuePtr = CreateLiveInRegister(
    DAG, &AMDGPU::SReg_64RegClass, QueuePtrReg, MVT::i64);

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = IsLocal ? 0x40 : 0x44;

  SDValue FieldAddr = DAG.getNode(ISD::ADD, DL, MVT::i64, QueuePtr,
                                  DAG.getConstant(StructOffset, DL, MVT::i64));

  // TODO: Use custom target PseudoSourceValue.
  // TODO: We should use the value from the IR intrinsic call, but it might not
  // be available and how do we get it?
  Type *Int8Ty = Type::getInt8Ty(*DAG.getContext());
  Value *V =
      UndefValue::get(PointerType::get(Int8Ty, AMDGPUASI.CONSTANT_ADDRESS));

  MachinePointerInfo PtrInfo(V, StructOffset);
  const auto LoadFlags = MachineMemOperand::MODereferenceable |
                         MachineMemOperand::MOInvariant;
  return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), FieldAddr, PtrInfo,
                     MinAlign(64, StructOffset), LoadFlags);
}
|
|
|
|
|
|
|
|
/// Lower an address space cast between the flat address space and the
/// local/private address spaces, mapping the source null value to the
/// destination null value. Any other cast emits an "invalid addrspacecast"
/// diagnostic and yields undef.
SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
                                             SelectionDAG &DAG) const {
  SDLoc SL(Op);
  const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);

  SDValue Src = ASC->getOperand(0);
  SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);

  const AMDGPUTargetMachine &TM =
    static_cast<const AMDGPUTargetMachine &>(getTargetMachine());

  // flat -> local/private
  if (ASC->getSrcAddressSpace() == AMDGPUASI.FLAT_ADDRESS) {
    const unsigned DestAS = ASC->getDestAddressSpace();
    const bool ToSegment = DestAS == AMDGPUASI.LOCAL_ADDRESS ||
                           DestAS == AMDGPUASI.PRIVATE_ADDRESS;
    if (ToSegment) {
      unsigned NullVal = TM.getNullPointerValue(DestAS);
      SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);

      // A non-null flat pointer keeps its low 32 bits; flat null becomes
      // the segment's null value.
      SDValue IsNonNull =
          DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
      SDValue Low32 = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);

      return DAG.getNode(ISD::SELECT, SL, MVT::i32, IsNonNull, Low32,
                         SegmentNullPtr);
    }
  }

  // local/private -> flat
  if (ASC->getDestAddressSpace() == AMDGPUASI.FLAT_ADDRESS) {
    const unsigned SrcAS = ASC->getSrcAddressSpace();
    const bool FromSegment = SrcAS == AMDGPUASI.LOCAL_ADDRESS ||
                             SrcAS == AMDGPUASI.PRIVATE_ADDRESS;
    if (FromSegment) {
      unsigned NullVal = TM.getNullPointerValue(SrcAS);
      SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);

      SDValue IsNonNull =
          DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);

      // The 64-bit flat pointer is {segment offset, aperture}.
      SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG);
      SDValue Pair =
          DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
      SDValue FlatPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Pair);

      return DAG.getNode(ISD::SELECT, SL, MVT::i64, IsNonNull, FlatPtr,
                         FlatNullPtr);
    }
  }

  // global <-> flat are no-ops and never emitted.

  const MachineFunction &MF = DAG.getMachineFunction();
  DiagnosticInfoUnsupported InvalidAddrSpaceCast(
    *MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
  DAG.getContext()->diagnose(InvalidAddrSpaceCast);

  return DAG.getUNDEF(ASC->getValueType(0));
}
|
|
|
|
|
2017-01-24 07:09:58 +08:00
|
|
|
/// Lower INSERT_VECTOR_ELT with a dynamic index using bit operations on the
/// vector reinterpreted as an i32, avoiding stack access.
///
/// The element is bitcast to i16 and the vector to i32, so this handles
/// vectors of 16-bit elements packed into 32 bits.
///
/// \returns a null SDValue when the index is a constant (no custom lowering
/// is done here); otherwise the vector with the element inserted.
SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
                                                 SelectionDAG &DAG) const {
  SDValue Idx = Op.getOperand(2);
  if (isa<ConstantSDNode>(Idx))
    return SDValue();

  // Avoid stack access for dynamic indexing.
  SDLoc SL(Op);
  SDValue Vec = Op.getOperand(0);
  SDValue Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Op.getOperand(1));

  // v_bfi_b32 (v_bfm_b32 16, (shl idx, 4)), (shl val, (shl idx, 4)), vec
  SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Val);

  // Convert vector index to bit-index: elements are 16 bits wide, so the
  // bit offset is idx * 16, i.e. idx << 4. (Shifting the index left by 16
  // would produce an out-of-range shift amount for any non-zero index.)
  SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx,
                                  DAG.getConstant(4, SL, MVT::i32));

  SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);

  // Mask covering the 16 bits of the selected lane.
  SDValue BFM = DAG.getNode(ISD::SHL, SL, MVT::i32,
                            DAG.getConstant(0xffff, SL, MVT::i32),
                            ScaledIdx);

  // Move the new element into the selected lane. Without this shift the
  // value stays in the low 16 bits and is masked away for index 1.
  SDValue ShiftedVal = DAG.getNode(ISD::SHL, SL, MVT::i32, ExtVal, ScaledIdx);

  // (mask & new) | (~mask & old)
  SDValue LHS = DAG.getNode(ISD::AND, SL, MVT::i32, BFM, ShiftedVal);
  SDValue RHS = DAG.getNode(ISD::AND, SL, MVT::i32,
                            DAG.getNOT(SL, BFM, MVT::i32), BCVec);

  SDValue BFI = DAG.getNode(ISD::OR, SL, MVT::i32, LHS, RHS);
  return DAG.getNode(ISD::BITCAST, SL, Op.getValueType(), BFI);
}
|
|
|
|
|
|
|
|
/// Lower EXTRACT_VECTOR_ELT by shifting the vector, reinterpreted as an i32,
/// so that the requested element ends up in the low bits.
///
/// The vector is bitcast to i32 and narrow results are truncated to i16, so
/// this handles vectors of 16-bit elements packed into 32 bits. A constant
/// index must be 0 or 1 (asserted).
SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
                                                  SelectionDAG &DAG) const {
  SDLoc SL(Op);

  EVT ResultVT = Op.getValueType();
  SDValue Vec = Op.getOperand(0);
  SDValue Idx = Op.getOperand(1);

  DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);

  // Make sure we do any optimizations that will make it easier to fold
  // source modifiers before obscuring it with bit operations.

  // XXX - Why doesn't this get called when vector_shuffle is expanded?
  if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
    return Combined;

  if (const ConstantSDNode *CIdx = dyn_cast<ConstantSDNode>(Idx)) {
    SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);

    if (CIdx->getZExtValue() == 1) {
      // Element 1 lives in the high half.
      Result = DAG.getNode(ISD::SRL, SL, MVT::i32, Result,
                           DAG.getConstant(16, SL, MVT::i32));
    } else {
      assert(CIdx->getZExtValue() == 0);
    }

    if (ResultVT.bitsLT(MVT::i32))
      Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Result);
    return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
  }

  // Convert vector index to bit-index: elements are 16 bits wide, so the
  // bit offset is idx * 16, i.e. idx << 4. (Shifting the index left by 16
  // would produce an out-of-range shift amount for any non-zero index.)
  SDValue Log2EltBits = DAG.getConstant(4, SL, MVT::i32);
  SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, Log2EltBits);

  SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
  SDValue Elt = DAG.getNode(ISD::SRL, SL, MVT::i32, BC, ScaledIdx);

  SDValue Result = Elt;
  if (ResultVT.bitsLT(MVT::i32))
    Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Result);

  return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
}
|
|
|
|
|
2016-06-25 09:59:16 +08:00
|
|
|
bool
|
|
|
|
SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
|
2016-07-13 22:23:33 +08:00
|
|
|
// We can fold offsets for anything that doesn't require a GOT relocation.
|
2017-03-27 22:04:01 +08:00
|
|
|
return (GA->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS ||
|
|
|
|
GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS) &&
|
2016-10-21 02:12:38 +08:00
|
|
|
!shouldEmitGOTReloc(GA->getGlobal());
|
2016-06-25 09:59:16 +08:00
|
|
|
}
|
|
|
|
|
2017-01-13 22:39:03 +08:00
|
|
|
/// Build an AMDGPUISD::PC_ADD_REL_OFFSET node addressing \p GV at \p Offset.
///
/// \p GAFlags is the target flag used for the low half of the address; when
/// it is not MO_NONE the high half uses \p GAFlags + 1.
static SDValue
buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
                        const SDLoc &DL, unsigned Offset, EVT PtrVT,
                        unsigned GAFlags = SIInstrInfo::MO_NONE) {
  // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
  // lowered to the following code sequence:
  //
  // For constant address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol
  //   s_addc_u32 s1, s1, 0
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   a fixup or relocation is emitted to replace $symbol with a literal
  //   constant, which is a pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // For global address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   fixups or relocations are emitted to replace $symbol@*@lo and
  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
  //   which is a 64-bit pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // What we want here is an offset from the value returned by s_getpc
  // (which is the address of the s_add_u32 instruction) to the global
  // variable, but since the encoding of $symbol starts 4 bytes after the start
  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
  // small. This requires us to add 4 to the global variable offset in order to
  // compute the correct address.
  const unsigned AdjustedOffset = Offset + 4;

  // Without a relocation flag both halves stay unflagged; otherwise the high
  // half uses the flag that follows the low one.
  unsigned HiFlags = GAFlags;
  if (GAFlags != SIInstrInfo::MO_NONE)
    HiFlags = GAFlags + 1;

  SDValue PtrLo =
      DAG.getTargetGlobalAddress(GV, DL, MVT::i32, AdjustedOffset, GAFlags);
  SDValue PtrHi =
      DAG.getTargetGlobalAddress(GV, DL, MVT::i32, AdjustedOffset, HiFlags);
  return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
}
|
|
|
|
|
2016-07-13 22:23:33 +08:00
|
|
|
/// Lower a GlobalAddress node.
///
/// Globals outside the constant/global address spaces that are not functions
/// are handed to AMDGPUTargetLowering. Everything else is addressed
/// pc-relatively: directly (fixup or REL32 relocation), or by loading the
/// address from a GOT entry.
SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
                                             SDValue Op,
                                             SelectionDAG &DAG) const {
  GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
  const GlobalValue *GV = GSD->getGlobal();

  const unsigned AS = GSD->getAddressSpace();
  const bool IsConstOrGlobalAS = AS == AMDGPUASI.CONSTANT_ADDRESS ||
                                 AS == AMDGPUASI.GLOBAL_ADDRESS;

  // FIXME: It isn't correct to rely on the type of the pointer. This should
  // be removed when address space 0 is 64-bit.
  if (!IsConstOrGlobalAS &&
      !GV->getType()->getElementType()->isFunctionTy())
    return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);

  SDLoc DL(GSD);
  EVT PtrVT = Op.getValueType();

  if (shouldEmitFixup(GV))
    return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);

  if (shouldEmitPCReloc(GV))
    return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
                                   SIInstrInfo::MO_REL32);

  // Otherwise go through the GOT: compute the address of the GOT entry and
  // load the real address from it.
  SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
                                            SIInstrInfo::MO_GOTPCREL32);

  Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
  PointerType *PtrTy = PointerType::get(Ty, AMDGPUASI.CONSTANT_ADDRESS);
  const DataLayout &Layout = DAG.getDataLayout();
  unsigned Align = Layout.getABITypeAlignment(PtrTy);
  // FIXME: Use a PseudoSourceValue once those can be assigned an address space.
  MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));

  const auto LoadFlags = MachineMemOperand::MODereferenceable |
                         MachineMemOperand::MOInvariant;
  return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align,
                     LoadFlags);
}
|
|
|
|
|
2016-06-12 23:39:02 +08:00
|
|
|
/// Emit an SI_INIT_M0 pseudo that writes \p V to m0, chained after \p Chain.
/// Returns result 0 of the new node (its MVT::Other result).
SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
                                   const SDLoc &DL, SDValue V) const {
  // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
  // the destination register.
  //
  // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
  // so we will end up with redundant moves to m0.
  //
  // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.

  // A Null SDValue creates a glue result.
  SDNode *InitM0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other,
                                      MVT::Glue, V, Chain);
  return SDValue(InitM0, 0);
}
|
|
|
|
|
2015-12-01 05:15:45 +08:00
|
|
|
/// Load an i32 kernel argument at \p Offset and mark it, via AssertZext, as
/// being zero-extended from \p VT.
SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
                                                 SDValue Op,
                                                 MVT VT,
                                                 unsigned Offset) const {
  SDLoc SL(Op);
  SDValue Loaded = lowerKernargMemParameter(DAG, MVT::i32, MVT::i32, SL,
                                            DAG.getEntryNode(), Offset, false);

  // The local size values will have the hi 16-bits as zero.
  SDValue NarrowTy = DAG.getValueType(VT);
  return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Loaded, NarrowTy);
}
|
|
|
|
|
2017-01-13 22:39:03 +08:00
|
|
|
/// Report a "non-hsa intrinsic with hsa target" diagnostic for the current
/// function and return an undef of type \p VT as a placeholder.
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
                                        EVT VT) {
  const auto &Fn = *DAG.getMachineFunction().getFunction();
  DiagnosticInfoUnsupported Diag(Fn, "non-hsa intrinsic with hsa target",
                                 DL.getDebugLoc());
  DAG.getContext()->diagnose(Diag);
  return DAG.getUNDEF(VT);
}
|
|
|
|
|
2017-01-13 22:39:03 +08:00
|
|
|
/// Report an "intrinsic not supported on subtarget" diagnostic for the
/// current function and return an undef of type \p VT as a placeholder.
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
                                         EVT VT) {
  const auto &Fn = *DAG.getMachineFunction().getFunction();
  DiagnosticInfoUnsupported Diag(Fn, "intrinsic not supported on subtarget",
                                 DL.getDebugLoc());
  DAG.getContext()->diagnose(Diag);
  return DAG.getUNDEF(VT);
}
|
|
|
|
|
2014-07-26 14:23:37 +08:00
|
|
|
SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
|
|
|
|
SelectionDAG &DAG) const {
|
|
|
|
MachineFunction &MF = DAG.getMachineFunction();
|
2015-07-10 05:20:37 +08:00
|
|
|
auto MFI = MF.getInfo<SIMachineFunctionInfo>();
|
2014-07-26 14:23:37 +08:00
|
|
|
|
|
|
|
EVT VT = Op.getValueType();
|
|
|
|
SDLoc DL(Op);
|
|
|
|
unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
|
|
|
|
|
2015-09-17 00:31:21 +08:00
|
|
|
// TODO: Should this propagate fast-math-flags?
|
|
|
|
|
2014-07-26 14:23:37 +08:00
|
|
|
switch (IntrinsicID) {
|
2017-01-25 09:25:13 +08:00
|
|
|
case Intrinsic::amdgcn_implicit_buffer_ptr: {
|
2017-06-26 11:01:31 +08:00
|
|
|
if (getSubtarget()->isAmdCodeObjectV2(MF))
|
|
|
|
return emitNonHSAIntrinsicError(DAG, DL, VT);
|
2017-08-04 07:00:29 +08:00
|
|
|
return getPreloadedValue(DAG, *MFI, VT,
|
|
|
|
AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
|
2017-01-25 09:25:13 +08:00
|
|
|
}
|
2015-11-26 08:43:29 +08:00
|
|
|
case Intrinsic::amdgcn_dispatch_ptr:
|
2016-04-26 03:27:18 +08:00
|
|
|
case Intrinsic::amdgcn_queue_ptr: {
|
2017-01-25 09:25:13 +08:00
|
|
|
if (!Subtarget->isAmdCodeObjectV2(MF)) {
|
2016-02-02 21:52:43 +08:00
|
|
|
DiagnosticInfoUnsupported BadIntrin(
|
|
|
|
*MF.getFunction(), "unsupported hsa intrinsic without hsa target",
|
|
|
|
DL.getDebugLoc());
|
2016-01-12 05:18:33 +08:00
|
|
|
DAG.getContext()->diagnose(BadIntrin);
|
|
|
|
return DAG.getUNDEF(VT);
|
|
|
|
}
|
|
|
|
|
2017-08-04 07:00:29 +08:00
|
|
|
auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
|
|
|
|
AMDGPUFunctionArgInfo::DISPATCH_PTR : AMDGPUFunctionArgInfo::QUEUE_PTR;
|
|
|
|
return getPreloadedValue(DAG, *MFI, VT, RegID);
|
2016-04-26 03:27:18 +08:00
|
|
|
}
|
2016-06-22 04:46:20 +08:00
|
|
|
case Intrinsic::amdgcn_implicitarg_ptr: {
|
2017-07-28 23:52:08 +08:00
|
|
|
if (MFI->isEntryFunction())
|
|
|
|
return getImplicitArgPtr(DAG, DL);
|
2017-08-04 07:12:44 +08:00
|
|
|
return getPreloadedValue(DAG, *MFI, VT,
|
|
|
|
AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
|
2016-06-22 04:46:20 +08:00
|
|
|
}
|
2016-04-30 05:16:52 +08:00
|
|
|
case Intrinsic::amdgcn_kernarg_segment_ptr: {
|
2017-08-04 07:00:29 +08:00
|
|
|
return getPreloadedValue(DAG, *MFI, VT,
|
|
|
|
AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
|
2016-04-30 05:16:52 +08:00
|
|
|
}
|
2016-07-23 01:01:30 +08:00
|
|
|
case Intrinsic::amdgcn_dispatch_id: {
|
2017-08-04 07:00:29 +08:00
|
|
|
return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
|
2016-07-23 01:01:30 +08:00
|
|
|
}
|
2016-01-23 13:32:20 +08:00
|
|
|
case Intrinsic::amdgcn_rcp:
|
|
|
|
return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
|
|
|
|
case Intrinsic::amdgcn_rsq:
|
|
|
|
return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
|
2017-01-21 08:53:49 +08:00
|
|
|
case Intrinsic::amdgcn_rsq_legacy:
|
2016-06-24 14:30:11 +08:00
|
|
|
if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
|
2016-06-21 02:33:56 +08:00
|
|
|
return emitRemovedIntrinsicError(DAG, DL, VT);
|
|
|
|
|
|
|
|
return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
|
2017-01-21 08:53:49 +08:00
|
|
|
case Intrinsic::amdgcn_rcp_legacy:
|
2016-07-27 00:45:45 +08:00
|
|
|
if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
|
|
|
|
return emitRemovedIntrinsicError(DAG, DL, VT);
|
|
|
|
return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
|
2016-07-16 05:26:52 +08:00
|
|
|
case Intrinsic::amdgcn_rsq_clamp: {
|
2016-06-24 14:30:11 +08:00
|
|
|
if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
|
2016-02-13 09:03:00 +08:00
|
|
|
return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
|
2016-01-23 13:32:20 +08:00
|
|
|
|
|
|
|
Type *Type = VT.getTypeForEVT(*DAG.getContext());
|
|
|
|
APFloat Max = APFloat::getLargest(Type->getFltSemantics());
|
|
|
|
APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
|
|
|
|
|
|
|
|
SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
|
|
|
|
SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
|
|
|
|
DAG.getConstantFP(Max, DL, VT));
|
|
|
|
return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
|
|
|
|
DAG.getConstantFP(Min, DL, VT));
|
|
|
|
}
|
2014-07-26 14:23:37 +08:00
|
|
|
case Intrinsic::r600_read_ngroups_x:
|
2016-01-30 13:19:45 +08:00
|
|
|
if (Subtarget->isAmdHsaOS())
|
2016-06-21 02:33:56 +08:00
|
|
|
return emitNonHSAIntrinsicError(DAG, DL, VT);
|
2016-01-30 13:19:45 +08:00
|
|
|
|
2017-04-12 06:29:24 +08:00
|
|
|
return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
|
|
|
|
SI::KernelInputOffsets::NGROUPS_X, false);
|
2014-07-26 14:23:37 +08:00
|
|
|
case Intrinsic::r600_read_ngroups_y:
|
2016-01-30 13:19:45 +08:00
|
|
|
if (Subtarget->isAmdHsaOS())
|
2016-06-21 02:33:56 +08:00
|
|
|
return emitNonHSAIntrinsicError(DAG, DL, VT);
|
2016-01-30 13:19:45 +08:00
|
|
|
|
2017-04-12 06:29:24 +08:00
|
|
|
return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
|
|
|
|
SI::KernelInputOffsets::NGROUPS_Y, false);
|
2014-07-26 14:23:37 +08:00
|
|
|
case Intrinsic::r600_read_ngroups_z:
|
2016-01-30 13:19:45 +08:00
|
|
|
if (Subtarget->isAmdHsaOS())
|
2016-06-21 02:33:56 +08:00
|
|
|
return emitNonHSAIntrinsicError(DAG, DL, VT);
|
2016-01-30 13:19:45 +08:00
|
|
|
|
2017-04-12 06:29:24 +08:00
|
|
|
return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
|
|
|
|
SI::KernelInputOffsets::NGROUPS_Z, false);
|
2014-07-26 14:23:37 +08:00
|
|
|
case Intrinsic::r600_read_global_size_x:
|
2016-01-30 13:19:45 +08:00
|
|
|
if (Subtarget->isAmdHsaOS())
|
2016-06-21 02:33:56 +08:00
|
|
|
return emitNonHSAIntrinsicError(DAG, DL, VT);
|
2016-01-30 13:19:45 +08:00
|
|
|
|
2017-04-12 06:29:24 +08:00
|
|
|
return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
|
|
|
|
SI::KernelInputOffsets::GLOBAL_SIZE_X, false);
|
2014-07-26 14:23:37 +08:00
|
|
|
case Intrinsic::r600_read_global_size_y:
|
2016-01-30 13:19:45 +08:00
|
|
|
if (Subtarget->isAmdHsaOS())
|
2016-06-21 02:33:56 +08:00
|
|
|
return emitNonHSAIntrinsicError(DAG, DL, VT);
|
2016-01-30 13:19:45 +08:00
|
|
|
|
2017-04-12 06:29:24 +08:00
|
|
|
return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
|
|
|
|
SI::KernelInputOffsets::GLOBAL_SIZE_Y, false);
|
2014-07-26 14:23:37 +08:00
|
|
|
case Intrinsic::r600_read_global_size_z:
|
2016-01-30 13:19:45 +08:00
|
|
|
if (Subtarget->isAmdHsaOS())
|
2016-06-21 02:33:56 +08:00
|
|
|
return emitNonHSAIntrinsicError(DAG, DL, VT);
|
2016-01-30 13:19:45 +08:00
|
|
|
|
2017-04-12 06:29:24 +08:00
|
|
|
return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
|
|
|
|
SI::KernelInputOffsets::GLOBAL_SIZE_Z, false);
|
2014-07-26 14:23:37 +08:00
|
|
|
case Intrinsic::r600_read_local_size_x:
|
2016-01-30 13:19:45 +08:00
|
|
|
if (Subtarget->isAmdHsaOS())
|
2016-06-21 02:33:56 +08:00
|
|
|
return emitNonHSAIntrinsicError(DAG, DL, VT);
|
2016-01-30 13:19:45 +08:00
|
|
|
|
2015-12-01 05:15:45 +08:00
|
|
|
return lowerImplicitZextParam(DAG, Op, MVT::i16,
|
|
|
|
SI::KernelInputOffsets::LOCAL_SIZE_X);
|
2014-07-26 14:23:37 +08:00
|
|
|
case Intrinsic::r600_read_local_size_y:
|
2016-01-30 13:19:45 +08:00
|
|
|
if (Subtarget->isAmdHsaOS())
|
2016-06-21 02:33:56 +08:00
|
|
|
return emitNonHSAIntrinsicError(DAG, DL, VT);
|
2016-01-30 13:19:45 +08:00
|
|
|
|
2015-12-01 05:15:45 +08:00
|
|
|
return lowerImplicitZextParam(DAG, Op, MVT::i16,
|
|
|
|
SI::KernelInputOffsets::LOCAL_SIZE_Y);
|
2014-07-26 14:23:37 +08:00
|
|
|
case Intrinsic::r600_read_local_size_z:
|
2016-01-30 13:19:45 +08:00
|
|
|
if (Subtarget->isAmdHsaOS())
|
2016-06-21 02:33:56 +08:00
|
|
|
return emitNonHSAIntrinsicError(DAG, DL, VT);
|
2016-01-30 13:19:45 +08:00
|
|
|
|
2015-12-01 05:15:45 +08:00
|
|
|
return lowerImplicitZextParam(DAG, Op, MVT::i16,
|
|
|
|
SI::KernelInputOffsets::LOCAL_SIZE_Z);
|
2016-01-30 12:25:19 +08:00
|
|
|
case Intrinsic::amdgcn_workgroup_id_x:
|
2014-07-26 14:23:37 +08:00
|
|
|
case Intrinsic::r600_read_tgid_x:
|
2017-08-04 07:00:29 +08:00
|
|
|
return getPreloadedValue(DAG, *MFI, VT,
|
|
|
|
AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
|
2016-01-30 12:25:19 +08:00
|
|
|
case Intrinsic::amdgcn_workgroup_id_y:
|
2014-07-26 14:23:37 +08:00
|
|
|
case Intrinsic::r600_read_tgid_y:
|
2017-08-04 07:00:29 +08:00
|
|
|
return getPreloadedValue(DAG, *MFI, VT,
|
|
|
|
AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
|
2016-01-30 12:25:19 +08:00
|
|
|
case Intrinsic::amdgcn_workgroup_id_z:
|
2014-07-26 14:23:37 +08:00
|
|
|
case Intrinsic::r600_read_tgid_z:
|
2017-08-04 07:00:29 +08:00
|
|
|
return getPreloadedValue(DAG, *MFI, VT,
|
|
|
|
AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
|
|
|
|
case Intrinsic::amdgcn_workitem_id_x: {
|
2014-07-26 14:23:37 +08:00
|
|
|
case Intrinsic::r600_read_tidig_x:
|
2017-08-04 07:00:29 +08:00
|
|
|
return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
|
|
|
|
SDLoc(DAG.getEntryNode()),
|
|
|
|
MFI->getArgInfo().WorkItemIDX);
|
|
|
|
}
|
2016-01-30 12:25:19 +08:00
|
|
|
case Intrinsic::amdgcn_workitem_id_y:
|
2014-07-26 14:23:37 +08:00
|
|
|
case Intrinsic::r600_read_tidig_y:
|
2017-08-04 07:00:29 +08:00
|
|
|
return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
|
|
|
|
SDLoc(DAG.getEntryNode()),
|
|
|
|
MFI->getArgInfo().WorkItemIDY);
|
2016-01-30 12:25:19 +08:00
|
|
|
case Intrinsic::amdgcn_workitem_id_z:
|
2014-07-26 14:23:37 +08:00
|
|
|
case Intrinsic::r600_read_tidig_z:
|
2017-08-04 07:00:29 +08:00
|
|
|
return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
|
|
|
|
SDLoc(DAG.getEntryNode()),
|
|
|
|
MFI->getArgInfo().WorkItemIDZ);
|
2014-07-26 14:23:37 +08:00
|
|
|
case AMDGPUIntrinsic::SI_load_const: {
|
|
|
|
SDValue Ops[] = {
|
|
|
|
Op.getOperand(1),
|
|
|
|
Op.getOperand(2)
|
|
|
|
};
|
|
|
|
|
|
|
|
MachineMemOperand *MMO = MF.getMachineMemOperand(
|
[CodeGen] Split out the notions of MI invariance and MI dereferenceability.
Summary:
An IR load can be invariant, dereferenceable, neither, or both. But
currently, MI's notion of invariance is IR-invariant &&
IR-dereferenceable.
This patch splits up the notions of invariance and dereferenceability at
the MI level. It's NFC, so adds some probably-unnecessary
"is-dereferenceable" checks, which we can remove later if desired.
Reviewers: chandlerc, tstellarAMD
Subscribers: jholewinski, arsenm, nemanjai, llvm-commits
Differential Revision: https://reviews.llvm.org/D23371
llvm-svn: 281151
2016-09-11 09:38:58 +08:00
|
|
|
MachinePointerInfo(),
|
|
|
|
MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
|
|
|
|
MachineMemOperand::MOInvariant,
|
|
|
|
VT.getStoreSize(), 4);
|
2014-07-26 14:23:37 +08:00
|
|
|
return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
|
|
|
|
Op->getVTList(), Ops, VT, MMO);
|
|
|
|
}
|
2017-03-18 04:41:45 +08:00
|
|
|
case Intrinsic::amdgcn_fdiv_fast:
|
2016-07-20 07:16:53 +08:00
|
|
|
return lowerFDIV_FAST(Op, DAG);
|
2016-12-07 07:52:13 +08:00
|
|
|
case Intrinsic::amdgcn_interp_mov: {
|
|
|
|
SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
|
|
|
|
SDValue Glue = M0.getValue(1);
|
|
|
|
return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, Op.getOperand(1),
|
|
|
|
Op.getOperand(2), Op.getOperand(3), Glue);
|
|
|
|
}
|
2015-12-16 01:02:49 +08:00
|
|
|
case Intrinsic::amdgcn_interp_p1: {
|
|
|
|
SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
|
|
|
|
SDValue Glue = M0.getValue(1);
|
|
|
|
return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1),
|
|
|
|
Op.getOperand(2), Op.getOperand(3), Glue);
|
|
|
|
}
|
|
|
|
case Intrinsic::amdgcn_interp_p2: {
|
|
|
|
SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5));
|
|
|
|
SDValue Glue = SDValue(M0.getNode(), 1);
|
|
|
|
return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1),
|
|
|
|
Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
|
|
|
|
Glue);
|
|
|
|
}
|
2016-02-13 09:19:56 +08:00
|
|
|
case Intrinsic::amdgcn_sin:
|
|
|
|
return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
|
|
|
|
|
|
|
|
case Intrinsic::amdgcn_cos:
|
|
|
|
return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
|
|
|
|
|
|
|
|
case Intrinsic::amdgcn_log_clamp: {
|
2016-06-24 14:30:11 +08:00
|
|
|
if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
|
2016-02-13 09:19:56 +08:00
|
|
|
return SDValue();
|
|
|
|
|
|
|
|
DiagnosticInfoUnsupported BadIntrin(
|
|
|
|
*MF.getFunction(), "intrinsic not supported on subtarget",
|
|
|
|
DL.getDebugLoc());
|
|
|
|
DAG.getContext()->diagnose(BadIntrin);
|
|
|
|
return DAG.getUNDEF(VT);
|
|
|
|
}
|
2016-01-23 13:32:20 +08:00
|
|
|
case Intrinsic::amdgcn_ldexp:
|
|
|
|
return DAG.getNode(AMDGPUISD::LDEXP, DL, VT,
|
|
|
|
Op.getOperand(1), Op.getOperand(2));
|
2016-05-28 08:19:52 +08:00
|
|
|
|
|
|
|
case Intrinsic::amdgcn_fract:
|
|
|
|
return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
|
|
|
|
|
2016-01-23 13:32:20 +08:00
|
|
|
case Intrinsic::amdgcn_class:
|
|
|
|
return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
|
|
|
|
Op.getOperand(1), Op.getOperand(2));
|
|
|
|
case Intrinsic::amdgcn_div_fmas:
|
|
|
|
return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
|
|
|
|
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
|
|
|
|
Op.getOperand(4));
|
|
|
|
|
|
|
|
case Intrinsic::amdgcn_div_fixup:
|
|
|
|
return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
|
|
|
|
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
|
|
|
|
|
|
|
|
case Intrinsic::amdgcn_trig_preop:
|
|
|
|
return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
|
|
|
|
Op.getOperand(1), Op.getOperand(2));
|
|
|
|
case Intrinsic::amdgcn_div_scale: {
|
|
|
|
// 3rd parameter required to be a constant.
|
|
|
|
const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3));
|
|
|
|
if (!Param)
|
2017-08-02 04:49:41 +08:00
|
|
|
return DAG.getMergeValues({ DAG.getUNDEF(VT), DAG.getUNDEF(MVT::i1) }, DL);
|
2016-01-23 13:32:20 +08:00
|
|
|
|
|
|
|
// Translate to the operands expected by the machine instruction. The
|
|
|
|
// first parameter must be the same as the first instruction.
|
|
|
|
SDValue Numerator = Op.getOperand(1);
|
|
|
|
SDValue Denominator = Op.getOperand(2);
|
|
|
|
|
|
|
|
// Note this order is opposite of the machine instruction's operations,
|
|
|
|
// which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
|
|
|
|
// intrinsic has the numerator as the first operand to match a normal
|
|
|
|
// division operation.
|
|
|
|
|
|
|
|
SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;
|
|
|
|
|
|
|
|
return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
|
|
|
|
Denominator, Numerator);
|
|
|
|
}
|
2016-07-29 00:42:13 +08:00
|
|
|
case Intrinsic::amdgcn_icmp: {
|
|
|
|
const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3));
|
2017-02-18 03:49:10 +08:00
|
|
|
if (!CD)
|
|
|
|
return DAG.getUNDEF(VT);
|
2016-07-29 00:42:13 +08:00
|
|
|
|
2017-02-18 03:49:10 +08:00
|
|
|
int CondCode = CD->getSExtValue();
|
2016-07-29 00:42:13 +08:00
|
|
|
if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE ||
|
2017-02-18 03:49:10 +08:00
|
|
|
CondCode > ICmpInst::Predicate::LAST_ICMP_PREDICATE)
|
2016-07-29 00:42:13 +08:00
|
|
|
return DAG.getUNDEF(VT);
|
|
|
|
|
2016-08-22 08:58:04 +08:00
|
|
|
ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
|
2016-07-29 00:42:13 +08:00
|
|
|
ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
|
|
|
|
return DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1),
|
|
|
|
Op.getOperand(2), DAG.getCondCode(CCOpcode));
|
|
|
|
}
|
|
|
|
case Intrinsic::amdgcn_fcmp: {
|
|
|
|
const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3));
|
2017-02-18 03:49:10 +08:00
|
|
|
if (!CD)
|
|
|
|
return DAG.getUNDEF(VT);
|
2016-07-29 00:42:13 +08:00
|
|
|
|
2017-02-18 03:49:10 +08:00
|
|
|
int CondCode = CD->getSExtValue();
|
|
|
|
if (CondCode < FCmpInst::Predicate::FIRST_FCMP_PREDICATE ||
|
|
|
|
CondCode > FCmpInst::Predicate::LAST_FCMP_PREDICATE)
|
2016-07-29 00:42:13 +08:00
|
|
|
return DAG.getUNDEF(VT);
|
|
|
|
|
2016-08-22 08:58:04 +08:00
|
|
|
FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
|
2016-07-29 00:42:13 +08:00
|
|
|
ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
|
|
|
|
return DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1),
|
|
|
|
Op.getOperand(2), DAG.getCondCode(CCOpcode));
|
|
|
|
}
|
2017-01-31 11:07:46 +08:00
|
|
|
case Intrinsic::amdgcn_fmed3:
|
|
|
|
return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
|
|
|
|
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
|
2016-07-27 00:45:45 +08:00
|
|
|
case Intrinsic::amdgcn_fmul_legacy:
|
|
|
|
return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
|
|
|
|
Op.getOperand(1), Op.getOperand(2));
|
2016-07-19 02:35:05 +08:00
|
|
|
case Intrinsic::amdgcn_sffbh:
|
|
|
|
return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
|
2017-02-23 07:04:58 +08:00
|
|
|
case Intrinsic::amdgcn_sbfe:
|
|
|
|
return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
|
|
|
|
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
|
|
|
|
case Intrinsic::amdgcn_ubfe:
|
|
|
|
return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
|
|
|
|
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
|
2017-02-22 08:27:34 +08:00
|
|
|
case Intrinsic::amdgcn_cvt_pkrtz: {
|
|
|
|
// FIXME: Stop adding cast if v2f16 legal.
|
|
|
|
EVT VT = Op.getValueType();
|
|
|
|
SDValue Node = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, DL, MVT::i32,
|
|
|
|
Op.getOperand(1), Op.getOperand(2));
|
|
|
|
return DAG.getNode(ISD::BITCAST, DL, VT, Node);
|
|
|
|
}
|
[AMDGPU] Add an llvm.amdgcn.wqm intrinsic for WQM
Summary:
Previously, we assumed that certain types of instructions needed WQM in
pixel shaders, particularly DS instructions and image sampling
instructions. This was ok because with OpenGL, the assumption was
correct. But we want to start using DPP instructions for derivatives as
well as other things, so the assumption that we can infer whether to use
WQM based on the instruction won't continue to hold. This intrinsic lets
frontends like Mesa indicate what things need WQM based on their
knowledge of the API, rather than second-guessing them in the backend.
We need to keep around the old method of enabling WQM, but eventually we
should remove it once Mesa catches up. For now, this will let us use DPP
instructions for computing derivatives correctly.
Reviewers: arsenm, tpr, nhaehnle
Subscribers: kzhuravl, wdng, yaxunl, dstuttard, llvm-commits, t-tye
Differential Revision: https://reviews.llvm.org/D35167
llvm-svn: 310085
2017-08-05 02:36:49 +08:00
|
|
|
case Intrinsic::amdgcn_wqm: {
|
|
|
|
SDValue Src = Op.getOperand(1);
|
|
|
|
return SDValue(DAG.getMachineNode(AMDGPU::WQM, DL, Src.getValueType(), Src),
|
|
|
|
0);
|
[AMDGPU] Add support for Whole Wavefront Mode
Summary:
Whole Wavefront Wode (WWM) is similar to WQM, except that all of the
lanes are always enabled, regardless of control flow. This is required
for implementing wavefront reductions in non-uniform control flow, where
we need to use the inactive lanes to propagate intermediate results, so
they need to be enabled. We need to propagate WWM to uses (unless
they're explicitly marked as exact) so that they also propagate
intermediate results correctly. We do the analysis and exec mask munging
during the WQM pass, since there are interactions with WQM for things
that require both WQM and WWM. For simplicity, WWM is entirely
block-local -- blocks are never WWM on entry or exit of a block, and WWM
is not propagated to the block level. This means that computations
involving WWM cannot involve control flow, but we only ever plan to use
WWM for a few limited purposes (none of which involve control flow)
anyways.
Shaders can ask for WWM using the @llvm.amdgcn.wwm intrinsic. There
isn't yet a way to turn WWM off -- that will be added in a future
change.
Finally, it turns out that turning on inactive lanes causes a number of
problems with register allocation. While the best long-term solution
seems like teaching LLVM's register allocator about predication, for now
we need to add some hacks to prevent ourselves from getting into trouble
due to constraints that aren't currently expressed in LLVM. For the gory
details, see the comments at the top of SIFixWWMLiveness.cpp.
Reviewers: arsenm, nhaehnle, tpr
Subscribers: kzhuravl, wdng, mgorny, yaxunl, dstuttard, t-tye, llvm-commits
Differential Revision: https://reviews.llvm.org/D35524
llvm-svn: 310087
2017-08-05 02:36:52 +08:00
|
|
|
}
|
|
|
|
case Intrinsic::amdgcn_wwm: {
|
|
|
|
SDValue Src = Op.getOperand(1);
|
|
|
|
return SDValue(DAG.getMachineNode(AMDGPU::WWM, DL, Src.getValueType(), Src),
|
|
|
|
0);
|
[AMDGPU] Add an llvm.amdgcn.wqm intrinsic for WQM
Summary:
Previously, we assumed that certain types of instructions needed WQM in
pixel shaders, particularly DS instructions and image sampling
instructions. This was ok because with OpenGL, the assumption was
correct. But we want to start using DPP instructions for derivatives as
well as other things, so the assumption that we can infer whether to use
WQM based on the instruction won't continue to hold. This intrinsic lets
frontends like Mesa indicate what things need WQM based on their
knowledge of the API, rather than second-guessing them in the backend.
We need to keep around the old method of enabling WQM, but eventually we
should remove it once Mesa catches up. For now, this will let us use DPP
instructions for computing derivatives correctly.
Reviewers: arsenm, tpr, nhaehnle
Subscribers: kzhuravl, wdng, yaxunl, dstuttard, llvm-commits, t-tye
Differential Revision: https://reviews.llvm.org/D35167
llvm-svn: 310085
2017-08-05 02:36:49 +08:00
|
|
|
}
|
2014-07-26 14:23:37 +08:00
|
|
|
default:
|
2017-04-04 02:08:08 +08:00
|
|
|
return Op;
|
2014-07-26 14:23:37 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-04-12 22:05:04 +08:00
|
|
|
/// Custom-lower an intrinsic node that carries a chain.
///
/// Operand 0 of \p Op is the chain and operand 1 the intrinsic ID; the
/// intrinsic's own arguments start at operand 2. Atomic inc/dec and the
/// (t)buffer loads become target memory-intrinsic nodes. For the image sample
/// / getlod intrinsics, a dmask (operand 5) that is either not a constant or
/// is zero yields an undef result merged with the incoming chain.
///
/// \returns the replacement value, or a null SDValue for intrinsic IDs not
/// handled here and for image intrinsics with a non-zero constant dmask.
SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
                                                 SelectionDAG &DAG) const {
  unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
  SDLoc DL(Op);
  MachineFunction &MF = DAG.getMachineFunction();

  switch (IntrID) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec: {
    MemSDNode *M = cast<MemSDNode>(Op);
    unsigned Opc = (IntrID == Intrinsic::amdgcn_atomic_inc) ?
      AMDGPUISD::ATOMIC_INC : AMDGPUISD::ATOMIC_DEC;
    SDValue Ops[] = {
      M->getOperand(0), // Chain
      M->getOperand(2), // Ptr
      M->getOperand(3)  // Value
    };

    // Reuse the memory type and memory operand already attached to the node.
    return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_buffer_load:
  case Intrinsic::amdgcn_buffer_load_format: {
    SDValue Ops[] = {
      Op.getOperand(0), // Chain
      Op.getOperand(2), // rsrc
      Op.getOperand(3), // vindex
      Op.getOperand(4), // offset
      Op.getOperand(5), // glc
      Op.getOperand(6)  // slc
    };
    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

    unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
        AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
    EVT VT = Op.getValueType();
    EVT IntVT = VT.changeTypeToInteger();

    MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(MFI->getBufferPSV()),
      MachineMemOperand::MOLoad,
      VT.getStoreSize(), VT.getStoreSize());

    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, MMO);
  }
  case Intrinsic::amdgcn_tbuffer_load: {
    SDValue Ops[] = {
      Op.getOperand(0),  // Chain
      Op.getOperand(2),  // rsrc
      Op.getOperand(3),  // vindex
      Op.getOperand(4),  // voffset
      Op.getOperand(5),  // soffset
      Op.getOperand(6),  // offset
      Op.getOperand(7),  // dfmt
      Op.getOperand(8),  // nfmt
      Op.getOperand(9),  // glc
      Op.getOperand(10)  // slc
    };

    // Describe the memory access with the type of the loaded value. This used
    // to take the type of operand 2, which is the resource descriptor (see
    // Ops above), so the memory operand's size and the node's memory VT did
    // not reflect the data actually loaded.
    EVT VT = Op.getValueType();

    MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad,
      VT.getStoreSize(), VT.getStoreSize());
    return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
                                   Op->getVTList(), Ops, VT, MMO);
  }
  // Basic sample.
  case Intrinsic::amdgcn_image_sample:
  case Intrinsic::amdgcn_image_sample_cl:
  case Intrinsic::amdgcn_image_sample_d:
  case Intrinsic::amdgcn_image_sample_d_cl:
  case Intrinsic::amdgcn_image_sample_l:
  case Intrinsic::amdgcn_image_sample_b:
  case Intrinsic::amdgcn_image_sample_b_cl:
  case Intrinsic::amdgcn_image_sample_lz:
  case Intrinsic::amdgcn_image_sample_cd:
  case Intrinsic::amdgcn_image_sample_cd_cl:

  // Sample with comparison.
  case Intrinsic::amdgcn_image_sample_c:
  case Intrinsic::amdgcn_image_sample_c_cl:
  case Intrinsic::amdgcn_image_sample_c_d:
  case Intrinsic::amdgcn_image_sample_c_d_cl:
  case Intrinsic::amdgcn_image_sample_c_l:
  case Intrinsic::amdgcn_image_sample_c_b:
  case Intrinsic::amdgcn_image_sample_c_b_cl:
  case Intrinsic::amdgcn_image_sample_c_lz:
  case Intrinsic::amdgcn_image_sample_c_cd:
  case Intrinsic::amdgcn_image_sample_c_cd_cl:

  // Sample with offsets.
  case Intrinsic::amdgcn_image_sample_o:
  case Intrinsic::amdgcn_image_sample_cl_o:
  case Intrinsic::amdgcn_image_sample_d_o:
  case Intrinsic::amdgcn_image_sample_d_cl_o:
  case Intrinsic::amdgcn_image_sample_l_o:
  case Intrinsic::amdgcn_image_sample_b_o:
  case Intrinsic::amdgcn_image_sample_b_cl_o:
  case Intrinsic::amdgcn_image_sample_lz_o:
  case Intrinsic::amdgcn_image_sample_cd_o:
  case Intrinsic::amdgcn_image_sample_cd_cl_o:

  // Sample with comparison and offsets.
  case Intrinsic::amdgcn_image_sample_c_o:
  case Intrinsic::amdgcn_image_sample_c_cl_o:
  case Intrinsic::amdgcn_image_sample_c_d_o:
  case Intrinsic::amdgcn_image_sample_c_d_cl_o:
  case Intrinsic::amdgcn_image_sample_c_l_o:
  case Intrinsic::amdgcn_image_sample_c_b_o:
  case Intrinsic::amdgcn_image_sample_c_b_cl_o:
  case Intrinsic::amdgcn_image_sample_c_lz_o:
  case Intrinsic::amdgcn_image_sample_c_cd_o:
  case Intrinsic::amdgcn_image_sample_c_cd_cl_o:

  case Intrinsic::amdgcn_image_getlod: {
    // Replace dmask with everything disabled with undef.
    const ConstantSDNode *DMask = dyn_cast<ConstantSDNode>(Op.getOperand(5));
    if (!DMask || DMask->isNullValue()) {
      SDValue Undef = DAG.getUNDEF(Op.getValueType());
      return DAG.getMergeValues({ Undef, Op.getOperand(0) }, DL);
    }

    return SDValue();
  }
  default:
    return SDValue();
  }
}
|
|
|
|
|
2014-07-26 14:23:37 +08:00
|
|
|
SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
|
|
|
|
SelectionDAG &DAG) const {
|
2015-05-12 22:18:14 +08:00
|
|
|
SDLoc DL(Op);
|
2014-07-26 14:23:37 +08:00
|
|
|
SDValue Chain = Op.getOperand(0);
|
|
|
|
unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
|
2017-06-23 00:29:22 +08:00
|
|
|
MachineFunction &MF = DAG.getMachineFunction();
|
2014-07-26 14:23:37 +08:00
|
|
|
|
|
|
|
switch (IntrinsicID) {
|
2017-02-22 06:50:41 +08:00
|
|
|
case Intrinsic::amdgcn_exp: {
|
2017-01-17 15:26:53 +08:00
|
|
|
const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
|
|
|
|
const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
|
|
|
|
const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(8));
|
|
|
|
const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(9));
|
|
|
|
|
|
|
|
const SDValue Ops[] = {
|
|
|
|
Chain,
|
|
|
|
DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
|
|
|
|
DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), // en
|
|
|
|
Op.getOperand(4), // src0
|
|
|
|
Op.getOperand(5), // src1
|
|
|
|
Op.getOperand(6), // src2
|
|
|
|
Op.getOperand(7), // src3
|
|
|
|
DAG.getTargetConstant(0, DL, MVT::i1), // compr
|
|
|
|
DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
|
|
|
|
};
|
|
|
|
|
|
|
|
unsigned Opc = Done->isNullValue() ?
|
|
|
|
AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
|
|
|
|
return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
|
|
|
|
}
|
|
|
|
case Intrinsic::amdgcn_exp_compr: {
|
|
|
|
const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
|
|
|
|
const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
|
|
|
|
SDValue Src0 = Op.getOperand(4);
|
|
|
|
SDValue Src1 = Op.getOperand(5);
|
|
|
|
const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
|
|
|
|
const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(7));
|
|
|
|
|
|
|
|
SDValue Undef = DAG.getUNDEF(MVT::f32);
|
|
|
|
const SDValue Ops[] = {
|
|
|
|
Chain,
|
|
|
|
DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
|
|
|
|
DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), // en
|
|
|
|
DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0),
|
|
|
|
DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1),
|
|
|
|
Undef, // src2
|
|
|
|
Undef, // src3
|
|
|
|
DAG.getTargetConstant(1, DL, MVT::i1), // compr
|
|
|
|
DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
|
|
|
|
};
|
|
|
|
|
|
|
|
unsigned Opc = Done->isNullValue() ?
|
|
|
|
AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
|
|
|
|
return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
|
|
|
|
}
|
|
|
|
case Intrinsic::amdgcn_s_sendmsg:
|
2017-02-16 10:01:17 +08:00
|
|
|
case Intrinsic::amdgcn_s_sendmsghalt: {
|
|
|
|
unsigned NodeOp = (IntrinsicID == Intrinsic::amdgcn_s_sendmsg) ?
|
|
|
|
AMDGPUISD::SENDMSG : AMDGPUISD::SENDMSGHALT;
|
2015-05-12 22:18:14 +08:00
|
|
|
Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3));
|
|
|
|
SDValue Glue = Chain.getValue(1);
|
2017-02-16 06:17:09 +08:00
|
|
|
return DAG.getNode(NodeOp, DL, MVT::Other, Chain,
|
2017-01-05 02:06:55 +08:00
|
|
|
Op.getOperand(2), Glue);
|
|
|
|
}
|
AMDGPU: Add new amdgcn.init.exec intrinsics
v2: More tests, bug fixes, cosmetic changes.
Subscribers: arsenm, kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr, llvm-commits, t-tye
Differential Revision: https://reviews.llvm.org/D31762
llvm-svn: 301677
2017-04-29 04:21:58 +08:00
|
|
|
case Intrinsic::amdgcn_init_exec: {
|
|
|
|
return DAG.getNode(AMDGPUISD::INIT_EXEC, DL, MVT::Other, Chain,
|
|
|
|
Op.getOperand(2));
|
|
|
|
}
|
|
|
|
case Intrinsic::amdgcn_init_exec_from_input: {
|
|
|
|
return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain,
|
|
|
|
Op.getOperand(2), Op.getOperand(3));
|
|
|
|
}
|
2016-07-13 14:04:22 +08:00
|
|
|
case AMDGPUIntrinsic::AMDGPU_kill: {
|
2016-07-20 00:27:56 +08:00
|
|
|
SDValue Src = Op.getOperand(2);
|
|
|
|
if (const ConstantFPSDNode *K = dyn_cast<ConstantFPSDNode>(Src)) {
|
2016-07-13 14:04:22 +08:00
|
|
|
if (!K->isNegative())
|
|
|
|
return Chain;
|
2016-07-20 00:27:56 +08:00
|
|
|
|
|
|
|
SDValue NegOne = DAG.getTargetConstant(FloatToBits(-1.0f), DL, MVT::i32);
|
|
|
|
return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, NegOne);
|
2016-07-13 14:04:22 +08:00
|
|
|
}
|
|
|
|
|
2016-07-20 00:27:56 +08:00
|
|
|
SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Src);
|
|
|
|
return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, Cast);
|
2016-07-13 14:04:22 +08:00
|
|
|
}
|
2017-04-07 00:48:30 +08:00
|
|
|
case Intrinsic::amdgcn_s_barrier: {
|
|
|
|
if (getTargetMachine().getOptLevel() > CodeGenOpt::None) {
|
|
|
|
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
|
|
|
|
unsigned WGSize = ST.getFlatWorkGroupSizes(*MF.getFunction()).second;
|
|
|
|
if (WGSize <= ST.getWavefrontSize())
|
|
|
|
return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
|
|
|
|
Op.getOperand(0)), 0);
|
|
|
|
}
|
|
|
|
return SDValue();
|
|
|
|
};
|
2017-06-23 00:29:22 +08:00
|
|
|
case AMDGPUIntrinsic::SI_tbuffer_store: {
|
|
|
|
|
|
|
|
// Extract vindex and voffset from vaddr as appropriate
|
|
|
|
const ConstantSDNode *OffEn = cast<ConstantSDNode>(Op.getOperand(10));
|
|
|
|
const ConstantSDNode *IdxEn = cast<ConstantSDNode>(Op.getOperand(11));
|
|
|
|
SDValue VAddr = Op.getOperand(5);
|
|
|
|
|
|
|
|
SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
|
|
|
|
|
|
|
|
assert(!(OffEn->isOne() && IdxEn->isOne()) &&
|
|
|
|
"Legacy intrinsic doesn't support both offset and index - use new version");
|
|
|
|
|
|
|
|
SDValue VIndex = IdxEn->isOne() ? VAddr : Zero;
|
|
|
|
SDValue VOffset = OffEn->isOne() ? VAddr : Zero;
|
|
|
|
|
|
|
|
// Deal with the vec-3 case
|
|
|
|
const ConstantSDNode *NumChannels = cast<ConstantSDNode>(Op.getOperand(4));
|
|
|
|
auto Opcode = NumChannels->getZExtValue() == 3 ?
|
|
|
|
AMDGPUISD::TBUFFER_STORE_FORMAT_X3 : AMDGPUISD::TBUFFER_STORE_FORMAT;
|
|
|
|
|
|
|
|
SDValue Ops[] = {
|
|
|
|
Chain,
|
|
|
|
Op.getOperand(3), // vdata
|
|
|
|
Op.getOperand(2), // rsrc
|
|
|
|
VIndex,
|
|
|
|
VOffset,
|
|
|
|
Op.getOperand(6), // soffset
|
|
|
|
Op.getOperand(7), // inst_offset
|
|
|
|
Op.getOperand(8), // dfmt
|
|
|
|
Op.getOperand(9), // nfmt
|
|
|
|
Op.getOperand(12), // glc
|
|
|
|
Op.getOperand(13), // slc
|
|
|
|
};
|
|
|
|
|
2017-06-23 01:15:49 +08:00
|
|
|
assert((cast<ConstantSDNode>(Op.getOperand(14)))->getZExtValue() == 0 &&
|
2017-06-23 00:29:22 +08:00
|
|
|
"Value of tfe other than zero is unsupported");
|
|
|
|
|
|
|
|
EVT VT = Op.getOperand(3).getValueType();
|
|
|
|
MachineMemOperand *MMO = MF.getMachineMemOperand(
|
|
|
|
MachinePointerInfo(),
|
|
|
|
MachineMemOperand::MOStore,
|
|
|
|
VT.getStoreSize(), 4);
|
|
|
|
return DAG.getMemIntrinsicNode(Opcode, DL,
|
|
|
|
Op->getVTList(), Ops, VT, MMO);
|
|
|
|
}
|
|
|
|
|
|
|
|
case Intrinsic::amdgcn_tbuffer_store: {
|
|
|
|
SDValue Ops[] = {
|
|
|
|
Chain,
|
|
|
|
Op.getOperand(2), // vdata
|
|
|
|
Op.getOperand(3), // rsrc
|
|
|
|
Op.getOperand(4), // vindex
|
|
|
|
Op.getOperand(5), // voffset
|
|
|
|
Op.getOperand(6), // soffset
|
|
|
|
Op.getOperand(7), // offset
|
|
|
|
Op.getOperand(8), // dfmt
|
|
|
|
Op.getOperand(9), // nfmt
|
|
|
|
Op.getOperand(10), // glc
|
|
|
|
Op.getOperand(11) // slc
|
|
|
|
};
|
|
|
|
EVT VT = Op.getOperand(3).getValueType();
|
|
|
|
MachineMemOperand *MMO = MF.getMachineMemOperand(
|
|
|
|
MachinePointerInfo(),
|
|
|
|
MachineMemOperand::MOStore,
|
|
|
|
VT.getStoreSize(), 4);
|
|
|
|
return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL,
|
|
|
|
Op->getVTList(), Ops, VT, MMO);
|
|
|
|
}
|
|
|
|
|
2014-07-26 14:23:37 +08:00
|
|
|
default:
|
2017-04-04 02:08:08 +08:00
|
|
|
return Op;
|
2014-07-26 14:23:37 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-11-14 07:36:50 +08:00
|
|
|
/// Custom lowering for load nodes.
///
/// Non-extending loads narrower than 32 bits are rewritten as a 32-bit
/// extending load followed by a truncate (unless the type is a legal i16).
/// Vector loads are either left for default handling (an empty SDValue is
/// returned), expanded as an unaligned access, scalarized, or split, depending
/// on the address space and the number of elements.
///
/// \returns a merged {value, chain} pair for rewritten loads, the result of
/// the split/scalarize helpers, or an empty SDValue when no custom lowering is
/// applied.
SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  const ISD::LoadExtType ExtKind = Load->getExtensionType();
  const EVT MemVT = Load->getMemoryVT();

  if (ExtKind == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
    // A legal i16 load is left untouched.
    if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
      return SDValue();

    // FIXME: Copied from PPC
    // Perform a 32-bit any-extending load from an i8 (for i1) or i16 memory
    // type, then truncate the result back to the original memory type.
    EVT WideMemVT;
    if (MemVT == MVT::i1)
      WideMemVT = MVT::i8;
    else
      WideMemVT = MVT::i16;

    SDValue WideLoad =
        DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Load->getChain(),
                       Load->getBasePtr(), WideMemVT, Load->getMemOperand());
    SDValue Narrowed = DAG.getNode(ISD::TRUNCATE, DL, MemVT, WideLoad);

    // Result 0 is the loaded value, result 1 is the new load's chain.
    SDValue Results[] = {Narrowed, WideLoad.getValue(1)};
    return DAG.getMergeValues(Results, DL);
  }

  // Everything below only concerns vector loads.
  if (!MemVT.isVector())
    return SDValue();

  assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
         "Custom lowering for non-i32 vectors hasn't been implemented.");

  unsigned AS = Load->getAddressSpace();
  if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
                          AS, Load->getAlignment())) {
    const std::pair<SDValue, SDValue> Expanded = expandUnalignedLoad(Load, DAG);
    SDValue Results[] = {Expanded.first, Expanded.second};
    return DAG.getMergeValues(Results, DL);
  }

  // If there is a possibility that a flat instruction accesses scratch memory
  // then the legalization rules for private must be used; otherwise flat is
  // treated like global.
  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (AS == AMDGPUASI.FLAT_ADDRESS) {
    if (FuncInfo->hasFlatScratchInit())
      AS = AMDGPUASI.PRIVATE_ADDRESS;
    else
      AS = AMDGPUASI.GLOBAL_ADDRESS;
  }

  const unsigned NumElts = MemVT.getVectorNumElements();
  const bool IsConstant = AS == AMDGPUASI.CONSTANT_ADDRESS;
  const bool IsGlobal = AS == AMDGPUASI.GLOBAL_ADDRESS;

  // Uniform constant loads need no custom lowering. Non-uniform loads will be
  // selected to MUBUF instructions, so they have the same legalization
  // requirements as global and private loads and fall through.
  if (IsConstant && isMemOpUniform(Load))
    return SDValue();

  // Likewise for uniform, non-volatile, unclobbered constant/global loads
  // when the subtarget asks for global loads to be scalarized.
  if ((IsConstant || IsGlobal) &&
      Subtarget->getScalarizeGlobalBehavior() && isMemOpUniform(Load) &&
      !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load))
    return SDValue();

  if (IsConstant || IsGlobal || AS == AMDGPUASI.FLAT_ADDRESS) {
    // Loads of up to four elements are supported directly; wider ones are
    // split.
    if (NumElts > 4)
      return SplitVectorLoad(Op, DAG);
    return SDValue();
  }

  if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
    // Depending on the setting of the private_element_size field in the
    // resource descriptor, private accesses are limited to a certain size.
    const unsigned MaxBytes = Subtarget->getMaxPrivateElementSize();
    switch (MaxBytes) {
    case 4:
      return scalarizeVectorLoad(Load, DAG);
    case 8:
    case 16:
      // 8 bytes allow up to two elements, 16 bytes up to four (the latter is
      // the same limit as global/flat).
      if (NumElts > MaxBytes / 4)
        return SplitVectorLoad(Op, DAG);
      return SDValue();
    default:
      llvm_unreachable("unsupported private_element_size");
    }
  }

  if (AS == AMDGPUASI.LOCAL_ADDRESS) {
    // Exactly two elements are left alone. Anything else is split; if
    // properly aligned, splitting might allow ds_read_b64 to be used.
    if (NumElts == 2)
      return SDValue();
    return SplitVectorLoad(Op, DAG);
  }

  return SDValue();
}
|
|
|
|
|
2014-02-05 01:18:40 +08:00
|
|
|
/// Custom lowering for an i64 select.
///
/// Both value operands are bitcast to v2i32, each 32-bit half is selected
/// separately on the same condition, and the two halves are rebuilt into a
/// v2i32 that is bitcast back to i64. Any other result type is left for
/// default handling (an empty SDValue is returned).
SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
  if (Op.getValueType() != MVT::i64)
    return SDValue();

  SDLoc DL(Op);
  SDValue Cond = Op.getOperand(0);

  SDValue Idx0 = DAG.getConstant(0, DL, MVT::i32);
  SDValue Idx1 = DAG.getConstant(1, DL, MVT::i32);

  SDValue TrueVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
  SDValue FalseVec =
      DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));

  // Extract element Idx from both vectors and select between them.
  auto SelectElt = [&](SDValue Idx) {
    SDValue TrueElt =
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, TrueVec, Idx);
    SDValue FalseElt =
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, FalseVec, Idx);
    return DAG.getSelect(DL, MVT::i32, Cond, TrueElt, FalseElt);
  };

  SDValue Lo = SelectElt(Idx0);
  SDValue Hi = SelectElt(Idx1);

  SDValue Combined = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
  return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Combined);
}
|
|
|
|
|
2014-07-16 07:50:10 +08:00
|
|
|
// Catch division cases where we can use shortcuts with rcp and rsq
|
|
|
|
// instructions.
|
2016-07-20 07:16:53 +08:00
|
|
|
SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
|
|
|
|
SelectionDAG &DAG) const {
|
2014-07-16 04:18:31 +08:00
|
|
|
SDLoc SL(Op);
|
|
|
|
SDValue LHS = Op.getOperand(0);
|
|
|
|
SDValue RHS = Op.getOperand(1);
|
|
|
|
EVT VT = Op.getValueType();
|
2017-07-07 04:34:21 +08:00
|
|
|
const SDNodeFlags Flags = Op->getFlags();
|
|
|
|
bool Unsafe = DAG.getTarget().Options.UnsafeFPMath ||
|
|
|
|
Flags.hasUnsafeAlgebra() || Flags.hasAllowReciprocal();
|
2014-07-16 04:18:31 +08:00
|
|
|
|
2017-04-22 03:25:33 +08:00
|
|
|
if (!Unsafe && VT == MVT::f32 && Subtarget->hasFP32Denormals())
|
|
|
|
return SDValue();
|
|
|
|
|
2014-07-16 04:18:31 +08:00
|
|
|
if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
|
2017-04-22 03:25:33 +08:00
|
|
|
if (Unsafe || VT == MVT::f32 || VT == MVT::f16) {
|
2016-08-03 06:25:04 +08:00
|
|
|
if (CLHS->isExactlyValue(1.0)) {
|
|
|
|
// v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
|
|
|
|
// the CI documentation has a worst case error of 1 ulp.
|
|
|
|
// OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
|
|
|
|
// use it as long as we aren't trying to use denormals.
|
2016-12-22 11:05:44 +08:00
|
|
|
//
|
|
|
|
// v_rcp_f16 and v_rsq_f16 DO support denormals.
|
2016-08-03 06:25:04 +08:00
|
|
|
|
|
|
|
// 1.0 / sqrt(x) -> rsq(x)
|
2016-12-22 11:05:44 +08:00
|
|
|
|
2016-08-03 06:25:04 +08:00
|
|
|
// XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
|
|
|
|
// error seems really high at 2^29 ULP.
|
|
|
|
if (RHS.getOpcode() == ISD::FSQRT)
|
|
|
|
return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));
|
|
|
|
|
|
|
|
// 1.0 / x -> rcp(x)
|
|
|
|
return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Same as for 1.0, but expand the sign out of the constant.
|
|
|
|
if (CLHS->isExactlyValue(-1.0)) {
|
|
|
|
// -1.0 / x -> rcp (fneg x)
|
|
|
|
SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
|
|
|
|
return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
|
|
|
|
}
|
2014-07-16 04:18:31 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-07-07 04:34:21 +08:00
|
|
|
if (Unsafe) {
|
2014-07-16 07:50:10 +08:00
|
|
|
// Turn into multiply by the reciprocal.
|
|
|
|
// x / y -> x * (1.0 / y)
|
|
|
|
SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
|
2017-07-07 04:34:21 +08:00
|
|
|
return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
|
2014-07-16 07:50:10 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
return SDValue();
|
2014-07-16 04:18:31 +08:00
|
|
|
}
|
|
|
|
|
2016-12-07 10:42:15 +08:00
|
|
|
/// Build a binary FP node, optionally threaded onto a chain and glue.
///
/// If \p GlueChain's node produces at most one value, a plain
/// \p Opcode node over \p A and \p B is created. Otherwise the node must
/// produce exactly three values; value 1 is used as the chain and value 2 as
/// the glue of the AMDGPU "_W_CHAIN" form of \p Opcode. Only ISD::FMUL has
/// such a form here.
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
                          EVT VT, SDValue A, SDValue B, SDValue GlueChain) {
  const bool HasChainAndGlue = GlueChain->getNumValues() > 1;
  if (!HasChainAndGlue)
    return DAG.getNode(Opcode, SL, VT, A, B);

  assert(GlueChain->getNumValues() == 3);

  SDVTList ChainedVTs = DAG.getVTList(VT, MVT::Other, MVT::Glue);

  if (Opcode != ISD::FMUL)
    llvm_unreachable("no chain equivalent for opcode");

  SDValue Chain = GlueChain.getValue(1);
  SDValue Glue = GlueChain.getValue(2);
  return DAG.getNode(AMDGPUISD::FMUL_W_CHAIN, SL, ChainedVTs, Chain, A, B,
                     Glue);
}
|
|
|
|
|
|
|
|
/// Build a ternary FP node, optionally threaded onto a chain and glue.
///
/// If \p GlueChain's node produces at most one value, a plain
/// \p Opcode node over \p A, \p B and \p C is created. Otherwise the node must
/// produce exactly three values; value 1 is used as the chain and value 2 as
/// the glue of the AMDGPU "_W_CHAIN" form of \p Opcode. Only ISD::FMA has
/// such a form here.
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
                           EVT VT, SDValue A, SDValue B, SDValue C,
                           SDValue GlueChain) {
  const bool HasChainAndGlue = GlueChain->getNumValues() > 1;
  if (!HasChainAndGlue)
    return DAG.getNode(Opcode, SL, VT, A, B, C);

  assert(GlueChain->getNumValues() == 3);

  SDVTList ChainedVTs = DAG.getVTList(VT, MVT::Other, MVT::Glue);

  if (Opcode != ISD::FMA)
    llvm_unreachable("no chain equivalent for opcode");

  SDValue Chain = GlueChain.getValue(1);
  SDValue Glue = GlueChain.getValue(2);
  return DAG.getNode(AMDGPUISD::FMA_W_CHAIN, SL, ChainedVTs, Chain, A, B, C,
                     Glue);
}
|
|
|
|
|
2016-12-22 11:05:41 +08:00
|
|
|
/// Lower an f16 fdiv.
///
/// The fast rcp/rsq shortcuts are tried first. Otherwise both operands are
/// extended to f32, the quotient is formed as numerator * rcp(denominator) in
/// f32, rounded back to f16, and passed through DIV_FIXUP together with the
/// original f16 operands.
SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
  if (SDValue Fast = lowerFastUnsafeFDIV(Op, DAG))
    return Fast;

  SDLoc SL(Op);
  SDValue Num = Op.getOperand(0);
  SDValue Den = Op.getOperand(1);

  SDValue NumF32 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Num);
  SDValue DenF32 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Den);

  SDValue RcpDenF32 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenF32);
  SDValue QuotF32 = DAG.getNode(ISD::FMUL, SL, MVT::f32, NumF32, RcpDenF32);

  // Second FP_ROUND operand is the constant 0.
  SDValue RoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
  SDValue QuotF16 =
      DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, QuotF32, RoundFlag);

  // DIV_FIXUP operand order: quotient, denominator, numerator.
  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, QuotF16, Den, Num);
}
|
|
|
|
|
2016-07-20 07:16:53 +08:00
|
|
|
// Faster 2.5 ULP division that does not support denormals.
|
|
|
|
SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
|
2014-07-16 04:18:31 +08:00
|
|
|
SDLoc SL(Op);
|
2016-07-20 07:16:53 +08:00
|
|
|
SDValue LHS = Op.getOperand(1);
|
|
|
|
SDValue RHS = Op.getOperand(2);
|
2014-07-16 04:18:31 +08:00
|
|
|
|
2016-07-20 07:16:53 +08:00
|
|
|
SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
|
2014-07-16 04:18:31 +08:00
|
|
|
|
2016-07-20 07:16:53 +08:00
|
|
|
const APFloat K0Val(BitsToFloat(0x6f800000));
|
|
|
|
const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
|
2014-07-16 04:18:31 +08:00
|
|
|
|
2016-07-20 07:16:53 +08:00
|
|
|
const APFloat K1Val(BitsToFloat(0x2f800000));
|
|
|
|
const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
|
2014-07-16 04:18:31 +08:00
|
|
|
|
2016-07-20 07:16:53 +08:00
|
|
|
const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
|
2014-07-16 04:18:31 +08:00
|
|
|
|
2016-07-20 07:16:53 +08:00
|
|
|
EVT SetCCVT =
|
|
|
|
getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
|
2014-07-16 04:18:31 +08:00
|
|
|
|
2016-07-20 07:16:53 +08:00
|
|
|
SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
|
2014-07-16 04:18:31 +08:00
|
|
|
|
2016-07-20 07:16:53 +08:00
|
|
|
SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
|
2014-07-16 04:18:31 +08:00
|
|
|
|
2016-07-20 07:16:53 +08:00
|
|
|
// TODO: Should this propagate fast-math-flags?
|
|
|
|
r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
|
2015-09-17 00:31:21 +08:00
|
|
|
|
2016-07-20 07:16:53 +08:00
|
|
|
// rcp does not support denormals.
|
|
|
|
SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
|
2014-07-16 04:18:31 +08:00
|
|
|
|
2016-07-20 07:16:53 +08:00
|
|
|
SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
|
2014-07-16 04:18:31 +08:00
|
|
|
|
2016-07-20 07:16:53 +08:00
|
|
|
return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
|
|
|
|
}
|
2014-07-16 04:18:31 +08:00
|
|
|
|
2016-07-20 07:16:53 +08:00
|
|
|
/// Lower an f32 fdiv.
///
/// The fast rcp/rsq shortcuts are tried first. Otherwise the division is
/// expanded into DIV_SCALE on both operands, an approximate reciprocal of the
/// scaled denominator, a sequence of FMA/FMUL refinement steps, and finally
/// DIV_FMAS + DIV_FIXUP.
///
/// When the subtarget does not have f32 denormals enabled, the refinement
/// steps are bracketed by two SETREG nodes (FP_DENORM_FLUSH_NONE before,
/// FP_DENORM_FLUSH_IN_FLUSH_OUT after) and are emitted in their chained/glued
/// forms so they stay between those two nodes.
SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
  if (SDValue Fast = lowerFastUnsafeFDIV(Op, DAG))
    return Fast;

  SDLoc SL(Op);
  SDValue Num = Op.getOperand(0);
  SDValue Den = Op.getOperand(1);

  const SDValue FPOne = DAG.getConstantFP(1.0, SL, MVT::f32);

  // DIV_SCALE produces the scaled value and an i1.
  SDVTList ScaleVTs = DAG.getVTList(MVT::f32, MVT::i1);

  SDValue ScaledDen =
      DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVTs, Den, Den, Num);
  SDValue ScaledNum =
      DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVTs, Num, Den, Num);

  // Denominator is scaled to not be denormal, so using rcp is ok.
  SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, ScaledDen);
  SDValue NegScaledDen = DAG.getNode(ISD::FNEG, SL, MVT::f32, ScaledDen);

  // Hardware-register operand for SETREG: the MODE register, bit offset 4,
  // width-minus-one 1. NOTE(review): presumably the f32 denormal mode field,
  // going by the original variable name - confirm against the ISA docs.
  const unsigned DenormModeField = AMDGPU::Hwreg::ID_MODE |
                                   (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
                                   (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);

  const SDValue ModeField =
      DAG.getTargetConstant(DenormModeField, SL, MVT::i16);

  const bool ToggleDenormMode = !Subtarget->hasFP32Denormals();

  if (ToggleDenormMode) {
    SDVTList ChainGlueVTs = DAG.getVTList(MVT::Other, MVT::Glue);
    const SDValue NoFlush =
        DAG.getConstant(FP_DENORM_FLUSH_NONE, SL, MVT::i32);
    SDValue SetNoFlush =
        DAG.getNode(AMDGPUISD::SETREG, SL, ChainGlueVTs, DAG.getEntryNode(),
                    NoFlush, ModeField);

    // Bundle the negated denominator with the SETREG's chain and glue. The
    // helpers below then see a three-result value and emit chained ops.
    SDValue Bundle[3] = {
      NegScaledDen,
      SetNoFlush.getValue(0),
      SetNoFlush.getValue(1)
    };

    NegScaledDen = DAG.getMergeValues(Bundle, SL);
  }

  // In each helper call the last argument supplies the (optional) chain and
  // glue of the previously emitted step.
  SDValue Step0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegScaledDen, Rcp,
                              FPOne, NegScaledDen);

  SDValue Step1 =
      getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Step0, Rcp, Rcp, Step0);

  SDValue Quot =
      getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, ScaledNum, Step1, Step1);

  SDValue Step2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegScaledDen, Quot,
                              ScaledNum, Quot);

  SDValue Step3 =
      getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Step2, Step1, Quot, Step2);

  SDValue Step4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegScaledDen, Step3,
                              ScaledNum, Step3);

  if (ToggleDenormMode) {
    const SDValue FlushInOut =
        DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
    // Chained and glued to the last refinement step.
    SDValue SetFlush =
        DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other, Step4.getValue(1),
                    FlushInOut, ModeField, Step4.getValue(2));

    // Join the restoring SETREG with the current DAG root and make the
    // result the new root.
    SDValue NewRoot = DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetFlush,
                                  DAG.getRoot());
    DAG.setRoot(NewRoot);
  }

  // The i1 result of the numerator's DIV_SCALE feeds DIV_FMAS.
  SDValue ScaleFlag = ScaledNum.getValue(1);
  SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32, Step4, Step1,
                             Step3, ScaleFlag);

  // DIV_FIXUP operand order: quotient, denominator, numerator.
  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, Den, Num);
}
|
|
|
|
|
|
|
|
/// Lower an f64 fdiv.
///
/// With the global UnsafeFPMath option the fast path is used unconditionally.
/// Otherwise the division is expanded into DIV_SCALE, an rcp of the scaled
/// denominator refined by a series of FMAs, and DIV_FMAS + DIV_FIXUP.
SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
  if (DAG.getTarget().Options.UnsafeFPMath)
    return lowerFastUnsafeFDIV(Op, DAG);

  SDLoc SL(Op);
  SDValue Num = Op.getOperand(0);
  SDValue Den = Op.getOperand(1);

  const SDValue FPOne = DAG.getConstantFP(1.0, SL, MVT::f64);

  // DIV_SCALE produces the scaled value and an i1.
  SDVTList ScaleVTs = DAG.getVTList(MVT::f64, MVT::i1);

  SDValue ScaledDen =
      DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVTs, Den, Den, Num);

  SDValue NegScaledDen = DAG.getNode(ISD::FNEG, SL, MVT::f64, ScaledDen);

  SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, ScaledDen);

  SDValue Step0 =
      DAG.getNode(ISD::FMA, SL, MVT::f64, NegScaledDen, Rcp, FPOne);

  SDValue Step1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Step0, Rcp);

  SDValue Step2 =
      DAG.getNode(ISD::FMA, SL, MVT::f64, NegScaledDen, Step1, FPOne);

  SDValue ScaledNum =
      DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVTs, Num, Den, Num);

  SDValue Step3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Step1, Step2, Step1);
  SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f64, ScaledNum, Step3);

  SDValue Step4 =
      DAG.getNode(ISD::FMA, SL, MVT::f64, NegScaledDen, Quot, ScaledNum);

  SDValue ScaleFlag;

  if (Subtarget->getGeneration() != SISubtarget::SOUTHERN_ISLANDS) {
    // Use the i1 result of the numerator's DIV_SCALE directly.
    ScaleFlag = ScaledNum.getValue(1);
  } else {
    // Workaround a hardware bug on SI where the condition output from
    // div_scale is not usable. Instead, work out the flag for div_fmas by
    // comparing the high dwords of the operands with the high dwords of
    // their scaled versions.
    const SDValue HiIdx = DAG.getConstant(1, SL, MVT::i32);

    SDValue NumVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Num);
    SDValue DenVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Den);
    SDValue ScaledDenVec =
        DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, ScaledDen);
    SDValue ScaledNumVec =
        DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, ScaledNum);

    SDValue NumHi =
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumVec, HiIdx);
    SDValue DenHi =
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenVec, HiIdx);

    SDValue ScaledDenHi =
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, ScaledDenVec, HiIdx);
    SDValue ScaledNumHi =
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, ScaledNumVec, HiIdx);

    SDValue DenUnchanged =
        DAG.getSetCC(SL, MVT::i1, DenHi, ScaledDenHi, ISD::SETEQ);
    SDValue NumUnchanged =
        DAG.getSetCC(SL, MVT::i1, NumHi, ScaledNumHi, ISD::SETEQ);
    ScaleFlag =
        DAG.getNode(ISD::XOR, SL, MVT::i1, NumUnchanged, DenUnchanged);
  }

  SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Step4, Step3,
                             Quot, ScaleFlag);

  // DIV_FIXUP operand order: quotient, denominator, numerator.
  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Den, Num);
}
|
|
|
|
|
|
|
|
/// Dispatch an fdiv to the lowering routine for its result type.
///
/// Only f16, f32 and f64 are expected; any other type is a programming error.
SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
  const EVT ResultVT = Op.getValueType();

  if (ResultVT == MVT::f16) {
    return LowerFDIV16(Op, DAG);
  }
  if (ResultVT == MVT::f32) {
    return LowerFDIV32(Op, DAG);
  }
  if (ResultVT == MVT::f64) {
    return LowerFDIV64(Op, DAG);
  }

  llvm_unreachable("Unexpected type for fdiv");
}
|
|
|
|
|
2013-11-14 07:36:50 +08:00
|
|
|
/// Custom lowering for store nodes.
///
/// An i1 store becomes a truncating store of the value sign-extended or
/// truncated to i32. Vector stores are either left for default handling (an
/// empty SDValue is returned), returned unchanged, expanded as an unaligned
/// access, scalarized, or split, depending on the address space and the number
/// of elements.
SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  StoreSDNode *Store = cast<StoreSDNode>(Op);
  const EVT MemVT = Store->getMemoryVT();

  if (MemVT == MVT::i1) {
    SDValue Wide = DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32);
    return DAG.getTruncStore(Store->getChain(), DL, Wide, Store->getBasePtr(),
                             MVT::i1, Store->getMemOperand());
  }

  assert(VT.isVector() &&
         Store->getValue().getValueType().getScalarType() == MVT::i32);

  unsigned AS = Store->getAddressSpace();
  const bool AccessAllowed =
      allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT, AS,
                         Store->getAlignment());
  if (!AccessAllowed)
    return expandUnalignedStore(Store, DAG);

  // If there is a possibility that a flat instruction accesses scratch memory
  // then the legalization rules for private must be used; otherwise flat is
  // treated like global.
  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (AS == AMDGPUASI.FLAT_ADDRESS) {
    if (FuncInfo->hasFlatScratchInit())
      AS = AMDGPUASI.PRIVATE_ADDRESS;
    else
      AS = AMDGPUASI.GLOBAL_ADDRESS;
  }

  const unsigned NumElts = MemVT.getVectorNumElements();

  if (AS == AMDGPUASI.GLOBAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS) {
    // Up to four elements are stored directly; wider vectors are split.
    if (NumElts > 4)
      return SplitVectorStore(Op, DAG);
    return SDValue();
  }

  if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
    const unsigned MaxBytes = Subtarget->getMaxPrivateElementSize();
    switch (MaxBytes) {
    case 4:
      return scalarizeVectorStore(Store, DAG);
    case 8:
    case 16:
      // 8 bytes allow up to two elements, 16 bytes up to four.
      if (NumElts > MaxBytes / 4)
        return SplitVectorStore(Op, DAG);
      return SDValue();
    default:
      llvm_unreachable("unsupported private_element_size");
    }
  }

  if (AS == AMDGPUASI.LOCAL_ADDRESS) {
    // Exactly two elements are kept as-is. Anything else is split; if
    // properly aligned, splitting might allow ds_write_b64 to be used.
    if (NumElts == 2)
      return Op;
    return SplitVectorStore(Op, DAG);
  }

  llvm_unreachable("unhandled address space");
}
|
|
|
|
|
2014-07-20 02:44:39 +08:00
|
|
|
/// Lower FSIN / FCOS to the hardware SIN_HW / COS_HW nodes.
///
/// The argument is first multiplied by 0.5/pi and reduced with FRACT; the
/// hardware node is then applied to that fractional part.
SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue Angle = Op.getOperand(0);

  // TODO: Should this propagate fast-math-flags?
  SDValue InvTwoPi = DAG.getConstantFP(0.5/M_PI, DL, VT);
  SDValue Turns = DAG.getNode(ISD::FMUL, DL, VT, Angle, InvTwoPi);
  SDValue Fract = DAG.getNode(AMDGPUISD::FRACT, DL, VT, Turns);

  unsigned HWOpcode;
  switch (Op.getOpcode()) {
  case ISD::FCOS:
    HWOpcode = AMDGPUISD::COS_HW;
    break;
  case ISD::FSIN:
    HWOpcode = AMDGPUISD::SIN_HW;
    break;
  default:
    llvm_unreachable("Wrong trig opcode");
  }

  return DAG.getNode(HWOpcode, DL, VT, Fract);
}
|
|
|
|
|
AMDGPU: Implement {BUFFER,FLAT}_ATOMIC_CMPSWAP{,_X2}
Summary:
Implement BUFFER_ATOMIC_CMPSWAP{,_X2} instructions on all GCN targets, and FLAT_ATOMIC_CMPSWAP{,_X2} on CI+.
32-bit instruction variants tested manually on Kabini and Bonaire. Tests and parts of code provided by Jan Veselý.
Patch by: Vedran Miletić
Reviewers: arsenm, tstellarAMD, nhaehnle
Subscribers: jvesely, scchan, kanarayan, arsenm
Differential Revision: http://reviews.llvm.org/D17280
llvm-svn: 265170
2016-04-02 02:27:37 +08:00
|
|
|
/// Custom lowering for atomic compare-and-swap.
///
/// Nodes whose address space is not flat/global (per isFlatGlobalAddrSpace)
/// are returned unchanged. Otherwise the new and compare values are packed
/// into a two-element vector, in that order, and an
/// AMDGPUISD::ATOMIC_CMP_SWAP memory intrinsic node is built that reuses the
/// original node's value types and memory operand.
SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
  AtomicSDNode *Atomic = cast<AtomicSDNode>(Op);
  assert(AtomicNode->isCompareAndSwap());

  // No custom lowering required for local address space.
  const unsigned AS = Atomic->getAddressSpace();
  if (!isFlatGlobalAddrSpace(AS, AMDGPUASI))
    return Op;

  // Non-local address space requires custom lowering for atomic compare
  // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2.
  SDLoc DL(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Ptr = Op.getOperand(1);
  SDValue CmpVal = Op.getOperand(2);
  SDValue SwapVal = Op.getOperand(3);

  EVT VT = Op.getValueType();
  MVT PairVT = MVT::getVectorVT(VT.getSimpleVT(), 2);

  // Element 0 is the value to store, element 1 the value to compare against.
  SDValue Packed = DAG.getBuildVector(PairVT, DL, {SwapVal, CmpVal});
  SDValue NodeOps[] = { Chain, Ptr, Packed };

  return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL,
                                 Op->getVTList(), NodeOps, VT,
                                 Atomic->getMemOperand());
}
|
|
|
|
|
2012-12-12 05:25:42 +08:00
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
// Custom DAG optimizations
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
2014-06-12 01:50:44 +08:00
|
|
|
/// Try to turn an integer-to-float conversion whose i32 source is known to fit
/// in its low byte into AMDGPUISD::CVT_F32_UBYTE0.
///
/// Returns the new node (also queued on the combiner worklist), or an empty
/// SDValue when the pattern does not apply.
SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
                                                     DAGCombinerInfo &DCI) const {
  // Only results whose scalar type is f32 are handled.
  const EVT ResultVT = N->getValueType(0);
  if (ResultVT.getScalarType() != MVT::f32)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDValue Input = N->getOperand(0);

  // TODO: We could try to match extracting the higher bytes, which would be
  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
  // about in practice.
  const bool IsLegalizedI32Source =
      DCI.isAfterLegalizeVectorOps() && Input.getValueType() == MVT::i32;
  if (!IsLegalizedI32Source)
    return SDValue();

  // The upper 24 bits must be provably zero so that only byte 0 matters.
  if (!DAG.MaskedValueIsZero(Input, APInt::getHighBitsSet(32, 24)))
    return SDValue();

  SDValue ByteCvt =
      DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, SDLoc(N), ResultVT, Input);
  DCI.AddToWorklist(ByteCvt.getNode());
  return ByteCvt;
}
|
|
|
|
|
2015-03-12 02:43:21 +08:00
|
|
|
/// \brief Return true if the given offset Size in bytes can be folded into
|
|
|
|
/// the immediate offsets of a memory instruction for the given address space.
|
|
|
|
static bool canFoldOffset(unsigned OffsetSize, unsigned AS,
|
2016-06-24 14:30:11 +08:00
|
|
|
const SISubtarget &STI) {
|
2017-03-27 22:04:01 +08:00
|
|
|
auto AMDGPUASI = STI.getAMDGPUAS();
|
|
|
|
if (AS == AMDGPUASI.GLOBAL_ADDRESS) {
|
2015-03-12 02:43:21 +08:00
|
|
|
// MUBUF instructions a 12-bit offset in bytes.
|
|
|
|
return isUInt<12>(OffsetSize);
|
2017-03-27 22:04:01 +08:00
|
|
|
}
|
|
|
|
if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
|
2015-03-12 02:43:21 +08:00
|
|
|
// SMRD instructions have an 8-bit offset in dwords on SI and
|
|
|
|
// a 20-bit offset in bytes on VI.
|
2016-06-24 14:30:11 +08:00
|
|
|
if (STI.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
|
2015-03-12 02:43:21 +08:00
|
|
|
return isUInt<20>(OffsetSize);
|
|
|
|
else
|
|
|
|
return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4);
|
2017-03-27 22:04:01 +08:00
|
|
|
}
|
|
|
|
if (AS == AMDGPUASI.LOCAL_ADDRESS ||
|
|
|
|
AS == AMDGPUASI.REGION_ADDRESS) {
|
2015-03-12 02:43:21 +08:00
|
|
|
// The single offset versions have a 16-bit offset in bytes.
|
|
|
|
return isUInt<16>(OffsetSize);
|
|
|
|
}
|
2017-03-27 22:04:01 +08:00
|
|
|
// Indirect register addressing does not use any offsets.
|
|
|
|
return false;
|
2015-03-12 02:43:21 +08:00
|
|
|
}
|
|
|
|
|
2014-08-16 01:49:05 +08:00
|
|
|
// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
|
|
|
|
|
|
|
|
// This is a variant of
|
|
|
|
// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
|
|
|
|
//
|
|
|
|
// The normal DAG combiner will do this, but only if the add has one use since
|
|
|
|
// that would increase the number of instructions.
|
|
|
|
//
|
|
|
|
// This prevents us from seeing a constant offset that can be folded into a
|
|
|
|
// memory instruction's addressing mode. If we know the resulting add offset of
|
|
|
|
// a pointer can be folded into an addressing offset, we can replace the pointer
|
|
|
|
// operand with the add of new constant offset. This eliminates one of the uses,
|
|
|
|
// and may allow the remaining use to also be simplified.
|
|
|
|
//
|
|
|
|
SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
|
|
|
|
unsigned AddrSpace,
|
|
|
|
DAGCombinerInfo &DCI) const {
|
|
|
|
SDValue N0 = N->getOperand(0);
|
|
|
|
SDValue N1 = N->getOperand(1);
|
|
|
|
|
|
|
|
if (N0.getOpcode() != ISD::ADD)
|
|
|
|
return SDValue();
|
|
|
|
|
|
|
|
const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
|
|
|
|
if (!CN1)
|
|
|
|
return SDValue();
|
|
|
|
|
|
|
|
const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
|
|
|
|
if (!CAdd)
|
|
|
|
return SDValue();
|
|
|
|
|
|
|
|
// If the resulting offset is too large, we can't fold it into the addressing
|
|
|
|
// mode offset.
|
|
|
|
APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
|
2016-06-24 14:30:11 +08:00
|
|
|
if (!canFoldOffset(Offset.getZExtValue(), AddrSpace, *getSubtarget()))
|
2014-08-16 01:49:05 +08:00
|
|
|
return SDValue();
|
|
|
|
|
|
|
|
SelectionDAG &DAG = DCI.DAG;
|
|
|
|
SDLoc SL(N);
|
|
|
|
EVT VT = N->getValueType(0);
|
|
|
|
|
|
|
|
SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
|
2015-04-28 22:05:47 +08:00
|
|
|
SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32);
|
2014-08-16 01:49:05 +08:00
|
|
|
|
|
|
|
return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset);
|
|
|
|
}
|
|
|
|
|
2016-12-22 11:44:42 +08:00
|
|
|
/// Try to rewrite the base pointer of a memory node when it is a shift that
/// performSHLPtrCombine can turn into "shift + foldable constant offset".
///
/// Returns the updated node, or an empty SDValue if nothing changed.
SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  SDValue BasePtr = N->getBasePtr();
  const unsigned AddrSpace = N->getAddressSpace();

  // TODO: We could also do this for multiplies.
  if (BasePtr.getOpcode() != ISD::SHL ||
      AddrSpace == AMDGPUASI.PRIVATE_ADDRESS)
    return SDValue();

  SDValue FoldedPtr = performSHLPtrCombine(BasePtr.getNode(), AddrSpace, DCI);
  if (!FoldedPtr)
    return SDValue();

  // Copy the operand list and swap in the new pointer. The pointer is taken
  // to be operand 2 for ISD::STORE and operand 1 for every other node.
  SmallVector<SDValue, 8> Operands(N->op_begin(), N->op_end());
  const unsigned PtrIdx = (N->getOpcode() == ISD::STORE) ? 2 : 1;
  Operands[PtrIdx] = FoldedPtr;

  return SDValue(DCI.DAG.UpdateNodeOperands(N, Operands), 0);
}
|
|
|
|
|
2016-09-14 23:19:03 +08:00
|
|
|
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
|
|
|
|
return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
|
|
|
|
(Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
|
|
|
|
(Opc == ISD::XOR && Val == 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
|
|
|
|
// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
|
|
|
|
// integer combine opportunities since most 64-bit operations are decomposed
|
|
|
|
// this way. TODO: We won't want this for SALU especially if it is an inline
|
|
|
|
// immediate.
|
|
|
|
SDValue SITargetLowering::splitBinaryBitConstantOp(
|
|
|
|
DAGCombinerInfo &DCI,
|
|
|
|
const SDLoc &SL,
|
|
|
|
unsigned Opc, SDValue LHS,
|
|
|
|
const ConstantSDNode *CRHS) const {
|
|
|
|
uint64_t Val = CRHS->getZExtValue();
|
|
|
|
uint32_t ValLo = Lo_32(Val);
|
|
|
|
uint32_t ValHi = Hi_32(Val);
|
|
|
|
const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
|
|
|
|
|
|
|
|
if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
|
|
|
|
bitOpWithConstantIsReducible(Opc, ValHi)) ||
|
|
|
|
(CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
|
|
|
|
// If we need to materialize a 64-bit immediate, it will be split up later
|
|
|
|
// anyway. Avoid creating the harder to understand 64-bit immediate
|
|
|
|
// materialization.
|
|
|
|
return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
|
|
|
|
}
|
|
|
|
|
|
|
|
return SDValue();
|
|
|
|
}
|
|
|
|
|
2017-06-28 02:25:26 +08:00
|
|
|
// Returns true if argument is a boolean value which is not serialized into
|
|
|
|
// memory or argument and does not require v_cmdmask_b32 to be deserialized.
|
|
|
|
static bool isBoolSGPR(SDValue V) {
|
|
|
|
if (V.getValueType() != MVT::i1)
|
|
|
|
return false;
|
|
|
|
switch (V.getOpcode()) {
|
|
|
|
default: break;
|
|
|
|
case ISD::SETCC:
|
|
|
|
case ISD::AND:
|
|
|
|
case ISD::OR:
|
|
|
|
case ISD::XOR:
|
|
|
|
case AMDGPUISD::FP_CLASS:
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2015-01-07 07:00:46 +08:00
|
|
|
/// Target combines for ISD::AND, tried in this order:
///  1. i64 and with a constant: split into two 32-bit operations.
///  2. i32 (and (srl x, c), mask) selecting an aligned 8/16-bit field on SDWA
///     targets: rewrite through BFE_U32.
///  3. (and (fcmp ord x, x), (fcmp une (fabs x), inf)): fold to FP_CLASS.
///  4. i32 and with (sext i1 cc): turn into a select.
/// Returns an empty SDValue when nothing applies.
SDValue SITargetLowering::performAndCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  if (DCI.isBeforeLegalize())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  const EVT VT = N->getValueType(0);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);

  if (CRHS && VT == MVT::i64) {
    SDValue Split =
        splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS);
    if (Split)
      return Split;
  }

  if (CRHS && VT == MVT::i32) {
    // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
    // nb = number of trailing zeroes in mask
    // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
    // given that we are selecting 8 or 16 bit fields starting at byte boundary.
    const uint64_t Mask = CRHS->getZExtValue();
    const unsigned Bits = countPopulation(Mask);
    const bool IsShiftedByteOrWordMask =
        (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1);

    if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
        IsShiftedByteOrWordMask) {
      if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
        const unsigned Shift = CShift->getZExtValue();
        const unsigned NB = CRHS->getAPIntValue().countTrailingZeros();
        const unsigned Offset = NB + Shift;

        // Starts at a byte or word boundary.
        if ((Offset & (Bits - 1)) == 0) {
          SDLoc SL(N);
          SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
                                    LHS->getOperand(0),
                                    DAG.getConstant(Offset, SL, MVT::i32),
                                    DAG.getConstant(Bits, SL, MVT::i32));
          // Record that only the low Bits bits of the extract can be set.
          EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
          SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
                                    DAG.getValueType(NarrowVT));
          return DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
                             DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
        }
      }
    }
  }

  // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
  // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
  if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
    const ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
    const ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();

    SDValue X = LHS.getOperand(0);
    SDValue Y = RHS.getOperand(0);

    // The right-hand compare must test fabs of the very same value.
    const bool RHSTestsFabsOfX =
        Y.getOpcode() == ISD::FABS && Y.getOperand(0) == X;
    if (!RHSTestsFabsOfX)
      return SDValue();

    if (LCC == ISD::SETO) {
      // "ord x, x": both operands of the left compare must be x.
      if (LHS.getOperand(1) != X)
        return SDValue();

      if (RCC == ISD::SETUNE) {
        // The right compare must be against +infinity.
        const ConstantFPSDNode *C1 =
            dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
        if (!C1 || !C1->isInfinity() || C1->isNegative())
          return SDValue();

        const uint32_t Mask = SIInstrFlags::N_NORMAL |
                              SIInstrFlags::N_SUBNORMAL |
                              SIInstrFlags::N_ZERO |
                              SIInstrFlags::P_ZERO |
                              SIInstrFlags::P_SUBNORMAL |
                              SIInstrFlags::P_NORMAL;

        static_assert(((~(SIInstrFlags::S_NAN |
                          SIInstrFlags::Q_NAN |
                          SIInstrFlags::N_INFINITY |
                          SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
                      "mask not equal");

        SDLoc DL(N);
        return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
                           X, DAG.getConstant(Mask, DL, MVT::i32));
      }
    }
  }

  const bool HasSExtOperand = RHS.getOpcode() == ISD::SIGN_EXTEND ||
                              LHS.getOpcode() == ISD::SIGN_EXTEND;
  if (VT == MVT::i32 && HasSExtOperand) {
    // and x, (sext cc from i1) => select cc, x, 0
    // Canonicalize so that the sign extend is on the right.
    if (RHS.getOpcode() != ISD::SIGN_EXTEND)
      std::swap(LHS, RHS);

    SDValue Cond = RHS.getOperand(0);
    if (isBoolSGPR(Cond))
      return DAG.getSelect(SDLoc(N), MVT::i32, Cond,
                           LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
  }

  return SDValue();
}
|
|
|
|
|
2015-01-07 07:00:39 +08:00
|
|
|
/// Target combines for ISD::OR:
///  - i1:  or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
///  - i64: (or x, (zero_extend i32:y)) -> only the low half needs an or.
///  - i64: or with a constant is split into two 32-bit operations when
///         splitBinaryBitConstantOp decides that is worthwhile.
/// Returns an empty SDValue when nothing applies.
SDValue SITargetLowering::performOrCombine(SDNode *N,
                                           DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  EVT VT = N->getValueType(0);
  if (VT == MVT::i1) {
    // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
    if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
        RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
      SDValue Src = LHS.getOperand(0);
      if (Src != RHS.getOperand(0))
        return SDValue();

      const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
      const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
      if (!CLHS || !CRHS)
        return SDValue();

      // Only 10 bits are used.
      static const uint32_t MaxMask = 0x3ff;

      uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
      SDLoc DL(N);
      return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
                         Src, DAG.getConstant(NewMask, DL, MVT::i32));
    }

    return SDValue();
  }

  if (VT != MVT::i64)
    return SDValue();

  // TODO: This could be a generic combine with a predicate for extracting the
  // high half of an integer being free.

  // (or i64:x, (zero_extend i32:y)) ->
  //   i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
  if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
      RHS.getOpcode() != ISD::ZERO_EXTEND)
    std::swap(LHS, RHS);

  if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
    SDValue ExtSrc = RHS.getOperand(0);
    EVT SrcVT = ExtSrc.getValueType();
    if (SrcVT == MVT::i32) {
      SDLoc SL(N);
      SDValue LowLHS, HiBits;
      std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
      SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);

      DCI.AddToWorklist(LowOr.getNode());
      DCI.AddToWorklist(HiBits.getNode());

      SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
                                LowOr, HiBits);
      return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
    }
  }

  // LHS/RHS may have been swapped above, so they no longer necessarily match
  // the node's operand order. Take both the constant and the other operand
  // directly from the node; otherwise, after a swap, the constant itself
  // would be passed as the non-constant operand and the real one dropped.
  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (CRHS) {
    if (SDValue Split
          = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
                                     N->getOperand(0), CRHS))
      return Split;
  }

  return SDValue();
}
|
|
|
|
|
|
|
|
/// Target combine for ISD::XOR: an i64 xor with a constant right-hand side is
/// handed to splitBinaryBitConstantOp. Returns an empty SDValue otherwise.
SDValue SITargetLowering::performXorCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  if (N->getValueType(0) != MVT::i64)
    return SDValue();

  SDValue Lhs = N->getOperand(0);
  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!CRHS)
    return SDValue();

  SDValue Split = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, Lhs, CRHS);
  if (Split)
    return Split;

  return SDValue();
}
|
|
|
|
|
2017-04-07 04:58:30 +08:00
|
|
|
// Instructions that will be lowered with a final instruction that zeros the
|
|
|
|
// high result bits.
|
|
|
|
// XXX - probably only need to list legal operations.
|
2017-04-01 03:53:03 +08:00
|
|
|
static bool fp16SrcZerosHighBits(unsigned Opc) {
|
|
|
|
switch (Opc) {
|
2017-04-07 04:58:30 +08:00
|
|
|
case ISD::FADD:
|
|
|
|
case ISD::FSUB:
|
|
|
|
case ISD::FMUL:
|
|
|
|
case ISD::FDIV:
|
|
|
|
case ISD::FREM:
|
|
|
|
case ISD::FMA:
|
|
|
|
case ISD::FMAD:
|
|
|
|
case ISD::FCANONICALIZE:
|
|
|
|
case ISD::FP_ROUND:
|
|
|
|
case ISD::UINT_TO_FP:
|
|
|
|
case ISD::SINT_TO_FP:
|
|
|
|
case ISD::FABS:
|
|
|
|
// Fabs is lowered to a bit operation, but it's an and which will clear the
|
|
|
|
// high bits anyway.
|
|
|
|
case ISD::FSQRT:
|
|
|
|
case ISD::FSIN:
|
|
|
|
case ISD::FCOS:
|
|
|
|
case ISD::FPOWI:
|
|
|
|
case ISD::FPOW:
|
|
|
|
case ISD::FLOG:
|
|
|
|
case ISD::FLOG2:
|
|
|
|
case ISD::FLOG10:
|
|
|
|
case ISD::FEXP:
|
|
|
|
case ISD::FEXP2:
|
|
|
|
case ISD::FCEIL:
|
|
|
|
case ISD::FTRUNC:
|
|
|
|
case ISD::FRINT:
|
|
|
|
case ISD::FNEARBYINT:
|
|
|
|
case ISD::FROUND:
|
|
|
|
case ISD::FFLOOR:
|
|
|
|
case ISD::FMINNUM:
|
|
|
|
case ISD::FMAXNUM:
|
|
|
|
case AMDGPUISD::FRACT:
|
|
|
|
case AMDGPUISD::CLAMP:
|
|
|
|
case AMDGPUISD::COS_HW:
|
|
|
|
case AMDGPUISD::SIN_HW:
|
|
|
|
case AMDGPUISD::FMIN3:
|
|
|
|
case AMDGPUISD::FMAX3:
|
|
|
|
case AMDGPUISD::FMED3:
|
|
|
|
case AMDGPUISD::FMAD_FTZ:
|
|
|
|
case AMDGPUISD::RCP:
|
|
|
|
case AMDGPUISD::RSQ:
|
|
|
|
case AMDGPUISD::LDEXP:
|
2017-04-01 03:53:03 +08:00
|
|
|
return true;
|
2017-04-07 04:58:30 +08:00
|
|
|
default:
|
|
|
|
// fcopysign, select and others may be lowered to 32-bit bit operations
|
|
|
|
// which don't zero the high bits.
|
|
|
|
return false;
|
2017-04-01 03:53:03 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Target combine for ISD::ZERO_EXTEND:
///   (i32 zext (i16 (bitcast f16:$src))) -> fp16_zext $src
/// when the f16 producer is one that fp16SrcZerosHighBits accepts. Only runs
/// on subtargets with 16-bit instructions and once the DAG has been legalized.
SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
                                                   DAGCombinerInfo &DCI) const {
  const bool Enabled = Subtarget->has16BitInsts() &&
                       DCI.getDAGCombineLevel() >= AfterLegalizeDAG;
  if (!Enabled)
    return SDValue();

  const EVT ResultVT = N->getValueType(0);
  if (ResultVT != MVT::i32)
    return SDValue();

  SDValue Narrow = N->getOperand(0);
  if (Narrow.getValueType() != MVT::i16)
    return SDValue();

  // (i32 zext (i16 (bitcast f16:$src))) -> fp16_zext $src
  // FIXME: It is not universally true that the high bits are zeroed on gfx9.
  if (Narrow.getOpcode() != ISD::BITCAST)
    return SDValue();

  SDValue HalfSrc = Narrow.getOperand(0);
  if (HalfSrc.getValueType() != MVT::f16 ||
      !fp16SrcZerosHighBits(HalfSrc.getOpcode()))
    return SDValue();

  return DCI.DAG.getNode(AMDGPUISD::FP16_ZEXT, SDLoc(N), ResultVT, HalfSrc);
}
|
|
|
|
|
2015-01-07 07:00:39 +08:00
|
|
|
/// Fold trivial AMDGPUISD::FP_CLASS nodes: a constant zero mask yields false,
/// and an undef source yields undef. Returns an empty SDValue otherwise.
SDValue SITargetLowering::performClassCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  // fp_class x, 0 -> false
  const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (CMask && CMask->isNullValue())
    return DAG.getConstant(0, SDLoc(N), MVT::i1);

  // fp_class undef, mask -> undef
  SDValue Src = N->getOperand(0);
  if (Src.isUndef())
    return DAG.getUNDEF(MVT::i1);

  return SDValue();
}
|
|
|
|
|
2017-07-13 05:20:28 +08:00
|
|
|
/// Return true if \p Op can be treated as never being a signaling NaN: either
/// the target lowering reports no floating-point exceptions, or the DAG knows
/// the value is never any kind of NaN.
static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
  const bool HasFPExceptions =
      DAG.getTargetLoweringInfo().hasFloatingPointExceptions();
  return !HasFPExceptions || DAG.isKnownNeverNaN(Op);
}
|
|
|
|
|
2017-07-14 07:59:15 +08:00
|
|
|
/// Return true if \p Op is known to already be in canonical form, judged from
/// the opcode that produces it (and, for a few opcodes, from its operands).
///
/// \param MaxDepth Remaining recursion budget for opcodes whose result is only
///                 canonical when their inputs are; at 0 those report false.
static bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
                            const SISubtarget *ST, unsigned MaxDepth=5) {
  // If source is a result of another standard FP operation it is already in
  // canonical form.
  switch (Op.getOpcode()) {
  // These will flush denorms if required.
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FSQRT:
  case ISD::FCEIL:
  case ISD::FFLOOR:
  case ISD::FMA:
  case ISD::FMAD:
  case ISD::FCANONICALIZE:
    return true;

  case ISD::FP_ROUND: {
    const bool ResultIsF16 = Op.getValueType().getScalarType() == MVT::f16;
    return !ResultIsF16 || ST->hasFP16Denormals();
  }

  case ISD::FP_EXTEND: {
    const bool SourceIsF16 =
        Op.getOperand(0).getValueType().getScalarType() == MVT::f16;
    return !SourceIsF16 || ST->hasFP16Denormals();
  }

  case ISD::FP16_TO_FP:
  case ISD::FP_TO_FP16:
    return ST->hasFP16Denormals();

  // It can/will be lowered or combined as a bit operation.
  // Need to check their input recursively to handle.
  case ISD::FNEG:
  case ISD::FABS:
    if (MaxDepth == 0)
      return false;
    return isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1);

  case ISD::FSIN:
  case ISD::FCOS:
  case ISD::FSINCOS:
    return Op.getValueType().getScalarType() != MVT::f16;

  // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms.
  // For such targets need to check their input recursively.
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FMINNAN:
  case ISD::FMAXNAN: {
    SDValue A = Op.getOperand(0);
    SDValue B = Op.getOperand(1);

    if (ST->supportsMinMaxDenormModes() &&
        DAG.isKnownNeverNaN(A) &&
        DAG.isKnownNeverNaN(B))
      return true;

    if (MaxDepth == 0)
      return false;
    return isCanonicalized(DAG, A, ST, MaxDepth - 1) &&
           isCanonicalized(DAG, B, ST, MaxDepth - 1);
  }

  case ISD::ConstantFP: {
    // A constant is canonical unless it is a denormal or a signaling NaN.
    const APFloat &Val = cast<ConstantFPSDNode>(Op)->getValueAPF();
    if (Val.isDenormal())
      return false;
    return !(Val.isNaN() && Val.isSignaling());
  }

  default:
    break;
  }

  return false;
}
|
|
|
|
|
2016-04-14 09:42:16 +08:00
|
|
|
// Constant fold canonicalize.
|
|
|
|
SDValue SITargetLowering::performFCanonicalizeCombine(
|
|
|
|
SDNode *N,
|
|
|
|
DAGCombinerInfo &DCI) const {
|
2017-07-13 05:20:28 +08:00
|
|
|
SelectionDAG &DAG = DCI.DAG;
|
2017-02-28 06:15:25 +08:00
|
|
|
ConstantFPSDNode *CFP = isConstOrConstSplatFP(N->getOperand(0));
|
2017-07-13 05:20:28 +08:00
|
|
|
|
|
|
|
if (!CFP) {
|
|
|
|
SDValue N0 = N->getOperand(0);
|
2017-07-14 07:59:15 +08:00
|
|
|
EVT VT = N0.getValueType().getScalarType();
|
|
|
|
auto ST = getSubtarget();
|
|
|
|
|
|
|
|
if (((VT == MVT::f32 && ST->hasFP32Denormals()) ||
|
|
|
|
(VT == MVT::f64 && ST->hasFP64Denormals()) ||
|
|
|
|
(VT == MVT::f16 && ST->hasFP16Denormals())) &&
|
|
|
|
DAG.isKnownNeverNaN(N0))
|
|
|
|
return N0;
|
2017-07-13 05:20:28 +08:00
|
|
|
|
|
|
|
bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction());
|
|
|
|
|
|
|
|
if ((IsIEEEMode || isKnownNeverSNan(DAG, N0)) &&
|
2017-07-14 07:59:15 +08:00
|
|
|
isCanonicalized(DAG, N0, ST))
|
2017-07-13 05:20:28 +08:00
|
|
|
return N0;
|
|
|
|
|
2016-04-14 09:42:16 +08:00
|
|
|
return SDValue();
|
2017-07-13 05:20:28 +08:00
|
|
|
}
|
2016-04-14 09:42:16 +08:00
|
|
|
|
|
|
|
const APFloat &C = CFP->getValueAPF();
|
|
|
|
|
|
|
|
// Flush denormals to 0 if not enabled.
|
|
|
|
if (C.isDenormal()) {
|
|
|
|
EVT VT = N->getValueType(0);
|
2017-02-28 06:15:25 +08:00
|
|
|
EVT SVT = VT.getScalarType();
|
|
|
|
if (SVT == MVT::f32 && !Subtarget->hasFP32Denormals())
|
2016-04-14 09:42:16 +08:00
|
|
|
return DAG.getConstantFP(0.0, SDLoc(N), VT);
|
|
|
|
|
2017-02-28 06:15:25 +08:00
|
|
|
if (SVT == MVT::f64 && !Subtarget->hasFP64Denormals())
|
2016-04-14 09:42:16 +08:00
|
|
|
return DAG.getConstantFP(0.0, SDLoc(N), VT);
|
2016-12-22 11:05:37 +08:00
|
|
|
|
2017-02-28 06:15:25 +08:00
|
|
|
if (SVT == MVT::f16 && !Subtarget->hasFP16Denormals())
|
2016-12-22 11:05:37 +08:00
|
|
|
return DAG.getConstantFP(0.0, SDLoc(N), VT);
|
2016-04-14 09:42:16 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
if (C.isNaN()) {
|
|
|
|
EVT VT = N->getValueType(0);
|
|
|
|
APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
|
|
|
|
if (C.isSignaling()) {
|
|
|
|
// Quiet a signaling NaN.
|
|
|
|
return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Make sure it is the canonical NaN bitpattern.
|
|
|
|
//
|
|
|
|
// TODO: Can we use -1 as the canonical NaN value since it's an inline
|
|
|
|
// immediate?
|
|
|
|
if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
|
|
|
|
return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT);
|
|
|
|
}
|
|
|
|
|
2017-02-28 06:15:25 +08:00
|
|
|
return N->getOperand(0);
|
2016-04-14 09:42:16 +08:00
|
|
|
}
|
|
|
|
|
2014-11-15 04:08:52 +08:00
|
|
|
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
|
|
|
|
switch (Opc) {
|
|
|
|
case ISD::FMAXNUM:
|
|
|
|
return AMDGPUISD::FMAX3;
|
2015-06-09 08:52:37 +08:00
|
|
|
case ISD::SMAX:
|
2014-11-15 04:08:52 +08:00
|
|
|
return AMDGPUISD::SMAX3;
|
2015-06-09 08:52:37 +08:00
|
|
|
case ISD::UMAX:
|
2014-11-15 04:08:52 +08:00
|
|
|
return AMDGPUISD::UMAX3;
|
|
|
|
case ISD::FMINNUM:
|
|
|
|
return AMDGPUISD::FMIN3;
|
2015-06-09 08:52:37 +08:00
|
|
|
case ISD::SMIN:
|
2014-11-15 04:08:52 +08:00
|
|
|
return AMDGPUISD::SMIN3;
|
2015-06-09 08:52:37 +08:00
|
|
|
case ISD::UMIN:
|
2014-11-15 04:08:52 +08:00
|
|
|
return AMDGPUISD::UMIN3;
|
|
|
|
default:
|
|
|
|
llvm_unreachable("Not a min/max opcode");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-02-28 06:40:39 +08:00
|
|
|
/// Fold min(max(x, K0), K1) with constants K0 < K1 into med3(x, K0, K1).
///
/// \p Op0 is the inner max node (its operand 1 is K0), \p Op1 is K1 and
/// \p Signed selects between the signed and unsigned forms. Returns an empty
/// SDValue if either bound is not a constant or the bounds are not strictly
/// ordered.
SDValue SITargetLowering::performIntMed3ImmCombine(
  SelectionDAG &DAG, const SDLoc &SL,
  SDValue Op0, SDValue Op1, bool Signed) const {
  ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1);
  if (!K1)
    return SDValue();

  ConstantSDNode *K0 = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
  if (!K0)
    return SDValue();

  // Require K0 < K1 in the signedness being combined.
  const APInt &Lower = K0->getAPIntValue();
  const APInt &Upper = K1->getAPIntValue();
  const bool BoundsNotOrdered = Signed ? Lower.sge(Upper) : Lower.uge(Upper);
  if (BoundsNotOrdered)
    return SDValue();

  const EVT VT = K0->getValueType(0);
  const unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;

  const bool HasNativeMed3 =
      VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16());
  if (HasNativeMed3)
    return DAG.getNode(Med3Opc, SL, VT,
                       Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));

  // If there isn't a 16-bit med3 operation, convert to 32-bit.
  const MVT WideVT = MVT::i32;
  const unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;

  SDValue WideX = DAG.getNode(ExtOp, SL, WideVT, Op0->getOperand(0));
  SDValue WideK0 = DAG.getNode(ExtOp, SL, WideVT, Op0->getOperand(1));
  SDValue WideK1 = DAG.getNode(ExtOp, SL, WideVT, Op1);

  SDValue WideMed3 = DAG.getNode(Med3Opc, SL, WideVT, WideX, WideK0, WideK1);
  return DAG.getNode(ISD::TRUNCATE, SL, VT, WideMed3);
}
|
|
|
|
|
2017-08-30 09:20:17 +08:00
|
|
|
/// Return \p Op as a floating-point constant if it is one, or the splatted
/// constant if it is a build_vector splat of one; otherwise nullptr.
static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
  if (auto *Scalar = dyn_cast<ConstantFPSDNode>(Op))
    return Scalar;

  auto *BV = dyn_cast<BuildVectorSDNode>(Op);
  if (!BV)
    return nullptr;

  // getConstantFPSplatNode() yields null when the vector is not a constant
  // FP splat, which is exactly what this function reports in that case.
  return BV->getConstantFPSplatNode();
}
|
|
|
|
|
2017-02-22 07:35:48 +08:00
|
|
|
/// Fold fmin(fmax(x, K0), K1) with constant (or splat-constant) bounds into
/// either AMDGPUISD::CLAMP (bounds 0.0 and 1.0 with dx10_clamp enabled) or
/// AMDGPUISD::FMED3.
///
/// \p Op0 is the inner max node (its operand 1 is K0) and \p Op1 is K1.
/// Returns an empty SDValue when the fold does not apply.
SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
                                                  const SDLoc &SL,
                                                  SDValue Op0,
                                                  SDValue Op1) const {
  ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
  if (!K1)
    return SDValue();

  ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
  if (!K0)
    return SDValue();

  // Ordered >= (although NaN inputs should have folded away by now).
  const APFloat::cmpResult Order =
      K0->getValueAPF().compare(K1->getValueAPF());
  if (Order == APFloat::cmpGreaterThan)
    return SDValue();

  // TODO: Check IEEE bit enabled?
  const EVT VT = Op0.getValueType();
  SDValue Var = Op0.getOperand(0);

  // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
  // hardware fmed3 behavior converting to a min.
  // FIXME: Should this be allowing -0.0?
  if (Subtarget->enableDX10Clamp() &&
      K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
    return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Var);

  // med3 for f16 is only available on gfx9+, and not available for v2f16.
  const bool HasMed3ForType =
      VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16());
  if (!HasMed3ForType)
    return SDValue();

  // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
  // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
  // then give the other result, which is different from med3 with a NaN
  // input.
  if (!isKnownNeverSNan(DAG, Var))
    return SDValue();

  return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
                     Var, SDValue(K0, 0), SDValue(K1, 0));
}
|
|
|
|
|
|
|
|
/// Target combines for min/max nodes:
///  - nested min/min or max/max  -> min3 / max3
///  - min(max(x, K0), K1)        -> med3 (integer and floating point)
/// Returns an empty SDValue when nothing applies.
SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  const EVT VT = N->getValueType(0);
  const unsigned Opc = N->getOpcode();
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // Only do this if the inner op has one use since this will just increase
  // register pressure for no benefit.

  const bool IsLegacyOpc =
      Opc == AMDGPUISD::FMIN_LEGACY || Opc == AMDGPUISD::FMAX_LEGACY;
  const bool Is16BitType = VT == MVT::f16 || VT == MVT::i16;
  const bool CanFormMin3Max3 =
      !IsLegacyOpc && VT != MVT::f64 &&
      (!Is16BitType || Subtarget->hasMin3Max3_16());

  if (CanFormMin3Max3) {
    // max(max(a, b), c) -> max3(a, b, c)
    // min(min(a, b), c) -> min3(a, b, c)
    if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
      return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), SDLoc(N), VT,
                         Op0.getOperand(0), Op0.getOperand(1), Op1);
    }

    // Try commuted.
    // max(a, max(b, c)) -> max3(a, b, c)
    // min(a, min(b, c)) -> min3(a, b, c)
    if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
      return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), SDLoc(N), VT,
                         Op0, Op1.getOperand(0), Op1.getOperand(1));
    }
  }

  // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
  const unsigned InnerOpc = Op0.getOpcode();

  if (Opc == ISD::SMIN && InnerOpc == ISD::SMAX && Op0.hasOneUse()) {
    SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, true);
    if (Med3)
      return Med3;
  }

  if (Opc == ISD::UMIN && InnerOpc == ISD::UMAX && Op0.hasOneUse()) {
    SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, false);
    if (Med3)
      return Med3;
  }

  // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
  const bool IsFMinOfFMax =
      (Opc == ISD::FMINNUM && InnerOpc == ISD::FMAXNUM) ||
      (Opc == AMDGPUISD::FMIN_LEGACY && InnerOpc == AMDGPUISD::FMAX_LEGACY);
  const bool IsSupportedFPType =
      VT == MVT::f32 || VT == MVT::f64 ||
      (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
      (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts());

  if (IsFMinOfFMax && IsSupportedFPType && Op0.hasOneUse()) {
    SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1);
    if (Res)
      return Res;
  }

  return SDValue();
}
|
|
|
|
|
2017-02-22 07:35:48 +08:00
|
|
|
/// Return true when \p A and \p B are both floating-point constants and
/// together form the pair {0.0, 1.0}, in either order.
static bool isClampZeroToOne(SDValue A, SDValue B) {
  const auto *ConstA = dyn_cast<ConstantFPSDNode>(A);
  if (!ConstA)
    return false;

  const auto *ConstB = dyn_cast<ConstantFPSDNode>(B);
  if (!ConstB)
    return false;

  // FIXME: Should this be allowing -0.0?
  if (ConstA->isExactlyValue(0.0))
    return ConstB->isExactlyValue(1.0);

  return ConstA->isExactlyValue(1.0) && ConstB->isExactlyValue(0.0);
}
|
|
|
|
|
|
|
|
// FIXME: Should only worry about snans for version with chain.
|
|
|
|
SDValue SITargetLowering::performFMed3Combine(SDNode *N,
|
|
|
|
DAGCombinerInfo &DCI) const {
|
|
|
|
EVT VT = N->getValueType(0);
|
|
|
|
// v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
|
|
|
|
// NaNs. With a NaN input, the order of the operands may change the result.
|
|
|
|
|
|
|
|
SelectionDAG &DAG = DCI.DAG;
|
|
|
|
SDLoc SL(N);
|
|
|
|
|
|
|
|
SDValue Src0 = N->getOperand(0);
|
|
|
|
SDValue Src1 = N->getOperand(1);
|
|
|
|
SDValue Src2 = N->getOperand(2);
|
|
|
|
|
|
|
|
if (isClampZeroToOne(Src0, Src1)) {
|
|
|
|
// const_a, const_b, x -> clamp is safe in all cases including signaling
|
|
|
|
// nans.
|
|
|
|
// FIXME: Should this be allowing -0.0?
|
|
|
|
return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
|
|
|
|
}
|
|
|
|
|
|
|
|
// FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
|
|
|
|
// handling no dx10-clamp?
|
|
|
|
if (Subtarget->enableDX10Clamp()) {
|
|
|
|
// If NaNs is clamped to 0, we are free to reorder the inputs.
|
|
|
|
|
|
|
|
if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
|
|
|
|
std::swap(Src0, Src1);
|
|
|
|
|
|
|
|
if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
|
|
|
|
std::swap(Src1, Src2);
|
|
|
|
|
|
|
|
if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
|
|
|
|
std::swap(Src0, Src1);
|
|
|
|
|
|
|
|
if (isClampZeroToOne(Src1, Src2))
|
|
|
|
return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
|
|
|
|
}
|
|
|
|
|
|
|
|
return SDValue();
|
|
|
|
}
|
|
|
|
|
2017-02-22 08:27:34 +08:00
|
|
|
/// Fold a CVT_PKRTZ_F16_F32 node whose two sources are both undef into an
/// undef of the node's result type; otherwise return an empty SDValue.
SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  const SDValue First = N->getOperand(0);
  const SDValue Second = N->getOperand(1);

  const bool BothUndef = First.isUndef() && Second.isUndef();
  if (!BothUndef)
    return SDValue();

  return DCI.DAG.getUNDEF(N->getValueType(0));
}
|
|
|
|
|
2017-05-12 01:26:25 +08:00
|
|
|
/// Push an FNEG below an element extract:
///   extract_vector_elt (fneg v), idx -> fneg (extract_vector_elt v, idx)
/// Only done when allUsesHaveSourceMods(N) holds.
SDValue SITargetLowering::performExtractVectorEltCombine(
    SDNode *N, DAGCombinerInfo &DCI) const {
  const SDValue Vector = N->getOperand(0);
  if (Vector.getOpcode() != ISD::FNEG)
    return SDValue();

  if (!allUsesHaveSourceMods(N))
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  const SDLoc DL(N);
  const EVT ScalarVT = N->getValueType(0);
  const SDValue Index = N->getOperand(1);

  // Extract from the un-negated vector, then negate the scalar.
  const SDValue Scalar = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT,
                                     Vector.getOperand(0), Index);
  return DAG.getNode(ISD::FNEG, DL, ScalarVT, Scalar);
}
|
|
|
|
|
2017-09-21 05:01:24 +08:00
|
|
|
/// If \p Hi is a bitcast from f16 and \p Lo is an integer constant or undef,
/// rewrite both in place as f16 values: \p Lo becomes a bitcast of itself to
/// f16 and \p Hi becomes the bitcast's f16 source.
///
/// \returns true if the operands were rewritten; they are untouched otherwise.
static bool convertBuildVectorCastElt(SelectionDAG &DAG,
                                      SDValue &Lo, SDValue &Hi) {
  if (Hi.getOpcode() != ISD::BITCAST)
    return false;

  const SDValue CastSrc = Hi.getOperand(0);
  if (CastSrc.getValueType() != MVT::f16)
    return false;

  const bool LoIsConstOrUndef = isa<ConstantSDNode>(Lo) || Lo.isUndef();
  if (!LoIsConstOrUndef)
    return false;

  Lo = DAG.getNode(ISD::BITCAST, SDLoc(Lo), MVT::f16, Lo);
  Hi = CastSrc;
  return true;
}
|
|
|
|
|
|
|
|
/// Rewrite a v2i16 build_vector that mixes an integer constant/undef with a
/// bitcast from f16 into a v2f16 build_vector followed by a bitcast:
///
///   v2i16 build_vector (const|undef), (bitcast f16:$x)
///     -> bitcast (v2f16 build_vector (const|undef), $x)
///   v2i16 build_vector (bitcast f16:$x), (const|undef)
///     -> bitcast (v2f16 build_vector $x, (const|undef))
///
/// \returns the replacement value, or an empty SDValue when v2i16 is not a
/// legal type or neither pattern matches.
SDValue SITargetLowering::performBuildVectorCombine(
    SDNode *N, DAGCombinerInfo &DCI) const {
  SDLoc SL(N);

  if (!isTypeLegal(MVT::v2i16))
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  if (VT != MVT::v2i16)
    return SDValue();

  SDValue Lo = N->getOperand(0);
  SDValue Hi = N->getOperand(1);

  // convertBuildVectorCastElt rewrites both values in place, so after a
  // successful call Lo is still element 0 and Hi is still element 1 no matter
  // which of the two was the bitcast. The element order of the new vector must
  // therefore be { Lo, Hi } in both cases; emitting { Hi, Lo } for the commuted
  // match would swap the two lanes.
  if (convertBuildVectorCastElt(DAG, Lo, Hi) ||
      convertBuildVectorCastElt(DAG, Hi, Lo)) {
    SDValue NewVec = DAG.getBuildVector(MVT::v2f16, SL, { Lo, Hi });
    return DAG.getNode(ISD::BITCAST, SL, VT, NewVec);
  }

  return SDValue();
}
|
2017-05-12 01:26:25 +08:00
|
|
|
|
2016-12-22 12:03:35 +08:00
|
|
|
unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
|
|
|
|
const SDNode *N0,
|
|
|
|
const SDNode *N1) const {
|
|
|
|
EVT VT = N0->getValueType(0);
|
|
|
|
|
2016-12-22 11:55:35 +08:00
|
|
|
// Only do this if we are not trying to support denormals. v_mad_f32 does not
|
|
|
|
// support denormals ever.
|
|
|
|
if ((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
|
|
|
|
(VT == MVT::f16 && !Subtarget->hasFP16Denormals()))
|
|
|
|
return ISD::FMAD;
|
|
|
|
|
|
|
|
const TargetOptions &Options = DAG.getTarget().Options;
|
2017-05-01 23:17:51 +08:00
|
|
|
if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
|
|
|
|
(N0->getFlags().hasUnsafeAlgebra() &&
|
|
|
|
N1->getFlags().hasUnsafeAlgebra())) &&
|
2016-12-22 11:55:35 +08:00
|
|
|
isFMAFasterThanFMulAndFAdd(VT)) {
|
|
|
|
return ISD::FMA;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2017-06-22 06:05:06 +08:00
|
|
|
/// Fold an i32 add whose operand is an extended boolean or an ADDCARRY with a
/// zero addend into a single carry node.
///
/// \returns the new ADDCARRY/SUBCARRY node, or an empty SDValue if nothing
/// matched.
SDValue SITargetLowering::performAddCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  if (N->getValueType(0) != MVT::i32)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  SDValue Base = N->getOperand(0);
  SDValue Folded = N->getOperand(1);

  // Addition commutes, so move a foldable left operand to the right.
  auto IsFoldCandidate = [](unsigned Opcode) {
    return Opcode == ISD::ZERO_EXTEND || Opcode == ISD::SIGN_EXTEND ||
           Opcode == ISD::ANY_EXTEND || Opcode == ISD::ADDCARRY;
  };
  if (IsFoldCandidate(Base.getOpcode()))
    std::swap(Folded, Base);

  const unsigned FoldedOpc = Folded.getOpcode();

  if (FoldedOpc == ISD::ADDCARRY) {
    // add x, (addcarry y, 0, cc) => addcarry x, y, cc
    auto *Addend = dyn_cast<ConstantSDNode>(Folded.getOperand(1));
    if (!Addend || Addend->getZExtValue() != 0)
      return SDValue();

    SDValue Args[] = { Base, Folded.getOperand(0), Folded.getOperand(2) };
    return DAG.getNode(ISD::ADDCARRY, SDLoc(N), Folded->getVTList(), Args);
  }

  const bool IsExtend = FoldedOpc == ISD::ZERO_EXTEND ||
                        FoldedOpc == ISD::SIGN_EXTEND ||
                        FoldedOpc == ISD::ANY_EXTEND;
  if (!IsExtend)
    return SDValue();

  // add x, zext (setcc) => addcarry x, 0, setcc
  // add x, sext (setcc) => subcarry x, 0, setcc
  SDValue Cond = Folded.getOperand(0);
  if (!isBoolSGPR(Cond))
    return SDValue();

  SDVTList ResultVTs = DAG.getVTList(MVT::i32, MVT::i1);
  SDValue Args[] = { Base, DAG.getConstant(0, DL, MVT::i32), Cond };
  const unsigned CarryOpc =
      (FoldedOpc == ISD::SIGN_EXTEND) ? ISD::SUBCARRY : ISD::ADDCARRY;
  return DAG.getNode(CarryOpc, DL, ResultVTs, Args);
}
|
|
|
|
|
|
|
|
/// Fold an i32 sub whose left operand is a SUBCARRY with a zero subtrahend:
///
///   sub (subcarry x, 0, cc), y => subcarry x, y, cc
///
/// \returns the new SUBCARRY node, or an empty SDValue if the pattern does not
/// match.
SDValue SITargetLowering::performSubCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  if (N->getValueType(0) != MVT::i32)
    return SDValue();

  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  // Subtraction is not commutative, so the operands must not be swapped to
  // look for the SUBCARRY on the right-hand side:
  //   sub y, (subcarry x, 0, cc)  ==  y - (x - cc)
  // which is not the same value as
  //   subcarry x, y, cc           ==  x - y - cc.
  // Only the form with the SUBCARRY as the minuend is folded.
  if (LHS.getOpcode() != ISD::SUBCARRY)
    return SDValue();

  // sub (subcarry x, 0, cc), y => subcarry x, y, cc
  auto C = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
  if (!C || C->getZExtValue() != 0)
    return SDValue();

  SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) };
  return DAG.getNode(ISD::SUBCARRY, SDLoc(N), LHS->getVTList(), Args);
}
|
|
|
|
|
|
|
|
/// Merge a plain add/sub feeding the first operand of a matching carry node
/// whose second operand is the constant zero:
///
///   addcarry (add x, y), 0, cc => addcarry x, y, cc
///   subcarry (sub x, y), 0, cc => subcarry x, y, cc
///
/// Only i32 results are handled. Returns an empty SDValue when nothing
/// matches.
SDValue SITargetLowering::performAddCarrySubCarryCombine(
    SDNode *N, DAGCombinerInfo &DCI) const {
  if (N->getValueType(0) != MVT::i32)
    return SDValue();

  auto *SecondOp = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!SecondOp || SecondOp->getZExtValue() != 0)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  const SDValue Inner = N->getOperand(0);
  const unsigned InnerOpc = Inner.getOpcode();
  const unsigned OuterOpc = N->getOpcode();

  const bool IsAddPair = InnerOpc == ISD::ADD && OuterOpc == ISD::ADDCARRY;
  const bool IsSubPair = InnerOpc == ISD::SUB && OuterOpc == ISD::SUBCARRY;
  if (!IsAddPair && !IsSubPair)
    return SDValue();

  SDValue Args[] = { Inner.getOperand(0), Inner.getOperand(1),
                     N->getOperand(2) };
  return DAG.getNode(OuterOpc, SDLoc(N), N->getVTList(), Args);
}
|
|
|
|
|
2016-12-22 11:44:42 +08:00
|
|
|
/// Combine an fadd with a doubled operand into a fused multiply-add by 2.0:
///
///   fadd (fadd (a, a), b) -> mad 2.0, a, b
///   fadd (b, fadd (a, a)) -> mad 2.0, a, b
///
/// Runs only once the combine level has reached AfterLegalizeDAG, and only
/// when getFusedOpcode() returns a non-zero opcode. Returns an empty SDValue
/// otherwise.
SDValue SITargetLowering::performFAddCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  const EVT VT = N->getValueType(0);
  const SDLoc DL(N);

  // These should really be instruction patterns, but writing patterns with
  // source modifiers is a pain.

  // Match \p Doubled as (fadd a, a) and, if fusion is allowed, build
  // fused(a, 2.0, Addend).
  auto FuseDoubled = [&](SDValue Doubled, SDValue Addend) -> SDValue {
    if (Doubled.getOpcode() != ISD::FADD)
      return SDValue();

    SDValue A = Doubled.getOperand(0);
    if (A != Doubled.getOperand(1))
      return SDValue();

    const unsigned FusedOp = getFusedOpcode(DAG, N, Doubled.getNode());
    if (FusedOp == 0)
      return SDValue();

    const SDValue Two = DAG.getConstantFP(2.0, DL, VT);
    return DAG.getNode(FusedOp, DL, VT, A, Two, Addend);
  };

  const SDValue LHS = N->getOperand(0);
  const SDValue RHS = N->getOperand(1);

  // The doubled value on the left is tried first, then on the right.
  if (SDValue Fused = FuseDoubled(LHS, RHS))
    return Fused;

  return FuseDoubled(RHS, LHS);
}
|
|
|
|
|
|
|
|
/// Combine an fsub with a doubled operand into a fused multiply-add:
///
///   (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
///   (fsub c, (fadd a, a)) -> mad -2.0, a, c
///
/// Runs only once the combine level has reached AfterLegalizeDAG, and only
/// when getFusedOpcode() returns a non-zero opcode. Returns an empty SDValue
/// otherwise.
SDValue SITargetLowering::performFSubCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  const SDLoc DL(N);
  const EVT VT = N->getValueType(0);
  assert(!VT.isVector());

  // Try to get the fneg to fold into the source modifier. This undoes generic
  // DAG combines and folds them into the mad.
  //
  // Only do this if we are not trying to support denormals. v_mad_f32 does
  // not support denormals ever.

  // If \p V is (fadd a, a), store a in \p Operand and return the fused opcode
  // chosen for it; return 0 if \p V has a different shape.
  auto FusedOpcodeForDoubled = [&](SDValue V, SDValue &Operand) -> unsigned {
    if (V.getOpcode() != ISD::FADD)
      return 0;
    if (V.getOperand(0) != V.getOperand(1))
      return 0;
    Operand = V.getOperand(0);
    return getFusedOpcode(DAG, N, V.getNode());
  };

  const SDValue Minuend = N->getOperand(0);
  const SDValue Subtrahend = N->getOperand(1);
  SDValue A;

  // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
  if (const unsigned FusedOp = FusedOpcodeForDoubled(Minuend, A)) {
    const SDValue Two = DAG.getConstantFP(2.0, DL, VT);
    const SDValue NegSubtrahend = DAG.getNode(ISD::FNEG, DL, VT, Subtrahend);
    return DAG.getNode(FusedOp, DL, VT, A, Two, NegSubtrahend);
  }

  // (fsub c, (fadd a, a)) -> mad -2.0, a, c
  if (const unsigned FusedOp = FusedOpcodeForDoubled(Subtrahend, A)) {
    const SDValue NegTwo = DAG.getConstantFP(-2.0, DL, VT);
    return DAG.getNode(FusedOp, DL, VT, A, NegTwo, Minuend);
  }

  return SDValue();
}
|
|
|
|
|
2015-01-07 07:00:41 +08:00
|
|
|
/// Target combines for SETCC.
///
/// Two folds are attempted:
///  * a comparison of (sext i1 cc) against the constants 0 or -1 is replaced
///    by cc itself or by its negation (xor cc, -1);
///  * the isinf idiom (fcmp oeq (fabs x), +inf) is replaced by an
///    AMDGPUISD::FP_CLASS test for positive or negative infinity.
///
/// \returns the replacement value, or an empty SDValue if nothing matched.
SDValue SITargetLowering::performSetCCCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);

  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  EVT VT = LHS.getValueType();
  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();

  // Canonicalize an integer constant operand onto the right-hand side,
  // adjusting the condition code to match the swapped operands.
  auto CRHS = dyn_cast<ConstantSDNode>(RHS);
  if (!CRHS) {
    CRHS = dyn_cast<ConstantSDNode>(LHS);
    if (CRHS) {
      std::swap(LHS, RHS);
      CC = getSetCCSwappedOperands(CC);
    }
  }

  if (CRHS && VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
      isBoolSGPR(LHS.getOperand(0))) {
    // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
    // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
    // setcc (sext from i1 cc),  0, eq|sge|ule) => not cc => xor cc, -1
    // setcc (sext from i1 cc),  0, ne|ugt|slt) => cc
    if ((CRHS->isAllOnesValue() &&
         (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
        (CRHS->isNullValue() &&
         (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
      return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
                         DAG.getConstant(-1, SL, MVT::i1));
    if ((CRHS->isAllOnesValue() &&
         (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
        (CRHS->isNullValue() &&
         (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
      return LHS.getOperand(0);
  }

  // The remaining fold only applies to f32, f64, and - when the subtarget has
  // 16-bit instructions - f16. The f16 clause must reject the type when
  // 16-bit instructions are missing; testing
  // "has16BitInsts() && VT != f16" instead would let every type through on
  // subtargets without 16-bit instructions.
  if (VT != MVT::f32 && VT != MVT::f64 &&
      (!Subtarget->has16BitInsts() || VT != MVT::f16))
    return SDValue();

  // Match isinf pattern
  // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
  if (CC == ISD::SETOEQ && LHS.getOpcode() == ISD::FABS) {
    const ConstantFPSDNode *FPConst = dyn_cast<ConstantFPSDNode>(RHS);
    if (!FPConst)
      return SDValue();

    const APFloat &APF = FPConst->getValueAPF();
    if (APF.isInfinity() && !APF.isNegative()) {
      unsigned Mask = SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
      return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
                         DAG.getConstant(Mask, SL, MVT::i32));
    }
  }

  return SDValue();
}
|
|
|
|
|
2016-12-22 11:44:42 +08:00
|
|
|
/// Combine for the CVT_F32_UBYTE0..3 nodes.
///
/// A constant right shift feeding the conversion is folded into the byte
/// selector when the combined bit offset is byte aligned and below 32:
///
///   cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
///   cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
///   cvt_f32_ubyte0 (srl x, 8)  -> cvt_f32_ubyte1 x
///
/// Otherwise the source is simplified given that only the selected 8 bits are
/// demanded. In that case the function itself returns an empty SDValue.
SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
                                                     DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  const unsigned ByteIdx = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;

  SDValue Src = N->getOperand(0);

  // Look through a zero extension when searching for the shift.
  SDValue Shift = Src;
  if (Shift.getOpcode() == ISD::ZERO_EXTEND)
    Shift = Shift.getOperand(0);

  // TODO: Handle (or x, (srl y, 8)) pattern when known bits are zero.
  const ConstantSDNode *ShiftAmt =
      Shift.getOpcode() == ISD::SRL
          ? dyn_cast<ConstantSDNode>(Shift.getOperand(1))
          : nullptr;
  if (ShiftAmt) {
    const SDValue Shifted = Shift.getOperand(0);
    SDValue Wide = DAG.getZExtOrTrunc(Shifted, SDLoc(Shifted), EVT(MVT::i32));

    const unsigned BitOffset = ShiftAmt->getZExtValue() + 8 * ByteIdx;
    if (BitOffset < 32 && BitOffset % 8 == 0)
      return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + BitOffset / 8, DL,
                         MVT::f32, Wide);
  }

  // Only the selected byte of the 32-bit source is read.
  APInt DemandedBits = APInt::getBitsSet(32, 8 * ByteIdx, 8 * ByteIdx + 8);

  KnownBits Known;
  TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                        !DCI.isBeforeLegalizeOps());
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  const bool Simplified =
      TLI.ShrinkDemandedConstant(Src, DemandedBits, TLO) ||
      TLI.SimplifyDemandedBits(Src, DemandedBits, Known, TLO);
  if (Simplified)
    DCI.CommitTargetLoweringOpt(TLO);

  return SDValue();
}
|
|
|
|
|
|
|
|
/// Entry point for SI-specific DAG combines: dispatch on the opcode of \p N to
/// the matching perform*Combine helper. Opcodes with no SI handler, and cases
/// that decline to fire, fall back to AMDGPUTargetLowering::PerformDAGCombine.
SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  const unsigned Opc = N->getOpcode();

  switch (Opc) {
  // Integer arithmetic and carry chains.
  case ISD::ADD:
    return performAddCombine(N, DCI);
  case ISD::SUB:
    return performSubCombine(N, DCI);
  case ISD::ADDCARRY:
  case ISD::SUBCARRY:
    return performAddCarrySubCarryCombine(N, DCI);

  // Bitwise operations and extensions.
  case ISD::AND:
    return performAndCombine(N, DCI);
  case ISD::OR:
    return performOrCombine(N, DCI);
  case ISD::XOR:
    return performXorCombine(N, DCI);
  case ISD::ZERO_EXTEND:
    return performZeroExtendCombine(N, DCI);

  // Floating-point arithmetic and comparisons.
  case ISD::FADD:
    return performFAddCombine(N, DCI);
  case ISD::FSUB:
    return performFSubCombine(N, DCI);
  case ISD::SETCC:
    return performSetCCCombine(N, DCI);
  case ISD::FCANONICALIZE:
    return performFCanonicalizeCombine(N, DCI);
  case AMDGPUISD::FP_CLASS:
    return performClassCombine(N, DCI);
  case AMDGPUISD::FMED3:
    return performFMed3Combine(N, DCI);
  case AMDGPUISD::CVT_PKRTZ_F16_F32:
    return performCvtPkRTZCombine(N, DCI);

  // Min/max combines run only at or after AfterLegalizeDAG and only when
  // optimizing.
  case ISD::FMAXNUM:
  case ISD::FMINNUM:
  case ISD::SMAX:
  case ISD::SMIN:
  case ISD::UMAX:
  case ISD::UMIN:
  case AMDGPUISD::FMIN_LEGACY:
  case AMDGPUISD::FMAX_LEGACY:
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG ||
        getTargetMachine().getOptLevel() <= CodeGenOpt::None)
      break;
    return performMinMaxCombine(N, DCI);

  // Memory operations are only combined once legalization has started.
  case ISD::LOAD:
  case ISD::STORE:
  case ISD::ATOMIC_LOAD:
  case ISD::ATOMIC_STORE:
  case ISD::ATOMIC_CMP_SWAP:
  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
  case ISD::ATOMIC_SWAP:
  case ISD::ATOMIC_LOAD_ADD:
  case ISD::ATOMIC_LOAD_SUB:
  case ISD::ATOMIC_LOAD_AND:
  case ISD::ATOMIC_LOAD_OR:
  case ISD::ATOMIC_LOAD_XOR:
  case ISD::ATOMIC_LOAD_NAND:
  case ISD::ATOMIC_LOAD_MIN:
  case ISD::ATOMIC_LOAD_MAX:
  case ISD::ATOMIC_LOAD_UMIN:
  case ISD::ATOMIC_LOAD_UMAX:
  case AMDGPUISD::ATOMIC_INC:
  case AMDGPUISD::ATOMIC_DEC: // TODO: Target mem intrinsics.
    if (!DCI.isBeforeLegalize())
      return performMemSDNodeCombine(cast<MemSDNode>(N), DCI);
    break;

  // An undef first operand makes the whole node undef.
  case AMDGPUISD::FRACT:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RSQ_LEGACY:
  case AMDGPUISD::RSQ_CLAMP:
  case AMDGPUISD::LDEXP: {
    const SDValue FirstOp = N->getOperand(0);
    if (FirstOp.isUndef())
      return FirstOp;
    break;
  }

  // Integer-to-float conversions.
  case ISD::SINT_TO_FP:
  case ISD::UINT_TO_FP:
    return performUCharToFloatCombine(N, DCI);
  case AMDGPUISD::CVT_F32_UBYTE0:
  case AMDGPUISD::CVT_F32_UBYTE1:
  case AMDGPUISD::CVT_F32_UBYTE2:
  case AMDGPUISD::CVT_F32_UBYTE3:
    return performCvtF32UByteNCombine(N, DCI);

  // Vector construction and element access.
  case ISD::SCALAR_TO_VECTOR: {
    const EVT ResVT = N->getValueType(0);
    if (ResVT != MVT::v2i16 && ResVT != MVT::v2f16)
      break;

    // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
    SelectionDAG &DAG = DCI.DAG;
    const SDLoc DL(N);
    SDValue Scalar = N->getOperand(0);
    if (Scalar.getValueType() == MVT::f16)
      Scalar = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Scalar);

    const SDValue Widened = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Scalar);
    return DAG.getNode(ISD::BITCAST, DL, ResVT, Widened);
  }
  case ISD::EXTRACT_VECTOR_ELT:
    return performExtractVectorEltCombine(N, DCI);
  case ISD::BUILD_VECTOR:
    return performBuildVectorCombine(N, DCI);

  default:
    break;
  }

  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}
|
2013-02-27 01:52:16 +08:00
|
|
|
|
2013-04-10 16:39:08 +08:00
|
|
|
/// \brief Helper function for adjustWritemask
|
2013-05-23 23:43:05 +08:00
|
|
|
static unsigned SubIdx2Lane(unsigned Idx) {
|
2013-04-10 16:39:08 +08:00
|
|
|
switch (Idx) {
|
|
|
|
default: return 0;
|
|
|
|
case AMDGPU::sub0: return 0;
|
|
|
|
case AMDGPU::sub1: return 1;
|
|
|
|
case AMDGPU::sub2: return 2;
|
|
|
|
case AMDGPU::sub3: return 3;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// \brief Adjust the writemask of MIMG instructions
|
|
|
|
void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
|
|
|
|
SelectionDAG &DAG) const {
|
|
|
|
SDNode *Users[4] = { };
|
2013-10-23 10:53:47 +08:00
|
|
|
unsigned Lane = 0;
|
2016-02-26 17:51:05 +08:00
|
|
|
unsigned DmaskIdx = (Node->getNumOperands() - Node->getNumValues() == 9) ? 2 : 3;
|
|
|
|
unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
|
2013-10-23 10:53:47 +08:00
|
|
|
unsigned NewDmask = 0;
|
2013-04-10 16:39:08 +08:00
|
|
|
|
|
|
|
// Try to figure out the used register components
|
|
|
|
for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
|
|
|
|
I != E; ++I) {
|
|
|
|
|
2017-02-23 05:16:41 +08:00
|
|
|
// Don't look at users of the chain.
|
|
|
|
if (I.getUse().getResNo() != 0)
|
|
|
|
continue;
|
|
|
|
|
2013-04-10 16:39:08 +08:00
|
|
|
// Abort if we can't understand the usage
|
|
|
|
if (!I->isMachineOpcode() ||
|
|
|
|
I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
|
|
|
|
return;
|
|
|
|
|
2013-10-23 10:53:47 +08:00
|
|
|
// Lane means which subreg of %VGPRa_VGPRb_VGPRc_VGPRd is used.
|
|
|
|
// Note that subregs are packed, i.e. Lane==0 is the first bit set
|
|
|
|
// in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
|
|
|
|
// set, etc.
|
2013-04-10 16:39:16 +08:00
|
|
|
Lane = SubIdx2Lane(I->getConstantOperandVal(1));
|
2013-04-10 16:39:08 +08:00
|
|
|
|
2013-10-23 10:53:47 +08:00
|
|
|
// Set which texture component corresponds to the lane.
|
|
|
|
unsigned Comp;
|
|
|
|
for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {
|
|
|
|
assert(Dmask);
|
2013-10-23 11:50:25 +08:00
|
|
|
Comp = countTrailingZeros(Dmask);
|
2013-10-23 10:53:47 +08:00
|
|
|
Dmask &= ~(1 << Comp);
|
|
|
|
}
|
|
|
|
|
2013-04-10 16:39:08 +08:00
|
|
|
// Abort if we have more than one user per component
|
|
|
|
if (Users[Lane])
|
|
|
|
return;
|
|
|
|
|
|
|
|
Users[Lane] = *I;
|
2013-10-23 10:53:47 +08:00
|
|
|
NewDmask |= 1 << Comp;
|
2013-04-10 16:39:08 +08:00
|
|
|
}
|
|
|
|
|
2013-10-23 10:53:47 +08:00
|
|
|
// Abort if there's no change
|
|
|
|
if (NewDmask == OldDmask)
|
2013-04-10 16:39:08 +08:00
|
|
|
return;
|
|
|
|
|
|
|
|
// Adjust the writemask in the node
|
|
|
|
std::vector<SDValue> Ops;
|
2016-02-26 17:51:05 +08:00
|
|
|
Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
|
2015-04-28 22:05:47 +08:00
|
|
|
Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
|
2016-02-26 17:51:05 +08:00
|
|
|
Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
|
2014-04-28 13:57:50 +08:00
|
|
|
Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops);
|
2013-04-10 16:39:08 +08:00
|
|
|
|
2013-04-10 16:39:16 +08:00
|
|
|
// If we only got one lane, replace it with a copy
|
2013-10-23 10:53:47 +08:00
|
|
|
// (if NewDmask has only one bit set...)
|
|
|
|
if (NewDmask && (NewDmask & (NewDmask-1)) == 0) {
|
2015-04-28 22:05:47 +08:00
|
|
|
SDValue RC = DAG.getTargetConstant(AMDGPU::VGPR_32RegClassID, SDLoc(),
|
|
|
|
MVT::i32);
|
2013-04-10 16:39:16 +08:00
|
|
|
SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
|
2013-05-25 10:42:55 +08:00
|
|
|
SDLoc(), Users[Lane]->getValueType(0),
|
2013-04-10 16:39:16 +08:00
|
|
|
SDValue(Node, 0), RC);
|
|
|
|
DAG.ReplaceAllUsesWith(Users[Lane], Copy);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2013-04-10 16:39:08 +08:00
|
|
|
// Update the users of the node with the new indices
|
|
|
|
for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {
|
|
|
|
SDNode *User = Users[i];
|
|
|
|
if (!User)
|
|
|
|
continue;
|
|
|
|
|
2015-04-28 22:05:47 +08:00
|
|
|
SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
|
2013-04-10 16:39:08 +08:00
|
|
|
DAG.UpdateNodeOperands(User, User->getOperand(0), Op);
|
|
|
|
|
|
|
|
switch (Idx) {
|
|
|
|
default: break;
|
|
|
|
case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
|
|
|
|
case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
|
|
|
|
case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-07-17 03:40:07 +08:00
|
|
|
/// Return true if \p Op is a frame index, looking through one AssertZext.
static bool isFrameIndexOp(SDValue Op) {
  const SDValue Stripped =
      (Op.getOpcode() == ISD::AssertZext) ? Op.getOperand(0) : Op;
  return isa<FrameIndexSDNode>(Stripped);
}
|
|
|
|
|
2014-10-10 03:06:00 +08:00
|
|
|
/// \brief Legalize target independent instructions (e.g. INSERT_SUBREG)
|
|
|
|
/// with frame index operands.
|
|
|
|
/// LLVM assumes that inputs are to these instructions are registers.
|
2017-04-13 05:58:23 +08:00
|
|
|
SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
|
|
|
|
SelectionDAG &DAG) const {
|
|
|
|
if (Node->getOpcode() == ISD::CopyToReg) {
|
|
|
|
RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
|
|
|
|
SDValue SrcVal = Node->getOperand(2);
|
|
|
|
|
|
|
|
// Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
|
|
|
|
// to try understanding copies to physical registers.
|
|
|
|
if (SrcVal.getValueType() == MVT::i1 &&
|
|
|
|
TargetRegisterInfo::isPhysicalRegister(DestReg->getReg())) {
|
|
|
|
SDLoc SL(Node);
|
|
|
|
MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
|
|
|
|
SDValue VReg = DAG.getRegister(
|
|
|
|
MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
|
|
|
|
|
|
|
|
SDNode *Glued = Node->getGluedNode();
|
|
|
|
SDValue ToVReg
|
|
|
|
= DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal,
|
|
|
|
SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
|
|
|
|
SDValue ToResultReg
|
|
|
|
= DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
|
|
|
|
VReg, ToVReg.getValue(1));
|
|
|
|
DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
|
|
|
|
DAG.RemoveDeadNode(Node);
|
|
|
|
return ToResultReg.getNode();
|
|
|
|
}
|
|
|
|
}
|
2014-10-10 02:09:15 +08:00
|
|
|
|
|
|
|
SmallVector<SDValue, 8> Ops;
|
2014-10-10 03:06:00 +08:00
|
|
|
for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
|
2015-07-17 03:40:07 +08:00
|
|
|
if (!isFrameIndexOp(Node->getOperand(i))) {
|
2014-10-10 03:06:00 +08:00
|
|
|
Ops.push_back(Node->getOperand(i));
|
2014-10-10 02:09:15 +08:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2014-10-10 03:06:00 +08:00
|
|
|
SDLoc DL(Node);
|
2014-10-10 02:09:15 +08:00
|
|
|
Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
|
2014-10-10 03:06:00 +08:00
|
|
|
Node->getOperand(i).getValueType(),
|
|
|
|
Node->getOperand(i)), 0));
|
2014-10-10 02:09:15 +08:00
|
|
|
}
|
|
|
|
|
2014-10-10 03:06:00 +08:00
|
|
|
DAG.UpdateNodeOperands(Node, Ops);
|
2017-04-13 05:58:23 +08:00
|
|
|
return Node;
|
2014-10-10 02:09:15 +08:00
|
|
|
}
|
|
|
|
|
2014-06-04 07:06:13 +08:00
|
|
|
/// \brief Fold the instructions after selecting them.
|
2013-04-10 16:39:08 +08:00
|
|
|
SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
|
|
|
|
SelectionDAG &DAG) const {
|
2016-06-24 14:30:11 +08:00
|
|
|
const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
|
2016-02-19 00:44:18 +08:00
|
|
|
unsigned Opcode = Node->getMachineOpcode();
|
2013-04-10 16:39:08 +08:00
|
|
|
|
2016-07-12 05:59:43 +08:00
|
|
|
if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() &&
|
|
|
|
!TII->isGather4(Opcode))
|
2013-04-10 16:39:08 +08:00
|
|
|
adjustWritemask(Node, DAG);
|
|
|
|
|
2016-02-19 00:44:18 +08:00
|
|
|
if (Opcode == AMDGPU::INSERT_SUBREG ||
|
|
|
|
Opcode == AMDGPU::REG_SEQUENCE) {
|
2014-10-10 02:09:15 +08:00
|
|
|
legalizeTargetIndependentNode(Node, DAG);
|
|
|
|
return Node;
|
|
|
|
}
|
2017-08-02 04:49:41 +08:00
|
|
|
|
|
|
|
switch (Opcode) {
|
|
|
|
case AMDGPU::V_DIV_SCALE_F32:
|
|
|
|
case AMDGPU::V_DIV_SCALE_F64: {
|
|
|
|
// Satisfy the operand register constraint when one of the inputs is
|
|
|
|
// undefined. Ordinarily each undef value will have its own implicit_def of
|
|
|
|
// a vreg, so force these to use a single register.
|
|
|
|
SDValue Src0 = Node->getOperand(0);
|
|
|
|
SDValue Src1 = Node->getOperand(1);
|
|
|
|
SDValue Src2 = Node->getOperand(2);
|
|
|
|
|
|
|
|
if ((Src0.isMachineOpcode() &&
|
|
|
|
Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
|
|
|
|
(Src0 == Src1 || Src0 == Src2))
|
|
|
|
break;
|
|
|
|
|
|
|
|
MVT VT = Src0.getValueType().getSimpleVT();
|
|
|
|
const TargetRegisterClass *RC = getRegClassFor(VT);
|
|
|
|
|
|
|
|
MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
|
|
|
|
SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
|
|
|
|
|
|
|
|
SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node),
|
|
|
|
UndefReg, Src0, SDValue());
|
|
|
|
|
|
|
|
// src0 must be the same register as src1 or src2, even if the value is
|
|
|
|
// undefined, so make sure we don't violate this constraint.
|
|
|
|
if (Src0.isMachineOpcode() &&
|
|
|
|
Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
|
|
|
|
if (Src1.isMachineOpcode() &&
|
|
|
|
Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
|
|
|
|
Src0 = Src1;
|
|
|
|
else if (Src2.isMachineOpcode() &&
|
|
|
|
Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
|
|
|
|
Src0 = Src2;
|
|
|
|
else {
|
|
|
|
assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
|
|
|
|
Src0 = UndefReg;
|
|
|
|
Src1 = UndefReg;
|
|
|
|
}
|
|
|
|
} else
|
|
|
|
break;
|
|
|
|
|
|
|
|
SmallVector<SDValue, 4> Ops = { Src0, Src1, Src2 };
|
|
|
|
for (unsigned I = 3, N = Node->getNumOperands(); I != N; ++I)
|
|
|
|
Ops.push_back(Node->getOperand(I));
|
|
|
|
|
|
|
|
Ops.push_back(ImpDef.getValue(1));
|
|
|
|
return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
|
|
|
|
}
|
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2015-01-08 23:08:17 +08:00
|
|
|
return Node;
|
2013-04-10 16:39:08 +08:00
|
|
|
}
|
2013-04-10 16:39:16 +08:00
|
|
|
|
|
|
|
/// \brief Assign the register class depending on the number of
|
|
|
|
/// bits set in the writemask
|
2016-07-01 06:52:52 +08:00
|
|
|
void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
|
2013-04-10 16:39:16 +08:00
|
|
|
SDNode *Node) const {
|
2016-06-24 14:30:11 +08:00
|
|
|
const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
|
2013-04-10 16:39:16 +08:00
|
|
|
|
2016-07-01 06:52:52 +08:00
|
|
|
MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
|
2015-10-22 05:51:02 +08:00
|
|
|
|
2016-07-01 06:52:52 +08:00
|
|
|
if (TII->isVOP3(MI.getOpcode())) {
|
2015-10-22 05:51:02 +08:00
|
|
|
// Make sure constant bus requirements are respected.
|
2016-07-01 06:52:52 +08:00
|
|
|
TII->legalizeOperandsVOP3(MRI, MI);
|
2015-10-22 05:51:02 +08:00
|
|
|
return;
|
|
|
|
}
|
2014-09-27 01:54:59 +08:00
|
|
|
|
2016-07-01 06:52:52 +08:00
|
|
|
if (TII->isMIMG(MI)) {
|
|
|
|
unsigned VReg = MI.getOperand(0).getReg();
|
2016-11-15 02:33:18 +08:00
|
|
|
const TargetRegisterClass *RC = MRI.getRegClass(VReg);
|
|
|
|
// TODO: Need mapping tables to handle other cases (register classes).
|
|
|
|
if (RC != &AMDGPU::VReg_128RegClass)
|
|
|
|
return;
|
|
|
|
|
2016-07-01 06:52:52 +08:00
|
|
|
unsigned DmaskIdx = MI.getNumOperands() == 12 ? 3 : 4;
|
|
|
|
unsigned Writemask = MI.getOperand(DmaskIdx).getImm();
|
2014-09-08 23:07:31 +08:00
|
|
|
unsigned BitsSet = 0;
|
|
|
|
for (unsigned i = 0; i < 4; ++i)
|
|
|
|
BitsSet += Writemask & (1 << i) ? 1 : 0;
|
|
|
|
switch (BitsSet) {
|
|
|
|
default: return;
|
2015-01-08 04:59:25 +08:00
|
|
|
case 1: RC = &AMDGPU::VGPR_32RegClass; break;
|
2014-09-08 23:07:31 +08:00
|
|
|
case 2: RC = &AMDGPU::VReg_64RegClass; break;
|
|
|
|
case 3: RC = &AMDGPU::VReg_96RegClass; break;
|
|
|
|
}
|
|
|
|
|
2016-07-01 06:52:52 +08:00
|
|
|
unsigned NewOpcode = TII->getMaskedMIMGOp(MI.getOpcode(), BitsSet);
|
|
|
|
MI.setDesc(TII->get(NewOpcode));
|
2014-09-08 23:07:31 +08:00
|
|
|
MRI.setRegClass(VReg, RC);
|
|
|
|
return;
|
2013-04-10 16:39:16 +08:00
|
|
|
}
|
|
|
|
|
2014-09-08 23:07:31 +08:00
|
|
|
// Replace unused atomics with the no return version.
|
2016-07-01 06:52:52 +08:00
|
|
|
int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
|
2014-09-08 23:07:31 +08:00
|
|
|
if (NoRetAtomicOp != -1) {
|
|
|
|
if (!Node->hasAnyUseOfValue(0)) {
|
2016-07-01 06:52:52 +08:00
|
|
|
MI.setDesc(TII->get(NoRetAtomicOp));
|
|
|
|
MI.RemoveOperand(0);
|
AMDGPU: Implement {BUFFER,FLAT}_ATOMIC_CMPSWAP{,_X2}
Summary:
Implement BUFFER_ATOMIC_CMPSWAP{,_X2} instructions on all GCN targets, and FLAT_ATOMIC_CMPSWAP{,_X2} on CI+.
32-bit instruction variants tested manually on Kabini and Bonaire. Tests and parts of code provided by Jan Veselý.
Patch by: Vedran Miletić
Reviewers: arsenm, tstellarAMD, nhaehnle
Subscribers: jvesely, scchan, kanarayan, arsenm
Differential Revision: http://reviews.llvm.org/D17280
llvm-svn: 265170
2016-04-02 02:27:37 +08:00
|
|
|
return;
|
2014-09-08 23:07:31 +08:00
|
|
|
}
|
|
|
|
|
AMDGPU: Implement {BUFFER,FLAT}_ATOMIC_CMPSWAP{,_X2}
Summary:
Implement BUFFER_ATOMIC_CMPSWAP{,_X2} instructions on all GCN targets, and FLAT_ATOMIC_CMPSWAP{,_X2} on CI+.
32-bit instruction variants tested manually on Kabini and Bonaire. Tests and parts of code provided by Jan Veselý.
Patch by: Vedran Miletić
Reviewers: arsenm, tstellarAMD, nhaehnle
Subscribers: jvesely, scchan, kanarayan, arsenm
Differential Revision: http://reviews.llvm.org/D17280
llvm-svn: 265170
2016-04-02 02:27:37 +08:00
|
|
|
// For mubuf_atomic_cmpswap, we need to have tablegen use an extract_subreg
|
|
|
|
// instruction, because the return type of these instructions is a vec2 of
|
|
|
|
// the memory type, so it can be tied to the input operand.
|
|
|
|
// This means these instructions always have a use, so we need to add a
|
|
|
|
// special case to check if the atomic has only one extract_subreg use,
|
|
|
|
// which itself has no uses.
|
|
|
|
if ((Node->hasNUsesOfValue(1, 0) &&
|
2016-04-15 22:42:36 +08:00
|
|
|
Node->use_begin()->isMachineOpcode() &&
|
AMDGPU: Implement {BUFFER,FLAT}_ATOMIC_CMPSWAP{,_X2}
Summary:
Implement BUFFER_ATOMIC_CMPSWAP{,_X2} instructions on all GCN targets, and FLAT_ATOMIC_CMPSWAP{,_X2} on CI+.
32-bit instruction variants tested manually on Kabini and Bonaire. Tests and parts of code provided by Jan Veselý.
Patch by: Vedran Miletić
Reviewers: arsenm, tstellarAMD, nhaehnle
Subscribers: jvesely, scchan, kanarayan, arsenm
Differential Revision: http://reviews.llvm.org/D17280
llvm-svn: 265170
2016-04-02 02:27:37 +08:00
|
|
|
Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG &&
|
|
|
|
!Node->use_begin()->hasAnyUseOfValue(0))) {
|
2016-07-01 06:52:52 +08:00
|
|
|
unsigned Def = MI.getOperand(0).getReg();
|
AMDGPU: Implement {BUFFER,FLAT}_ATOMIC_CMPSWAP{,_X2}
Summary:
Implement BUFFER_ATOMIC_CMPSWAP{,_X2} instructions on all GCN targets, and FLAT_ATOMIC_CMPSWAP{,_X2} on CI+.
32-bit instruction variants tested manually on Kabini and Bonaire. Tests and parts of code provided by Jan Veselý.
Patch by: Vedran Miletić
Reviewers: arsenm, tstellarAMD, nhaehnle
Subscribers: jvesely, scchan, kanarayan, arsenm
Differential Revision: http://reviews.llvm.org/D17280
llvm-svn: 265170
2016-04-02 02:27:37 +08:00
|
|
|
|
|
|
|
// Change this into a noret atomic.
|
2016-07-01 06:52:52 +08:00
|
|
|
MI.setDesc(TII->get(NoRetAtomicOp));
|
|
|
|
MI.RemoveOperand(0);
|
AMDGPU: Implement {BUFFER,FLAT}_ATOMIC_CMPSWAP{,_X2}
Summary:
Implement BUFFER_ATOMIC_CMPSWAP{,_X2} instructions on all GCN targets, and FLAT_ATOMIC_CMPSWAP{,_X2} on CI+.
32-bit instruction variants tested manually on Kabini and Bonaire. Tests and parts of code provided by Jan Veselý.
Patch by: Vedran Miletić
Reviewers: arsenm, tstellarAMD, nhaehnle
Subscribers: jvesely, scchan, kanarayan, arsenm
Differential Revision: http://reviews.llvm.org/D17280
llvm-svn: 265170
2016-04-02 02:27:37 +08:00
|
|
|
|
|
|
|
// If we only remove the def operand from the atomic instruction, the
|
|
|
|
// extract_subreg will be left with a use of a vreg without a def.
|
|
|
|
// So we need to insert an implicit_def to avoid machine verifier
|
|
|
|
// errors.
|
2016-07-01 06:52:52 +08:00
|
|
|
BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
|
AMDGPU: Implement {BUFFER,FLAT}_ATOMIC_CMPSWAP{,_X2}
Summary:
Implement BUFFER_ATOMIC_CMPSWAP{,_X2} instructions on all GCN targets, and FLAT_ATOMIC_CMPSWAP{,_X2} on CI+.
32-bit instruction variants tested manually on Kabini and Bonaire. Tests and parts of code provided by Jan Veselý.
Patch by: Vedran Miletić
Reviewers: arsenm, tstellarAMD, nhaehnle
Subscribers: jvesely, scchan, kanarayan, arsenm
Differential Revision: http://reviews.llvm.org/D17280
llvm-svn: 265170
2016-04-02 02:27:37 +08:00
|
|
|
TII->get(AMDGPU::IMPLICIT_DEF), Def);
|
|
|
|
}
|
2014-09-08 23:07:31 +08:00
|
|
|
return;
|
|
|
|
}
|
2013-04-10 16:39:16 +08:00
|
|
|
}
|
2013-06-04 01:39:58 +08:00
|
|
|
|
2016-06-12 23:39:02 +08:00
|
|
|
/// Build an S_MOV_B32 machine node that takes \p Val, emitted as an i32
/// target constant, as its only operand, and return result 0 of that node.
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
                              uint64_t Val) {
  SDValue Imm = DAG.getTargetConstant(Val, DL, MVT::i32);
  MachineSDNode *Mov =
      DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, Imm);
  return SDValue(Mov, 0);
}
|
|
|
|
|
|
|
|
/// Build a v4i32 REG_SEQUENCE in SReg_128 that places \p Ptr in sub0_sub1
/// and a constant pair in sub2_sub3. The constant pair is zero in its low
/// dword and the upper 32 bits of the default resource data format in its
/// high dword.
MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
                                                const SDLoc &DL,
                                                SDValue Ptr) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

  // Build the half of the subregister with the constants before building the
  // full 128-bit register. If we are building multiple resource descriptors,
  // this will allow CSEing of the 2-component register.
  SDValue ConstPairRC =
      DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32);
  SDValue ZeroDword = buildSMovImm32(DAG, DL, 0);
  SDValue Sub0Idx = DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
  SDValue FormatHiDword =
      buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32);
  SDValue Sub1Idx = DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32);

  const SDValue ConstPairOps[] = {ConstPairRC, ZeroDword, Sub0Idx,
                                  FormatHiDword, Sub1Idx};
  MachineSDNode *ConstPair =
      DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v2i32, ConstPairOps);
  SDValue SubRegHi(ConstPair, 0);

  // Combine the constants and the pointer.
  SDValue RsrcRC =
      DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32);
  SDValue LoPairIdx = DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
  SDValue HiPairIdx = DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);

  const SDValue RsrcOps[] = {RsrcRC, Ptr, LoPairIdx, SubRegHi, HiPairIdx};
  return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, RsrcOps);
}
|
|
|
|
|
2014-11-06 03:01:19 +08:00
|
|
|
/// \brief Return a resource descriptor with the 'Add TID' bit enabled
|
2015-08-09 02:27:36 +08:00
|
|
|
/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
|
|
|
|
/// of the resource descriptor) to create an offset, which is added to
|
|
|
|
/// the resource pointer.
|
2016-06-12 23:39:02 +08:00
|
|
|
MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
|
|
|
|
SDValue Ptr, uint32_t RsrcDword1,
|
2014-11-06 03:01:19 +08:00
|
|
|
uint64_t RsrcDword2And3) const {
|
|
|
|
SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
|
|
|
|
SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
|
|
|
|
if (RsrcDword1) {
|
|
|
|
PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
|
2015-04-28 22:05:47 +08:00
|
|
|
DAG.getConstant(RsrcDword1, DL, MVT::i32)),
|
|
|
|
0);
|
2014-11-06 03:01:19 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
SDValue DataLo = buildSMovImm32(DAG, DL,
|
|
|
|
RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
|
|
|
|
SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
|
|
|
|
|
|
|
|
const SDValue Ops[] = {
|
2015-04-28 22:05:47 +08:00
|
|
|
DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
|
2014-11-06 03:01:19 +08:00
|
|
|
PtrLo,
|
2015-04-28 22:05:47 +08:00
|
|
|
DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
|
2014-11-06 03:01:19 +08:00
|
|
|
PtrHi,
|
2015-04-28 22:05:47 +08:00
|
|
|
DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
|
2014-11-06 03:01:19 +08:00
|
|
|
DataLo,
|
2015-04-28 22:05:47 +08:00
|
|
|
DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
|
2014-11-06 03:01:19 +08:00
|
|
|
DataHi,
|
2015-04-28 22:05:47 +08:00
|
|
|
DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)
|
2014-11-06 03:01:19 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
|
|
|
|
}
|
|
|
|
|
2015-04-08 09:09:26 +08:00
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
// SI Inline Assembly Support
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
std::pair<unsigned, const TargetRegisterClass *>
|
|
|
|
SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
|
2015-07-06 03:29:18 +08:00
|
|
|
StringRef Constraint,
|
2015-04-08 09:09:26 +08:00
|
|
|
MVT VT) const {
|
2016-11-18 12:42:57 +08:00
|
|
|
if (!isTypeLegal(VT))
|
|
|
|
return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
|
2015-12-10 10:12:53 +08:00
|
|
|
|
|
|
|
if (Constraint.size() == 1) {
|
|
|
|
switch (Constraint[0]) {
|
|
|
|
case 's':
|
|
|
|
case 'r':
|
|
|
|
switch (VT.getSizeInBits()) {
|
|
|
|
default:
|
|
|
|
return std::make_pair(0U, nullptr);
|
|
|
|
case 32:
|
2016-12-21 03:06:12 +08:00
|
|
|
case 16:
|
2016-11-26 01:37:09 +08:00
|
|
|
return std::make_pair(0U, &AMDGPU::SReg_32_XM0RegClass);
|
2015-12-10 10:12:53 +08:00
|
|
|
case 64:
|
|
|
|
return std::make_pair(0U, &AMDGPU::SGPR_64RegClass);
|
|
|
|
case 128:
|
|
|
|
return std::make_pair(0U, &AMDGPU::SReg_128RegClass);
|
|
|
|
case 256:
|
|
|
|
return std::make_pair(0U, &AMDGPU::SReg_256RegClass);
|
2017-02-22 03:12:08 +08:00
|
|
|
case 512:
|
|
|
|
return std::make_pair(0U, &AMDGPU::SReg_512RegClass);
|
2015-12-10 10:12:53 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
case 'v':
|
|
|
|
switch (VT.getSizeInBits()) {
|
|
|
|
default:
|
|
|
|
return std::make_pair(0U, nullptr);
|
|
|
|
case 32:
|
2016-12-21 03:06:12 +08:00
|
|
|
case 16:
|
2015-12-10 10:12:53 +08:00
|
|
|
return std::make_pair(0U, &AMDGPU::VGPR_32RegClass);
|
|
|
|
case 64:
|
|
|
|
return std::make_pair(0U, &AMDGPU::VReg_64RegClass);
|
|
|
|
case 96:
|
|
|
|
return std::make_pair(0U, &AMDGPU::VReg_96RegClass);
|
|
|
|
case 128:
|
|
|
|
return std::make_pair(0U, &AMDGPU::VReg_128RegClass);
|
|
|
|
case 256:
|
|
|
|
return std::make_pair(0U, &AMDGPU::VReg_256RegClass);
|
|
|
|
case 512:
|
|
|
|
return std::make_pair(0U, &AMDGPU::VReg_512RegClass);
|
|
|
|
}
|
2015-04-08 09:09:26 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (Constraint.size() > 1) {
|
|
|
|
const TargetRegisterClass *RC = nullptr;
|
|
|
|
if (Constraint[1] == 'v') {
|
|
|
|
RC = &AMDGPU::VGPR_32RegClass;
|
|
|
|
} else if (Constraint[1] == 's') {
|
|
|
|
RC = &AMDGPU::SGPR_32RegClass;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (RC) {
|
2015-06-23 10:05:55 +08:00
|
|
|
uint32_t Idx;
|
|
|
|
bool Failed = Constraint.substr(2).getAsInteger(10, Idx);
|
|
|
|
if (!Failed && Idx < RC->getNumRegs())
|
2015-04-08 09:09:26 +08:00
|
|
|
return std::make_pair(RC->getRegister(Idx), RC);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
|
|
|
|
}
|
2015-12-10 10:12:53 +08:00
|
|
|
|
|
|
|
/// Classify an inline-asm constraint string. The single-letter constraints
/// 's' and 'v' are reported as register-class constraints; anything else is
/// classified by the generic TargetLowering implementation.
SITargetLowering::ConstraintType
SITargetLowering::getConstraintType(StringRef Constraint) const {
  const bool IsRegClassLetter =
      Constraint.size() == 1 &&
      (Constraint[0] == 's' || Constraint[0] == 'v');
  if (IsRegClassLetter)
    return C_RegisterClass;

  return TargetLowering::getConstraintType(Constraint);
}
|
2017-07-19 00:44:56 +08:00
|
|
|
|
|
|
|
// Figure out which registers should be reserved for stack access. Only after
|
|
|
|
// the function is legalized do we know all of the non-spill stack objects or if
|
|
|
|
// calls are present.
|
|
|
|
void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
|
|
|
|
MachineRegisterInfo &MRI = MF.getRegInfo();
|
|
|
|
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
|
|
|
|
const MachineFrameInfo &MFI = MF.getFrameInfo();
|
|
|
|
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
|
|
|
|
const SIRegisterInfo *TRI = ST.getRegisterInfo();
|
|
|
|
|
|
|
|
if (Info->isEntryFunction()) {
|
|
|
|
// Callable functions have fixed registers used for stack access.
|
|
|
|
reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
|
|
|
|
}
|
|
|
|
|
|
|
|
// We have to assume the SP is needed in case there are calls in the function
|
|
|
|
// during lowering. Calls are only detected after the function is
|
|
|
|
// lowered. We're about to reserve registers, so don't bother using it if we
|
|
|
|
// aren't really going to use it.
|
|
|
|
bool NeedSP = !Info->isEntryFunction() ||
|
|
|
|
MFI.hasVarSizedObjects() ||
|
|
|
|
MFI.hasCalls();
|
|
|
|
|
|
|
|
if (NeedSP) {
|
|
|
|
unsigned ReservedStackPtrOffsetReg = TRI->reservedStackPtrOffsetReg(MF);
|
|
|
|
Info->setStackPtrOffsetReg(ReservedStackPtrOffsetReg);
|
|
|
|
|
|
|
|
assert(Info->getStackPtrOffsetReg() != Info->getFrameOffsetReg());
|
|
|
|
assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
|
|
|
|
Info->getStackPtrOffsetReg()));
|
|
|
|
MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
|
|
|
|
}
|
|
|
|
|
|
|
|
MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
|
|
|
|
MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
|
|
|
|
MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG,
|
|
|
|
Info->getScratchWaveOffsetReg());
|
|
|
|
|
|
|
|
TargetLoweringBase::finalizeLowering(MF);
|
|
|
|
}
|