//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
// from the Visual C++ cmath / math.h headers:
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
#define _USE_MATH_DEFINES
#endif

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;
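
// The static helpers below build LegalityPredicate and LegalizeMutation
// callbacks. They carry no state beyond the captured type index and are
// combined into the per-opcode rule sets constructed in the
// AMDGPULegalizerInfo constructor further down.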

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}
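
// For example, getPow2VectorType(<3 x s16>) yields <4 x s16>, and
// getPow2ScalarType(s48) yields s64.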

static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 1024) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}
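
// isMultiple32(0) matches types such as s32, s64 or <2 x s32>, whose scalar
// element size is a multiple of 32 bits; s16 or <4 x s16> are rejected, as is
// anything wider than MaxSize bits in total.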

static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getSizeInBits() == Size;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}
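
// e.g. <3 x s16> (48 bits) is a small odd vector, while <3 x s32> is not: its
// elements are already 32 bits wide and its total size is a multiple of 32.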

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}
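
// e.g. <4 x s32> (128 bits) is split into two 64-bit pieces and becomes
// <2 x s32>; <3 x s32> (96 bits) also rounds up to two pieces and becomes
// <2 x s32>.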

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}
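
// e.g. <3 x s16> (48 bits) is padded out to the next 32-bit multiple, 64 bits,
// giving <4 x s16>.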

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

// Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
// v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {
      const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
    }

    return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
  };
}
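
// e.g. s32, s96, <2 x s16> and <4 x s32> are all register types; s48 and
// <3 x s16> are not.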

static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getElementType() == Type;
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

static LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx0].getSizeInBits() <
           Query.Types[TypeIdx1].getSizeInBits();
  };
}

static LegalityPredicate greaterThan(unsigned TypeIdx0, unsigned TypeIdx1) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx0].getSizeInBits() >
           Query.Types[TypeIdx1].getSizeInBits();
  };
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT S1024 = LLT::scalar(1024);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };
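
  // Global, constant and flat pointers are 64 bits wide on AMDGPU; local,
  // region, private and the 32-bit constant address space use 32-bit
  // pointers, which is why the two lists above are kept separate.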

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  setAction({G_BRCOND, S1}, Legal); // VCC branches
  setAction({G_BRCOND, S32}, Legal); // SCC branches
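  // A branch condition produced in a VALU context stays s1 (the VCC bank),
  // while a condition computed on the SALU is widened to s32 during
  // RegBankSelect, so both forms have to be accepted here.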

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));
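  // Rules in a builder chain are tried in the order they are declared, so the
  // explicit legalFor lists above take precedence over the clamping and
  // widening fallbacks that follow them.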

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16, V2S16})
      .clampScalar(0, S16, S32)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);
  }
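  // Roughly: without 16-bit instructions an s16 add is widened to s32; with
  // 16-bit instructions it is kept as s16; and with VOP3P packed math a
  // <2 x s16> add is also left as a single packed operation.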

  // FIXME: Not really legal. Placeholder for custom lowering.
  getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    // TODO: .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();
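  // e.g. a bitcast between s32 and <2 x s16> is legal because both sides are
  // register types; other bitcasts fall through to the generic lowering.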

  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
    .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S1024)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .unsupportedFor({PrivatePtr})
    .custom();
  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
      // Use actual fsub instruction
      .legalFor({S32})
      // Must use fadd + fneg
      .lowerFor({S64, S16, V2S16})
      .scalarize(0)
      .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16())
    FMad.customFor({S32, S16});
  else
    FMad.customFor({S32});
  FMad.scalarize(0)
      .lower();

  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to infinite loop
    // in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);
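  // e.g. an s1 -> s32 or s16 -> s64 extend is legal as-is; an oddly sized
  // source such as s24 is first widened to the next power of two of at least
  // 32 bits.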

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerFor({{S32, S64}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
    .scalarize(0)
    .alwaysLegal();

  // TODO: Clamp mask to pointer sizes
  getActionDefinitionsBuilder(G_PTRMASK)
    .scalarize(0)
    .alwaysLegal();

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the output,
    // so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .lower();
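  // e.g. G_CTLZ of an s32 zero must produce 32, whereas the hardware
  // instruction produces -1; the lowering typically expands this to the
  // _ZERO_UNDEF form guarded by a compare-with-zero and a select.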

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(narrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      greaterThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
  // handle some operations by just promoting the register during
  // selection. There are also d16 loads on GFX9+ which preserve the high bits.
  auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned {
    switch (AS) {
    // FIXME: Private element size.
    case AMDGPUAS::PRIVATE_ADDRESS:
      return 32;
    // FIXME: Check subtarget
    case AMDGPUAS::LOCAL_ADDRESS:
      return ST.useDS128() ? 128 : 64;

    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written in
    // a kernel.
    case AMDGPUAS::CONSTANT_ADDRESS:
    case AMDGPUAS::GLOBAL_ADDRESS:
      return IsLoad ? 512 : 128;
    default:
      return 128;
    }
  };
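
  // Decide whether a load/store needs to be broken up: vector extloads,
  // accesses wider than the address space supports, sizes that do not map
  // onto a supported dword pattern, and under-aligned accesses all need
  // splitting.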
  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    unsigned Align = Query.MMODescrs[0].AlignInBits;

    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, Align);

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    if (Align < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
    }

    return false;
  };
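
  // Round an oddly sized (non-power-of-2) load result up to the next power
  // of 2, but only when the widened access still fits the address space limit
  // and the alignment covers the rounded size.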
  const auto shouldWidenLoadResult = [=](const LegalityQuery &Query) -> bool {
    unsigned Size = Query.Types[0].getSizeInBits();
    if (isPowerOf2_32(Size))
      return false;

    if (Size == 96 && ST.hasDwordx3LoadStores())
      return false;

    unsigned AddrSpace = Query.Types[1].getAddressSpace();
    if (Size >= maxSizeForAddrSpace(AddrSpace, true))
      return false;

    unsigned Align = Query.MMODescrs[0].AlignInBits;
    unsigned RoundedSize = NextPowerOf2(Size);
    return (Align >= RoundedSize);
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Whitelist the common cases.
    // TODO: Loads to s16 on gfx9
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S128, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, FlatPtr, 32, GlobalAlign32},
                                      {S32, FlatPtr, 16, GlobalAlign16},
                                      {S32, FlatPtr, 8, GlobalAlign8},
                                      {V2S16, FlatPtr, 32, GlobalAlign32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {S128, ConstantPtr, 128, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions
        .customIf(typeIs(1, Constant32Ptr))
        // Widen suitably aligned loads by loading extra elements.
        .moreElementsIf([=](const LegalityQuery &Query) {
            const LLT Ty = Query.Types[0];
            return Op == G_LOAD && Ty.isVector() &&
                   shouldWidenLoadResult(Query);
          }, moreElementsToNextPow2(0))
        .widenScalarIf([=](const LegalityQuery &Query) {
            const LLT Ty = Query.Types[0];
            return Op == G_LOAD && !Ty.isVector() &&
                   shouldWidenLoadResult(Query);
          }, widenScalarOrEltToNextPow2(0))
        .narrowScalarIf(
          [=](const LegalityQuery &Query) -> bool {
            return !Query.Types[0].isVector() &&
                   needToSplitMemOp(Query, Op == G_LOAD);
          },
          [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
            const LLT DstTy = Query.Types[0];
            const LLT PtrTy = Query.Types[1];

            const unsigned DstSize = DstTy.getSizeInBits();
            unsigned MemSize = Query.MMODescrs[0].SizeInBits;

            // Split extloads.
            if (DstSize > MemSize)
              return std::make_pair(0, LLT::scalar(MemSize));

            if (!isPowerOf2_32(DstSize)) {
              // We're probably decomposing an odd sized store. Try to split
              // to the widest type. TODO: Account for alignment. As-is it
              // should be OK, since the new parts will be further legalized.
              unsigned FloorSize = PowerOf2Floor(DstSize);
              return std::make_pair(0, LLT::scalar(FloorSize));
            }

            if (DstSize > 32 && (DstSize % 32 != 0)) {
              // FIXME: Need a way to specify non-extload of larger size if
              // suitably aligned.
              return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
            }

            unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
                                                   Op == G_LOAD);
            if (MemSize > MaxSize)
              return std::make_pair(0, LLT::scalar(MaxSize));

            unsigned Align = Query.MMODescrs[0].AlignInBits;
            return std::make_pair(0, LLT::scalar(Align));
          })
        .fewerElementsIf(
          [=](const LegalityQuery &Query) -> bool {
            return Query.Types[0].isVector() &&
                   needToSplitMemOp(Query, Op == G_LOAD);
          },
          [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
            const LLT DstTy = Query.Types[0];
            const LLT PtrTy = Query.Types[1];

            LLT EltTy = DstTy.getElementType();
            unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
                                                   Op == G_LOAD);

            // FIXME: Handle widened to power of 2 results better. This ends
            // up scalarizing.
            // FIXME: 3 element stores scalarized on SI

            // Split if it's too large for the address space.
            if (Query.MMODescrs[0].SizeInBits > MaxSize) {
              unsigned NumElts = DstTy.getNumElements();
              unsigned EltSize = EltTy.getSizeInBits();

              if (MaxSize % EltSize == 0) {
                return std::make_pair(
                  0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
              }

              unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

              // FIXME: Refine when odd breakdowns handled
              // The scalars will need to be re-legalized.
              if (NumPieces == 1 || NumPieces >= NumElts ||
                  NumElts % NumPieces != 0)
                return std::make_pair(0, EltTy);

              return std::make_pair(0,
                                    LLT::vector(NumElts / NumPieces, EltTy));
            }

            // FIXME: We could probably handle weird extending loads better.
            unsigned MemSize = Query.MMODescrs[0].SizeInBits;
            if (DstTy.getSizeInBits() > MemSize)
              return std::make_pair(0, EltTy);

            unsigned EltSize = EltTy.getSizeInBits();
            unsigned DstSize = DstTy.getSizeInBits();
            if (!isPowerOf2_32(DstSize)) {
              // We're probably decomposing an odd sized store. Try to split
              // to the widest type. TODO: Account for alignment. As-is it
              // should be OK, since the new parts will be further legalized.
              unsigned FloorSize = PowerOf2Floor(DstSize);
              return std::make_pair(
                0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
            }

            // Need to split because of alignment.
            unsigned Align = Query.MMODescrs[0].AlignInBits;
            if (EltSize > Align &&
                (EltSize / Align < DstTy.getNumElements())) {
              return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
            }

            // May need relegalization for the scalars.
            return std::make_pair(0, EltTy);
          })
        .minScalar(0, S32);

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    // TODO: Need a bitcast lower option?
    Actions
        .legalIf([=](const LegalityQuery &Query) {
          const LLT Ty0 = Query.Types[0];
          unsigned Size = Ty0.getSizeInBits();
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;
          unsigned Align = Query.MMODescrs[0].AlignInBits;

          // FIXME: Widening store from alignment not valid.
          if (MemSize < Size)
            MemSize = std::max(MemSize, Align);

          // No extending vector loads.
          if (Size > MemSize && Ty0.isVector())
            return false;

          switch (MemSize) {
          case 8:
          case 16:
            return Size == 32;
          case 32:
          case 64:
          case 128:
            return true;
          case 96:
            return ST.hasDwordx3LoadStores();
          case 256:
          case 512:
            return true;
          default:
            return false;
          }
        })
        .widenScalarToNextPow2(0)
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
  }
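
  // Sign- and zero-extending loads: 8- and 16-bit memory accesses extending
  // into a 32-bit result are handled directly; other result sizes are clamped
  // to s32 and the remaining cases are lowered.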
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                                                  {S32, GlobalPtr, 16, 2 * 8},
                                                  {S32, LocalPtr, 8, 8},
                                                  {S32, LocalPtr, 16, 16},
                                                  {S32, PrivatePtr, 8, 8},
                                                  {S32, PrivatePtr, 16, 16},
                                                  {S32, ConstantPtr, 8, 8},
                                                  {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
    .legalFor({{S32, LocalPtr}});

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
  // demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
    .clampScalar(0, S16, S64)
    .scalarize(1)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);
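
  // Vector element access: the profiles checked below are handled by custom
  // legalization (see legalizeExtractVectorElt/legalizeInsertVectorElt);
  // everything else is clamped to 32/64-bit elements with a 32-bit index.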
  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 1024 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }
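
  // A dynamic extract must produce exactly the vector's element type;
  // mismatched result/element types are not supported.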
  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
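
    // Vector elements narrower than 8 bits, wider than 512 bits, or with a
    // non-power-of-2 size cannot be merged/unmerged directly and force
    // scalarization below.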
    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S1024);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
        const LLT &BigTy = Query.Types[BigTyIdx];
        const LLT &LitTy = Query.Types[LitTyIdx];

        if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
          return false;
        if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
          return false;

        return BigTy.getSizeInBits() % 16 == 0 &&
               LitTy.getSizeInBits() % 16 == 0 &&
               BigTy.getSizeInBits() <= 1024;
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM,
      G_FSHL
    }).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  computeTables();
  verify(*ST.getInstrInfo());
}
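
// Dispatch operations marked .custom() in the rules above to their dedicated
// legalization helpers. Returns false for opcodes without a custom handler.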
bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, B);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
    return legalizeUDIV_UREM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
    return legalizeSDIV_SREM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  case TargetOpcode::G_FLOG:
    return legalizeFlog(MI, B, 1.0f / numbers::log2ef);
  case TargetOpcode::G_FLOG10:
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
    return legalizeFFloor(MI, MRI, B);
  case TargetOpcode::G_BUILD_VECTOR:
    return legalizeBuildVector(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}
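
// Compute the aperture (the high 32 bits of the flat address) for the local
// or private segment. Subtargets with aperture registers read it with
// s_getreg; otherwise it is loaded from the queue pointer.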
Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4, commonAlignment(Align(64), StructOffset));

  Register LoadAddr;

  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}
|
|
|
|
|
|
|
|
bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
|
|
|
|
MachineInstr &MI, MachineRegisterInfo &MRI,
|
AMDGPU/GlobalISel: Rename MIRBuilder to B. NFC
Reviewers: arsenm
Reviewed By: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, dstuttard, tpr, t-tye, hiraditya, Petar.Avramovic, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D67374
llvm-svn: 371467
2019-09-10 07:06:13 +08:00
|
|
|
MachineIRBuilder &B) const {
|
|
|
|
MachineFunction &MF = B.getMF();
|
2019-02-08 10:40:47 +08:00
|
|
|
|
AMDGPU/GlobalISel: Rename MIRBuilder to B. NFC
Reviewers: arsenm
Reviewed By: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, dstuttard, tpr, t-tye, hiraditya, Petar.Avramovic, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D67374
llvm-svn: 371467
2019-09-10 07:06:13 +08:00
|
|
|
B.setInstr(MI);
|
2019-02-08 10:40:47 +08:00
|
|
|
|
2019-08-28 08:58:24 +08:00
|
|
|
const LLT S32 = LLT::scalar(32);
|
2019-06-24 23:50:29 +08:00
|
|
|
Register Dst = MI.getOperand(0).getReg();
|
|
|
|
Register Src = MI.getOperand(1).getReg();
|
2019-02-08 10:40:47 +08:00
|
|
|
|
|
|
|
LLT DstTy = MRI.getType(Dst);
|
|
|
|
LLT SrcTy = MRI.getType(Src);
|
|
|
|
unsigned DestAS = DstTy.getAddressSpace();
|
|
|
|
unsigned SrcAS = SrcTy.getAddressSpace();
|
|
|
|
|
|
|
|
// TODO: Avoid reloading from the queue ptr for each cast, or at least each
|
|
|
|
// vector element.
|
|
|
|
assert(!DstTy.isVector());
|
|
|
|
|
|
|
|
const AMDGPUTargetMachine &TM
|
|
|
|
= static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
|
|
|
|
|
|
|
|
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
|
|
|
|
if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
|
AMDGPU/GlobalISel: Rename MIRBuilder to B. NFC
Reviewers: arsenm
Reviewed By: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, dstuttard, tpr, t-tye, hiraditya, Petar.Avramovic, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D67374
llvm-svn: 371467
2019-09-10 07:06:13 +08:00
|
|
|
MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
|
2019-02-08 10:40:47 +08:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2019-08-28 08:58:24 +08:00
|
|
|
if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
|
|
|
|
// Truncate.
|
AMDGPU/GlobalISel: Rename MIRBuilder to B. NFC
Reviewers: arsenm
Reviewed By: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, dstuttard, tpr, t-tye, hiraditya, Petar.Avramovic, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D67374
llvm-svn: 371467
2019-09-10 07:06:13 +08:00
|
|
|
B.buildExtract(Dst, Src, 0);
|
2019-08-28 08:58:24 +08:00
|
|
|
MI.eraseFromParent();
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
|
|
|
|
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
|
|
|
|
uint32_t AddrHiVal = Info->get32BitAddressHighBits();
|
|
|
|
|
|
|
|
// FIXME: This is a bit ugly due to creating a merge of 2 pointers into
|
|
|
|
// another. Merge operands are required to be the same type, but creating an
|
|
|
|
// extra ptrtoint would be kind of pointless.
|
AMDGPU/GlobalISel: Rename MIRBuilder to B. NFC
Reviewers: arsenm
Reviewed By: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, dstuttard, tpr, t-tye, hiraditya, Petar.Avramovic, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D67374
llvm-svn: 371467
2019-09-10 07:06:13 +08:00
|
|
|
auto HighAddr = B.buildConstant(
|
2019-08-28 08:58:24 +08:00
|
|
|
LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
|
2020-02-08 00:38:01 +08:00
|
|
|
B.buildMerge(Dst, {Src, HighAddr});
|
2019-08-28 08:58:24 +08:00
|
|
|
MI.eraseFromParent();
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2019-02-08 10:40:47 +08:00
|
|
|
if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
|
|
|
|
assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
|
|
|
|
DestAS == AMDGPUAS::PRIVATE_ADDRESS);
|
|
|
|
unsigned NullVal = TM.getNullPointerValue(DestAS);
|
|
|
|
|
AMDGPU/GlobalISel: Rename MIRBuilder to B. NFC
Reviewers: arsenm
Reviewed By: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, dstuttard, tpr, t-tye, hiraditya, Petar.Avramovic, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D67374
llvm-svn: 371467
2019-09-10 07:06:13 +08:00
|
|
|
auto SegmentNull = B.buildConstant(DstTy, NullVal);
|
|
|
|
auto FlatNull = B.buildConstant(SrcTy, 0);
|
2019-02-08 10:40:47 +08:00
|
|
|
|
|
|
|
// Extract low 32-bits of the pointer.
|
[GlobalISel] Tidy up unnecessary calls to createGenericVirtualRegister
Summary:
As a side effect some redundant copies of constant values are removed by
CSEMIRBuilder.
Reviewers: aemerson, arsenm, dsanders, aditya_nandakumar
Subscribers: sdardis, jvesely, wdng, nhaehnle, rovka, hiraditya, jrtc27, atanasyan, volkan, Petar.Avramovic, kerbowa, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D73789
2020-01-31 20:40:31 +08:00
|
|
|
auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
|
2019-02-08 10:40:47 +08:00
|
|
|
|
[GlobalISel] Tidy up unnecessary calls to createGenericVirtualRegister
Summary:
As a side effect some redundant copies of constant values are removed by
CSEMIRBuilder.
Reviewers: aemerson, arsenm, dsanders, aditya_nandakumar
Subscribers: sdardis, jvesely, wdng, nhaehnle, rovka, hiraditya, jrtc27, atanasyan, volkan, Petar.Avramovic, kerbowa, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D73789
2020-01-31 20:40:31 +08:00
|
|
|
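// A flat null pointer must cast to the segment's null value, so compare
// against flat null and select the segment null constant in that case.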
auto CmpRes =
|
|
|
|
B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
|
AMDGPU/GlobalISel: Rename MIRBuilder to B. NFC
Reviewers: arsenm
Reviewed By: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, dstuttard, tpr, t-tye, hiraditya, Petar.Avramovic, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D67374
llvm-svn: 371467
2019-09-10 07:06:13 +08:00
|
|
|
B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
|
2019-02-08 10:40:47 +08:00
|
|
|
|
|
|
|
MI.eraseFromParent();
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2019-08-28 08:58:24 +08:00
|
|
|
if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
if (!ST.hasFlatAddressSpace())
|
|
|
|
return false;
|
2019-02-08 10:40:47 +08:00
|
|
|
|
2019-04-15 13:04:20 +08:00
|
|
|
auto SegmentNull =
|
AMDGPU/GlobalISel: Rename MIRBuilder to B. NFC
Reviewers: arsenm
Reviewed By: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, dstuttard, tpr, t-tye, hiraditya, Petar.Avramovic, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D67374
llvm-svn: 371467
2019-09-10 07:06:13 +08:00
|
|
|
B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
|
2019-04-15 13:04:20 +08:00
|
|
|
auto FlatNull =
|
AMDGPU/GlobalISel: Rename MIRBuilder to B. NFC
Reviewers: arsenm
Reviewed By: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, dstuttard, tpr, t-tye, hiraditya, Petar.Avramovic, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D67374
llvm-svn: 371467
2019-09-10 07:06:13 +08:00
|
|
|
B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
|
2019-02-08 10:40:47 +08:00
|
|
|
|
2019-10-04 16:35:38 +08:00
|
|
|
Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
|
2019-09-05 10:20:29 +08:00
|
|
|
if (!ApertureReg.isValid())
|
|
|
|
return false;
|
2019-02-08 10:40:47 +08:00
|
|
|
|
[GlobalISel] Tidy up unnecessary calls to createGenericVirtualRegister
Summary:
As a side effect some redundant copies of constant values are removed by
CSEMIRBuilder.
Reviewers: aemerson, arsenm, dsanders, aditya_nandakumar
Subscribers: sdardis, jvesely, wdng, nhaehnle, rovka, hiraditya, jrtc27, atanasyan, volkan, Petar.Avramovic, kerbowa, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D73789
2020-01-31 20:40:31 +08:00
|
|
|
auto CmpRes =
|
|
|
|
B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
|
2019-02-08 10:40:47 +08:00
|
|
|
|
|
|
|
// Coerce the type of the low half of the result so we can use merge_values.
|
[GlobalISel] Tidy up unnecessary calls to createGenericVirtualRegister
Summary:
As a side effect some redundant copies of constant values are removed by
CSEMIRBuilder.
Reviewers: aemerson, arsenm, dsanders, aditya_nandakumar
Subscribers: sdardis, jvesely, wdng, nhaehnle, rovka, hiraditya, jrtc27, atanasyan, volkan, Petar.Avramovic, kerbowa, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D73789
2020-01-31 20:40:31 +08:00
|
|
|
Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
|
2019-02-08 10:40:47 +08:00
|
|
|
|
|
|
|
// TODO: Should we allow mismatched types but matching sizes in merges to
|
|
|
|
// avoid the ptrtoint?
|
[GlobalISel] Tidy up unnecessary calls to createGenericVirtualRegister
Summary:
As a side effect some redundant copies of constant values are removed by
CSEMIRBuilder.
Reviewers: aemerson, arsenm, dsanders, aditya_nandakumar
Subscribers: sdardis, jvesely, wdng, nhaehnle, rovka, hiraditya, jrtc27, atanasyan, volkan, Petar.Avramovic, kerbowa, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D73789
2020-01-31 20:40:31 +08:00
|
|
|
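// The aperture register supplies the high 32 bits of the flat address; the
// low 32 bits come from the segment pointer itself.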
auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
|
|
|
|
B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
|
2019-02-08 10:40:47 +08:00
|
|
|
|
|
|
|
MI.eraseFromParent();
|
|
|
|
return true;
|
|
|
|
}
|
2019-05-17 20:19:57 +08:00
|
|
|
|
|
|
|
bool AMDGPULegalizerInfo::legalizeFrint(
|
|
|
|
MachineInstr &MI, MachineRegisterInfo &MRI,
|
AMDGPU/GlobalISel: Rename MIRBuilder to B. NFC
Reviewers: arsenm
Reviewed By: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, dstuttard, tpr, t-tye, hiraditya, Petar.Avramovic, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D67374
llvm-svn: 371467
2019-09-10 07:06:13 +08:00
|
|
|
MachineIRBuilder &B) const {
|
|
|
|
B.setInstr(MI);
|
2019-05-17 20:19:57 +08:00
|
|
|
|
2019-06-25 00:16:12 +08:00
|
|
|
Register Src = MI.getOperand(1).getReg();
|
2019-05-17 20:19:57 +08:00
|
|
|
LLT Ty = MRI.getType(Src);
|
|
|
|
assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
|
|
|
|
|
|
|
|
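// C1 is 2^52: adding and then subtracting it (with the source's sign copied
// onto it) rounds the value to an integer. C2 is 2^52 - 0.5; anything with a
// larger magnitude is already an integer and is passed through unchanged.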
APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
|
|
|
|
APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
|
|
|
|
|
AMDGPU/GlobalISel: Rename MIRBuilder to B. NFC
Reviewers: arsenm
Reviewed By: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, dstuttard, tpr, t-tye, hiraditya, Petar.Avramovic, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D67374
llvm-svn: 371467
2019-09-10 07:06:13 +08:00
|
|
|
auto C1 = B.buildFConstant(Ty, C1Val);
|
|
|
|
auto CopySign = B.buildFCopysign(Ty, C1, Src);
|
2019-05-17 20:19:57 +08:00
|
|
|
|
|
|
|
// TODO: Should this propagate fast-math-flags?
|
AMDGPU/GlobalISel: Rename MIRBuilder to B. NFC
Reviewers: arsenm
Reviewed By: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, dstuttard, tpr, t-tye, hiraditya, Petar.Avramovic, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D67374
llvm-svn: 371467
2019-09-10 07:06:13 +08:00
|
|
|
auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
|
|
|
|
auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
|
2019-05-17 20:19:57 +08:00
|
|
|
|
AMDGPU/GlobalISel: Rename MIRBuilder to B. NFC
Reviewers: arsenm
Reviewed By: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, dstuttard, tpr, t-tye, hiraditya, Petar.Avramovic, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D67374
llvm-svn: 371467
2019-09-10 07:06:13 +08:00
|
|
|
auto C2 = B.buildFConstant(Ty, C2Val);
|
|
|
|
auto Fabs = B.buildFAbs(Ty, Src);
|
2019-05-17 20:19:57 +08:00
|
|
|
|
AMDGPU/GlobalISel: Rename MIRBuilder to B. NFC
Reviewers: arsenm
Reviewed By: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, dstuttard, tpr, t-tye, hiraditya, Petar.Avramovic, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D67374
llvm-svn: 371467
2019-09-10 07:06:13 +08:00
|
|
|
auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
|
|
|
|
B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
|
2019-05-17 20:19:57 +08:00
|
|
|
return true;
|
2019-05-17 20:20:05 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
bool AMDGPULegalizerInfo::legalizeFceil(
|
|
|
|
MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
|
|
MachineIRBuilder &B) const {
|
|
|
|
B.setInstr(MI);
|
|
|
|
|
2019-05-17 20:59:27 +08:00
|
|
|
const LLT S1 = LLT::scalar(1);
|
|
|
|
const LLT S64 = LLT::scalar(64);
|
|
|
|
|
2019-06-25 00:16:12 +08:00
|
|
|
Register Src = MI.getOperand(1).getReg();
|
2019-05-17 20:59:27 +08:00
|
|
|
assert(MRI.getType(Src) == S64);
|
2019-05-17 20:20:05 +08:00
|
|
|
|
|
|
|
// result = trunc(src)
|
|
|
|
// if (src > 0.0 && src != result)
|
|
|
|
// result += 1.0
|
|
|
|
|
2020-01-31 21:52:33 +08:00
|
|
|
auto Trunc = B.buildIntrinsicTrunc(S64, Src);
|
2019-05-17 20:20:05 +08:00
|
|
|
|
|
|
|
const auto Zero = B.buildFConstant(S64, 0.0);
|
|
|
|
const auto One = B.buildFConstant(S64, 1.0);
|
|
|
|
auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
|
|
|
|
auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
|
|
|
|
auto And = B.buildAnd(S1, Lt0, NeTrunc);
|
|
|
|
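// Add is 1.0 only when the source is positive and not already an integer,
// 0.0 otherwise.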
auto Add = B.buildSelect(S64, And, One, Zero);
|
|
|
|
|
|
|
|
// TODO: Should this propagate fast-math-flags?
|
|
|
|
B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
|
|
|
|
return true;
|
2019-05-17 20:19:57 +08:00
|
|
|
}
|
2019-05-17 20:20:01 +08:00
|
|
|
|
|
|
|
static MachineInstrBuilder extractF64Exponent(unsigned Hi,
|
|
|
|
MachineIRBuilder &B) {
|
|
|
|
const unsigned FractBits = 52;
|
|
|
|
const unsigned ExpBits = 11;
|
|
|
|
LLT S32 = LLT::scalar(32);
|
|
|
|
|
|
|
|
auto Const0 = B.buildConstant(S32, FractBits - 32);
|
|
|
|
auto Const1 = B.buildConstant(S32, ExpBits);
|
|
|
|
|
|
|
|
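// Use ubfe to extract the 11 exponent bits starting at bit 20 of the high
// dword, then subtract the bias (1023) to get the unbiased exponent.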
auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
|
|
|
|
.addUse(Hi)
.addUse(Const0.getReg(0))
|
|
|
|
.addUse(Const1.getReg(0));
|
|
|
|
|
|
|
|
return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
|
|
|
|
}
|
|
|
|
|
|
|
|
bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
|
|
|
|
MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
|
|
MachineIRBuilder &B) const {
|
|
|
|
B.setInstr(MI);
|
|
|
|
|
2019-05-17 20:59:27 +08:00
|
|
|
const LLT S1 = LLT::scalar(1);
|
|
|
|
const LLT S32 = LLT::scalar(32);
|
|
|
|
const LLT S64 = LLT::scalar(64);
|
2019-05-17 20:20:01 +08:00
|
|
|
|
2019-06-25 00:16:12 +08:00
|
|
|
Register Src = MI.getOperand(1).getReg();
|
2019-05-17 20:59:27 +08:00
|
|
|
assert(MRI.getType(Src) == S64);
|
2019-05-17 20:20:01 +08:00
|
|
|
|
|
|
|
// TODO: Should this use extract since the low half is unused?
|
|
|
|
auto Unmerge = B.buildUnmerge({S32, S32}, Src);
|
2019-06-25 00:16:12 +08:00
|
|
|
Register Hi = Unmerge.getReg(1);
|
2019-05-17 20:20:01 +08:00
|
|
|
|
|
|
|
// Extract the upper half, since this is where we will find the sign and
|
|
|
|
// exponent.
|
|
|
|
auto Exp = extractF64Exponent(Hi, B);
|
|
|
|
|
|
|
|
const unsigned FractBits = 52;
|
|
|
|
|
|
|
|
// Extract the sign bit.
|
|
|
|
const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
|
|
|
|
auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
|
|
|
|
|
|
|
|
const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
|
|
|
|
|
|
|
|
const auto Zero32 = B.buildConstant(S32, 0);
|
|
|
|
|
|
|
|
// Extend back to 64-bits.
|
2020-02-08 00:38:01 +08:00
|
|
|
auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
|
2019-05-17 20:20:01 +08:00
|
|
|
|
|
|
|
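// FractMask >> Exp leaves ones exactly in the fractional bit positions for
// this exponent; ANDing with its complement clears them, truncating toward
// zero.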
auto Shr = B.buildAShr(S64, FractMask, Exp);
|
|
|
|
auto Not = B.buildNot(S64, Shr);
|
|
|
|
auto Tmp0 = B.buildAnd(S64, Src, Not);
|
|
|
|
auto FiftyOne = B.buildConstant(S32, FractBits - 1);
|
|
|
|
|
|
|
|
auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
|
|
|
|
auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
|
|
|
|
|
|
|
|
auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
|
|
|
|
B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
|
|
|
|
return true;
|
|
|
|
}
|
2019-05-18 07:05:18 +08:00
|
|
|
|
|
|
|
bool AMDGPULegalizerInfo::legalizeITOFP(
|
|
|
|
MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
|
|
MachineIRBuilder &B, bool Signed) const {
|
|
|
|
B.setInstr(MI);
|
|
|
|
|
2019-06-25 00:16:12 +08:00
|
|
|
Register Dst = MI.getOperand(0).getReg();
|
|
|
|
Register Src = MI.getOperand(1).getReg();
|
2019-05-18 07:05:18 +08:00
|
|
|
|
|
|
|
const LLT S64 = LLT::scalar(64);
|
|
|
|
const LLT S32 = LLT::scalar(32);
|
|
|
|
|
|
|
|
assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
|
|
|
|
|
|
|
|
auto Unmerge = B.buildUnmerge({S32, S32}, Src);
|
|
|
|
|
|
|
|
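// Convert the two 32-bit halves separately: the high half uses a signed or
// unsigned conversion as requested and is scaled by 2^32 with ldexp, then the
// unsigned low half is added in.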
auto CvtHi = Signed ?
|
|
|
|
B.buildSITOFP(S64, Unmerge.getReg(1)) :
|
|
|
|
B.buildUITOFP(S64, Unmerge.getReg(1));
|
|
|
|
|
|
|
|
auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
|
|
|
|
|
|
|
|
auto ThirtyTwo = B.buildConstant(S32, 32);
|
|
|
|
auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
|
|
|
|
.addUse(CvtHi.getReg(0))
|
|
|
|
.addUse(ThirtyTwo.getReg(0));
|
|
|
|
|
|
|
|
// TODO: Should this propagate fast-math-flags?
|
|
|
|
B.buildFAdd(Dst, LdExp, CvtLo);
|
|
|
|
MI.eraseFromParent();
|
|
|
|
return true;
|
|
|
|
}
|
2019-07-02 02:40:23 +08:00
|
|
|
|
2020-01-05 05:40:45 +08:00
|
|
|
// TODO: Copied from DAG implementation. Verify logic and document how this
|
|
|
|
// actually works.
|
|
|
|
bool AMDGPULegalizerInfo::legalizeFPTOI(
|
|
|
|
MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
|
|
MachineIRBuilder &B, bool Signed) const {
|
|
|
|
B.setInstr(MI);
|
|
|
|
|
|
|
|
Register Dst = MI.getOperand(0).getReg();
|
|
|
|
Register Src = MI.getOperand(1).getReg();
|
|
|
|
|
|
|
|
const LLT S64 = LLT::scalar(64);
|
|
|
|
const LLT S32 = LLT::scalar(32);
|
|
|
|
|
|
|
|
assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
|
|
|
|
|
|
|
|
unsigned Flags = MI.getFlags();
|
|
|
|
|
|
|
|
auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
|
|
|
|
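// K0 is 2^-32 and K1 is -2^32 as doubles. Multiplying by K0 and flooring
// yields the high 32 bits; the fma with K1 recovers the remaining low part.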
auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
|
|
|
|
auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
|
|
|
|
|
|
|
|
auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
|
|
|
|
auto FloorMul = B.buildFFloor(S64, Mul, Flags);
|
|
|
|
auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
|
|
|
|
|
|
|
|
auto Hi = Signed ?
|
|
|
|
B.buildFPTOSI(S32, FloorMul) :
|
|
|
|
B.buildFPTOUI(S32, FloorMul);
|
|
|
|
auto Lo = B.buildFPTOUI(S32, Fma);
|
|
|
|
|
2020-02-08 00:38:01 +08:00
|
|
|
B.buildMerge(Dst, { Lo, Hi });
|
2020-01-05 05:40:45 +08:00
|
|
|
MI.eraseFromParent();
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2019-07-11 00:31:19 +08:00
|
|
|
bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
|
|
|
|
MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
|
|
MachineIRBuilder &B) const {
|
|
|
|
MachineFunction &MF = B.getMF();
|
|
|
|
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
|
|
|
|
|
|
|
|
const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
|
|
|
|
MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
|
|
|
|
|
|
|
|
// With ieee_mode disabled, the instructions have the correct behavior
|
|
|
|
// already for G_FMINNUM/G_FMAXNUM
|
|
|
|
if (!MFI->getMode().IEEE)
|
|
|
|
return !IsIEEEOp;
|
|
|
|
|
|
|
|
if (IsIEEEOp)
|
|
|
|
return true;
|
|
|
|
|
|
|
|
MachineIRBuilder HelperBuilder(MI);
|
|
|
|
GISelObserverWrapper DummyObserver;
|
|
|
|
LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
|
2019-09-10 07:30:11 +08:00
|
|
|
HelperBuilder.setInstr(MI);
|
2019-07-11 00:31:19 +08:00
|
|
|
return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
|
|
|
|
}
|
|
|
|
|
2019-07-16 03:40:59 +08:00
|
|
|
bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
|
|
|
|
MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
|
|
MachineIRBuilder &B) const {
|
|
|
|
// TODO: Should move some of this into LegalizerHelper.
|
|
|
|
|
|
|
|
// TODO: Promote dynamic indexing of s16 to s32
|
2020-02-07 05:52:04 +08:00
|
|
|
|
|
|
|
// FIXME: Artifact combiner probably should have replaced the truncated
|
|
|
|
// constant before this, so we shouldn't need
|
|
|
|
// getConstantVRegValWithLookThrough.
|
|
|
|
Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
|
|
|
|
MI.getOperand(2).getReg(), MRI);
|
2019-07-16 03:40:59 +08:00
|
|
|
if (!IdxVal) // Dynamic case will be selected to register indexing.
|
|
|
|
return true;
|
|
|
|
|
|
|
|
Register Dst = MI.getOperand(0).getReg();
|
|
|
|
Register Vec = MI.getOperand(1).getReg();
|
|
|
|
|
|
|
|
LLT VecTy = MRI.getType(Vec);
|
|
|
|
LLT EltTy = VecTy.getElementType();
|
|
|
|
assert(EltTy == MRI.getType(Dst));
|
|
|
|
|
|
|
|
B.setInstr(MI);
|
|
|
|
|
2020-02-07 05:52:04 +08:00
|
|
|
if (IdxVal->Value < VecTy.getNumElements())
|
|
|
|
B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
|
2019-07-16 03:40:59 +08:00
|
|
|
else
|
|
|
|
B.buildUndef(Dst);
|
|
|
|
|
|
|
|
MI.eraseFromParent();
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2019-07-16 03:43:04 +08:00
|
|
|
bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
|
|
|
|
MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
|
|
MachineIRBuilder &B) const {
|
|
|
|
// TODO: Should move some of this into LegalizerHelper.
|
|
|
|
|
|
|
|
// TODO: Promote dynamic indexing of s16 to s32
|
2020-02-07 05:52:04 +08:00
|
|
|
|
|
|
|
// FIXME: Artifact combiner probably should have replaced the truncated
|
|
|
|
// constant before this, so we shouldn't need
|
|
|
|
// getConstantVRegValWithLookThrough.
|
|
|
|
Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
|
|
|
|
MI.getOperand(3).getReg(), MRI);
|
2019-07-16 03:43:04 +08:00
|
|
|
if (!IdxVal) // Dynamic case will be selected to register indexing.
|
|
|
|
return true;
|
|
|
|
|
|
|
|
Register Dst = MI.getOperand(0).getReg();
|
|
|
|
Register Vec = MI.getOperand(1).getReg();
|
|
|
|
Register Ins = MI.getOperand(2).getReg();
|
|
|
|
|
|
|
|
LLT VecTy = MRI.getType(Vec);
|
|
|
|
LLT EltTy = VecTy.getElementType();
|
|
|
|
assert(EltTy == MRI.getType(Ins));
|
|
|
|
|
|
|
|
B.setInstr(MI);
|
|
|
|
|
2020-02-07 05:52:04 +08:00
|
|
|
if (IdxVal->Value < VecTy.getNumElements())
|
|
|
|
B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
|
2019-07-16 03:43:04 +08:00
|
|
|
else
|
|
|
|
B.buildUndef(Dst);
|
|
|
|
|
|
|
|
MI.eraseFromParent();
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2020-01-02 04:51:46 +08:00
|
|
|
bool AMDGPULegalizerInfo::legalizeShuffleVector(
|
|
|
|
MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
|
|
MachineIRBuilder &B) const {
|
|
|
|
const LLT V2S16 = LLT::vector(2, 16);
|
|
|
|
|
|
|
|
Register Dst = MI.getOperand(0).getReg();
|
|
|
|
Register Src0 = MI.getOperand(1).getReg();
|
|
|
|
LLT DstTy = MRI.getType(Dst);
|
|
|
|
LLT SrcTy = MRI.getType(Src0);
|
|
|
|
|
|
|
|
if (SrcTy == V2S16 && DstTy == V2S16 &&
|
2020-02-16 12:56:56 +08:00
|
|
|
AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
|
2020-01-02 04:51:46 +08:00
|
|
|
return true;
|
|
|
|
|
|
|
|
MachineIRBuilder HelperBuilder(MI);
|
|
|
|
GISelObserverWrapper DummyObserver;
|
|
|
|
LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
|
|
|
|
HelperBuilder.setInstr(MI);
|
|
|
|
return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
|
|
|
|
}
|
|
|
|
|
2019-08-30 04:06:48 +08:00
|
|
|
bool AMDGPULegalizerInfo::legalizeSinCos(
|
|
|
|
MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
|
|
MachineIRBuilder &B) const {
|
|
|
|
B.setInstr(MI);
|
|
|
|
|
|
|
|
Register DstReg = MI.getOperand(0).getReg();
|
|
|
|
Register SrcReg = MI.getOperand(1).getReg();
|
|
|
|
LLT Ty = MRI.getType(DstReg);
|
|
|
|
unsigned Flags = MI.getFlags();
|
|
|
|
|
|
|
|
Register TrigVal;
|
|
|
|
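// The hardware sin/cos intrinsics take an input pre-scaled by 1/(2*pi);
// subtargets with the reduced trig range also need the scaled value passed
// through fract first.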
auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
|
|
|
|
if (ST.hasTrigReducedRange()) {
|
|
|
|
auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
|
|
|
|
TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
|
|
|
|
.addUse(MulVal.getReg(0))
|
|
|
|
.setMIFlags(Flags).getReg(0);
|
|
|
|
} else
|
|
|
|
TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
|
|
|
|
|
|
|
|
Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
|
|
|
|
Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
|
|
|
|
B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
|
|
|
|
.addUse(TrigVal)
|
|
|
|
.setMIFlags(Flags);
|
|
|
|
MI.eraseFromParent();
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2019-10-01 09:06:43 +08:00
|
|
|
bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
|
|
|
|
Register DstReg, LLT PtrTy,
|
|
|
|
MachineIRBuilder &B, const GlobalValue *GV,
|
|
|
|
unsigned Offset, unsigned GAFlags) const {
|
|
|
|
// In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
|
|
|
|
// to the following code sequence:
|
|
|
|
//
|
|
|
|
// For constant address space:
|
|
|
|
// s_getpc_b64 s[0:1]
|
|
|
|
// s_add_u32 s0, s0, $symbol
|
|
|
|
// s_addc_u32 s1, s1, 0
|
|
|
|
//
|
|
|
|
// s_getpc_b64 returns the address of the s_add_u32 instruction and then
|
|
|
|
// a fixup or relocation is emitted to replace $symbol with a literal
|
|
|
|
// constant, which is a pc-relative offset from the encoding of the $symbol
|
|
|
|
// operand to the global variable.
|
|
|
|
//
|
|
|
|
// For global address space:
|
|
|
|
// s_getpc_b64 s[0:1]
|
|
|
|
// s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
|
|
|
|
// s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
|
|
|
|
//
|
|
|
|
// s_getpc_b64 returns the address of the s_add_u32 instruction and then
|
|
|
|
// fixups or relocations are emitted to replace $symbol@*@lo and
|
|
|
|
// $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
|
|
|
|
// which is a 64-bit pc-relative offset from the encoding of the $symbol
|
|
|
|
// operand to the global variable.
|
|
|
|
//
|
|
|
|
// What we want here is an offset from the value returned by s_getpc
|
|
|
|
// (which is the address of the s_add_u32 instruction) to the global
|
|
|
|
// variable, but since the encoding of $symbol starts 4 bytes after the start
|
|
|
|
// of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
|
|
|
|
// small. This requires us to add 4 to the global variable offset in order to
|
|
|
|
// compute the correct address.
|
|
|
|
|
|
|
|
LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
|
|
|
|
|
|
|
|
Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
|
|
|
|
B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
|
|
|
|
|
|
|
|
MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
|
|
|
|
.addDef(PCReg);
|
|
|
|
|
|
|
|
MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
|
|
|
|
if (GAFlags == SIInstrInfo::MO_NONE)
|
|
|
|
MIB.addImm(0);
|
|
|
|
else
|
|
|
|
MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
|
|
|
|
|
|
|
|
B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
|
|
|
|
|
|
|
|
if (PtrTy.getSizeInBits() == 32)
|
|
|
|
B.buildExtract(DstReg, PCReg, 0);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2019-09-10 01:13:44 +08:00
|
|
|
bool AMDGPULegalizerInfo::legalizeGlobalValue(
|
|
|
|
MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
|
|
MachineIRBuilder &B) const {
|
|
|
|
Register DstReg = MI.getOperand(0).getReg();
|
|
|
|
LLT Ty = MRI.getType(DstReg);
|
|
|
|
unsigned AS = Ty.getAddressSpace();
|
|
|
|
|
|
|
|
const GlobalValue *GV = MI.getOperand(1).getGlobal();
|
|
|
|
MachineFunction &MF = B.getMF();
|
|
|
|
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
|
2019-10-01 09:06:43 +08:00
|
|
|
B.setInstr(MI);
|
2019-09-10 01:13:44 +08:00
|
|
|
|
|
|
|
if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
|
|
|
|
if (!MFI->isEntryFunction()) {
|
|
|
|
const Function &Fn = MF.getFunction();
|
|
|
|
DiagnosticInfoUnsupported BadLDSDecl(
|
2020-03-11 23:49:03 +08:00
|
|
|
Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
|
|
|
|
DS_Warning);
|
2019-09-10 01:13:44 +08:00
|
|
|
Fn.getContext().diagnose(BadLDSDecl);
|
2020-03-11 23:49:03 +08:00
|
|
|
|
|
|
|
// We currently don't have a way to correctly allocate LDS objects that
|
|
|
|
// aren't directly associated with a kernel. We do force inlining of
|
|
|
|
// functions that use local objects. However, if these dead functions are
|
|
|
|
// not eliminated, we don't want a compile time error. Just emit a warning
|
|
|
|
// and a trap, since there should be no callable path here.
|
|
|
|
B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
|
|
|
|
B.buildUndef(DstReg);
|
|
|
|
MI.eraseFromParent();
|
|
|
|
return true;
|
2019-09-10 01:13:44 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// TODO: We could emit code to handle the initialization somewhere.
|
|
|
|
if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
|
2020-01-26 12:20:38 +08:00
|
|
|
const SITargetLowering *TLI = ST.getTargetLowering();
|
|
|
|
if (!TLI->shouldUseLDSConstAddress(GV)) {
|
|
|
|
MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
|
|
|
|
return true; // Leave in place.
|
|
|
|
}
|
|
|
|
|
2019-09-10 01:13:44 +08:00
|
|
|
B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
|
|
|
|
MI.eraseFromParent();
|
|
|
|
return true;
|
|
|
|
}
|
2019-10-01 09:06:43 +08:00
|
|
|
|
|
|
|
const Function &Fn = MF.getFunction();
|
|
|
|
DiagnosticInfoUnsupported BadInit(
|
|
|
|
Fn, "unsupported initializer for address space", MI.getDebugLoc());
|
|
|
|
Fn.getContext().diagnose(BadInit);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
const SITargetLowering *TLI = ST.getTargetLowering();
|
|
|
|
|
|
|
|
if (TLI->shouldEmitFixup(GV)) {
|
|
|
|
buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
|
|
|
|
MI.eraseFromParent();
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (TLI->shouldEmitPCReloc(GV)) {
|
|
|
|
buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
|
|
|
|
MI.eraseFromParent();
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
|
|
|
|
Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
|
|
|
|
|
|
|
|
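// Otherwise fall back to the GOT: compute the GOT entry's address
// pc-relatively and load the actual pointer from it.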
MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
|
[Alignment][NFC] Transitionning more getMachineMemOperand call sites
Summary:
This is patch is part of a series to introduce an Alignment type.
See this thread for context: http://lists.llvm.org/pipermail/llvm-dev/2019-July/133851.html
See this patch for the introduction of the type: https://reviews.llvm.org/D64790
Reviewers: courbet
Subscribers: arsenm, dylanmckay, sdardis, nemanjai, jvesely, nhaehnle, hiraditya, kbarton, jrtc27, atanasyan, Jim, kerbowa, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D77121
2020-03-31 16:05:00 +08:00
|
|
|
MachinePointerInfo::getGOT(MF),
|
|
|
|
MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
|
|
|
|
MachineMemOperand::MOInvariant,
|
|
|
|
8 /*Size*/, Align(8));
|
2019-10-01 09:06:43 +08:00
|
|
|
|
|
|
|
buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
|
|
|
|
|
|
|
|
if (Ty.getSizeInBits() == 32) {
|
|
|
|
// Truncate if this is a 32-bit constant address.
|
|
|
|
auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
|
|
|
|
B.buildExtract(DstReg, Load, 0);
|
2019-09-10 01:13:44 +08:00
|
|
|
} else
|
2019-10-01 09:06:43 +08:00
|
|
|
B.buildLoad(DstReg, GOTAddr, *GOTMMO);
|
2019-09-10 01:13:44 +08:00
|
|
|
|
2019-10-01 09:06:43 +08:00
|
|
|
MI.eraseFromParent();
|
2019-09-10 01:13:44 +08:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2019-09-11 00:42:31 +08:00
|
|
|
bool AMDGPULegalizerInfo::legalizeLoad(
|
|
|
|
MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
|
|
MachineIRBuilder &B, GISelChangeObserver &Observer) const {
|
|
|
|
B.setInstr(MI);
|
|
|
|
LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
|
|
|
|
auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
|
|
|
|
Observer.changingInstr(MI);
|
|
|
|
MI.getOperand(1).setReg(Cast.getReg(0));
|
|
|
|
Observer.changedInstr(MI);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2019-09-13 08:44:35 +08:00
|
|
|
bool AMDGPULegalizerInfo::legalizeFMad(
|
|
|
|
MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
|
|
MachineIRBuilder &B) const {
|
|
|
|
LLT Ty = MRI.getType(MI.getOperand(0).getReg());
|
|
|
|
assert(Ty.isScalar());
|
|
|
|
|
2019-11-02 00:44:56 +08:00
|
|
|
MachineFunction &MF = B.getMF();
|
|
|
|
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
|
|
|
|
|
2019-09-13 08:44:35 +08:00
|
|
|
// TODO: Always legal with future ftz flag.
|
2019-12-03 15:01:21 +08:00
|
|
|
// FIXME: Do we need just output?
|
|
|
|
if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
|
2019-09-13 08:44:35 +08:00
|
|
|
return true;
|
2019-12-03 15:01:21 +08:00
|
|
|
if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
|
2019-09-13 08:44:35 +08:00
|
|
|
return true;
|
|
|
|
|
|
|
|
MachineIRBuilder HelperBuilder(MI);
|
|
|
|
GISelObserverWrapper DummyObserver;
|
|
|
|
LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
|
2020-04-01 07:24:50 +08:00
|
|
|
HelperBuilder.setInstr(MI);
|
2019-09-13 08:44:35 +08:00
|
|
|
return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
|
|
|
|
}
|
|
|
|
|
2019-10-09 01:04:41 +08:00
|
|
|
bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
|
|
|
|
MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
|
|
|
|
Register DstReg = MI.getOperand(0).getReg();
|
|
|
|
Register PtrReg = MI.getOperand(1).getReg();
|
|
|
|
Register CmpVal = MI.getOperand(2).getReg();
|
|
|
|
Register NewVal = MI.getOperand(3).getReg();
|
|
|
|
|
|
|
|
assert(SITargetLowering::isFlatGlobalAddrSpace(
|
|
|
|
MRI.getType(PtrReg).getAddressSpace()) &&
|
|
|
|
"this should not have been custom lowered");
|
|
|
|
|
|
|
|
LLT ValTy = MRI.getType(CmpVal);
|
|
|
|
LLT VecTy = LLT::vector(2, ValTy);
|
|
|
|
|
|
|
|
B.setInstr(MI);
|
|
|
|
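// The target G_AMDGPU_ATOMIC_CMPXCHG expects the new value and the compare
// value packed together into a single vector operand.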
Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
|
|
|
|
|
|
|
|
B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
|
|
|
|
.addDef(DstReg)
|
|
|
|
.addUse(PtrReg)
|
|
|
|
.addUse(PackedVal)
|
|
|
|
.setMemRefs(MI.memoperands());
|
|
|
|
|
|
|
|
MI.eraseFromParent();
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2020-01-22 11:29:30 +08:00
|
|
|
bool AMDGPULegalizerInfo::legalizeFlog(
|
|
|
|
MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
|
|
|
|
Register Dst = MI.getOperand(0).getReg();
|
|
|
|
Register Src = MI.getOperand(1).getReg();
|
|
|
|
LLT Ty = B.getMRI()->getType(Dst);
|
|
|
|
unsigned Flags = MI.getFlags();
|
|
|
|
B.setInstr(MI);
|
|
|
|
|
|
|
|
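// log_b(x) = log2(x) * (1 / log2(b)); Log2BaseInverted supplies the constant
// factor for the requested base.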
auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
|
|
|
|
auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
|
|
|
|
|
|
|
|
B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
|
|
|
|
MI.eraseFromParent();
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2020-01-25 09:53:26 +08:00
|
|
|
bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
|
|
|
|
MachineIRBuilder &B) const {
|
|
|
|
Register Dst = MI.getOperand(0).getReg();
|
|
|
|
Register Src = MI.getOperand(1).getReg();
|
|
|
|
unsigned Flags = MI.getFlags();
|
|
|
|
LLT Ty = B.getMRI()->getType(Dst);
|
|
|
|
B.setInstr(MI);
|
|
|
|
|
|
|
|
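// exp(x) = exp2(x * log2(e)).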
auto K = B.buildFConstant(Ty, numbers::log2e);
|
|
|
|
auto Mul = B.buildFMul(Ty, Src, K, Flags);
|
|
|
|
B.buildFExp2(Dst, Mul, Flags);
|
2020-01-05 04:35:26 +08:00
|
|
|
MI.eraseFromParent();
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2020-02-21 07:59:08 +08:00
|
|
|
bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
|
|
|
|
MachineIRBuilder &B) const {
|
|
|
|
Register Dst = MI.getOperand(0).getReg();
|
|
|
|
Register Src0 = MI.getOperand(1).getReg();
|
|
|
|
Register Src1 = MI.getOperand(2).getReg();
|
|
|
|
unsigned Flags = MI.getFlags();
|
|
|
|
LLT Ty = B.getMRI()->getType(Dst);
|
|
|
|
B.setInstr(MI);
|
|
|
|
const LLT S16 = LLT::scalar(16);
|
|
|
|
const LLT S32 = LLT::scalar(32);
|
|
|
|
|
|
|
|
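// pow(x, y) is expanded as exp2(y * log2(x)); the legacy multiply is used
// for its 0 * x == 0 semantics.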
if (Ty == S32) {
|
|
|
|
auto Log = B.buildFLog2(S32, Src0, Flags);
|
|
|
|
auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
|
|
|
|
.addUse(Log.getReg(0))
|
|
|
|
.addUse(Src1)
|
|
|
|
.setMIFlags(Flags);
|
|
|
|
B.buildFExp2(Dst, Mul, Flags);
|
|
|
|
} else if (Ty == S16) {
|
|
|
|
// There's no f16 fmul_legacy, so we need to convert for it.
|
|
|
|
auto Log = B.buildFLog2(S16, Src0, Flags);
|
|
|
|
auto Ext0 = B.buildFPExt(S32, Log, Flags);
|
|
|
|
auto Ext1 = B.buildFPExt(S32, Src1, Flags);
|
|
|
|
auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
|
|
|
|
.addUse(Ext0.getReg(0))
|
|
|
|
.addUse(Ext1.getReg(0))
|
|
|
|
.setMIFlags(Flags);
|
|
|
|
|
|
|
|
B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
|
|
|
|
} else
|
|
|
|
return false;
|
|
|
|
|
|
|
|
MI.eraseFromParent();
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2020-01-24 23:01:15 +08:00
|
|
|
// Find a source register, ignoring any possible source modifiers.
|
|
|
|
static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
|
|
|
|
Register ModSrc = OrigSrc;
|
|
|
|
if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
|
|
|
|
ModSrc = SrcFNeg->getOperand(1).getReg();
|
|
|
|
if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
|
|
|
|
ModSrc = SrcFAbs->getOperand(1).getReg();
|
|
|
|
} else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
|
|
|
|
ModSrc = SrcFAbs->getOperand(1).getReg();
|
|
|
|
return ModSrc;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
|
|
|
|
MachineRegisterInfo &MRI,
|
|
|
|
MachineIRBuilder &B) const {
|
|
|
|
B.setInstr(MI);
|
|
|
|
|
|
|
|
const LLT S1 = LLT::scalar(1);
|
|
|
|
const LLT S64 = LLT::scalar(64);
|
|
|
|
Register Dst = MI.getOperand(0).getReg();
|
|
|
|
Register OrigSrc = MI.getOperand(1).getReg();
|
|
|
|
unsigned Flags = MI.getFlags();
|
|
|
|
assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
|
|
|
|
"this should not have been custom lowered");
|
|
|
|
|
|
|
|
// V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
|
|
|
|
// is used instead. However, SI doesn't have V_FLOOR_F64, so the most
|
|
|
|
// efficient way to implement it is using V_FRACT_F64. The workaround for the
|
|
|
|
// V_FRACT bug is:
|
|
|
|
// fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
|
|
|
|
//
|
|
|
|
// Convert floor(x) to (x - fract(x))
|
|
|
|
|
|
|
|
auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
|
|
|
|
.addUse(OrigSrc)
|
|
|
|
.setMIFlags(Flags);
|
|
|
|
|
|
|
|
// Give source modifier matching some assistance before obscuring a foldable
|
|
|
|
// pattern.
|
|
|
|
|
|
|
|
// TODO: We can avoid the neg on the fract? The input sign to fract
|
|
|
|
// shouldn't matter?
|
|
|
|
Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
|
|
|
|
|
|
|
|
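// 0x3fefffffffffffff is the largest double below 1.0, the clamp value named
// in the V_FRACT workaround above.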
auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
|
|
|
|
|
|
|
|
Register Min = MRI.createGenericVirtualRegister(S64);
|
|
|
|
|
|
|
|
// We don't need to concern ourselves with the snan handling difference, so
|
|
|
|
// use the one which will directly select.
|
|
|
|
const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
|
|
|
|
if (MFI->getMode().IEEE)
|
|
|
|
B.buildFMinNumIEEE(Min, Fract, Const, Flags);
|
|
|
|
else
|
|
|
|
B.buildFMinNum(Min, Fract, Const, Flags);
|
|
|
|
|
|
|
|
Register CorrectedFract = Min;
|
|
|
|
if (!MI.getFlag(MachineInstr::FmNoNans)) {
|
|
|
|
auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
|
|
|
|
CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
|
|
|
|
}
|
|
|
|
|
|
|
|
auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
|
|
|
|
B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
|
|
|
|
|
|
|
|
MI.eraseFromParent();
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2020-01-05 04:35:26 +08:00
|
|
|
// Turn an illegal packed v2s16 build vector into bit operations.
|
|
|
|
// TODO: This should probably be a bitcast action in LegalizerHelper.
|
|
|
|
bool AMDGPULegalizerInfo::legalizeBuildVector(
|
|
|
|
MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
|
|
|
|
Register Dst = MI.getOperand(0).getReg();
|
|
|
|
const LLT S32 = LLT::scalar(32);
|
2020-03-27 02:32:15 +08:00
|
|
|
assert(MRI.getType(Dst) == LLT::vector(2, 16));
|
2020-01-05 04:35:26 +08:00
|
|
|
|
|
|
|
Register Src0 = MI.getOperand(1).getReg();
|
|
|
|
Register Src1 = MI.getOperand(2).getReg();
|
|
|
|
assert(MRI.getType(Src0) == LLT::scalar(16));
|
|
|
|
|
|
|
|
B.setInstr(MI);
|
|
|
|
auto Merge = B.buildMerge(S32, {Src0, Src1});
|
|
|
|
B.buildBitcast(Dst, Merge);
|
2020-01-25 09:53:26 +08:00
|
|
|
|
|
|
|
MI.eraseFromParent();
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2019-07-02 02:40:23 +08:00
|
|
|
// Return the use branch instruction, otherwise null if the usage is invalid.
|
|
|
|
static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
|
2020-01-06 11:09:24 +08:00
|
|
|
MachineRegisterInfo &MRI,
|
2020-05-17 22:51:22 +08:00
|
|
|
MachineInstr *&Br,
|
|
|
|
MachineBasicBlock *&UncondBrTarget) {
|
2019-07-02 02:40:23 +08:00
|
|
|
Register CondDef = MI.getOperand(0).getReg();
|
|
|
|
if (!MRI.hasOneNonDBGUse(CondDef))
|
|
|
|
return nullptr;
|
|
|
|
|
2020-05-17 22:51:22 +08:00
|
|
|
MachineBasicBlock *Parent = MI.getParent();
|
2019-07-02 02:40:23 +08:00
|
|
|
MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
|
2020-05-17 22:51:22 +08:00
|
|
|
if (UseMI.getParent() != Parent ||
|
2020-01-06 11:09:24 +08:00
|
|
|
UseMI.getOpcode() != AMDGPU::G_BRCOND)
|
|
|
|
return nullptr;
|
|
|
|
|
2020-05-17 22:51:22 +08:00
|
|
|
// Make sure the cond br is followed by a G_BR, or is the last instruction.
|
2020-01-06 11:09:24 +08:00
|
|
|
MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
|
2020-05-17 22:51:22 +08:00
|
|
|
if (Next == Parent->end()) {
|
|
|
|
MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
|
|
|
|
if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
|
|
|
|
return nullptr;
|
|
|
|
UncondBrTarget = &*NextMBB;
|
|
|
|
} else {
|
2020-01-06 11:09:24 +08:00
|
|
|
if (Next->getOpcode() != AMDGPU::G_BR)
|
|
|
|
return nullptr;
|
|
|
|
Br = &*Next;
|
2020-05-17 22:51:22 +08:00
|
|
|
UncondBrTarget = Br->getOperand(0).getMBB();
|
2020-01-06 11:09:24 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
return &UseMI;
|
2019-07-02 02:40:23 +08:00
|
|
|
}
|
|
|
|
|
AMDGPU/GlobalISel: Support llvm.trap and llvm.debugtrap intrinsics
Summary: Lower trap and debugtrap intrinsics to AMDGPU machine instruction(s).
Reviewers: arsenm, nhaehnle, kerbowa, cdevadas, t-tye, kzhuravl
Reviewed By: arsenm
Subscribers: kzhuravl, jvesely, wdng, yaxunl, rovka, dstuttard, tpr, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D74688
2020-03-05 10:45:55 +08:00
|
|
|
Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B,
|
|
|
|
MachineRegisterInfo &MRI,
|
|
|
|
Register LiveIn,
|
|
|
|
Register PhyReg) const {
|
|
|
|
assert(PhyReg.isPhysical() && "Physical register expected");
|
|
|
|
|
|
|
|
// Insert the live-in copy, if required, by defining the destination virtual
|
|
|
|
// register.
|
|
|
|
// FIXME: It seems EmitLiveInCopies isn't called anywhere?
|
|
|
|
if (!MRI.getVRegDef(LiveIn)) {
|
|
|
|
// FIXME: Should have scoped insert pt
|
|
|
|
MachineBasicBlock &OrigInsBB = B.getMBB();
|
|
|
|
auto OrigInsPt = B.getInsertPt();
|
|
|
|
|
|
|
|
MachineBasicBlock &EntryMBB = B.getMF().front();
|
|
|
|
EntryMBB.addLiveIn(PhyReg);
|
|
|
|
B.setInsertPt(EntryMBB, EntryMBB.begin());
|
|
|
|
B.buildCopy(LiveIn, PhyReg);
|
|
|
|
|
|
|
|
B.setInsertPt(OrigInsBB, OrigInsPt);
|
|
|
|
}
|
|
|
|
|
|
|
|
return LiveIn;
|
|
|
|
}
|
|
|
|
|
|
|
|
Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
|
|
|
|
MachineRegisterInfo &MRI,
|
|
|
|
Register PhyReg, LLT Ty,
|
|
|
|
bool InsertLiveInCopy) const {
|
|
|
|
assert(PhyReg.isPhysical() && "Physical register expected");
|
|
|
|
|
|
|
|
// Get or create the virtual live-in register.
|
|
|
|
Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
|
|
|
|
if (!LiveIn) {
|
|
|
|
LiveIn = MRI.createGenericVirtualRegister(Ty);
|
|
|
|
MRI.addLiveIn(PhyReg, LiveIn);
|
|
|
|
}
|
|
|
|
|
|
|
|
// When the actual copy required is from a virtual register to a physical
|
|
|
|
// register (to be inserted later), the live-in copy from the physical
|
|
|
|
// register to the virtual register is not required.
|
|
|
|
if (!InsertLiveInCopy)
|
2019-07-02 02:45:36 +08:00
|
|
|
return LiveIn;
|
|
|
|
|
AMDGPU/GlobalISel: Support llvm.trap and llvm.debugtrap intrinsics
Summary: Lower trap and debugtrap intrinsics to AMDGPU machine instruction(s).
Reviewers: arsenm, nhaehnle, kerbowa, cdevadas, t-tye, kzhuravl
Reviewed By: arsenm
Subscribers: kzhuravl, jvesely, wdng, yaxunl, rovka, dstuttard, tpr, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D74688
2020-03-05 10:45:55 +08:00
|
|
|
return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
|
|
|
|
}
|
|
|
|
|
|
|
|
const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
|
|
|
|
MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
|
|
|
|
const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
|
|
|
|
const ArgDescriptor *Arg;
|
|
|
|
const TargetRegisterClass *RC;
|
|
|
|
std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
|
|
|
|
if (!Arg) {
|
|
|
|
LLVM_DEBUG(dbgs() << "Required arg register missing\n");
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
return Arg;
|
2019-07-02 02:45:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
|
|
|
|
const ArgDescriptor *Arg) const {
|
2019-09-05 10:20:29 +08:00
|
|
|
if (!Arg->isRegister() || !Arg->getRegister().isValid())
|
2019-07-02 02:45:36 +08:00
|
|
|
return false; // TODO: Handle these
|
|
|
|
|
AMDGPU/GlobalISel: Support llvm.trap and llvm.debugtrap intrinsics
Summary: Lower trap and debugtrap intrinsics to AMDGPU machine instruction(s).
Reviewers: arsenm, nhaehnle, kerbowa, cdevadas, t-tye, kzhuravl
Reviewed By: arsenm
Subscribers: kzhuravl, jvesely, wdng, yaxunl, rovka, dstuttard, tpr, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D74688
2020-03-05 10:45:55 +08:00
|
|
|
Register SrcReg = Arg->getRegister();
|
|
|
|
assert(SrcReg.isPhysical() && "Physical register expected");
|
|
|
|
assert(DstReg.isVirtual() && "Virtual register expected");
|
2019-07-02 02:45:36 +08:00
|
|
|
|
|
|
|
MachineRegisterInfo &MRI = *B.getMRI();
|
|
|
|
|
|
|
|
LLT Ty = MRI.getType(DstReg);
|
AMDGPU/GlobalISel: Support llvm.trap and llvm.debugtrap intrinsics
Summary: Lower trap and debugtrap intrinsics to AMDGPU machine instruction(s).
Reviewers: arsenm, nhaehnle, kerbowa, cdevadas, t-tye, kzhuravl
Reviewed By: arsenm
Subscribers: kzhuravl, jvesely, wdng, yaxunl, rovka, dstuttard, tpr, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D74688
2020-03-05 10:45:55 +08:00
|
|
|
Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);
|
2019-07-02 02:45:36 +08:00
|
|
|
|
|
|
|
if (Arg->isMasked()) {
|
|
|
|
// TODO: Should we try to emit this once in the entry block?
|
|
|
|
const LLT S32 = LLT::scalar(32);
|
|
|
|
const unsigned Mask = Arg->getMask();
|
|
|
|
const unsigned Shift = countTrailingZeros<unsigned>(Mask);
|
|
|
|
|
2019-10-01 09:44:46 +08:00
|
|
|
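// A masked argument occupies a bit-field within its register: shift the
// live-in value down to bit 0 and mask it to the field width.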
Register AndMaskSrc = LiveIn;
|
|
|
|
|
|
|
|
if (Shift != 0) {
|
|
|
|
auto ShiftAmt = B.buildConstant(S32, Shift);
|
|
|
|
AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
|
|
|
|
}
|
|
|
|
|
|
|
|
B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
|
AMDGPU/GlobalISel: Support llvm.trap and llvm.debugtrap intrinsics
Summary: Lower trap and debugtrap intrinsics to AMDGPU machine instruction(s).
Reviewers: arsenm, nhaehnle, kerbowa, cdevadas, t-tye, kzhuravl
Reviewed By: arsenm
Subscribers: kzhuravl, jvesely, wdng, yaxunl, rovka, dstuttard, tpr, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D74688
2020-03-05 10:45:55 +08:00
|
|
|
} else {
|
2019-07-02 02:45:36 +08:00
|
|
|
B.buildCopy(DstReg, LiveIn);
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
|
AMDGPU/GlobalISel: Support llvm.trap and llvm.debugtrap intrinsics
Summary: Lower trap and debugtrap intrinsics to AMDGPU machine instruction(s).
Reviewers: arsenm, nhaehnle, kerbowa, cdevadas, t-tye, kzhuravl
Reviewed By: arsenm
Subscribers: kzhuravl, jvesely, wdng, yaxunl, rovka, dstuttard, tpr, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D74688
2020-03-05 10:45:55 +08:00
|
|
|
MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
|
|
|
|
AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
|
2019-07-02 02:45:36 +08:00
|
|
|
B.setInstr(MI);
|
|
|
|
|
AMDGPU/GlobalISel: Support llvm.trap and llvm.debugtrap intrinsics
Summary: Lower trap and debugtrap intrinsics to AMDGPU machine instruction(s).
Reviewers: arsenm, nhaehnle, kerbowa, cdevadas, t-tye, kzhuravl
Reviewed By: arsenm
Subscribers: kzhuravl, jvesely, wdng, yaxunl, rovka, dstuttard, tpr, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D74688
2020-03-05 10:45:55 +08:00
|
|
|
const ArgDescriptor *Arg = getArgDescriptor(B, ArgType);
|
|
|
|
if (!Arg)
|
2019-07-02 02:45:36 +08:00
|
|
|
return false;
|
|
|
|
|
AMDGPU/GlobalISel: Support llvm.trap and llvm.debugtrap intrinsics
Summary: Lower trap and debugtrap intrinsics to AMDGPU machine instruction(s).
Reviewers: arsenm, nhaehnle, kerbowa, cdevadas, t-tye, kzhuravl
Reviewed By: arsenm
Subscribers: kzhuravl, jvesely, wdng, yaxunl, rovka, dstuttard, tpr, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D74688
2020-03-05 10:45:55 +08:00
|
|
|
if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg))
|
|
|
|
return false;
|
2019-07-02 02:45:36 +08:00
|
|
|
|
AMDGPU/GlobalISel: Support llvm.trap and llvm.debugtrap intrinsics
Summary: Lower trap and debugtrap intrinsics to AMDGPU machine instruction(s).
Reviewers: arsenm, nhaehnle, kerbowa, cdevadas, t-tye, kzhuravl
Reviewed By: arsenm
Subscribers: kzhuravl, jvesely, wdng, yaxunl, rovka, dstuttard, tpr, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D74688
2020-03-05 10:45:55 +08:00
|
|
|
MI.eraseFromParent();
|
|
|
|
return true;
|
2019-07-02 02:45:36 +08:00
|
|
|
}
|
|
|
|
|
AMDGPU/GlobalISel: Legalize fast unsafe FDIV
Reviewers: arsenm
Reviewed By: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, dstuttard, tpr, t-tye, hiraditya, Petar.Avramovic, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69231
llvm-svn: 375460
2019-10-22 06:18:26 +08:00
|
|
|
bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
|
|
|
|
MachineRegisterInfo &MRI,
|
|
|
|
MachineIRBuilder &B) const {
|
|
|
|
B.setInstr(MI);
|
AMDGPU/GlobalISel: Legalize FDIV16
Reviewers: arsenm
Reviewed By: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, dstuttard, tpr, t-tye, hiraditya, volkan, Petar.Avramovic, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69347
2019-10-23 08:39:26 +08:00
|
|
|
Register Dst = MI.getOperand(0).getReg();
|
|
|
|
LLT DstTy = MRI.getType(Dst);
|
|
|
|
LLT S16 = LLT::scalar(16);
|
AMDGPU/GlobalISel: Legalize FDIV32
Reviewers: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, dstuttard, tpr, t-tye, hiraditya, Petar.Avramovic, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69581
2019-10-30 00:55:49 +08:00
|
|
|
LLT S32 = LLT::scalar(32);
|
AMDGPU/GlobalISel: Legalize FDIV64
Reviewers: arsenm
Reviewed By: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, dstuttard, tpr, t-tye, hiraditya, Petar.Avramovic, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D70403
2019-11-18 08:43:59 +08:00
|
|
|
LLT S64 = LLT::scalar(64);
|
AMDGPU/GlobalISel: Legalize fast unsafe FDIV
Reviewers: arsenm
Reviewed By: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, dstuttard, tpr, t-tye, hiraditya, Petar.Avramovic, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69231
llvm-svn: 375460
2019-10-22 06:18:26 +08:00
|
|
|
|
|
|
|
if (legalizeFastUnsafeFDIV(MI, MRI, B))
|
|
|
|
return true;
|
|
|
|
|
AMDGPU/GlobalISel: Legalize FDIV16
Reviewers: arsenm
Reviewed By: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, dstuttard, tpr, t-tye, hiraditya, volkan, Petar.Avramovic, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69347
2019-10-23 08:39:26 +08:00
|
|
|
if (DstTy == S16)
|
|
|
|
return legalizeFDIV16(MI, MRI, B);
|
AMDGPU/GlobalISel: Legalize FDIV32
Reviewers: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, dstuttard, tpr, t-tye, hiraditya, Petar.Avramovic, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69581
2019-10-30 00:55:49 +08:00
|
|
|
if (DstTy == S32)
|
|
|
|
return legalizeFDIV32(MI, MRI, B);
|
AMDGPU/GlobalISel: Legalize FDIV64
Reviewers: arsenm
Reviewed By: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, dstuttard, tpr, t-tye, hiraditya, Petar.Avramovic, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D70403
2019-11-18 08:43:59 +08:00
|
|
|
if (DstTy == S64)
|
|
|
|
return legalizeFDIV64(MI, MRI, B);
|
AMDGPU/GlobalISel: Legalize FDIV16
Reviewers: arsenm
Reviewed By: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, dstuttard, tpr, t-tye, hiraditya, volkan, Petar.Avramovic, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69347
2019-10-23 08:39:26 +08:00
|
|
|
|
AMDGPU/GlobalISel: Legalize fast unsafe FDIV
Reviewers: arsenm
Reviewed By: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, dstuttard, tpr, t-tye, hiraditya, Petar.Avramovic, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69231
llvm-svn: 375460
2019-10-22 06:18:26 +08:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2020-02-12 09:51:02 +08:00
|
|
|
static Register buildDivRCP(MachineIRBuilder &B, Register Src) {
|
|
|
|
const LLT S32 = LLT::scalar(32);
|
|
|
|
|
|
|
|
auto Cvt0 = B.buildUITOFP(S32, Src);
|
|
|
|
auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0});
|
|
|
|
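// 0x4f800000 is 2^32 as a float; scaling the reciprocal by it gives an
// integer approximation of 2^32 / Den.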
auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000));
|
|
|
|
auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1);
|
|
|
|
return B.buildFPTOUI(S32, Mul).getReg(0);
|
|
|
|
}
|
|
|
|
|
2020-02-12 09:48:45 +08:00
|
|
|
void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
|
|
|
|
Register DstReg,
|
|
|
|
Register Num,
|
|
|
|
Register Den,
|
|
|
|
bool IsRem) const {
|
2020-02-12 09:51:02 +08:00
|
|
|
const LLT S1 = LLT::scalar(1);
|
|
|
|
const LLT S32 = LLT::scalar(32);
|
|
|
|
|
|
|
|
// RCP = URECIP(Den) = 2^32 / Den + e
|
|
|
|
// e is rounding error.
|
|
|
|
auto RCP = buildDivRCP(B, Den);
|
|
|
|
|
|
|
|
// RCP_LO = mul(RCP, Den)
|
|
|
|
auto RCP_LO = B.buildMul(S32, RCP, Den);
|
|
|
|
|
|
|
|
// RCP_HI = mulhu(RCP, Den)
|
|
|
|
auto RCP_HI = B.buildUMulH(S32, RCP, Den);
|
|
|
|
|
|
|
|
// NEG_RCP_LO = -RCP_LO
|
|
|
|
auto Zero = B.buildConstant(S32, 0);
|
|
|
|
auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO);
|
|
|
|
|
|
|
|
// ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
|
|
|
|
auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero);
|
|
|
|
auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO);
|
|
|
|
|
|
|
|
// Calculate the rounding error from the URECIP instruction
|
|
|
|
// E = mulhu(ABS_RCP_LO, RCP)
|
|
|
|
auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP);
|
|
|
|
|
|
|
|
// RCP_A_E = RCP + E
|
|
|
|
auto RCP_A_E = B.buildAdd(S32, RCP, E);
|
|
|
|
|
|
|
|
// RCP_S_E = RCP - E
|
|
|
|
auto RCP_S_E = B.buildSub(S32, RCP, E);
|
|
|
|
|
|
|
|
// Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
|
|
|
|
auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E);
|
|
|
|
|
|
|
|
// Quotient = mulhu(Tmp0, Num)
|
|
|
|
auto Quotient = B.buildUMulH(S32, Tmp0, Num);
|
|
|
|
|
|
|
|
// Num_S_Remainder = Quotient * Den
|
|
|
|
auto Num_S_Remainder = B.buildMul(S32, Quotient, Den);
|
|
|
|
|
|
|
|
// Remainder = Num - Num_S_Remainder
|
|
|
|
auto Remainder = B.buildSub(S32, Num, Num_S_Remainder);
|
|
|
|
|
|
|
|
// Remainder_GE_Den = Remainder >= Den
|
|
|
|
auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den);
|
|
|
|
|
|
|
|
// Remainder_GE_Zero = Num >= Num_S_Remainder;
|
|
|
|
auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1,
|
|
|
|
Num, Num_S_Remainder);
|
|
|
|
|
|
|
|
// Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
|
|
|
|
auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero);
|
|
|
|
|
|
|
|
// Calculate Division result:
|
|
|
|
|
|
|
|
// Quotient_A_One = Quotient + 1
|
|
|
|
auto One = B.buildConstant(S32, 1);
|
|
|
|
auto Quotient_A_One = B.buildAdd(S32, Quotient, One);
|
|
|
|
|
|
|
|
// Quotient_S_One = Quotient - 1
|
|
|
|
auto Quotient_S_One = B.buildSub(S32, Quotient, One);
|
|
|
|
|
|
|
|
// Div = (Tmp1 == 0 ? Quotient_A_One : Quotient)
|
|
|
|
auto Div = B.buildSelect(S32, Tmp1, Quotient, Quotient_A_One);
|
|
|
|
|
|
|
|
// Div = (Remainder_GE_Zero ? Div : Quotient_S_One)
|
|
|
|
if (IsRem) {
|
|
|
|
Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One);
|
|
|
|
|
|
|
|
// Calculate Rem result:
|
|
|
|
auto Remainder_S_Den = B.buildSub(S32, Remainder, Den);
|
|
|
|
|
|
|
|
// Remainder_A_Den = Remainder + Den
|
|
|
|
auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den);
|
|
|
|
|
|
|
|
// Rem = (Tmp1 ? Remainder_S_Den : Remainder)
|
|
|
|
auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder);
|
|
|
|
|
|
|
|
// Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den)
|
|
|
|
B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den);
|
|
|
|
} else {
|
|
|
|
B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One);
|
|
|
|
}
|
2020-02-12 09:48:45 +08:00
|
|
|
}
|
2020-02-12 09:51:02 +08:00
|
|
|
|
2020-02-12 09:48:45 +08:00
|
|
|
bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
|
|
|
|
MachineRegisterInfo &MRI,
|
|
|
|
MachineIRBuilder &B) const {
|
|
|
|
B.setInstr(MI);
|
|
|
|
const bool IsRem = MI.getOpcode() == AMDGPU::G_UREM;
|
|
|
|
Register DstReg = MI.getOperand(0).getReg();
|
|
|
|
Register Num = MI.getOperand(1).getReg();
|
|
|
|
Register Den = MI.getOperand(2).getReg();
|
|
|
|
legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsRem);
|
2020-02-12 09:51:02 +08:00
|
|
|
MI.eraseFromParent();
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2020-02-26 23:17:07 +08:00
|
|
|
// Build an integer reciprocal sequence around V_RCP_IFLAG_F32.
|
|
|
|
//
|
|
|
|
// Return lo, hi of result
|
|
|
|
//
|
|
|
|
// %cvt.lo = G_UITOFP Val.lo
|
|
|
|
// %cvt.hi = G_UITOFP Val.hi
|
|
|
|
// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
|
|
|
|
// %rcp = G_AMDGPU_RCP_IFLAG %mad
|
|
|
|
// %mul1 = G_FMUL %rcp, 0x5f7ffffc
|
|
|
|
// %mul2 = G_FMUL %mul1, 2**(-32)
|
|
|
|
// %trunc = G_INTRINSIC_TRUNC %mul2
|
|
|
|
// %mad2 = G_FMAD %trunc, -(2**32), %mul1
|
|
|
|
// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
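//
// The float constants used below encode 2**32 (0x4f800000), a value just
// below 2**64 (0x5f7ffffc), 2**(-32) (0x2f800000) and -(2**32) (0xcf800000).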
|
|
|
|
static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
|
|
|
|
Register Val) {
|
|
|
|
const LLT S32 = LLT::scalar(32);
|
|
|
|
auto Unmerge = B.buildUnmerge(S32, Val);
|
|
|
|
|
|
|
|
auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
|
|
|
|
auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
|
|
|
|
|
|
|
|
auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
|
|
|
|
B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
|
|
|
|
|
|
|
|
auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
|
|
|
|
auto Mul1 =
|
|
|
|
B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
|
|
|
|
|
|
|
|
// 2**(-32)
|
|
|
|
auto Mul2 =
|
|
|
|
B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
|
|
|
|
auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
|
|
|
|
|
|
|
|
// -(2**32)
|
|
|
|
auto Mad2 = B.buildFMAD(S32, Trunc,
|
|
|
|
B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
|
|
|
|
|
|
|
|
auto ResultLo = B.buildFPTOUI(S32, Mad2);
|
|
|
|
auto ResultHi = B.buildFPTOUI(S32, Trunc);
|
|
|
|
|
|
|
|
return {ResultLo.getReg(0), ResultHi.getReg(0)};
|
|
|
|
}
|
|
|
|
|
|
|
|
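// Expand 64-bit unsigned division/remainder. The denominator's reciprocal is
// approximated with emitReciprocalU64 and refined with two correction steps
// built from 32-bit multiplies, adds and carries; the candidate quotient and
// remainder are then adjusted by up to two conditional subtractions of the
// denominator, expressed as selects rather than control flow.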
bool AMDGPULegalizerInfo::legalizeUDIV_UREM64(MachineInstr &MI,
|
|
|
|
MachineRegisterInfo &MRI,
|
|
|
|
MachineIRBuilder &B) const {
|
|
|
|
B.setInstr(MI);
|
|
|
|
|
|
|
|
const bool IsDiv = MI.getOpcode() == TargetOpcode::G_UDIV;
|
|
|
|
const LLT S32 = LLT::scalar(32);
|
|
|
|
const LLT S64 = LLT::scalar(64);
|
|
|
|
const LLT S1 = LLT::scalar(1);
|
|
|
|
Register Numer = MI.getOperand(1).getReg();
|
|
|
|
Register Denom = MI.getOperand(2).getReg();
|
|
|
|
Register RcpLo, RcpHi;
|
|
|
|
|
|
|
|
std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
|
|
|
|
|
|
|
|
auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
|
|
|
|
|
|
|
|
auto Zero64 = B.buildConstant(S64, 0);
|
|
|
|
auto NegDenom = B.buildSub(S64, Zero64, Denom);
|
|
|
|
|
|
|
|
auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
|
|
|
|
auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
|
|
|
|
|
|
|
|
auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
|
|
|
|
Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
|
|
|
|
Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
|
|
|
|
|
|
|
|
auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
|
|
|
|
auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
|
|
|
|
auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
|
|
|
|
auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
|
|
|
|
|
|
|
|
auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
|
|
|
|
auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
|
|
|
|
auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
|
|
|
|
Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
|
|
|
|
Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
|
|
|
|
|
|
|
|
auto Zero32 = B.buildConstant(S32, 0);
|
|
|
|
auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
|
|
|
|
auto Add2_HiC =
|
|
|
|
B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
|
|
|
|
auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
|
|
|
|
auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
|
|
|
|
|
|
|
|
auto UnmergeNumer = B.buildUnmerge(S32, Numer);
|
|
|
|
Register NumerLo = UnmergeNumer.getReg(0);
|
|
|
|
Register NumerHi = UnmergeNumer.getReg(1);
|
|
|
|
|
|
|
|
auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
|
|
|
|
auto Mul3 = B.buildMul(S64, Denom, MulHi3);
|
|
|
|
auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
|
|
|
|
Register Mul3_Lo = UnmergeMul3.getReg(0);
|
|
|
|
Register Mul3_Hi = UnmergeMul3.getReg(1);
|
|
|
|
auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
|
|
|
|
auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
|
|
|
|
auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
|
|
|
|
auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
|
|
|
|
|
|
|
|
auto UnmergeDenom = B.buildUnmerge(S32, Denom);
|
|
|
|
Register DenomLo = UnmergeDenom.getReg(0);
|
|
|
|
Register DenomHi = UnmergeDenom.getReg(1);
|
|
|
|
|
|
|
|
auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
|
|
|
|
auto C1 = B.buildSExt(S32, CmpHi);
|
|
|
|
|
|
|
|
auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
|
|
|
|
auto C2 = B.buildSExt(S32, CmpLo);
|
|
|
|
|
|
|
|
auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
|
|
|
|
auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
|
|
|
|
|
|
|
|
// TODO: Here and below, portions of the code could be enclosed in if/endif blocks.
|
|
|
|
// Currently control flow is unconditional and we have 4 selects after
|
|
|
|
// potential endif to substitute PHIs.
|
|
|
|
|
|
|
|
// if C3 != 0 ...
|
|
|
|
auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
|
|
|
|
auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
|
|
|
|
auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
|
|
|
|
auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
|
|
|
|
|
|
|
|
auto One64 = B.buildConstant(S64, 1);
|
|
|
|
auto Add3 = B.buildAdd(S64, MulHi3, One64);
|
|
|
|
|
|
|
|
auto C4 =
|
|
|
|
B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
|
|
|
|
auto C5 =
|
|
|
|
B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
|
|
|
|
auto C6 = B.buildSelect(
|
|
|
|
S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
|
|
|
|
|
|
|
|
// if (C6 != 0)
|
|
|
|
auto Add4 = B.buildAdd(S64, Add3, One64);
|
|
|
|
auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
|
|
|
|
|
|
|
|
auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
|
|
|
|
auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
|
|
|
|
auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
|
|
|
|
|
|
|
|
// endif C6
|
|
|
|
// endif C3
|
|
|
|
|
|
|
|
if (IsDiv) {
|
|
|
|
auto Sel1 = B.buildSelect(
|
|
|
|
S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
|
|
|
|
B.buildSelect(MI.getOperand(0),
|
|
|
|
B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
|
|
|
|
} else {
|
|
|
|
auto Sel2 = B.buildSelect(
|
|
|
|
S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
|
|
|
|
B.buildSelect(MI.getOperand(0),
|
|
|
|
B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
|
|
|
|
}
|
|
|
|
|
|
|
|
MI.eraseFromParent();
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2020-02-12 09:51:02 +08:00
|
|
|
bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
|
|
|
|
MachineRegisterInfo &MRI,
|
|
|
|
MachineIRBuilder &B) const {
|
2020-02-26 23:17:07 +08:00
|
|
|
LLT Ty = MRI.getType(MI.getOperand(0).getReg());
|
|
|
|
if (Ty == LLT::scalar(32))
|
2020-02-12 09:51:02 +08:00
|
|
|
return legalizeUDIV_UREM32(MI, MRI, B);
|
2020-02-26 23:17:07 +08:00
|
|
|
if (Ty == LLT::scalar(64))
|
|
|
|
return legalizeUDIV_UREM64(MI, MRI, B);
|
2020-02-12 09:51:02 +08:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2020-02-12 09:48:45 +08:00
|
|
|
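// Signed 32-bit division/remainder is lowered by taking absolute values,
// reusing the unsigned expansion, and restoring the sign afterwards:
//   sign(x)   = x >> 31 (arithmetic; all ones when x is negative)
//   abs(x)    = (x + sign(x)) ^ sign(x)
//   rem sign  = sign(LHS), div sign = sign(LHS) ^ sign(RHS)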
bool AMDGPULegalizerInfo::legalizeSDIV_SREM32(MachineInstr &MI,
|
|
|
|
MachineRegisterInfo &MRI,
|
|
|
|
MachineIRBuilder &B) const {
|
|
|
|
B.setInstr(MI);
|
|
|
|
const LLT S32 = LLT::scalar(32);
|
|
|
|
|
|
|
|
const bool IsRem = MI.getOpcode() == AMDGPU::G_SREM;
|
|
|
|
Register DstReg = MI.getOperand(0).getReg();
|
|
|
|
Register LHS = MI.getOperand(1).getReg();
|
|
|
|
Register RHS = MI.getOperand(2).getReg();
|
|
|
|
|
|
|
|
auto ThirtyOne = B.buildConstant(S32, 31);
|
|
|
|
auto LHSign = B.buildAShr(S32, LHS, ThirtyOne);
|
|
|
|
auto RHSign = B.buildAShr(S32, RHS, ThirtyOne);
|
|
|
|
|
|
|
|
LHS = B.buildAdd(S32, LHS, LHSign).getReg(0);
|
|
|
|
RHS = B.buildAdd(S32, RHS, RHSign).getReg(0);
|
|
|
|
|
|
|
|
LHS = B.buildXor(S32, LHS, LHSign).getReg(0);
|
|
|
|
RHS = B.buildXor(S32, RHS, RHSign).getReg(0);
|
|
|
|
|
|
|
|
Register UDivRem = MRI.createGenericVirtualRegister(S32);
|
|
|
|
legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsRem);
|
|
|
|
|
|
|
|
if (IsRem) {
|
|
|
|
auto RSign = LHSign; // Remainder sign is the same as LHS
|
|
|
|
UDivRem = B.buildXor(S32, UDivRem, RSign).getReg(0);
|
|
|
|
B.buildSub(DstReg, UDivRem, RSign);
|
|
|
|
} else {
|
|
|
|
auto DSign = B.buildXor(S32, LHSign, RHSign);
|
|
|
|
UDivRem = B.buildXor(S32, UDivRem, DSign).getReg(0);
|
|
|
|
B.buildSub(DstReg, UDivRem, DSign);
|
|
|
|
}
|
|
|
|
|
|
|
|
MI.eraseFromParent();
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
|
|
|
|
MachineRegisterInfo &MRI,
|
|
|
|
MachineIRBuilder &B) const {
|
|
|
|
if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32))
|
|
|
|
return legalizeSDIV_SREM32(MI, MRI, B);
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
AMDGPU/GlobalISel: Legalize fast unsafe FDIV
Reviewers: arsenm
Reviewed By: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, dstuttard, tpr, t-tye, hiraditya, Petar.Avramovic, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69231
llvm-svn: 375460
2019-10-22 06:18:26 +08:00
|
|
|
bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
|
|
|
|
MachineRegisterInfo &MRI,
|
|
|
|
MachineIRBuilder &B) const {
|
|
|
|
Register Res = MI.getOperand(0).getReg();
|
|
|
|
Register LHS = MI.getOperand(1).getReg();
|
|
|
|
Register RHS = MI.getOperand(2).getReg();
|
|
|
|
|
|
|
|
uint16_t Flags = MI.getFlags();
|
|
|
|
|
|
|
|
LLT ResTy = MRI.getType(Res);
|
|
|
|
LLT S32 = LLT::scalar(32);
|
|
|
|
LLT S64 = LLT::scalar(64);
|
|
|
|
|
|
|
|
const MachineFunction &MF = B.getMF();
|
|
|
|
bool Unsafe =
|
|
|
|
MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
|
|
|
|
|
|
|
|
if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
|
|
|
|
return false;
|
|
|
|
|
2019-11-02 00:44:56 +08:00
|
|
|
if (!Unsafe && ResTy == S32 &&
|
2019-12-03 15:01:21 +08:00
|
|
|
MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
|
AMDGPU/GlobalISel: Legalize fast unsafe FDIV
Reviewers: arsenm
Reviewed By: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, dstuttard, tpr, t-tye, hiraditya, Petar.Avramovic, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69231
llvm-svn: 375460
2019-10-22 06:18:26 +08:00
|
|
|
return false;
|
|
|
|
|
|
|
|
if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
|
|
|
|
// 1 / x -> RCP(x)
|
|
|
|
if (CLHS->isExactlyValue(1.0)) {
|
|
|
|
B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
|
|
|
|
.addUse(RHS)
|
|
|
|
.setMIFlags(Flags);
|
|
|
|
|
|
|
|
MI.eraseFromParent();
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
// -1 / x -> RCP( FNEG(x) )
|
|
|
|
if (CLHS->isExactlyValue(-1.0)) {
|
|
|
|
auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
|
|
|
|
B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
|
|
|
|
.addUse(FNeg.getReg(0))
|
|
|
|
.setMIFlags(Flags);
|
|
|
|
|
|
|
|
MI.eraseFromParent();
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// x / y -> x * (1.0 / y)
|
|
|
|
if (Unsafe) {
|
|
|
|
auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
|
|
|
|
.addUse(RHS)
|
|
|
|
.setMIFlags(Flags);
|
|
|
|
B.buildFMul(Res, LHS, RCP, Flags);
|
|
|
|
|
|
|
|
MI.eraseFromParent();
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
AMDGPU/GlobalISel: Legalize FDIV16
Reviewers: arsenm
Reviewed By: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, dstuttard, tpr, t-tye, hiraditya, volkan, Petar.Avramovic, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69347
2019-10-23 08:39:26 +08:00
|
|
|
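// f16 division is expanded by promoting both operands to f32, multiplying the
// numerator by an f32 reciprocal of the denominator, truncating back to f16,
// and letting amdgcn.div.fixup handle the remaining special cases.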
bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
|
|
|
|
MachineRegisterInfo &MRI,
|
|
|
|
MachineIRBuilder &B) const {
|
|
|
|
B.setInstr(MI);
|
|
|
|
Register Res = MI.getOperand(0).getReg();
|
|
|
|
Register LHS = MI.getOperand(1).getReg();
|
|
|
|
Register RHS = MI.getOperand(2).getReg();
|
|
|
|
|
|
|
|
uint16_t Flags = MI.getFlags();
|
|
|
|
|
|
|
|
LLT S16 = LLT::scalar(16);
|
|
|
|
LLT S32 = LLT::scalar(32);
|
|
|
|
|
|
|
|
auto LHSExt = B.buildFPExt(S32, LHS, Flags);
|
|
|
|
auto RHSExt = B.buildFPExt(S32, RHS, Flags);
|
|
|
|
|
|
|
|
auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
|
|
|
|
.addUse(RHSExt.getReg(0))
|
|
|
|
.setMIFlags(Flags);
|
|
|
|
|
|
|
|
auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
|
|
|
|
auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
|
|
|
|
|
|
|
|
B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
|
|
|
|
.addUse(RDst.getReg(0))
|
|
|
|
.addUse(RHS)
|
|
|
|
.addUse(LHS)
|
|
|
|
.setMIFlags(Flags);
|
|
|
|
|
|
|
|
MI.eraseFromParent();
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
AMDGPU/GlobalISel: Legalize FDIV32
Reviewers: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, dstuttard, tpr, t-tye, hiraditya, Petar.Avramovic, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69581
2019-10-30 00:55:49 +08:00
|
|
|
// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
|
|
|
|
// to enable denorm mode. When 'Enable' is false, disable denorm mode.
|
|
|
|
static void toggleSPDenormMode(bool Enable,
|
2019-11-02 00:44:56 +08:00
|
|
|
MachineIRBuilder &B,
|
AMDGPU/GlobalISel: Legalize FDIV32
Reviewers: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, dstuttard, tpr, t-tye, hiraditya, Petar.Avramovic, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69581
2019-10-30 00:55:49 +08:00
|
|
|
const GCNSubtarget &ST,
|
2019-11-02 00:44:56 +08:00
|
|
|
AMDGPU::SIModeRegisterDefaults Mode) {
|
AMDGPU/GlobalISel: Legalize FDIV32
Reviewers: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, dstuttard, tpr, t-tye, hiraditya, Petar.Avramovic, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69581
2019-10-30 00:55:49 +08:00
|
|
|
// Set SP denorm mode to this value.
|
|
|
|
unsigned SPDenormMode =
|
2019-12-03 15:01:21 +08:00
|
|
|
Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
|
AMDGPU/GlobalISel: Legalize FDIV32
Reviewers: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, dstuttard, tpr, t-tye, hiraditya, Petar.Avramovic, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69581
2019-10-30 00:55:49 +08:00
|
|
|
|
|
|
|
if (ST.hasDenormModeInst()) {
|
|
|
|
// Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
|
2019-12-03 15:01:21 +08:00
|
|
|
uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
|
AMDGPU/GlobalISel: Legalize FDIV32
Reviewers: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, dstuttard, tpr, t-tye, hiraditya, Petar.Avramovic, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69581
2019-10-30 00:55:49 +08:00
|
|
|
|
2019-12-03 15:01:21 +08:00
|
|
|
uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
|
AMDGPU/GlobalISel: Legalize FDIV32
Reviewers: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, dstuttard, tpr, t-tye, hiraditya, Petar.Avramovic, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69581
2019-10-30 00:55:49 +08:00
|
|
|
B.buildInstr(AMDGPU::S_DENORM_MODE)
|
|
|
|
.addImm(NewDenormModeValue);
|
|
|
|
|
|
|
|
} else {
|
|
|
|
// Select FP32 bit field in mode register.
|
|
|
|
unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
|
|
|
|
(4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
|
|
|
|
(1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
|
|
|
|
|
|
|
|
B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
|
|
|
|
.addImm(SPDenormMode)
|
|
|
|
.addImm(SPDenormModeBitField);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
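// f32 division follows the usual div_scale / div_fmas / div_fixup sequence:
// scale the operands, refine an initial reciprocal estimate with a chain of
// FMAs, combine the pieces with div_fmas, and let div_fixup produce the
// final result. FP32 denormals are temporarily enabled around the FMA chain
// when the current mode register setting has them flushed.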
bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
|
|
|
|
MachineRegisterInfo &MRI,
|
|
|
|
MachineIRBuilder &B) const {
|
|
|
|
B.setInstr(MI);
|
|
|
|
Register Res = MI.getOperand(0).getReg();
|
|
|
|
Register LHS = MI.getOperand(1).getReg();
|
|
|
|
Register RHS = MI.getOperand(2).getReg();
|
2019-11-02 00:44:56 +08:00
|
|
|
const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
|
|
|
|
AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
|
AMDGPU/GlobalISel: Legalize FDIV32
Reviewers: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, dstuttard, tpr, t-tye, hiraditya, Petar.Avramovic, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69581
2019-10-30 00:55:49 +08:00
|
|
|
|
|
|
|
uint16_t Flags = MI.getFlags();
|
|
|
|
|
|
|
|
LLT S32 = LLT::scalar(32);
|
|
|
|
LLT S1 = LLT::scalar(1);
|
|
|
|
|
|
|
|
auto One = B.buildFConstant(S32, 1.0f);
|
|
|
|
|
|
|
|
auto DenominatorScaled =
|
|
|
|
B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
|
|
|
|
.addUse(LHS)
|
2020-04-13 14:25:18 +08:00
|
|
|
.addUse(RHS)
|
|
|
|
.addImm(0)
|
AMDGPU/GlobalISel: Legalize FDIV32
Reviewers: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, dstuttard, tpr, t-tye, hiraditya, Petar.Avramovic, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69581
2019-10-30 00:55:49 +08:00
|
|
|
.setMIFlags(Flags);
|
|
|
|
auto NumeratorScaled =
|
|
|
|
B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
|
|
|
|
.addUse(LHS)
|
|
|
|
.addUse(RHS)
|
2020-04-13 14:25:18 +08:00
|
|
|
.addImm(1)
|
AMDGPU/GlobalISel: Legalize FDIV32
Reviewers: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, dstuttard, tpr, t-tye, hiraditya, Petar.Avramovic, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69581
2019-10-30 00:55:49 +08:00
|
|
|
.setMIFlags(Flags);
|
|
|
|
|
|
|
|
auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
|
|
|
|
.addUse(DenominatorScaled.getReg(0))
|
|
|
|
.setMIFlags(Flags);
|
|
|
|
auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
|
|
|
|
|
|
|
|
// FIXME: Doesn't correctly model the FP mode switch, and the FP operations
|
|
|
|
// aren't modeled as reading it.
|
2019-12-03 15:01:21 +08:00
|
|
|
if (!Mode.allFP32Denormals())
|
2019-11-02 00:44:56 +08:00
|
|
|
toggleSPDenormMode(true, B, ST, Mode);
|
AMDGPU/GlobalISel: Legalize FDIV32
Reviewers: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, dstuttard, tpr, t-tye, hiraditya, Petar.Avramovic, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69581
2019-10-30 00:55:49 +08:00
|
|
|
|
|
|
|
auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
|
|
|
|
auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
|
|
|
|
auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
|
|
|
|
auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
|
|
|
|
auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
|
|
|
|
auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
|
|
|
|
|
2019-12-03 15:01:21 +08:00
|
|
|
if (!Mode.allFP32Denormals())
|
2019-11-02 00:44:56 +08:00
|
|
|
toggleSPDenormMode(false, B, ST, Mode);
|
AMDGPU/GlobalISel: Legalize FDIV32
Reviewers: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, dstuttard, tpr, t-tye, hiraditya, Petar.Avramovic, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69581
2019-10-30 00:55:49 +08:00
|
|
|
|
|
|
|
auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
|
|
|
|
.addUse(Fma4.getReg(0))
|
|
|
|
.addUse(Fma1.getReg(0))
|
|
|
|
.addUse(Fma3.getReg(0))
|
|
|
|
.addUse(NumeratorScaled.getReg(1))
|
|
|
|
.setMIFlags(Flags);
|
|
|
|
|
|
|
|
B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
|
|
|
|
.addUse(Fmas.getReg(0))
|
|
|
|
.addUse(RHS)
|
|
|
|
.addUse(LHS)
|
|
|
|
.setMIFlags(Flags);
|
|
|
|
|
|
|
|
MI.eraseFromParent();
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
AMDGPU/GlobalISel: Legalize FDIV64
Reviewers: arsenm
Reviewed By: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, dstuttard, tpr, t-tye, hiraditya, Petar.Avramovic, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D70403
2019-11-18 08:43:59 +08:00
|
|
|
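// f64 division uses the same div_scale / div_fmas / div_fixup structure as
// the f32 path, with additional FMA refinement steps for the wider type. On
// subtargets where div_scale's condition output is unusable, the scale flag
// is recomputed by comparing the high halves of the operands against the
// scaled values.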
bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
|
|
|
|
MachineRegisterInfo &MRI,
|
|
|
|
MachineIRBuilder &B) const {
|
|
|
|
B.setInstr(MI);
|
|
|
|
Register Res = MI.getOperand(0).getReg();
|
|
|
|
Register LHS = MI.getOperand(1).getReg();
|
|
|
|
Register RHS = MI.getOperand(2).getReg();
|
|
|
|
|
|
|
|
uint16_t Flags = MI.getFlags();
|
|
|
|
|
|
|
|
LLT S64 = LLT::scalar(64);
|
|
|
|
LLT S1 = LLT::scalar(1);
|
|
|
|
|
|
|
|
auto One = B.buildFConstant(S64, 1.0);
|
|
|
|
|
|
|
|
auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
|
|
|
|
.addUse(LHS)
|
2019-12-21 00:09:19 +08:00
|
|
|
.addUse(RHS)
|
2020-04-13 14:25:18 +08:00
|
|
|
.addImm(0)
|
AMDGPU/GlobalISel: Legalize FDIV64
Reviewers: arsenm
Reviewed By: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, dstuttard, tpr, t-tye, hiraditya, Petar.Avramovic, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D70403
2019-11-18 08:43:59 +08:00
|
|
|
.setMIFlags(Flags);
|
|
|
|
|
|
|
|
auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
|
|
|
|
|
|
|
|
auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
|
|
|
|
.addUse(DivScale0.getReg(0))
|
|
|
|
.setMIFlags(Flags);
|
|
|
|
|
|
|
|
auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
|
|
|
|
auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
|
|
|
|
auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
|
|
|
|
|
|
|
|
auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
|
|
|
|
.addUse(LHS)
|
|
|
|
.addUse(RHS)
|
2020-04-13 14:25:18 +08:00
|
|
|
.addImm(1)
|
AMDGPU/GlobalISel: Legalize FDIV64
Reviewers: arsenm
Reviewed By: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, dstuttard, tpr, t-tye, hiraditya, Petar.Avramovic, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D70403
2019-11-18 08:43:59 +08:00
|
|
|
.setMIFlags(Flags);
|
|
|
|
|
|
|
|
auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
|
2020-05-19 02:53:26 +08:00
|
|
|
auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
|
AMDGPU/GlobalISel: Legalize FDIV64
Reviewers: arsenm
Reviewed By: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, dstuttard, tpr, t-tye, hiraditya, Petar.Avramovic, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D70403
2019-11-18 08:43:59 +08:00
|
|
|
auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
|
|
|
|
|
|
|
|
Register Scale;
|
|
|
|
if (!ST.hasUsableDivScaleConditionOutput()) {
|
|
|
|
// Workaround a hardware bug on SI where the condition output from div_scale
|
|
|
|
// is not usable.
|
|
|
|
|
|
|
|
LLT S32 = LLT::scalar(32);
|
|
|
|
|
|
|
|
auto NumUnmerge = B.buildUnmerge(S32, LHS);
|
|
|
|
auto DenUnmerge = B.buildUnmerge(S32, RHS);
|
|
|
|
auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
|
|
|
|
auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
|
|
|
|
|
|
|
|
auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
|
|
|
|
Scale1Unmerge.getReg(1));
|
|
|
|
auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
|
|
|
|
Scale0Unmerge.getReg(1));
|
[GlobalISel] Tidy up unnecessary calls to createGenericVirtualRegister
Summary:
As a side effect some redundant copies of constant values are removed by
CSEMIRBuilder.
Reviewers: aemerson, arsenm, dsanders, aditya_nandakumar
Subscribers: sdardis, jvesely, wdng, nhaehnle, rovka, hiraditya, jrtc27, atanasyan, volkan, Petar.Avramovic, kerbowa, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D73789
2020-01-31 20:40:31 +08:00
|
|
|
Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
|
AMDGPU/GlobalISel: Legalize FDIV64
Reviewers: arsenm
Reviewed By: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, dstuttard, tpr, t-tye, hiraditya, Petar.Avramovic, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D70403
2019-11-18 08:43:59 +08:00
|
|
|
} else {
|
|
|
|
Scale = DivScale1.getReg(1);
|
|
|
|
}
|
|
|
|
|
|
|
|
auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
|
|
|
|
.addUse(Fma4.getReg(0))
|
|
|
|
.addUse(Fma3.getReg(0))
|
|
|
|
.addUse(Mul.getReg(0))
|
|
|
|
.addUse(Scale)
|
|
|
|
.setMIFlags(Flags);
|
|
|
|
|
2019-12-25 20:48:37 +08:00
|
|
|
B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
|
AMDGPU/GlobalISel: Legalize FDIV64
Reviewers: arsenm
Reviewed By: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, dstuttard, tpr, t-tye, hiraditya, Petar.Avramovic, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D70403
2019-11-18 08:43:59 +08:00
|
|
|
.addUse(Fmas.getReg(0))
|
|
|
|
.addUse(RHS)
|
|
|
|
.addUse(LHS)
|
|
|
|
.setMIFlags(Flags);
|
|
|
|
|
|
|
|
MI.eraseFromParent();
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
AMDGPU/GlobalISel: Legalize fast unsafe FDIV
Reviewers: arsenm
Reviewed By: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, dstuttard, tpr, t-tye, hiraditya, Petar.Avramovic, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69231
llvm-svn: 375460
2019-10-22 06:18:26 +08:00
|
|
|
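// llvm.amdgcn.fdiv.fast: when |RHS| is very large, pre-scale the denominator
// by 2**(-32) so the reciprocal does not overflow, then multiply the quotient
// estimate by the same scale factor to compensate. This trades accuracy for
// speed and does not go through div_fixup.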
bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
|
|
|
|
MachineRegisterInfo &MRI,
|
|
|
|
MachineIRBuilder &B) const {
|
[AMDGPU/GlobalISel] Add llvm.amdgcn.fdiv.fast legalization.
Reviewers: arsenm
Reviewed By: arsenm
Subscribers: volkan, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, dstuttard, tpr, t-tye, hiraditya, Petar.Avramovic, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D64966
llvm-svn: 367344
2019-07-31 02:49:16 +08:00
|
|
|
B.setInstr(MI);
|
|
|
|
Register Res = MI.getOperand(0).getReg();
|
|
|
|
Register LHS = MI.getOperand(2).getReg();
|
|
|
|
Register RHS = MI.getOperand(3).getReg();
|
|
|
|
uint16_t Flags = MI.getFlags();
|
|
|
|
|
|
|
|
LLT S32 = LLT::scalar(32);
|
|
|
|
LLT S1 = LLT::scalar(1);
|
|
|
|
|
|
|
|
auto Abs = B.buildFAbs(S32, RHS, Flags);
|
|
|
|
const APFloat C0Val(1.0f);
|
|
|
|
|
|
|
|
auto C0 = B.buildConstant(S32, 0x6f800000);
|
|
|
|
auto C1 = B.buildConstant(S32, 0x2f800000);
|
|
|
|
auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
|
|
|
|
|
|
|
|
auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
|
|
|
|
auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
|
|
|
|
|
|
|
|
auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
|
|
|
|
|
|
|
|
auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
|
|
|
|
.addUse(Mul0.getReg(0))
|
|
|
|
.setMIFlags(Flags);
|
|
|
|
|
|
|
|
auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
|
|
|
|
|
|
|
|
B.buildFMul(Res, Sel, Mul1, Flags);
|
|
|
|
|
|
|
|
MI.eraseFromParent();
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2019-07-02 02:49:01 +08:00
|
|
|
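// For kernels the implicit argument pointer is formed by offsetting the
// preloaded kernarg segment pointer past the explicit kernel arguments; for
// callable functions it is taken from the preloaded IMPLICIT_ARG_PTR input.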
bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
|
|
|
|
MachineRegisterInfo &MRI,
|
|
|
|
MachineIRBuilder &B) const {
|
|
|
|
const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
|
|
|
|
if (!MFI->isEntryFunction()) {
|
|
|
|
return legalizePreloadedArgIntrin(MI, MRI, B,
|
|
|
|
AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
|
|
|
|
}
|
|
|
|
|
|
|
|
B.setInstr(MI);
|
|
|
|
|
|
|
|
uint64_t Offset =
|
|
|
|
ST.getTargetLowering()->getImplicitParameterOffset(
|
|
|
|
B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
|
|
|
|
Register DstReg = MI.getOperand(0).getReg();
|
|
|
|
LLT DstTy = MRI.getType(DstReg);
|
|
|
|
LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
|
|
|
|
|
|
|
|
const ArgDescriptor *Arg;
|
|
|
|
const TargetRegisterClass *RC;
|
|
|
|
std::tie(Arg, RC)
|
|
|
|
= MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
|
|
|
|
if (!Arg)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
|
|
|
|
if (!loadInputValue(KernargPtrReg, B, Arg))
|
|
|
|
return false;
|
|
|
|
|
[globalisel] Rename G_GEP to G_PTR_ADD
Summary:
G_GEP is rather poorly named. It's a simple pointer+scalar addition and
doesn't support any of the complexities of getelementptr. I therefore
propose that we rename it. There's a G_PTR_MASK so let's follow that
convention and go with G_PTR_ADD
Reviewers: volkan, aditya_nandakumar, bogner, rovka, arsenm
Subscribers: sdardis, jvesely, wdng, nhaehnle, hiraditya, jrtc27, atanasyan, arphaman, Petar.Avramovic, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69734
2019-11-02 04:18:00 +08:00
|
|
|
B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
|
2019-07-02 02:49:01 +08:00
|
|
|
MI.eraseFromParent();
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2019-09-05 10:20:39 +08:00
|
|
|
bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
|
|
|
|
MachineRegisterInfo &MRI,
|
|
|
|
MachineIRBuilder &B,
|
|
|
|
unsigned AddrSpace) const {
|
|
|
|
B.setInstr(MI);
|
|
|
|
Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
|
|
|
|
auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
|
|
|
|
B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
|
|
|
|
MI.eraseFromParent();
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2019-09-19 12:29:20 +08:00
|
|
|
// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
|
|
|
|
// offset (the offset that is included in bounds checking and swizzling, to be
|
|
|
|
// split between the instruction's voffset and immoffset fields) and soffset
|
|
|
|
// (the offset that is excluded from bounds checking and swizzling, to go in
|
|
|
|
// the instruction's soffset field). This function takes the first kind of
|
|
|
|
// offset and figures out how to split it between voffset and immoffset.
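//
// For example, a combined offset of 5000 is split into an immoffset of 904
// and a voffset contribution of 4096, keeping the voffset part a multiple of
// 4096 so it is more likely to be CSEd across neighbouring accesses.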
|
|
|
|
std::tuple<Register, unsigned, unsigned>
|
|
|
|
AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
|
|
|
|
Register OrigOffset) const {
|
|
|
|
const unsigned MaxImm = 4095;
|
|
|
|
Register BaseReg;
|
|
|
|
unsigned TotalConstOffset;
|
|
|
|
MachineInstr *OffsetDef;
|
|
|
|
const LLT S32 = LLT::scalar(32);
|
|
|
|
|
|
|
|
std::tie(BaseReg, TotalConstOffset, OffsetDef)
|
|
|
|
= AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
|
|
|
|
|
|
|
|
unsigned ImmOffset = TotalConstOffset;
|
|
|
|
|
|
|
|
// If the immediate value is too big for the immoffset field, put only the low
|
|
|
|
// 12 bits into the immoffset field so that the value that is copied/added
|
|
|
|
// for the voffset field is a multiple of 4096, and it stands a better chance
|
|
|
|
// of being CSEd with the copy/add for another similar load/store.
|
|
|
|
// However, do not do that rounding down to a multiple of 4096 if that is a
|
|
|
|
// negative number, as it appears to be illegal to have a negative offset
|
|
|
|
// in the vgpr, even if adding the immediate offset makes it positive.
|
|
|
|
unsigned Overflow = ImmOffset & ~MaxImm;
|
|
|
|
ImmOffset -= Overflow;
|
|
|
|
if ((int32_t)Overflow < 0) {
|
|
|
|
Overflow += ImmOffset;
|
|
|
|
ImmOffset = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (Overflow != 0) {
|
|
|
|
if (!BaseReg) {
|
|
|
|
BaseReg = B.buildConstant(S32, Overflow).getReg(0);
|
|
|
|
} else {
|
|
|
|
auto OverflowVal = B.buildConstant(S32, Overflow);
|
|
|
|
BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!BaseReg)
|
|
|
|
BaseReg = B.buildConstant(S32, 0).getReg(0);
|
|
|
|
|
|
|
|
return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
|
|
|
|
}
|
|
|
|
|
2019-09-20 00:26:14 +08:00
|
|
|
/// Handle register layout difference for f16 images for some subtargets.
|
|
|
|
Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
|
|
|
|
MachineRegisterInfo &MRI,
|
|
|
|
Register Reg) const {
|
|
|
|
if (!ST.hasUnpackedD16VMem())
|
|
|
|
return Reg;
|
|
|
|
|
|
|
|
const LLT S16 = LLT::scalar(16);
|
|
|
|
const LLT S32 = LLT::scalar(32);
|
|
|
|
LLT StoreVT = MRI.getType(Reg);
|
|
|
|
assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
|
|
|
|
|
|
|
|
auto Unmerge = B.buildUnmerge(S16, Reg);
|
|
|
|
|
|
|
|
SmallVector<Register, 4> WideRegs;
|
|
|
|
for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
|
|
|
|
WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
|
|
|
|
|
|
|
|
int NumElts = StoreVT.getNumElements();
|
|
|
|
|
|
|
|
return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
|
|
|
|
}
|
|
|
|
|
2020-01-14 09:39:09 +08:00
|
|
|
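// Widen buffer store sources of illegal width: i8/i16 scalars are any-extended
// to i32, and short f16 vectors are repacked on subtargets that use the
// unpacked D16 register layout.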
Register AMDGPULegalizerInfo::fixStoreSourceType(
|
|
|
|
MachineIRBuilder &B, Register VData, bool IsFormat) const {
|
|
|
|
MachineRegisterInfo *MRI = B.getMRI();
|
|
|
|
LLT Ty = MRI->getType(VData);
|
2019-09-20 00:26:14 +08:00
|
|
|
|
|
|
|
const LLT S16 = LLT::scalar(16);
|
|
|
|
|
|
|
|
// Fixup illegal register types for i8 stores.
|
|
|
|
if (Ty == LLT::scalar(8) || Ty == S16) {
|
|
|
|
Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
|
2020-01-14 09:39:09 +08:00
|
|
|
return AnyExt;
|
2019-09-20 00:26:14 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
if (Ty.isVector()) {
|
|
|
|
if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
|
|
|
|
if (IsFormat)
|
2020-01-14 09:39:09 +08:00
|
|
|
return handleD16VData(B, *MRI, VData);
|
2019-09-20 00:26:14 +08:00
|
|
|
}
|
2020-01-14 09:39:09 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
return VData;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
|
|
|
|
MachineRegisterInfo &MRI,
|
|
|
|
MachineIRBuilder &B,
|
|
|
|
bool IsTyped,
|
|
|
|
bool IsFormat) const {
|
|
|
|
B.setInstr(MI);
|
|
|
|
|
|
|
|
Register VData = MI.getOperand(1).getReg();
|
|
|
|
LLT Ty = MRI.getType(VData);
|
|
|
|
LLT EltTy = Ty.getScalarType();
|
|
|
|
const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
|
|
|
|
const LLT S32 = LLT::scalar(32);
|
|
|
|
|
|
|
|
VData = fixStoreSourceType(B, VData, IsFormat);
|
|
|
|
Register RSrc = MI.getOperand(2).getReg();
|
|
|
|
|
|
|
|
MachineMemOperand *MMO = *MI.memoperands_begin();
|
|
|
|
const int MemSize = MMO->getSize();
|
2019-09-20 00:26:14 +08:00
|
|
|
|
2020-01-14 09:39:09 +08:00
|
|
|
unsigned ImmOffset;
|
|
|
|
unsigned TotalOffset;
|
|
|
|
|
|
|
|
// The typed intrinsics add an immediate after the registers.
|
|
|
|
const unsigned NumVIndexOps = IsTyped ? 8 : 7;
|
|
|
|
|
|
|
|
// The struct intrinsic variants add one additional operand over raw.
|
|
|
|
const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
|
|
|
|
Register VIndex;
|
|
|
|
int OpOffset = 0;
|
|
|
|
if (HasVIndex) {
|
|
|
|
VIndex = MI.getOperand(3).getReg();
|
|
|
|
OpOffset = 1;
|
2019-09-20 00:26:14 +08:00
|
|
|
}
|
|
|
|
|
2020-01-14 09:39:09 +08:00
|
|
|
Register VOffset = MI.getOperand(3 + OpOffset).getReg();
|
|
|
|
Register SOffset = MI.getOperand(4 + OpOffset).getReg();
|
|
|
|
|
|
|
|
unsigned Format = 0;
|
|
|
|
if (IsTyped) {
|
|
|
|
Format = MI.getOperand(5 + OpOffset).getImm();
|
|
|
|
++OpOffset;
|
|
|
|
}
|
|
|
|
|
|
|
|
unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
|
|
|
|
|
|
|
|
std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
|
|
|
|
if (TotalOffset != 0)
|
|
|
|
MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
|
|
|
|
|
|
|
|
unsigned Opc;
|
|
|
|
if (IsTyped) {
|
|
|
|
Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
|
|
|
|
AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
|
|
|
|
} else if (IsFormat) {
|
|
|
|
Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
|
|
|
|
AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
|
|
|
|
} else {
|
|
|
|
switch (MemSize) {
|
|
|
|
case 1:
|
|
|
|
Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
|
|
|
|
break;
|
|
|
|
case 2:
|
|
|
|
Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!VIndex)
|
|
|
|
VIndex = B.buildConstant(S32, 0).getReg(0);
|
|
|
|
|
|
|
|
auto MIB = B.buildInstr(Opc)
|
|
|
|
.addUse(VData) // vdata
|
|
|
|
.addUse(RSrc) // rsrc
|
|
|
|
.addUse(VIndex) // vindex
|
|
|
|
.addUse(VOffset) // voffset
|
|
|
|
.addUse(SOffset) // soffset
|
|
|
|
.addImm(ImmOffset); // offset(imm)
|
|
|
|
|
|
|
|
if (IsTyped)
|
|
|
|
MIB.addImm(Format);
|
|
|
|
|
|
|
|
MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
|
|
|
|
.addImm(HasVIndex ? -1 : 0) // idxen(imm)
|
|
|
|
.addMemOperand(MMO);
|
|
|
|
|
|
|
|
MI.eraseFromParent();
|
|
|
|
return true;
|
2019-09-20 00:26:14 +08:00
|
|
|
}
|
|
|
|
|
2020-01-14 05:02:14 +08:00
|
|
|
bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
|
|
|
|
MachineRegisterInfo &MRI,
|
|
|
|
MachineIRBuilder &B,
|
2020-01-14 07:18:56 +08:00
|
|
|
bool IsFormat,
|
|
|
|
bool IsTyped) const {
|
2019-09-19 12:29:20 +08:00
|
|
|
B.setInstr(MI);
|
|
|
|
|
|
|
|
// FIXME: Verifier should enforce 1 MMO for these intrinsics.
|
|
|
|
MachineMemOperand *MMO = *MI.memoperands_begin();
|
|
|
|
const int MemSize = MMO->getSize();
|
|
|
|
const LLT S32 = LLT::scalar(32);
|
|
|
|
|
|
|
|
Register Dst = MI.getOperand(0).getReg();
|
|
|
|
Register RSrc = MI.getOperand(2).getReg();
|
2020-01-14 05:02:14 +08:00
|
|
|
|
2020-01-14 07:18:56 +08:00
|
|
|
// The typed intrinsics add an immediate after the registers.
|
|
|
|
const unsigned NumVIndexOps = IsTyped ? 8 : 7;
|
|
|
|
|
2020-01-14 05:02:14 +08:00
|
|
|
// The struct intrinsic variants add one additional operand over raw.
|
2020-01-14 07:18:56 +08:00
|
|
|
const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
|
2020-01-14 05:02:14 +08:00
|
|
|
Register VIndex;
|
|
|
|
int OpOffset = 0;
|
|
|
|
if (HasVIndex) {
|
|
|
|
VIndex = MI.getOperand(3).getReg();
|
|
|
|
OpOffset = 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
Register VOffset = MI.getOperand(3 + OpOffset).getReg();
|
|
|
|
Register SOffset = MI.getOperand(4 + OpOffset).getReg();
|
2020-01-14 07:18:56 +08:00
|
|
|
|
|
|
|
unsigned Format = 0;
|
|
|
|
if (IsTyped) {
|
|
|
|
Format = MI.getOperand(5 + OpOffset).getImm();
|
|
|
|
++OpOffset;
|
|
|
|
}
|
|
|
|
|
2020-01-14 05:02:14 +08:00
|
|
|
unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
|
2019-09-19 12:29:20 +08:00
|
|
|
unsigned ImmOffset;
|
|
|
|
unsigned TotalOffset;
|
|
|
|
|
2020-01-09 11:35:23 +08:00
|
|
|
LLT Ty = MRI.getType(Dst);
|
|
|
|
LLT EltTy = Ty.getScalarType();
|
|
|
|
const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
|
|
|
|
const bool Unpacked = ST.hasUnpackedD16VMem();
|
|
|
|
|
2019-09-19 12:29:20 +08:00
|
|
|
std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
|
|
|
|
if (TotalOffset != 0)
|
|
|
|
MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
|
|
|
|
|
|
|
|
unsigned Opc;
|
2020-01-14 07:18:56 +08:00
|
|
|
|
|
|
|
if (IsTyped) {
|
|
|
|
Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
|
|
|
|
AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
|
|
|
|
} else if (IsFormat) {
|
2020-01-09 11:35:23 +08:00
|
|
|
Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
|
|
|
|
AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
|
|
|
|
} else {
|
|
|
|
switch (MemSize) {
|
|
|
|
case 1:
|
|
|
|
Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
|
|
|
|
break;
|
|
|
|
case 2:
|
|
|
|
Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
|
|
|
|
break;
|
|
|
|
}
|
2019-09-19 12:29:20 +08:00
|
|
|
}
|
|
|
|
|
2020-01-09 11:35:23 +08:00
|
|
|
Register LoadDstReg;
|
|
|
|
|
|
|
|
bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
|
|
|
|
LLT UnpackedTy = Ty.changeElementSize(32);
|
|
|
|
|
|
|
|
if (IsExtLoad)
|
|
|
|
LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
|
|
|
|
else if (Unpacked && IsD16 && Ty.isVector())
|
|
|
|
LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
|
|
|
|
else
|
|
|
|
LoadDstReg = Dst;
|
2019-09-19 12:29:20 +08:00
|
|
|
|
2020-01-14 05:02:14 +08:00
|
|
|
if (!VIndex)
|
|
|
|
VIndex = B.buildConstant(S32, 0).getReg(0);
|
2019-09-19 12:29:20 +08:00
|
|
|
|
2020-01-14 07:18:56 +08:00
|
|
|
auto MIB = B.buildInstr(Opc)
|
2020-01-14 05:02:14 +08:00
|
|
|
.addDef(LoadDstReg) // vdata
|
|
|
|
.addUse(RSrc) // rsrc
|
|
|
|
.addUse(VIndex) // vindex
|
|
|
|
.addUse(VOffset) // voffset
|
|
|
|
.addUse(SOffset) // soffset
|
2020-01-14 07:18:56 +08:00
|
|
|
.addImm(ImmOffset); // offset(imm)
|
|
|
|
|
|
|
|
if (IsTyped)
|
|
|
|
MIB.addImm(Format);
|
|
|
|
|
|
|
|
MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
|
|
|
|
.addImm(HasVIndex ? -1 : 0) // idxen(imm)
|
|
|
|
.addMemOperand(MMO);
|
2019-09-19 12:29:20 +08:00
|
|
|
|
|
|
|
if (LoadDstReg != Dst) {
|
|
|
|
B.setInsertPt(B.getMBB(), ++B.getInsertPt());
|
2020-01-09 11:35:23 +08:00
|
|
|
|
|
|
|
// The result register was widened for extending loads; truncate it back.
|
|
|
|
if (IsExtLoad)
|
|
|
|
B.buildTrunc(Dst, LoadDstReg);
|
|
|
|
else {
|
|
|
|
// Repack to original 16-bit vector result
|
|
|
|
// FIXME: G_TRUNC should work, but legalization currently fails
|
|
|
|
auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
|
|
|
|
SmallVector<Register, 4> Repack;
|
|
|
|
for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
|
|
|
|
Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
|
|
|
|
B.buildMerge(Dst, Repack);
|
|
|
|
}
|
2019-09-19 12:29:20 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
MI.eraseFromParent();
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2020-01-18 09:51:01 +08:00
|
|
|
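// amdgcn.atomic.inc/dec are re-emitted as the G_AMDGPU_ATOMIC_INC/DEC
// pseudos, cloning the original memory operands.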
bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
|
|
|
|
MachineIRBuilder &B,
|
|
|
|
bool IsInc) const {
|
|
|
|
B.setInstr(MI);
|
|
|
|
unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
|
|
|
|
AMDGPU::G_AMDGPU_ATOMIC_DEC;
|
|
|
|
B.buildInstr(Opc)
|
|
|
|
.addDef(MI.getOperand(0).getReg())
|
|
|
|
.addUse(MI.getOperand(2).getReg())
|
|
|
|
.addUse(MI.getOperand(3).getReg())
|
|
|
|
.cloneMemRefs(MI);
|
|
|
|
MI.eraseFromParent();
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2020-01-14 12:17:59 +08:00
|
|
|
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
|
|
|
|
switch (IntrID) {
|
|
|
|
case Intrinsic::amdgcn_raw_buffer_atomic_swap:
|
|
|
|
case Intrinsic::amdgcn_struct_buffer_atomic_swap:
|
|
|
|
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
|
|
|
|
case Intrinsic::amdgcn_raw_buffer_atomic_add:
|
|
|
|
case Intrinsic::amdgcn_struct_buffer_atomic_add:
|
|
|
|
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
|
|
|
|
case Intrinsic::amdgcn_raw_buffer_atomic_sub:
|
|
|
|
case Intrinsic::amdgcn_struct_buffer_atomic_sub:
|
|
|
|
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
|
|
|
|
case Intrinsic::amdgcn_raw_buffer_atomic_smin:
|
|
|
|
case Intrinsic::amdgcn_struct_buffer_atomic_smin:
|
|
|
|
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
|
|
|
|
case Intrinsic::amdgcn_raw_buffer_atomic_umin:
|
|
|
|
case Intrinsic::amdgcn_struct_buffer_atomic_umin:
|
|
|
|
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
|
|
|
|
case Intrinsic::amdgcn_raw_buffer_atomic_smax:
|
|
|
|
case Intrinsic::amdgcn_struct_buffer_atomic_smax:
|
|
|
|
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
|
|
|
|
case Intrinsic::amdgcn_raw_buffer_atomic_umax:
|
|
|
|
case Intrinsic::amdgcn_struct_buffer_atomic_umax:
|
|
|
|
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
|
|
|
|
case Intrinsic::amdgcn_raw_buffer_atomic_and:
|
|
|
|
case Intrinsic::amdgcn_struct_buffer_atomic_and:
|
|
|
|
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
|
|
|
|
case Intrinsic::amdgcn_raw_buffer_atomic_or:
|
|
|
|
case Intrinsic::amdgcn_struct_buffer_atomic_or:
|
|
|
|
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
|
|
|
|
case Intrinsic::amdgcn_raw_buffer_atomic_xor:
|
|
|
|
case Intrinsic::amdgcn_struct_buffer_atomic_xor:
|
|
|
|
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
|
|
|
|
case Intrinsic::amdgcn_raw_buffer_atomic_inc:
|
|
|
|
case Intrinsic::amdgcn_struct_buffer_atomic_inc:
|
|
|
|
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
|
|
|
|
case Intrinsic::amdgcn_raw_buffer_atomic_dec:
|
|
|
|
case Intrinsic::amdgcn_struct_buffer_atomic_dec:
|
|
|
|
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
|
2020-01-14 22:29:05 +08:00
|
|
|
case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
|
|
|
|
case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
|
|
|
|
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
|
2020-01-14 12:17:59 +08:00
|
|
|
default:
|
|
|
|
llvm_unreachable("unhandled atomic opcode");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
|
|
|
|
MachineIRBuilder &B,
|
|
|
|
Intrinsic::ID IID) const {
|
|
|
|
B.setInstr(MI);
|
|
|
|
|
|
|
|
const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
|
|
|
|
IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
|
|
|
|
|
|
|
|
Register Dst = MI.getOperand(0).getReg();
|
|
|
|
Register VData = MI.getOperand(2).getReg();
|
|
|
|
|
|
|
|
Register CmpVal;
|
|
|
|
int OpOffset = 0;
|
|
|
|
|
|
|
|
if (IsCmpSwap) {
|
|
|
|
CmpVal = MI.getOperand(3 + OpOffset).getReg();
|
|
|
|
++OpOffset;
|
|
|
|
}
|
|
|
|
|
|
|
|
Register RSrc = MI.getOperand(3 + OpOffset).getReg();
|
|
|
|
const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
|
|
|
|
|
|
|
|
// The struct intrinsic variants add one additional operand over raw.
|
|
|
|
const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
|
|
|
|
Register VIndex;
|
|
|
|
if (HasVIndex) {
|
2020-01-14 22:29:05 +08:00
|
|
|
VIndex = MI.getOperand(4 + OpOffset).getReg();
|
2020-01-14 12:17:59 +08:00
|
|
|
++OpOffset;
|
|
|
|
}
|
|
|
|
|
|
|
|
Register VOffset = MI.getOperand(4 + OpOffset).getReg();
|
|
|
|
Register SOffset = MI.getOperand(5 + OpOffset).getReg();
|
|
|
|
unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
|
|
|
|
|
|
|
|
MachineMemOperand *MMO = *MI.memoperands_begin();
|
|
|
|
|
|
|
|
unsigned ImmOffset;
|
|
|
|
unsigned TotalOffset;
|
|
|
|
std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
|
|
|
|
if (TotalOffset != 0)
|
|
|
|
MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
|
|
|
|
|
|
|
|
if (!VIndex)
|
|
|
|
VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
|
|
|
|
|
|
|
|
auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
|
|
|
|
.addDef(Dst)
|
|
|
|
.addUse(VData); // vdata
|
|
|
|
|
|
|
|
if (IsCmpSwap)
|
|
|
|
MIB.addReg(CmpVal);
|
|
|
|
|
|
|
|
MIB.addUse(RSrc) // rsrc
|
|
|
|
.addUse(VIndex) // vindex
|
|
|
|
.addUse(VOffset) // voffset
|
|
|
|
.addUse(SOffset) // soffset
|
|
|
|
.addImm(ImmOffset) // offset(imm)
|
|
|
|
.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
|
|
|
|
.addImm(HasVIndex ? -1 : 0) // idxen(imm)
|
|
|
|
.addMemOperand(MMO);
|
|
|
|
|
|
|
|
MI.eraseFromParent();
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2020-01-27 03:39:56 +08:00
|
|
|
/// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized
|
|
|
|
/// vector with s16 typed elements.
|
2020-02-08 08:10:02 +08:00
|
|
|
static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
|
2020-01-27 03:39:56 +08:00
|
|
|
SmallVectorImpl<Register> &PackedAddrs,
|
2020-02-08 08:10:02 +08:00
|
|
|
int AddrIdx, int DimIdx, int NumVAddrs,
|
|
|
|
int NumGradients) {
|
2020-01-27 03:39:56 +08:00
|
|
|
const LLT S16 = LLT::scalar(16);
|
|
|
|
const LLT V2S16 = LLT::vector(2, 16);
|
|
|
|
|
2020-02-08 08:10:02 +08:00
|
|
|
for (int I = AddrIdx; I < AddrIdx + NumVAddrs; ++I) {
|
2020-02-08 22:28:27 +08:00
|
|
|
MachineOperand &SrcOp = MI.getOperand(I);
|
|
|
|
if (!SrcOp.isReg())
|
|
|
|
continue; // _L to _LZ may have eliminated this.
|
|
|
|
|
|
|
|
Register AddrReg = SrcOp.getReg();
|
2020-01-27 03:39:56 +08:00
|
|
|
|
2020-02-08 08:10:02 +08:00
|
|
|
if (I < DimIdx) {
|
|
|
|
AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
|
|
|
|
PackedAddrs.push_back(AddrReg);
|
|
|
|
} else {
|
|
|
|
// Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
|
|
|
|
// derivatives dx/dh and dx/dv are packed with undef.
|
|
|
|
if (((I + 1) >= (AddrIdx + NumVAddrs)) ||
|
|
|
|
((NumGradients / 2) % 2 == 1 &&
|
|
|
|
(I == DimIdx + (NumGradients / 2) - 1 ||
|
2020-02-08 22:28:27 +08:00
|
|
|
I == DimIdx + NumGradients - 1)) ||
|
|
|
|
// Check for _L to _LZ optimization
|
|
|
|
!MI.getOperand(I + 1).isReg()) {
|
2020-02-08 08:10:02 +08:00
|
|
|
PackedAddrs.push_back(
|
|
|
|
B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
|
|
|
|
.getReg(0));
|
|
|
|
} else {
|
|
|
|
PackedAddrs.push_back(
|
|
|
|
B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
|
|
|
|
.getReg(0));
|
|
|
|
++I;
|
|
|
|
}
|
|
|
|
}
|
2020-01-27 03:39:56 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-01-27 09:02:51 +08:00
|
|
|
/// Convert from separate vaddr components to a single vector address register,
|
|
|
|
/// and replace the remaining operands with $noreg.
|
|
|
|
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
|
|
|
|
int DimIdx, int NumVAddrs) {
|
2020-02-08 22:28:27 +08:00
|
|
|
const LLT S32 = LLT::scalar(32);
|
|
|
|
|
|
|
|
SmallVector<Register, 8> AddrRegs;
|
2020-01-27 09:02:51 +08:00
|
|
|
for (int I = 0; I != NumVAddrs; ++I) {
|
2020-02-08 22:28:27 +08:00
|
|
|
MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
|
|
|
|
if (SrcOp.isReg()) {
|
|
|
|
AddrRegs.push_back(SrcOp.getReg());
|
|
|
|
assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
|
|
|
|
}
|
2020-01-27 09:02:51 +08:00
|
|
|
}
|
|
|
|
|
2020-02-09 05:53:04 +08:00
|
|
|
int NumAddrRegs = AddrRegs.size();
|
|
|
|
if (NumAddrRegs != 1) {
|
|
|
|
// Round up to 8 elements for v5-v7
|
|
|
|
// FIXME: Missing intermediate sized register classes and instructions.
|
|
|
|
if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
|
|
|
|
const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
|
|
|
|
auto Undef = B.buildUndef(S32);
|
|
|
|
AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
|
|
|
|
NumAddrRegs = RoundedNumRegs;
|
|
|
|
}
|
|
|
|
|
|
|
|
auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
|
2020-02-08 22:28:27 +08:00
|
|
|
MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
|
|
|
|
}
|
|
|
|
|
|
|
|
for (int I = 1; I != NumVAddrs; ++I) {
|
|
|
|
MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
|
|
|
|
if (SrcOp.isReg())
|
|
|
|
MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
|
|
|
|
}
|
2020-01-27 09:02:51 +08:00
|
|
|
}
|
|
|
|
|
2020-01-27 03:39:56 +08:00
|
|
|
/// Rewrite image intrinsics to use register layouts expected by the subtarget.
|
|
|
|
///
|
|
|
|
/// Depending on the subtarget, load/store with 16-bit element data need to be
|
|
|
|
/// rewritten to use the low half of 32-bit registers, or directly use a packed
|
|
|
|
/// layout. 16-bit addresses should also sometimes be packed into 32-bit
|
|
|
|
/// registers.
|
|
|
|
///
|
|
|
|
/// We don't want to directly select image instructions just yet, but also want
|
|
|
|
/// to expose all register repacking to the legalizer/combiners. We also don't
|
|
|
|
/// want a selected instruction entering RegBankSelect. In order to avoid
|
|
|
|
/// defining a multitude of intermediate image instructions, directly hack on
|
|
|
|
/// the intrinsic's arguments. In cases like a16 addresses, this requires padding
|
|
|
|
/// now unnecessary arguments with $noreg.
|
2020-01-16 03:23:20 +08:00
|
|
|
bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
|
2020-01-28 04:50:55 +08:00
|
|
|
MachineInstr &MI, MachineIRBuilder &B,
|
2020-01-16 03:23:20 +08:00
|
|
|
GISelChangeObserver &Observer,
|
|
|
|
const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
|
2020-03-18 22:11:09 +08:00
|
|
|
B.setInstr(MI);
|
|
|
|
|
2020-01-27 03:39:56 +08:00
|
|
|
const int NumDefs = MI.getNumExplicitDefs();
|
|
|
|
bool IsTFE = NumDefs == 2;
|
2020-01-16 03:23:20 +08:00
|
|
|
// We are only processing the operands of d16 image operations on subtargets
|
2020-01-18 03:16:58 +08:00
|
|
|
// that use the unpacked register layout, or need to repack the TFE result.
|
|
|
|
|
|
|
|
// TODO: Do we need to guard against already legalized intrinsics?
|
2020-01-16 03:23:20 +08:00
|
|
|
const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
|
|
|
|
AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
|
|
|
|
|
|
|
|
MachineRegisterInfo *MRI = B.getMRI();
|
|
|
|
const LLT S32 = LLT::scalar(32);
|
|
|
|
const LLT S16 = LLT::scalar(16);
|
2020-01-28 22:05:11 +08:00
|
|
|
const LLT V2S16 = LLT::vector(2, 16);
|
2020-01-16 03:23:20 +08:00
|
|
|
|
2020-01-27 03:39:56 +08:00
|
|
|
// Index of first address argument
|
|
|
|
const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
|
|
|
|
|
|
|
|
// Check for 16 bit addresses and pack if true.
|
|
|
|
int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
|
|
|
|
LLT AddrTy = MRI->getType(MI.getOperand(DimIdx).getReg());
|
|
|
|
const bool IsA16 = AddrTy == S16;
|
|
|
|
|
2020-02-08 08:10:02 +08:00
|
|
|
int NumVAddrs, NumGradients;
|
|
|
|
std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
|
2020-03-18 22:11:09 +08:00
|
|
|
const int DMaskIdx = BaseOpcode->Atomic ? -1 :
|
|
|
|
getDMaskIdx(BaseOpcode, NumDefs);
|
|
|
|
unsigned DMask = 0;
|
|
|
|
|
|
|
|
int DMaskLanes = 0;
|
|
|
|
if (!BaseOpcode->Atomic) {
|
|
|
|
DMask = MI.getOperand(DMaskIdx).getImm();
|
|
|
|
if (BaseOpcode->Gather4) {
|
|
|
|
DMaskLanes = 4;
|
|
|
|
} else if (DMask != 0) {
|
|
|
|
DMaskLanes = countPopulation(DMask);
|
|
|
|
} else if (!IsTFE && !BaseOpcode->Store) {
|
|
|
|
// If dmask is 0, this is a no-op load. This can be eliminated.
|
|
|
|
B.buildUndef(MI.getOperand(0));
|
|
|
|
MI.eraseFromParent();
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
Observer.changingInstr(MI);
|
|
|
|
auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
|
|
|
|
|
|
|
|
unsigned NewOpcode = NumDefs == 0 ?
|
|
|
|
AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
|
|
|
|
|
|
|
|
// Track that we legalized this
|
|
|
|
MI.setDesc(B.getTII().get(NewOpcode));
|
|
|
|
|
|
|
|
// An error flag is expected since TFE is on while dmask is 0. Force
|
|
|
|
// dmask to be at least 1, otherwise the instruction will fail.
|
|
|
|
if (IsTFE && DMask == 0) {
|
|
|
|
DMask = 0x1;
|
|
|
|
DMaskLanes = 1;
|
|
|
|
MI.getOperand(DMaskIdx).setImm(DMask);
|
|
|
|
}

  if (BaseOpcode->Atomic) {
    Register VData0 = MI.getOperand(2).getReg();
    LLT Ty = MRI->getType(VData0);

    // TODO: Allow atomic swap and bit ops for v2s16/v4s16
    if (Ty.isVector())
      return false;

    if (BaseOpcode->AtomicX2) {
      Register VData1 = MI.getOperand(3).getReg();
      // The two values are packed in one register.
      LLT PackedTy = LLT::vector(2, Ty);
      auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
      MI.getOperand(2).setReg(Concat.getReg(0));
      MI.getOperand(3).setReg(AMDGPU::NoRegister);
    }
  }

  int CorrectedNumVAddrs = NumVAddrs;

  // Optimize _L to _LZ when _L is zero
  if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
          AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
    const ConstantFP *ConstantLod;
    const int LodIdx = AddrIdx + NumVAddrs - 1;

    if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
      if (ConstantLod->isZero() || ConstantLod->isNegative()) {
        // Set new opcode to _lz variant of _l, and change the intrinsic ID.
        ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
          LZMappingInfo->LZ, ImageDimIntr->Dim);

        // The starting indexes should remain in the same place.
        --NumVAddrs;
        --CorrectedNumVAddrs;

        MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
          static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
        MI.RemoveOperand(LodIdx);
      }
    }
  }

  // Optimize _mip away, when 'lod' is zero
  if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
    int64_t ConstantLod;
    const int LodIdx = AddrIdx + NumVAddrs - 1;

    if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
      if (ConstantLod == 0) {
        // TODO: Change the intrinsic opcode and remove the operand instead of
        // replacing it with 0, as the _L to _LZ handling is done above.
        MI.getOperand(LodIdx).ChangeToImmediate(0);
        --CorrectedNumVAddrs;
      }
    }
  }

  // If the register allocator cannot place the address registers contiguously
  // without introducing moves, then using the non-sequential address encoding
  // is always preferable, since it saves VALU instructions and is usually a
  // wash in terms of code size or even better.
  //
  // However, we currently have no way of hinting to the register allocator
  // that MIMG addresses should be placed contiguously when it is possible to
  // do so, so force non-NSA for the common 2-address case as a heuristic.
  //
  // SIShrinkInstructions will convert NSA encodings to non-NSA after register
  // allocation when possible.
  const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();

  // Rewrite the addressing register layout before doing anything else.
  if (IsA16) {
    // FIXME: this feature is missing from gfx10. When that is fixed, this check
    // should be introduced.
    if (!ST.hasR128A16() && !ST.hasGFX10A16())
      return false;

    if (NumVAddrs > 1) {
      SmallVector<Register, 4> PackedRegs;
      packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, NumVAddrs,
                                  NumGradients);

      if (!UseNSA && PackedRegs.size() > 1) {
        LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
        auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
        PackedRegs[0] = Concat.getReg(0);
        PackedRegs.resize(1);
      }

      const int NumPacked = PackedRegs.size();
      for (int I = 0; I != NumVAddrs; ++I) {
        MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
        if (!SrcOp.isReg()) {
          assert(SrcOp.isImm() && SrcOp.getImm() == 0);
          continue;
        }

        assert(SrcOp.getReg() != AMDGPU::NoRegister);

        if (I < NumPacked)
          SrcOp.setReg(PackedRegs[I]);
        else
          SrcOp.setReg(AMDGPU::NoRegister);
      }
    }
  } else if (!UseNSA && NumVAddrs > 1) {
    convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
  }

  if (BaseOpcode->Store) { // No TFE for stores?
    // TODO: Handle dmask trim
    Register VData = MI.getOperand(1).getReg();
    LLT Ty = MRI->getType(VData);
    if (!Ty.isVector() || Ty.getElementType() != S16)
      return true;

    B.setInstr(MI);

    Register RepackedReg = handleD16VData(B, *MRI, VData);
    if (RepackedReg != VData) {
      MI.getOperand(1).setReg(RepackedReg);
    }

    return true;
  }

  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI->getType(DstReg);
  const LLT EltTy = Ty.getScalarType();
  const bool IsD16 = Ty.getScalarType() == S16;
  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;

  // Confirm that the return type is large enough for the dmask specified
  if (NumElts < DMaskLanes)
    return false;

  if (NumElts > 4 || DMaskLanes > 4)
    return false;

  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
  const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
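  // For example (illustrative): a <4 x s16> result with dmask 0b0101 has
  // DMaskLanes == 2, so AdjustedTy is <2 x s16>.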

  // The raw dword aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
  LLT RoundedTy;

  // S32 vector to cover all data, plus TFE result element.
  LLT TFETy;

  // Register type to use for each loaded component. Will be S32 or V2S16.
  LLT RegTy;

  if (IsD16 && ST.hasUnpackedD16VMem()) {
    RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
    TFETy = LLT::vector(AdjustedNumElts + 1, 32);
    RegTy = S32;
  } else {
    unsigned EltSize = EltTy.getSizeInBits();
    unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
    unsigned RoundedSize = 32 * RoundedElts;
    RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
    TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
  }
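
  // Worked example (illustrative): a d16 <3 x s16> load on a packed-d16
  // target gives AdjustedTy = <3 x s16> (48 bits), which rounds up to
  // RoundedTy = <4 x s16>, TFETy = <3 x s32> (64 data bits plus the status
  // dword), and RegTy = <2 x s16> when TFE is off.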

  // The return type does not need adjustment.
  // TODO: Should we change s16 case to s32 or <2 x s16>?
  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
    return true;

  Register Dst1Reg;

  // Insert after the instruction.
  B.setInsertPt(*MI.getParent(), ++MI.getIterator());
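  // The repacking instructions built below consume the raw result of the
  // image load, so they must be emitted after MI rather than at the current
  // insertion point.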

  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);

  MI.getOperand(0).setReg(NewResultReg);

  // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.

  if (IsTFE) {
    Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)
      return false;

    // TODO: Make sure the TFE operand bit is set.
    MI.RemoveOperand(1);

    // Handle the easy case that requires no repack instructions.
    if (Ty == S32) {
      B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
      return true;
    }
  }

  // Now figure out how to copy the new result register back into the old
  // result.
  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
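  // Every entry is pre-filled with Dst1Reg so that, in the TFE case, the
  // trailing status dword is unmerged directly into its final destination.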

  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;

  if (ResultNumRegs == 1) {
    assert(!IsTFE);
    ResultRegs[0] = NewResultReg;
  } else {
    // We have to repack into a new vector of some kind.
    for (int I = 0; I != NumDataRegs; ++I)
      ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
    B.buildUnmerge(ResultRegs, NewResultReg);

    // Drop the final TFE element to get the data part. The TFE result is
    // directly written to the right place already.
    if (IsTFE)
      ResultRegs.resize(NumDataRegs);
  }

  // For an s16 scalar result, we form an s32 result with a truncate regardless
  // of packed vs. unpacked.
  if (IsD16 && !Ty.isVector()) {
    B.buildTrunc(DstReg, ResultRegs[0]);
    return true;
  }

  // Avoid a build/concat_vector of 1 entry.
  if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
    B.buildBitcast(DstReg, ResultRegs[0]);
    return true;
  }

  assert(Ty.isVector());

  if (IsD16) {
    // For packed D16 results with TFE enabled, all the data components are
    // S32. Cast back to the expected type.
    //
    // TODO: We don't really need to use load s32 elements. We would only need
    // one cast for the TFE result if a multiple of v2s16 was used.
    if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildBitcast(V2S16, Reg).getReg(0);
    } else if (ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildTrunc(S16, Reg).getReg(0);
    }
  }

  auto padWithUndef = [&](LLT Ty, int NumElts) {
    if (NumElts == 0)
      return;
    Register Undef = B.buildUndef(Ty).getReg(0);
    for (int I = 0; I != NumElts; ++I)
      ResultRegs.push_back(Undef);
  };
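
  // For example (illustrative): a <4 x s32> load with dmask 0b0001 really
  // loads one element, so the remaining three lanes are filled with
  // G_IMPLICIT_DEF before the final vector is rebuilt below.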

  // Pad out any elements eliminated due to the dmask.
  LLT ResTy = MRI->getType(ResultRegs[0]);
  if (!ResTy.isVector()) {
    padWithUndef(ResTy, NumElts - ResultRegs.size());
    B.buildBuildVector(DstReg, ResultRegs);
    return true;
  }

  assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
  const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;

  // Deal with the one annoying legal case: a v3s16 result cannot be
  // concatenated directly from v2s16 pieces, so build a v6s16 and unmerge it.
  const LLT V3S16 = LLT::vector(3, 16);
  if (Ty == V3S16) {
    padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
    auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
    B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
    return true;
  }

  padWithUndef(ResTy, RegsToCover - ResultRegs.size());
  B.buildConcatVectors(DstReg, ResultRegs);
  return true;
}

bool AMDGPULegalizerInfo::legalizeSBufferLoad(
    MachineInstr &MI, MachineIRBuilder &B,
    GISelChangeObserver &Observer) const {
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Size = Ty.getSizeInBits();
  MachineFunction &MF = B.getMF();

  Observer.changingInstr(MI);

  // FIXME: We don't really need this intermediate instruction. The intrinsic
  // should be fixed to have a memory operand. Since it's readnone, we're not
  // allowed to add one.
  MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
  MI.RemoveOperand(1); // Remove intrinsic ID

  // FIXME: When intrinsic definition is fixed, this should have an MMO already.
  // TODO: Should this use datalayout alignment?
  const unsigned MemSize = (Size + 7) / 8;
  const Align MemAlign(4);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      MemSize, MemAlign);
  MI.addMemOperand(MF, MMO);

  // There are no 96-bit result scalar loads, but widening to 128-bit should
  // always be legal. We may need to restore this to a 96-bit result if it turns
  // out this needs to be converted to a vector load during RegBankSelect.
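  // For example (illustrative), a 96-bit <3 x s32> result is widened here to
  // <4 x s32>.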
  if (!isPowerOf2_32(Size)) {
    LegalizerHelper Helper(MF, *this, Observer, B);
    B.setInstr(MI);

    if (Ty.isVector())
      Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
    else
      Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
  }

  Observer.changedInstr(MI);
  return true;
}

bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
                                                MachineRegisterInfo &MRI,
                                                MachineIRBuilder &B) const {
  B.setInstr(MI);

  // If this is a non-HSA target, or the trap handler is disabled, just insert
  // an s_endpgm instruction.
  if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
      !ST.isTrapHandlerEnabled()) {
    B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
  } else {
    // Pass queue pointer to trap handler as input, and insert trap instruction
    // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
    const ArgDescriptor *Arg =
        getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
    if (!Arg)
      return false;
    MachineRegisterInfo &MRI = *B.getMRI();
    Register SGPR01(AMDGPU::SGPR0_SGPR1);
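    // The HSA trap handler ABI expects the queue pointer in SGPR0:SGPR1; the
    // copy below moves the loaded queue pointer into that fixed physical
    // register pair before issuing S_TRAP.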
    Register LiveIn = getLiveInRegister(
        B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
        /*InsertLiveInCopy=*/false);
    if (!loadInputValue(LiveIn, B, Arg))
      return false;
    B.buildCopy(SGPR01, LiveIn);
    B.buildInstr(AMDGPU::S_TRAP)
        .addImm(GCNSubtarget::TrapIDLLVMTrap)
        .addReg(SGPR01, RegState::Implicit);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  B.setInstr(MI);

  // If this is a non-HSA target, or the trap handler is disabled, report a
  // warning instead.
  if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
      !ST.isTrapHandlerEnabled()) {
    DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
                                     "debugtrap handler not supported",
                                     MI.getDebugLoc(), DS_Warning);
    LLVMContext &Ctx = B.getMF().getFunction().getContext();
    Ctx.diagnose(NoTrap);
  } else {
    // Insert debug-trap instruction
    B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                            MachineIRBuilder &B,
                                            GISelChangeObserver &Observer) const {
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace uses of G_BRCOND with the exec-manipulating branch pseudos.
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
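
      // The branch targets are effectively swapped: the SI_IF/SI_ELSE pseudo
      // takes over the original unconditional target, and the explicit branch
      // emitted below is redirected to the original conditional target.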
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget)
          .addImm(0);
      }

      if (Br) {
        Br->getOperand(0).setMBB(CondBrTarget);
      } else {
        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
        // since we're swapping branch targets it needs to be reinserted.
        // FIXME: IRTranslator should probably not do this
        B.buildBr(*CondBrTarget);
      }

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      B.setInstr(MI);
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.setInstr(MI);
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(MI, B, Observer);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}
|