forked from OSchip/llvm-project
[NVPTX] Added intrinsics for atom.gen.{sys|cta}.* instructions.
These are only available on sm_60+ GPUs. Differential Revision: https://reviews.llvm.org/D24943 llvm-svn: 282607
This commit is contained in:
parent
f0022125e0
commit
3e1211581c
|
@ -729,6 +729,39 @@ let TargetPrefix = "nvvm" in {
|
|||
[LLVMAnyPointerType<llvm_i32_ty>, llvm_i32_ty],
|
||||
[IntrArgMemOnly, NoCapture<0>]>;
|
||||
|
||||
// Signature shared by the scoped two-operand atomic intrinsics.
// Returns a value of type `elty`; takes a pointer whose pointee matches the
// result type (overloaded type 0) and one value operand of that same type.
// The intrinsic is marked IntrArgMemOnly and the pointer (argument 0) is
// NoCapture.
class SCOPED_ATOMIC2_impl<LLVMType elty>
|
||||
: Intrinsic<[elty],
|
||||
[LLVMAnyPointerType<LLVMMatchType<0>>, LLVMMatchType<0>],
|
||||
[IntrArgMemOnly, NoCapture<0>]>;
|
||||
// Signature shared by the scoped three-operand atomic intrinsics.
// Same shape as SCOPED_ATOMIC2_impl but with two value operands after the
// pointer, both matching the result type. Marked IntrArgMemOnly with the
// pointer (argument 0) NoCapture.
class SCOPED_ATOMIC3_impl<LLVMType elty>
|
||||
: Intrinsic<[elty],
|
||||
[LLVMAnyPointerType<LLVMMatchType<0>>, LLVMMatchType<0>,
|
||||
LLVMMatchType<0>],
|
||||
[IntrArgMemOnly, NoCapture<0>]>;
|
||||
|
||||
// Defines the `_cta` and `_sys` scoped variants of a two-operand atomic
// intrinsic, both using the SCOPED_ATOMIC2_impl signature for `elty`.
multiclass PTXAtomicWithScope2<LLVMType elty> {
|
||||
def _cta : SCOPED_ATOMIC2_impl<elty>;
|
||||
def _sys : SCOPED_ATOMIC2_impl<elty>;
|
||||
}
|
||||
// Defines the `_cta` and `_sys` scoped variants of a three-operand atomic
// intrinsic, both using the SCOPED_ATOMIC3_impl signature for `elty`.
multiclass PTXAtomicWithScope3<LLVMType elty> {
|
||||
def _cta : SCOPED_ATOMIC3_impl<elty>;
|
||||
def _sys : SCOPED_ATOMIC3_impl<elty>;
|
||||
}
|
||||
// Defines both the floating-point (`_f`, llvm_anyfloat_ty) and the integer
// (`_i`, llvm_anyint_ty) flavours of a two-operand scoped atomic; each
// flavour in turn gets its `_cta` and `_sys` variants from
// PTXAtomicWithScope2.
multiclass PTXAtomicWithScope2_fi {
|
||||
defm _f: PTXAtomicWithScope2<llvm_anyfloat_ty>;
|
||||
defm _i: PTXAtomicWithScope2<llvm_anyint_ty>;
|
||||
}
|
||||
// Scoped atomic intrinsics on generic-space pointers. Record names follow
// int_nvvm_atomic_<op>_gen_<f|i>_<cta|sys>.
// `add` is the only operation defined for both float and integer types; all
// other operations are integer-only. `cas` uses the three-operand signature.
defm int_nvvm_atomic_add_gen : PTXAtomicWithScope2_fi;
|
||||
defm int_nvvm_atomic_inc_gen_i : PTXAtomicWithScope2<llvm_anyint_ty>;
|
||||
defm int_nvvm_atomic_dec_gen_i : PTXAtomicWithScope2<llvm_anyint_ty>;
|
||||
defm int_nvvm_atomic_exch_gen_i: PTXAtomicWithScope2<llvm_anyint_ty>;
|
||||
defm int_nvvm_atomic_xor_gen_i : PTXAtomicWithScope2<llvm_anyint_ty>;
|
||||
defm int_nvvm_atomic_max_gen_i : PTXAtomicWithScope2<llvm_anyint_ty>;
|
||||
defm int_nvvm_atomic_min_gen_i : PTXAtomicWithScope2<llvm_anyint_ty>;
|
||||
defm int_nvvm_atomic_or_gen_i : PTXAtomicWithScope2<llvm_anyint_ty>;
|
||||
defm int_nvvm_atomic_and_gen_i : PTXAtomicWithScope2<llvm_anyint_ty>;
|
||||
defm int_nvvm_atomic_cas_gen_i : PTXAtomicWithScope3<llvm_anyint_ty>;
|
||||
|
||||
// Bar.Sync
|
||||
|
||||
// The builtin for "bar.sync 0" is called __syncthreads. Unlike most of the
|
||||
|
|
|
@ -51,6 +51,9 @@ def SM61 : SubtargetFeature<"sm_61", "SmVersion", "61",
|
|||
def SM62 : SubtargetFeature<"sm_62", "SmVersion", "62",
|
||||
"Target SM 6.2">;
|
||||
|
||||
// Subtarget feature "satom": sets the subtarget attribute HasAtomScope to
// "true". It is listed in the sm_60, sm_61 and sm_62 processor definitions
// in this file.
def SATOM : SubtargetFeature<"satom", "HasAtomScope", "true",
|
||||
"Atomic operations with scope">;
|
||||
|
||||
// PTX Versions
|
||||
def PTX32 : SubtargetFeature<"ptx32", "PTXVersion", "32",
|
||||
"Use PTX version 3.2">;
|
||||
|
@ -81,9 +84,9 @@ def : Proc<"sm_37", [SM37, PTX41]>;
|
|||
def : Proc<"sm_50", [SM50, PTX40]>;
|
||||
def : Proc<"sm_52", [SM52, PTX41]>;
|
||||
def : Proc<"sm_53", [SM53, PTX42]>;
|
||||
def : Proc<"sm_60", [SM60, PTX50]>;
|
||||
def : Proc<"sm_61", [SM61, PTX50]>;
|
||||
def : Proc<"sm_62", [SM62, PTX50]>;
|
||||
def : Proc<"sm_60", [SM60, PTX50, SATOM]>;
|
||||
def : Proc<"sm_61", [SM61, PTX50, SATOM]>;
|
||||
def : Proc<"sm_62", [SM62, PTX50, SATOM]>;
|
||||
|
||||
def NVPTXInstrInfo : InstrInfo {
|
||||
}
|
||||
|
|
|
@ -3274,20 +3274,34 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
|
|||
return false;
|
||||
|
||||
case Intrinsic::nvvm_atomic_load_add_f32:
|
||||
Info.opc = ISD::INTRINSIC_W_CHAIN;
|
||||
Info.memVT = MVT::f32;
|
||||
Info.ptrVal = I.getArgOperand(0);
|
||||
Info.offset = 0;
|
||||
Info.vol = 0;
|
||||
Info.readMem = true;
|
||||
Info.writeMem = true;
|
||||
Info.align = 0;
|
||||
return true;
|
||||
|
||||
case Intrinsic::nvvm_atomic_load_inc_32:
|
||||
case Intrinsic::nvvm_atomic_load_dec_32:
|
||||
|
||||
case Intrinsic::nvvm_atomic_add_gen_f_cta:
|
||||
case Intrinsic::nvvm_atomic_add_gen_f_sys:
|
||||
case Intrinsic::nvvm_atomic_add_gen_i_cta:
|
||||
case Intrinsic::nvvm_atomic_add_gen_i_sys:
|
||||
case Intrinsic::nvvm_atomic_and_gen_i_cta:
|
||||
case Intrinsic::nvvm_atomic_and_gen_i_sys:
|
||||
case Intrinsic::nvvm_atomic_cas_gen_i_cta:
|
||||
case Intrinsic::nvvm_atomic_cas_gen_i_sys:
|
||||
case Intrinsic::nvvm_atomic_dec_gen_i_cta:
|
||||
case Intrinsic::nvvm_atomic_dec_gen_i_sys:
|
||||
case Intrinsic::nvvm_atomic_inc_gen_i_cta:
|
||||
case Intrinsic::nvvm_atomic_inc_gen_i_sys:
|
||||
case Intrinsic::nvvm_atomic_max_gen_i_cta:
|
||||
case Intrinsic::nvvm_atomic_max_gen_i_sys:
|
||||
case Intrinsic::nvvm_atomic_min_gen_i_cta:
|
||||
case Intrinsic::nvvm_atomic_min_gen_i_sys:
|
||||
case Intrinsic::nvvm_atomic_or_gen_i_cta:
|
||||
case Intrinsic::nvvm_atomic_or_gen_i_sys:
|
||||
case Intrinsic::nvvm_atomic_exch_gen_i_cta:
|
||||
case Intrinsic::nvvm_atomic_exch_gen_i_sys:
|
||||
case Intrinsic::nvvm_atomic_xor_gen_i_cta:
|
||||
case Intrinsic::nvvm_atomic_xor_gen_i_sys: {
|
||||
auto &DL = I.getModule()->getDataLayout();
|
||||
Info.opc = ISD::INTRINSIC_W_CHAIN;
|
||||
Info.memVT = MVT::i32;
|
||||
Info.memVT = getValueType(DL, I.getType());
|
||||
Info.ptrVal = I.getArgOperand(0);
|
||||
Info.offset = 0;
|
||||
Info.vol = 0;
|
||||
|
@ -3295,6 +3309,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
|
|||
Info.writeMem = true;
|
||||
Info.align = 0;
|
||||
return true;
|
||||
}
|
||||
|
||||
case Intrinsic::nvvm_ldu_global_i:
|
||||
case Intrinsic::nvvm_ldu_global_f:
|
||||
|
|
|
@ -131,6 +131,10 @@ def hasAtomRedGen64 : Predicate<"Subtarget->hasAtomRedGen64()">;
|
|||
def useAtomRedG64forGen64 :
|
||||
Predicate<"!Subtarget->hasAtomRedGen64() && Subtarget->hasAtomRedG64()">;
|
||||
def hasAtomAddF32 : Predicate<"Subtarget->hasAtomAddF32()">;
|
||||
def hasAtomAddF64 : Predicate<"Subtarget->hasAtomAddF64()">;
|
||||
def hasAtomScope : Predicate<"Subtarget->hasAtomScope()">;
|
||||
def hasAtomBitwise64 : Predicate<"Subtarget->hasAtomBitwise64()">;
|
||||
def hasAtomMinMax64 : Predicate<"Subtarget->hasAtomMinMax64()">;
|
||||
def hasVote : Predicate<"Subtarget->hasVote()">;
|
||||
def hasDouble : Predicate<"Subtarget->hasDouble()">;
|
||||
def reqPTX20 : Predicate<"Subtarget->reqPTX20()">;
|
||||
|
|
|
@ -1377,8 +1377,204 @@ defm INT_PTX_ATOM_CAS_GEN_64 : F_ATOMIC_3<Int64Regs, "", ".b64", ".cas",
|
|||
defm INT_PTX_ATOM_CAS_GEN_64_USE_G : F_ATOMIC_3<Int64Regs, ".global", ".b64",
|
||||
".cas", atomic_cmp_swap_64_gen, i64imm, useAtomRedG64forGen64>;
|
||||
|
||||
// Support for scoped atomic operations. Matches
|
||||
// int_nvvm_atomic_{op}_{space}_{type}_{scope}
|
||||
// and converts it into the appropriate instruction.
|
||||
// NOTE: not all possible combinations are implemented
|
||||
// 'space' is limited to generic as it's the only one needed to support CUDA.
|
||||
// 'scope' = 'gpu' is default and is handled by regular atomic instructions.
|
||||
// Base instruction class shared by the two- and three-operand scoped atomics.
//   AsmStr   - assembly string of the instruction.
//   regclass - register class of the single output, $result.
//   Preds    - predicates required for the instruction (via Requires<>).
//   ins      - input operand list.
//   Operands - DAG fragment whose value is assigned to $result in the
//              selection pattern.
class ATOM23_impl<string AsmStr, NVPTXRegClass regclass, list<Predicate> Preds,
|
||||
dag ins, dag Operands>
|
||||
: NVPTXInst<(outs regclass:$result), ins,
|
||||
AsmStr,
|
||||
[(set regclass:$result, Operands)]>,
|
||||
Requires<Preds>;
|
||||
|
||||
// Define instruction variants for all addressing modes.
// Four anonymous instructions are produced for intrinsic `Intr`: the address
// $src is taken from either Int32Regs or Int64Regs, and the value operand $b
// is either a register of class `regclass` or an immediate (`ImmType`
// operand matched through the `Imm` node cast to `ImmTy`).
// The register-operand forms are wrapped in `let AddedComplexity = 1`; the
// immediate forms keep the default.
|
||||
multiclass ATOM2P_impl<string AsmStr, Intrinsic Intr,
|
||||
NVPTXRegClass regclass, Operand ImmType,
|
||||
SDNode Imm, ValueType ImmTy,
|
||||
list<Predicate> Preds> {
|
||||
let AddedComplexity = 1 in {
|
||||
def : ATOM23_impl<AsmStr, regclass, Preds,
|
||||
(ins Int32Regs:$src, regclass:$b),
|
||||
(Intr Int32Regs:$src, regclass:$b)>;
|
||||
def : ATOM23_impl<AsmStr, regclass, Preds,
|
||||
(ins Int64Regs:$src, regclass:$b),
|
||||
(Intr Int64Regs:$src, regclass:$b)>;
|
||||
}
|
||||
// tablegen can't infer argument types from Intrinsic (though it can
|
||||
// from Instruction) so we have to enforce specific type on
|
||||
// immediates via explicit cast to ImmTy.
|
||||
def : ATOM23_impl<AsmStr, regclass, Preds,
|
||||
(ins Int32Regs:$src, ImmType:$b),
|
||||
(Intr Int32Regs:$src, (ImmTy Imm:$b))>;
|
||||
def : ATOM23_impl<AsmStr, regclass, Preds,
|
||||
(ins Int64Regs:$src, ImmType:$b),
|
||||
(Intr Int64Regs:$src, (ImmTy Imm:$b))>;
|
||||
}
|
||||
|
||||
// Three-operand counterpart of ATOM2P_impl.
// Eight anonymous instructions are produced for intrinsic `Intr`: the address
// $src comes from Int32Regs or Int64Regs, and each of $b and $c is either a
// register of class `regclass` or an immediate cast to `ImmTy`.
// AddedComplexity is 2 for the all-register forms, 1 for the forms with
// exactly one immediate, and the default for the all-immediate forms.
multiclass ATOM3P_impl<string AsmStr, Intrinsic Intr,
|
||||
NVPTXRegClass regclass, Operand ImmType,
|
||||
SDNode Imm, ValueType ImmTy,
|
||||
list<Predicate> Preds> {
|
||||
// Variants for register/immediate permutations of $b and $c
|
||||
let AddedComplexity = 2 in {
|
||||
def : ATOM23_impl<AsmStr, regclass, Preds,
|
||||
(ins Int32Regs:$src, regclass:$b, regclass:$c),
|
||||
(Intr Int32Regs:$src, regclass:$b, regclass:$c)>;
|
||||
def : ATOM23_impl<AsmStr, regclass, Preds,
|
||||
(ins Int64Regs:$src, regclass:$b, regclass:$c),
|
||||
(Intr Int64Regs:$src, regclass:$b, regclass:$c)>;
|
||||
}
|
||||
let AddedComplexity = 1 in {
|
||||
def : ATOM23_impl<AsmStr, regclass, Preds,
|
||||
(ins Int32Regs:$src, ImmType:$b, regclass:$c),
|
||||
(Intr Int32Regs:$src, (ImmTy Imm:$b), regclass:$c)>;
|
||||
def : ATOM23_impl<AsmStr, regclass, Preds,
|
||||
(ins Int64Regs:$src, ImmType:$b, regclass:$c),
|
||||
(Intr Int64Regs:$src, (ImmTy Imm:$b), regclass:$c)>;
|
||||
def : ATOM23_impl<AsmStr, regclass, Preds,
|
||||
(ins Int32Regs:$src, regclass:$b, ImmType:$c),
|
||||
(Intr Int32Regs:$src, regclass:$b, (ImmTy Imm:$c))>;
|
||||
def : ATOM23_impl<AsmStr, regclass, Preds,
|
||||
(ins Int64Regs:$src, regclass:$b, ImmType:$c),
|
||||
(Intr Int64Regs:$src, regclass:$b, (ImmTy Imm:$c))>;
|
||||
}
|
||||
def : ATOM23_impl<AsmStr, regclass, Preds,
|
||||
(ins Int32Regs:$src, ImmType:$b, ImmType:$c),
|
||||
(Intr Int32Regs:$src, (ImmTy Imm:$b), (ImmTy Imm:$c))>;
|
||||
def : ATOM23_impl<AsmStr, regclass, Preds,
|
||||
(ins Int64Regs:$src, ImmType:$b, ImmType:$c),
|
||||
(Intr Int64Regs:$src, (ImmTy Imm:$b), (ImmTy Imm:$c))>;
|
||||
}
|
||||
|
||||
// Constructs intrinsic name and instruction asm strings.
// Asm string: "atom", then "." # SpaceStr unless SpaceStr is "gen", then
// "." # ScopeStr unless ScopeStr is "gpu", then "." # OpStr # "." # TypeStr,
// followed by the operand list "$result, [$src], $b".
// Intrinsic: looked up by name via !cast as
// int_nvvm_atomic_<OpStr>_<SpaceStr>_<IntTypeStr>, with "_" # ScopeStr
// appended unless ScopeStr is the empty string.
// The remaining parameters are forwarded unchanged to ATOM2P_impl.
|
||||
multiclass ATOM2N_impl<string OpStr, string IntTypeStr, string TypeStr,
|
||||
string ScopeStr, string SpaceStr,
|
||||
NVPTXRegClass regclass, Operand ImmType, SDNode Imm,
|
||||
ValueType ImmTy, list<Predicate> Preds> {
|
||||
defm : ATOM2P_impl<"atom" # !if(!eq(SpaceStr, "gen"), "", "." # SpaceStr)
|
||||
# !if(!eq(ScopeStr, "gpu"), "", "." # ScopeStr)
|
||||
# "." # OpStr # "." # TypeStr
|
||||
# " \t$result, [$src], $b;",
|
||||
!cast<Intrinsic>(
|
||||
"int_nvvm_atomic_" # OpStr
|
||||
# "_" # SpaceStr # "_" # IntTypeStr
|
||||
# !if(!eq(ScopeStr,""), "", "_" # ScopeStr)),
|
||||
regclass, ImmType, Imm, ImmTy, Preds>;
|
||||
}
|
||||
// Three-operand counterpart of ATOM2N_impl: builds the asm string
// ("atom", then "." # SpaceStr unless SpaceStr is "gen", then "." # ScopeStr
// unless ScopeStr is "gpu", then "." # OpStr # "." # TypeStr, with operands
// "$result, [$src], $b, $c") and resolves the intrinsic named
// int_nvvm_atomic_<OpStr>_<SpaceStr>_<IntTypeStr>, with "_" # ScopeStr
// appended unless ScopeStr is empty. Everything else is forwarded to
// ATOM3P_impl.
multiclass ATOM3N_impl<string OpStr, string IntTypeStr, string TypeStr,
|
||||
string ScopeStr, string SpaceStr,
|
||||
NVPTXRegClass regclass, Operand ImmType, SDNode Imm,
|
||||
ValueType ImmTy, list<Predicate> Preds> {
|
||||
defm : ATOM3P_impl<"atom" # !if(!eq(SpaceStr, "gen"), "", "." # SpaceStr)
|
||||
# !if(!eq(ScopeStr, "gpu"), "", "." # ScopeStr)
|
||||
# "." # OpStr # "." # TypeStr
|
||||
# " \t$result, [$src], $b, $c;",
|
||||
!cast<Intrinsic>(
|
||||
"int_nvvm_atomic_" # OpStr
|
||||
# "_" # SpaceStr # "_" # IntTypeStr
|
||||
# !if(!eq(ScopeStr,""), "", "_" # ScopeStr)),
|
||||
regclass, ImmType, Imm, ImmTy, Preds>;
|
||||
}
|
||||
|
||||
// Constructs variants for different address spaces.
|
||||
// For now we only need variants for generic space pointers.
// The single `_gen_` instantiation passes "gen" as SpaceStr to ATOM2N_impl
// and forwards all other parameters unchanged.
|
||||
multiclass ATOM2A_impl<string OpStr, string IntTypeStr, string TypeStr,
|
||||
string ScopeStr, NVPTXRegClass regclass, Operand ImmType,
|
||||
SDNode Imm, ValueType ImmTy, list<Predicate> Preds> {
|
||||
defm _gen_ : ATOM2N_impl<OpStr, IntTypeStr, TypeStr, ScopeStr, "gen",
|
||||
regclass, ImmType, Imm, ImmTy, Preds>;
|
||||
}
|
||||
// Three-operand counterpart of ATOM2A_impl: instantiates only the
// generic-space (`_gen_`) variant by passing "gen" as SpaceStr to
// ATOM3N_impl; all other parameters are forwarded unchanged.
multiclass ATOM3A_impl<string OpStr, string IntTypeStr, string TypeStr,
|
||||
string ScopeStr, NVPTXRegClass regclass, Operand ImmType,
|
||||
SDNode Imm, ValueType ImmTy, list<Predicate> Preds> {
|
||||
defm _gen_ : ATOM3N_impl<OpStr, IntTypeStr, TypeStr, ScopeStr, "gen",
|
||||
regclass, ImmType, Imm, ImmTy, Preds>;
|
||||
}
|
||||
|
||||
// Constructs variants for different scopes of atomic op.
// Instantiates ATOM2A_impl once with scope "cta" and once with scope "sys".
// In both cases the hasAtomScope predicate is appended to the caller's
// `Preds` list.
|
||||
multiclass ATOM2S_impl<string OpStr, string IntTypeStr, string TypeStr,
|
||||
NVPTXRegClass regclass, Operand ImmType, SDNode Imm,
|
||||
ValueType ImmTy, list<Predicate> Preds> {
|
||||
// .gpu scope is default and is currently covered by existing
|
||||
// atomics w/o explicitly specified scope.
|
||||
defm _cta : ATOM2A_impl<OpStr, IntTypeStr, TypeStr, "cta",
|
||||
regclass, ImmType, Imm, ImmTy,
|
||||
!listconcat(Preds,[hasAtomScope])>;
|
||||
defm _sys : ATOM2A_impl<OpStr, IntTypeStr, TypeStr, "sys",
|
||||
regclass, ImmType, Imm, ImmTy,
|
||||
!listconcat(Preds,[hasAtomScope])>;
|
||||
}
|
||||
// Three-operand counterpart of ATOM2S_impl: instantiates ATOM3A_impl for the
// "cta" and "sys" scopes, appending the hasAtomScope predicate to `Preds`
// in both cases.
multiclass ATOM3S_impl<string OpStr, string IntTypeStr, string TypeStr,
|
||||
NVPTXRegClass regclass, Operand ImmType, SDNode Imm, ValueType ImmTy,
|
||||
list<Predicate> Preds> {
|
||||
// No need to define ".gpu"-scoped atomics. They do the same thing
|
||||
// as the regular, non-scoped atomics defined elsewhere.
|
||||
defm _cta : ATOM3A_impl<OpStr, IntTypeStr, TypeStr, "cta",
|
||||
regclass, ImmType, Imm, ImmTy,
|
||||
!listconcat(Preds,[hasAtomScope])>;
|
||||
defm _sys : ATOM3A_impl<OpStr, IntTypeStr, TypeStr, "sys",
|
||||
regclass, ImmType, Imm, ImmTy,
|
||||
!listconcat(Preds,[hasAtomScope])>;
|
||||
}
|
||||
|
||||
// atom.add
// Type variants: s32, u32 and u64 use the integer ("i") intrinsic; f32 and
// f64 use the float ("f") intrinsic and additionally require hasAtomAddF32
// and hasAtomAddF64 respectively.
// NOTE(review): `_s32` and `_u32` are instantiated with identical intrinsic
// and register-class arguments and differ only in the type suffix of the
// asm string - confirm which of the two is expected to be selected.
|
||||
multiclass ATOM2_add_impl<string OpStr> {
|
||||
defm _s32 : ATOM2S_impl<OpStr, "i", "s32", Int32Regs, i32imm, imm, i32, []>;
|
||||
defm _u32 : ATOM2S_impl<OpStr, "i", "u32", Int32Regs, i32imm, imm, i32, []>;
|
||||
defm _u64 : ATOM2S_impl<OpStr, "i", "u64", Int64Regs, i64imm, imm, i64, []>;
|
||||
defm _f32 : ATOM2S_impl<OpStr, "f", "f32", Float32Regs, f32imm, fpimm, f32,
|
||||
[hasAtomAddF32]>;
|
||||
defm _f64 : ATOM2S_impl<OpStr, "f", "f64", Float64Regs, f64imm, fpimm, f64,
|
||||
[hasAtomAddF64]>;
|
||||
}
|
||||
|
||||
// atom.{and,or,xor}
// Type variants: b32 and b64, both on the integer ("i") intrinsic. The b64
// form additionally requires hasAtomBitwise64.
|
||||
multiclass ATOM2_bitwise_impl<string OpStr> {
|
||||
defm _b32 : ATOM2S_impl<OpStr, "i", "b32", Int32Regs, i32imm, imm, i32, []>;
|
||||
defm _b64 : ATOM2S_impl<OpStr, "i", "b64", Int64Regs, i64imm, imm, i64,
|
||||
[hasAtomBitwise64]>;
|
||||
}
|
||||
|
||||
// atom.exch
// Type variants: b32 and b64, both on the integer ("i") intrinsic, with no
// predicates beyond those added by ATOM2S_impl.
|
||||
multiclass ATOM2_exch_impl<string OpStr> {
|
||||
defm _b32 : ATOM2S_impl<OpStr, "i", "b32", Int32Regs, i32imm, imm, i32, []>;
|
||||
defm _b64 : ATOM2S_impl<OpStr, "i", "b64", Int64Regs, i64imm, imm, i64, []>;
|
||||
}
|
||||
|
||||
// atom.{min,max}
// Type variants: s32, u32, s64 and u64, all on the integer ("i") intrinsic.
// The 64-bit forms additionally require hasAtomMinMax64.
// NOTE(review): the signed and unsigned variants of each width are
// instantiated with identical intrinsic and register-class arguments and
// differ only in the asm type suffix - confirm which one is expected to be
// selected.
|
||||
multiclass ATOM2_minmax_impl<string OpStr> {
|
||||
defm _s32 : ATOM2S_impl<OpStr, "i", "s32", Int32Regs, i32imm, imm, i32, []>;
|
||||
defm _u32 : ATOM2S_impl<OpStr, "i", "u32", Int32Regs, i32imm, imm, i32, []>;
|
||||
defm _s64 : ATOM2S_impl<OpStr, "i", "s64", Int64Regs, i64imm, imm, i64,
|
||||
[hasAtomMinMax64]>;
|
||||
defm _u64 : ATOM2S_impl<OpStr, "i", "u64", Int64Regs, i64imm, imm, i64,
|
||||
[hasAtomMinMax64]>;
|
||||
}
|
||||
|
||||
// atom.{inc,dec}
// Only a u32 variant is defined, on the integer ("i") intrinsic.
|
||||
multiclass ATOM2_incdec_impl<string OpStr> {
|
||||
defm _u32 : ATOM2S_impl<OpStr, "i", "u32", Int32Regs, i32imm, imm, i32, []>;
|
||||
}
|
||||
|
||||
// atom.cas
// Three-operand form (via ATOM3S_impl) with b32 and b64 variants, both on
// the integer ("i") intrinsic.
|
||||
multiclass ATOM3_cas_impl<string OpStr> {
|
||||
defm _b32 : ATOM3S_impl<OpStr, "i", "b32", Int32Regs, i32imm, imm, i32, []>;
|
||||
defm _b64 : ATOM3S_impl<OpStr, "i", "b64", Int64Regs, i64imm, imm, i64, []>;
|
||||
}
|
||||
|
||||
// Instantiate the scoped atomic instructions, one defm per PTX operation.
// The string argument is used both as the opcode in the asm string and as
// the <op> component of the int_nvvm_atomic_<op>_... intrinsic name.
defm INT_PTX_SATOM_ADD : ATOM2_add_impl<"add">;
|
||||
defm INT_PTX_SATOM_AND : ATOM2_bitwise_impl<"and">;
|
||||
defm INT_PTX_SATOM_CAS : ATOM3_cas_impl<"cas">;
|
||||
defm INT_PTX_SATOM_DEC : ATOM2_incdec_impl<"dec">;
|
||||
defm INT_PTX_SATOM_EXCH: ATOM2_exch_impl<"exch">;
|
||||
defm INT_PTX_SATOM_INC : ATOM2_incdec_impl<"inc">;
|
||||
defm INT_PTX_SATOM_MAX : ATOM2_minmax_impl<"max">;
|
||||
defm INT_PTX_SATOM_MIN : ATOM2_minmax_impl<"min">;
|
||||
defm INT_PTX_SATOM_OR : ATOM2_bitwise_impl<"or">;
|
||||
defm INT_PTX_SATOM_XOR : ATOM2_bitwise_impl<"xor">;
|
||||
|
||||
//-----------------------------------
|
||||
// Support for ldu on sm_20 or later
|
||||
|
|
|
@ -29,8 +29,6 @@ void NVPTXSubtarget::anchor() {}
|
|||
NVPTXSubtarget &NVPTXSubtarget::initializeSubtargetDependencies(StringRef CPU,
|
||||
StringRef FS) {
|
||||
// Provide the default CPU if we don't have one.
|
||||
if (CPU.empty() && FS.size())
|
||||
llvm_unreachable("we are not using FeatureStr");
|
||||
TargetName = CPU.empty() ? "sm_20" : CPU;
|
||||
|
||||
ParseSubtargetFeatures(TargetName, FS);
|
||||
|
|
|
@ -48,6 +48,10 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
|
|||
// FrameLowering class because TargetFrameLowering is abstract.
|
||||
NVPTXFrameLowering FrameLowering;
|
||||
|
||||
protected:
|
||||
// Processor supports scoped atomic operations.
// Backs hasAtomScope(). Default-initialized to false so the flag has a
// defined value even if nothing assigns it.
// NOTE(review): presumably it is only assigned when the "satom" subtarget
// feature is enabled; the constructor is not visible here - confirm.
bool HasAtomScope = false;
|
||||
|
||||
public:
|
||||
/// This constructor initializes the data members to match that
|
||||
/// of the specified module.
|
||||
|
@ -77,6 +81,10 @@ public:
|
|||
bool hasAtomRedGen32() const { return SmVersion >= 20; }
|
||||
bool hasAtomRedGen64() const { return SmVersion >= 20; }
|
||||
bool hasAtomAddF32() const { return SmVersion >= 20; }
|
||||
bool hasAtomAddF64() const { return SmVersion >= 60; }
|
||||
bool hasAtomScope() const { return HasAtomScope; }
|
||||
bool hasAtomBitwise64() const { return SmVersion >= 32; }
|
||||
bool hasAtomMinMax64() const { return SmVersion >= 32; }
|
||||
bool hasVote() const { return SmVersion >= 12; }
|
||||
bool hasDouble() const { return SmVersion >= 13; }
|
||||
bool reqPTX20() const { return SmVersion >= 20; }
|
||||
|
|
|
@ -42,6 +42,29 @@ static bool isNVVMAtomic(const IntrinsicInst *II) {
|
|||
case Intrinsic::nvvm_atomic_load_add_f32:
|
||||
case Intrinsic::nvvm_atomic_load_inc_32:
|
||||
case Intrinsic::nvvm_atomic_load_dec_32:
|
||||
|
||||
case Intrinsic::nvvm_atomic_add_gen_f_cta:
|
||||
case Intrinsic::nvvm_atomic_add_gen_f_sys:
|
||||
case Intrinsic::nvvm_atomic_add_gen_i_cta:
|
||||
case Intrinsic::nvvm_atomic_add_gen_i_sys:
|
||||
case Intrinsic::nvvm_atomic_and_gen_i_cta:
|
||||
case Intrinsic::nvvm_atomic_and_gen_i_sys:
|
||||
case Intrinsic::nvvm_atomic_cas_gen_i_cta:
|
||||
case Intrinsic::nvvm_atomic_cas_gen_i_sys:
|
||||
case Intrinsic::nvvm_atomic_dec_gen_i_cta:
|
||||
case Intrinsic::nvvm_atomic_dec_gen_i_sys:
|
||||
case Intrinsic::nvvm_atomic_inc_gen_i_cta:
|
||||
case Intrinsic::nvvm_atomic_inc_gen_i_sys:
|
||||
case Intrinsic::nvvm_atomic_max_gen_i_cta:
|
||||
case Intrinsic::nvvm_atomic_max_gen_i_sys:
|
||||
case Intrinsic::nvvm_atomic_min_gen_i_cta:
|
||||
case Intrinsic::nvvm_atomic_min_gen_i_sys:
|
||||
case Intrinsic::nvvm_atomic_or_gen_i_cta:
|
||||
case Intrinsic::nvvm_atomic_or_gen_i_sys:
|
||||
case Intrinsic::nvvm_atomic_exch_gen_i_cta:
|
||||
case Intrinsic::nvvm_atomic_exch_gen_i_sys:
|
||||
case Intrinsic::nvvm_atomic_xor_gen_i_cta:
|
||||
case Intrinsic::nvvm_atomic_xor_gen_i_sys:
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,187 @@
|
|||
; RUN: llc < %s -march=nvptx -mcpu=sm_60 | FileCheck %s -check-prefixes=CHECK,CHECK32
|
||||
; RUN: llc < %s -march=nvptx64 -mcpu=sm_60 | FileCheck %s
|
||||
|
||||
; Each scoped atomic intrinsic call below is expected to lower to the
; atom.<scope>.<op>.<type> instruction named in the check line that
; precedes it.
; CHECK-LABEL: .func test_atomics_scope(
define void @test_atomics_scope(float* %fp, float %f, double* %dfp, double %df,
                                i32* %ip, i32 %i, i32* %uip, i32 %ui,
                                i64* %llp, i64 %ll) #0 {
entry:
; --- add: integer and floating point ---
; CHECK: atom.cta.add.s32
  %add.cta.i32 = tail call i32 @llvm.nvvm.atomic.add.gen.i.cta.i32.p0i32(i32* %ip, i32 %i)
; CHECK: atom.cta.add.u64
  %add.cta.i64 = tail call i64 @llvm.nvvm.atomic.add.gen.i.cta.i64.p0i64(i64* %llp, i64 %ll)
; CHECK: atom.sys.add.s32
  %add.sys.i32 = tail call i32 @llvm.nvvm.atomic.add.gen.i.sys.i32.p0i32(i32* %ip, i32 %i)
; CHECK: atom.sys.add.u64
  %add.sys.i64 = tail call i64 @llvm.nvvm.atomic.add.gen.i.sys.i64.p0i64(i64* %llp, i64 %ll)
; CHECK: atom.cta.add.f32
  %add.cta.f32 = tail call float @llvm.nvvm.atomic.add.gen.f.cta.f32.p0f32(float* %fp, float %f)
; CHECK: atom.cta.add.f64
  %add.cta.f64 = tail call double @llvm.nvvm.atomic.add.gen.f.cta.f64.p0f64(double* %dfp, double %df)
; CHECK: atom.sys.add.f32
  %add.sys.f32 = tail call float @llvm.nvvm.atomic.add.gen.f.sys.f32.p0f32(float* %fp, float %f)
; CHECK: atom.sys.add.f64
  %add.sys.f64 = tail call double @llvm.nvvm.atomic.add.gen.f.sys.f64.p0f64(double* %dfp, double %df)

; --- exch ---
; CHECK: atom.cta.exch.b32
  %exch.cta.i32 = tail call i32 @llvm.nvvm.atomic.exch.gen.i.cta.i32.p0i32(i32* %ip, i32 %i)
; CHECK: atom.cta.exch.b64
  %exch.cta.i64 = tail call i64 @llvm.nvvm.atomic.exch.gen.i.cta.i64.p0i64(i64* %llp, i64 %ll)
; CHECK: atom.sys.exch.b32
  %exch.sys.i32 = tail call i32 @llvm.nvvm.atomic.exch.gen.i.sys.i32.p0i32(i32* %ip, i32 %i)
; CHECK: atom.sys.exch.b64
  %exch.sys.i64 = tail call i64 @llvm.nvvm.atomic.exch.gen.i.sys.i64.p0i64(i64* %llp, i64 %ll)

; --- max ---
; CHECK: atom.cta.max.s32
  %max.cta.i32 = tail call i32 @llvm.nvvm.atomic.max.gen.i.cta.i32.p0i32(i32* %ip, i32 %i)
; CHECK: atom.cta.max.s64
  %max.cta.i64 = tail call i64 @llvm.nvvm.atomic.max.gen.i.cta.i64.p0i64(i64* %llp, i64 %ll)
; CHECK: atom.sys.max.s32
  %max.sys.i32 = tail call i32 @llvm.nvvm.atomic.max.gen.i.sys.i32.p0i32(i32* %ip, i32 %i)
; CHECK: atom.sys.max.s64
  %max.sys.i64 = tail call i64 @llvm.nvvm.atomic.max.gen.i.sys.i64.p0i64(i64* %llp, i64 %ll)

; --- min ---
; CHECK: atom.cta.min.s32
  %min.cta.i32 = tail call i32 @llvm.nvvm.atomic.min.gen.i.cta.i32.p0i32(i32* %ip, i32 %i)
; CHECK: atom.cta.min.s64
  %min.cta.i64 = tail call i64 @llvm.nvvm.atomic.min.gen.i.cta.i64.p0i64(i64* %llp, i64 %ll)
; CHECK: atom.sys.min.s32
  %min.sys.i32 = tail call i32 @llvm.nvvm.atomic.min.gen.i.sys.i32.p0i32(i32* %ip, i32 %i)
; CHECK: atom.sys.min.s64
  %min.sys.i64 = tail call i64 @llvm.nvvm.atomic.min.gen.i.sys.i64.p0i64(i64* %llp, i64 %ll)

; --- inc ---
; CHECK: atom.cta.inc.u32
  %inc.cta.i32 = tail call i32 @llvm.nvvm.atomic.inc.gen.i.cta.i32.p0i32(i32* %ip, i32 %i)
; CHECK: atom.sys.inc.u32
  %inc.sys.i32 = tail call i32 @llvm.nvvm.atomic.inc.gen.i.sys.i32.p0i32(i32* %ip, i32 %i)

; --- dec ---
; CHECK: atom.cta.dec.u32
  %dec.cta.i32 = tail call i32 @llvm.nvvm.atomic.dec.gen.i.cta.i32.p0i32(i32* %ip, i32 %i)
; CHECK: atom.sys.dec.u32
  %dec.sys.i32 = tail call i32 @llvm.nvvm.atomic.dec.gen.i.sys.i32.p0i32(i32* %ip, i32 %i)

; --- and ---
; CHECK: atom.cta.and.b32
  %and.cta.i32 = tail call i32 @llvm.nvvm.atomic.and.gen.i.cta.i32.p0i32(i32* %ip, i32 %i)
; CHECK: atom.cta.and.b64
  %and.cta.i64 = tail call i64 @llvm.nvvm.atomic.and.gen.i.cta.i64.p0i64(i64* %llp, i64 %ll)
; CHECK: atom.sys.and.b32
  %and.sys.i32 = tail call i32 @llvm.nvvm.atomic.and.gen.i.sys.i32.p0i32(i32* %ip, i32 %i)
; CHECK: atom.sys.and.b64
  %and.sys.i64 = tail call i64 @llvm.nvvm.atomic.and.gen.i.sys.i64.p0i64(i64* %llp, i64 %ll)

; --- or ---
; CHECK: atom.cta.or.b32
  %or.cta.i32 = tail call i32 @llvm.nvvm.atomic.or.gen.i.cta.i32.p0i32(i32* %ip, i32 %i)
; CHECK: atom.cta.or.b64
  %or.cta.i64 = tail call i64 @llvm.nvvm.atomic.or.gen.i.cta.i64.p0i64(i64* %llp, i64 %ll)
; CHECK: atom.sys.or.b32
  %or.sys.i32 = tail call i32 @llvm.nvvm.atomic.or.gen.i.sys.i32.p0i32(i32* %ip, i32 %i)
; CHECK: atom.sys.or.b64
  %or.sys.i64 = tail call i64 @llvm.nvvm.atomic.or.gen.i.sys.i64.p0i64(i64* %llp, i64 %ll)

; --- xor ---
; CHECK: atom.cta.xor.b32
  %xor.cta.i32 = tail call i32 @llvm.nvvm.atomic.xor.gen.i.cta.i32.p0i32(i32* %ip, i32 %i)
; CHECK: atom.cta.xor.b64
  %xor.cta.i64 = tail call i64 @llvm.nvvm.atomic.xor.gen.i.cta.i64.p0i64(i64* %llp, i64 %ll)
; CHECK: atom.sys.xor.b32
  %xor.sys.i32 = tail call i32 @llvm.nvvm.atomic.xor.gen.i.sys.i32.p0i32(i32* %ip, i32 %i)
; CHECK: atom.sys.xor.b64
  %xor.sys.i64 = tail call i64 @llvm.nvvm.atomic.xor.gen.i.sys.i64.p0i64(i64* %llp, i64 %ll)

; --- cas (three operands) ---
; CHECK: atom.cta.cas.b32
  %cas.cta.i32 = tail call i32 @llvm.nvvm.atomic.cas.gen.i.cta.i32.p0i32(i32* %ip, i32 %i, i32 %i)
; CHECK: atom.cta.cas.b64
  %cas.cta.i64 = tail call i64 @llvm.nvvm.atomic.cas.gen.i.cta.i64.p0i64(i64* %llp, i64 %ll, i64 %ll)
; CHECK: atom.sys.cas.b32
  %cas.sys.i32 = tail call i32 @llvm.nvvm.atomic.cas.gen.i.sys.i32.p0i32(i32* %ip, i32 %i, i32 %i)
; CHECK: atom.sys.cas.b64
  %cas.sys.i64 = tail call i64 @llvm.nvvm.atomic.cas.gen.i.sys.i64.p0i64(i64* %llp, i64 %ll, i64 %ll)

; CHECK: ret
  ret void
}
|
||||
|
||||
; Make sure we use constants as operands to our scoped atomic calls, where appropriate.
; Register numbers are matched with [0-9]+ (one or more digits) so the checks
; do not depend on the register allocator handing out single-digit numbers.
; CHECK-LABEL: .func test_atomics_scope_imm(
define void @test_atomics_scope_imm(float* %fp, float %f,
                                    double* %dfp, double %df,
                                    i32* %ip, i32 %i,
                                    i32* %uip, i32 %ui,
                                    i64* %llp, i64 %ll) #0 {

; CHECK: atom.cta.add.s32{{.*}}, %r{{[0-9]+}};
  %tmp1r = tail call i32 @llvm.nvvm.atomic.add.gen.i.cta.i32.p0i32(i32* %ip, i32 %i)
; CHECK: atom.cta.add.s32{{.*}}, 1;
  %tmp1i = tail call i32 @llvm.nvvm.atomic.add.gen.i.cta.i32.p0i32(i32* %ip, i32 1)
; CHECK: atom.cta.add.u64{{.*}}, %rd{{[0-9]+}};
  %tmp2r = tail call i64 @llvm.nvvm.atomic.add.gen.i.cta.i64.p0i64(i64* %llp, i64 %ll)
; CHECK: atom.cta.add.u64{{.*}}, 2;
  %tmp2i = tail call i64 @llvm.nvvm.atomic.add.gen.i.cta.i64.p0i64(i64* %llp, i64 2)

; CHECK: atom.cta.add.f32{{.*}}, %f{{[0-9]+}};
  %tmp3r = tail call float @llvm.nvvm.atomic.add.gen.f.cta.f32.p0f32(float* %fp, float %f)
; CHECK: atom.cta.add.f32{{.*}}, 0f40400000;
  %tmp3i = tail call float @llvm.nvvm.atomic.add.gen.f.cta.f32.p0f32(float* %fp, float 3.0)
; CHECK: atom.cta.add.f64{{.*}}, %fd{{[0-9]+}};
  %tmp4r = tail call double @llvm.nvvm.atomic.add.gen.f.cta.f64.p0f64(double* %dfp, double %df)
; CHECK: atom.cta.add.f64{{.*}}, 0d4010000000000000;
  %tmp4i = tail call double @llvm.nvvm.atomic.add.gen.f.cta.f64.p0f64(double* %dfp, double 4.0)

; CAS is implemented separately and has more arguments
; CHECK: atom.cta.cas.b32{{.*}}], %r{{[0-9]+}}, %r{{[0-9]+}};
  %tmp5rr = tail call i32 @llvm.nvvm.atomic.cas.gen.i.cta.i32.p0i32(i32* %ip, i32 %i, i32 %i)
; For some reason in 64-bit mode we end up passing 51 via a register.
; CHECK32: atom.cta.cas.b32{{.*}}], %r{{[0-9]+}}, 51;
  %tmp5ri = tail call i32 @llvm.nvvm.atomic.cas.gen.i.cta.i32.p0i32(i32* %ip, i32 %i, i32 51)
; CHECK: atom.cta.cas.b32{{.*}}], 52, %r{{[0-9]+}};
  %tmp5ir = tail call i32 @llvm.nvvm.atomic.cas.gen.i.cta.i32.p0i32(i32* %ip, i32 52, i32 %i)
; CHECK: atom.cta.cas.b32{{.*}}], 53, 54;
  %tmp5ii = tail call i32 @llvm.nvvm.atomic.cas.gen.i.cta.i32.p0i32(i32* %ip, i32 53, i32 54)

; CHECK: ret
  ret void
}
|
||||
|
||||
declare i32 @llvm.nvvm.atomic.add.gen.i.cta.i32.p0i32(i32* nocapture, i32) #1
|
||||
declare i64 @llvm.nvvm.atomic.add.gen.i.cta.i64.p0i64(i64* nocapture, i64) #1
|
||||
declare i32 @llvm.nvvm.atomic.add.gen.i.sys.i32.p0i32(i32* nocapture, i32) #1
|
||||
declare i64 @llvm.nvvm.atomic.add.gen.i.sys.i64.p0i64(i64* nocapture, i64) #1
|
||||
declare float @llvm.nvvm.atomic.add.gen.f.cta.f32.p0f32(float* nocapture, float) #1
|
||||
declare double @llvm.nvvm.atomic.add.gen.f.cta.f64.p0f64(double* nocapture, double) #1
|
||||
declare float @llvm.nvvm.atomic.add.gen.f.sys.f32.p0f32(float* nocapture, float) #1
|
||||
declare double @llvm.nvvm.atomic.add.gen.f.sys.f64.p0f64(double* nocapture, double) #1
|
||||
declare i32 @llvm.nvvm.atomic.exch.gen.i.cta.i32.p0i32(i32* nocapture, i32) #1
|
||||
declare i64 @llvm.nvvm.atomic.exch.gen.i.cta.i64.p0i64(i64* nocapture, i64) #1
|
||||
declare i32 @llvm.nvvm.atomic.exch.gen.i.sys.i32.p0i32(i32* nocapture, i32) #1
|
||||
declare i64 @llvm.nvvm.atomic.exch.gen.i.sys.i64.p0i64(i64* nocapture, i64) #1
|
||||
declare i32 @llvm.nvvm.atomic.max.gen.i.cta.i32.p0i32(i32* nocapture, i32) #1
|
||||
declare i64 @llvm.nvvm.atomic.max.gen.i.cta.i64.p0i64(i64* nocapture, i64) #1
|
||||
declare i32 @llvm.nvvm.atomic.max.gen.i.sys.i32.p0i32(i32* nocapture, i32) #1
|
||||
declare i64 @llvm.nvvm.atomic.max.gen.i.sys.i64.p0i64(i64* nocapture, i64) #1
|
||||
declare i32 @llvm.nvvm.atomic.min.gen.i.cta.i32.p0i32(i32* nocapture, i32) #1
|
||||
declare i64 @llvm.nvvm.atomic.min.gen.i.cta.i64.p0i64(i64* nocapture, i64) #1
|
||||
declare i32 @llvm.nvvm.atomic.min.gen.i.sys.i32.p0i32(i32* nocapture, i32) #1
|
||||
declare i64 @llvm.nvvm.atomic.min.gen.i.sys.i64.p0i64(i64* nocapture, i64) #1
|
||||
declare i32 @llvm.nvvm.atomic.inc.gen.i.cta.i32.p0i32(i32* nocapture, i32) #1
|
||||
declare i32 @llvm.nvvm.atomic.inc.gen.i.sys.i32.p0i32(i32* nocapture, i32) #1
|
||||
declare i32 @llvm.nvvm.atomic.dec.gen.i.cta.i32.p0i32(i32* nocapture, i32) #1
|
||||
declare i32 @llvm.nvvm.atomic.dec.gen.i.sys.i32.p0i32(i32* nocapture, i32) #1
|
||||
declare i32 @llvm.nvvm.atomic.and.gen.i.cta.i32.p0i32(i32* nocapture, i32) #1
|
||||
declare i64 @llvm.nvvm.atomic.and.gen.i.cta.i64.p0i64(i64* nocapture, i64) #1
|
||||
declare i32 @llvm.nvvm.atomic.and.gen.i.sys.i32.p0i32(i32* nocapture, i32) #1
|
||||
declare i64 @llvm.nvvm.atomic.and.gen.i.sys.i64.p0i64(i64* nocapture, i64) #1
|
||||
declare i32 @llvm.nvvm.atomic.or.gen.i.cta.i32.p0i32(i32* nocapture, i32) #1
|
||||
declare i64 @llvm.nvvm.atomic.or.gen.i.cta.i64.p0i64(i64* nocapture, i64) #1
|
||||
declare i32 @llvm.nvvm.atomic.or.gen.i.sys.i32.p0i32(i32* nocapture, i32) #1
|
||||
declare i64 @llvm.nvvm.atomic.or.gen.i.sys.i64.p0i64(i64* nocapture, i64) #1
|
||||
declare i32 @llvm.nvvm.atomic.xor.gen.i.cta.i32.p0i32(i32* nocapture, i32) #1
|
||||
declare i64 @llvm.nvvm.atomic.xor.gen.i.cta.i64.p0i64(i64* nocapture, i64) #1
|
||||
declare i32 @llvm.nvvm.atomic.xor.gen.i.sys.i32.p0i32(i32* nocapture, i32) #1
|
||||
declare i64 @llvm.nvvm.atomic.xor.gen.i.sys.i64.p0i64(i64* nocapture, i64) #1
|
||||
declare i32 @llvm.nvvm.atomic.cas.gen.i.cta.i32.p0i32(i32* nocapture, i32, i32) #1
|
||||
declare i64 @llvm.nvvm.atomic.cas.gen.i.cta.i64.p0i64(i64* nocapture, i64, i64) #1
|
||||
declare i32 @llvm.nvvm.atomic.cas.gen.i.sys.i32.p0i32(i32* nocapture, i32, i32) #1
|
||||
declare i64 @llvm.nvvm.atomic.cas.gen.i.sys.i64.p0i64(i64* nocapture, i64, i64) #1
|
||||
|
||||
attributes #1 = { argmemonly nounwind }
|
Loading…
Reference in New Issue