AMDGPU: Add intrinsic for s_setreg

This will be more useful with fenv access implemented.
2020-05-06 16:43:33 -04:00 · 2020-05-06 16:43:33 -04:00 · 97f3f0bab0
parent 504d8d9d8a
commit 97f3f0bab0
11 changed files with 164 additions and 24 deletions
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@ -44,6 +44,7 @@ BUILTIN(__builtin_amdgcn_mbcnt_lo, "UiUiUi", "nc")
 // Instruction builtins.
 //===----------------------------------------------------------------------===//
 BUILTIN(__builtin_amdgcn_s_getreg, "UiIi", "n")
+BUILTIN(__builtin_amdgcn_s_setreg, "vIiUi", "n")
 BUILTIN(__builtin_amdgcn_s_getpc, "LUi", "n")
 BUILTIN(__builtin_amdgcn_s_waitcnt, "vIi", "n")
 BUILTIN(__builtin_amdgcn_s_sendmsg, "vIiUi", "n")
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@ -715,6 +715,12 @@ kernel void test_mqsad_u32_u8(global uint4* out, ulong src0, uint src1, uint4 sr
  *out = __builtin_amdgcn_mqsad_u32_u8(src0, src1, src2);
 }

+// CHECK-LABEL: test_s_setreg(
+// CHECK: call void @llvm.amdgcn.s.setreg(i32 8193, i32 %val)
+kernel void test_s_setreg(uint val) {
+  __builtin_amdgcn_s_setreg(8193, val);
+}
+
 // CHECK-DAG: [[$WI_RANGE]] = !{i32 0, i32 1024}
 // CHECK-DAG: [[$WS_RANGE]] = !{i16 1, i16 1025}
 // CHECK-DAG: attributes #[[$NOUNWIND_READONLY:[0-9]+]] = { nounwind readonly }
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error.cl
@ -139,3 +139,8 @@ void test_fence() {
  const char ptr[] = "workgroup";
  __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, ptr); // expected-error {{expression is not a string literal}}
 }
+
+void test_s_setreg(int x, int y) {
+  __builtin_amdgcn_s_setreg(x, 0); // expected-error {{argument to '__builtin_amdgcn_s_setreg' must be a constant integer}}
+  __builtin_amdgcn_s_setreg(x, y); // expected-error {{argument to '__builtin_amdgcn_s_setreg' must be a constant integer}}
+}
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@ -1207,6 +1207,16 @@ def int_amdgcn_s_getreg :
  [IntrInaccessibleMemOnly, IntrReadMem, IntrSpeculatable, ImmArg<ArgIndex<0>>]
 >;

+// Note this can be used to set FP environment properties that are
+// unsafe to change in non-strictfp functions. The register properties
+// available (and value required to access them) may differ per
+// subtarget. llvm.amdgcn.s.setreg(hwmode, value)
+def int_amdgcn_s_setreg :
+  GCCBuiltin<"__builtin_amdgcn_s_setreg">,
+  Intrinsic<[], [llvm_i32_ty, llvm_i32_ty],
+  [IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>]
+>;
+
 // int_amdgcn_s_getpc is provided to allow a specific style of position
 // independent code to determine the high part of its address when it is
 // known (through convention) that the code and any data of interest does
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@ -202,13 +202,6 @@ def AMDGPUSetCCOp : SDTypeProfile<1, 3, [        // setcc

 def AMDGPUsetcc : SDNode<"AMDGPUISD::SETCC", AMDGPUSetCCOp>;

-def AMDGPUSetRegOp :  SDTypeProfile<0, 2, [
-  SDTCisInt<0>, SDTCisInt<1>
-]>;
-
-def AMDGPUsetreg : SDNode<"AMDGPUISD::SETREG", AMDGPUSetRegOp, [
-  SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>;
-
 def AMDGPUfma : SDNode<"AMDGPUISD::FMA_W_CHAIN", SDTFPTernaryOp, [
   SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;

--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@ -2783,6 +2783,10 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
      constrainOpWithReadfirstlane(MI, MRI, 2); // M0
      return;
    }
+    case Intrinsic::amdgcn_s_setreg: {
+      constrainOpWithReadfirstlane(MI, MRI, 2);
+      return;
+    }
    default: {
      if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
              AMDGPU::lookupRsrcIntrinsic(IntrID)) {
@ -3924,6 +3928,13 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
+    case Intrinsic::amdgcn_s_setreg: {
+      // This must be an SGPR, but accept a VGPR.
+      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
+                                   AMDGPU::SGPRRegBankID);
+      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
+      break;
+    }
    case Intrinsic::amdgcn_end_cf:
    case Intrinsic::amdgcn_init_exec: {
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@ -7977,32 +7977,32 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
  const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE |
                               (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
                               (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
-  const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16);
+  const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);

  const bool HasFP32Denormals = hasFP32Denormals(DAG.getMachineFunction());

  if (!HasFP32Denormals) {
    SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);

-    SDValue EnableDenorm;
+    SDNode *EnableDenorm;
    if (Subtarget->hasDenormModeInst()) {
      const SDValue EnableDenormValue =
          getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, SL, Subtarget);

      EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
-                                 DAG.getEntryNode(), EnableDenormValue);
+                                 DAG.getEntryNode(), EnableDenormValue).getNode();
    } else {
      const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
                                                        SL, MVT::i32);
-      EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs,
-                                 DAG.getEntryNode(), EnableDenormValue,
-                                 BitField);
+      EnableDenorm =
+          DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
+                             {EnableDenormValue, BitField, DAG.getEntryNode()});
    }

    SDValue Ops[3] = {
      NegDivScale0,
-      EnableDenorm.getValue(0),
-      EnableDenorm.getValue(1)
+      SDValue(EnableDenorm, 0),
+      SDValue(EnableDenorm, 1)
    };

    NegDivScale0 = DAG.getMergeValues(Ops, SL);
@ -8026,25 +8026,25 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
                             NumeratorScaled, Fma3);

  if (!HasFP32Denormals) {
-    SDValue DisableDenorm;
+    SDNode *DisableDenorm;
    if (Subtarget->hasDenormModeInst()) {
      const SDValue DisableDenormValue =
          getSPDenormModeValue(FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, SL, Subtarget);

      DisableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, MVT::Other,
                                  Fma4.getValue(1), DisableDenormValue,
-                                  Fma4.getValue(2));
+                                  Fma4.getValue(2)).getNode();
    } else {
      const SDValue DisableDenormValue =
          DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);

-      DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other,
-                                  Fma4.getValue(1), DisableDenormValue,
-                                  BitField, Fma4.getValue(2));
+      DisableDenorm = DAG.getMachineNode(
+          AMDGPU::S_SETREG_B32, SL, MVT::Other,
+          {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
    }

    SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
-                                      DisableDenorm, DAG.getRoot());
+                                      SDValue(DisableDenorm, 0), DAG.getRoot());
    DAG.setRoot(OutputChain);
  }

--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@ -1131,7 +1131,7 @@ def blgp : NamedOperandU32<"BLGP", NamedMatchClass<"BLGP">>;
 def cbsz : NamedOperandU32<"CBSZ", NamedMatchClass<"CBSZ">>;
 def abid : NamedOperandU32<"ABID", NamedMatchClass<"ABID">>;

-def hwreg : NamedOperandU16<"Hwreg", NamedMatchClass<"Hwreg", 0>>;
+def hwreg : NamedOperandU32<"Hwreg", NamedMatchClass<"Hwreg", 0>>;

 def exp_tgt : NamedOperandU32<"ExpTgt", NamedMatchClass<"ExpTgt", 0>> {

--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@ -801,13 +801,13 @@ def S_GETREG_B32 : SOPK_Pseudo <
 >;
 }

-let hasSideEffects = 1 in {
+let hasSideEffects = 1, mayLoad = 0, mayStore =0 in {

 def S_SETREG_B32 : SOPK_Pseudo <
  "s_setreg_b32",
  (outs), (ins SReg_32:$sdst, hwreg:$simm16),
  "$simm16, $sdst",
-  [(AMDGPUsetreg i32:$sdst, (i16 timm:$simm16))]> {
+  [(int_amdgcn_s_setreg (i32 timm:$simm16), i32:$sdst)]> {
  let Defs = [MODE];
  let Uses = [MODE];
 }
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.setreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.setreg.ll
@ -0,0 +1,59 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+
+; Set FP32 fp_round to round to zero
+define amdgpu_kernel void @test_setreg_f32_round_mode_rtz() {
+; GCN-LABEL: test_setreg_f32_round_mode_rtz:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 3), 3
+; GCN-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 4097, i32 3)
+  ret void
+}
+
+; Set FP64/FP16 fp_round to round to zero
+define amdgpu_kernel void @test_setreg_f64_round_mode_rtz() {
+; GCN-LABEL: test_setreg_f64_round_mode_rtz:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 3), 3
+; GCN-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 4225, i32 3)
+  ret void
+}
+
+; Set all fp_round to round to zero
+define amdgpu_kernel void @test_setreg_all_round_mode_rtz() {
+; GCN-LABEL: test_setreg_all_round_mode_rtz:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 5), 7
+; GCN-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 8193, i32 7)
+  ret void
+}
+
+; Set FP32 fp_round to dynamic mode
+define amdgpu_cs void @test_setreg_roundingmode_var(i32 inreg %var.mode) {
+; GCN-LABEL: test_setreg_roundingmode_var:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 3), s0
+; GCN-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 4097, i32 %var.mode)
+  ret void
+}
+
+define void @test_setreg_roundingmode_var_vgpr(i32 %var.mode) {
+; GCN-LABEL: test_setreg_roundingmode_var_vgpr:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_readfirstlane_b32 s4, v0
+; GCN-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 3), s4
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.amdgcn.s.setreg(i32 4097, i32 %var.mode)
+  ret void
+}
+
+declare void @llvm.amdgcn.s.setreg(i32 immarg, i32) #0
+
+attributes #0 = { nounwind }
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setreg.ll
@ -0,0 +1,55 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; FIXME: This copy of the test is a subset of the -global-isel version, since the VGPR case doesn't work.
+
+; Set FP32 fp_round to round to zero
+define amdgpu_kernel void @test_setreg_f32_round_mode_rtz() {
+; GCN-LABEL: test_setreg_f32_round_mode_rtz:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 3), 3
+; GCN-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 4097, i32 3)
+  ret void
+}
+
+; Set FP64/FP16 fp_round to round to zero
+define amdgpu_kernel void @test_setreg_f64_round_mode_rtz() {
+; GCN-LABEL: test_setreg_f64_round_mode_rtz:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 3), 3
+; GCN-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 4225, i32 3)
+  ret void
+}
+
+; Set all fp_round to round to zero
+define amdgpu_kernel void @test_setreg_all_round_mode_rtz() {
+; GCN-LABEL: test_setreg_all_round_mode_rtz:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 5), 7
+; GCN-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 8193, i32 7)
+  ret void
+}
+
+; Set FP32 fp_round to dynamic mode
+define amdgpu_cs void @test_setreg_roundingmode_var(i32 inreg %var.mode) {
+; GCN-LABEL: test_setreg_roundingmode_var:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 3), s0
+; GCN-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 4097, i32 %var.mode)
+  ret void
+}
+
+; FIXME: Broken for DAG
+; define void @test_setreg_roundingmode_var_vgpr(i32 %var.mode) {
+;   call void @llvm.amdgcn.s.setreg(i32 4097, i32 %var.mode)
+;   ret void
+; }
+
+declare void @llvm.amdgcn.s.setreg(i32 immarg, i32) #0
+
+attributes #0 = { nounwind }