[AMDGPU] gfx1010 wait count insertion
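
gfx10 splits the vector memory counter in two: vmcnt now counts vector
memory reads, while the new vscnt counter counts vector memory writes and
is waited on with the new S_WAITCNT_VSCNT instruction. The pass learns to
track the extra counter, treats the gfx10 BUFFER_GL0_INV and
BUFFER_GL1_INV cache invalidates like the existing WBINVL1 family, and
emits "s_waitcnt_vscnt null, 0" next to the existing "s_waitcnt 0" at
function entry on targets that have vscnt.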

Differential Revision: https://reviews.llvm.org/D61534

llvm-svn: 359938
commit d9dcf392c7 (parent 41bbe101a2)
Author: Stanislav Mekhanoshin
Date:   2019-05-03 21:53:53 +00:00

2 changed files with 405 additions and 57 deletions

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

@@ -100,7 +100,7 @@ public:
#define CNT_MASK(t) (1u << (t))
enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, NUM_INST_CNTS };
enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, VS_CNT, NUM_INST_CNTS };
iterator_range<enum_iterator<InstCounterType>> inst_counter_types() {
return make_range(enum_iterator<InstCounterType>(VM_CNT),
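With VS_CNT added ahead of NUM_INST_CNTS, every loop written over
inst_counter_types() picks up the new counter automatically. A minimal
sketch of the idiom (simplified: the real code iterates with LLVM's
enum_iterator, and forAllCounters is my name, not the patch's):

  enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, VS_CNT, NUM_INST_CNTS };

  void forAllCounters(void (*Visit)(InstCounterType)) {
    // Visits VM_CNT, LGKM_CNT, EXP_CNT and, after this change, VS_CNT.
    for (unsigned T = VM_CNT; T != NUM_INST_CNTS; ++T)
      Visit(static_cast<InstCounterType>(T));
  }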
@@ -113,6 +113,7 @@ struct {
uint32_t VmcntMax;
uint32_t ExpcntMax;
uint32_t LgkmcntMax;
uint32_t VscntMax;
int32_t NumVGPRsMax;
int32_t NumSGPRsMax;
} HardwareLimits;
@@ -126,6 +127,8 @@ struct {
enum WaitEventType {
VMEM_ACCESS, // vector-memory read & write
VMEM_READ_ACCESS, // vector-memory read
VMEM_WRITE_ACCESS, // vector-memory write
LDS_ACCESS, // lds read & write
GDS_ACCESS, // gds read & write
SQ_MESSAGE, // send message
@@ -139,11 +142,12 @@ enum WaitEventType {
};
static const uint32_t WaitEventMaskForInst[NUM_INST_CNTS] = {
(1 << VMEM_ACCESS),
(1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS),
(1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
(1 << SQ_MESSAGE),
(1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
(1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS),
(1 << VMEM_WRITE_ACCESS)
};
// The mapping is:
@@ -171,6 +175,9 @@ void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
case LGKM_CNT:
Wait.LgkmCnt = std::min(Wait.LgkmCnt, Count);
break;
case VS_CNT:
Wait.VsCnt = std::min(Wait.VsCnt, Count);
break;
default:
llvm_unreachable("bad InstCounterType");
}
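addWait clamps with std::min because an s_waitcnt operand N means "stall
until the counter has drained to at most N": when two waits apply to the
same counter, the smaller (stricter) bound must win. A self-contained
sketch of the same rule as a field-wise merge (the real struct is
AMDGPU::Waitcnt in AMDGPUBaseInfo; this mirrors its combined() under that
assumption):

  #include <algorithm>

  struct Waitcnt {
    // ~0u encodes "no wait requested" for a counter.
    unsigned VmCnt = ~0u, ExpCnt = ~0u, LgkmCnt = ~0u, VsCnt = ~0u;

    // Merge two waits: per counter, the stricter (smaller) bound wins.
    Waitcnt combined(const Waitcnt &O) const {
      return {std::min(VmCnt, O.VmCnt), std::min(ExpCnt, O.ExpCnt),
              std::min(LgkmCnt, O.LgkmCnt), std::min(VsCnt, O.VsCnt)};
    }
  };

This is also why the return and s_barrier paths below switch from
"Wait = allZero(IV)" to "Wait = Wait.combined(allZero(IV))": a wait
already accumulated for other reasons must survive the merge instead of
being overwritten.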
@@ -199,6 +206,8 @@ public:
return HardwareLimits.LgkmcntMax;
case EXP_CNT:
return HardwareLimits.ExpcntMax;
case VS_CNT:
return HardwareLimits.VscntMax;
default:
break;
}
@@ -221,10 +230,12 @@ public:
// Mapping from event to counter.
InstCounterType eventCounter(WaitEventType E) {
if (E == VMEM_ACCESS)
if (WaitEventMaskForInst[VM_CNT] & (1 << E))
return VM_CNT;
if (WaitEventMaskForInst[LGKM_CNT] & (1 << E))
return LGKM_CNT;
if (WaitEventMaskForInst[VS_CNT] & (1 << E))
return VS_CNT;
assert(WaitEventMaskForInst[EXP_CNT] & (1 << E));
return EXP_CNT;
}
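eventCounter() is now table-driven: instead of special-casing VMEM_ACCESS,
it finds the counter whose WaitEventMaskForInst entry contains the event.
That lookup is only well-defined if every event belongs to exactly one
counter's mask. A small sketch of that invariant as a check (hypothetical
helper, not in the patch):

  #include <cstdint>

  // True if no wait event bit is claimed by two different counters.
  bool eventMasksAreDisjoint(const uint32_t *Masks, unsigned NumCounters) {
    uint32_t Seen = 0;
    for (unsigned T = 0; T < NumCounters; ++T) {
      if (Seen & Masks[T])
        return false; // an event mapped to two counters
      Seen |= Masks[T];
    }
    return true;
  }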
@@ -665,6 +676,9 @@ void WaitcntBrackets::print(raw_ostream &OS) {
case EXP_CNT:
OS << " EXP_CNT(" << UB - LB << "): ";
break;
case VS_CNT:
OS << " VS_CNT(" << UB - LB << "): ";
break;
default:
OS << " UNKNOWN(" << UB - LB << "): ";
break;
@@ -704,7 +718,8 @@ void WaitcntBrackets::print(raw_ostream &OS) {
bool WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
return simplifyWaitcnt(VM_CNT, Wait.VmCnt) |
simplifyWaitcnt(EXP_CNT, Wait.ExpCnt) |
simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt) |
simplifyWaitcnt(VS_CNT, Wait.VsCnt);
}
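Note the bitwise | rather than ||: each simplifyWaitcnt(T, Count) call can
rewrite Count as a side effect (clearing waits the score brackets prove
unnecessary), so all four calls must run even when an earlier one has
already returned true. A standalone demonstration of the difference:

  #include <cstdio>

  static bool simplify(const char *Name) {
    std::printf("simplified %s\n", Name); // a side effect that must not be skipped
    return true;
  }

  int main() {
    // `|` evaluates every operand; `||` would short-circuit after "vmcnt".
    bool Changed = simplify("vmcnt") | simplify("expcnt") |
                   simplify("lgkmcnt") | simplify("vscnt");
    return Changed ? 0 : 1;
  }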
bool WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
@@ -747,6 +762,7 @@ void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
applyWaitcnt(VM_CNT, Wait.VmCnt);
applyWaitcnt(EXP_CNT, Wait.ExpCnt);
applyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
applyWaitcnt(VS_CNT, Wait.VsCnt);
}
void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
@@ -817,7 +833,9 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
// TODO: Handle other cases of NeedsWaitcntVmBefore()
if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) {
MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
Wait.VmCnt = 0;
}
@@ -826,7 +844,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
// with knowledge of the called routines.
if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
MI.getOpcode() == AMDGPU::S_SETPC_B64_return) {
Wait = AMDGPU::Waitcnt::allZero(IV);
Wait = Wait.combined(AMDGPU::Waitcnt::allZero(IV));
}
// Resolve vm waits before gs-done.
else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
@@ -998,7 +1016,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
// requiring a WAITCNT beforehand.
if (MI.getOpcode() == AMDGPU::S_BARRIER &&
!ST->hasAutoWaitcntBeforeBarrier()) {
Wait = AMDGPU::Waitcnt::allZero(IV);
Wait = Wait.combined(AMDGPU::Waitcnt::allZero(IV));
}
// TODO: Remove this work-around, enable the assert for Bug 457939
@@ -1016,15 +1034,25 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
if (!ScoreBrackets.simplifyWaitcnt(Wait) && !IsForceEmitWaitcnt) {
bool Modified = false;
if (OldWaitcntInstr) {
if (TrackedWaitcntSet.count(OldWaitcntInstr)) {
TrackedWaitcntSet.erase(OldWaitcntInstr);
OldWaitcntInstr->eraseFromParent();
Modified = true;
} else {
int64_t Imm = OldWaitcntInstr->getOperand(0).getImm();
ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
for (auto II = OldWaitcntInstr->getIterator(), NextI = std::next(II);
&*II != &MI; II = NextI, ++NextI) {
if (II->isDebugInstr())
continue;
if (TrackedWaitcntSet.count(&*II)) {
TrackedWaitcntSet.erase(&*II);
II->eraseFromParent();
Modified = true;
} else if (II->getOpcode() == AMDGPU::S_WAITCNT) {
int64_t Imm = II->getOperand(0).getImm();
ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
} else {
assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
ScoreBrackets.applyWaitcnt(
AMDGPU::Waitcnt(0, 0, 0, II->getOperand(1).getImm()));
}
}
Modified = true;
}
return Modified;
}
@@ -1038,39 +1066,88 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
Wait.ExpCnt = 0;
if (ForceEmitWaitcnt[LGKM_CNT])
Wait.LgkmCnt = 0;
if (ForceEmitWaitcnt[VS_CNT])
Wait.VsCnt = 0;
ScoreBrackets.applyWaitcnt(Wait);
AMDGPU::Waitcnt OldWait;
bool Modified = false;
if (OldWaitcntInstr) {
OldWait =
AMDGPU::decodeWaitcnt(IV, OldWaitcntInstr->getOperand(0).getImm());
for (auto II = OldWaitcntInstr->getIterator(), NextI = std::next(II);
&*II != &MI; II = NextI, NextI++) {
if (II->isDebugInstr())
continue;
if (II->getOpcode() == AMDGPU::S_WAITCNT) {
unsigned IEnc = II->getOperand(0).getImm();
AMDGPU::Waitcnt IWait = AMDGPU::decodeWaitcnt(IV, IEnc);
OldWait = OldWait.combined(IWait);
if (!TrackedWaitcntSet.count(&*II))
Wait = Wait.combined(IWait);
unsigned NewEnc = AMDGPU::encodeWaitcnt(IV, Wait);
if (IEnc != NewEnc) {
II->getOperand(0).setImm(NewEnc);
Modified = true;
}
Wait.VmCnt = ~0u;
Wait.LgkmCnt = ~0u;
Wait.ExpCnt = ~0u;
} else {
assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
unsigned ICnt = II->getOperand(1).getImm();
OldWait.VsCnt = std::min(OldWait.VsCnt, ICnt);
if (!TrackedWaitcntSet.count(&*II))
Wait.VsCnt = std::min(Wait.VsCnt, ICnt);
if (Wait.VsCnt != ICnt) {
II->getOperand(1).setImm(Wait.VsCnt);
Modified = true;
}
Wait.VsCnt = ~0u;
}
LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n"
<< "Old Instr: " << MI << '\n'
<< "New Instr: " << *II << '\n');
if (!Wait.hasWait())
return Modified;
}
}
if (OldWait.dominates(Wait))
return false;
if (OldWaitcntInstr && !TrackedWaitcntSet.count(OldWaitcntInstr))
Wait = Wait.combined(OldWait);
unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
if (OldWaitcntInstr) {
OldWaitcntInstr->getOperand(0).setImm(Enc);
LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n"
<< "Old Instr: " << MI << '\n'
<< "New Instr: " << *OldWaitcntInstr << '\n');
} else {
if (Wait.VmCnt != ~0u || Wait.LgkmCnt != ~0u || Wait.ExpCnt != ~0u) {
unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
.addImm(Enc);
TrackedWaitcntSet.insert(SWaitInst);
Modified = true;
LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
<< "Old Instr: " << MI << '\n'
<< "New Instr: " << *SWaitInst << '\n');
}
return true;
if (Wait.VsCnt != ~0u) {
assert(ST->hasVscnt());
auto SWaitInst =
BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
TII->get(AMDGPU::S_WAITCNT_VSCNT))
.addReg(AMDGPU::SGPR_NULL, RegState::Undef)
.addImm(Wait.VsCnt);
TrackedWaitcntSet.insert(SWaitInst);
Modified = true;
LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
<< "Old Instr: " << MI << '\n'
<< "New Instr: " << *SWaitInst << '\n');
}
return Modified;
}
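Why a second instruction at all: the legacy s_waitcnt immediate encodes
only vmcnt, expcnt and lgkmcnt fields, so a vscnt wait cannot be folded
into it; it gets its own S_WAITCNT_VSCNT, which takes an SGPR operand (the
null register here) in addition to the immediate.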
// This is a flat memory operation. Check to see if it has memory
@@ -1105,8 +1182,15 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
} else if (TII->isFLAT(Inst)) {
assert(Inst.mayLoad() || Inst.mayStore());
if (TII->usesVM_CNT(Inst))
ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
if (TII->usesVM_CNT(Inst)) {
if (!ST->hasVscnt())
ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
else if (Inst.mayLoad() &&
AMDGPU::getAtomicRetOp(Inst.getOpcode()) == -1)
ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_READ_ACCESS, Inst);
else
ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst);
}
if (TII->usesLGKM_CNT(Inst)) {
ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
@@ -1121,8 +1205,19 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
// TODO: get a better carve out.
Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) {
ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL &&
Inst.getOpcode() != AMDGPU::BUFFER_GL0_INV &&
Inst.getOpcode() != AMDGPU::BUFFER_GL1_INV) {
if (!ST->hasVscnt())
ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
else if ((Inst.mayLoad() &&
AMDGPU::getAtomicRetOp(Inst.getOpcode()) == -1) ||
/* IMAGE_GET_RESINFO / IMAGE_GET_LOD */
(TII->isMIMG(Inst) && !Inst.mayLoad() && !Inst.mayStore()))
ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_READ_ACCESS, Inst);
else if (Inst.mayStore())
ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst);
if (ST->vmemWriteNeedsExpWaitcnt() &&
(Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
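Both VMEM paths above apply one classification rule once the target has a
separate store counter. A hypothetical condensation (classifyVMEMEvent is
my name; the patch spells the conditions out inline at each call site):
plain loads, plus MIMG queries that neither load nor store
(IMAGE_GET_RESINFO, IMAGE_GET_LOD), count against vmcnt, while stores and
atomics count against vscnt.

  static WaitEventType classifyVMEMEvent(const SIInstrInfo *TII,
                                         const MachineInstr &MI,
                                         bool HasVscnt) {
    if (!HasVscnt)
      return VMEM_ACCESS; // pre-gfx10: one counter for all VMEM traffic
    bool PlainLoad =
        MI.mayLoad() && AMDGPU::getAtomicRetOp(MI.getOpcode()) == -1;
    bool NoDataMIMG = TII->isMIMG(MI) && !MI.mayLoad() && !MI.mayStore();
    if (PlainLoad || NoDataMIMG)
      return VMEM_READ_ACCESS; // counted by vmcnt
    return VMEM_WRITE_ACCESS;  // counted by vscnt
  }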
@@ -1243,27 +1338,13 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
Iter != E;) {
MachineInstr &Inst = *Iter;
// Remove any previously existing waitcnts.
if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
if (OldWaitcntInstr) {
if (TrackedWaitcntSet.count(OldWaitcntInstr)) {
TrackedWaitcntSet.erase(OldWaitcntInstr);
OldWaitcntInstr->eraseFromParent();
OldWaitcntInstr = nullptr;
} else if (!TrackedWaitcntSet.count(&Inst)) {
// Two successive s_waitcnt's, both of which are pre-existing and
// are therefore preserved.
int64_t Imm = OldWaitcntInstr->getOperand(0).getImm();
ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
} else {
++Iter;
Inst.eraseFromParent();
Modified = true;
continue;
}
}
OldWaitcntInstr = &Inst;
// Track pre-existing waitcnts from earlier iterations.
if (Inst.getOpcode() == AMDGPU::S_WAITCNT ||
(Inst.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
Inst.getOperand(0).isReg() &&
Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL)) {
if (!OldWaitcntInstr)
OldWaitcntInstr = &Inst;
++Iter;
continue;
}
@@ -1320,7 +1401,8 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
// Restore the vccz bit. Any time a value is written to vcc, the vcc
// bit is updated, so we can restore the bit by reading the value of
// vcc and then writing it back to the register.
BuildMI(Block, Inst, Inst.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
BuildMI(Block, Inst, Inst.getDebugLoc(),
TII->get(AMDGPU::S_MOV_B64),
AMDGPU::VCC)
.addReg(AMDGPU::VCC);
VCCZBugHandledSet.insert(&Inst);
@@ -1348,6 +1430,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
HardwareLimits.VscntMax = ST->hasVscnt() ? 63 : 0;
HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs();
HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs();
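The hardcoded 63 is the full range of vscnt's 6-bit wait field (an
assumption consistent with the gfx10 ISA); a hypothetical helper form of
the same limit, which the patch instead writes inline:

  unsigned getVscntBitMask(bool HasVscnt) {
    return HasVscnt ? (1u << 6) - 1 : 0; // 63 with vscnt, 0 without
  }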
@@ -1483,6 +1566,11 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
// TODO: Could insert earlier and schedule more liberally with operations
// that only use caller preserved registers.
MachineBasicBlock &EntryBB = MF.front();
if (ST->hasVscnt())
BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(),
TII->get(AMDGPU::S_WAITCNT_VSCNT))
.addReg(AMDGPU::SGPR_NULL, RegState::Undef)
.addImm(0);
BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
.addImm(0);
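On gfx10 the function entry thus begins with both "s_waitcnt 0" and
"s_waitcnt_vscnt null, 0"; the function_prologue test below checks exactly
this pair.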

llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll (new file)

@@ -0,0 +1,260 @@
; RUN: llc -march=amdgcn -mcpu=gfx802 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8_9 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9_10,GFX8_9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX9_10 %s
; GCN-LABEL: barrier_vmcnt_global:
; GFX8: flat_load_dword
; GFX9_10: global_load_dword
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX9_10-NEXT: s_waitcnt vmcnt(0){{$}}
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vmcnt_global(i32 addrspace(1)* %arg) {
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = zext i32 %tmp to i64
%tmp2 = shl nuw nsw i64 %tmp1, 32
%tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp1
%tmp4 = load i32, i32 addrspace(1)* %tmp3, align 4
fence syncscope("singlethread") release
tail call void @llvm.amdgcn.s.barrier()
fence syncscope("singlethread") acquire
%tmp5 = add nuw nsw i64 %tmp2, 4294967296
%tmp6 = lshr exact i64 %tmp5, 32
%tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp6
store i32 %tmp4, i32 addrspace(1)* %tmp7, align 4
ret void
}
; GCN-LABEL: barrier_vscnt_global:
; GFX8: flat_store_dword
; GFX9_10: global_store_dword
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX9-NEXT: s_waitcnt vmcnt(0){{$}}
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vscnt_global(i32 addrspace(1)* %arg) {
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = zext i32 %tmp to i64
%tmp2 = shl nuw nsw i64 %tmp1, 32
%tmp3 = add nuw nsw i64 %tmp2, 8589934592
%tmp4 = lshr exact i64 %tmp3, 32
%tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp4
store i32 0, i32 addrspace(1)* %tmp5, align 4
fence syncscope("singlethread") release
tail call void @llvm.amdgcn.s.barrier() #3
fence syncscope("singlethread") acquire
%tmp6 = add nuw nsw i64 %tmp2, 4294967296
%tmp7 = lshr exact i64 %tmp6, 32
%tmp8 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp7
store i32 1, i32 addrspace(1)* %tmp8, align 4
ret void
}
; GCN-LABEL: barrier_vmcnt_vscnt_global:
; GFX8: flat_load_dword
; GFX9_10: global_load_dword
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX9_10-NEXT: s_waitcnt vmcnt(0){{$}}
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vmcnt_vscnt_global(i32 addrspace(1)* %arg) {
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = zext i32 %tmp to i64
%tmp2 = shl nuw nsw i64 %tmp1, 32
%tmp3 = add nuw nsw i64 %tmp2, 8589934592
%tmp4 = lshr exact i64 %tmp3, 32
%tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp4
store i32 0, i32 addrspace(1)* %tmp5, align 4
%tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp1
%tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
fence syncscope("singlethread") release
tail call void @llvm.amdgcn.s.barrier()
fence syncscope("singlethread") acquire
%tmp8 = add nuw nsw i64 %tmp2, 4294967296
%tmp9 = lshr exact i64 %tmp8, 32
%tmp10 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp9
store i32 %tmp7, i32 addrspace(1)* %tmp10, align 4
ret void
}
; GCN-LABEL: barrier_vmcnt_flat:
; GCN: flat_load_dword
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vmcnt_flat(i32* %arg) {
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = zext i32 %tmp to i64
%tmp2 = shl nuw nsw i64 %tmp1, 32
%tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp1
%tmp4 = load i32, i32* %tmp3, align 4
fence syncscope("singlethread") release
tail call void @llvm.amdgcn.s.barrier()
fence syncscope("singlethread") acquire
%tmp5 = add nuw nsw i64 %tmp2, 4294967296
%tmp6 = lshr exact i64 %tmp5, 32
%tmp7 = getelementptr inbounds i32, i32* %arg, i64 %tmp6
store i32 %tmp4, i32* %tmp7, align 4
ret void
}
; GCN-LABEL: barrier_vscnt_flat:
; GCN: flat_store_dword
; GFX8_9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}}
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vscnt_flat(i32* %arg) {
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = zext i32 %tmp to i64
%tmp2 = shl nuw nsw i64 %tmp1, 32
%tmp3 = add nuw nsw i64 %tmp2, 8589934592
%tmp4 = lshr exact i64 %tmp3, 32
%tmp5 = getelementptr inbounds i32, i32* %arg, i64 %tmp4
store i32 0, i32* %tmp5, align 4
fence syncscope("singlethread") release
tail call void @llvm.amdgcn.s.barrier() #3
fence syncscope("singlethread") acquire
%tmp6 = add nuw nsw i64 %tmp2, 4294967296
%tmp7 = lshr exact i64 %tmp6, 32
%tmp8 = getelementptr inbounds i32, i32* %arg, i64 %tmp7
store i32 1, i32* %tmp8, align 4
ret void
}
; GCN-LABEL: barrier_vmcnt_vscnt_flat:
; GCN: flat_load_dword
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vmcnt_vscnt_flat(i32* %arg) {
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = zext i32 %tmp to i64
%tmp2 = shl nuw nsw i64 %tmp1, 32
%tmp3 = add nuw nsw i64 %tmp2, 8589934592
%tmp4 = lshr exact i64 %tmp3, 32
%tmp5 = getelementptr inbounds i32, i32* %arg, i64 %tmp4
store i32 0, i32* %tmp5, align 4
%tmp6 = getelementptr inbounds i32, i32* %arg, i64 %tmp1
%tmp7 = load i32, i32* %tmp6, align 4
fence syncscope("singlethread") release
tail call void @llvm.amdgcn.s.barrier()
fence syncscope("singlethread") acquire
%tmp8 = add nuw nsw i64 %tmp2, 4294967296
%tmp9 = lshr exact i64 %tmp8, 32
%tmp10 = getelementptr inbounds i32, i32* %arg, i64 %tmp9
store i32 %tmp7, i32* %tmp10, align 4
ret void
}
; GCN-LABEL: barrier_vmcnt_vscnt_flat_workgroup:
; GCN: flat_load_dword
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vmcnt_vscnt_flat_workgroup(i32* %arg) {
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = zext i32 %tmp to i64
%tmp2 = shl nuw nsw i64 %tmp1, 32
%tmp3 = add nuw nsw i64 %tmp2, 8589934592
%tmp4 = lshr exact i64 %tmp3, 32
%tmp5 = getelementptr inbounds i32, i32* %arg, i64 %tmp4
store i32 0, i32* %tmp5, align 4
%tmp6 = getelementptr inbounds i32, i32* %arg, i64 %tmp1
%tmp7 = load i32, i32* %tmp6, align 4
fence syncscope("workgroup") release
tail call void @llvm.amdgcn.s.barrier()
fence syncscope("workgroup") acquire
%tmp8 = add nuw nsw i64 %tmp2, 4294967296
%tmp9 = lshr exact i64 %tmp8, 32
%tmp10 = getelementptr inbounds i32, i32* %arg, i64 %tmp9
store i32 %tmp7, i32* %tmp10, align 4
ret void
}
; GCN-LABEL: load_vmcnt_global:
; GFX8: flat_load_dword
; GFX9_10: global_load_dword
; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX9_10: s_waitcnt vmcnt(0){{$}}
; GCN-NEXT: {{global|flat}}_store_dword
define amdgpu_kernel void @load_vmcnt_global(i32 addrspace(1)* %arg) {
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = zext i32 %tmp to i64
%tmp2 = shl nuw nsw i64 %tmp1, 32
%tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp1
%tmp4 = load i32, i32 addrspace(1)* %tmp3, align 4
%tmp5 = add nuw nsw i64 %tmp2, 4294967296
%tmp6 = lshr exact i64 %tmp5, 32
%tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp6
store i32 %tmp4, i32 addrspace(1)* %tmp7, align 4
ret void
}
; GCN-LABEL: load_vmcnt_flat:
; GCN: flat_load_dword
; GCN-NOT: vscnt
; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GCN-NEXT: {{global|flat}}_store_dword
define amdgpu_kernel void @load_vmcnt_flat(i32* %arg) {
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = zext i32 %tmp to i64
%tmp2 = shl nuw nsw i64 %tmp1, 32
%tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp1
%tmp4 = load i32, i32* %tmp3, align 4
%tmp5 = add nuw nsw i64 %tmp2, 4294967296
%tmp6 = lshr exact i64 %tmp5, 32
%tmp7 = getelementptr inbounds i32, i32* %arg, i64 %tmp6
store i32 %tmp4, i32* %tmp7, align 4
ret void
}
; GCN-LABEL: store_vscnt_private:
; GCN: buffer_store_dword
; GFX8_9-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_setpc_b64
define void @store_vscnt_private(i32 addrspace(5)* %p) {
store i32 0, i32 addrspace(5)* %p
ret void
}
; GCN-LABEL: store_vscnt_global:
; GFX8: flat_store_dword
; GFX9_10: global_store_dword
; GFX8_9-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_setpc_b64
define void @store_vscnt_global(i32 addrspace(1)* %p) {
store i32 0, i32 addrspace(1)* %p
ret void
}
; GCN-LABEL: store_vscnt_flat:
; GCN: flat_store_dword
; GFX8_9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}}
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_setpc_b64
define void @store_vscnt_flat(i32* %p) {
store i32 0, i32* %p
ret void
}
; GCN-LABEL: function_prologue:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0){{$}}
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_setpc_b64
define void @function_prologue() {
ret void
}
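Even an empty callable function gets the full wait at entry: the pass
assumes nothing about outstanding counters at a call boundary, and on
gfx10 that now includes draining vscnt via s_waitcnt_vscnt null, 0x0.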
declare void @llvm.amdgcn.s.barrier()
declare i32 @llvm.amdgcn.workitem.id.x()