From 70e8bc1bf3fd7374ea98990db69e87eb7bd86e1b Mon Sep 17 00:00:00 2001
From: David Stuttard <david.stuttard@amd.com>
Date: Thu, 22 Jun 2017 16:29:22 +0000
Subject: [PATCH] [AMDGPU] Add intrinsics for tbuffer load and store

Intrinsic already existed for llvm.SI.tbuffer.store

Needed tbuffer.load and also re-implementing the intrinsic as llvm.amdgcn.tbuffer.*

Added CodeGen tests for the 2 new variants added.
Left the original llvm.SI.tbuffer.store implementation to avoid issues with existing code

Subscribers: arsenm, kzhuravl, wdng, nhaehnle, yaxunl, tony-tye, tpr

Differential Revision: https://reviews.llvm.org/D30687

llvm-svn: 306031
---
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td      |  27 ++
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |   2 +
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h   |   2 +
 .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp      |  48 ++
 llvm/lib/Target/AMDGPU/BUFInstructions.td     | 411 ++++++++++++++----
 .../AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp  |  18 +
 .../AMDGPU/InstPrinter/AMDGPUInstPrinter.h    |   4 +
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     | 126 ++++--
 llvm/lib/Target/AMDGPU/SIInstrInfo.td         |  45 +-
 .../CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll   |  38 +-
 .../AMDGPU/llvm.amdgcn.tbuffer.load.ll        | 109 +++++
 .../AMDGPU/llvm.amdgcn.tbuffer.store.ll       | 110 +++++
 llvm/test/CodeGen/AMDGPU/merge-store-crash.ll |   4 +-
 .../test/CodeGen/AMDGPU/merge-store-usedef.ll |   4 +-
 llvm/test/CodeGen/AMDGPU/mubuf.ll             |   8 +-
 .../AMDGPU/scheduler-subrange-crash.ll        |  18 +-
 .../AMDGPU/si-triv-disjoint-mem-access.ll     |   9 +-
 llvm/test/MC/AMDGPU/mtbuf.s                   |  36 ++
 llvm/test/MC/Disassembler/AMDGPU/mtbuf_vi.txt |  22 +
 19 files changed, 895 insertions(+), 146 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.ll
 create mode 100644 llvm/test/MC/AMDGPU/mtbuf.s
 create mode 100644 llvm/test/MC/Disassembler/AMDGPU/mtbuf_vi.txt

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 8017223c4ab0..4e0529a32d29 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -475,6 +475,33 @@ class AMDGPUBufferStore : Intrinsic <
 def int_amdgcn_buffer_store_format : AMDGPUBufferStore;
 def int_amdgcn_buffer_store : AMDGPUBufferStore;
 
+def int_amdgcn_tbuffer_load : Intrinsic <
+    [llvm_any_ty],    // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
+    [llvm_v4i32_ty,   // rsrc(SGPR)
+     llvm_i32_ty,     // vindex(VGPR)
+     llvm_i32_ty,     // voffset(VGPR)
+     llvm_i32_ty,     // soffset(SGPR)
+     llvm_i32_ty,     // offset(imm)
+     llvm_i32_ty,     // dfmt(imm)
+     llvm_i32_ty,     // nfmt(imm)
+     llvm_i1_ty,     // glc(imm)
+     llvm_i1_ty],    // slc(imm)
+    []>;
+
+def int_amdgcn_tbuffer_store : Intrinsic <
+    [],
+    [llvm_any_ty,    // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
+     llvm_v4i32_ty,  // rsrc(SGPR)
+     llvm_i32_ty,    // vindex(VGPR)
+     llvm_i32_ty,    // voffset(VGPR)
+     llvm_i32_ty,    // soffset(SGPR)
+     llvm_i32_ty,    // offset(imm)
+     llvm_i32_ty,    // dfmt(imm)
+     llvm_i32_ty,    // nfmt(imm)
+     llvm_i1_ty,     // glc(imm)
+     llvm_i1_ty],    // slc(imm)
+    []>;
+
 class AMDGPUBufferAtomic : Intrinsic <
   [llvm_i32_ty],
   [llvm_i32_ty,       // vdata(VGPR)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 061edf8fbc3e..96f819fd0e68 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3664,6 +3664,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(STORE_MSKOR)
   NODE_NAME_CASE(LOAD_CONSTANT)
   NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
+  NODE_NAME_CASE(TBUFFER_STORE_FORMAT_X3)
+  NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
   NODE_NAME_CASE(ATOMIC_CMP_SWAP)
   NODE_NAME_CASE(ATOMIC_INC)
   NODE_NAME_CASE(ATOMIC_DEC)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index ed9721970dd9..a45234e2b39f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -403,6 +403,8 @@ enum NodeType : unsigned {
   STORE_MSKOR,
   LOAD_CONSTANT,
   TBUFFER_STORE_FORMAT,
+  TBUFFER_STORE_FORMAT_X3,
+  TBUFFER_LOAD_FORMAT,
   ATOMIC_CMP_SWAP,
   ATOMIC_INC,
   ATOMIC_DEC,
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 0dc939bd7950..7b8756050b75 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -152,6 +152,8 @@ public:
     ImmTyExpTgt,
     ImmTyExpCompr,
     ImmTyExpVM,
+    ImmTyDFMT,
+    ImmTyNFMT,
     ImmTyHwreg,
     ImmTyOff,
     ImmTySendMsg,
@@ -294,6 +296,8 @@ public:
   bool isGLC() const { return isImmTy(ImmTyGLC); }
   bool isSLC() const { return isImmTy(ImmTySLC); }
   bool isTFE() const { return isImmTy(ImmTyTFE); }
+  bool isDFMT() const { return isImmTy(ImmTyDFMT) && isUInt<8>(getImm()); }
+  bool isNFMT() const { return isImmTy(ImmTyNFMT) && isUInt<8>(getImm()); }
   bool isBankMask() const { return isImmTy(ImmTyDppBankMask); }
   bool isRowMask() const { return isImmTy(ImmTyDppRowMask); }
   bool isBoundCtrl() const { return isImmTy(ImmTyDppBoundCtrl); }
@@ -638,6 +642,8 @@ public:
     case ImmTyGLC: OS << "GLC"; break;
     case ImmTySLC: OS << "SLC"; break;
     case ImmTyTFE: OS << "TFE"; break;
+    case ImmTyDFMT: OS << "DFMT"; break;
+    case ImmTyNFMT: OS << "NFMT"; break;
     case ImmTyClampSI: OS << "ClampSI"; break;
     case ImmTyOModSI: OS << "OModSI"; break;
     case ImmTyDppCtrl: OS << "DppCtrl"; break;
@@ -1033,6 +1039,8 @@ public:
   void cvtMubuf(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false, false); }
   void cvtMubufAtomic(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, false); }
   void cvtMubufAtomicReturn(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, true); }
+  void cvtMtbuf(MCInst &Inst, const OperandVector &Operands);
+
   AMDGPUOperand::Ptr defaultGLC() const;
   AMDGPUOperand::Ptr defaultSLC() const;
   AMDGPUOperand::Ptr defaultTFE() const;
@@ -3820,6 +3828,44 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst,
   addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
 }
 
+void AMDGPUAsmParser::cvtMtbuf(MCInst &Inst, const OperandVector &Operands) {
+  OptionalImmIndexMap OptionalIdx;
+
+  for (unsigned i = 1, e = Operands.size(); i != e; ++i) {
+    AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
+
+    // Add the register arguments
+    if (Op.isReg()) {
+      Op.addRegOperands(Inst, 1);
+      continue;
+    }
+
+    // Handle the case where soffset is an immediate
+    if (Op.isImm() && Op.getImmTy() == AMDGPUOperand::ImmTyNone) {
+      Op.addImmOperands(Inst, 1);
+      continue;
+    }
+
+    // Handle tokens like 'offen' which are sometimes hard-coded into the
+    // asm string.  There are no MCInst operands for these.
+    if (Op.isToken()) {
+      continue;
+    }
+    assert(Op.isImm());
+
+    // Handle optional arguments
+    OptionalIdx[Op.getImmTy()] = i;
+  }
+
+  addOptionalImmOperand(Inst, Operands, OptionalIdx,
+                        AMDGPUOperand::ImmTyOffset);
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDFMT);
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyNFMT);
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC);
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC);
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
+}
+
 //===----------------------------------------------------------------------===//
 // mimg
 //===----------------------------------------------------------------------===//
@@ -4000,6 +4046,8 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = {
   {"offset1", AMDGPUOperand::ImmTyOffset1, false, nullptr},
   {"gds",     AMDGPUOperand::ImmTyGDS, true, nullptr},
   {"offset",  AMDGPUOperand::ImmTyOffset, false, nullptr},
+  {"dfmt",    AMDGPUOperand::ImmTyDFMT, false, nullptr},
+  {"nfmt",    AMDGPUOperand::ImmTyNFMT, false, nullptr},
   {"glc",     AMDGPUOperand::ImmTyGLC, true, nullptr},
   {"slc",     AMDGPUOperand::ImmTySLC, true, nullptr},
   {"tfe",     AMDGPUOperand::ImmTyTFE, true, nullptr},
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 2aca65ac8430..2e96c14eaa32 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -57,6 +57,11 @@ class MUBUFAddr64Table <bit is_addr64, string suffix = ""> {
   string OpName = NAME # suffix;
 }
 
+class MTBUFAddr64Table <bit is_addr64, string suffix = ""> {
+  bit IsAddr64 = is_addr64;
+  string OpName = NAME # suffix;
+}
+
 //===----------------------------------------------------------------------===//
 // MTBUF classes
 //===----------------------------------------------------------------------===//
@@ -78,14 +83,31 @@ class MTBUF_Pseudo <string opName, dag outs, dag ins,
   let EXP_CNT = 1;
   let MTBUF = 1;
   let Uses = [EXEC];
-
   let hasSideEffects = 0;
   let SchedRW = [WriteVMEM];
+
+  let AsmMatchConverter = "cvtMtbuf";
+
+  bits<1> offen       = 0;
+  bits<1> idxen       = 0;
+  bits<1> addr64      = 0;
+  bits<1> has_vdata   = 1;
+  bits<1> has_vaddr   = 1;
+  bits<1> has_glc     = 1;
+  bits<1> glc_value   = 0; // the value for glc if no such operand
+  bits<4> dfmt_value  = 1; // the value for dfmt if no such operand
+  bits<3> nfmt_value  = 0; // the value for nfmt if no such operand
+  bits<1> has_srsrc   = 1;
+  bits<1> has_soffset = 1;
+  bits<1> has_offset  = 1;
+  bits<1> has_slc     = 1;
+  bits<1> has_tfe     = 1;
+  bits<1> has_dfmt    = 1;
+  bits<1> has_nfmt    = 1;
 }
 
 class MTBUF_Real <MTBUF_Pseudo ps> :
-  InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
-  Enc64 {
+  InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []> {
 
   let isPseudo = 0;
   let isCodeGenOnly = 0;
@@ -97,57 +119,168 @@ class MTBUF_Real <MTBUF_Pseudo ps> :
   let DisableEncoding    = ps.DisableEncoding;
   let TSFlags            = ps.TSFlags;
 
-  bits<8> vdata;
   bits<12> offset;
-  bits<1> offen;
-  bits<1> idxen;
-  bits<1> glc;
-  bits<1> addr64;
-  bits<4> dfmt;
-  bits<3> nfmt;
-  bits<8> vaddr;
-  bits<7> srsrc;
-  bits<1> slc;
-  bits<1> tfe;
-  bits<8> soffset;
-
-  let Inst{11-0}  = offset;
-  let Inst{12}    = offen;
-  let Inst{13}    = idxen;
-  let Inst{14}    = glc;
-  let Inst{22-19} = dfmt;
-  let Inst{25-23} = nfmt;
-  let Inst{31-26} = 0x3a; //encoding
-  let Inst{39-32} = vaddr;
-  let Inst{47-40} = vdata;
-  let Inst{52-48} = srsrc{6-2};
-  let Inst{54}    = slc;
-  let Inst{55}    = tfe;
-  let Inst{63-56} = soffset;
+  bits<1>  glc;
+  bits<4>  dfmt;
+  bits<3>  nfmt;
+  bits<8>  vaddr;
+  bits<8>  vdata;
+  bits<7>  srsrc;
+  bits<1>  slc;
+  bits<1>  tfe;
+  bits<8>  soffset;
 }
 
-class MTBUF_Load_Pseudo <string opName, RegisterClass regClass> : MTBUF_Pseudo <
-  opName, (outs regClass:$dst),
-  (ins u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64,
-       i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr, SReg_128:$srsrc,
-       i1imm:$slc, i1imm:$tfe, SCSrc_b32:$soffset),
-  " $dst, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"#
-  " $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset"> {
+class getMTBUFInsDA<list<RegisterClass> vdataList,
+                    list<RegisterClass> vaddrList=[]> {
+  RegisterClass vdataClass = !if(!empty(vdataList), ?, !head(vdataList));
+  RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList));
+  dag InsNoData = !if(!empty(vaddrList),
+    (ins                    SReg_128:$srsrc, SCSrc_b32:$soffset,
+         offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, slc:$slc, tfe:$tfe),
+    (ins vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset,
+         offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, slc:$slc, tfe:$tfe)
+  );
+  dag InsData = !if(!empty(vaddrList),
+    (ins vdataClass:$vdata,                    SReg_128:$srsrc,
+         SCSrc_b32:$soffset, offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc,
+         slc:$slc, tfe:$tfe),
+    (ins vdataClass:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc,
+         SCSrc_b32:$soffset, offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc,
+         slc:$slc, tfe:$tfe)
+  );
+  dag ret = !if(!empty(vdataList), InsNoData, InsData);
+}
+
+class getMTBUFIns<int addrKind, list<RegisterClass> vdataList=[]> {
+  dag ret =
+    !if(!eq(addrKind, BUFAddrKind.Offset), getMTBUFInsDA<vdataList>.ret,
+    !if(!eq(addrKind, BUFAddrKind.OffEn),  getMTBUFInsDA<vdataList, [VGPR_32]>.ret,
+    !if(!eq(addrKind, BUFAddrKind.IdxEn),  getMTBUFInsDA<vdataList, [VGPR_32]>.ret,
+    !if(!eq(addrKind, BUFAddrKind.BothEn), getMTBUFInsDA<vdataList, [VReg_64]>.ret,
+    !if(!eq(addrKind, BUFAddrKind.Addr64), getMTBUFInsDA<vdataList, [VReg_64]>.ret,
+    (ins))))));
+}
+
+class getMTBUFAsmOps<int addrKind> {
+  string Pfx =
+    !if(!eq(addrKind, BUFAddrKind.Offset), "off, $srsrc, $dfmt, $nfmt, $soffset",
+    !if(!eq(addrKind, BUFAddrKind.OffEn),
+            "$vaddr, $srsrc, $dfmt, $nfmt, $soffset offen",
+    !if(!eq(addrKind, BUFAddrKind.IdxEn),
+            "$vaddr, $srsrc, $dfmt, $nfmt, $soffset idxen",
+    !if(!eq(addrKind, BUFAddrKind.BothEn),
+            "$vaddr, $srsrc, $dfmt, $nfmt, $soffset idxen offen",
+    !if(!eq(addrKind, BUFAddrKind.Addr64),
+            "$vaddr, $srsrc, $dfmt, $nfmt, $soffset addr64",
+    "")))));
+  string ret = Pfx # "$offset";
+}
+
+class MTBUF_SetupAddr<int addrKind> {
+  bits<1> offen  = !if(!eq(addrKind, BUFAddrKind.OffEn), 1,
+                   !if(!eq(addrKind, BUFAddrKind.BothEn), 1 , 0));
+
+  bits<1> idxen  = !if(!eq(addrKind, BUFAddrKind.IdxEn), 1,
+                   !if(!eq(addrKind, BUFAddrKind.BothEn), 1 , 0));
+
+  bits<1> addr64 = !if(!eq(addrKind, BUFAddrKind.Addr64), 1, 0);
+
+  bits<1> has_vaddr = !if(!eq(addrKind, BUFAddrKind.Offset), 0, 1);
+}
+
+class MTBUF_Load_Pseudo <string opName,
+                         int addrKind,
+                         RegisterClass vdataClass,
+                         list<dag> pattern=[],
+                         // Workaround bug bz30254
+                         int addrKindCopy = addrKind>
+  : MTBUF_Pseudo<opName,
+                 (outs vdataClass:$vdata),
+                 getMTBUFIns<addrKindCopy>.ret,
+                 " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe",
+                 pattern>,
+    MTBUF_SetupAddr<addrKindCopy> {
+  let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
   let mayLoad = 1;
   let mayStore = 0;
 }
 
-class MTBUF_Store_Pseudo <string opName, RegisterClass regClass> : MTBUF_Pseudo <
-  opName, (outs),
-  (ins regClass:$vdata, u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc,
-       i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr,
-       SReg_128:$srsrc, i1imm:$slc, i1imm:$tfe, SCSrc_b32:$soffset),
-  " $vdata, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"#
-  " $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset"> {
+multiclass MTBUF_Pseudo_Loads<string opName, RegisterClass vdataClass,
+                              ValueType load_vt = i32,
+                              SDPatternOperator ld = null_frag> {
+
+  def _OFFSET : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
+    [(set load_vt:$vdata,
+     (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i8:$dfmt,
+                      i8:$nfmt, i1:$glc, i1:$slc, i1:$tfe)))]>,
+    MTBUFAddr64Table<0>;
+
+  def _ADDR64 : MTBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
+    [(set load_vt:$vdata,
+     (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset,
+                      i8:$dfmt, i8:$nfmt, i1:$glc, i1:$slc, i1:$tfe)))]>,
+    MTBUFAddr64Table<1>;
+
+  def _OFFEN  : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
+  def _IDXEN  : MTBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
+  def _BOTHEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+
+  let DisableWQM = 1 in {
+    def _OFFSET_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass>;
+    def _OFFEN_exact  : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
+    def _IDXEN_exact  : MTBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
+    def _BOTHEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+  }
+}
+
+class MTBUF_Store_Pseudo <string opName,
+                          int addrKind,
+                          RegisterClass vdataClass,
+                          list<dag> pattern=[],
+                          // Workaround bug bz30254
+                          int addrKindCopy = addrKind,
+                          RegisterClass vdataClassCopy = vdataClass>
+  : MTBUF_Pseudo<opName,
+                 (outs),
+                 getMTBUFIns<addrKindCopy, [vdataClassCopy]>.ret,
+                 " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe",
+                 pattern>,
+    MTBUF_SetupAddr<addrKindCopy> {
+  let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
   let mayLoad = 0;
   let mayStore = 1;
 }
 
+multiclass MTBUF_Pseudo_Stores<string opName, RegisterClass vdataClass,
+                               ValueType store_vt = i32,
+                               SDPatternOperator st = null_frag> {
+
+  def _OFFSET : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
+    [(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
+                                       i16:$offset, i8:$dfmt, i8:$nfmt, i1:$glc,
+                                       i1:$slc, i1:$tfe))]>,
+    MTBUFAddr64Table<0>;
+
+  def _ADDR64 : MTBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
+    [(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
+                                       i16:$offset, i8:$dfmt, i8:$nfmt, i1:$glc,
+                                       i1:$slc, i1:$tfe))]>,
+    MTBUFAddr64Table<1>;
+
+  def _OFFEN  : MTBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
+  def _IDXEN  : MTBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
+  def _BOTHEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+
+  let DisableWQM = 1 in {
+    def _OFFSET_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass>;
+    def _OFFEN_exact  : MTBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
+    def _IDXEN_exact  : MTBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
+    def _BOTHEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+  }
+}
+
+
 //===----------------------------------------------------------------------===//
 // MUBUF classes
 //===----------------------------------------------------------------------===//
@@ -676,14 +809,14 @@ def BUFFER_WBINVL1 : MUBUF_Invalidate <"buffer_wbinvl1",
 // MTBUF Instructions
 //===----------------------------------------------------------------------===//
 
-//def TBUFFER_LOAD_FORMAT_X    : MTBUF_ <0, "tbuffer_load_format_x", []>;
-//def TBUFFER_LOAD_FORMAT_XY   : MTBUF_ <1, "tbuffer_load_format_xy", []>;
-//def TBUFFER_LOAD_FORMAT_XYZ  : MTBUF_ <2, "tbuffer_load_format_xyz", []>;
-def TBUFFER_LOAD_FORMAT_XYZW  : MTBUF_Load_Pseudo  <"tbuffer_load_format_xyzw", VReg_128>;
-def TBUFFER_STORE_FORMAT_X    : MTBUF_Store_Pseudo <"tbuffer_store_format_x", VGPR_32>;
-def TBUFFER_STORE_FORMAT_XY   : MTBUF_Store_Pseudo <"tbuffer_store_format_xy", VReg_64>;
-def TBUFFER_STORE_FORMAT_XYZ  : MTBUF_Store_Pseudo <"tbuffer_store_format_xyz", VReg_128>;
-def TBUFFER_STORE_FORMAT_XYZW : MTBUF_Store_Pseudo <"tbuffer_store_format_xyzw", VReg_128>;
+defm TBUFFER_LOAD_FORMAT_X     : MTBUF_Pseudo_Loads  <"tbuffer_load_format_x",     VGPR_32>;
+defm TBUFFER_LOAD_FORMAT_XY    : MTBUF_Pseudo_Loads  <"tbuffer_load_format_xy",    VReg_64>;
+defm TBUFFER_LOAD_FORMAT_XYZ   : MTBUF_Pseudo_Loads  <"tbuffer_load_format_xyz",   VReg_128>;
+defm TBUFFER_LOAD_FORMAT_XYZW  : MTBUF_Pseudo_Loads  <"tbuffer_load_format_xyzw",  VReg_128>;
+defm TBUFFER_STORE_FORMAT_X    : MTBUF_Pseudo_Stores <"tbuffer_store_format_x",    VGPR_32>;
+defm TBUFFER_STORE_FORMAT_XY   : MTBUF_Pseudo_Stores <"tbuffer_store_format_xy",   VReg_64>;
+defm TBUFFER_STORE_FORMAT_XYZ  : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz",  VReg_128>;
+defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyzw", VReg_128>;
 
 } // End let SubtargetPredicate = isGCN
 
@@ -1093,22 +1226,98 @@ defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, BUFFER_STORE_DWORDX4_OF
 // MTBUF Patterns
 //===----------------------------------------------------------------------===//
 
-// TBUFFER_STORE_FORMAT_*, addr64=0
-class MTBUF_StoreResource <ValueType vt, int num_channels, MTBUF_Pseudo opcode> : Pat<
-  (SItbuffer_store v4i32:$rsrc, vt:$vdata, num_channels, i32:$vaddr,
-                   i32:$soffset, imm:$inst_offset, imm:$dfmt,
-                   imm:$nfmt, imm:$offen, imm:$idxen,
-                   imm:$glc, imm:$slc, imm:$tfe),
-  (opcode
-    $vdata, (as_i16imm $inst_offset), (as_i1imm $offen), (as_i1imm $idxen),
-    (as_i1imm $glc), 0, (as_i8imm $dfmt), (as_i8imm $nfmt), $vaddr, $rsrc,
-    (as_i1imm $slc), (as_i1imm $tfe), $soffset)
->;
+//===----------------------------------------------------------------------===//
+// tbuffer_load/store_format patterns
+//===----------------------------------------------------------------------===//
 
-def : MTBUF_StoreResource <i32, 1, TBUFFER_STORE_FORMAT_X>;
-def : MTBUF_StoreResource <v2i32, 2, TBUFFER_STORE_FORMAT_XY>;
-def : MTBUF_StoreResource <v4i32, 3, TBUFFER_STORE_FORMAT_XYZ>;
-def : MTBUF_StoreResource <v4i32, 4, TBUFFER_STORE_FORMAT_XYZW>;
+multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
+                                  string opcode> {
+  def : Pat<
+    (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset,
+              imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)),
+    (!cast<MTBUF_Pseudo>(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset),
+      (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0)
+  >;
+
+  def : Pat<
+    (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset,
+              imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)),
+    (!cast<MTBUF_Pseudo>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset),
+      (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0)
+  >;
+
+  def : Pat<
+    (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset,
+              imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)),
+    (!cast<MTBUF_Pseudo>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset),
+      (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0)
+  >;
+
+  def : Pat<
+    (vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, imm:$offset,
+              imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)),
+    (!cast<MTBUF_Pseudo>(opcode # _BOTHEN)
+      (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
+      $rsrc, $soffset, (as_i16imm $offset),
+      (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0)
+  >;
+}
+
+defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, i32,   "TBUFFER_LOAD_FORMAT_X">;
+defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v2i32, "TBUFFER_LOAD_FORMAT_XY">;
+defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v4i32, "TBUFFER_LOAD_FORMAT_XYZW">;
+defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, f32,   "TBUFFER_LOAD_FORMAT_X">;
+defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v2f32, "TBUFFER_LOAD_FORMAT_XY">;
+defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v4f32, "TBUFFER_LOAD_FORMAT_XYZW">;
+
+multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
+                                   string opcode> {
+  def : Pat<
+    (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset,
+          imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc),
+    (!cast<MTBUF_Pseudo>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset,
+                                (as_i16imm $offset), (as_i8imm $dfmt),
+                                (as_i8imm $nfmt), (as_i1imm $glc),
+                                (as_i1imm $slc), 0)
+  >;
+
+  def : Pat<
+    (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset,
+          imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc),
+    (!cast<MTBUF_Pseudo>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset,
+                                   (as_i16imm $offset), (as_i8imm $dfmt),
+                                   (as_i8imm $nfmt), (as_i1imm $glc),
+                                   (as_i1imm $slc), 0)
+  >;
+
+  def : Pat<
+    (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset,
+          imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc),
+    (!cast<MTBUF_Pseudo>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset,
+                                   (as_i16imm $offset), (as_i8imm $dfmt),
+                                   (as_i8imm $nfmt), (as_i1imm $glc),
+                                   (as_i1imm $slc), 0)
+  >;
+
+  def : Pat<
+    (name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset,
+          imm:$offset, imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc),
+    (!cast<MTBUF_Pseudo>(opcode # _BOTHEN_exact)
+      $vdata,
+      (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
+      $rsrc, $soffset, (as_i16imm $offset),
+      (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0)
+  >;
+}
+
+defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, i32,   "TBUFFER_STORE_FORMAT_X">;
+defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v2i32, "TBUFFER_STORE_FORMAT_XY">;
+defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_x3, v4i32, "TBUFFER_STORE_FORMAT_XYZ">;
+defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v4i32, "TBUFFER_STORE_FORMAT_XYZW">;
+defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, f32,   "TBUFFER_STORE_FORMAT_X">;
+defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v2f32, "TBUFFER_STORE_FORMAT_XY">;
+defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_x3, v4f32, "TBUFFER_STORE_FORMAT_XYZ">;
+defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v4f32, "TBUFFER_STORE_FORMAT_XYZW">;
 
 } // End let Predicates = [isGCN]
 
@@ -1224,21 +1433,44 @@ def BUFFER_WBINVL1_si           : MUBUF_Real_si <0x71, BUFFER_WBINVL1>;
 
 class MTBUF_Real_si <bits<3> op, MTBUF_Pseudo ps> :
   MTBUF_Real<ps>,
+  Enc64,
   SIMCInstr<ps.PseudoInstr, SIEncodingFamily.SI> {
   let AssemblerPredicate=isSICI;
   let DecoderNamespace="SICI";
 
-  bits<1> addr64;
-  let Inst{15}    = addr64;
+  let Inst{11-0}  = !if(ps.has_offset, offset, ?);
+  let Inst{12}    = ps.offen;
+  let Inst{13}    = ps.idxen;
+  let Inst{14}    = !if(ps.has_glc, glc, ps.glc_value);
+  let Inst{15}    = ps.addr64;
   let Inst{18-16} = op;
+  let Inst{22-19} = !if(ps.has_dfmt, dfmt, ps.dfmt_value);
+  let Inst{25-23} = !if(ps.has_nfmt, nfmt, ps.nfmt_value);
+  let Inst{31-26} = 0x3a; //encoding
+  let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
+  let Inst{47-40} = !if(ps.has_vdata, vdata, ?);
+  let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?);
+  let Inst{54}    = !if(ps.has_slc, slc, ?);
+  let Inst{55}    = !if(ps.has_tfe, tfe, ?);
+  let Inst{63-56} = !if(ps.has_soffset, soffset, ?);
 }
 
-def TBUFFER_LOAD_FORMAT_XYZW_si  : MTBUF_Real_si <3, TBUFFER_LOAD_FORMAT_XYZW>;
-def TBUFFER_STORE_FORMAT_X_si    : MTBUF_Real_si <4, TBUFFER_STORE_FORMAT_X>;
-def TBUFFER_STORE_FORMAT_XY_si   : MTBUF_Real_si <5, TBUFFER_STORE_FORMAT_XY>;
-def TBUFFER_STORE_FORMAT_XYZ_si  : MTBUF_Real_si <6, TBUFFER_STORE_FORMAT_XYZ>;
-def TBUFFER_STORE_FORMAT_XYZW_si : MTBUF_Real_si <7, TBUFFER_STORE_FORMAT_XYZW>;
+multiclass MTBUF_Real_AllAddr_si<bits<3> op> {
+  def _OFFSET_si : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_OFFSET")>;
+  def _ADDR64_si : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_ADDR64")>;
+  def _OFFEN_si  : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_OFFEN")>;
+  def _IDXEN_si  : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_IDXEN")>;
+  def _BOTHEN_si : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN")>;
+}
 
+defm TBUFFER_LOAD_FORMAT_X     : MTBUF_Real_AllAddr_si <0>;
+defm TBUFFER_LOAD_FORMAT_XY    : MTBUF_Real_AllAddr_si <1>;
+//defm TBUFFER_LOAD_FORMAT_XYZ   : MTBUF_Real_AllAddr_si <2>;
+defm TBUFFER_LOAD_FORMAT_XYZW  : MTBUF_Real_AllAddr_si <3>;
+defm TBUFFER_STORE_FORMAT_X    : MTBUF_Real_AllAddr_si <4>;
+defm TBUFFER_STORE_FORMAT_XY   : MTBUF_Real_AllAddr_si <5>;
+defm TBUFFER_STORE_FORMAT_XYZ  : MTBUF_Real_AllAddr_si <6>;
+defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_si <7>;
 
 //===----------------------------------------------------------------------===//
 // CI
@@ -1350,16 +1582,39 @@ def BUFFER_WBINVL1_VOL_vi       : MUBUF_Real_vi <0x3f, BUFFER_WBINVL1_VOL>;
 
 class MTBUF_Real_vi <bits<4> op, MTBUF_Pseudo ps> :
   MTBUF_Real<ps>,
+  Enc64,
   SIMCInstr<ps.PseudoInstr, SIEncodingFamily.VI> {
   let AssemblerPredicate=isVI;
   let DecoderNamespace="VI";
 
+  let Inst{11-0}  = !if(ps.has_offset, offset, ?);
+  let Inst{12}    = ps.offen;
+  let Inst{13}    = ps.idxen;
+  let Inst{14}    = !if(ps.has_glc, glc, ps.glc_value);
   let Inst{18-15} = op;
+  let Inst{22-19} = !if(ps.has_dfmt, dfmt, ps.dfmt_value);
+  let Inst{25-23} = !if(ps.has_nfmt, nfmt, ps.nfmt_value);
+  let Inst{31-26} = 0x3a; //encoding
+  let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
+  let Inst{47-40} = !if(ps.has_vdata, vdata, ?);
+  let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?);
+  let Inst{54}    = !if(ps.has_slc, slc, ?);
+  let Inst{55}    = !if(ps.has_tfe, tfe, ?);
+  let Inst{63-56} = !if(ps.has_soffset, soffset, ?);
 }
 
-def TBUFFER_LOAD_FORMAT_XYZW_vi  : MTBUF_Real_vi <3, TBUFFER_LOAD_FORMAT_XYZW>;
-def TBUFFER_STORE_FORMAT_X_vi    : MTBUF_Real_vi <4, TBUFFER_STORE_FORMAT_X>;
-def TBUFFER_STORE_FORMAT_XY_vi   : MTBUF_Real_vi <5, TBUFFER_STORE_FORMAT_XY>;
-def TBUFFER_STORE_FORMAT_XYZ_vi  : MTBUF_Real_vi <6, TBUFFER_STORE_FORMAT_XYZ>;
-def TBUFFER_STORE_FORMAT_XYZW_vi : MTBUF_Real_vi <7, TBUFFER_STORE_FORMAT_XYZW>;
+multiclass MTBUF_Real_AllAddr_vi<bits<4> op> {
+  def _OFFSET_vi : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_OFFSET")>;
+  def _OFFEN_vi  : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_OFFEN")>;
+  def _IDXEN_vi  : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_IDXEN")>;
+  def _BOTHEN_vi : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN")>;
+}
 
+defm TBUFFER_LOAD_FORMAT_X     : MTBUF_Real_AllAddr_vi <0>;
+defm TBUFFER_LOAD_FORMAT_XY    : MTBUF_Real_AllAddr_vi <1>;
+//defm TBUFFER_LOAD_FORMAT_XYZ   : MTBUF_Real_AllAddr_vi <2>;
+defm TBUFFER_LOAD_FORMAT_XYZW  : MTBUF_Real_AllAddr_vi <3>;
+defm TBUFFER_STORE_FORMAT_X    : MTBUF_Real_AllAddr_vi <4>;
+defm TBUFFER_STORE_FORMAT_XY   : MTBUF_Real_AllAddr_vi <5>;
+defm TBUFFER_STORE_FORMAT_XYZ  : MTBUF_Real_AllAddr_vi <6>;
+defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <7>;
diff --git a/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
index 2ff9c3f3190d..7c31c8e397ba 100644
--- a/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
@@ -231,6 +231,24 @@ void AMDGPUInstPrinter::printExpVM(const MCInst *MI, unsigned OpNo,
     O << " vm";
 }
 
+void AMDGPUInstPrinter::printDFMT(const MCInst *MI, unsigned OpNo,
+                                  const MCSubtargetInfo &STI,
+                                  raw_ostream &O) {
+  if (MI->getOperand(OpNo).getImm()) {
+    O << " dfmt:";
+    printU8ImmDecOperand(MI, OpNo, O);
+  }
+}
+
+void AMDGPUInstPrinter::printNFMT(const MCInst *MI, unsigned OpNo,
+                                  const MCSubtargetInfo &STI,
+                                  raw_ostream &O) {
+  if (MI->getOperand(OpNo).getImm()) {
+    O << " nfmt:";
+    printU8ImmDecOperand(MI, OpNo, O);
+  }
+}
+
 void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O,
                                         const MCRegisterInfo &MRI) {
   switch (RegNo) {
diff --git a/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
index f7b9c88e20fa..7bbf99a85f40 100644
--- a/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
+++ b/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
@@ -88,6 +88,10 @@ private:
                      const MCSubtargetInfo &STI, raw_ostream &O);
   void printExpVM(const MCInst *MI, unsigned OpNo,
                   const MCSubtargetInfo &STI, raw_ostream &O);
+  void printDFMT(const MCInst *MI, unsigned OpNo,
+                 const MCSubtargetInfo &STI, raw_ostream &O);
+  void printNFMT(const MCInst *MI, unsigned OpNo,
+                 const MCSubtargetInfo &STI, raw_ostream &O);
 
   void printRegOperand(unsigned RegNo, raw_ostream &O);
   void printVOPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 3563eb365b96..880c0dda945a 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3295,6 +3295,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
                                                  SelectionDAG &DAG) const {
   unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
   SDLoc DL(Op);
+  MachineFunction &MF = DAG.getMachineFunction();
+
   switch (IntrID) {
   case Intrinsic::amdgcn_atomic_inc:
   case Intrinsic::amdgcn_atomic_dec: {
@@ -3320,7 +3322,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
       Op.getOperand(5), // glc
       Op.getOperand(6)  // slc
     };
-    MachineFunction &MF = DAG.getMachineFunction();
     SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 
     unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
@@ -3335,6 +3336,29 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
 
     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, MMO);
   }
+  case Intrinsic::amdgcn_tbuffer_load: {
+    SDValue Ops[] = {
+      Op.getOperand(0),  // Chain
+      Op.getOperand(2),  // rsrc
+      Op.getOperand(3),  // vindex
+      Op.getOperand(4),  // voffset
+      Op.getOperand(5),  // soffset
+      Op.getOperand(6),  // offset
+      Op.getOperand(7),  // dfmt
+      Op.getOperand(8),  // nfmt
+      Op.getOperand(9),  // glc
+      Op.getOperand(10)   // slc
+    };
+
+    EVT VT = Op.getOperand(2).getValueType();
+
+    MachineMemOperand *MMO = MF.getMachineMemOperand(
+      MachinePointerInfo(),
+      MachineMemOperand::MOLoad,
+      VT.getStoreSize(), VT.getStoreSize());
+    return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
+                                   Op->getVTList(), Ops, VT, MMO);
+  }
   // Basic sample.
   case Intrinsic::amdgcn_image_sample:
   case Intrinsic::amdgcn_image_sample_cl:
@@ -3400,10 +3424,10 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
 
 SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
                                               SelectionDAG &DAG) const {
-  MachineFunction &MF = DAG.getMachineFunction();
   SDLoc DL(Op);
   SDValue Chain = Op.getOperand(0);
   unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+  MachineFunction &MF = DAG.getMachineFunction();
 
   switch (IntrinsicID) {
   case Intrinsic::amdgcn_exp: {
@@ -3470,33 +3494,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
     return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain,
                        Op.getOperand(2), Op.getOperand(3));
   }
-  case AMDGPUIntrinsic::SI_tbuffer_store: {
-    SDValue Ops[] = {
-      Chain,
-      Op.getOperand(2),
-      Op.getOperand(3),
-      Op.getOperand(4),
-      Op.getOperand(5),
-      Op.getOperand(6),
-      Op.getOperand(7),
-      Op.getOperand(8),
-      Op.getOperand(9),
-      Op.getOperand(10),
-      Op.getOperand(11),
-      Op.getOperand(12),
-      Op.getOperand(13),
-      Op.getOperand(14)
-    };
-
-    EVT VT = Op.getOperand(3).getValueType();
-
-    MachineMemOperand *MMO = MF.getMachineMemOperand(
-      MachinePointerInfo(),
-      MachineMemOperand::MOStore,
-      VT.getStoreSize(), 4);
-    return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL,
-                                   Op->getVTList(), Ops, VT, MMO);
-  }
   case AMDGPUIntrinsic::AMDGPU_kill: {
     SDValue Src = Op.getOperand(2);
     if (const ConstantFPSDNode *K = dyn_cast<ConstantFPSDNode>(Src)) {
@@ -3512,7 +3509,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
   }
   case Intrinsic::amdgcn_s_barrier: {
     if (getTargetMachine().getOptLevel() > CodeGenOpt::None) {
-      const MachineFunction &MF = DAG.getMachineFunction();
       const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
       unsigned WGSize = ST.getFlatWorkGroupSizes(*MF.getFunction()).second;
       if (WGSize <= ST.getWavefrontSize())
@@ -3521,6 +3517,76 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
     }
     return SDValue();
   };
+  case AMDGPUIntrinsic::SI_tbuffer_store: {
+
+    // Extract vindex and voffset from vaddr as appropriate
+    const ConstantSDNode *OffEn = cast<ConstantSDNode>(Op.getOperand(10));
+    const ConstantSDNode *IdxEn = cast<ConstantSDNode>(Op.getOperand(11));
+    SDValue VAddr = Op.getOperand(5);
+
+    SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
+
+    assert(!(OffEn->isOne() && IdxEn->isOne()) &&
+           "Legacy intrinsic doesn't support both offset and index - use new version");
+
+    SDValue VIndex = IdxEn->isOne() ? VAddr : Zero;
+    SDValue VOffset = OffEn->isOne() ? VAddr : Zero;
+
+    // Deal with the vec-3 case
+    const ConstantSDNode *NumChannels = cast<ConstantSDNode>(Op.getOperand(4));
+    auto Opcode = NumChannels->getZExtValue() == 3 ?
+      AMDGPUISD::TBUFFER_STORE_FORMAT_X3 : AMDGPUISD::TBUFFER_STORE_FORMAT;
+
+    SDValue Ops[] = {
+     Chain,
+     Op.getOperand(3),  // vdata
+     Op.getOperand(2),  // rsrc
+     VIndex,
+     VOffset,
+     Op.getOperand(6),  // soffset
+     Op.getOperand(7),  // inst_offset
+     Op.getOperand(8),  // dfmt
+     Op.getOperand(9),  // nfmt
+     Op.getOperand(12), // glc
+     Op.getOperand(13), // slc
+    };
+
+    const ConstantSDNode *tfe = cast<ConstantSDNode>(Op.getOperand(14));
+    assert(tfe->getZExtValue() == 0 &&
+           "Value of tfe other than zero is unsupported");
+
+    EVT VT = Op.getOperand(3).getValueType();
+    MachineMemOperand *MMO = MF.getMachineMemOperand(
+      MachinePointerInfo(),
+      MachineMemOperand::MOStore,
+      VT.getStoreSize(), 4);
+    return DAG.getMemIntrinsicNode(Opcode, DL,
+                                   Op->getVTList(), Ops, VT, MMO);
+  }
+
+  case Intrinsic::amdgcn_tbuffer_store: {
+    SDValue Ops[] = {
+      Chain,
+      Op.getOperand(2),  // vdata
+      Op.getOperand(3),  // rsrc
+      Op.getOperand(4),  // vindex
+      Op.getOperand(5),  // voffset
+      Op.getOperand(6),  // soffset
+      Op.getOperand(7),  // offset
+      Op.getOperand(8),  // dfmt
+      Op.getOperand(9),  // nfmt
+      Op.getOperand(10), // glc
+      Op.getOperand(11)  // slc
+    };
+    EVT VT = Op.getOperand(3).getValueType();
+    MachineMemOperand *MMO = MF.getMachineMemOperand(
+      MachinePointerInfo(),
+      MachineMemOperand::MOStore,
+      VT.getStoreSize(), 4);
+    return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL,
+                                   Op->getVTList(), Ops, VT, MMO);
+  }
+
   default:
     return Op;
   }
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index a8686ec2999c..3b4a8b5d1e81 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -41,25 +41,41 @@ def SIatomic_dec : SDNode<"AMDGPUISD::ATOMIC_DEC", SDTAtomic2,
   [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
 >;
 
-def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT",
-  SDTypeProfile<0, 13,
-    [SDTCisVT<0, v4i32>,   // rsrc(SGPR)
-     SDTCisVT<1, iAny>,   // vdata(VGPR)
-     SDTCisVT<2, i32>,    // num_channels(imm)
-     SDTCisVT<3, i32>,    // vaddr(VGPR)
+def SItbuffer_load : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT",
+  SDTypeProfile<1, 9,
+    [                     // vdata
+     SDTCisVT<1, v4i32>,  // rsrc
+     SDTCisVT<2, i32>,    // vindex(VGPR)
+     SDTCisVT<3, i32>,    // voffset(VGPR)
      SDTCisVT<4, i32>,    // soffset(SGPR)
-     SDTCisVT<5, i32>,    // inst_offset(imm)
+     SDTCisVT<5, i32>,    // offset(imm)
      SDTCisVT<6, i32>,    // dfmt(imm)
      SDTCisVT<7, i32>,    // nfmt(imm)
-     SDTCisVT<8, i32>,    // offen(imm)
-     SDTCisVT<9, i32>,    // idxen(imm)
-     SDTCisVT<10, i32>,   // glc(imm)
-     SDTCisVT<11, i32>,   // slc(imm)
-     SDTCisVT<12, i32>    // tfe(imm)
+     SDTCisVT<8, i32>,    // glc(imm)
+     SDTCisVT<9, i32>     // slc(imm)
     ]>,
-  [SDNPMayStore, SDNPMemOperand, SDNPHasChain]
+  [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
 >;
 
+def SDTtbuffer_store : SDTypeProfile<0, 10,
+    [                     // vdata
+     SDTCisVT<1, v4i32>,  // rsrc
+     SDTCisVT<2, i32>,    // vindex(VGPR)
+     SDTCisVT<3, i32>,    // voffset(VGPR)
+     SDTCisVT<4, i32>,    // soffset(SGPR)
+     SDTCisVT<5, i32>,    // offset(imm)
+     SDTCisVT<6, i32>,    // dfmt(imm)
+     SDTCisVT<7, i32>,    // nfmt(imm)
+     SDTCisVT<8, i32>,    // glc(imm)
+     SDTCisVT<9, i32>     // slc(imm)
+    ]>;
+
+def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT", SDTtbuffer_store,
+                             [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
+def SItbuffer_store_x3 : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT_X3",
+                                SDTtbuffer_store,
+                                [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
+
 def SDTBufferLoad : SDTypeProfile<1, 5,
     [                    // vdata
      SDTCisVT<1, v4i32>, // rsrc
@@ -547,6 +563,9 @@ def lwe : NamedOperandBit<"LWE", NamedMatchClass<"LWE">>;
 def exp_compr : NamedOperandBit<"ExpCompr", NamedMatchClass<"ExpCompr">>;
 def exp_vm : NamedOperandBit<"ExpVM", NamedMatchClass<"ExpVM">>;
 
+def DFMT : NamedOperandU8<"DFMT", NamedMatchClass<"DFMT">>;
+def NFMT : NamedOperandU8<"NFMT", NamedMatchClass<"NFMT">>;
+
 def dmask : NamedOperandU16<"DMask", NamedMatchClass<"DMask">>;
 
 def dpp_ctrl : NamedOperandU32<"DPPCtrl", NamedMatchClass<"DPPCtrl", 0>>;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll
index 645c6a6b8d7e..cd9c082ed941 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll
@@ -2,7 +2,7 @@
 ;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
 
 ;CHECK-LABEL: {{^}}test1:
-;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, 0x20, -1, 0, -1, 0, 14, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
+;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offen offset:32 glc slc
 define amdgpu_vs void @test1(i32 %a1, i32 %vaddr) {
     %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
     call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata,
@@ -11,8 +11,38 @@ define amdgpu_vs void @test1(i32 %a1, i32 %vaddr) {
     ret void
 }
 
+;CHECK-LABEL: {{^}}test1_idx:
+;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen offset:32 glc slc
+define amdgpu_vs void @test1_idx(i32 %a1, i32 %vaddr) {
+    %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
+    call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata,
+        i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 0, i32 1, i32 1,
+        i32 1, i32 0)
+    ret void
+}
+
+;CHECK-LABEL: {{^}}test1_scalar_offset:
+;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, {{s[0-9]+}} idxen offset:32 glc slc
+define amdgpu_vs void @test1_scalar_offset(i32 %a1, i32 %vaddr, i32 inreg %soffset) {
+    %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
+    call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata,
+        i32 4, i32 %vaddr, i32 %soffset, i32 32, i32 14, i32 4, i32 0, i32 1, i32 1,
+        i32 1, i32 0)
+    ret void
+}
+
+;CHECK-LABEL: {{^}}test1_no_glc_slc:
+;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offen offset:32
+define amdgpu_vs void @test1_no_glc_slc(i32 %a1, i32 %vaddr) {
+    %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
+    call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata,
+        i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 1, i32 0, i32 0,
+        i32 0, i32 0)
+    ret void
+}
+
 ;CHECK-LABEL: {{^}}test2:
-;CHECK: tbuffer_store_format_xyz {{v\[[0-9]+:[0-9]+\]}}, 0x18, -1, 0, -1, 0, 13, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
+;CHECK: tbuffer_store_format_xyz {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, 0 offen offset:24 glc slc
 define amdgpu_vs void @test2(i32 %a1, i32 %vaddr) {
     %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
     call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata,
@@ -22,7 +52,7 @@ define amdgpu_vs void @test2(i32 %a1, i32 %vaddr) {
 }
 
 ;CHECK-LABEL: {{^}}test3:
-;CHECK: tbuffer_store_format_xy {{v\[[0-9]+:[0-9]+\]}}, 0x10, -1, 0, -1, 0, 11, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
+;CHECK: tbuffer_store_format_xy {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:11, nfmt:4, 0 offen offset:16 glc slc
 define amdgpu_vs void @test3(i32 %a1, i32 %vaddr) {
     %vdata = insertelement <2 x i32> undef, i32 %a1, i32 0
     call void @llvm.SI.tbuffer.store.v2i32(<16 x i8> undef, <2 x i32> %vdata,
@@ -32,7 +62,7 @@ define amdgpu_vs void @test3(i32 %a1, i32 %vaddr) {
 }
 
 ;CHECK-LABEL: {{^}}test4:
-;CHECK: tbuffer_store_format_x {{v[0-9]+}}, 0x8, -1, 0, -1, 0, 4, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
+;CHECK: tbuffer_store_format_x {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:4, nfmt:4, 0 offen offset:8 glc slc
 define amdgpu_vs void @test4(i32 %vdata, i32 %vaddr) {
     call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %vdata,
         i32 1, i32 %vaddr, i32 0, i32 8, i32 4, i32 4, i32 1, i32 0, i32 1,
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.ll
new file mode 100644
index 000000000000..712ee7ad1e5c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.ll
@@ -0,0 +1,109 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=GCN %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}tbuffer_load:
+; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0
+; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:15, nfmt:3, 0 glc
+; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:6, nfmt:1, 0 slc
+; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:6, nfmt:1, 0
+; GCN: s_waitcnt
+define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>, <4 x float>} @tbuffer_load(<4 x i32> inreg) {
+main_body:
+    %vdata     = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0)
+    %vdata_glc = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 15, i32 3, i1 1, i1 0)
+    %vdata_slc = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 1)
+    %vdata_f32 = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 0)
+    %vdata.f     = bitcast <4 x i32> %vdata to <4 x float>
+    %vdata_glc.f = bitcast <4 x i32> %vdata_glc to <4 x float>
+    %vdata_slc.f = bitcast <4 x i32> %vdata_slc to <4 x float>
+    %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %vdata.f, 0
+    %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %vdata_glc.f, 1
+    %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %vdata_slc.f, 2
+    %r3 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r2, <4 x float> %vdata_f32, 3
+    ret {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r3
+}
+
+; GCN-LABEL: {{^}}tbuffer_load_immoffs:
+; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offset:42
+define amdgpu_vs <4 x float> @tbuffer_load_immoffs(<4 x i32> inreg) {
+main_body:
+    %vdata   = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 42, i32 14, i32 4, i1 0, i1 0)
+    %vdata.f = bitcast <4 x i32> %vdata to <4 x float>
+    ret <4 x float> %vdata.f
+}
+
+; GCN-LABEL: {{^}}tbuffer_load_immoffs_large
+; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:15, nfmt:2, 61 offset:4095
+; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:3, {{s[0-9]+}} offset:73
+; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, {{s[0-9]+}} offset:1
+; GCN: s_waitcnt
+define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>} @tbuffer_load_immoffs_large(<4 x i32> inreg, i32 inreg %soffs) {
+    %vdata     = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 61, i32 4095, i32 15, i32 2, i1 0, i1 0)
+    %vdata_glc = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 %soffs, i32 73, i32 14, i32 3, i1 0, i1 0)
+    %vdata_slc = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 %soffs, i32 1, i32 13, i32 4, i1 0, i1 0)
+    %vdata.f     = bitcast <4 x i32> %vdata to <4 x float>
+    %vdata_glc.f = bitcast <4 x i32> %vdata_glc to <4 x float>
+    %vdata_slc.f = bitcast <4 x i32> %vdata_slc to <4 x float>
+    %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %vdata.f, 0
+    %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %vdata_glc.f, 1
+    %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %vdata_slc.f, 2
+    ret {<4 x float>, <4 x float>, <4 x float>} %r2
+}
+
+; GCN-LABEL: {{^}}tbuffer_load_idx:
+; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen
+define amdgpu_vs <4 x float> @tbuffer_load_idx(<4 x i32> inreg, i32 %vindex) {
+main_body:
+    %vdata   = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 %vindex, i32 0, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0)
+    %vdata.f = bitcast <4 x i32> %vdata to <4 x float>
+    ret <4 x float> %vdata.f
+}
+
+; GCN-LABEL: {{^}}tbuffer_load_ofs:
+; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offen
+define amdgpu_vs <4 x float> @tbuffer_load_ofs(<4 x i32> inreg, i32 %voffs) {
+main_body:
+    %vdata   = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 %voffs, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0)
+    %vdata.f = bitcast <4 x i32> %vdata to <4 x float>
+    ret <4 x float> %vdata.f
+}
+
+; GCN-LABEL: {{^}}tbuffer_load_ofs_imm:
+; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offen offset:52
+define amdgpu_vs <4 x float> @tbuffer_load_ofs_imm(<4 x i32> inreg, i32 %voffs) {
+main_body:
+    %vdata   = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 %voffs, i32 0, i32 52, i32 14, i32 4, i1 0, i1 0)
+    %vdata.f = bitcast <4 x i32> %vdata to <4 x float>
+    ret <4 x float> %vdata.f
+}
+
+; GCN-LABEL: {{^}}tbuffer_load_both:
+; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen offen
+define amdgpu_vs <4 x float> @tbuffer_load_both(<4 x i32> inreg, i32 %vindex, i32 %voffs) {
+main_body:
+    %vdata   = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 %vindex, i32 %voffs, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0)
+    %vdata.f = bitcast <4 x i32> %vdata to <4 x float>
+    ret <4 x float> %vdata.f
+}
+
+
+; GCN-LABEL: {{^}}buffer_load_xy:
+; GCN: tbuffer_load_format_xy {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, 0
+define amdgpu_vs <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) {
+    %vdata = call <2 x i32> @llvm.amdgcn.tbuffer.load.v2i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 13, i32 4, i1 0, i1 0)
+    %vdata.f = bitcast <2 x i32> %vdata to <2 x float>
+    ret <2 x float> %vdata.f
+}
+
+; GCN-LABEL: {{^}}buffer_load_x:
+; GCN: tbuffer_load_format_x {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, 0
+define amdgpu_vs float @buffer_load_x(<4 x i32> inreg %rsrc) {
+    %vdata = call i32 @llvm.amdgcn.tbuffer.load.i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 13, i32 4, i1 0, i1 0)
+    %vdata.f = bitcast i32 %vdata to float
+    ret float %vdata.f
+}
+
+declare i32 @llvm.amdgcn.tbuffer.load.i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
+declare <2 x i32> @llvm.amdgcn.tbuffer.load.v2i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
+declare <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
+declare <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.ll
new file mode 100644
index 000000000000..997d7f529461
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.ll
@@ -0,0 +1,110 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=GCN %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}tbuffer_store:
+; GCN: tbuffer_store_format_xyzw v[0:3], off, s[0:3], dfmt:12, nfmt:2, 0
+; GCN: tbuffer_store_format_xyzw v[4:7], off, s[0:3], dfmt:13, nfmt:3, 0 glc
+; GCN: tbuffer_store_format_xyzw v[8:11], off, s[0:3], dfmt:14, nfmt:4, 0 slc
+; GCN: tbuffer_store_format_xyzw v[8:11], off, s[0:3], dfmt:14, nfmt:4, 0
+define amdgpu_ps void @tbuffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) {
+main_body:
+  %in1 = bitcast <4 x float> %1 to <4 x i32>
+  %in2 = bitcast <4 x float> %2 to <4 x i32>
+  %in3 = bitcast <4 x float> %3 to <4 x i32>
+  call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 12, i32 2, i1 0, i1 0)
+  call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in2, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 13, i32 3, i1 1, i1 0)
+  call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in3, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 14, i32 4, i1 0, i1 1)
+  call void @llvm.amdgcn.tbuffer.store.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}tbuffer_store_immoffs:
+; GCN: tbuffer_store_format_xyzw v[0:3], off, s[0:3], dfmt:5, nfmt:7, 0 offset:42
+define amdgpu_ps void @tbuffer_store_immoffs(<4 x i32> inreg, <4 x float>) {
+main_body:
+  %in1 = bitcast <4 x float> %1 to <4 x i32>
+  call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 42, i32 5, i32 7, i1 0, i1 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}tbuffer_store_scalar_and_imm_offs:
+; GCN: tbuffer_store_format_xyzw v[0:3], off, s[0:3], dfmt:5, nfmt:7, {{s[0-9]+}} offset:42
+define amdgpu_ps void @tbuffer_store_scalar_and_imm_offs(<4 x i32> inreg, <4 x float> %vdata, i32 inreg %soffset) {
+main_body:
+  %in1 = bitcast <4 x float> %vdata to <4 x i32>
+  call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 0, i32 %soffset, i32 42, i32 5, i32 7, i1 0, i1 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}buffer_store_idx:
+; GCN: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], dfmt:15, nfmt:2, 0 idxen
+define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex) {
+main_body:
+  %in1 = bitcast <4 x float> %vdata to <4 x i32>
+  call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 %vindex, i32 0, i32 0, i32 0, i32 15, i32 2, i1 0, i1 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}buffer_store_ofs:
+; GCN: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], dfmt:3, nfmt:7, 0 offen
+define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float> %vdata, i32 %voffset) {
+main_body:
+  %in1 = bitcast <4 x float> %vdata to <4 x i32>
+  call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 %voffset, i32 0, i32 0, i32 3, i32 7, i1 0, i1 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}buffer_store_both:
+; GCN: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], dfmt:6, nfmt:4, 0 idxen offen
+define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex, i32 %voffset) {
+main_body:
+  %in1 = bitcast <4 x float> %vdata to <4 x i32>
+  call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 %vindex, i32 %voffset, i32 0, i32 0, i32 6, i32 4, i1 0, i1 0)
+  ret void
+}
+
+; Ideally, the register allocator would avoid the wait here
+;
+; GCN-LABEL: {{^}}buffer_store_wait:
+; GCN: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], dfmt:15, nfmt:3, 0 idxen
+; GCN: s_waitcnt expcnt(0)
+; GCN: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen
+; GCN: s_waitcnt vmcnt(0)
+; GCN: tbuffer_store_format_xyzw v[0:3], v6, s[0:3], dfmt:16, nfmt:2, 0 idxen
+define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex.1, i32 %vindex.2, i32 %vindex.3) {
+main_body:
+  %in1 = bitcast <4 x float> %vdata to <4 x i32>
+  call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 %vindex.1, i32 0, i32 0, i32 0, i32 15, i32 3, i1 0, i1 0)
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %vindex.2, i32 0, i1 0, i1 0)
+  %data.i = bitcast <4 x float> %data to <4 x i32>
+  call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %data.i, <4 x i32> %0, i32 %vindex.3, i32 0, i32 0, i32 0, i32 16, i32 2, i1 0, i1 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}buffer_store_x1:
+; GCN: tbuffer_store_format_x v0, v1, s[0:3], dfmt:13, nfmt:7, 0 idxen
+define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %vindex) {
+main_body:
+  %data.i = bitcast float %data to i32
+  call void @llvm.amdgcn.tbuffer.store.i32(i32 %data.i, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 13, i32 7, i1 0, i1 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}buffer_store_x2:
+; GCN: tbuffer_store_format_xy v[0:1], v2, s[0:3], dfmt:1, nfmt:2, 0 idxen
+define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %vindex) {
+main_body:
+  %data.i = bitcast <2 x float> %data to <2 x i32>
+  call void @llvm.amdgcn.tbuffer.store.v2i32(<2 x i32> %data.i, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0)
+  ret void
+}
+
+declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0
+declare void @llvm.amdgcn.tbuffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0
+declare void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0
+declare void @llvm.amdgcn.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0
+declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
+
diff --git a/llvm/test/CodeGen/AMDGPU/merge-store-crash.ll b/llvm/test/CodeGen/AMDGPU/merge-store-crash.ll
index ef552e295fd4..1252a5c0c02a 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-store-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/merge-store-crash.ll
@@ -26,11 +26,11 @@ main_body:
   %tmp9 = insertelement <4 x i32> %tmp8, i32 %tmp7, i32 1
   %tmp10 = insertelement <4 x i32> %tmp9, i32 undef, i32 2
   %tmp11 = insertelement <4 x i32> %tmp10, i32 undef, i32 3
-  call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %tmp11, i32 4, i32 undef, i32 %arg, i32 0, i32 14, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %tmp11, <4 x i32> undef, i32 undef, i32 0, i32 %arg, i32 0, i32 14, i32 4, i1 1, i1 1)
   ret void
 }
 
 ; Function Attrs: nounwind
-declare void @llvm.SI.tbuffer.store.v4i32(<16 x i8>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0
 
 attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/merge-store-usedef.ll b/llvm/test/CodeGen/AMDGPU/merge-store-usedef.ll
index 1c82aeb9b7f5..958692e0c92b 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-store-usedef.ll
+++ b/llvm/test/CodeGen/AMDGPU/merge-store-usedef.ll
@@ -11,13 +11,13 @@ define amdgpu_vs void @test1(i32 %v) #0 {
 
   store i32 %v, i32 addrspace(3)* %p0
 
-  call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %v, i32 1, i32 undef, i32 undef, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  call void @llvm.amdgcn.tbuffer.store.i32(i32 %v, <4 x i32> undef, i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i1 1, i1 0)
 
   %w = load i32, i32 addrspace(3)* %p0
   store i32 %w, i32 addrspace(3)* %p1
   ret void
 }
 
-declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0
 
 attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/mubuf.ll b/llvm/test/CodeGen/AMDGPU/mubuf.ll
index 9e1d2e0490c7..d883b87ec401 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf.ll
+++ b/llvm/test/CodeGen/AMDGPU/mubuf.ll
@@ -62,7 +62,8 @@ main_body:
   %tmp2 = shl i32 %6, 2
   %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp1, i32 %tmp2, i32 64, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0)
   %tmp4 = add i32 %6, 16
-  call void @llvm.SI.tbuffer.store.i32(<16 x i8> %tmp1, i32 %tmp3, i32 1, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  %tmp1.4xi32 = bitcast <16 x i8> %tmp1 to <4 x i32>
+  call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp3, <4 x i32> %tmp1.4xi32, i32 0, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i1 1, i1 1)
   ret void
 }
 
@@ -80,7 +81,8 @@ main_body:
   %tmp2 = shl i32 %6, 2
   %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp1, i32 %tmp2, i32 65, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0)
   %tmp4 = add i32 %6, 16
-  call void @llvm.SI.tbuffer.store.i32(<16 x i8> %tmp1, i32 %tmp3, i32 1, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  %tmp1.4xi32 = bitcast <16 x i8> %tmp1 to <4 x i32>
+  call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp3, <4 x i32> %tmp1.4xi32, i32 0, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i1 1, i1 1)
   ret void
 }
 
@@ -175,6 +177,6 @@ define amdgpu_kernel void @store_vgpr_ptr(i32 addrspace(1)* %out) #0 {
 }
 
 declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
+declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
 
 attributes #0 = { nounwind readonly }
diff --git a/llvm/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll b/llvm/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll
index c0e8e58556fc..47e32724d9ca 100644
--- a/llvm/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll
@@ -25,29 +25,29 @@ main_body:
   %array_vector10 = insertelement <4 x float> %array_vector9, float 0.000000e+00, i32 2
   %array_vector11 = insertelement <4 x float> %array_vector10, float undef, i32 3
   %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> undef, i32 undef, i32 4864, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0)
-  call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %tmp3, i32 1, i32 36, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp3, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 36, i32 4, i32 4, i1 1, i1 1)
   %bc = bitcast <4 x float> %array_vector3 to <4 x i32>
   %tmp4 = extractelement <4 x i32> %bc, i32 undef
-  call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %tmp4, i32 1, i32 48, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp4, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 48, i32 4, i32 4, i1 1, i1 1)
   %bc49 = bitcast <4 x float> %array_vector11 to <4 x i32>
   %tmp5 = extractelement <4 x i32> %bc49, i32 undef
-  call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %tmp5, i32 1, i32 72, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp5, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 72, i32 4, i32 4, i1 1, i1 1)
   %array_vector21 = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %tmp, i32 1
   %array_vector22 = insertelement <4 x float> %array_vector21, float undef, i32 2
   %array_vector23 = insertelement <4 x float> %array_vector22, float undef, i32 3
-  call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 undef, i32 1, i32 28, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  call void @llvm.amdgcn.tbuffer.store.i32(i32 undef, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 28, i32 4, i32 4, i1 1, i1 1)
   %bc52 = bitcast <4 x float> %array_vector23 to <4 x i32>
   %tmp6 = extractelement <4 x i32> %bc52, i32 undef
-  call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %tmp6, i32 1, i32 64, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
-  call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 undef, i32 1, i32 20, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
-  call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 undef, i32 1, i32 56, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
-  call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 undef, i32 1, i32 92, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp6, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 64, i32 4, i32 4, i1 1, i1 1)
+  call void @llvm.amdgcn.tbuffer.store.i32(i32 undef, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 20, i32 4, i32 4, i1 1, i1 1)
+  call void @llvm.amdgcn.tbuffer.store.i32(i32 undef, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 56, i32 4, i32 4, i1 1, i1 1)
+  call void @llvm.amdgcn.tbuffer.store.i32(i32 undef, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 92, i32 4, i32 4, i1 1, i1 1)
   ret void
 }
 
 declare float @llvm.SI.load.const(<16 x i8>, i32) #1
 declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #2
-declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) #3
+declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #3
 
 attributes #0 = { nounwind "target-cpu"="tonga" }
 attributes #1 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
index 8a4cee264fd8..348c7200c0bc 100644
--- a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -march=amdgcn -mcpu=bonaire -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -check-prefix=FUNC -check-prefix=CI %s
 
-declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
-declare void @llvm.SI.tbuffer.store.v4i32(<16 x i8>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
+declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
+declare void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
 declare void @llvm.amdgcn.s.barrier() #1
 declare i32 @llvm.amdgcn.workitem.id.x() #2
 
@@ -258,9 +258,8 @@ define amdgpu_kernel void @reorder_global_offsets_addr64_soffset0(i32 addrspace(
 ;   %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4
 
 ;   %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
-;   call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata,
-;         i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 1, i32 0, i32 1,
-;         i32 1, i32 0)
+;   call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %vdata, <4 x i32> undef,
+;         i32 %vaddr, i32 0, i32 0, i32 32, i32 14, i32 4, i1 1, i1 1)
 
 ;   %tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4
 
diff --git a/llvm/test/MC/AMDGPU/mtbuf.s b/llvm/test/MC/AMDGPU/mtbuf.s
new file mode 100644
index 000000000000..9c8d71589bd0
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/mtbuf.s
@@ -0,0 +1,36 @@
+// RUN: llvm-mc -arch=amdgcn -mcpu=tahiti -show-encoding %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SICI %s
+// RUN: llvm-mc -arch=amdgcn -mcpu=bonaire -show-encoding %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=SICI %s
+// RUN: llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+//===----------------------------------------------------------------------===//
+// Test for dfmt and nfmt (tbuffer only)
+//===----------------------------------------------------------------------===//
+
+tbuffer_load_format_x v1, off, s[4:7], dfmt:15, nfmt:2, s1
+// SICI: tbuffer_load_format_x v1, off, s[4:7],  dfmt:15,  nfmt:2, s1 ; encoding: [0x00,0x00,0x78,0xe9,0x00,0x01,0x01,0x01]
+// VI:   tbuffer_load_format_x v1, off, s[4:7],  dfmt:15,  nfmt:2, s1 ; encoding: [0x00,0x00,0x78,0xe9,0x00,0x01,0x01,0x01]
+
+tbuffer_load_format_xy v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1
+// SICI: tbuffer_load_format_xy v[1:2], off, s[4:7],  dfmt:15,  nfmt:2, s1 ; encoding: [0x00,0x00,0x79,0xe9,0x00,0x01,0x01,0x01]
+// VI:   tbuffer_load_format_xy v[1:2], off, s[4:7],  dfmt:15,  nfmt:2, s1 ; encoding: [0x00,0x80,0x78,0xe9,0x00,0x01,0x01,0x01]
+
+tbuffer_load_format_xyzw v[1:4], off, s[4:7], dfmt:15, nfmt:2, s1
+// SICI: tbuffer_load_format_xyzw v[1:4], off, s[4:7],  dfmt:15,  nfmt:2, s1 ; encoding: [0x00,0x00,0x7b,0xe9,0x00,0x01,0x01,0x01]
+// VI:   tbuffer_load_format_xyzw v[1:4], off, s[4:7],  dfmt:15,  nfmt:2, s1 ; encoding: [0x00,0x80,0x79,0xe9,0x00,0x01,0x01,0x01]
+
+tbuffer_store_format_x v1, off, s[4:7], dfmt:15, nfmt:2, s1
+// SICI: tbuffer_store_format_x v1, off, s[4:7],  dfmt:15,  nfmt:2, s1 ; encoding: [0x00,0x00,0x7c,0xe9,0x00,0x01,0x01,0x01]
+// VI:   tbuffer_store_format_x v1, off, s[4:7],  dfmt:15,  nfmt:2, s1 ; encoding: [0x00,0x00,0x7a,0xe9,0x00,0x01,0x01,0x01]
+
+tbuffer_store_format_xy v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1
+// SICI: tbuffer_store_format_xy v[1:2], off, s[4:7],  dfmt:15,  nfmt:2, s1 ; encoding: [0x00,0x00,0x7d,0xe9,0x00,0x01,0x01,0x01]
+// VI:   tbuffer_store_format_xy v[1:2], off, s[4:7],  dfmt:15,  nfmt:2, s1 ; encoding: [0x00,0x80,0x7a,0xe9,0x00,0x01,0x01,0x01]
+
+tbuffer_store_format_xyzw v[1:4], off, s[4:7], dfmt:15, nfmt:2, s1
+// SICI: tbuffer_store_format_xyzw v[1:4], off, s[4:7],  dfmt:15,  nfmt:2, s1 ; encoding: [0x00,0x00,0x7f,0xe9,0x00,0x01,0x01,0x01]
+// VI:   tbuffer_store_format_xyzw v[1:4], off, s[4:7],  dfmt:15,  nfmt:2, s1 ; encoding: [0x00,0x80,0x7b,0xe9,0x00,0x01,0x01,0x01]
+
+tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:15, nfmt:2, ttmp1
+// SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7],  dfmt:15,  nfmt:2, ttmp1 ; encoding: [0x00,0x00,0x7f,0xe9,0x00,0x01,0x1d,0x71]
+// VI:   tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:15,  nfmt:2, ttmp1 ; encoding: [0x00,0x80,0x7b,0xe9,0x00,0x01,0x1d,0x71]
+
diff --git a/llvm/test/MC/Disassembler/AMDGPU/mtbuf_vi.txt b/llvm/test/MC/Disassembler/AMDGPU/mtbuf_vi.txt
new file mode 100644
index 000000000000..519e03ede69e
--- /dev/null
+++ b/llvm/test/MC/Disassembler/AMDGPU/mtbuf_vi.txt
@@ -0,0 +1,22 @@
+# RUN: llvm-mc -arch=amdgcn -mcpu=tonga -disassemble -show-encoding < %s | FileCheck %s -check-prefix=VI
+
+# VI:   tbuffer_load_format_x v1, off, s[4:7],  dfmt:15,  nfmt:2, s1 ; encoding: [0x00,0x00,0x78,0xe9,0x00,0x01,0x01,0x01]
+0x00 0x00 0x78 0xe9 0x00 0x01 0x01 0x01
+
+# VI:   tbuffer_load_format_xy v[1:2], off, s[4:7],  dfmt:15,  nfmt:2, s1 ; encoding: [0x00,0x80,0x78,0xe9,0x00,0x01,0x01,0x01]
+0x00 0x80 0x78 0xe9 0x00 0x01 0x01 0x01
+
+# VI:   tbuffer_load_format_xyzw v[1:4], off, s[4:7],  dfmt:15,  nfmt:2, s1 ; encoding: [0x00,0x80,0x79,0xe9,0x00,0x01,0x01,0x01]
+0x00 0x80 0x79 0xe9 0x00 0x01 0x01 0x01
+
+# VI:   tbuffer_store_format_x v1, off, s[4:7],  dfmt:15,  nfmt:2, s1 ; encoding: [0x00,0x00,0x7a,0xe9,0x00,0x01,0x01,0x01]
+0x00 0x00 0x7a 0xe9 0x00 0x01 0x01 0x01
+
+# VI:   tbuffer_store_format_xy v[1:2], off, s[4:7],  dfmt:15,  nfmt:2, s1 ; encoding: [0x00,0x80,0x7a,0xe9,0x00,0x01,0x01,0x01]
+0x00 0x80 0x7a 0xe9 0x00 0x01 0x01 0x01
+
+# VI:   tbuffer_store_format_xyzw v[1:4], off, s[4:7],  dfmt:15,  nfmt:2, s1 ; encoding: [0x00,0x80,0x7b,0xe9,0x00,0x01,0x01,0x01]
+0x00 0x80 0x7b 0xe9 0x00 0x01 0x01 0x01
+
+# VI:   tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7],  dfmt:15,  nfmt:2, ttmp1 ; encoding: [0x00,0x80,0x7b,0xe9,0x00,0x01,0x1d,0x71]
+0x00 0x80 0x7b 0xe9 0x00 0x01 0x1d 0x71