AMDGPU/GlobalISel: Adjust image load register type based on dmask

Trim elements that won't be written. The equivalent still needs to be
done for writes. Also start widening 3 elements to 4
elements. Selection will get the count from the dmask.
Matt Arsenault 2020-01-28 09:05:11 -05:00 committed by Matt Arsenault
parent 83ffbf2618
commit d9a012ed8a
4 changed files with 1983 additions and 146 deletions
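As a rough illustration of the bookkeeping this change introduces (a standalone sketch, not the LLVM code; the helper name and plain-integer model are invented for illustration): the number of result registers is the popcount of the dmask (gather4 always produces four components), a dmask of 0 still yields one register, and the loaded data is rounded up to whole dwords, plus one extra status dword when TFE is enabled. This models the packed-D16 path; unpacked-D16 hardware instead uses one s32 register per component.

#include <bitset>
#include <cstdio>

// Illustrative model of the dmask accounting in legalizeImageIntrinsic.
struct LoadShape {
  unsigned DataLanes;   // components actually loaded by the instruction
  unsigned LoadBits;    // dword-rounded size of the loaded data
  unsigned TotalDwords; // including the extra TFE status dword, if any
};

static LoadShape shapeForDMask(unsigned DMask, unsigned EltBits,
                               bool IsGather4, bool IsTFE) {
  // Gather4 always returns four components; otherwise count the dmask bits.
  unsigned Lanes = IsGather4 ? 4 : std::bitset<4>(DMask).count();
  if (Lanes == 0)
    Lanes = 1; // a dmask of 0 still produces one register
  unsigned RoundedBits = ((Lanes * EltBits + 31) / 32) * 32;
  return {Lanes, RoundedBits, RoundedBits / 32 + (IsTFE ? 1u : 0u)};
}

int main() {
  // <4 x float> load with dmask=0b0001: only one dword is actually loaded.
  LoadShape A = shapeForDMask(0x1, 32, false, false);
  std::printf("lanes=%u bits=%u dwords=%u\n", A.DataLanes, A.LoadBits, A.TotalDwords);
  // Packed d16 load with dmask=0b0111: <3 x s16> of data rounds up to 64 bits.
  LoadShape B = shapeForDMask(0x7, 16, false, false);
  std::printf("lanes=%u bits=%u dwords=%u\n", B.DataLanes, B.LoadBits, B.TotalDwords);
  return 0;
}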


@@ -3374,69 +3374,6 @@ bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
return true;
}
// Produce a vector of s16 elements from s32 pieces.
static void truncToS16Vector(MachineIRBuilder &B, Register DstReg,
ArrayRef<Register> UnmergeParts) {
const LLT S16 = LLT::scalar(16);
SmallVector<Register, 4> RemergeParts(UnmergeParts.size());
for (int I = 0, E = UnmergeParts.size(); I != E; ++I)
RemergeParts[I] = B.buildTrunc(S16, UnmergeParts[I]).getReg(0);
B.buildBuildVector(DstReg, RemergeParts);
}
/// Convert a set of s32 registers to a result vector with s16 elements.
static void bitcastToS16Vector(MachineIRBuilder &B, Register DstReg,
ArrayRef<Register> UnmergeParts) {
MachineRegisterInfo &MRI = *B.getMRI();
const LLT V2S16 = LLT::vector(2, 16);
LLT TargetTy = MRI.getType(DstReg);
int NumElts = UnmergeParts.size();
if (NumElts == 1) {
assert(TargetTy == V2S16);
B.buildBitcast(DstReg, UnmergeParts[0]);
return;
}
SmallVector<Register, 4> RemergeParts(NumElts);
for (int I = 0; I != NumElts; ++I)
RemergeParts[I] = B.buildBitcast(V2S16, UnmergeParts[I]).getReg(0);
if (TargetTy.getSizeInBits() == 32u * NumElts) {
B.buildConcatVectors(DstReg, RemergeParts);
return;
}
const LLT V3S16 = LLT::vector(3, 16);
const LLT V6S16 = LLT::vector(6, 16);
// Widen to v6s16 and unpack v3 parts.
assert(TargetTy == V3S16);
RemergeParts.push_back(B.buildUndef(V2S16).getReg(0));
auto Concat = B.buildConcatVectors(V6S16, RemergeParts);
B.buildUnmerge({DstReg, MRI.createGenericVirtualRegister(V3S16)}, Concat);
}
// FIXME: Just vector trunc should be sufficient, but legalization is currently
// broken.
static void repackUnpackedD16Load(MachineIRBuilder &B, Register DstReg,
Register WideDstReg) {
const LLT S32 = LLT::scalar(32);
const LLT S16 = LLT::scalar(16);
auto Unmerge = B.buildUnmerge(S32, WideDstReg);
int NumOps = Unmerge->getNumOperands() - 1;
SmallVector<Register, 4> RemergeParts(NumOps);
for (int I = 0; I != NumOps; ++I)
RemergeParts[I] = B.buildTrunc(S16, Unmerge.getReg(I)).getReg(0);
B.buildBuildVector(DstReg, RemergeParts);
}
/// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized
/// vector with s16 typed elements.
static void packImageA16AddressToDwords(MachineIRBuilder &B,
@@ -3493,14 +3430,18 @@ static int getImageNumVAddr(const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
return BaseOpcode->NumExtraArgs + NumGradients + NumCoords + NumLCM;
}
static int getDMaskIdx(const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode,
int NumDefs) {
assert(!BaseOpcode->Atomic);
return NumDefs + 1 + (BaseOpcode->Store ? 1 : 0);
}
/// Return first address operand index in an image intrinsic.
static int getImageVAddrIdxBegin(const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode,
int NumDefs) {
if (BaseOpcode->Atomic)
return NumDefs + 1 + (BaseOpcode->AtomicX2 ? 2 : 1);
int DMaskIdx = NumDefs + 1 + (BaseOpcode->Store ? 1 : 0);
return DMaskIdx + 1;
return getDMaskIdx(BaseOpcode, NumDefs) + 1;
}
/// Rewrite image intrinsics to use register layouts expected by the subtarget.
@@ -3544,6 +3485,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
MachineRegisterInfo *MRI = B.getMRI();
const LLT S32 = LLT::scalar(32);
const LLT S16 = LLT::scalar(16);
const LLT V2S16 = LLT::vector(2, 16);
// Index of first address argument
const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
@@ -3603,10 +3545,15 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
convertImageAddrToPacked(B, MI, DimIdx, NumVAddrs);
}
if (BaseOpcode->Atomic) // No d16 atomics, or TFE.
return true;
int DMaskLanes = 0;
if (!BaseOpcode->Atomic) {
const int DMaskIdx = getDMaskIdx(BaseOpcode, NumDefs);
unsigned DMask = MI.getOperand(DMaskIdx).getImm();
DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
}
if (BaseOpcode->Store) { // No TFE for stores?
// TODO: Handle dmask trim
Register VData = MI.getOperand(1).getReg();
LLT Ty = MRI->getType(VData);
if (!Ty.isVector() || Ty.getElementType() != S16)
@@ -3626,91 +3573,162 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
LLT Ty = MRI->getType(DstReg);
const LLT EltTy = Ty.getScalarType();
const bool IsD16 = Ty.getScalarType() == S16;
const unsigned NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
// Confirm that the return type is large enough for the dmask specified
if (NumElts < DMaskLanes)
return false;
if (NumElts > 4 || DMaskLanes > 4)
return false;
const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
// The raw dword aligned data component of the load. The only legal cases
// where this matters should be when using the packed D16 format, for
// s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
LLT RoundedTy;
// S32 vector to cover all data, plus TFE result element.
LLT TFETy;
// Register type to use for each loaded component. Will be S32 or V2S16.
LLT RegTy;
if (IsD16 && ST.hasUnpackedD16VMem()) {
RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
TFETy = LLT::vector(AdjustedNumElts + 1, 32);
RegTy = S32;
} else {
unsigned EltSize = EltTy.getSizeInBits();
unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
unsigned RoundedSize = 32 * RoundedElts;
RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
}
// The return type does not need adjustment.
// TODO: Should we change s16 case to s32 or <2 x s16>?
if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
return true;
Register Dst1Reg;
// Insert after the instruction.
B.setInsertPt(*MI.getParent(), ++MI.getIterator());
// TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
// s16> instead of s32, we would only need 1 bitcast instead of multiple.
const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
MI.getOperand(0).setReg(NewResultReg);
// In the IR, TFE is supposed to be used with a 2 element struct return
// type. The instruction really returns these two values in one contiguous
// register, with one additional dword beyond the loaded data. Rewrite the
// return type to use a single register result.
if (IsTFE) {
// In the IR, TFE is supposed to be used with a 2 element struct return
// type. The instruction really returns these two values in one contiguous
// register, with one additional dword beyond the loaded data. Rewrite the
// return type to use a single register result.
Register Dst1Reg = MI.getOperand(1).getReg();
Dst1Reg = MI.getOperand(1).getReg();
if (MRI->getType(Dst1Reg) != S32)
return false;
// TODO: Make sure the TFE operand bit is set.
// The raw dword aligned data component of the load. The only legal cases
// where this matters should be when using the packed D16 format, for
// s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
LLT RoundedTy;
LLT TFETy;
if (IsD16 && ST.hasUnpackedD16VMem()) {
RoundedTy = LLT::scalarOrVector(NumElts, 32);
TFETy = LLT::vector(NumElts + 1, 32);
} else {
unsigned EltSize = Ty.getScalarSizeInBits();
unsigned RoundedElts = (Ty.getSizeInBits() + 31) / 32;
unsigned RoundedSize = 32 * RoundedElts;
RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
}
Register TFEReg = MRI->createGenericVirtualRegister(TFETy);
MI.getOperand(0).setReg(TFEReg);
MI.RemoveOperand(1);
// Insert after the instruction.
B.setInsertPt(*MI.getParent(), ++MI.getIterator());
// Now figure out how to copy the new result register back into the old
// result.
SmallVector<Register, 5> UnmergeResults(TFETy.getNumElements(), Dst1Reg);
int NumDataElts = TFETy.getNumElements() - 1;
if (!Ty.isVector()) {
// Simplest case is a trivial unmerge (plus a truncate for d16).
UnmergeResults[0] = Ty == S32 ?
DstReg : MRI->createGenericVirtualRegister(S32);
B.buildUnmerge(UnmergeResults, TFEReg);
if (Ty != S32)
B.buildTrunc(DstReg, UnmergeResults[0]);
// Handle the easy case that requires no repack instructions.
if (Ty == S32) {
B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
return true;
}
}
// Now figure out how to copy the new result register back into the old
// result.
SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
if (ResultNumRegs == 1) {
assert(!IsTFE);
ResultRegs[0] = NewResultReg;
} else {
// We have to repack into a new vector of some kind.
for (int I = 0; I != NumDataElts; ++I)
UnmergeResults[I] = MRI->createGenericVirtualRegister(S32);
B.buildUnmerge(UnmergeResults, TFEReg);
for (int I = 0; I != NumDataRegs; ++I)
ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
B.buildUnmerge(ResultRegs, NewResultReg);
// Drop the final TFE element.
ArrayRef<Register> DataPart(UnmergeResults.data(), NumDataElts);
if (EltTy == S32)
B.buildBuildVector(DstReg, DataPart);
else if (ST.hasUnpackedD16VMem())
truncToS16Vector(B, DstReg, DataPart);
else
bitcastToS16Vector(B, DstReg, DataPart);
// Drop the final TFE element to get the data part. The TFE result is
// directly written to the right place already.
if (IsTFE)
ResultRegs.resize(NumDataRegs);
}
// For an s16 scalar result, we form an s32 result with a truncate regardless
// of packed vs. unpacked.
if (IsD16 && !Ty.isVector()) {
B.buildTrunc(DstReg, ResultRegs[0]);
return true;
}
// Must be an image load.
if (!ST.hasUnpackedD16VMem() || !Ty.isVector() || Ty.getElementType() != S16)
// Avoid a build/concat_vector of 1 entry.
if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
B.buildBitcast(DstReg, ResultRegs[0]);
return true;
}
B.setInsertPt(*MI.getParent(), ++MI.getIterator());
assert(Ty.isVector());
LLT WidenedTy = Ty.changeElementType(S32);
Register WideDstReg = MRI->createGenericVirtualRegister(WidenedTy);
if (IsD16) {
// For packed D16 results with TFE enabled, all the data components are
// S32. Cast back to the expected type.
//
// TODO: We don't really need to load s32 elements. We would only need one
// cast for the TFE result if a multiple of v2s16 was used.
if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
for (Register &Reg : ResultRegs)
Reg = B.buildBitcast(V2S16, Reg).getReg(0);
} else if (ST.hasUnpackedD16VMem()) {
for (Register &Reg : ResultRegs)
Reg = B.buildTrunc(S16, Reg).getReg(0);
}
}
MI.getOperand(0).setReg(WideDstReg);
auto padWithUndef = [&](LLT Ty, int NumElts) {
if (NumElts == 0)
return;
Register Undef = B.buildUndef(Ty).getReg(0);
for (int I = 0; I != NumElts; ++I)
ResultRegs.push_back(Undef);
};
repackUnpackedD16Load(B, DstReg, WideDstReg);
// Pad out any elements eliminated due to the dmask.
LLT ResTy = MRI->getType(ResultRegs[0]);
if (!ResTy.isVector()) {
padWithUndef(ResTy, NumElts - ResultRegs.size());
B.buildBuildVector(DstReg, ResultRegs);
return true;
}
assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
// Deal with the one annoying legal case.
const LLT V3S16 = LLT::vector(3, 16);
if (Ty == V3S16) {
padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
return true;
}
padWithUndef(ResTy, RegsToCover - ResultRegs.size());
B.buildConcatVectors(DstReg, ResultRegs);
return true;
}
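The test diffs below show the padding behavior directly; as a small standalone model of that final step (an assumed simplification, not the LLVM builder API): after the dmask trim, the legalizer rebuilds the vector the IR expects by appending implicit-def ("undef") registers until the original element count is reached, with the lone <3 x s16> case instead widened to <6 x s16> and split back.

#include <cstdio>
#include <string>
#include <vector>

// Model registers by name; "undef" stands in for a G_IMPLICIT_DEF result that
// fills the lanes removed by the dmask trim.
static std::vector<std::string> padResult(std::vector<std::string> Regs,
                                          unsigned WantedElts) {
  while (Regs.size() < WantedElts)
    Regs.push_back("undef");
  return Regs;
}

int main() {
  // A v4f32 load with dmask=0b0001 keeps one loaded dword and pads the other
  // three lanes, matching the image_load_v4f32_dmask_1000 check lines below.
  for (const std::string &R : padResult({"%load0"}, 4))
    std::printf("%s ", R.c_str());
  std::printf("\n");
  return 0;
}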


@@ -2906,12 +2906,12 @@ define amdgpu_ps <4 x float> @getresinfo_dmask0(<8 x i32> inreg %rsrc, <4 x floa
; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1d), 0, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0
; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GFX9: $vgpr0 = COPY [[UV]](s32)
; GFX9: $vgpr1 = COPY [[UV1]](s32)
; GFX9: $vgpr2 = COPY [[UV2]](s32)
; GFX9: $vgpr3 = COPY [[UV3]](s32)
; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1d), 0, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0
; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GFX9: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GFX9: $vgpr1 = COPY [[DEF]](s32)
; GFX9: $vgpr2 = COPY [[DEF]](s32)
; GFX9: $vgpr3 = COPY [[DEF]](s32)
; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
; GFX10NSA-LABEL: name: getresinfo_dmask0
; GFX10NSA: bb.1.main_body:
@@ -2928,12 +2928,12 @@ define amdgpu_ps <4 x float> @getresinfo_dmask0(<8 x i32> inreg %rsrc, <4 x floa
; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1d), 0, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0
; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GFX10NSA: $vgpr0 = COPY [[UV]](s32)
; GFX10NSA: $vgpr1 = COPY [[UV1]](s32)
; GFX10NSA: $vgpr2 = COPY [[UV2]](s32)
; GFX10NSA: $vgpr3 = COPY [[UV3]](s32)
; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1d), 0, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0
; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GFX10NSA: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GFX10NSA: $vgpr1 = COPY [[DEF]](s32)
; GFX10NSA: $vgpr2 = COPY [[DEF]](s32)
; GFX10NSA: $vgpr3 = COPY [[DEF]](s32)
; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%mip = extractelement <2 x i16> %coords, i32 0


@@ -224,6 +224,587 @@ define amdgpu_ps <4 x float> @image_load_tfe_v4f32(<8 x i32> inreg %rsrc, i32 %s
ret <4 x float> %tex
}
define amdgpu_ps float @image_load_f32_dmask_0000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
; GCN-LABEL: name: image_load_f32_dmask_0000
; GCN: bb.1 (%ir-block.0):
; GCN: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 4 from custom "TargetCustom8")
; GCN: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0
%tex = call float @llvm.amdgcn.image.load.2d.f32.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret float %tex
}
define amdgpu_ps <2 x float> @image_load_v2f32_dmask_1000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
; GCN-LABEL: name: image_load_v2f32_dmask_1000
; GCN: bb.1 (%ir-block.0):
; GCN: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 8 from custom "TargetCustom8")
; GCN: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GCN: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GCN: $vgpr1 = COPY [[DEF]](s32)
; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
%tex = call <2 x float> @llvm.amdgcn.image.load.2d.v2f32.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret <2 x float> %tex
}
define amdgpu_ps <2 x float> @image_load_v2f32_dmask_0000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
; GCN-LABEL: name: image_load_v2f32_dmask_0000
; GCN: bb.1 (%ir-block.0):
; GCN: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 8 from custom "TargetCustom8")
; GCN: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GCN: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GCN: $vgpr1 = COPY [[DEF]](s32)
; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
%tex = call <2 x float> @llvm.amdgcn.image.load.2d.v2f32.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret <2 x float> %tex
}
define amdgpu_ps <3 x float> @image_load_v3f32_dmask_1100(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
; GCN-LABEL: name: image_load_v3f32_dmask_1100
; GCN: bb.1 (%ir-block.0):
; GCN: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 12 from custom "TargetCustom8", align 16)
; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
; GCN: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GCN: $vgpr0 = COPY [[UV]](s32)
; GCN: $vgpr1 = COPY [[UV1]](s32)
; GCN: $vgpr2 = COPY [[DEF]](s32)
; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
%tex = call <3 x float> @llvm.amdgcn.image.load.2d.v3f32.i32(i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret <3 x float> %tex
}
define amdgpu_ps <3 x float> @image_load_v3f32_dmask_1000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
; GCN-LABEL: name: image_load_v3f32_dmask_1000
; GCN: bb.1 (%ir-block.0):
; GCN: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 12 from custom "TargetCustom8", align 16)
; GCN: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GCN: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GCN: $vgpr1 = COPY [[DEF]](s32)
; GCN: $vgpr2 = COPY [[DEF]](s32)
; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
%tex = call <3 x float> @llvm.amdgcn.image.load.2d.v3f32.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret <3 x float> %tex
}
define amdgpu_ps <3 x float> @image_load_v3f32_dmask_0000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
; GCN-LABEL: name: image_load_v3f32_dmask_0000
; GCN: bb.1 (%ir-block.0):
; GCN: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 12 from custom "TargetCustom8", align 16)
; GCN: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GCN: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GCN: $vgpr1 = COPY [[DEF]](s32)
; GCN: $vgpr2 = COPY [[DEF]](s32)
; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
%tex = call <3 x float> @llvm.amdgcn.image.load.2d.v3f32.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret <3 x float> %tex
}
define amdgpu_ps <4 x float> @image_load_v4f32_dmask_1110(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
; GCN-LABEL: name: image_load_v4f32_dmask_1110
; GCN: bb.1 (%ir-block.0):
; GCN: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<3 x s32>)
; GCN: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GCN: $vgpr0 = COPY [[UV]](s32)
; GCN: $vgpr1 = COPY [[UV1]](s32)
; GCN: $vgpr2 = COPY [[UV2]](s32)
; GCN: $vgpr3 = COPY [[DEF]](s32)
; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
%tex = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret <4 x float> %tex
}
define amdgpu_ps <4 x float> @image_load_v4f32_dmask_1100(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
; GCN-LABEL: name: image_load_v4f32_dmask_1100
; GCN: bb.1 (%ir-block.0):
; GCN: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
; GCN: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GCN: $vgpr0 = COPY [[UV]](s32)
; GCN: $vgpr1 = COPY [[UV1]](s32)
; GCN: $vgpr2 = COPY [[DEF]](s32)
; GCN: $vgpr3 = COPY [[DEF]](s32)
; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
%tex = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret <4 x float> %tex
}
define amdgpu_ps <4 x float> @image_load_v4f32_dmask_1000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
; GCN-LABEL: name: image_load_v4f32_dmask_1000
; GCN: bb.1 (%ir-block.0):
; GCN: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
; GCN: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GCN: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GCN: $vgpr1 = COPY [[DEF]](s32)
; GCN: $vgpr2 = COPY [[DEF]](s32)
; GCN: $vgpr3 = COPY [[DEF]](s32)
; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
%tex = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret <4 x float> %tex
}
define amdgpu_ps <4 x float> @image_load_v4f32_dmask_0000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
; GCN-LABEL: name: image_load_v4f32_dmask_0000
; GCN: bb.1 (%ir-block.0):
; GCN: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
; GCN: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GCN: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GCN: $vgpr1 = COPY [[DEF]](s32)
; GCN: $vgpr2 = COPY [[DEF]](s32)
; GCN: $vgpr3 = COPY [[DEF]](s32)
; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
%tex = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret <4 x float> %tex
}
define amdgpu_ps float @image_load_tfe_f32_dmask_0000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
; GCN-LABEL: name: image_load_tfe_f32_dmask_0000
; GCN: bb.1 (%ir-block.0):
; GCN: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 4 from custom "TargetCustom8")
; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
; GCN: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
; GCN: $vgpr0 = COPY [[UV]](s32)
; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0
%res = call { float, i32 } @llvm.amdgcn.image.load.2d.sl_f32i32s.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
%tex = extractvalue { float, i32 } %res, 0
%tfe = extractvalue { float, i32 } %res, 1
store i32 %tfe, i32 addrspace(1)* undef
ret float %tex
}
define amdgpu_ps <2 x float> @image_load_tfe_v2f32_dmask_1000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
; GCN-LABEL: name: image_load_tfe_v2f32_dmask_1000
; GCN: bb.1 (%ir-block.0):
; GCN: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 8 from custom "TargetCustom8")
; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
; GCN: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GCN: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
; GCN: $vgpr0 = COPY [[UV]](s32)
; GCN: $vgpr1 = COPY [[DEF1]](s32)
; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
%res = call { <2 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v2f32i32s.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
%tex = extractvalue { <2 x float>, i32 } %res, 0
%tfe = extractvalue { <2 x float>, i32 } %res, 1
store i32 %tfe, i32 addrspace(1)* undef
ret <2 x float> %tex
}
define amdgpu_ps <2 x float> @image_load_tfe_v2f32_dmask_0000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
; GCN-LABEL: name: image_load_tfe_v2f32_dmask_0000
; GCN: bb.1 (%ir-block.0):
; GCN: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 8 from custom "TargetCustom8")
; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
; GCN: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GCN: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
; GCN: $vgpr0 = COPY [[UV]](s32)
; GCN: $vgpr1 = COPY [[DEF1]](s32)
; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
%res = call { <2 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v2f32i32s.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
%tex = extractvalue { <2 x float>, i32 } %res, 0
%tfe = extractvalue { <2 x float>, i32 } %res, 1
store i32 %tfe, i32 addrspace(1)* undef
ret <2 x float> %tex
}
define amdgpu_ps <3 x float> @image_load_tfe_v3f32_dmask_1100(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
; GCN-LABEL: name: image_load_tfe_v3f32_dmask_1100
; GCN: bb.1 (%ir-block.0):
; GCN: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 12 from custom "TargetCustom8", align 16)
; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<3 x s32>)
; GCN: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GCN: G_STORE [[UV2]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
; GCN: $vgpr0 = COPY [[UV]](s32)
; GCN: $vgpr1 = COPY [[UV1]](s32)
; GCN: $vgpr2 = COPY [[DEF1]](s32)
; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
%res = call { <3 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v3f32i32s.i32(i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
%tex = extractvalue { <3 x float>, i32 } %res, 0
%tfe = extractvalue { <3 x float>, i32 } %res, 1
store i32 %tfe, i32 addrspace(1)* undef
ret <3 x float> %tex
}
define amdgpu_ps <3 x float> @image_load_tfe_v3f32_dmask_1000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
; GCN-LABEL: name: image_load_tfe_v3f32_dmask_1000
; GCN: bb.1 (%ir-block.0):
; GCN: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 12 from custom "TargetCustom8", align 16)
; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
; GCN: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GCN: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
; GCN: $vgpr0 = COPY [[UV]](s32)
; GCN: $vgpr1 = COPY [[DEF1]](s32)
; GCN: $vgpr2 = COPY [[DEF1]](s32)
; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
%res = call { <3 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v3f32i32s.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
%tex = extractvalue { <3 x float>, i32 } %res, 0
%tfe = extractvalue { <3 x float>, i32 } %res, 1
store i32 %tfe, i32 addrspace(1)* undef
ret <3 x float> %tex
}
define amdgpu_ps <3 x float> @image_load_tfe_v3f32_dmask_0000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
; GCN-LABEL: name: image_load_tfe_v3f32_dmask_0000
; GCN: bb.1 (%ir-block.0):
; GCN: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 12 from custom "TargetCustom8", align 16)
; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
; GCN: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GCN: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
; GCN: $vgpr0 = COPY [[UV]](s32)
; GCN: $vgpr1 = COPY [[DEF1]](s32)
; GCN: $vgpr2 = COPY [[DEF1]](s32)
; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
%res = call { <3 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v3f32i32s.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
%tex = extractvalue { <3 x float>, i32 } %res, 0
%tfe = extractvalue { <3 x float>, i32 } %res, 1
store i32 %tfe, i32 addrspace(1)* undef
ret <3 x float> %tex
}
define amdgpu_ps <4 x float> @image_load_tfe_v4f32_dmask_1110(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
; GCN-LABEL: name: image_load_tfe_v4f32_dmask_1110
; GCN: bb.1 (%ir-block.0):
; GCN: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GCN: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GCN: G_STORE [[UV3]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
; GCN: $vgpr0 = COPY [[UV]](s32)
; GCN: $vgpr1 = COPY [[UV1]](s32)
; GCN: $vgpr2 = COPY [[UV2]](s32)
; GCN: $vgpr3 = COPY [[DEF1]](s32)
; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
%res = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32(i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
%tex = extractvalue { <4 x float>, i32 } %res, 0
%tfe = extractvalue { <4 x float>, i32 } %res, 1
store i32 %tfe, i32 addrspace(1)* undef
ret <4 x float> %tex
}
define amdgpu_ps <4 x float> @image_load_tfe_v4f32_dmask_1100(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
; GCN-LABEL: name: image_load_tfe_v4f32_dmask_1100
; GCN: bb.1 (%ir-block.0):
; GCN: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<3 x s32>)
; GCN: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GCN: G_STORE [[UV2]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
; GCN: $vgpr0 = COPY [[UV]](s32)
; GCN: $vgpr1 = COPY [[UV1]](s32)
; GCN: $vgpr2 = COPY [[DEF1]](s32)
; GCN: $vgpr3 = COPY [[DEF1]](s32)
; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
%res = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32(i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
%tex = extractvalue { <4 x float>, i32 } %res, 0
%tfe = extractvalue { <4 x float>, i32 } %res, 1
store i32 %tfe, i32 addrspace(1)* undef
ret <4 x float> %tex
}
define amdgpu_ps <4 x float> @image_load_tfe_v4f32_dmask_1000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
; GCN-LABEL: name: image_load_tfe_v4f32_dmask_1000
; GCN: bb.1 (%ir-block.0):
; GCN: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
; GCN: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GCN: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
; GCN: $vgpr0 = COPY [[UV]](s32)
; GCN: $vgpr1 = COPY [[DEF1]](s32)
; GCN: $vgpr2 = COPY [[DEF1]](s32)
; GCN: $vgpr3 = COPY [[DEF1]](s32)
; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
%res = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
%tex = extractvalue { <4 x float>, i32 } %res, 0
%tfe = extractvalue { <4 x float>, i32 } %res, 1
store i32 %tfe, i32 addrspace(1)* undef
ret <4 x float> %tex
}
define amdgpu_ps <4 x float> @image_load_tfe_v4f32_dmask_0000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
; GCN-LABEL: name: image_load_tfe_v4f32_dmask_0000
; GCN: bb.1 (%ir-block.0):
; GCN: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
; GCN: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GCN: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
; GCN: $vgpr0 = COPY [[UV]](s32)
; GCN: $vgpr1 = COPY [[DEF1]](s32)
; GCN: $vgpr2 = COPY [[DEF1]](s32)
; GCN: $vgpr3 = COPY [[DEF1]](s32)
; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
%res = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
%tex = extractvalue { <4 x float>, i32 } %res, 0
%tfe = extractvalue { <4 x float>, i32 } %res, 1
store i32 %tfe, i32 addrspace(1)* undef
ret <4 x float> %tex
}
declare float @llvm.amdgcn.image.load.2d.f32.i32(i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0
declare <2 x float> @llvm.amdgcn.image.load.2d.v2f32.i32(i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0
declare <3 x float> @llvm.amdgcn.image.load.2d.v3f32.i32(i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0