diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index fcaa6f907e5c..23c9352ce273 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -129,10 +129,7 @@ private:
   bool SelectSMRDBufferImm(SDValue Addr, SDValue &Offset) const;
   bool SelectSMRDBufferImm32(SDValue Addr, SDValue &Offset) const;
   bool SelectSMRDBufferSgpr(SDValue Addr, SDValue &Offset) const;
-  bool selectMOVRELOffsetImpl(SDValue Index, SDValue &Base,
-                              SDValue &Offset, bool IsInsert) const;
-  bool selectMOVRELSOffset(SDValue Index, SDValue &Base, SDValue &Offset) const;
-  bool selectMOVRELDOffset(SDValue Index, SDValue &Base, SDValue &Offset) const;
+  bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const;
   bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
   bool SelectVOP3NoMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
   bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods,
@@ -1193,10 +1190,9 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgpr(SDValue Addr,
          !isa<ConstantSDNode>(Offset);
 }
 
-bool AMDGPUDAGToDAGISel::selectMOVRELOffsetImpl(SDValue Index,
-                                                SDValue &Base,
-                                                SDValue &Offset,
-                                                bool IsInsert) const {
+bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
+                                            SDValue &Base,
+                                            SDValue &Offset) const {
   SDLoc DL(Index);
 
   if (CurDAG->isBaseWithConstantOffset(Index)) {
@@ -1210,34 +1206,14 @@ bool AMDGPUDAGToDAGISel::selectMOVRELOffsetImpl(SDValue Index,
     return true;
   }
 
-  if (IsInsert) {
-    if (ConstantSDNode *CBase = dyn_cast<ConstantSDNode>(Index)) {
-      Base = CurDAG->getRegister(AMDGPU::NoRegister, MVT::i32);
-      Offset = CurDAG->getTargetConstant(CBase->getZExtValue(), DL, MVT::i32);
-      return true;
-    }
-  } else {
-    if (isa<ConstantSDNode>(Index))
-      return false;
-  }
+  if (isa<ConstantSDNode>(Index))
+    return false;
 
   Base = Index;
   Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
   return true;
 }
 
-bool AMDGPUDAGToDAGISel::selectMOVRELSOffset(SDValue Index,
-                                             SDValue &Base,
-                                             SDValue &Offset) const {
-  return selectMOVRELOffsetImpl(Index, Base, Offset, false);
-}
-
-bool AMDGPUDAGToDAGISel::selectMOVRELDOffset(SDValue Index,
-                                             SDValue &Base,
-                                             SDValue &Offset) const {
-  return selectMOVRELOffsetImpl(Index, Base, Offset, true);
-}
-
 SDNode *AMDGPUDAGToDAGISel::getS_BFE(unsigned Opcode, const SDLoc &DL,
                                      SDValue Val, uint32_t Offset,
                                      uint32_t Width) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index b31a804987ca..f133eb3270ad 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -624,8 +624,7 @@ def SMRDBufferImm   : ComplexPattern<i32, 1, "SelectSMRDBufferImm">;
 def SMRDBufferImm32 : ComplexPattern<i32, 1, "SelectSMRDBufferImm32">;
 def SMRDBufferSgpr  : ComplexPattern<i32, 1, "SelectSMRDBufferSgpr">;
 
-def MOVRELSOffset : ComplexPattern<i32, 2, "selectMOVRELSOffset">;
-def MOVRELDOffset : ComplexPattern<i32, 2, "selectMOVRELDOffset">;
+def MOVRELOffset : ComplexPattern<i32, 2, "SelectMOVRELOffset">;
 
 def VOP3Mods0 : ComplexPattern<untyped, 4, "SelectVOP3Mods0">;
 def VOP3NoMods0 : ComplexPattern<untyped, 4, "SelectVOP3NoMods0">;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index cf1f4f86df32..05e0e3e504ca 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3308,13 +3308,13 @@ def : MTBUF_StoreResource <v4i32, 4, TBUFFER_STORE_FORMAT_XYZW>;
 multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, string VecSize> {
   // Extract with offset
   def : Pat<
-    (eltvt (extractelt vt:$src, (MOVRELSOffset i32:$idx, (i32 imm:$offset)))),
+    (eltvt (extractelt vt:$src, (MOVRELOffset i32:$idx, (i32 imm:$offset)))),
     (!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $src, $idx, imm:$offset)
   >;
 
   // Insert with offset
   def : Pat<
-    (insertelt vt:$src, eltvt:$val, (MOVRELDOffset i32:$idx, (i32 imm:$offset))),
+    (insertelt vt:$src, eltvt:$val, (MOVRELOffset i32:$idx, (i32 imm:$offset))),
     (!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $src, $idx, imm:$offset, $val)
   >;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index 3e6905f887fb..367e7f734556 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -73,6 +73,15 @@ define void @insertelement_v3f32_3(<3 x float> addrspace(1)* %out, <3 x float> %
   ret void
 }
 
+; GCN-LABEL: {{^}}insertelement_to_sgpr:
+; GCN-NOT: v_readfirstlane
+define amdgpu_ps <4 x float> @insertelement_to_sgpr() nounwind {  ; regression test: the GCN-NOT line above requires that no v_readfirstlane is emitted for this function
+  %tmp = load <4 x i32>, <4 x i32> addrspace(2)* undef  ; load the <4 x i32> operand through an undef addrspace(2) pointer
+  %tmp1 = insertelement <4 x i32> %tmp, i32 0, i32 0  ; insert the constant 0 at constant index 0 (both the value and the index are immediates)
+  %tmp2 = call <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> %tmp1, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)  ; consume the modified vector as the <4 x i32> argument of the gather4.lz intrinsic
+  ret <4 x float> %tmp2  ; return the intrinsic result so the call is not dead
+}
+
 ; GCN-LABEL: {{^}}dynamic_insertelement_v2f32:
 ; GCN: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
 ; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
@@ -432,3 +441,5 @@ define void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x d
   store <8 x double> %vecins, <8 x double> addrspace(1)* %out, align 16
   ret void
 }
+
+declare <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.SI.gather4.ll b/llvm/test/CodeGen/AMDGPU/llvm.SI.gather4.ll
index 7eb9eb253b3b..aef9f660436e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.SI.gather4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.SI.gather4.ll
@@ -467,12 +467,9 @@ main_body:
 ; This crashed at some point due to a bug in FixSGPRCopies. Derived from the
 ; report in https://bugs.freedesktop.org/show_bug.cgi?id=96877
 ;
-;TODO: the readfirstlanes are unnecessary, see http://reviews.llvm.org/D22217
-;
-;CHECK: v_readfirstlane_b32 s[[LO:[0-9]+]], v{{[0-9]+}}
-;CHECK: v_readfirstlane_b32
-;CHECK: v_readfirstlane_b32
-;CHECK: v_readfirstlane_b32 s[[HI:[0-9]+]], v{{[0-9]+}}
+;CHECK: s_load_dwordx4 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]], {{s\[[0-9]+:[0-9]+\]}}, 0x0
+;CHECK: s_waitcnt lgkmcnt(0)
+;CHECK: s_mov_b32 s[[LO]], 0
 ;CHECK: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, s{{\[}}[[LO]]:[[HI]]] dmask:0x8
 define amdgpu_ps float @gather4_sgpr_bug() {
 main_body: