DAG: Don't use ABI copies in some contexts

If an ABI-like value is used in a different block, the type split used is not necessarily the same as the one required by the call's ABI. The value is instead used through intermediate copies to virtual registers from the other block. This resulted in copies with inconsistent sizes later. Fixes regressions since r338197, when AMDGPU started splitting vector types for calls. llvm-svn: 341018
2018-08-30 05:49:28 +00:00 · 2018-08-30 05:49:28 +00:00 · 167601e629
parent fcd552999f
commit 167601e629
2 changed files with 179 additions and 2 deletions
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@ -1178,7 +1178,8 @@ SDValue SelectionDAGBuilder::getCopyFromRegs(const Value *V, Type *Ty) {
    unsigned InReg = It->second;

    RegsForValue RFV(*DAG.getContext(), DAG.getTargetLoweringInfo(),
-                     DAG.getDataLayout(), InReg, Ty, getABIRegCopyCC(V));
+                     DAG.getDataLayout(), InReg, Ty,
+                     None); // This is not an ABI copy.
    SDValue Chain = DAG.getEntryNode();
    Result = RFV.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), Chain, nullptr,
                                 V);
@ -8696,7 +8697,7 @@ SelectionDAGBuilder::CopyValueToVirtualRegister(const Value *V, unsigned Reg) {
  // notional registers required by the type.

  RegsForValue RFV(V->getContext(), TLI, DAG.getDataLayout(), Reg, V->getType(),
-                   getABIRegCopyCC(V));
+                   None); // This is not an ABI copy.
  SDValue Chain = DAG.getEntryNode();

  ISD::NodeType ExtendType = (FuncInfo.PreferredExtendType.find(V) ==
--- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
@ -0,0 +1,176 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; The SelectionDAG builder was using the IR value kind to decide how to
+; split the types for copyToRegs/copyFromRegs in all contexts. This
+; was incorrect if an ABI-like value, such as a call, was used outside
+; of its block. The value in that case is not used directly, but
+; through another set of copies to potentially different register
+; types in the parent block.
+
+; This would then end up producing inconsistent pairs of copies with
+; the wrong sizes when the vector-typed result of the call was split
+; into multiple pieces, but was expected to be a single register in
+; the cross-block copy.
+;
+; This isn't exactly ideal for AMDGPU, since in reality the
+; intermediate vector register type is undesirable anyway, but it
+; requires more work to be able to split all vector copies in all
+; contexts.
+;
+; This was only an issue if the value was used directly in another
+; block. If there was an intermediate operation or a phi it was fine,
+; since that didn't look like an ABI copy.
+
+
+define float @call_split_type_used_outside_block_v2f32() #0 {
+; GCN-LABEL: call_split_type_used_outside_block_v2f32:
+; GCN:       ; %bb.0: ; %bb0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s5, s32
+; GCN-NEXT:    buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT:    v_writelane_b32 v32, s33, 0
+; GCN-NEXT:    v_writelane_b32 v32, s34, 1
+; GCN-NEXT:    s_add_u32 s32, s32, 0x400
+; GCN-NEXT:    v_writelane_b32 v32, s35, 2
+; GCN-NEXT:    s_getpc_b64 s[6:7]
+; GCN-NEXT:    s_add_u32 s6, s6, func_v2f32@rel32@lo+4
+; GCN-NEXT:    s_addc_u32 s7, s7, func_v2f32@rel32@hi+4
+; GCN-NEXT:    s_mov_b64 s[34:35], s[30:31]
+; GCN-NEXT:    s_mov_b32 s33, s5
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GCN-NEXT:    s_mov_b32 s5, s33
+; GCN-NEXT:    s_mov_b64 s[30:31], s[34:35]
+; GCN-NEXT:    v_readlane_b32 s35, v32, 2
+; GCN-NEXT:    v_readlane_b32 s34, v32, 1
+; GCN-NEXT:    v_readlane_b32 s33, v32, 0
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    s_sub_u32 s32, s32, 0x400
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+bb0:
+  %split.ret.type = call <2 x float> @func_v2f32()
+  br label %bb1
+
+bb1:
+  %extract = extractelement <2 x float> %split.ret.type, i32 0
+  ret float %extract
+}
+
+define float @call_split_type_used_outside_block_v3f32() #0 {
+; GCN-LABEL: call_split_type_used_outside_block_v3f32:
+; GCN:       ; %bb.0: ; %bb0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s5, s32
+; GCN-NEXT:    buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT:    v_writelane_b32 v32, s33, 0
+; GCN-NEXT:    v_writelane_b32 v32, s34, 1
+; GCN-NEXT:    s_add_u32 s32, s32, 0x400
+; GCN-NEXT:    v_writelane_b32 v32, s35, 2
+; GCN-NEXT:    s_getpc_b64 s[6:7]
+; GCN-NEXT:    s_add_u32 s6, s6, func_v3f32@rel32@lo+4
+; GCN-NEXT:    s_addc_u32 s7, s7, func_v3f32@rel32@hi+4
+; GCN-NEXT:    s_mov_b64 s[34:35], s[30:31]
+; GCN-NEXT:    s_mov_b32 s33, s5
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GCN-NEXT:    s_mov_b32 s5, s33
+; GCN-NEXT:    s_mov_b64 s[30:31], s[34:35]
+; GCN-NEXT:    v_readlane_b32 s35, v32, 2
+; GCN-NEXT:    v_readlane_b32 s34, v32, 1
+; GCN-NEXT:    v_readlane_b32 s33, v32, 0
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    s_sub_u32 s32, s32, 0x400
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+bb0:
+  %split.ret.type = call <3 x float> @func_v3f32()
+  br label %bb1
+
+bb1:
+  %extract = extractelement <3 x float> %split.ret.type, i32 0
+  ret float %extract
+}
+
+define half @call_split_type_used_outside_block_v4f16() #0 {
+; GCN-LABEL: call_split_type_used_outside_block_v4f16:
+; GCN:       ; %bb.0: ; %bb0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s5, s32
+; GCN-NEXT:    buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT:    v_writelane_b32 v32, s33, 0
+; GCN-NEXT:    v_writelane_b32 v32, s34, 1
+; GCN-NEXT:    s_add_u32 s32, s32, 0x400
+; GCN-NEXT:    v_writelane_b32 v32, s35, 2
+; GCN-NEXT:    s_getpc_b64 s[6:7]
+; GCN-NEXT:    s_add_u32 s6, s6, func_v4f16@rel32@lo+4
+; GCN-NEXT:    s_addc_u32 s7, s7, func_v4f16@rel32@hi+4
+; GCN-NEXT:    s_mov_b64 s[34:35], s[30:31]
+; GCN-NEXT:    s_mov_b32 s33, s5
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GCN-NEXT:    s_mov_b32 s5, s33
+; GCN-NEXT:    s_mov_b64 s[30:31], s[34:35]
+; GCN-NEXT:    v_readlane_b32 s35, v32, 2
+; GCN-NEXT:    v_readlane_b32 s34, v32, 1
+; GCN-NEXT:    v_readlane_b32 s33, v32, 0
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    s_sub_u32 s32, s32, 0x400
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+bb0:
+  %split.ret.type = call <4 x half> @func_v4f16()
+  br label %bb1
+
+bb1:
+  %extract = extractelement <4 x half> %split.ret.type, i32 0
+  ret half %extract
+}
+
+define { i32, half } @call_split_type_used_outside_block_struct() #0 {
+; GCN-LABEL: call_split_type_used_outside_block_struct:
+; GCN:       ; %bb.0: ; %bb0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s5, s32
+; GCN-NEXT:    buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT:    v_writelane_b32 v32, s33, 0
+; GCN-NEXT:    v_writelane_b32 v32, s34, 1
+; GCN-NEXT:    s_add_u32 s32, s32, 0x400
+; GCN-NEXT:    v_writelane_b32 v32, s35, 2
+; GCN-NEXT:    s_getpc_b64 s[6:7]
+; GCN-NEXT:    s_add_u32 s6, s6, func_struct@rel32@lo+4
+; GCN-NEXT:    s_addc_u32 s7, s7, func_struct@rel32@hi+4
+; GCN-NEXT:    s_mov_b64 s[34:35], s[30:31]
+; GCN-NEXT:    s_mov_b32 s33, s5
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GCN-NEXT:    s_mov_b32 s5, s33
+; GCN-NEXT:    s_mov_b64 s[30:31], s[34:35]
+; GCN-NEXT:    v_readlane_b32 s35, v32, 2
+; GCN-NEXT:    v_readlane_b32 s34, v32, 1
+; GCN-NEXT:    v_readlane_b32 s33, v32, 0
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    v_mov_b32_e32 v1, v4
+; GCN-NEXT:    s_sub_u32 s32, s32, 0x400
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+bb0:
+  %split.ret.type = call { <4 x i32>, <4 x half> } @func_struct()
+  br label %bb1
+
+bb1:
+  %val0 = extractvalue { <4 x i32>, <4 x half> } %split.ret.type, 0
+  %val1 = extractvalue { <4 x i32>, <4 x half> } %split.ret.type, 1
+  %extract0 = extractelement <4 x i32> %val0, i32 0
+  %extract1 = extractelement <4 x half> %val1, i32 0
+  %ins0 = insertvalue { i32, half } undef, i32 %extract0, 0
+  %ins1 = insertvalue { i32, half } %ins0, half %extract1, 1
+  ret { i32, half } %ins1
+}
+
+
+declare <2 x float> @func_v2f32() #0
+declare <3 x float> @func_v3f32() #0
+declare <4 x float> @func_v4f32() #0
+declare <4 x half> @func_v4f16() #0
+
+declare { <4 x i32>, <4 x half> } @func_struct() #0
+
+attributes #0 = { nounwind}