AMDGPU: Expand unaligned accesses early

Due to visit order problems, when both parts of an unaligned copy are
expanded during legalization, the legalized DAG fails to eliminate the
extra instructions introduced by the expansion.

llvm-svn: 274397
Matt Arsenault 2016-07-01 22:55:55 +00:00
parent 53547d95ca
commit 8af47a09e5
6 changed files with 239 additions and 214 deletions
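
For context, the "unaligned copy" referred to above is a load/store pair in which both accesses are under-aligned, as in the reduced IR sketch below (the function name is illustrative; the body is adapted from local_unaligned_load_store_i32 in the updated test further down). Previously, legalization expanded both the load and the store separately and the byte pack/unpack instructions emitted between them survived into the final code; expanding the accesses in the DAG combiner instead lets the loaded bytes be forwarded straight to the byte stores, as the new SI-NOT: v_or_b32 / v_lshl checks verify.

; Illustrative unaligned copy (hypothetical name, adapted from the tests below):
; both accesses are align 1, so both sides of the copy are byte-expanded.
define void @unaligned_copy_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) {
  %v = load i32, i32 addrspace(3)* %p, align 1
  store i32 %v, i32 addrspace(3)* %r, align 1
  ret void
}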

@@ -2182,14 +2182,11 @@ static bool hasVolatileUser(SDNode *Val) {
return false;
}
bool AMDGPUTargetLowering::shouldCombineMemoryType(const MemSDNode *M) const {
EVT VT = M->getMemoryVT();
bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
// i32 vectors are the canonical memory type.
if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
return false;
if (!VT.isByteSized())
return false;
@@ -2201,15 +2198,6 @@ bool AMDGPUTargetLowering::shouldCombineMemoryType(const MemSDNode *M) const {
if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
return false;
unsigned Align = M->getAlignment();
if (Align < Size) {
bool IsFast;
if (!allowsMisalignedMemoryAccesses(VT, M->getAddressSpace(), Align, &IsFast) ||
!IsFast) {
return false;
}
}
return true;
}
@@ -2224,12 +2212,32 @@ SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
if (LN->isVolatile() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
return SDValue();
if (!shouldCombineMemoryType(LN))
return SDValue();
SDLoc SL(N);
SelectionDAG &DAG = DCI.DAG;
EVT VT = LN->getMemoryVT();
unsigned Size = VT.getStoreSize();
unsigned Align = LN->getAlignment();
if (Align < Size && isTypeLegal(VT)) {
bool IsFast;
unsigned AS = LN->getAddressSpace();
// Expand unaligned loads earlier than legalization. Due to visitation order
// problems during legalization, the emitted instructions to pack and unpack
// the bytes again are not eliminated in the case of an unaligned copy.
if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) {
SDValue Ops[2];
std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
return DAG.getMergeValues(Ops, SDLoc(N));
}
if (!IsFast)
return SDValue();
}
if (!shouldCombineMemoryType(VT))
return SDValue();
EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
SDValue NewLoad
@@ -2252,15 +2260,34 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
if (SN->isVolatile() || !ISD::isNormalStore(SN))
return SDValue();
if (!shouldCombineMemoryType(SN))
return SDValue();
SDValue Val = SN->getValue();
EVT VT = SN->getMemoryVT();
unsigned Size = VT.getStoreSize();
SDLoc SL(N);
SelectionDAG &DAG = DCI.DAG;
unsigned Align = SN->getAlignment();
if (Align < Size && isTypeLegal(VT)) {
bool IsFast;
unsigned AS = SN->getAddressSpace();
// Expand unaligned stores earlier than legalization. Due to visitation
// order problems during legalization, the emitted instructions to pack and
// unpack the bytes again are not eliminated in the case of an unaligned
// copy.
if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast))
return expandUnalignedStore(SN, DAG);
if (!IsFast)
return SDValue();
}
if (!shouldCombineMemoryType(VT))
return SDValue();
EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
SDValue Val = SN->getValue();
//DCI.AddToWorklist(Val.getNode());
bool OtherUses = !Val.hasOneUse();
SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);

@@ -66,7 +66,7 @@ protected:
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
protected:
bool shouldCombineMemoryType(const MemSDNode *M) const;
bool shouldCombineMemoryType(EVT VT) const;
SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performAndCombine(SDNode *N, DAGCombinerInfo &DCI) const;

@@ -59,20 +59,20 @@ define void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8>
; This should not be adding instructions to shift into the correct
; position in the word for the component.
; FIXME: Packing bytes
; SI-LABEL: {{^}}load_v4i8_to_v4f32_unaligned:
; SI: buffer_load_ubyte [[LOADREG3:v[0-9]+]]
; SI: buffer_load_ubyte [[LOADREG2:v[0-9]+]]
; SI: buffer_load_ubyte [[LOADREG1:v[0-9]+]]
; SI: buffer_load_ubyte [[LOADREG0:v[0-9]+]]
; SI-NOT: v_lshlrev_b32
; SI-NOT: v_or_b32
; SI-DAG: v_lshlrev_b32
; SI-DAG: v_or_b32
; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]],
; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}},
; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}},
; SI-DAG: v_cvt_f32_ubyte0_e32 v[[HIRESULT:[0-9]+]]
; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG0]]
; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, [[LOADREG1]]
; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, [[LOADREG2]]
; SI-DAG: v_cvt_f32_ubyte0_e32 v[[HIRESULT:[0-9]+]], [[LOADREG3]]
; SI: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
; SI: buffer_store_dwordx4
define void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
%load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1
%cvt = uitofp <4 x i8> %load to <4 x float>

@@ -0,0 +1,22 @@
; XFAIL: *
; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s
; XUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG %s
;
; EG-LABEL: {{^}}sext_in_reg_v2i1_in_v2i32_other_amount:
; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
; EG-NOT: BFE
; EG: ADD_INT
; EG: LSHL
; EG: ASHR [[RES]]
; EG: LSHL
; EG: ASHR [[RES]]
; EG: LSHR {{\*?}} [[ADDR]]
; Works with the align 2 removed
define void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
%c = add <2 x i32> %a, %b
%x = shl <2 x i32> %c, <i32 6, i32 6>
%y = ashr <2 x i32> %x, <i32 7, i32 7>
store <2 x i32> %y, <2 x i32> addrspace(1)* %out, align 2
ret void
}

@@ -268,7 +268,7 @@ define void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out
%c = add <2 x i32> %a, %b
%x = shl <2 x i32> %c, <i32 6, i32 6>
%y = ashr <2 x i32> %x, <i32 7, i32 7>
store <2 x i32> %y, <2 x i32> addrspace(1)* %out, align 2
store <2 x i32> %y, <2 x i32> addrspace(1)* %out
ret void
}

@@ -15,7 +15,7 @@ define void @local_unaligned_load_store_i16(i16 addrspace(3)* %p, i16 addrspace(
ret void
}
; FUNC-LABEL: {{^}}unaligned_load_store_i16_global:
; FUNC-LABEL: {{^}}global_unaligned_load_store_i16:
; GCN-NOHSA: buffer_load_ubyte
; GCN-NOHSA: buffer_load_ubyte
; GCN-NOHSA: buffer_store_byte
@@ -25,22 +25,25 @@ define void @local_unaligned_load_store_i16(i16 addrspace(3)* %p, i16 addrspace(
; GCN-HSA: flat_load_ubyte
; GCN-HSA: flat_store_byte
; GCN-HSA: flat_store_byte
define void @unaligned_load_store_i16_global(i16 addrspace(1)* %p, i16 addrspace(1)* %r) #0 {
define void @global_unaligned_load_store_i16(i16 addrspace(1)* %p, i16 addrspace(1)* %r) #0 {
%v = load i16, i16 addrspace(1)* %p, align 1
store i16 %v, i16 addrspace(1)* %r, align 1
ret void
}
; FUNC-LABEL: {{^}}local_unaligned_load_store_i32:
; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_write_b8
; GCN: ds_write_b8
; GCN: ds_write_b8
; GCN: ds_write_b8
; GCN: s_endpgm
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI-NOT: v_or
; SI-NOT: v_lshl
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: s_endpgm
define void @local_unaligned_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) #0 {
%v = load i32, i32 addrspace(3)* %p, align 1
store i32 %v, i32 addrspace(3)* %r, align 1
@@ -98,141 +101,149 @@ define void @local_align2_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)*
ret void
}
; FIXME: Unnecessary packing and unpacking of bytes.
; FUNC-LABEL: {{^}}local_unaligned_load_store_i64:
; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; XGCN-NOT: v_or_b32
; XGCN-NOT: v_lshl
; GCN: ds_write_b8
; XGCN-NOT: v_or_b32
; XGCN-NOT: v_lshl
; SI-NOT: v_or_b32
; SI-NOT: v_lshl
; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl
; GCN: ds_write_b8
; XGCN-NOT: v_or_b32
; XGCN-NOT: v_lshl
; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl
; GCN: ds_write_b8
; XGCN-NOT: v_or_b32
; XGCN-NOT: v_lshl
; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl
; GCN: ds_write_b8
; XGCN-NOT: v_or_b32
; XGCN-NOT: v_lshl
; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl
; GCN: ds_write_b8
; XGCN-NOT: v_or_b32
; XGCN-NOT: v_lshl
; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl
; GCN: ds_write_b8
; XGCN-NOT: v_or_b32
; XGCN-NOT: v_lshl
; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl
; GCN: ds_write_b8
; XGCN-NOT: v_or_b32
; XGCN-NOT: v_lshl
; GCN: ds_write_b8
; GCN: s_endpgm
; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl
; SI: ds_write_b8
; SI: s_endpgm
define void @local_unaligned_load_store_i64(i64 addrspace(3)* %p, i64 addrspace(3)* %r) {
%v = load i64, i64 addrspace(3)* %p, align 1
store i64 %v, i64 addrspace(3)* %r, align 1
ret void
}
; FUNC-LABEL: {{^}}local_unaligned_load_store_v2i32:
; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8
; SI-LABEL: {{^}}local_unaligned_load_store_v2i32:
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; XGCN-NOT: v_or_b32
; XGCN-NOT: v_lshl
; GCN: ds_write_b8
; XGCN-NOT: v_or_b32
; XGCN-NOT: v_lshl
; SI-NOT: v_or_b32
; SI-NOT: v_lshl
; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl
; GCN: ds_write_b8
; XGCN-NOT: v_or_b32
; XGCN-NOT: v_lshl
; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl
; GCN: ds_write_b8
; XGCN-NOT: v_or_b32
; XGCN-NOT: v_lshl
; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl
; GCN: ds_write_b8
; XGCN-NOT: v_or_b32
; XGCN-NOT: v_lshl
; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl
; GCN: ds_write_b8
; XGCN-NOT: v_or_b32
; XGCN-NOT: v_lshl
; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl
; GCN: ds_write_b8
; XGCN-NOT: v_or_b32
; XGCN-NOT: v_lshl
; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl
; GCN: ds_write_b8
; XGCN-NOT: v_or_b32
; XGCN-NOT: v_lshl
; GCN: ds_write_b8
; GCN: s_endpgm
; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl
; SI: ds_write_b8
; SI: s_endpgm
define void @local_unaligned_load_store_v2i32(<2 x i32> addrspace(3)* %p, <2 x i32> addrspace(3)* %r) {
%v = load <2 x i32>, <2 x i32> addrspace(3)* %p, align 1
store <2 x i32> %v, <2 x i32> addrspace(3)* %r, align 1
ret void
}
; FUNC-LABEL: {{^}}unaligned_load_store_i64_global:
; GCN-NOHSA: buffer_load_ubyte
; GCN-NOHSA: buffer_load_ubyte
; GCN-NOHSA: buffer_load_ubyte
; GCN-NOHSA: buffer_load_ubyte
; GCN-NOHSA: buffer_load_ubyte
; GCN-NOHSA: buffer_load_ubyte
; GCN-NOHSA: buffer_load_ubyte
; GCN-NOHSA: buffer_load_ubyte
; SI-LABEL: {{^}}global_align2_load_store_i64:
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; GCN-HSA: flat_load_ubyte
; GCN-HSA: flat_load_ubyte
; GCN-HSA: flat_load_ubyte
; GCN-HSA: flat_load_ubyte
; GCN-HSA: flat_load_ubyte
; GCN-HSA: flat_load_ubyte
; GCN-HSA: flat_load_ubyte
; GCN-HSA: flat_load_ubyte
; SI-NOT: v_or_
; SI-NOT: v_lshl
; XGCN-NOT: v_or_
; XGCN-NOT: v_lshl
; SI: buffer_load_ushort
; GCN-NOHSA: buffer_store_byte
; GCN-NOHSA: buffer_store_byte
; GCN-NOHSA: buffer_store_byte
; GCN-NOHSA: buffer_store_byte
; GCN-NOHSA: buffer_store_byte
; GCN-NOHSA: buffer_store_byte
; GCN-NOHSA: buffer_store_byte
; GCN-NOHSA: buffer_store_byte
; SI-NOT: v_or_
; SI-NOT: v_lshl
; GCN-HSA: flat_store_byte
; GCN-HSA: flat_store_byte
; GCN-HSA: flat_store_byte
; GCN-HSA: flat_store_byte
; GCN-HSA: flat_store_byte
; GCN-HSA: flat_store_byte
; GCN-HSA: flat_store_byte
; GCN-HSA: flat_store_byte
define void @unaligned_load_store_i64_global(i64 addrspace(1)* %p, i64 addrspace(1)* %r) #0 {
; SI: buffer_load_ushort
; SI-NOT: v_or_
; SI-NOT: v_lshl
; SI: buffer_store_short
; SI: buffer_store_short
; SI: buffer_store_short
; SI: buffer_store_short
define void @global_align2_load_store_i64(i64 addrspace(1)* %p, i64 addrspace(1)* %r) {
%v = load i64, i64 addrspace(1)* %p, align 2
store i64 %v, i64 addrspace(1)* %r, align 2
ret void
}
; SI-LABEL: {{^}}unaligned_load_store_i64_global:
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI-NOT: v_or_
; SI-NOT: v_lshl
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
define void @unaligned_load_store_i64_global(i64 addrspace(1)* %p, i64 addrspace(1)* %r) {
%v = load i64, i64 addrspace(1)* %p, align 1
store i64 %v, i64 addrspace(1)* %r, align 1
ret void
@@ -285,76 +296,41 @@ define void @local_unaligned_load_store_v4i32(<4 x i32> addrspace(3)* %p, <4 x i
ret void
}
; FUNC-LABEL: {{^}}global_unaligned_load_store_v4i32:
; GCN-NOHSA: buffer_load_ubyte
; GCN-NOHSA: buffer_load_ubyte
; GCN-NOHSA: buffer_load_ubyte
; GCN-NOHSA: buffer_load_ubyte
; GCN-NOHSA: buffer_load_ubyte
; GCN-NOHSA: buffer_load_ubyte
; GCN-NOHSA: buffer_load_ubyte
; GCN-NOHSA: buffer_load_ubyte
; GCN-NOHSA: buffer_load_ubyte
; GCN-NOHSA: buffer_load_ubyte
; GCN-NOHSA: buffer_load_ubyte
; GCN-NOHSA: buffer_load_ubyte
; GCN-NOHSA: buffer_load_ubyte
; GCN-NOHSA: buffer_load_ubyte
; GCN-NOHSA: buffer_load_ubyte
; GCN-NOHSA: buffer_load_ubyte
; SI-LABEL: {{^}}global_unaligned_load_store_v4i32
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; GCN-NOHSA: buffer_store_byte
; GCN-NOHSA: buffer_store_byte
; GCN-NOHSA: buffer_store_byte
; GCN-NOHSA: buffer_store_byte
; GCN-NOHSA: buffer_store_byte
; GCN-NOHSA: buffer_store_byte
; GCN-NOHSA: buffer_store_byte
; GCN-NOHSA: buffer_store_byte
; GCN-NOHSA: buffer_store_byte
; GCN-NOHSA: buffer_store_byte
; GCN-NOHSA: buffer_store_byte
; GCN-NOHSA: buffer_store_byte
; GCN-NOHSA: buffer_store_byte
; GCN-NOHSA: buffer_store_byte
; GCN-NOHSA: buffer_store_byte
; GCN-NOHSA: buffer_store_byte
; GCN-HSA: flat_load_ubyte
; GCN-HSA: flat_load_ubyte
; GCN-HSA: flat_load_ubyte
; GCN-HSA: flat_load_ubyte
; GCN-HSA: flat_load_ubyte
; GCN-HSA: flat_load_ubyte
; GCN-HSA: flat_load_ubyte
; GCN-HSA: flat_load_ubyte
; GCN-HSA: flat_load_ubyte
; GCN-HSA: flat_load_ubyte
; GCN-HSA: flat_load_ubyte
; GCN-HSA: flat_load_ubyte
; GCN-HSA: flat_load_ubyte
; GCN-HSA: flat_load_ubyte
; GCN-HSA: flat_load_ubyte
; GCN-HSA: flat_load_ubyte
; GCN-HSA: flat_store_byte
; GCN-HSA: flat_store_byte
; GCN-HSA: flat_store_byte
; GCN-HSA: flat_store_byte
; GCN-HSA: flat_store_byte
; GCN-HSA: flat_store_byte
; GCN-HSA: flat_store_byte
; GCN-HSA: flat_store_byte
; GCN-HSA: flat_store_byte
; GCN-HSA: flat_store_byte
; GCN-HSA: flat_store_byte
; GCN-HSA: flat_store_byte
; GCN-HSA: flat_store_byte
; GCN-HSA: flat_store_byte
; GCN-HSA: flat_store_byte
; GCN-HSA: flat_store_byte
define void @global_unaligned_load_store_v4i32(<4 x i32> addrspace(1)* %p, <4 x i32> addrspace(1)* %r) #0 {
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
define void @global_unaligned_load_store_v4i32(<4 x i32> addrspace(1)* %p, <4 x i32> addrspace(1)* %r) nounwind {
%v = load <4 x i32>, <4 x i32> addrspace(1)* %p, align 1
store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 1
ret void