forked from OSchip/llvm-project
[AMDGPU] Avoid splitting FLAT offsets in unsafe ways
As explained in the comment:

  // For a FLAT instruction the hardware decides whether to access
  // global/scratch/shared memory based on the high bits of vaddr,
  // ignoring the offset field, so we have to ensure that when we add
  // remainder to vaddr it still points into the same underlying object.
  // The easiest way to do that is to make sure that we split the offset
  // into two pieces that are both >= 0 or both <= 0.

In particular FLAT (as opposed to SCRATCH and GLOBAL) instructions have an
unsigned immediate offset field, so we can't use it to help split a
negative offset.

Differential Revision: https://reviews.llvm.org/D83394
This commit is contained in:
parent
1cfb207737
commit
760af7a074
|
@ -1688,33 +1688,27 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N,
|
|||
} else {
|
||||
// If the offset doesn't fit, put the low bits into the offset field and
|
||||
// add the rest.
|
||||
//
|
||||
// For a FLAT instruction the hardware decides whether to access
|
||||
// global/scratch/shared memory based on the high bits of vaddr,
|
||||
// ignoring the offset field, so we have to ensure that when we add
|
||||
// remainder to vaddr it still points into the same underlying object.
|
||||
// The easiest way to do that is to make sure that we split the offset
|
||||
// into two pieces that are both >= 0 or both <= 0.
|
||||
|
||||
SDLoc DL(N);
|
||||
uint64_t ImmField;
|
||||
uint64_t RemainderOffset = COffsetVal;
|
||||
uint64_t ImmField = 0;
|
||||
const unsigned NumBits = TII->getNumFlatOffsetBits(AS, IsSigned);
|
||||
if (IsSigned) {
|
||||
ImmField = SignExtend64(COffsetVal, NumBits);
|
||||
|
||||
// Don't use a negative offset field if the base offset is positive.
|
||||
// Since the scheduler currently relies on the offset field, doing so
|
||||
// could result in strange scheduling decisions.
|
||||
|
||||
// TODO: Should we not do this in the opposite direction as well?
|
||||
if (static_cast<int64_t>(COffsetVal) > 0) {
|
||||
if (static_cast<int64_t>(ImmField) < 0) {
|
||||
const uint64_t OffsetMask =
|
||||
maskTrailingOnes<uint64_t>(NumBits - 1);
|
||||
ImmField = COffsetVal & OffsetMask;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// TODO: Should we do this for a negative offset?
|
||||
const uint64_t OffsetMask = maskTrailingOnes<uint64_t>(NumBits);
|
||||
ImmField = COffsetVal & OffsetMask;
|
||||
// Use signed division by a power of two to truncate towards 0.
|
||||
int64_t D = 1LL << (NumBits - 1);
|
||||
RemainderOffset = (static_cast<int64_t>(COffsetVal) / D) * D;
|
||||
ImmField = COffsetVal - RemainderOffset;
|
||||
} else if (static_cast<int64_t>(COffsetVal) >= 0) {
|
||||
ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
|
||||
RemainderOffset = COffsetVal - ImmField;
|
||||
}
|
||||
|
||||
uint64_t RemainderOffset = COffsetVal - ImmField;
|
||||
|
||||
assert(TII->isLegalFLATOffset(ImmField, AS, IsSigned));
|
||||
assert(RemainderOffset + ImmField == COffsetVal);
|
||||
|
||||
|
|
|
@ -191,9 +191,9 @@ define amdgpu_kernel void @store_flat_i8_max_offset_p1(i8* %fptr, i8 %x) #0 {
|
|||
; CHECK-LABEL: {{^}}store_flat_i8_neg_offset:
|
||||
; CIVI: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}{{$}}
|
||||
|
||||
; GFX9: v_add_co_u32_e32 v{{[0-9]+}}, vcc, 0xfffff000, v
|
||||
; GFX9: v_add_co_u32_e64 v{{[0-9]+}}, vcc, -2, s
|
||||
; GFX9: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, -1,
|
||||
; GFX9: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:4094{{$}}
|
||||
; GFX9: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}{{$}}
|
||||
define amdgpu_kernel void @store_flat_i8_neg_offset(i8* %fptr, i8 %x) #0 {
|
||||
%fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 -2
|
||||
store volatile i8 %x, i8* %fptr.offset
|
||||
|
@ -220,9 +220,9 @@ define amdgpu_kernel void @load_flat_i8_max_offset_p1(i8* %fptr) #0 {
|
|||
; CHECK-LABEL: {{^}}load_flat_i8_neg_offset:
|
||||
; CIVI: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}{{$}}
|
||||
|
||||
; GFX9: v_add_co_u32_e32 v{{[0-9]+}}, vcc, 0xfffff000, v
|
||||
; GFX9: v_add_co_u32_e64 v{{[0-9]+}}, vcc, -2, s
|
||||
; GFX9: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, -1,
|
||||
; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:4094{{$}}
|
||||
; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}{{$}}
|
||||
define amdgpu_kernel void @load_flat_i8_neg_offset(i8* %fptr) #0 {
|
||||
%fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 -2
|
||||
%val = load volatile i8, i8* %fptr.offset
|
||||
|
|
|
@ -103,9 +103,9 @@ define i8 @flat_inst_valu_offset_neg_11bit_max(i8* %p) {
|
|||
; GFX9-LABEL: flat_inst_valu_offset_neg_11bit_max:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
|
||||
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
|
||||
; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2048
|
||||
; GFX9-NEXT: flat_load_ubyte v0, v[0:1]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
|
@ -483,10 +483,10 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split0(i8* %p) {
|
|||
; GFX9-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x7ff, v0
|
||||
; GFX9-NEXT: v_bfrev_b32_e32 v2, 1
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0
|
||||
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
|
||||
; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2047
|
||||
; GFX9-NEXT: flat_load_ubyte v0, v[0:1]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
|
@ -510,10 +510,10 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split1(i8* %p) {
|
|||
; GFX9-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x800, v0
|
||||
; GFX9-NEXT: v_bfrev_b32_e32 v2, 1
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0
|
||||
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
|
||||
; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2048
|
||||
; GFX9-NEXT: flat_load_ubyte v0, v[0:1]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
|
@ -537,10 +537,10 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split0(i8* %p) {
|
|||
; GFX9-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfff, v0
|
||||
; GFX9-NEXT: v_bfrev_b32_e32 v2, 1
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0
|
||||
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
|
||||
; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095
|
||||
; GFX9-NEXT: flat_load_ubyte v0, v[0:1]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
|
@ -591,10 +591,10 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split0(i8* %p) {
|
|||
; GFX9-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1fff, v0
|
||||
; GFX9-NEXT: v_bfrev_b32_e32 v2, 1
|
||||
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
|
||||
; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095
|
||||
; GFX9-NEXT: flat_load_ubyte v0, v[0:1]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
|
@ -773,9 +773,9 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(i8* %p) {
|
|||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
|
||||
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
|
||||
; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2048
|
||||
; GFX9-NEXT: flat_load_ubyte v0, v[0:1]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: flat_store_byte v[0:1], v0
|
||||
; GFX9-NEXT: s_endpgm
|
||||
|
@ -1269,10 +1269,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(i8*
|
|||
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX9-NEXT: v_bfrev_b32_e32 v1, 1
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x7ff, v0
|
||||
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
|
||||
; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2047
|
||||
; GFX9-NEXT: flat_load_ubyte v0, v[0:1]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: flat_store_byte v[0:1], v0
|
||||
; GFX9-NEXT: s_endpgm
|
||||
|
@ -1303,10 +1304,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(i8*
|
|||
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX9-NEXT: v_bfrev_b32_e32 v1, 1
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x800, v0
|
||||
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
|
||||
; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2048
|
||||
; GFX9-NEXT: flat_load_ubyte v0, v[0:1]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: flat_store_byte v[0:1], v0
|
||||
; GFX9-NEXT: s_endpgm
|
||||
|
@ -1337,10 +1339,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(i8*
|
|||
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX9-NEXT: v_bfrev_b32_e32 v1, 1
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfff, v0
|
||||
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
|
||||
; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095
|
||||
; GFX9-NEXT: flat_load_ubyte v0, v[0:1]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: flat_store_byte v[0:1], v0
|
||||
; GFX9-NEXT: s_endpgm
|
||||
|
@ -1408,9 +1411,9 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(i8*
|
|||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1fff, v0
|
||||
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
|
||||
; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095
|
||||
; GFX9-NEXT: flat_load_ubyte v0, v[0:1]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: flat_store_byte v[0:1], v0
|
||||
; GFX9-NEXT: s_endpgm
|
||||
|
|
|
@ -471,10 +471,10 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split0(i8 addrspace(1)*
|
|||
; GFX9-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
|
||||
; GFX9-NEXT: v_bfrev_b32_e32 v2, 1
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0
|
||||
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
|
||||
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2047
|
||||
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-2049
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
|
@ -482,10 +482,10 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split0(i8 addrspace(1)*
|
|||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0, v0
|
||||
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x800, v0
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
|
||||
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047
|
||||
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854773761
|
||||
|
@ -498,10 +498,10 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split1(i8 addrspace(1)*
|
|||
; GFX9-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
|
||||
; GFX9-NEXT: v_bfrev_b32_e32 v2, 1
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0
|
||||
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
|
||||
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2048
|
||||
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-2048
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
|
@ -509,10 +509,10 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split1(i8 addrspace(1)*
|
|||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1000, v0
|
||||
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x800, v0
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
|
||||
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-2048
|
||||
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854773760
|
||||
|
@ -525,10 +525,10 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split0(i8 addrspace(1)*
|
|||
; GFX9-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
|
||||
; GFX9-NEXT: v_bfrev_b32_e32 v2, 1
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0
|
||||
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
|
||||
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095
|
||||
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-1
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
|
@ -552,10 +552,10 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split1(i8 addrspace(1)*
|
|||
; GFX9-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
|
||||
; GFX9-NEXT: v_bfrev_b32_e32 v2, 1
|
||||
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
|
||||
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-4096
|
||||
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
|
@ -1213,10 +1213,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(i
|
|||
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX9-NEXT: v_bfrev_b32_e32 v1, 1
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
|
||||
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
|
||||
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2047
|
||||
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-2049
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_byte v[0:1], v0, off
|
||||
; GFX9-NEXT: s_endpgm
|
||||
|
@ -1227,9 +1228,9 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(i
|
|||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0, s0
|
||||
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x800, s0
|
||||
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
|
||||
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047
|
||||
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: global_store_byte v[0:1], v0, off
|
||||
; GFX10-NEXT: s_endpgm
|
||||
|
@ -1246,10 +1247,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(i
|
|||
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX9-NEXT: v_bfrev_b32_e32 v1, 1
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
|
||||
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
|
||||
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2048
|
||||
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-2048
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_byte v[0:1], v0, off
|
||||
; GFX9-NEXT: s_endpgm
|
||||
|
@ -1260,9 +1262,9 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(i
|
|||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1000, s0
|
||||
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x800, s0
|
||||
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
|
||||
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-2048
|
||||
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: global_store_byte v[0:1], v0, off
|
||||
; GFX10-NEXT: s_endpgm
|
||||
|
@ -1279,10 +1281,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(i
|
|||
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX9-NEXT: v_bfrev_b32_e32 v1, 1
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
|
||||
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
|
||||
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095
|
||||
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-1
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_byte v[0:1], v0, off
|
||||
; GFX9-NEXT: s_endpgm
|
||||
|
@ -1314,9 +1317,9 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(i
|
|||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
|
||||
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
|
||||
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-4096
|
||||
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_byte v[0:1], v0, off
|
||||
; GFX9-NEXT: s_endpgm
|
||||
|
|
|
@ -87,18 +87,17 @@ define hidden amdgpu_kernel void @clmem_read(i8 addrspace(1)* %buffer) {
|
|||
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
;
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
|
||||
;
|
||||
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
|
||||
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
|
||||
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
|
||||
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
|
||||
|
@ -107,6 +106,7 @@ define hidden amdgpu_kernel void @clmem_read(i8 addrspace(1)* %buffer) {
|
|||
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
|
||||
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
|
||||
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
|
||||
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
|
||||
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
|
||||
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
|
||||
entry:
|
||||
|
@ -519,11 +519,11 @@ define hidden amdgpu_kernel void @negativeoffset(i8 addrspace(1)* nocapture %buf
|
|||
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
;
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
|
||||
;
|
||||
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
|
||||
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
|
||||
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
|
||||
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
|
||||
entry:
|
||||
%call = tail call i64 @_Z13get_global_idj(i32 0) #2
|
||||
%conv = and i64 %call, 255
|
||||
|
|
|
@ -314,13 +314,13 @@ entry:
|
|||
; GFX803: v_add{{(_co)?}}_{{i|u}}32_e32
|
||||
; GFX803: v_addc_u32_e32
|
||||
|
||||
; GFX9-DAG: v_add_co_u32_e32 v{{[0-9]+}}, vcc, 0xfffff000, v
|
||||
; GFX9-DAG: v_add_co_u32_e32 v{{[0-9]+}}, vcc, 0xfffff802, v
|
||||
; GFX9-DAG: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, -1, v
|
||||
|
||||
; GFX906-DAG: v_lshrrev_b32_e32
|
||||
; GFX906: flat_store_short v[0:1], v2 offset:2050{{$}}
|
||||
; GFX906: flat_store_short v[0:1], v2{{$}}
|
||||
|
||||
; GFX900-NEXT: flat_store_short_d16_hi v[0:1], v2 offset:2050{{$}}
|
||||
; GFX900-NEXT: flat_store_short_d16_hi v[0:1], v2{{$}}
|
||||
; GFX803: flat_store_short v[0:1], v2{{$}}
|
||||
; GCN-NEXT: s_waitcnt
|
||||
; GCN-NEXT: s_setpc_b64
|
||||
|
@ -363,13 +363,13 @@ entry:
|
|||
; GFX803-DAG: v_add_u32_e32
|
||||
; GFX803-DAG: v_addc_u32_e32
|
||||
|
||||
; GFX9-DAG: v_add_co_u32_e32 v{{[0-9]+}}, vcc, 0xfffff000, v
|
||||
; GFX9-DAG: v_add_co_u32_e32 v{{[0-9]+}}, vcc, 0xfffff001, v
|
||||
; GFX9-DAG: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, -1, v{{[0-9]+}}, vcc
|
||||
|
||||
; GFX900-NEXT: flat_store_byte_d16_hi v[0:1], v2 offset:1{{$}}
|
||||
; GFX900-NEXT: flat_store_byte_d16_hi v[0:1], v2{{$}}
|
||||
|
||||
; GFX906-DAG: v_lshrrev_b32_e32 v2, 16, v2
|
||||
; GFX906: flat_store_byte v[0:1], v2 offset:1{{$}}
|
||||
; GFX906: flat_store_byte v[0:1], v2{{$}}
|
||||
|
||||
; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2
|
||||
; GFX803: flat_store_byte v[0:1], v2{{$}}
|
||||
|
|
Loading…
Reference in New Issue