; llvm-project/llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll

; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s


; There is no dependence between the store and the two loads. So we can combine the loads
; and the combined load is at the original place of the second load.

; GCN-LABEL: {{^}}ds_combine_nodep

; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:7 offset1:8
; GCN-NEXT: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27
define amdgpu_kernel void @ds_combine_nodep(float addrspace(1)* %out, float addrspace(3)* %inptr) {
  ; First load: a <3 x float> starting 24 bytes past %inptr. Only lane 2 is
  ; used, which sits at byte 32, i.e. float index 8.
  %bytes = bitcast float addrspace(3)* %inptr to i8 addrspace(3)*
  %vec.raw = getelementptr i8, i8 addrspace(3)* %bytes, i32 24
  %vec.ptr = bitcast i8 addrspace(3)* %vec.raw to <3 x float> addrspace(3)*
  %vec = load <3 x float>, <3 x float> addrspace(3)* %vec.ptr, align 4
  %first = extractelement <3 x float> %vec, i32 2

  ; Intervening store: writes float indices 26 and 27, which neither load reads.
  %pair.lo = insertelement <2 x float> undef, float 1.0, i32 0
  %pair = insertelement <2 x float> %pair.lo, float 2.0, i32 1
  %st.elt = getelementptr float, float addrspace(3)* %inptr, i32 26
  %st.ptr = bitcast float addrspace(3)* %st.elt to <2 x float> addrspace(3)*
  store <2 x float> %pair, <2 x float> addrspace(3)* %st.ptr, align 4

  ; Second load: the scalar at float index 7.
  %second.ptr = getelementptr float, float addrspace(3)* %inptr, i32 7
  %second = load float, float addrspace(3)* %second.ptr, align 4

  ; Use both loaded values so neither load is dead.
  %total = fadd float %first, %second
  store float %total, float addrspace(1)* %out, align 4
  ret void
}


; The store depends on the first load, so we cannot move the first load down to combine with
; the second load directly. However, we can move the store after the combined load.

; GCN-LABEL: {{^}}ds_combine_WAR

; GCN:      ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:7 offset1:27
; GCN-NEXT: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27
define amdgpu_kernel void @ds_combine_WAR(float addrspace(1)* %out, float addrspace(3)* %inptr) {
  ; First load: a <3 x float> starting 100 bytes past %inptr. Only lane 2 is
  ; used, which sits at byte 108, i.e. float index 27.
  %bytes = bitcast float addrspace(3)* %inptr to i8 addrspace(3)*
  %vec.raw = getelementptr i8, i8 addrspace(3)* %bytes, i32 100
  %vec.ptr = bitcast i8 addrspace(3)* %vec.raw to <3 x float> addrspace(3)*
  %vec = load <3 x float>, <3 x float> addrspace(3)* %vec.ptr, align 4
  %first = extractelement <3 x float> %vec, i32 2

  ; Intervening store: writes float indices 26 and 27, overwriting the location
  ; the first load just read (write-after-read).
  %pair.lo = insertelement <2 x float> undef, float 1.0, i32 0
  %pair = insertelement <2 x float> %pair.lo, float 2.0, i32 1
  %st.elt = getelementptr float, float addrspace(3)* %inptr, i32 26
  %st.ptr = bitcast float addrspace(3)* %st.elt to <2 x float> addrspace(3)*
  store <2 x float> %pair, <2 x float> addrspace(3)* %st.ptr, align 4

  ; Second load: the scalar at float index 7, which the store does not touch.
  %second.ptr = getelementptr float, float addrspace(3)* %inptr, i32 7
  %second = load float, float addrspace(3)* %second.ptr, align 4

  ; Use both loaded values so neither load is dead.
  %total = fadd float %first, %second
  store float %total, float addrspace(1)* %out, align 4
  ret void
}


; The second load depends on the store. We can combine the two loads, and the combined load is
; at the original place of the second load.

; GCN-LABEL: {{^}}ds_combine_RAW

; GCN:      ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27
; GCN-NEXT: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:8 offset1:26
define amdgpu_kernel void @ds_combine_RAW(float addrspace(1)* %out, float addrspace(3)* %inptr) {
  ; First load: a <3 x float> starting 24 bytes past %inptr. Only lane 2 is
  ; used, which sits at byte 32, i.e. float index 8. The store below does not
  ; touch it.
  %bytes = bitcast float addrspace(3)* %inptr to i8 addrspace(3)*
  %vec.raw = getelementptr i8, i8 addrspace(3)* %bytes, i32 24
  %vec.ptr = bitcast i8 addrspace(3)* %vec.raw to <3 x float> addrspace(3)*
  %vec = load <3 x float>, <3 x float> addrspace(3)* %vec.ptr, align 4
  %first = extractelement <3 x float> %vec, i32 2

  ; Intervening store: writes float indices 26 and 27.
  %pair.lo = insertelement <2 x float> undef, float 1.0, i32 0
  %pair = insertelement <2 x float> %pair.lo, float 2.0, i32 1
  %st.elt = getelementptr float, float addrspace(3)* %inptr, i32 26
  %st.ptr = bitcast float addrspace(3)* %st.elt to <2 x float> addrspace(3)*
  store <2 x float> %pair, <2 x float> addrspace(3)* %st.ptr, align 4

  ; Second load: the scalar at float index 26, i.e. a value the store has just
  ; written (read-after-write).
  %second.ptr = getelementptr float, float addrspace(3)* %inptr, i32 26
  %second = load float, float addrspace(3)* %second.ptr, align 4

  ; Use both loaded values so neither load is dead.
  %total = fadd float %first, %second
  store float %total, float addrspace(1)* %out, align 4
  ret void
}


; The store depends on the first load, and the second load depends on the store,
; so we cannot combine the two loads.

; GCN-LABEL: {{^}}ds_combine_WAR_RAW

; GCN:      ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:108
; GCN-NEXT: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27
; GCN-NEXT: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:104
define amdgpu_kernel void @ds_combine_WAR_RAW(float addrspace(1)* %out, float addrspace(3)* %inptr) {
  ; First load: a <3 x float> starting 100 bytes past %inptr. Only lane 2 is
  ; used, which sits at byte 108, i.e. float index 27.
  %bytes = bitcast float addrspace(3)* %inptr to i8 addrspace(3)*
  %vec.raw = getelementptr i8, i8 addrspace(3)* %bytes, i32 100
  %vec.ptr = bitcast i8 addrspace(3)* %vec.raw to <3 x float> addrspace(3)*
  %vec = load <3 x float>, <3 x float> addrspace(3)* %vec.ptr, align 4
  %first = extractelement <3 x float> %vec, i32 2

  ; Intervening store: writes float indices 26 and 27, overwriting the location
  ; the first load just read (write-after-read).
  %pair.lo = insertelement <2 x float> undef, float 1.0, i32 0
  %pair = insertelement <2 x float> %pair.lo, float 2.0, i32 1
  %st.elt = getelementptr float, float addrspace(3)* %inptr, i32 26
  %st.ptr = bitcast float addrspace(3)* %st.elt to <2 x float> addrspace(3)*
  store <2 x float> %pair, <2 x float> addrspace(3)* %st.ptr, align 4

  ; Second load: the scalar at float index 26 (byte 104), i.e. a value the
  ; store has just written (read-after-write).
  %second.ptr = getelementptr float, float addrspace(3)* %inptr, i32 26
  %second = load float, float addrspace(3)* %second.ptr, align 4

  ; Use both loaded values so neither load is dead.
  %total = fadd float %first, %second
  store float %total, float addrspace(1)* %out, align 4
  ret void
}
AMDGPU: Use MachineInstr::mayAlias to replace areMemAccessesTriviallyDisjoint in LoadStoreOptimizer pass. Summary: This is to fix a memory dependence bug in LoadStoreOptimizer. Reviewers: arsenm, rampitec Differential Revision: https://reviews.llvm.org/D58295 llvm-svn: 354295 2019-02-19 07:00:26 +08:00			`; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s \| FileCheck -check-prefixes=GCN %s`


			`; There is no dependence between the store and the two loads. So we can combine the loads`
			`; and the combined load is at the original place of the second load.`

			`; GCN-LABEL: {{^}}ds_combine_nodep`

AMDGPU: Correct DS implementation of areLoadsFromSameBasePtr This was checking the wrong operands for the base register and the offsets. The indexes are shifted by the number of output registers from the machine instruction definition, and the chain is moved to the end. llvm-svn: 355722 2019-03-09 04:30:50 +08:00			`; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:7 offset1:8`
			`; GCN-NEXT: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27`
AMDGPU: Use MachineInstr::mayAlias to replace areMemAccessesTriviallyDisjoint in LoadStoreOptimizer pass. Summary: This is to fix a memory dependence bug in LoadStoreOptimizer. Reviewers: arsenm, rampitec Differential Revision: https://reviews.llvm.org/D58295 llvm-svn: 354295 2019-02-19 07:00:26 +08:00			`define amdgpu_kernel void @ds_combine_nodep(float addrspace(1)* %out, float addrspace(3)* %inptr) {`

			`%base = bitcast float addrspace(3)* %inptr to i8 addrspace(3)*`
			`%addr0 = getelementptr i8, i8 addrspace(3)* %base, i32 24`
			`%tmp0 = bitcast i8 addrspace(3)* %addr0 to float addrspace(3)*`
			`%vaddr0 = bitcast float addrspace(3)* %tmp0 to <3 x float> addrspace(3)*`
			`%load0 = load <3 x float>, <3 x float> addrspace(3)* %vaddr0, align 4`
			`%v0 = extractelement <3 x float> %load0, i32 2`

			`%tmp1 = insertelement <2 x float> undef, float 1.0, i32 0`
			`%data = insertelement <2 x float> %tmp1, float 2.0, i32 1`

			`%tmp2 = getelementptr float, float addrspace(3)* %inptr, i32 26`
			`%vaddrs = bitcast float addrspace(3)* %tmp2 to <2 x float> addrspace(3)*`
			`store <2 x float> %data, <2 x float> addrspace(3)* %vaddrs, align 4`

			`%vaddr1 = getelementptr float, float addrspace(3)* %inptr, i32 7`
			`%v1 = load float, float addrspace(3)* %vaddr1, align 4`

			`%sum = fadd float %v0, %v1`
			`store float %sum, float addrspace(1)* %out, align 4`
			`ret void`
			`}`


			`; The store depends on the first load, so we could not move the first load down to combine with`
			`; the second load directly. However, we can move the store after the combined load.`

			`; GCN-LABEL: {{^}}ds_combine_WAR`

			`; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:7 offset1:27`
			`; GCN-NEXT: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27`
			`define amdgpu_kernel void @ds_combine_WAR(float addrspace(1)* %out, float addrspace(3)* %inptr) {`

			`%base = bitcast float addrspace(3)* %inptr to i8 addrspace(3)*`
			`%addr0 = getelementptr i8, i8 addrspace(3)* %base, i32 100`
			`%tmp0 = bitcast i8 addrspace(3)* %addr0 to float addrspace(3)*`
			`%vaddr0 = bitcast float addrspace(3)* %tmp0 to <3 x float> addrspace(3)*`
			`%load0 = load <3 x float>, <3 x float> addrspace(3)* %vaddr0, align 4`
			`%v0 = extractelement <3 x float> %load0, i32 2`

			`%tmp1 = insertelement <2 x float> undef, float 1.0, i32 0`
			`%data = insertelement <2 x float> %tmp1, float 2.0, i32 1`

			`%tmp2 = getelementptr float, float addrspace(3)* %inptr, i32 26`
			`%vaddrs = bitcast float addrspace(3)* %tmp2 to <2 x float> addrspace(3)*`
			`store <2 x float> %data, <2 x float> addrspace(3)* %vaddrs, align 4`

			`%vaddr1 = getelementptr float, float addrspace(3)* %inptr, i32 7`
			`%v1 = load float, float addrspace(3)* %vaddr1, align 4`

			`%sum = fadd float %v0, %v1`
			`store float %sum, float addrspace(1)* %out, align 4`
			`ret void`
			`}`


			`; The second load depends on the store. We can combine the two loads, and the combined load is`
			`; at the original place of the second load.`

			`; GCN-LABEL: {{^}}ds_combine_RAW`

			`; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27`
			`; GCN-NEXT: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:8 offset1:26`
			`define amdgpu_kernel void @ds_combine_RAW(float addrspace(1)* %out, float addrspace(3)* %inptr) {`

			`%base = bitcast float addrspace(3)* %inptr to i8 addrspace(3)*`
			`%addr0 = getelementptr i8, i8 addrspace(3)* %base, i32 24`
			`%tmp0 = bitcast i8 addrspace(3)* %addr0 to float addrspace(3)*`
			`%vaddr0 = bitcast float addrspace(3)* %tmp0 to <3 x float> addrspace(3)*`
			`%load0 = load <3 x float>, <3 x float> addrspace(3)* %vaddr0, align 4`
			`%v0 = extractelement <3 x float> %load0, i32 2`

			`%tmp1 = insertelement <2 x float> undef, float 1.0, i32 0`
			`%data = insertelement <2 x float> %tmp1, float 2.0, i32 1`

			`%tmp2 = getelementptr float, float addrspace(3)* %inptr, i32 26`
			`%vaddrs = bitcast float addrspace(3)* %tmp2 to <2 x float> addrspace(3)*`
			`store <2 x float> %data, <2 x float> addrspace(3)* %vaddrs, align 4`

			`%vaddr1 = getelementptr float, float addrspace(3)* %inptr, i32 26`
			`%v1 = load float, float addrspace(3)* %vaddr1, align 4`

			`%sum = fadd float %v0, %v1`
			`store float %sum, float addrspace(1)* %out, align 4`
			`ret void`
			`}`


			`; The store depends on the first load, also the second load depends on the store.`
			`; So we can not combine the two loads.`

			`; GCN-LABEL: {{^}}ds_combine_WAR_RAW`

			`; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:108`
			`; GCN-NEXT: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27`
			`; GCN-NEXT: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:104`
			`define amdgpu_kernel void @ds_combine_WAR_RAW(float addrspace(1)* %out, float addrspace(3)* %inptr) {`

			`%base = bitcast float addrspace(3)* %inptr to i8 addrspace(3)*`
			`%addr0 = getelementptr i8, i8 addrspace(3)* %base, i32 100`
			`%tmp0 = bitcast i8 addrspace(3)* %addr0 to float addrspace(3)*`
			`%vaddr0 = bitcast float addrspace(3)* %tmp0 to <3 x float> addrspace(3)*`
			`%load0 = load <3 x float>, <3 x float> addrspace(3)* %vaddr0, align 4`
			`%v0 = extractelement <3 x float> %load0, i32 2`

			`%tmp1 = insertelement <2 x float> undef, float 1.0, i32 0`
			`%data = insertelement <2 x float> %tmp1, float 2.0, i32 1`

			`%tmp2 = getelementptr float, float addrspace(3)* %inptr, i32 26`
			`%vaddrs = bitcast float addrspace(3)* %tmp2 to <2 x float> addrspace(3)*`
			`store <2 x float> %data, <2 x float> addrspace(3)* %vaddrs, align 4`

			`%vaddr1 = getelementptr float, float addrspace(3)* %inptr, i32 26`
			`%v1 = load float, float addrspace(3)* %vaddr1, align 4`

			`%sum = fadd float %v0, %v1`
			`store float %sum, float addrspace(1)* %out, align 4`
			`ret void`
			`}`