llvm-project/llvm/test/CodeGen/AMDGPU/operand-folding.ll

; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -early-live-intervals < %s | FileCheck %s

; The add of %fold and the work-item id is expected to take a scalar register
; (s...) directly as its source operand.
; CHECK-LABEL: {{^}}fold_sgpr:
; CHECK: v_add_i32_e32 v{{[0-9]+}}, vcc, s
define amdgpu_kernel void @fold_sgpr(i32 addrspace(1)* %out, i32 %fold) #1 {
entry:
  %fold.nonzero = icmp ne i32 %fold, 0
  br i1 %fold.nonzero, label %do.store, label %exit

do.store:
  ; Index %out by (%fold + work-item id) and write a zero there.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %index = add i32 %fold, %tid
  %slot = getelementptr i32, i32 addrspace(1)* %out, i32 %index
  store i32 0, i32 addrspace(1)* %slot
  br label %exit

exit:
  ret void
}

; The constant operand of the 'or' (3 + 2 = 5) is expected to show up as the
; immediate 5 in the emitted v_or_b32.
; CHECK-LABEL: {{^}}fold_imm:
; CHECK: v_or_b32_e32 v{{[0-9]+}}, 5
define amdgpu_kernel void @fold_imm(i32 addrspace(1)* %out, i32 %cmp) #1 {
entry:
  %five = add i32 3, 2
  %cmp.nonzero = icmp ne i32 %cmp, 0
  br i1 %cmp.nonzero, label %do.store, label %exit

do.store:
  ; Combine the work-item id with the constant and store the result to %out.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %ored = or i32 %tid, %five
  store i32 %ored, i32 addrspace(1)* %out
  br label %exit

exit:
  ret void
}

; Adding the 64-bit constant 1 must not produce an s_mov_b64; the expected
; output uses 1 and 0 as immediates of the s_add_u32 / s_addc_u32 pair and then
; copies both halves into VGPRs for the store.
; CHECK-LABEL: {{^}}fold_64bit_constant_add:
; CHECK-NOT: s_mov_b64
; FIXME: It would be better if we could use v_add here and drop the extra
; v_mov_b32 instructions.
; CHECK-DAG: s_add_u32 [[LO:s[0-9]+]], s{{[0-9]+}}, 1
; CHECK-DAG: s_addc_u32 [[HI:s[0-9]+]], s{{[0-9]+}}, 0
; CHECK-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[LO]]
; CHECK-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[HI]]
; CHECK: buffer_store_dwordx2 v[[[VLO]]:[[VHI]]],

define amdgpu_kernel void @fold_64bit_constant_add(i64 addrspace(1)* %out, i32 %cmp, i64 %val) #1 {
entry:
  ; %cmp is not used by the body; only %val + 1 is stored.
  %val.plus1 = add i64 %val, 1
  store i64 %val.plus1, i64 addrspace(1)* %out
  ret void
}

; Inline constants should always be folded, even with several uses: each of the
; four per-element xors is expected to carry the constant 5 as an immediate.

; CHECK-LABEL: {{^}}vector_inline:
; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}
; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}
; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}
; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}

define amdgpu_kernel void @vector_inline(<4 x i32> addrspace(1)* %out) #1 {
entry:
  ; Build <tid, tid+1, tid+2, tid+3> so that every lane of the vector differs.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.plus1 = add i32 %tid, 1
  %tid.plus2 = add i32 %tid, 2
  %tid.plus3 = add i32 %tid, 3
  %elts.1 = insertelement <4 x i32> undef, i32 %tid, i32 0
  %elts.2 = insertelement <4 x i32> %elts.1, i32 %tid.plus1, i32 1
  %elts.3 = insertelement <4 x i32> %elts.2, i32 %tid.plus2, i32 2
  %elts.4 = insertelement <4 x i32> %elts.3, i32 %tid.plus3, i32 3
  %xored = xor <4 x i32> <i32 5, i32 5, i32 5, i32 5>, %elts.4
  store <4 x i32> %xored, <4 x i32> addrspace(1)* %out
  ret void
}

; An immediate with a single use should be folded: 100 is expected to appear as
; the literal 0x64 directly in the xor.
; CHECK-LABEL: {{^}}imm_one_use:
; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 0x64, v{{[0-9]+}}

define amdgpu_kernel void @imm_one_use(i32 addrspace(1)* %out) #1 {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %xored = xor i32 %tid, 100
  store i32 %xored, i32 addrspace(1)* %out
  ret void
}
; An immediate with several uses is expected to be materialized once into an
; SGPR (s_movk_i32 ..., 0x64) and that register shared by all four xors, rather
; than the literal being repeated in each instruction.
; The register patterns use [0-9]+ so that the test does not depend on the
; allocator picking single-digit VGPRs.
; CHECK-LABEL: {{^}}vector_imm:
; CHECK: s_movk_i32 [[IMM:s[0-9]+]], 0x64
; CHECK: v_xor_b32_e32 v{{[0-9]+}}, [[IMM]], v{{[0-9]+}}
; CHECK: v_xor_b32_e32 v{{[0-9]+}}, [[IMM]], v{{[0-9]+}}
; CHECK: v_xor_b32_e32 v{{[0-9]+}}, [[IMM]], v{{[0-9]+}}
; CHECK: v_xor_b32_e32 v{{[0-9]+}}, [[IMM]], v{{[0-9]+}}

define amdgpu_kernel void @vector_imm(<4 x i32> addrspace(1)* %out) #1 {
entry:
  ; Build <id, id+1, id+2, id+3> and xor every element with the constant 100.
  %tmp0 = call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = add i32 %tmp0, 1
  %tmp2 = add i32 %tmp0, 2
  %tmp3 = add i32 %tmp0, 3
  %vec0 = insertelement <4 x i32> undef, i32 %tmp0, i32 0
  %vec1 = insertelement <4 x i32> %vec0, i32 %tmp1, i32 1
  %vec2 = insertelement <4 x i32> %vec1, i32 %tmp2, i32 2
  %vec3 = insertelement <4 x i32> %vec2, i32 %tmp3, i32 3
  %tmp4 = xor <4 x i32> <i32 100, i32 100, i32 100, i32 100>, %vec3
  store <4 x i32> %tmp4, <4 x i32> addrspace(1)* %out
  ret void
}

; A subregister use operand should not be tied. The expected output loads the
; pair v[LO:HI], accumulates into v[LO] with v_mac_f32 using the literal
; 0x41200000 (10.0) and v[HI] as sources, and stores v[LO].
; CHECK-LABEL: {{^}}no_fold_tied_subregister:
; CHECK: buffer_load_dwordx2 v[[[LO:[0-9]+]]:[[HI:[0-9]+]]]
; CHECK: v_mac_f32_e32 v[[LO]], 0x41200000, v[[HI]]
; CHECK: buffer_store_dword v[[LO]]
define amdgpu_kernel void @no_fold_tied_subregister() #1 {
  ; result = elt1 * 10.0 + elt0, with both elements taken from one volatile load.
  %pair = load volatile <2 x float>, <2 x float> addrspace(1)* undef
  %elt0 = extractelement <2 x float> %pair, i32 0
  %elt1 = extractelement <2 x float> %pair, i32 1
  %scaled = fmul float %elt1, 10.0
  %result = fadd float %scaled, %elt0
  store volatile float %result, float addrspace(1)* undef
  ret void
}

; There should be exactly one fold on the same operand. The expected output
; contains one xor with the immediate 0 folded in, followed by one xor that
; still has two register source operands.
; CHECK-LABEL: {{^}}no_extra_fold_on_same_opnd:
; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
; CHECK: v_xor_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define void @no_extra_fold_on_same_opnd() #1 {
entry:
  ; Zero-extended 32-bit value loaded from addrspace(5).
  %s0 = load i32, i32 addrspace(5)* undef, align 4
  %s0.i64 = zext i32 %s0 to i64
  br label %for.body.i.i

for.body.i.i:
  ; Sign-extended 32-bit value loaded from addrspace(1), xor'ed as a 64-bit
  ; quantity with the zero-extended one.
  %s1 = load i32, i32 addrspace(1)* undef, align 8
  %s1.i64 = sext i32 %s1 to i64
  %xor = xor i64 %s1.i64, %s0.i64
  %flag = icmp ult i64 %xor, 8
  br i1 %flag, label %if.then, label %if.else

if.then:
  unreachable

if.else:
  unreachable
}

; Intrinsic called by the test functions in this file; it takes no arguments
; and returns an i32.
declare i32 @llvm.amdgcn.workitem.id.x() #0

; Attribute group #0 is attached to the intrinsic declaration.
attributes #0 = { nounwind readnone }
; Attribute group #1 is attached to every function defined in this file; it
; selects the "preserve-sign" f32 denormal mode (see the LLVM LangRef entry for
; "denormal-fp-math-f32" for the exact semantics).
attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
[AMDGPU] Add gfx1030 target Differential Revision: https://reviews.llvm.org/D81886 2020-06-16 05:10:39 +08:00			`; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s \| FileCheck %s`
[LiveIntervals] Update subranges in processTiedPairs In TwoAddressInstructionPass::processTiedPairs when updating live intervals after moving the last use of RegB back to the newly inserted copy, update any affected subranges as well as the main range. Differential Revision: https://reviews.llvm.org/D110411 2021-09-24 20:57:45 +08:00			`; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -early-live-intervals < %s \| FileCheck %s`
R600/SI: Add SIFoldOperands pass This pass attempts to fold the source operands of mov and copy instructions into their uses. llvm-svn: 222581 2014-11-22 06:06:37 +08:00
			`; CHECK-LABEL: {{^}}fold_sgpr:`
AMDGPU: Add sdst operand to VOP2b instructions The VOP3 encoding of these allows any SGPR pair for the i1 output, but this was forced before to always use vcc. This doesn't yet try to use this, but does add the operand to the definitions so the main change is adding vcc to the output of the VOP2 encoding. llvm-svn: 246358 2015-08-29 15:16:50 +08:00			`; CHECK: v_add_i32_e32 v{{[0-9]+}}, vcc, s`
AMDGPU: Remove denormal subtarget features Switch to using the denormal-fp-math/denormal-fp-math-f32 attributes. 2019-11-18 19:18:07 +08:00			`define amdgpu_kernel void @fold_sgpr(i32 addrspace(1)* %out, i32 %fold) #1 {`
R600/SI: Add SIFoldOperands pass This pass attempts to fold the source operands of mov and copy instructions into their uses. llvm-svn: 222581 2014-11-22 06:06:37 +08:00			`entry:`
			`%tmp0 = icmp ne i32 %fold, 0`
			`br i1 %tmp0, label %if, label %endif`

			`if:`
AMDGPU: Remove some old intrinsic uses from tests llvm-svn: 260493 2016-02-11 14:02:01 +08:00			`%id = call i32 @llvm.amdgcn.workitem.id.x()`
R600/SI: Add SIFoldOperands pass This pass attempts to fold the source operands of mov and copy instructions into their uses. llvm-svn: 222581 2014-11-22 06:06:37 +08:00			`%offset = add i32 %fold, %id`
[opaque pointer type] Add textual IR support for explicit type parameter to getelementptr instruction One of several parallel first steps to remove the target type of pointers, replacing them with a single opaque pointer type. This adds an explicit type parameter to the gep instruction so that when the first parameter becomes an opaque pointer type, the type to gep through is still available to the instructions. * This doesn't modify gep operators, only instructions (operators will be handled separately) * Textual IR changes only. Bitcode (including upgrade) and changing the in-memory representation will be in separate changes. * geps of vectors are transformed as: getelementptr <4 x float> %x, ... ->getelementptr float, <4 x float> %x, ... Then, once the opaque pointer type is introduced, this will ultimately look like: getelementptr float, <4 x ptr> %x with the unambiguous interpretation that it is a vector of pointers to float. * address spaces remain on the pointer, not the type: getelementptr float addrspace(1)* %x ->getelementptr float, float addrspace(1)* %x Then, eventually: getelementptr float, ptr addrspace(1) %x Importantly, the massive amount of test case churn has been automated by same crappy python code. I had to manually update a few test cases that wouldn't fit the script's model (r228970,r229196,r229197,r229198). The python script just massages stdin and writes the result to stdout, I then wrapped that in a shell script to handle replacing files, then using the usual find+xargs to migrate all the files. 
update.py: import fileinput import sys import re ibrep = re.compile(r"(^.?[^%\w]getelementptr inbounds )(((?:<\d x )?)(.?)(\| addrspace\(\d\)) \(\|>)(?:$\| (?:%\|@\|null\|undef\|blockaddress\|getelementptr\|addrspacecast\|bitcast\|inttoptr\|\[\[[a-zA-Z]\|\{\{).$))") normrep = re.compile( r"(^.?[^%\w]getelementptr )(((?:<\d* x )?)(.?)(\| addrspace\(\d\)) \(\|>)(?:$\| (?:%\|@\|null\|undef\|blockaddress\|getelementptr\|addrspacecast\|bitcast\|inttoptr\|\[\[[a-zA-Z]\|\{\{).$))") def conv(match, line): if not match: return line line = match.groups()[0] if len(match.groups()[5]) == 0: line += match.groups()[2] line += match.groups()[3] line += ", " line += match.groups()[1] line += "\n" return line for line in sys.stdin: if line.find("getelementptr ") == line.find("getelementptr inbounds"): if line.find("getelementptr inbounds") != line.find("getelementptr inbounds ("): line = conv(re.match(ibrep, line), line) elif line.find("getelementptr ") != line.find("getelementptr ("): line = conv(re.match(normrep, line), line) sys.stdout.write(line) apply.sh: for name in "$@" do python3 `dirname "$0"`/update.py < "$name" > "$name.tmp" && mv "$name.tmp" "$name" rm -f "$name.tmp" done The actual commands: From llvm/src: find test/ -name .ll \| xargs ./apply.sh From llvm/src/tools/clang: find test/ -name .mm -o -name .m -o -name .cpp -o -name .c \| xargs -I '{}' ../../apply.sh "{}" From llvm/src/tools/polly: find test/ -name *.ll \| xargs ./apply.sh After that, check-all (with llvm, clang, clang-tools-extra, lld, compiler-rt, and polly all checked out). The extra 'rm' in the apply.sh script is due to a few files in clang's test suite using interesting unicode stuff that my python script was throwing exceptions on. None of those files needed to be migrated, so it seemed sufficient to ignore those cases. 
Reviewers: rafael, dexonsmith, grosser Differential Revision: http://reviews.llvm.org/D7636 llvm-svn: 230786 2015-02-28 03:29:02 +08:00			`%tmp1 = getelementptr i32, i32 addrspace(1)* %out, i32 %offset`
R600/SI: Add SIFoldOperands pass This pass attempts to fold the source operands of mov and copy instructions into their uses. llvm-svn: 222581 2014-11-22 06:06:37 +08:00			`store i32 0, i32 addrspace(1)* %tmp1`
			`br label %endif`

			`endif:`
			`ret void`
			`}`

			`; CHECK-LABEL: {{^}}fold_imm:`
[R600/SI] Fix testcase check line. Missing colon, instruction typo. llvm-svn: 233414 2015-03-28 04:41:42 +08:00			`; CHECK: v_or_b32_e32 v{{[0-9]+}}, 5`
AMDGPU: Remove denormal subtarget features Switch to using the denormal-fp-math/denormal-fp-math-f32 attributes. 2019-11-18 19:18:07 +08:00			`define amdgpu_kernel void @fold_imm(i32 addrspace(1)* %out, i32 %cmp) #1 {`
R600/SI: Add SIFoldOperands pass This pass attempts to fold the source operands of mov and copy instructions into their uses. llvm-svn: 222581 2014-11-22 06:06:37 +08:00			`entry:`
			`%fold = add i32 3, 2`
			`%tmp0 = icmp ne i32 %cmp, 0`
			`br i1 %tmp0, label %if, label %endif`

			`if:`
AMDGPU: Remove some old intrinsic uses from tests llvm-svn: 260493 2016-02-11 14:02:01 +08:00			`%id = call i32 @llvm.amdgcn.workitem.id.x()`
R600/SI: Add SIFoldOperands pass This pass attempts to fold the source operands of mov and copy instructions into their uses. llvm-svn: 222581 2014-11-22 06:06:37 +08:00			`%val = or i32 %id, %fold`
			`store i32 %val, i32 addrspace(1)* %out`
			`br label %endif`

			`endif:`
			`ret void`
			`}`

R600/SI: Teach SIFoldOperands to split 64-bit constants when folding This allows folding of sequences like: s[0:1] = s_mov_b64 4 v_add_i32 v0, s0, v0 v_addc_u32 v1, s1, v1 into v_add_i32 v0, 4, v0 v_add_i32 v1, 0, v1 llvm-svn: 225369 2015-01-08 03:56:17 +08:00			`; CHECK-LABEL: {{^}}fold_64bit_constant_add:`
			`; CHECK-NOT: s_mov_b64`
			`; FIXME: It would be better if we could use v_add here and drop the extra`
			`; v_mov_b32 instructions.`
			`; CHECK-DAG: s_add_u32 [[LO:s[0-9]+]], s{{[0-9]+}}, 1`
			`; CHECK-DAG: s_addc_u32 [[HI:s[0-9]+]], s{{[0-9]+}}, 0`
			`; CHECK-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[LO]]`
			`; CHECK-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[HI]]`
[CodeGen] Remove unneeded regex escaping in FileCheck patterns. NFC. Take advantage of D117117 to simplify all {{\[}} to [ and {{\]}} to ]. Differential Revision: https://reviews.llvm.org/D117298 2022-01-14 19:03:21 +08:00			`; CHECK: buffer_store_dwordx2 v[[[VLO]]:[[VHI]]],`
R600/SI: Teach SIFoldOperands to split 64-bit constants when folding This allows folding of sequences like: s[0:1] = s_mov_b64 4 v_add_i32 v0, s0, v0 v_addc_u32 v1, s1, v1 into v_add_i32 v0, 4, v0 v_add_i32 v1, 0, v1 llvm-svn: 225369 2015-01-08 03:56:17 +08:00
AMDGPU: Remove denormal subtarget features Switch to using the denormal-fp-math/denormal-fp-math-f32 attributes. 2019-11-18 19:18:07 +08:00			`define amdgpu_kernel void @fold_64bit_constant_add(i64 addrspace(1)* %out, i32 %cmp, i64 %val) #1 {`
R600/SI: Teach SIFoldOperands to split 64-bit constants when folding This allows folding of sequences like: s[0:1] = s_mov_b64 4 v_add_i32 v0, s0, v0 v_addc_u32 v1, s1, v1 into v_add_i32 v0, 4, v0 v_add_i32 v1, 0, v1 llvm-svn: 225369 2015-01-08 03:56:17 +08:00			`entry:`
			`%tmp0 = add i64 %val, 1`
			`store i64 %tmp0, i64 addrspace(1)* %out`
			`ret void`
			`}`

R600/SI: Only fold immediates that have one use Folding the same immediate into multiple instruction will increase program size, which can hurt performance. llvm-svn: 225405 2015-01-08 06:18:27 +08:00			`; Inline constants should always be folded.`

			`; CHECK-LABEL: {{^}}vector_inline:`
			`; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}`
			`; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}`
			`; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}`
			`; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}`

AMDGPU: Remove denormal subtarget features Switch to using the denormal-fp-math/denormal-fp-math-f32 attributes. 2019-11-18 19:18:07 +08:00			`define amdgpu_kernel void @vector_inline(<4 x i32> addrspace(1)* %out) #1 {`
R600/SI: Only fold immediates that have one use Folding the same immediate into multiple instruction will increase program size, which can hurt performance. llvm-svn: 225405 2015-01-08 06:18:27 +08:00			`entry:`
AMDGPU: Remove some old intrinsic uses from tests llvm-svn: 260493 2016-02-11 14:02:01 +08:00			`%tmp0 = call i32 @llvm.amdgcn.workitem.id.x()`
R600/SI: Only fold immediates that have one use Folding the same immediate into multiple instruction will increase program size, which can hurt performance. llvm-svn: 225405 2015-01-08 06:18:27 +08:00			`%tmp1 = add i32 %tmp0, 1`
			`%tmp2 = add i32 %tmp0, 2`
			`%tmp3 = add i32 %tmp0, 3`
			`%vec0 = insertelement <4 x i32> undef, i32 %tmp0, i32 0`
			`%vec1 = insertelement <4 x i32> %vec0, i32 %tmp1, i32 1`
			`%vec2 = insertelement <4 x i32> %vec1, i32 %tmp2, i32 2`
			`%vec3 = insertelement <4 x i32> %vec2, i32 %tmp3, i32 3`
			`%tmp4 = xor <4 x i32> <i32 5, i32 5, i32 5, i32 5>, %vec3`
			`store <4 x i32> %tmp4, <4 x i32> addrspace(1)* %out`
			`ret void`
			`}`

			`; Immediates with one use should be folded`
			`; CHECK-LABEL: {{^}}imm_one_use:`
			`; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 0x64, v{{[0-9]+}}`

AMDGPU: Remove denormal subtarget features Switch to using the denormal-fp-math/denormal-fp-math-f32 attributes. 2019-11-18 19:18:07 +08:00			`define amdgpu_kernel void @imm_one_use(i32 addrspace(1)* %out) #1 {`
R600/SI: Only fold immediates that have one use Folding the same immediate into multiple instruction will increase program size, which can hurt performance. llvm-svn: 225405 2015-01-08 06:18:27 +08:00			`entry:`
AMDGPU: Remove some old intrinsic uses from tests llvm-svn: 260493 2016-02-11 14:02:01 +08:00			`%tmp0 = call i32 @llvm.amdgcn.workitem.id.x()`
R600/SI: Only fold immediates that have one use Folding the same immediate into multiple instruction will increase program size, which can hurt performance. llvm-svn: 225405 2015-01-08 06:18:27 +08:00			`%tmp1 = xor i32 %tmp0, 100`
			`store i32 %tmp1, i32 addrspace(1)* %out`
			`ret void`
			`}`
R600/SI: Remove SIISelLowering::legalizeOperands() Its functionality has been replaced by calling SIInstrInfo::legalizeOperands() from SIISelLowering::AdjstInstrPostInstrSelection() and running the SIFoldOperands and SIShrinkInstructions passes. llvm-svn: 225445 2015-01-08 23:08:17 +08:00			`; CHECK-LABEL: {{^}}vector_imm:`
			`; CHECK: s_movk_i32 [[IMM:s[0-9]+]], 0x64`
			`; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}}`
			`; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}}`
			`; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}}`
			`; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}}`

AMDGPU: Remove denormal subtarget features Switch to using the denormal-fp-math/denormal-fp-math-f32 attributes. 2019-11-18 19:18:07 +08:00			`define amdgpu_kernel void @vector_imm(<4 x i32> addrspace(1)* %out) #1 {`
R600/SI: Remove SIISelLowering::legalizeOperands() Its functionality has been replaced by calling SIInstrInfo::legalizeOperands() from SIISelLowering::AdjstInstrPostInstrSelection() and running the SIFoldOperands and SIShrinkInstructions passes. llvm-svn: 225445 2015-01-08 23:08:17 +08:00			`entry:`
AMDGPU: Remove some old intrinsic uses from tests llvm-svn: 260493 2016-02-11 14:02:01 +08:00			`%tmp0 = call i32 @llvm.amdgcn.workitem.id.x()`
R600/SI: Remove SIISelLowering::legalizeOperands() Its functionality has been replaced by calling SIInstrInfo::legalizeOperands() from SIISelLowering::AdjstInstrPostInstrSelection() and running the SIFoldOperands and SIShrinkInstructions passes. llvm-svn: 225445 2015-01-08 23:08:17 +08:00			`%tmp1 = add i32 %tmp0, 1`
			`%tmp2 = add i32 %tmp0, 2`
			`%tmp3 = add i32 %tmp0, 3`
			`%vec0 = insertelement <4 x i32> undef, i32 %tmp0, i32 0`
			`%vec1 = insertelement <4 x i32> %vec0, i32 %tmp1, i32 1`
			`%vec2 = insertelement <4 x i32> %vec1, i32 %tmp2, i32 2`
			`%vec3 = insertelement <4 x i32> %vec2, i32 %tmp3, i32 3`
			`%tmp4 = xor <4 x i32> <i32 100, i32 100, i32 100, i32 100>, %vec3`
			`store <4 x i32> %tmp4, <4 x i32> addrspace(1)* %out`
			`ret void`
			`}`
R600/SI: Only fold immediates that have one use Folding the same immediate into multiple instruction will increase program size, which can hurt performance. llvm-svn: 225405 2015-01-08 06:18:27 +08:00
AMDGPU: Don't fold subregister extracts into tied operands llvm-svn: 278676 2016-08-16 00:18:36 +08:00			`; A subregister use operand should not be tied.`
			`; CHECK-LABEL: {{^}}no_fold_tied_subregister:`
[CodeGen] Remove unneeded regex escaping in FileCheck patterns. NFC. Take advantage of D117117 to simplify all {{\[}} to [ and {{\]}} to ]. Differential Revision: https://reviews.llvm.org/D117298 2022-01-14 19:03:21 +08:00			`; CHECK: buffer_load_dwordx2 v[[[LO:[0-9]+]]:[[HI:[0-9]+]]]`
AMDGPU: Don't fold subregister extracts into tied operands llvm-svn: 278676 2016-08-16 00:18:36 +08:00			`; CHECK: v_mac_f32_e32 v[[LO]], 0x41200000, v[[HI]]`
			`; CHECK: buffer_store_dword v[[LO]]`
AMDGPU: Remove denormal subtarget features Switch to using the denormal-fp-math/denormal-fp-math-f32 attributes. 2019-11-18 19:18:07 +08:00			`define amdgpu_kernel void @no_fold_tied_subregister() #1 {`
AMDGPU: Don't fold subregister extracts into tied operands llvm-svn: 278676 2016-08-16 00:18:36 +08:00			`%tmp1 = load volatile <2 x float>, <2 x float> addrspace(1)* undef`
			`%tmp2 = extractelement <2 x float> %tmp1, i32 0`
			`%tmp3 = extractelement <2 x float> %tmp1, i32 1`
			`%tmp4 = fmul float %tmp3, 10.0`
			`%tmp5 = fadd float %tmp4, %tmp2`
			`store volatile float %tmp5, float addrspace(1)* undef`
			`ret void`
			`}`

[AMDGPU] Skip additional folding on the same operand. Reviewers: rampitec, arsenm Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D69355 2019-10-24 03:19:06 +08:00			`; There should be exact one folding on the same operand.`
			`; CHECK-LABEL: {{^}}no_extra_fold_on_same_opnd`
			`; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}`
			`; CHECK: v_xor_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}`
AMDGPU: Remove denormal subtarget features Switch to using the denormal-fp-math/denormal-fp-math-f32 attributes. 2019-11-18 19:18:07 +08:00			`define void @no_extra_fold_on_same_opnd() #1 {`
[AMDGPU] Skip additional folding on the same operand. Reviewers: rampitec, arsenm Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D69355 2019-10-24 03:19:06 +08:00			`entry:`
			`%s0 = load i32, i32 addrspace(5)* undef, align 4`
			`%s0.i64= zext i32 %s0 to i64`
			`br label %for.body.i.i`

			`for.body.i.i:`
			`%s1 = load i32, i32 addrspace(1)* undef, align 8`
			`%s1.i64 = sext i32 %s1 to i64`
			`%xor = xor i64 %s1.i64, %s0.i64`
			`%flag = icmp ult i64 %xor, 8`
			`br i1 %flag, label %if.then, label %if.else`

			`if.then:`
			`unreachable`

			`if.else:`
			`unreachable`
			`}`

AMDGPU: Remove some old intrinsic uses from tests llvm-svn: 260493 2016-02-11 14:02:01 +08:00			`declare i32 @llvm.amdgcn.workitem.id.x() #0`

			`attributes #0 = { nounwind readnone }`
AMDGPU: Remove denormal subtarget features Switch to using the denormal-fp-math/denormal-fp-math-f32 attributes. 2019-11-18 19:18:07 +08:00			`attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }`