; llvm-project/llvm/test/CodeGen/ARM/lsr-unfolded-offset.ll

; RUN: llc -regalloc=greedy -arm-atomic-cfg-tidy=0 < %s | FileCheck %s

; LSR shouldn't introduce more induction variables than needed, because extra
; induction variables increase register pressure and therefore spilling. There
; is still room for improvement here.

; Frame-size assertion: the stack adjustment emitted for the function must be
; one of the listed byte counts. Presumably this caps how much spill space the
; register allocator is allowed to use for this loop nest.
; CHECK: sub sp, #{{40|36|32|28|24}}

; Loop-latch assertion: once the %for.inc block label has been matched in the
; output, no "ldr" may appear before the next "add" is matched.
; CHECK: %for.inc
; CHECK-NOT: ldr
; CHECK: add

; Module-level data layout and target triple used by llc for this test.
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32"
target triple = "thumbv7-apple-ios"

; Record type walked by @partition_overlap_check: two i32 fields followed by
; two i64 fields. The function only reads fields 2 and 3, which its value
; names call "offset" and "len".
%struct.partition_entry = type { i32, i32, i64, i64 }

; @partition_overlap_check - doubly nested loop used as input for the LSR /
; register-pressure assertions at the top of this file.
;
; Parameters:
;   %part        - pointer to the first %struct.partition_entry record.
;   %num_entries - number of records to visit (signed; values <= 0 skip all
;                  loops).
;
; Behaviour: every record i (outer index %0) is compared against every other
; record j (inner index %1). For each record, "start" is field 2 and "end" is
; field 2 + field 3. A flag that starts at 0 is set to 1 when any of the three
; range comparisons in %if.end / %if.then66 holds; it is never cleared once set.
;
; Returns: 0 when %num_entries <= 0, otherwise the final value of the flag.
define i32 @partition_overlap_check(%struct.partition_entry* nocapture %part, i32 %num_entries) nounwind readonly optsize ssp "no-frame-pointer-elim"="true" {
entry:
  ; Skip both loops entirely unless %num_entries > 0 (signed compare).
  %cmp79 = icmp sgt i32 %num_entries, 0
  br i1 %cmp79, label %outer.loop, label %for.end72

outer.loop:                                 ; preds = %for.inc69, %entry
  ; %overlap.081 is the flag carried across outer iterations; %0 is index i.
  %overlap.081 = phi i32 [ %overlap.4, %for.inc69 ], [ 0, %entry ]
  %0 = phi i32 [ %inc71, %for.inc69 ], [ 0, %entry ]
  ; Load record i's field 2 (%tmp5, start) and field 3 (%tmp15, length) once
  ; per outer iteration; %add is start + length, i.e. record i's end.
  %offset = getelementptr %struct.partition_entry, %struct.partition_entry* %part, i32 %0, i32 2
  %len = getelementptr %struct.partition_entry, %struct.partition_entry* %part, i32 %0, i32 3
  %tmp5 = load i64, i64* %offset, align 4
  %tmp15 = load i64, i64* %len, align 4
  %add = add nsw i64 %tmp15, %tmp5
  br label %inner.loop

inner.loop:                                       ; preds = %for.inc, %outer.loop
  ; %overlap.178 is the flag on entry to this inner iteration; %1 is index j.
  %overlap.178 = phi i32 [ %overlap.081, %outer.loop ], [ %overlap.4, %for.inc ]
  %1 = phi i32 [ 0, %outer.loop ], [ %inc, %for.inc ]
  ; A record is not compared against itself: i == j goes straight to the latch.
  %cmp23 = icmp eq i32 %0, %1
  br i1 %cmp23, label %for.inc, label %if.end

if.end:                                           ; preds = %inner.loop
  ; Load record j's start (%tmp29) and length (%tmp40); %add41 is its end.
  %len39 = getelementptr %struct.partition_entry, %struct.partition_entry* %part, i32 %1, i32 3
  %offset28 = getelementptr %struct.partition_entry, %struct.partition_entry* %part, i32 %1, i32 2
  %tmp29 = load i64, i64* %offset28, align 4
  %tmp40 = load i64, i64* %len39, align 4
  %add41 = add nsw i64 %tmp40, %tmp29
  ; Case 1: i.start <= j.start < i.end  -> flag becomes 1.
  %cmp44 = icmp sge i64 %tmp29, %tmp5
  %cmp47 = icmp slt i64 %tmp29, %add
  %or.cond = and i1 %cmp44, %cmp47
  %overlap.2 = select i1 %or.cond, i32 1, i32 %overlap.178
  ; Case 2: i.start < j.end <= i.end  -> flag becomes 1.
  %cmp52 = icmp sle i64 %add41, %add
  %cmp56 = icmp sgt i64 %add41, %tmp5
  %or.cond74 = and i1 %cmp52, %cmp56
  %overlap.3 = select i1 %or.cond74, i32 1, i32 %overlap.2
  ; Case 3: if neither j.start > i.start nor j.end < i.end holds (that is,
  ; j.start <= i.start and j.end >= i.end), fall into %if.then66, which
  ; forces the flag to 1. Otherwise keep %overlap.3.
  %cmp61 = icmp sgt i64 %tmp29, %tmp5
  %cmp65 = icmp slt i64 %add41, %add
  %or.cond75 = or i1 %cmp61, %cmp65
  br i1 %or.cond75, label %for.inc, label %if.then66

if.then66:                                        ; preds = %if.end
  ; Empty block: it exists only so the phi in %for.inc can select constant 1.
  br label %for.inc

for.inc:                                          ; preds = %if.end, %if.then66, %inner.loop
  ; Inner-loop latch: merge the flag from the three incoming paths, advance j,
  ; and leave the inner loop once j reaches %num_entries.
  %overlap.4 = phi i32 [ %overlap.178, %inner.loop ], [ 1, %if.then66 ], [ %overlap.3, %if.end ]
  %inc = add nsw i32 %1, 1
  %exitcond = icmp eq i32 %inc, %num_entries
  br i1 %exitcond, label %for.inc69, label %inner.loop

for.inc69:                                        ; preds = %for.inc
  ; Outer-loop latch: advance i and exit once i reaches %num_entries.
  %inc71 = add nsw i32 %0, 1
  %exitcond83 = icmp eq i32 %inc71, %num_entries
  br i1 %exitcond83, label %for.end72, label %outer.loop

for.end72:                                        ; preds = %for.inc69, %entry
  ; Result is 0 when the loops were skipped, otherwise the final flag value.
  %overlap.0.lcssa = phi i32 [ 0, %entry ], [ %overlap.4, %for.inc69 ]
  ret i32 %overlap.0.lcssa
}
ARM & AArch64: make use of common cmpxchg idioms after expansion The C and C++ semantics for compare_exchange require it to return a bool indicating success. This gets mapped to LLVM IR which follows each cmpxchg with an icmp of the value loaded against the desired value. When lowered to ldxr/stxr loops, this extra comparison is redundant: its results are implicit in the control-flow of the function. This commit makes two changes: it replaces that icmp with appropriate PHI nodes, and then makes sure earlyCSE is called after expansion to actually make use of the opportunities revealed. I've also added -{arm,aarch64}-enable-atomic-tidy options, so that existing fragile tests aren't perturbed too much by the change. Many of them either rely on undef/unreachable too pervasively to be restored to something well-defined (particularly while making sure they test the same obscure assert from many years ago), or depend on a particular CFG shape, which is disrupted by SimplifyCFG. rdar://problem/16227836 llvm-svn: 209883 2014-05-30 18:09:59 +08:00			`; RUN: llc -regalloc=greedy -arm-atomic-cfg-tidy=0 < %s \| FileCheck %s`
Add an unfolded offset field to LSR's Formula record. This is used to model constants which can be added to base registers via add-immediate instructions which don't require an additional register to materialize the immediate. llvm-svn: 130743 2011-05-03 08:46:49 +08:00
			`; LSR shouldn't introduce more induction variables than needed, increasing`
			`; register pressure and therefore spilling. There is more room for improvement`
			`; here.`

PGO branch weight: update edge weights in SelectionDAGBuilder. When converting from "or + br" to two branches, or converting from "and + br" to two branches, we correctly update the edge weights of the two branches. The previous attempt at r200431 was reverted at r200434 because of two testing case failures. I modified my patch a little, but forgot to re-run "make check-all". Testing case CodeGen/ARM/lsr-unfolded-offset.ll is updated because of the patch's impact on branch probability which causes changes in spill placement. llvm-svn: 200502 2014-01-31 08:42:44 +08:00			`; CHECK: sub sp, #{{40\|36\|32\|28\|24}}`
Add an unfolded offset field to LSR's Formula record. This is used to model constants which can be added to base registers via add-immediate instructions which don't require an additional register to materialize the immediate. llvm-svn: 130743 2011-05-03 08:46:49 +08:00
Be more aggressive about following hints. RAGreedy::tryAssign will now evict interference from the preferred register even when another register is free. To support this, add the EvictionCost struct that counts how many hints are broken by an eviction. We don't want to break one hint just to satisfy another. Rename canEvict to shouldEvict, and add the first bit of eviction policy that doesn't depend on spill weights: Always make room in the preferred register as long as the evictees can be split and aren't already assigned to their preferred register. Also make the CSR avoidance more accurate. When looking for a cheaper register it is OK to use a new volatile register. Only CSR aliases that have never been used before should be avoided. llvm-svn: 134735 2011-07-09 04:46:18 +08:00			`; CHECK: %for.inc`
Switch spill weights from a basic loop depth estimation to BlockFrequencyInfo. The main advantages here are way better heuristics, taking into account not just loop depth but also __builtin_expect and other static heuristics and will eventually learn how to use profile info. Most of the work in this patch is pushing the MachineBlockFrequencyInfo analysis into the right places. This is good for a 5% speedup on zlib's deflate (x86_64), there were some very unfortunate spilling decisions in its hottest loop in longest_match(). Other benchmarks I tried were mostly neutral. This changes register allocation in subtle ways, update the tests for it. 2012-02-20-MachineCPBug.ll was deleted as it's very fragile and the instruction it looked for was gone already (but the FileCheck pattern picked up unrelated stuff). llvm-svn: 184105 2013-06-18 03:00:36 +08:00			`; CHECK-NOT: ldr`
Be more aggressive about following hints. RAGreedy::tryAssign will now evict interference from the preferred register even when another register is free. To support this, add the EvictionCost struct that counts how many hints are broken by an eviction. We don't want to break one hint just to satisfy another. Rename canEvict to shouldEvict, and add the first bit of eviction policy that doesn't depend on spill weights: Always make room in the preferred register as long as the evictees can be split and aren't already assigned to their preferred register. Also make the CSR avoidance more accurate. When looking for a cheaper register it is OK to use a new volatile register. Only CSR aliases that have never been used before should be avoided. llvm-svn: 134735 2011-07-09 04:46:18 +08:00			`; CHECK: add`
Add an unfolded offset field to LSR's Formula record. This is used to model constants which can be added to base registers via add-immediate instructions which don't require an additional register to materialize the immediate. llvm-svn: 130743 2011-05-03 08:46:49 +08:00
			`target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32"`
Fix more places which should be checking for iOS, not darwin. llvm-svn: 147513 2012-01-04 09:55:04 +08:00			`target triple = "thumbv7-apple-ios"`
Add an unfolded offset field to LSR's Formula record. This is used to model constants which can be added to base registers via add-immediate instructions which don't require an additional register to materialize the immediate. llvm-svn: 130743 2011-05-03 08:46:49 +08:00
			`%struct.partition_entry = type { i32, i32, i64, i64 }`

[ARM] Generate consistent frame records for Thumb2 There is not an official documented ABI for frame pointers in Thumb2, but we should try to emit something which is useful. We use r7 as the frame pointer for Thumb code, which currently means that if a function needs to save a high register (r8-r11), it will get pushed to the stack between the frame pointer (r7) and link register (r14). This means that while a stack unwinder can follow the chain of frame pointers up the stack, it cannot know the offset to lr, so does not know which functions correspond to the stack frames. To fix this, we need to push the callee-saved registers in two batches, with the first push saving the low registers, fp and lr, and the second push saving the high registers. This is already implemented, but previously only used for iOS. This patch turns it on for all Thumb2 targets when frame pointers are required by the ABI, and the frame pointer is r7 (Windows uses r11, so this isn't a problem there). If frame pointer elimination is enabled we still emit a single push/pop even if we need a frame pointer for other reasons, to avoid increasing code size. We must also ensure that lr is pushed to the stack when using a frame pointer, so that we end up with a complete frame record. Situations that could cause this were rare, because we already push lr in most situations so that we can return using the pop instruction. Differential Revision: https://reviews.llvm.org/D23516 llvm-svn: 279506 2016-08-23 17:19:22 +08:00			`define i32 @partition_overlap_check(%struct.partition_entry* nocapture %part, i32 %num_entries) nounwind readonly optsize ssp "no-frame-pointer-elim"="true" {`
Add an unfolded offset field to LSR's Formula record. This is used to model constants which can be added to base registers via add-immediate instructions which don't require an additional register to materialize the immediate. llvm-svn: 130743 2011-05-03 08:46:49 +08:00			`entry:`
			`%cmp79 = icmp sgt i32 %num_entries, 0`
			`br i1 %cmp79, label %outer.loop, label %for.end72`

			`outer.loop: ; preds = %for.inc69, %entry`
			`%overlap.081 = phi i32 [ %overlap.4, %for.inc69 ], [ 0, %entry ]`
			`%0 = phi i32 [ %inc71, %for.inc69 ], [ 0, %entry ]`
[opaque pointer type] Add textual IR support for explicit type parameter to getelementptr instruction One of several parallel first steps to remove the target type of pointers, replacing them with a single opaque pointer type. This adds an explicit type parameter to the gep instruction so that when the first parameter becomes an opaque pointer type, the type to gep through is still available to the instructions. * This doesn't modify gep operators, only instructions (operators will be handled separately) * Textual IR changes only. Bitcode (including upgrade) and changing the in-memory representation will be in separate changes. * geps of vectors are transformed as: getelementptr <4 x float> %x, ... ->getelementptr float, <4 x float> %x, ... Then, once the opaque pointer type is introduced, this will ultimately look like: getelementptr float, <4 x ptr> %x with the unambiguous interpretation that it is a vector of pointers to float. * address spaces remain on the pointer, not the type: getelementptr float addrspace(1)* %x ->getelementptr float, float addrspace(1)* %x Then, eventually: getelementptr float, ptr addrspace(1) %x Importantly, the massive amount of test case churn has been automated by same crappy python code. I had to manually update a few test cases that wouldn't fit the script's model (r228970,r229196,r229197,r229198). The python script just massages stdin and writes the result to stdout, I then wrapped that in a shell script to handle replacing files, then using the usual find+xargs to migrate all the files. 
update.py: import fileinput import sys import re ibrep = re.compile(r"(^.?[^%\w]getelementptr inbounds )(((?:<\d x )?)(.?)(\| addrspace\(\d\)) \(\|>)(?:$\| (?:%\|@\|null\|undef\|blockaddress\|getelementptr\|addrspacecast\|bitcast\|inttoptr\|\[\[[a-zA-Z]\|\{\{).$))") normrep = re.compile( r"(^.?[^%\w]getelementptr )(((?:<\d* x )?)(.?)(\| addrspace\(\d\)) \(\|>)(?:$\| (?:%\|@\|null\|undef\|blockaddress\|getelementptr\|addrspacecast\|bitcast\|inttoptr\|\[\[[a-zA-Z]\|\{\{).$))") def conv(match, line): if not match: return line line = match.groups()[0] if len(match.groups()[5]) == 0: line += match.groups()[2] line += match.groups()[3] line += ", " line += match.groups()[1] line += "\n" return line for line in sys.stdin: if line.find("getelementptr ") == line.find("getelementptr inbounds"): if line.find("getelementptr inbounds") != line.find("getelementptr inbounds ("): line = conv(re.match(ibrep, line), line) elif line.find("getelementptr ") != line.find("getelementptr ("): line = conv(re.match(normrep, line), line) sys.stdout.write(line) apply.sh: for name in "$@" do python3 `dirname "$0"`/update.py < "$name" > "$name.tmp" && mv "$name.tmp" "$name" rm -f "$name.tmp" done The actual commands: From llvm/src: find test/ -name .ll \| xargs ./apply.sh From llvm/src/tools/clang: find test/ -name .mm -o -name .m -o -name .cpp -o -name .c \| xargs -I '{}' ../../apply.sh "{}" From llvm/src/tools/polly: find test/ -name *.ll \| xargs ./apply.sh After that, check-all (with llvm, clang, clang-tools-extra, lld, compiler-rt, and polly all checked out). The extra 'rm' in the apply.sh script is due to a few files in clang's test suite using interesting unicode stuff that my python script was throwing exceptions on. None of those files needed to be migrated, so it seemed sufficient to ignore those cases. 
Reviewers: rafael, dexonsmith, grosser Differential Revision: http://reviews.llvm.org/D7636 llvm-svn: 230786 2015-02-28 03:29:02 +08:00			`%offset = getelementptr %struct.partition_entry, %struct.partition_entry* %part, i32 %0, i32 2`
			`%len = getelementptr %struct.partition_entry, %struct.partition_entry* %part, i32 %0, i32 3`
[opaque pointer type] Add textual IR support for explicit type parameter to load instruction Essentially the same as the GEP change in r230786. A similar migration script can be used to update test cases, though a few more test case improvements/changes were required this time around: (r229269-r229278) import fileinput import sys import re pat = re.compile(r"((?:=\|:\|^)\sload (?:atomic )?(?:volatile )?(.?))(\| addrspace\(\d+\) )\($\| (?:%\|@\|null\|undef\|blockaddress\|getelementptr\|addrspacecast\|bitcast\|inttoptr\|\[\[[a-zA-Z]\|\{\{).$)") for line in sys.stdin: sys.stdout.write(re.sub(pat, r"\1, \2\3*\4", line)) Reviewers: rafael, dexonsmith, grosser Differential Revision: http://reviews.llvm.org/D7649 llvm-svn: 230794 2015-02-28 05:17:42 +08:00			`%tmp5 = load i64, i64* %offset, align 4`
			`%tmp15 = load i64, i64* %len, align 4`
Add an unfolded offset field to LSR's Formula record. This is used to model constants which can be added to base registers via add-immediate instructions which don't require an additional register to materialize the immediate. llvm-svn: 130743 2011-05-03 08:46:49 +08:00			`%add = add nsw i64 %tmp15, %tmp5`
			`br label %inner.loop`

			`inner.loop: ; preds = %for.inc, %outer.loop`
			`%overlap.178 = phi i32 [ %overlap.081, %outer.loop ], [ %overlap.4, %for.inc ]`
			`%1 = phi i32 [ 0, %outer.loop ], [ %inc, %for.inc ]`
			`%cmp23 = icmp eq i32 %0, %1`
			`br i1 %cmp23, label %for.inc, label %if.end`

			`if.end: ; preds = %inner.loop`
[opaque pointer type] Add textual IR support for explicit type parameter to getelementptr instruction One of several parallel first steps to remove the target type of pointers, replacing them with a single opaque pointer type. This adds an explicit type parameter to the gep instruction so that when the first parameter becomes an opaque pointer type, the type to gep through is still available to the instructions. * This doesn't modify gep operators, only instructions (operators will be handled separately) * Textual IR changes only. Bitcode (including upgrade) and changing the in-memory representation will be in separate changes. * geps of vectors are transformed as: getelementptr <4 x float> %x, ... ->getelementptr float, <4 x float> %x, ... Then, once the opaque pointer type is introduced, this will ultimately look like: getelementptr float, <4 x ptr> %x with the unambiguous interpretation that it is a vector of pointers to float. * address spaces remain on the pointer, not the type: getelementptr float addrspace(1)* %x ->getelementptr float, float addrspace(1)* %x Then, eventually: getelementptr float, ptr addrspace(1) %x Importantly, the massive amount of test case churn has been automated by same crappy python code. I had to manually update a few test cases that wouldn't fit the script's model (r228970,r229196,r229197,r229198). The python script just massages stdin and writes the result to stdout, I then wrapped that in a shell script to handle replacing files, then using the usual find+xargs to migrate all the files. 
update.py: import fileinput import sys import re ibrep = re.compile(r"(^.?[^%\w]getelementptr inbounds )(((?:<\d x )?)(.?)(\| addrspace\(\d\)) \(\|>)(?:$\| (?:%\|@\|null\|undef\|blockaddress\|getelementptr\|addrspacecast\|bitcast\|inttoptr\|\[\[[a-zA-Z]\|\{\{).$))") normrep = re.compile( r"(^.?[^%\w]getelementptr )(((?:<\d* x )?)(.?)(\| addrspace\(\d\)) \(\|>)(?:$\| (?:%\|@\|null\|undef\|blockaddress\|getelementptr\|addrspacecast\|bitcast\|inttoptr\|\[\[[a-zA-Z]\|\{\{).$))") def conv(match, line): if not match: return line line = match.groups()[0] if len(match.groups()[5]) == 0: line += match.groups()[2] line += match.groups()[3] line += ", " line += match.groups()[1] line += "\n" return line for line in sys.stdin: if line.find("getelementptr ") == line.find("getelementptr inbounds"): if line.find("getelementptr inbounds") != line.find("getelementptr inbounds ("): line = conv(re.match(ibrep, line), line) elif line.find("getelementptr ") != line.find("getelementptr ("): line = conv(re.match(normrep, line), line) sys.stdout.write(line) apply.sh: for name in "$@" do python3 `dirname "$0"`/update.py < "$name" > "$name.tmp" && mv "$name.tmp" "$name" rm -f "$name.tmp" done The actual commands: From llvm/src: find test/ -name .ll \| xargs ./apply.sh From llvm/src/tools/clang: find test/ -name .mm -o -name .m -o -name .cpp -o -name .c \| xargs -I '{}' ../../apply.sh "{}" From llvm/src/tools/polly: find test/ -name *.ll \| xargs ./apply.sh After that, check-all (with llvm, clang, clang-tools-extra, lld, compiler-rt, and polly all checked out). The extra 'rm' in the apply.sh script is due to a few files in clang's test suite using interesting unicode stuff that my python script was throwing exceptions on. None of those files needed to be migrated, so it seemed sufficient to ignore those cases. 
Reviewers: rafael, dexonsmith, grosser Differential Revision: http://reviews.llvm.org/D7636 llvm-svn: 230786 2015-02-28 03:29:02 +08:00			`%len39 = getelementptr %struct.partition_entry, %struct.partition_entry* %part, i32 %1, i32 3`
			`%offset28 = getelementptr %struct.partition_entry, %struct.partition_entry* %part, i32 %1, i32 2`
[opaque pointer type] Add textual IR support for explicit type parameter to load instruction Essentially the same as the GEP change in r230786. A similar migration script can be used to update test cases, though a few more test case improvements/changes were required this time around: (r229269-r229278) import fileinput import sys import re pat = re.compile(r"((?:=\|:\|^)\sload (?:atomic )?(?:volatile )?(.?))(\| addrspace\(\d+\) )\($\| (?:%\|@\|null\|undef\|blockaddress\|getelementptr\|addrspacecast\|bitcast\|inttoptr\|\[\[[a-zA-Z]\|\{\{).$)") for line in sys.stdin: sys.stdout.write(re.sub(pat, r"\1, \2\3*\4", line)) Reviewers: rafael, dexonsmith, grosser Differential Revision: http://reviews.llvm.org/D7649 llvm-svn: 230794 2015-02-28 05:17:42 +08:00			`%tmp29 = load i64, i64* %offset28, align 4`
			`%tmp40 = load i64, i64* %len39, align 4`
Add an unfolded offset field to LSR's Formula record. This is used to model constants which can be added to base registers via add-immediate instructions which don't require an additional register to materialize the immediate. llvm-svn: 130743 2011-05-03 08:46:49 +08:00			`%add41 = add nsw i64 %tmp40, %tmp29`
			`%cmp44 = icmp sge i64 %tmp29, %tmp5`
			`%cmp47 = icmp slt i64 %tmp29, %add`
			`%or.cond = and i1 %cmp44, %cmp47`
			`%overlap.2 = select i1 %or.cond, i32 1, i32 %overlap.178`
			`%cmp52 = icmp sle i64 %add41, %add`
			`%cmp56 = icmp sgt i64 %add41, %tmp5`
			`%or.cond74 = and i1 %cmp52, %cmp56`
			`%overlap.3 = select i1 %or.cond74, i32 1, i32 %overlap.2`
			`%cmp61 = icmp sgt i64 %tmp29, %tmp5`
			`%cmp65 = icmp slt i64 %add41, %add`
			`%or.cond75 = or i1 %cmp61, %cmp65`
			`br i1 %or.cond75, label %for.inc, label %if.then66`

			`if.then66: ; preds = %if.end`
			`br label %for.inc`

			`for.inc: ; preds = %if.end, %if.then66, %inner.loop`
			`%overlap.4 = phi i32 [ %overlap.178, %inner.loop ], [ 1, %if.then66 ], [ %overlap.3, %if.end ]`
			`%inc = add nsw i32 %1, 1`
			`%exitcond = icmp eq i32 %inc, %num_entries`
			`br i1 %exitcond, label %for.inc69, label %inner.loop`

			`for.inc69: ; preds = %for.inc`
			`%inc71 = add nsw i32 %0, 1`
			`%exitcond83 = icmp eq i32 %inc71, %num_entries`
			`br i1 %exitcond83, label %for.end72, label %outer.loop`

			`for.end72: ; preds = %for.inc69, %entry`
			`%overlap.0.lcssa = phi i32 [ 0, %entry ], [ %overlap.4, %for.inc69 ]`
			`ret i32 %overlap.0.lcssa`
			`}`