llvm-project/llvm/test/CodeGen/AArch64/machine-combiner-madd.ll

; Test all AArch64 subarches with scheduling models.
; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=cortex-a57 < %s | FileCheck %s
; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=cortex-a72 < %s | FileCheck %s
; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=cortex-a73 < %s | FileCheck %s
; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=cyclone    < %s | FileCheck %s
; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=exynos-m1  < %s | FileCheck %s
; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=exynos-m2  < %s | FileCheck %s
; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=kryo       < %s | FileCheck %s
; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=thunderx2t99 < %s | FileCheck %s

; Make sure that the machine combiner fuses the multiply and the add used to
; compute the address of the load into a single madd.

; CHECK-LABEL: fun:
; CHECK-NOT: mul
; CHECK:     madd
; CHECK-NOT: mul

; %class.D is a %class.basic_string.base followed by 4 bytes. The base is a
; packed struct of two i64 and one i32 (20 bytes), so %class.D is 24 bytes,
; the same size as the memcpy length used in @fun.
%class.D = type { %class.basic_string.base, [4 x i8] }
%class.basic_string.base = type <{ i64, i64, i32 }>
; @a holds a pointer to %class.D and is zero-initialized (null).
@a = global %class.D* zeroinitializer, align 8
; Legacy five-operand memcpy intrinsic: destination, source, i64 length, then
; an i32 and an i1 operand (alignment and volatile flag in this older form of
; the intrinsic; confirm against the LangRef for the LLVM version in use).
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1)
; Test function: each loop iteration copies one %class.D element, selected by
; a running index from the array pointed to by @a, into a stack temporary.
; The element address is base + index * sizeof(%class.D); the checks at the
; top of this file expect that computation to appear as a madd with no
; separate mul.
define internal void @fun() section ".text.startup" {
entry:
  ; Stack temporary that serves as the memcpy destination.
  %tmp.i.i = alloca %class.D, align 8
  %y = bitcast %class.D* %tmp.i.i to i8*
  br label %loop
loop:
  ; Element index: 0 on entry, then %inc.i.i on the back edge.
  %conv11.i.i = phi i64 [ 0, %entry ], [ %inc.i.i, %loop ]
  ; Separate counter feeding the increment; its initial value is undef.
  %i = phi i64 [ undef, %entry ], [ %inc.i.i, %loop ]
  ; Reload the base pointer from @a on every iteration (the constant GEP with
  ; a single zero index addresses @a itself).
  %x = load %class.D*, %class.D** getelementptr inbounds (%class.D*, %class.D** @a, i64 0), align 8
  ; Address of element %conv11.i.i: the index is scaled by the size of
  ; %class.D and added to %x.
  %arrayidx.i.i.i = getelementptr inbounds %class.D, %class.D* %x, i64 %conv11.i.i
  %d = bitcast %class.D* %arrayidx.i.i.i to i8*
  ; Copy 24 bytes from the selected element into the stack temporary.
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull %y, i8* %d, i64 24, i32 8, i1 false)
  %inc.i.i = add i64 %i, 1
  ; Keep looping while the incremented counter is negative (signed compare).
  %cmp.i.i = icmp slt i64 %inc.i.i, 0
  br i1 %cmp.i.i, label %loop, label %exit
exit:
  ret void
}
instr-combiner: sum up all latencies of the transformed instructions We have found that -- when the selected subarchitecture has a scheduling model and we are not optimizing for size -- the machine-instruction combiner uses a too-simple algorithm to compute the cost of one of the two alternatives [before and after running a combining pass on a section of code], and therefore it throws away the combination results too often. This fix has the potential to help any ISA with the potential to combine instructions and for which at least one subarchitecture has a scheduling model. As of now, this is only known to definitely affect AArch64 subarchitectures with a scheduling model. Regression tested on AMD64/GNU-Linux, new test case tested to fail on an unpatched compiler and pass on a patched compiler. Patch by Abe Skolnik and Sebastian Pop. llvm-svn: 289399 2016-12-12 03:39:32 +08:00			`; Test all AArch64 subarches with scheduling models.`
			`; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=cortex-a57 < %s \| FileCheck %s`
			`; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=cortex-a72 < %s \| FileCheck %s`
			`; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=cortex-a73 < %s \| FileCheck %s`
			`; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=cyclone < %s \| FileCheck %s`
			`; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=exynos-m1 < %s \| FileCheck %s`
			`; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=exynos-m2 < %s \| FileCheck %s`
			`; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=kryo < %s \| FileCheck %s`
[AArch64] Vulcan is now ThunderXT99 Broadcom Vulcan is now Cavium ThunderX2T99. LLVM Bugzilla: http://bugs.llvm.org/show_bug.cgi?id=32113 Minor fixes for the alignments of loops and functions for ThunderX T81/T83/T88 (better performance). Patch was tested with SpecCPU2006. Patch by Stefan Teleman Differential Revision: https://reviews.llvm.org/D30510 llvm-svn: 297190 2017-03-08 03:42:40 +08:00			`; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=thunderx2t99 < %s \| FileCheck %s`
instr-combiner: sum up all latencies of the transformed instructions We have found that -- when the selected subarchitecture has a scheduling model and we are not optimizing for size -- the machine-instruction combiner uses a too-simple algorithm to compute the cost of one of the two alternatives [before and after running a combining pass on a section of code], and therefore it throws away the combination results too often. This fix has the potential to help any ISA with the potential to combine instructions and for which at least one subarchitecture has a scheduling model. As of now, this is only known to definitely affect AArch64 subarchitectures with a scheduling model. Regression tested on AMD64/GNU-Linux, new test case tested to fail on an unpatched compiler and pass on a patched compiler. Patch by Abe Skolnik and Sebastian Pop. llvm-svn: 289399 2016-12-12 03:39:32 +08:00
			`; Make sure that inst-combine fuses the multiply add in the addressing mode of`
			`; the load.`

			`; CHECK-LABEL: fun:`
			`; CHECK-NOT: mul`
			`; CHECK: madd`
			`; CHECK-NOT: mul`

			`%class.D = type { %class.basic_string.base, [4 x i8] }`
			`%class.basic_string.base = type <{ i64, i64, i32 }>`
			`@a = global %class.D* zeroinitializer, align 8`
			`declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1)`
			`define internal void @fun() section ".text.startup" {`
			`entry:`
			`%tmp.i.i = alloca %class.D, align 8`
			`%y = bitcast %class.D* %tmp.i.i to i8*`
			`br label %loop`
			`loop:`
			`%conv11.i.i = phi i64 [ 0, %entry ], [ %inc.i.i, %loop ]`
			`%i = phi i64 [ undef, %entry ], [ %inc.i.i, %loop ]`
			`%x = load %class.D*, %class.D** getelementptr inbounds (%class.D*, %class.D** @a, i64 0), align 8`
			`%arrayidx.i.i.i = getelementptr inbounds %class.D, %class.D* %x, i64 %conv11.i.i`
			`%d = bitcast %class.D* %arrayidx.i.i.i to i8*`
			`call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull %y, i8* %d, i64 24, i32 8, i1 false)`
			`%inc.i.i = add i64 %i, 1`
			`%cmp.i.i = icmp slt i64 %inc.i.i, 0`
			`br i1 %cmp.i.i, label %loop, label %exit`
			`exit:`
			`ret void`
			`}`