forked from OSchip/llvm-project
[AArch64] Run a peephole pass right after AdvSIMD pass.
The AdvSIMD pass may produce copies that are not coalescer-friendly. The
peephole optimizer knows how to fix that, as demonstrated in the test case.

<rdar://problem/12702965>
llvm-svn: 216200
parent c83265a6c5
commit 0c740d4b9a
llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -195,8 +195,12 @@ bool AArch64PassConfig::addILPOpts() {
 
 bool AArch64PassConfig::addPreRegAlloc() {
   // Use AdvSIMD scalar instructions whenever profitable.
-  if (TM->getOptLevel() != CodeGenOpt::None && EnableAdvSIMDScalar)
+  if (TM->getOptLevel() != CodeGenOpt::None && EnableAdvSIMDScalar) {
     addPass(createAArch64AdvSIMDScalar());
+    // The AdvSIMD pass may produce copies that can be rewritten to
+    // be register coalescer friendly.
+    addPass(&PeepholeOptimizerID);
+  }
   return true;
 }
 
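For context, PeepholeOptimizerID is the registered ID of the target-independent peephole optimizer (declared in llvm/include/llvm/CodeGen/Passes.h), so addPass(&PeepholeOptimizerID) schedules an extra run of an existing pass rather than introducing a new one; gating it inside the same if keeps the compile-time cost limited to the case where the AdvSIMD scalar pass actually ran. Below is a minimal sketch of the same pattern for a hypothetical out-of-tree target, against the 3.5-era API this commit targets; MyTargetPassConfig and createMyTargetSIMDScalarPass are illustrative names, not from this commit:

    // Sketch only: the "MyTarget*" names are hypothetical. In LLVM 3.5,
    // TargetPassConfig and PeepholeOptimizerID both come from this header.
    #include "llvm/CodeGen/Passes.h"

    bool MyTargetPassConfig::addPreRegAlloc() {
      if (TM->getOptLevel() != llvm::CodeGenOpt::None) {
        // Target-specific rewrite that can leave GPR<->FPR copies behind.
        addPass(createMyTargetSIMDScalarPass()); // hypothetical factory
        // Re-run the generic peephole optimizer right away, so the
        // register coalescer later sees copies it can actually merge.
        addPass(&llvm::PeepholeOptimizerID);
      }
      return true;
    }

To eyeball the resulting pre-RA pipeline, the legacy pass manager in llc can print its structure; after this change the peephole pass should appear immediately after the AdvSIMD scalar pass:

    llc -march=arm64 -aarch64-simd-scalar=true -debug-pass=Structure -o /dev/null input.ll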
llvm/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
@@ -1,15 +1,36 @@
-; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -aarch64-simd-scalar=true -asm-verbose=false | FileCheck %s
-; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=generic -aarch64-simd-scalar=true -asm-verbose=false | FileCheck %s -check-prefix=GENERIC
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -aarch64-simd-scalar=true -asm-verbose=false -disable-adv-copy-opt=true | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-NOOPT
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -aarch64-simd-scalar=true -asm-verbose=false -disable-adv-copy-opt=false | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-OPT
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=generic -aarch64-simd-scalar=true -asm-verbose=false -disable-adv-copy-opt=true | FileCheck %s -check-prefix=GENERIC -check-prefix=GENERIC-NOOPT
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=generic -aarch64-simd-scalar=true -asm-verbose=false -disable-adv-copy-opt=false | FileCheck %s -check-prefix=GENERIC -check-prefix=GENERIC-OPT
 
 define <2 x i64> @bar(<2 x i64> %a, <2 x i64> %b) nounwind readnone {
 ; CHECK-LABEL: bar:
 ; CHECK: add.2d v[[REG:[0-9]+]], v0, v1
 ; CHECK: add d[[REG3:[0-9]+]], d[[REG]], d1
+; Without advanced copy optimization, we end up with cross register
+; bank copies that cannot be coalesced.
+; CHECK-NOOPT: fmov [[COPY_REG3:x[0-9]+]], d[[REG3]]
+; With advanced copy optimization, we end up with just one copy
+; to insert the computed high part into the V register.
+; CHECK-OPT-NOT: fmov
 ; CHECK: sub d[[REG2:[0-9]+]], d[[REG]], d1
+; CHECK: fmov [[COPY_REG2:x[0-9]+]], d[[REG2]]
+; CHECK-NOOPT: fmov d0, [[COPY_REG3]]
+; CHECK-OPT-NOT: fmov
+; CHECK: ins.d v0[1], [[COPY_REG2]]
+; CHECK-NEXT: ret
+;
 ; GENERIC-LABEL: bar:
 ; GENERIC: add v[[REG:[0-9]+]].2d, v0.2d, v1.2d
 ; GENERIC: add d[[REG3:[0-9]+]], d[[REG]], d1
+; GENERIC-NOOPT: fmov [[COPY_REG3:x[0-9]+]], d[[REG3]]
+; GENERIC-OPT-NOT: fmov
 ; GENERIC: sub d[[REG2:[0-9]+]], d[[REG]], d1
+; GENERIC: fmov [[COPY_REG2:x[0-9]+]], d[[REG2]]
+; GENERIC-NOOPT: fmov d0, [[COPY_REG3]]
+; GENERIC-OPT-NOT: fmov
+; GENERIC: ins v0.d[1], [[COPY_REG2]]
+; GENERIC-NEXT: ret
   %add = add <2 x i64> %a, %b
   %vgetq_lane = extractelement <2 x i64> %add, i32 0
   %vgetq_lane2 = extractelement <2 x i64> %b, i32 0
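Read together, the check prefixes encode the before/after behavior this commit is after: the *-NOOPT runs (peephole copy rewriting disabled with -disable-adv-copy-opt=true) still expect d[[REG3]] to bounce through a GPR via two fmov instructions, while the *-OPT runs assert that round-trip is gone and only the single fmov feeding the ins lane insert remains. Assuming a standard CMake build tree (the llvm-lit path below is an assumption; adjust it to your layout), the test can be run on its own:

    <build-dir>/bin/llvm-lit -v llvm/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll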