forked from OSchip/llvm-project
dc8a41de34
The patch attempts to optimize a sequence of SIMD loads from the same base pointer: `%0 = gep float*, float* base, i32 4`; `%1 = bitcast float* %0 to <4 x float>*`; `%2 = load <4 x float>, <4 x float>* %1`; ...; `%n1 = gep float*, float* base, i32 N`; `%n2 = bitcast float* %n1 to <4 x float>*`; `%n3 = load <4 x float>, <4 x float>* %n2`. For AArch64 the compiler generates a sequence of `LDR Qt, [Xn, #16]`. However, 32-bit NEON VLD1/VST1 lack the `[Wn, #imm]` addressing mode, so the address is computed before every ld/st instruction: `add r2, r0, #32`; `add r0, r0, #16`; `vld1.32 {d18, d19}, [r2]`; `vld1.32 {d22, d23}, [r0]`. This can be improved by computing the address for the first load, and then using a post-indexed form of VLD1/VST1 to load the rest: `add r0, r0, #16`; `vld1.32 {d18, d19}, [r0]!`; `vld1.32 {d22, d23}, [r0]`. In order to do that, the patch adds more patterns to DAGCombine: (1) `(load (add ptr inc1))` and `(add ptr inc2)` are now folded if inc1 and inc2 are constants; (2) `(or ptr inc)` is now recognized as a pointer increment if ptr is sufficiently aligned. In addition to that, we now search for all possible base updates and then pick the best one. Differential Revision: https://reviews.llvm.org/D108988 |
||
---|---|---|
.. | ||
AArch64 | ||
AMDGPU | ||
ARM | ||
NVPTX | ||
Power | ||
X86 | ||
2005-08-15-AddRecIV.ll | ||
2005-08-17-OutOfLoopVariant.ll | ||
2005-09-12-UsesOutOutsideOfLoop.ll | ||
2007-04-23-UseIterator.ll | ||
2008-08-13-CmpStride.ll | ||
2008-09-09-Overflow.ll | ||
2009-01-13-nonconstant-stride-outside-loop.ll | ||
2009-04-28-no-reduce-mul.ll | ||
2011-07-19-CritEdgeBreakCrash.ll | ||
2011-10-03-CritEdgeMerge.ll | ||
2011-10-06-ReusePhi.ll | ||
2011-10-13-SCEVChain.ll | ||
2011-10-14-IntPtr.ll | ||
2011-12-19-PostincQuadratic.ll | ||
2012-01-02-nopreheader.ll | ||
2012-01-16-nopreheader.ll | ||
2012-03-15-nopreheader.ll | ||
2012-03-26-constexpr.ll | ||
2012-07-13-ExpandUDiv.ll | ||
2012-07-18-LimitReassociate.ll | ||
2013-01-05-IndBr.ll | ||
2013-01-14-ReuseCast.ll | ||
addrec-gep-address-space.ll | ||
addrec-gep.ll | ||
address-space-loop.ll | ||
callbr-critical-edge-splitting.ll | ||
callbr-critical-edge-splitting2.ll | ||
count-to-zero.ll | ||
dbg-preserve-0.ll | ||
dbg-preserve-1.ll | ||
dbg-preserve-2.ll | ||
dead-phi.ll | ||
debuginfo-scev-salvage-0.ll | ||
debuginfo-scev-salvage-1.ll | ||
debuginfo-scev-salvage-2.ll | ||
debuginfo-scev-salvage-3.ll | ||
debuginfo-scev-salvage-4.ll | ||
different-type-ivs.ll | ||
dominate-assert.ll | ||
dont-hoist-simple-loop-constants.ll | ||
dont_insert_redundant_ops.ll | ||
dont_reduce_bytes.ll | ||
dont_reverse.ll | ||
ephemeral.ll | ||
exit_compare_live_range.ll | ||
funclet.ll | ||
gnarly-setupcost.ll | ||
hoist-parent-preheader.ll | ||
illegal-addr-modes.ll | ||
invariant_value_first.ll | ||
invariant_value_first_arg.ll | ||
ivchain.ll | ||
lsr-comp-time.ll | ||
lsr-overflow.ll | ||
missing-phi-operand-update.ll | ||
multi-edge-latch.ll | ||
negative-scale.ll | ||
nested-reduce.ll | ||
nonintegral.ll | ||
nonlinear-postinc.ll | ||
opaque-ptr.ll | ||
ops_after_indvar.ll | ||
optimizemax_debugloc.ll | ||
phi_node_update_multiple_preds.ll | ||
post-inc-icmpzero.ll | ||
post-inc-optsize.ll | ||
post-increment-insertion.ll | ||
pr2537.ll | ||
pr2570.ll | ||
pr3086.ll | ||
pr3399.ll | ||
pr3571.ll | ||
pr12018.ll | ||
pr12048.ll | ||
pr12691.ll | ||
pr18165.ll | ||
pr25541.ll | ||
pr27056.ll | ||
pr31627.ll | ||
pr48725.ll | ||
pr50765.ll | ||
pr50918.ll | ||
pr51329.ll | ||
pr51656.ll | ||
preserve-gep-loop-variant.ll | ||
related_indvars.ll | ||
remove_indvar.ll | ||
scaling-factor-incompat-type.ll | ||
scaling_factor_cost_crash.ll | ||
scev-after-loopinstsimplify.ll | ||
scev-expander-lcssa.ll | ||
scev-insertpt-bug.ll | ||
sext-ind-var.ll | ||
share_code_in_preheader.ll | ||
share_ivs.ll | ||
shl.ll | ||
two-combinations-bug.ll | ||
uglygep-address-space.ll | ||
uglygep.ll | ||
use_postinc_value_outside_loop.ll | ||
var_stride_used_by_compare.ll | ||
variable_stride.ll | ||
wrong-hoisting-iv.ll |