; llvm-project/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll

; Test driver: compile this file with llc for x86-64 (corei7, AVX enabled) and
; verify the emitted assembly against the FileCheck directives below.
; RUN: llc -march=x86-64 -mcpu=corei7 -mattr=+avx < %s | FileCheck %s

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"

; Eight consecutive i8 fields (8 bytes in total under the datalayout above).
%struct.A = type { i8, i8, i8, i8, i8, i8, i8, i8 }
; Eight consecutive i32 fields (32 bytes in total under the datalayout above).
%struct.B = type { i32, i32, i32, i32, i32, i32, i32, i32 }

; Each loop iteration writes the constants 1..8 into the eight consecutive i8
; fields of one %struct.A and then advances the pointer to the next struct.
; CHECK: merge_const_store
; The eight byte stores are expected to be emitted as one 64-bit immediate:
; 578437695752307201 == 0x0807060504030201, i.e. bytes 1..8 with the value 1
; in the lowest-addressed byte.
; CHECK: movabsq $578437695752307201
; CHECK: ret
define void @merge_const_store(i32 %count, %struct.A* nocapture %p) nounwind uwtable noinline ssp {
  ; Skip the loop entirely when %count <= 0.
  %1 = icmp sgt i32 %count, 0
  br i1 %1, label %.lr.ph, label %._crit_edge
.lr.ph:
  ; %i.02 is the iteration counter; %.01 is the struct currently being filled.
  %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
  %.01 = phi %struct.A* [ %11, %.lr.ph ], [ %p, %0 ]
  ; Eight adjacent one-byte constant stores (fields 0..7).
  %2 = getelementptr inbounds %struct.A* %.01, i64 0, i32 0
  store i8 1, i8* %2, align 1
  %3 = getelementptr inbounds %struct.A* %.01, i64 0, i32 1
  store i8 2, i8* %3, align 1
  %4 = getelementptr inbounds %struct.A* %.01, i64 0, i32 2
  store i8 3, i8* %4, align 1
  %5 = getelementptr inbounds %struct.A* %.01, i64 0, i32 3
  store i8 4, i8* %5, align 1
  %6 = getelementptr inbounds %struct.A* %.01, i64 0, i32 4
  store i8 5, i8* %6, align 1
  %7 = getelementptr inbounds %struct.A* %.01, i64 0, i32 5
  store i8 6, i8* %7, align 1
  %8 = getelementptr inbounds %struct.A* %.01, i64 0, i32 6
  store i8 7, i8* %8, align 1
  %9 = getelementptr inbounds %struct.A* %.01, i64 0, i32 7
  store i8 8, i8* %9, align 1
  ; Advance the counter and step to the next %struct.A.
  %10 = add nsw i32 %i.02, 1
  %11 = getelementptr inbounds %struct.A* %.01, i64 1
  %exitcond = icmp eq i32 %10, %count
  br i1 %exitcond, label %._crit_edge, label %.lr.ph
._crit_edge:
  ret void
}

; Each loop iteration stores i32 0 into all eight fields of one %struct.B.
; The function carries the noimplicitfloat attribute, so the zero stores must
; not be merged into a vector store: no vmovups may appear before the ret.
; CHECK: merge_const_store_no_vec
; CHECK-NOT: vmovups
; CHECK: ret
define void @merge_const_store_no_vec(i32 %count, %struct.B* nocapture %p) noimplicitfloat{
  ; Skip the loop entirely when %count <= 0.
  %1 = icmp sgt i32 %count, 0
  br i1 %1, label %.lr.ph, label %._crit_edge
.lr.ph:
  ; %i.02 is the iteration counter; %.01 is the struct currently being zeroed.
  %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
  %.01 = phi %struct.B* [ %11, %.lr.ph ], [ %p, %0 ]
  ; Eight adjacent 4-byte-aligned zero stores (fields 0..7).
  %2 = getelementptr inbounds %struct.B* %.01, i64 0, i32 0
  store i32 0, i32* %2, align 4
  %3 = getelementptr inbounds %struct.B* %.01, i64 0, i32 1
  store i32 0, i32* %3, align 4
  %4 = getelementptr inbounds %struct.B* %.01, i64 0, i32 2
  store i32 0, i32* %4, align 4
  %5 = getelementptr inbounds %struct.B* %.01, i64 0, i32 3
  store i32 0, i32* %5, align 4
  %6 = getelementptr inbounds %struct.B* %.01, i64 0, i32 4
  store i32 0, i32* %6, align 4
  %7 = getelementptr inbounds %struct.B* %.01, i64 0, i32 5
  store i32 0, i32* %7, align 4
  %8 = getelementptr inbounds %struct.B* %.01, i64 0, i32 6
  store i32 0, i32* %8, align 4
  %9 = getelementptr inbounds %struct.B* %.01, i64 0, i32 7
  store i32 0, i32* %9, align 4
  ; Advance the counter and step to the next %struct.B.
  %10 = add nsw i32 %i.02, 1
  %11 = getelementptr inbounds %struct.B* %.01, i64 1
  %exitcond = icmp eq i32 %10, %count
  br i1 %exitcond, label %._crit_edge, label %.lr.ph
._crit_edge:
  ret void
}

; Each loop iteration stores i32 0 into all eight fields of one %struct.B.
; Without noimplicitfloat the zero stores are expected to be merged and
; written with an unaligned vector store (vmovups).
; CHECK: merge_const_store_vec
; CHECK: vmovups
; CHECK: ret
define void @merge_const_store_vec(i32 %count, %struct.B* nocapture %p) nounwind uwtable noinline ssp {
  ; Skip the loop entirely when %count <= 0.
  %1 = icmp sgt i32 %count, 0
  br i1 %1, label %.lr.ph, label %._crit_edge
.lr.ph:
  ; %i.02 is the iteration counter; %.01 is the struct currently being zeroed.
  %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
  %.01 = phi %struct.B* [ %11, %.lr.ph ], [ %p, %0 ]
  ; Eight adjacent 4-byte-aligned zero stores (fields 0..7).
  %2 = getelementptr inbounds %struct.B* %.01, i64 0, i32 0
  store i32 0, i32* %2, align 4
  %3 = getelementptr inbounds %struct.B* %.01, i64 0, i32 1
  store i32 0, i32* %3, align 4
  %4 = getelementptr inbounds %struct.B* %.01, i64 0, i32 2
  store i32 0, i32* %4, align 4
  %5 = getelementptr inbounds %struct.B* %.01, i64 0, i32 3
  store i32 0, i32* %5, align 4
  %6 = getelementptr inbounds %struct.B* %.01, i64 0, i32 4
  store i32 0, i32* %6, align 4
  %7 = getelementptr inbounds %struct.B* %.01, i64 0, i32 5
  store i32 0, i32* %7, align 4
  %8 = getelementptr inbounds %struct.B* %.01, i64 0, i32 6
  store i32 0, i32* %8, align 4
  %9 = getelementptr inbounds %struct.B* %.01, i64 0, i32 7
  store i32 0, i32* %9, align 4
  ; Advance the counter and step to the next %struct.B.
  %10 = add nsw i32 %i.02, 1
  %11 = getelementptr inbounds %struct.B* %.01, i64 1
  %exitcond = icmp eq i32 %10, %count
  br i1 %exitcond, label %._crit_edge, label %.lr.ph
._crit_edge:
  ret void
}

; Like the all-constant case, but the store to field 4 writes the runtime
; value %zz. The first four constants (1..4) are expected to be merged into a
; single 32-bit immediate store (67305985 == 0x04030201); the remaining four
; stores (fields 4..7) are expected to stay as individual byte moves.
; CHECK: merge_nonconst_store
; CHECK: movl $67305985
; CHECK: movb
; CHECK: movb
; CHECK: movb
; CHECK: movb
; CHECK: ret
define void @merge_nonconst_store(i32 %count, i8 %zz, %struct.A* nocapture %p) nounwind uwtable noinline ssp {
  ; Skip the loop entirely when %count <= 0.
  %1 = icmp sgt i32 %count, 0
  br i1 %1, label %.lr.ph, label %._crit_edge
.lr.ph:
  ; %i.02 is the iteration counter; %.01 is the struct currently being filled.
  %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
  %.01 = phi %struct.A* [ %11, %.lr.ph ], [ %p, %0 ]
  %2 = getelementptr inbounds %struct.A* %.01, i64 0, i32 0
  store i8 1, i8* %2, align 1
  %3 = getelementptr inbounds %struct.A* %.01, i64 0, i32 1
  store i8 2, i8* %3, align 1
  %4 = getelementptr inbounds %struct.A* %.01, i64 0, i32 2
  store i8 3, i8* %4, align 1
  %5 = getelementptr inbounds %struct.A* %.01, i64 0, i32 3
  store i8 4, i8* %5, align 1
  %6 = getelementptr inbounds %struct.A* %.01, i64 0, i32 4
  store i8 %zz, i8* %6, align 1                     ;  <----------- Not a const;
  %7 = getelementptr inbounds %struct.A* %.01, i64 0, i32 5
  store i8 6, i8* %7, align 1
  %8 = getelementptr inbounds %struct.A* %.01, i64 0, i32 6
  store i8 7, i8* %8, align 1
  %9 = getelementptr inbounds %struct.A* %.01, i64 0, i32 7
  store i8 8, i8* %9, align 1
  ; Advance the counter and step to the next %struct.A.
  %10 = add nsw i32 %i.02, 1
  %11 = getelementptr inbounds %struct.A* %.01, i64 1
  %exitcond = icmp eq i32 %10, %count
  br i1 %exitcond, label %._crit_edge, label %.lr.ph
._crit_edge:
  ret void
}


; Each loop iteration loads the two adjacent i8 fields 0 and 1 of %q and
; stores them to fields 0 and 1 of the current %p struct. Both loads come
; before both stores, and %q / %p are noalias, so the pair is expected to be
; merged into one 16-bit load and one 16-bit store.
;CHECK: merge_loads_i16
; load:
;CHECK: movw
; store:
;CHECK: movw
;CHECK: ret
define void @merge_loads_i16(i32 %count, %struct.A* noalias nocapture %q, %struct.A* noalias nocapture %p) nounwind uwtable noinline ssp {
  ; Skip the loop entirely when %count <= 0.
  %1 = icmp sgt i32 %count, 0
  br i1 %1, label %.lr.ph, label %._crit_edge

.lr.ph:                                           ; preds = %0
  ; Source addresses are loop-invariant: fields 0 and 1 of %q.
  %2 = getelementptr inbounds %struct.A* %q, i64 0, i32 0
  %3 = getelementptr inbounds %struct.A* %q, i64 0, i32 1
  br label %4

; <label>:4                                       ; preds = %4, %.lr.ph
  ; %i.02 is the iteration counter; %.01 is the destination struct.
  %i.02 = phi i32 [ 0, %.lr.ph ], [ %9, %4 ]
  %.01 = phi %struct.A* [ %p, %.lr.ph ], [ %10, %4 ]
  %5 = load i8* %2, align 1
  %6 = load i8* %3, align 1
  %7 = getelementptr inbounds %struct.A* %.01, i64 0, i32 0
  store i8 %5, i8* %7, align 1
  %8 = getelementptr inbounds %struct.A* %.01, i64 0, i32 1
  store i8 %6, i8* %8, align 1
  ; Advance the counter and step to the next destination struct.
  %9 = add nsw i32 %i.02, 1
  %10 = getelementptr inbounds %struct.A* %.01, i64 1
  %exitcond = icmp eq i32 %9, %count
  br i1 %exitcond, label %._crit_edge, label %4

._crit_edge:                                      ; preds = %4, %0
  ret void
}

; The loads and the stores are interleaved (load, store, load, store), so
; they cannot be merged: two byte loads and two byte stores are expected.
;CHECK: no_merge_loads
;CHECK: movb
;CHECK: movb
;CHECK: movb
;CHECK: movb
;CHECK: ret
define void @no_merge_loads(i32 %count, %struct.A* noalias nocapture %q, %struct.A* noalias nocapture %p) nounwind uwtable noinline ssp {
  ; Skip the loop entirely when %count <= 0.
  %1 = icmp sgt i32 %count, 0
  br i1 %1, label %.lr.ph, label %._crit_edge

.lr.ph:                                           ; preds = %0
  ; Source addresses are loop-invariant: fields 0 and 1 of %q.
  %2 = getelementptr inbounds %struct.A* %q, i64 0, i32 0
  %3 = getelementptr inbounds %struct.A* %q, i64 0, i32 1
  br label %a4

a4:                                       ; preds = %a4, %.lr.ph
  ; %i.02 is the iteration counter; %.01 is the destination struct.
  %i.02 = phi i32 [ 0, %.lr.ph ], [ %a9, %a4 ]
  %.01 = phi %struct.A* [ %p, %.lr.ph ], [ %a10, %a4 ]
  ; First load/store pair (field 0) ...
  %a5 = load i8* %2, align 1
  %a7 = getelementptr inbounds %struct.A* %.01, i64 0, i32 0
  store i8 %a5, i8* %a7, align 1
  ; ... and only then the second load/store pair (field 1).
  %a8 = getelementptr inbounds %struct.A* %.01, i64 0, i32 1
  %a6 = load i8* %3, align 1
  store i8 %a6, i8* %a8, align 1
  ; Advance the counter and step to the next destination struct.
  %a9 = add nsw i32 %i.02, 1
  %a10 = getelementptr inbounds %struct.A* %.01, i64 1
  %exitcond = icmp eq i32 %a9, %count
  br i1 %exitcond, label %._crit_edge, label %a4

._crit_edge:                                      ; preds = %a4, %0
  ret void
}


; Each loop iteration loads the two adjacent i32 fields 0 and 1 of %q and
; stores them to fields 0 and 1 of the current %p struct. The pair is
; expected to be merged into one 64-bit integer load and one 64-bit store.
;CHECK: merge_loads_integer
; load:
;CHECK: movq
; store:
;CHECK: movq
;CHECK: ret
define void @merge_loads_integer(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {
  ; Skip the loop entirely when %count <= 0.
  %1 = icmp sgt i32 %count, 0
  br i1 %1, label %.lr.ph, label %._crit_edge

.lr.ph:                                           ; preds = %0
  ; Source addresses are loop-invariant: fields 0 and 1 of %q.
  %2 = getelementptr inbounds %struct.B* %q, i64 0, i32 0
  %3 = getelementptr inbounds %struct.B* %q, i64 0, i32 1
  br label %4

; <label>:4                                       ; preds = %4, %.lr.ph
  ; %i.02 is the iteration counter; %.01 is the destination struct.
  %i.02 = phi i32 [ 0, %.lr.ph ], [ %9, %4 ]
  %.01 = phi %struct.B* [ %p, %.lr.ph ], [ %10, %4 ]
  ; The loads and stores below carry no explicit alignment.
  %5 = load i32* %2
  %6 = load i32* %3
  %7 = getelementptr inbounds %struct.B* %.01, i64 0, i32 0
  store i32 %5, i32* %7
  %8 = getelementptr inbounds %struct.B* %.01, i64 0, i32 1
  store i32 %6, i32* %8
  ; Advance the counter and step to the next destination struct.
  %9 = add nsw i32 %i.02, 1
  %10 = getelementptr inbounds %struct.B* %.01, i64 1
  %exitcond = icmp eq i32 %9, %count
  br i1 %exitcond, label %._crit_edge, label %4

._crit_edge:                                      ; preds = %4, %0
  ret void
}


; Each loop iteration loads the four adjacent i32 fields 0..3 of %q and stores
; them to fields 0..3 of the current %p struct. All four loads precede all
; four stores; the group is expected to be merged into one unaligned vector
; load and one unaligned vector store.
;CHECK: merge_loads_vector
; load:
;CHECK: movups
; store:
;CHECK: movups
;CHECK: ret
define void @merge_loads_vector(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {
  ; Skip the loop entirely when %count <= 0.
  %a1 = icmp sgt i32 %count, 0
  br i1 %a1, label %.lr.ph, label %._crit_edge

.lr.ph:                                           ; preds = %0
  ; Source addresses are loop-invariant: fields 0..3 of %q.
  %a2 = getelementptr inbounds %struct.B* %q, i64 0, i32 0
  %a3 = getelementptr inbounds %struct.B* %q, i64 0, i32 1
  %a4 = getelementptr inbounds %struct.B* %q, i64 0, i32 2
  %a5 = getelementptr inbounds %struct.B* %q, i64 0, i32 3
  br label %block4

block4:                                       ; preds = %block4, %.lr.ph
  ; %i.02 is the iteration counter; %.01 is the destination struct.
  %i.02 = phi i32 [ 0, %.lr.ph ], [ %c9, %block4 ]
  %.01 = phi %struct.B* [ %p, %.lr.ph ], [ %c10, %block4 ]
  ; Destination addresses: fields 0..3 of the current %p struct.
  %a7 = getelementptr inbounds %struct.B* %.01, i64 0, i32 0
  %a8 = getelementptr inbounds %struct.B* %.01, i64 0, i32 1
  %a9 = getelementptr inbounds %struct.B* %.01, i64 0, i32 2
  %a10 = getelementptr inbounds %struct.B* %.01, i64 0, i32 3
  ; The loads and stores below carry no explicit alignment.
  %b1 = load i32* %a2
  %b2 = load i32* %a3
  %b3 = load i32* %a4
  %b4 = load i32* %a5
  store i32 %b1, i32* %a7
  store i32 %b2, i32* %a8
  store i32 %b3, i32* %a9
  store i32 %b4, i32* %a10
  ; Advance the counter and step to the next destination struct.
  %c9 = add nsw i32 %i.02, 1
  %c10 = getelementptr inbounds %struct.B* %.01, i64 1
  %exitcond = icmp eq i32 %c9, %count
  br i1 %exitcond, label %._crit_edge, label %block4

._crit_edge:                                      ; preds = %block4, %0
  ret void
}

; Same four-element copy as the vector case, but every load and store is
; marked "align 1". The accesses are expected to stay as four individual
; 32-bit loads followed by four individual 32-bit stores.
;CHECK: merge_loads_no_align
; load:
;CHECK: movl
;CHECK: movl
;CHECK: movl
;CHECK: movl
; store:
;CHECK: movl
;CHECK: movl
;CHECK: movl
;CHECK: movl
;CHECK: ret
define void @merge_loads_no_align(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {
  ; Skip the loop entirely when %count <= 0.
  %a1 = icmp sgt i32 %count, 0
  br i1 %a1, label %.lr.ph, label %._crit_edge

.lr.ph:                                           ; preds = %0
  ; Source addresses are loop-invariant: fields 0..3 of %q.
  %a2 = getelementptr inbounds %struct.B* %q, i64 0, i32 0
  %a3 = getelementptr inbounds %struct.B* %q, i64 0, i32 1
  %a4 = getelementptr inbounds %struct.B* %q, i64 0, i32 2
  %a5 = getelementptr inbounds %struct.B* %q, i64 0, i32 3
  br label %block4

block4:                                       ; preds = %block4, %.lr.ph
  ; %i.02 is the iteration counter; %.01 is the destination struct.
  %i.02 = phi i32 [ 0, %.lr.ph ], [ %c9, %block4 ]
  %.01 = phi %struct.B* [ %p, %.lr.ph ], [ %c10, %block4 ]
  ; Destination addresses: fields 0..3 of the current %p struct.
  %a7 = getelementptr inbounds %struct.B* %.01, i64 0, i32 0
  %a8 = getelementptr inbounds %struct.B* %.01, i64 0, i32 1
  %a9 = getelementptr inbounds %struct.B* %.01, i64 0, i32 2
  %a10 = getelementptr inbounds %struct.B* %.01, i64 0, i32 3
  ; Every access is only byte-aligned (align 1).
  %b1 = load i32* %a2, align 1
  %b2 = load i32* %a3, align 1
  %b3 = load i32* %a4, align 1
  %b4 = load i32* %a5, align 1
  store i32 %b1, i32* %a7, align 1
  store i32 %b2, i32* %a8, align 1
  store i32 %b3, i32* %a9, align 1
  store i32 %b4, i32* %a10, align 1
  ; Advance the counter and step to the next destination struct.
  %c9 = add nsw i32 %i.02, 1
  %c10 = getelementptr inbounds %struct.B* %.01, i64 1
  %exitcond = icmp eq i32 %c9, %count
  br i1 %exitcond, label %._crit_edge, label %block4

._crit_edge:                                      ; preds = %block4, %0
  ret void
}

; Make sure that we merge the consecutive load/store sequence below and use a
; word (16 bit) instead of a byte copy.
; Each iteration reads an i64 index from %a, loads the bytes at %c[index] and
; %c[index + 1], and stores them to the two bytes at the current %b position.
; The expected code is one 16-bit load using base+index addressing and one
; 16-bit store of the same register.
; CHECK: MergeLoadStoreBaseIndexOffset
; CHECK: movw    (%{{.*}},%{{.*}}), [[REG:%[a-z]+]]
; CHECK: movw    [[REG]], (%{{.*}})
define void @MergeLoadStoreBaseIndexOffset(i64* %a, i8* %b, i8* %c, i32 %n) {
  br label %1

; <label>:1
  ; %.09 = remaining iterations, %.08 = destination cursor, %.0 = index cursor.
  %.09 = phi i32 [ %n, %0 ], [ %11, %1 ]
  %.08 = phi i8* [ %b, %0 ], [ %10, %1 ]
  %.0 = phi i64* [ %a, %0 ], [ %2, %1 ]
  %2 = getelementptr inbounds i64* %.0, i64 1
  %3 = load i64* %.0, align 1
  ; Load %c[index] and %c[index + 1]; the "+ 1" is computed in i64.
  %4 = getelementptr inbounds i8* %c, i64 %3
  %5 = load i8* %4, align 1
  %6 = add i64 %3, 1
  %7 = getelementptr inbounds i8* %c, i64 %6
  %8 = load i8* %7, align 1
  ; Store both bytes to adjacent destination addresses.
  store i8 %5, i8* %.08, align 1
  %9 = getelementptr inbounds i8* %.08, i64 1
  store i8 %8, i8* %9, align 1
  ; Advance the destination by two bytes and count down towards zero.
  %10 = getelementptr inbounds i8* %.08, i64 2
  %11 = add nsw i32 %.09, -1
  %12 = icmp eq i32 %11, 0
  br i1 %12, label %13, label %1

; <label>:13
  ret void
}

; Make sure that we merge the consecutive load/store sequence below and use a
; word (16 bit) instead of a byte copy even if there are intermediate sign
; extensions.
; Here the index is an i8 loaded from %a and sign-extended to i64 once; the
; "+ 1" for the second address is then computed on the extended i64 value.
; CHECK: MergeLoadStoreBaseIndexOffsetSext
; CHECK: movw    (%{{.*}},%{{.*}}), [[REG:%[a-z]+]]
; CHECK: movw    [[REG]], (%{{.*}})
define void @MergeLoadStoreBaseIndexOffsetSext(i8* %a, i8* %b, i8* %c, i32 %n) {
  br label %1

; <label>:1
  ; %.09 = remaining iterations, %.08 = destination cursor, %.0 = index cursor.
  %.09 = phi i32 [ %n, %0 ], [ %12, %1 ]
  %.08 = phi i8* [ %b, %0 ], [ %11, %1 ]
  %.0 = phi i8* [ %a, %0 ], [ %2, %1 ]
  %2 = getelementptr inbounds i8* %.0, i64 1
  %3 = load i8* %.0, align 1
  ; Sign-extend the index first, then form index and index + 1 in i64.
  %4 = sext i8 %3 to i64
  %5 = getelementptr inbounds i8* %c, i64 %4
  %6 = load i8* %5, align 1
  %7 = add i64 %4, 1
  %8 = getelementptr inbounds i8* %c, i64 %7
  %9 = load i8* %8, align 1
  ; Store both bytes to adjacent destination addresses.
  store i8 %6, i8* %.08, align 1
  %10 = getelementptr inbounds i8* %.08, i64 1
  store i8 %9, i8* %10, align 1
  ; Advance the destination by two bytes and count down towards zero.
  %11 = getelementptr inbounds i8* %.08, i64 2
  %12 = add nsw i32 %.09, -1
  %13 = icmp eq i32 %12, 0
  br i1 %13, label %14, label %1

; <label>:14
  ret void
}

; However, sign extensions can only be ignored when they are present on all
; of the address computations. Here the "+ 1" for the second address is
; computed in i8 *before* the sign extension (%7 = add i8 %3, 1), so it can
; wrap and the two loads are not known to be adjacent: they must not be merged
; into a 16-bit access.
;
; The negative patterns below use plain regexes instead of a [[REG]] capture.
; A FileCheck variable is only bound by a successful match, so a definition
; placed inside a negative directive is never bound, and a later [[REG]] use
; would silently refer to the register captured in the previous function.
; CHECK: loadStoreBaseIndexOffsetSextNoSex
; CHECK-NOT: movw    (%{{.*}},%{{.*}}), %{{[a-z]+}}
; CHECK-NOT: movw    %{{[a-z0-9]+}}, (%{{.*}})
define void @loadStoreBaseIndexOffsetSextNoSex(i8* %a, i8* %b, i8* %c, i32 %n) {
  br label %1

; <label>:1
  ; %.09 = remaining iterations, %.08 = destination cursor, %.0 = index cursor.
  %.09 = phi i32 [ %n, %0 ], [ %12, %1 ]
  %.08 = phi i8* [ %b, %0 ], [ %11, %1 ]
  %.0 = phi i8* [ %a, %0 ], [ %2, %1 ]
  %2 = getelementptr inbounds i8* %.0, i64 1
  %3 = load i8* %.0, align 1
  ; First address: sign-extended index.
  %4 = sext i8 %3 to i64
  %5 = getelementptr inbounds i8* %c, i64 %4
  %6 = load i8* %5, align 1
  ; Second address: index + 1 computed in i8 (may wrap), then sign-extended.
  %7 = add i8 %3, 1
  %wrap.4 = sext i8 %7 to i64
  %8 = getelementptr inbounds i8* %c, i64 %wrap.4
  %9 = load i8* %8, align 1
  ; Store both bytes to adjacent destination addresses.
  store i8 %6, i8* %.08, align 1
  %10 = getelementptr inbounds i8* %.08, i64 1
  store i8 %9, i8* %10, align 1
  ; Advance the destination by two bytes and count down towards zero.
  %11 = getelementptr inbounds i8* %.08, i64 2
  %12 = add nsw i32 %.09, -1
  %13 = icmp eq i32 %12, 0
  br i1 %13, label %14, label %1

; <label>:14
  ret void
}
When merging consecutive stores, use vectors to store the constant zero. llvm-svn: 165267 2012-10-05 06:35:15 +08:00			`; RUN: llc -march=x86-64 -mcpu=corei7 -mattr=+avx < %s | FileCheck %s`
A DAGCombine optimization for merging consecutive stores to memory. The optimization is not profitable in many cases because modern processors perform multiple stores in parallel and merging stores prior to merging requires extra work. We handle two main cases: 1. Store of multiple consecutive constants: q->a = 3; q->4 = 5; In this case we store a single legal wide integer. 2. Store of multiple consecutive loads: int a = p->a; int b = p->b; q->a = a; q->b = b; In this case we load/store either ilegal vector registers or legal wide integer registers. llvm-svn: 165125 2012-10-04 00:11:15 +08:00
			`target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"`
			`target triple = "x86_64-apple-macosx10.8.0"`

			`%struct.A = type { i8, i8, i8, i8, i8, i8, i8, i8 }`
			`%struct.B = type { i32, i32, i32, i32, i32, i32, i32, i32 }`

			`; CHECK: merge_const_store`
			`; save 1,2,3 ... as one big integer.`
			`; CHECK: movabsq $578437695752307201`
			`; CHECK: ret`
			`define void @merge_const_store(i32 %count, %struct.A* nocapture %p) nounwind uwtable noinline ssp {`
			`%1 = icmp sgt i32 %count, 0`
			`br i1 %1, label %.lr.ph, label %._crit_edge`
			`.lr.ph:`
			`%i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]`
			`%.01 = phi %struct.A* [ %11, %.lr.ph ], [ %p, %0 ]`
			`%2 = getelementptr inbounds %struct.A* %.01, i64 0, i32 0`
			`store i8 1, i8* %2, align 1`
			`%3 = getelementptr inbounds %struct.A* %.01, i64 0, i32 1`
			`store i8 2, i8* %3, align 1`
			`%4 = getelementptr inbounds %struct.A* %.01, i64 0, i32 2`
			`store i8 3, i8* %4, align 1`
			`%5 = getelementptr inbounds %struct.A* %.01, i64 0, i32 3`
			`store i8 4, i8* %5, align 1`
			`%6 = getelementptr inbounds %struct.A* %.01, i64 0, i32 4`
			`store i8 5, i8* %6, align 1`
			`%7 = getelementptr inbounds %struct.A* %.01, i64 0, i32 5`
			`store i8 6, i8* %7, align 1`
			`%8 = getelementptr inbounds %struct.A* %.01, i64 0, i32 6`
			`store i8 7, i8* %8, align 1`
			`%9 = getelementptr inbounds %struct.A* %.01, i64 0, i32 7`
			`store i8 8, i8* %9, align 1`
			`%10 = add nsw i32 %i.02, 1`
			`%11 = getelementptr inbounds %struct.A* %.01, i64 1`
			`%exitcond = icmp eq i32 %10, %count`
			`br i1 %exitcond, label %._crit_edge, label %.lr.ph`
			`._crit_edge:`
			`ret void`
			`}`

Dont merge consecutive loads/stores into vectors when noimplicitfloat is used. llvm-svn: 175190 2013-02-15 02:28:52 +08:00			`; No vectors because we use noimplicitfloat`
			`; CHECK: merge_const_store_no_vec`
			`; CHECK-NOT: vmovups`
			`; CHECK: ret`
			`define void @merge_const_store_no_vec(i32 %count, %struct.B* nocapture %p) noimplicitfloat{`
			`%1 = icmp sgt i32 %count, 0`
			`br i1 %1, label %.lr.ph, label %._crit_edge`
			`.lr.ph:`
			`%i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]`
			`%.01 = phi %struct.B* [ %11, %.lr.ph ], [ %p, %0 ]`
			`%2 = getelementptr inbounds %struct.B* %.01, i64 0, i32 0`
			`store i32 0, i32* %2, align 4`
			`%3 = getelementptr inbounds %struct.B* %.01, i64 0, i32 1`
			`store i32 0, i32* %3, align 4`
			`%4 = getelementptr inbounds %struct.B* %.01, i64 0, i32 2`
			`store i32 0, i32* %4, align 4`
			`%5 = getelementptr inbounds %struct.B* %.01, i64 0, i32 3`
			`store i32 0, i32* %5, align 4`
			`%6 = getelementptr inbounds %struct.B* %.01, i64 0, i32 4`
			`store i32 0, i32* %6, align 4`
			`%7 = getelementptr inbounds %struct.B* %.01, i64 0, i32 5`
			`store i32 0, i32* %7, align 4`
			`%8 = getelementptr inbounds %struct.B* %.01, i64 0, i32 6`
			`store i32 0, i32* %8, align 4`
			`%9 = getelementptr inbounds %struct.B* %.01, i64 0, i32 7`
			`store i32 0, i32* %9, align 4`
			`%10 = add nsw i32 %i.02, 1`
			`%11 = getelementptr inbounds %struct.B* %.01, i64 1`
			`%exitcond = icmp eq i32 %10, %count`
			`br i1 %exitcond, label %._crit_edge, label %.lr.ph`
			`._crit_edge:`
			`ret void`
			`}`

When merging consecutive stores, use vectors to store the constant zero. llvm-svn: 165267 2012-10-05 06:35:15 +08:00			`; Move the constants using a single vector store.`
			`; CHECK: merge_const_store_vec`
On Sandybridge split unaligned 256bit stores into two xmm-sized stores. llvm-svn: 172894 2013-01-19 16:38:41 +08:00			`; CHECK: vmovups`
When merging consecutive stores, use vectors to store the constant zero. llvm-svn: 165267 2012-10-05 06:35:15 +08:00			`; CHECK: ret`
			`define void @merge_const_store_vec(i32 %count, %struct.B* nocapture %p) nounwind uwtable noinline ssp {`
			`%1 = icmp sgt i32 %count, 0`
			`br i1 %1, label %.lr.ph, label %._crit_edge`
			`.lr.ph:`
			`%i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]`
			`%.01 = phi %struct.B* [ %11, %.lr.ph ], [ %p, %0 ]`
			`%2 = getelementptr inbounds %struct.B* %.01, i64 0, i32 0`
			`store i32 0, i32* %2, align 4`
			`%3 = getelementptr inbounds %struct.B* %.01, i64 0, i32 1`
			`store i32 0, i32* %3, align 4`
			`%4 = getelementptr inbounds %struct.B* %.01, i64 0, i32 2`
			`store i32 0, i32* %4, align 4`
			`%5 = getelementptr inbounds %struct.B* %.01, i64 0, i32 3`
			`store i32 0, i32* %5, align 4`
			`%6 = getelementptr inbounds %struct.B* %.01, i64 0, i32 4`
			`store i32 0, i32* %6, align 4`
			`%7 = getelementptr inbounds %struct.B* %.01, i64 0, i32 5`
			`store i32 0, i32* %7, align 4`
			`%8 = getelementptr inbounds %struct.B* %.01, i64 0, i32 6`
			`store i32 0, i32* %8, align 4`
			`%9 = getelementptr inbounds %struct.B* %.01, i64 0, i32 7`
			`store i32 0, i32* %9, align 4`
			`%10 = add nsw i32 %i.02, 1`
			`%11 = getelementptr inbounds %struct.B* %.01, i64 1`
			`%exitcond = icmp eq i32 %10, %count`
			`br i1 %exitcond, label %._crit_edge, label %.lr.ph`
			`._crit_edge:`
			`ret void`
			`}`

A DAGCombine optimization for merging consecutive stores to memory. The optimization is not profitable in many cases because modern processors perform multiple stores in parallel and merging stores prior to merging requires extra work. We handle two main cases: 1. Store of multiple consecutive constants: q->a = 3; q->4 = 5; In this case we store a single legal wide integer. 2. Store of multiple consecutive loads: int a = p->a; int b = p->b; q->a = a; q->b = b; In this case we load/store either ilegal vector registers or legal wide integer registers. llvm-svn: 165125 2012-10-04 00:11:15 +08:00			`; Move the first 4 constants as a single vector. Move the rest as scalars.`
			`; CHECK: merge_nonconst_store`
			`; CHECK: movl $67305985`
			`; CHECK: movb`
			`; CHECK: movb`
			`; CHECK: movb`
			`; CHECK: movb`
			`; CHECK: ret`
			`define void @merge_nonconst_store(i32 %count, i8 %zz, %struct.A* nocapture %p) nounwind uwtable noinline ssp {`
			`%1 = icmp sgt i32 %count, 0`
			`br i1 %1, label %.lr.ph, label %._crit_edge`
			`.lr.ph:`
			`%i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]`
			`%.01 = phi %struct.A* [ %11, %.lr.ph ], [ %p, %0 ]`
			`%2 = getelementptr inbounds %struct.A* %.01, i64 0, i32 0`
			`store i8 1, i8* %2, align 1`
			`%3 = getelementptr inbounds %struct.A* %.01, i64 0, i32 1`
			`store i8 2, i8* %3, align 1`
			`%4 = getelementptr inbounds %struct.A* %.01, i64 0, i32 2`
			`store i8 3, i8* %4, align 1`
			`%5 = getelementptr inbounds %struct.A* %.01, i64 0, i32 3`
			`store i8 4, i8* %5, align 1`
			`%6 = getelementptr inbounds %struct.A* %.01, i64 0, i32 4`
			`store i8 %zz, i8* %6, align 1 ; <----------- Not a const;`
			`%7 = getelementptr inbounds %struct.A* %.01, i64 0, i32 5`
			`store i8 6, i8* %7, align 1`
			`%8 = getelementptr inbounds %struct.A* %.01, i64 0, i32 6`
			`store i8 7, i8* %8, align 1`
			`%9 = getelementptr inbounds %struct.A* %.01, i64 0, i32 7`
			`store i8 8, i8* %9, align 1`
			`%10 = add nsw i32 %i.02, 1`
			`%11 = getelementptr inbounds %struct.A* %.01, i64 1`
			`%exitcond = icmp eq i32 %10, %count`
			`br i1 %exitcond, label %._crit_edge, label %.lr.ph`
			`._crit_edge:`
			`ret void`
			`}`


			`;CHECK: merge_loads_i16`
			`; load:`
			`;CHECK: movw`
			`; store:`
			`;CHECK: movw`
			`;CHECK: ret`
			`define void @merge_loads_i16(i32 %count, %struct.A* noalias nocapture %q, %struct.A* noalias nocapture %p) nounwind uwtable noinline ssp {`
			`%1 = icmp sgt i32 %count, 0`
			`br i1 %1, label %.lr.ph, label %._crit_edge`

			`.lr.ph: ; preds = %0`
			`%2 = getelementptr inbounds %struct.A* %q, i64 0, i32 0`
			`%3 = getelementptr inbounds %struct.A* %q, i64 0, i32 1`
			`br label %4`

			`; <label>:4 ; preds = %4, %.lr.ph`
			`%i.02 = phi i32 [ 0, %.lr.ph ], [ %9, %4 ]`
			`%.01 = phi %struct.A* [ %p, %.lr.ph ], [ %10, %4 ]`
			`%5 = load i8* %2, align 1`
			`%6 = load i8* %3, align 1`
			`%7 = getelementptr inbounds %struct.A* %.01, i64 0, i32 0`
			`store i8 %5, i8* %7, align 1`
			`%8 = getelementptr inbounds %struct.A* %.01, i64 0, i32 1`
			`store i8 %6, i8* %8, align 1`
			`%9 = add nsw i32 %i.02, 1`
			`%10 = getelementptr inbounds %struct.A* %.01, i64 1`
			`%exitcond = icmp eq i32 %9, %count`
			`br i1 %exitcond, label %._crit_edge, label %4`

			`._crit_edge: ; preds = %4, %0`
			`ret void`
			`}`

			`; The loads and the stores are interleaved. Can't merge them.`
			`;CHECK: no_merge_loads`
			`;CHECK: movb`
			`;CHECK: movb`
			`;CHECK: movb`
			`;CHECK: movb`
			`;CHECK: ret`
			`define void @no_merge_loads(i32 %count, %struct.A* noalias nocapture %q, %struct.A* noalias nocapture %p) nounwind uwtable noinline ssp {`
			`%1 = icmp sgt i32 %count, 0`
			`br i1 %1, label %.lr.ph, label %._crit_edge`

			`.lr.ph: ; preds = %0`
			`%2 = getelementptr inbounds %struct.A* %q, i64 0, i32 0`
			`%3 = getelementptr inbounds %struct.A* %q, i64 0, i32 1`
			`br label %a4`

			`a4: ; preds = %4, %.lr.ph`
			`%i.02 = phi i32 [ 0, %.lr.ph ], [ %a9, %a4 ]`
			`%.01 = phi %struct.A* [ %p, %.lr.ph ], [ %a10, %a4 ]`
			`%a5 = load i8* %2, align 1`
			`%a7 = getelementptr inbounds %struct.A* %.01, i64 0, i32 0`
			`store i8 %a5, i8* %a7, align 1`
			`%a8 = getelementptr inbounds %struct.A* %.01, i64 0, i32 1`
			`%a6 = load i8* %3, align 1`
			`store i8 %a6, i8* %a8, align 1`
			`%a9 = add nsw i32 %i.02, 1`
			`%a10 = getelementptr inbounds %struct.A* %.01, i64 1`
			`%exitcond = icmp eq i32 %a9, %count`
			`br i1 %exitcond, label %._crit_edge, label %a4`

			`._crit_edge: ; preds = %4, %0`
			`ret void`
			`}`


			`;CHECK: merge_loads_integer`
			`; load:`
			`;CHECK: movq`
			`; store:`
			`;CHECK: movq`
			`;CHECK: ret`
			`define void @merge_loads_integer(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {`
			`%1 = icmp sgt i32 %count, 0`
			`br i1 %1, label %.lr.ph, label %._crit_edge`

			`.lr.ph: ; preds = %0`
			`%2 = getelementptr inbounds %struct.B* %q, i64 0, i32 0`
			`%3 = getelementptr inbounds %struct.B* %q, i64 0, i32 1`
			`br label %4`

			`; <label>:4 ; preds = %4, %.lr.ph`
			`%i.02 = phi i32 [ 0, %.lr.ph ], [ %9, %4 ]`
			`%.01 = phi %struct.B* [ %p, %.lr.ph ], [ %10, %4 ]`
			`%5 = load i32* %2`
			`%6 = load i32* %3`
			`%7 = getelementptr inbounds %struct.B* %.01, i64 0, i32 0`
			`store i32 %5, i32* %7`
			`%8 = getelementptr inbounds %struct.B* %.01, i64 0, i32 1`
			`store i32 %6, i32* %8`
			`%9 = add nsw i32 %i.02, 1`
			`%10 = getelementptr inbounds %struct.B* %.01, i64 1`
			`%exitcond = icmp eq i32 %9, %count`
			`br i1 %exitcond, label %._crit_edge, label %4`

			`._crit_edge: ; preds = %4, %0`
			`ret void`
			`}`


			`;CHECK: merge_loads_vector`
			`; load:`
			`;CHECK: movups`
			`; store:`
			`;CHECK: movups`
			`;CHECK: ret`
			`define void @merge_loads_vector(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {`
			`%a1 = icmp sgt i32 %count, 0`
			`br i1 %a1, label %.lr.ph, label %._crit_edge`

			`.lr.ph: ; preds = %0`
			`%a2 = getelementptr inbounds %struct.B* %q, i64 0, i32 0`
			`%a3 = getelementptr inbounds %struct.B* %q, i64 0, i32 1`
			`%a4 = getelementptr inbounds %struct.B* %q, i64 0, i32 2`
			`%a5 = getelementptr inbounds %struct.B* %q, i64 0, i32 3`
			`br label %block4`

			`block4: ; preds = %4, %.lr.ph`
			`%i.02 = phi i32 [ 0, %.lr.ph ], [ %c9, %block4 ]`
			`%.01 = phi %struct.B* [ %p, %.lr.ph ], [ %c10, %block4 ]`
			`%a7 = getelementptr inbounds %struct.B* %.01, i64 0, i32 0`
			`%a8 = getelementptr inbounds %struct.B* %.01, i64 0, i32 1`
			`%a9 = getelementptr inbounds %struct.B* %.01, i64 0, i32 2`
			`%a10 = getelementptr inbounds %struct.B* %.01, i64 0, i32 3`
			`%b1 = load i32* %a2`
			`%b2 = load i32* %a3`
			`%b3 = load i32* %a4`
			`%b4 = load i32* %a5`
			`store i32 %b1, i32* %a7`
			`store i32 %b2, i32* %a8`
			`store i32 %b3, i32* %a9`
			`store i32 %b4, i32* %a10`
			`%c9 = add nsw i32 %i.02, 1`
			`%c10 = getelementptr inbounds %struct.B* %.01, i64 1`
			`%exitcond = icmp eq i32 %c9, %count`
			`br i1 %exitcond, label %._crit_edge, label %block4`

			`._crit_edge: ; preds = %4, %0`
			`ret void`
			`}`

			`;CHECK: merge_loads_no_align`
			`; load:`
			`;CHECK: movl`
			`;CHECK: movl`
			`;CHECK: movl`
			`;CHECK: movl`
			`; store:`
			`;CHECK: movl`
			`;CHECK: movl`
			`;CHECK: movl`
			`;CHECK: movl`
			`;CHECK: ret`
			`define void @merge_loads_no_align(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {`
			`%a1 = icmp sgt i32 %count, 0`
			`br i1 %a1, label %.lr.ph, label %._crit_edge`

			`.lr.ph: ; preds = %0`
			`%a2 = getelementptr inbounds %struct.B* %q, i64 0, i32 0`
			`%a3 = getelementptr inbounds %struct.B* %q, i64 0, i32 1`
			`%a4 = getelementptr inbounds %struct.B* %q, i64 0, i32 2`
			`%a5 = getelementptr inbounds %struct.B* %q, i64 0, i32 3`
			`br label %block4`

			`block4: ; preds = %4, %.lr.ph`
			`%i.02 = phi i32 [ 0, %.lr.ph ], [ %c9, %block4 ]`
			`%.01 = phi %struct.B* [ %p, %.lr.ph ], [ %c10, %block4 ]`
			`%a7 = getelementptr inbounds %struct.B* %.01, i64 0, i32 0`
			`%a8 = getelementptr inbounds %struct.B* %.01, i64 0, i32 1`
			`%a9 = getelementptr inbounds %struct.B* %.01, i64 0, i32 2`
			`%a10 = getelementptr inbounds %struct.B* %.01, i64 0, i32 3`
			`%b1 = load i32* %a2, align 1`
			`%b2 = load i32* %a3, align 1`
			`%b3 = load i32* %a4, align 1`
			`%b4 = load i32* %a5, align 1`
			`store i32 %b1, i32* %a7, align 1`
			`store i32 %b2, i32* %a8, align 1`
			`store i32 %b3, i32* %a9, align 1`
			`store i32 %b4, i32* %a10, align 1`
			`%c9 = add nsw i32 %i.02, 1`
			`%c10 = getelementptr inbounds %struct.B* %.01, i64 1`
			`%exitcond = icmp eq i32 %c9, %count`
			`br i1 %exitcond, label %._crit_edge, label %block4`

			`._crit_edge: ; preds = %4, %0`
			`ret void`
			`}`

Merge load/store sequences with addresses: base + index + offset We would also like to merge sequences that involve a variable index like in the example below. int index = *idx++ int i0 = c[index+0]; int i1 = c[index+1]; b[0] = i0; b[1] = i1; By extending the parsing of the base pointer to handle dags that contain a base, index, and offset we can handle examples like the one above. The dag for the code above will look something like: (load (i64 add (i64 copyfromreg %c) (i64 signextend (i8 load %index)))) (load (i64 add (i64 copyfromreg %c) (i64 signextend (i32 add (i32 signextend (i8 load %index)) (i32 1))))) The code that parses the tree ignores the intermediate sign extensions. However, if there is a sign extension it needs to be on all indexes. (load (i64 add (i64 copyfromreg %c) (i64 signextend (add (i8 load %index) (i8 1)))) vs (load (i64 add (i64 copyfromreg %c) (i64 signextend (i32 add (i32 signextend (i8 load %index)) (i32 1))))) radar://13536387 llvm-svn: 178483 2013-04-02 02:12:58 +08:00			`; Make sure that we merge the consecutive load/store sequence below and use a`
			`; word (16 bit) instead of a byte copy.`
			`; CHECK: MergeLoadStoreBaseIndexOffset`
			`; CHECK: movw (%{{.*}},%{{.*}}), [[REG:%[a-z]+]]`
			`; CHECK: movw [[REG]], (%{{.*}})`
			`define void @MergeLoadStoreBaseIndexOffset(i64* %a, i8* %b, i8* %c, i32 %n) {`
			`br label %1`

			`; <label>:1`
			`%.09 = phi i32 [ %n, %0 ], [ %11, %1 ]`
			`%.08 = phi i8* [ %b, %0 ], [ %10, %1 ]`
			`%.0 = phi i64* [ %a, %0 ], [ %2, %1 ]`
			`%2 = getelementptr inbounds i64* %.0, i64 1`
			`%3 = load i64* %.0, align 1`
			`%4 = getelementptr inbounds i8* %c, i64 %3`
			`%5 = load i8* %4, align 1`
			`%6 = add i64 %3, 1`
			`%7 = getelementptr inbounds i8* %c, i64 %6`
			`%8 = load i8* %7, align 1`
			`store i8 %5, i8* %.08, align 1`
			`%9 = getelementptr inbounds i8* %.08, i64 1`
			`store i8 %8, i8* %9, align 1`
			`%10 = getelementptr inbounds i8* %.08, i64 2`
			`%11 = add nsw i32 %.09, -1`
			`%12 = icmp eq i32 %11, 0`
			`br i1 %12, label %13, label %1`

			`; <label>:13`
			`ret void`
			`}`

			`; Make sure that we merge the consecutive load/store sequence below and use a`
			`; word (16 bit) instead of a byte copy even if there are intermediate sign`
			`; extensions.`
			`; CHECK: MergeLoadStoreBaseIndexOffsetSext`
			`; CHECK: movw (%{{.*}},%{{.*}}), [[REG:%[a-z]+]]`
			`; CHECK: movw [[REG]], (%{{.*}})`
			`define void @MergeLoadStoreBaseIndexOffsetSext(i8* %a, i8* %b, i8* %c, i32 %n) {`
			`br label %1`

			`; <label>:1`
			`%.09 = phi i32 [ %n, %0 ], [ %12, %1 ]`
			`%.08 = phi i8* [ %b, %0 ], [ %11, %1 ]`
			`%.0 = phi i8* [ %a, %0 ], [ %2, %1 ]`
			`%2 = getelementptr inbounds i8* %.0, i64 1`
			`%3 = load i8* %.0, align 1`
			`%4 = sext i8 %3 to i64`
			`%5 = getelementptr inbounds i8* %c, i64 %4`
			`%6 = load i8* %5, align 1`
			`%7 = add i64 %4, 1`
			`%8 = getelementptr inbounds i8* %c, i64 %7`
			`%9 = load i8* %8, align 1`
			`store i8 %6, i8* %.08, align 1`
			`%10 = getelementptr inbounds i8* %.08, i64 1`
			`store i8 %9, i8* %10, align 1`
			`%11 = getelementptr inbounds i8* %.08, i64 2`
			`%12 = add nsw i32 %.09, -1`
			`%13 = icmp eq i32 %12, 0`
			`br i1 %13, label %14, label %1`

			`; <label>:14`
			`ret void`
			`}`

			`; However, we can only merge ignore sign extensions when they are on all memory`
			`; computations;`
			`; CHECK: loadStoreBaseIndexOffsetSextNoSex`
			`; CHECK-NOT: movw (%{{.*}},%{{.*}}), [[REG:%[a-z]+]]`
			`; CHECK-NOT: movw [[REG]], (%{{.*}})`
			`define void @loadStoreBaseIndexOffsetSextNoSex(i8* %a, i8* %b, i8* %c, i32 %n) {`
			`br label %1`

			`; <label>:1`
			`%.09 = phi i32 [ %n, %0 ], [ %12, %1 ]`
			`%.08 = phi i8* [ %b, %0 ], [ %11, %1 ]`
			`%.0 = phi i8* [ %a, %0 ], [ %2, %1 ]`
			`%2 = getelementptr inbounds i8* %.0, i64 1`
			`%3 = load i8* %.0, align 1`
			`%4 = sext i8 %3 to i64`
			`%5 = getelementptr inbounds i8* %c, i64 %4`
			`%6 = load i8* %5, align 1`
			`%7 = add i8 %3, 1`
			`%wrap.4 = sext i8 %7 to i64`
			`%8 = getelementptr inbounds i8* %c, i64 %wrap.4`
			`%9 = load i8* %8, align 1`
			`store i8 %6, i8* %.08, align 1`
			`%10 = getelementptr inbounds i8* %.08, i64 1`
			`store i8 %9, i8* %10, align 1`
			`%11 = getelementptr inbounds i8* %.08, i64 2`
			`%12 = add nsw i32 %.09, -1`
			`%13 = icmp eq i32 %12, 0`
			`br i1 %13, label %14, label %1`

			`; <label>:14`
			`ret void`
			`}`