llvm-project/llvm/test/CodeGen/X86/combine-multiplies.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=sse2 | FileCheck %s

; Source file looks something like this:
;
; typedef int AAA[100][100];
;
; void testCombineMultiplies(AAA a,int lll)
; {
;   int LOC = lll + 5;
;
;   a[LOC][LOC] = 11;
;
;   a[LOC][20] = 22;
;   a[LOC+20][20] = 33;
; }
;
; We want to make sure we don't generate 2 multiply instructions,
; one for a[LOC][...] and one for a[LOC+20][...]. visitMUL in DAGCombiner.cpp
; should combine the instructions in such a way as to avoid the extra
; multiply.
;
; Output looks roughly like this:
;
;	movl	8(%esp), %eax
;	movl	12(%esp), %ecx
;	imull	$400, %ecx, %edx        # imm = 0x190
;	leal	(%edx,%eax), %esi
;	movl	$11, 2020(%esi,%ecx,4)
;	movl	$22, 2080(%edx,%eax)
;	movl	$33, 10080(%edx,%eax)

; Function Attrs: nounwind
; Scalar case. %a points at rows of [100 x i32] (400 bytes each); the body
; stores 11 to a[lll+5][lll+5], 22 to a[lll+5][20] and 33 to a[lll+25][20].
; The assertions below expect exactly one imull by 400 (applied to %lll), with
; the +5 / +25 row offsets folded into the store displacements:
;   2020  = 5*400  + 5*4
;   2080  = 5*400  + 20*4
;   10080 = 25*400 + 20*4
define void @testCombineMultiplies([100 x i32]* nocapture %a, i32 %lll) nounwind {
; CHECK-LABEL: testCombineMultiplies:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    pushl %esi
; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT:    imull $400, %ecx, %edx # imm = 0x190
; CHECK-NEXT:    leal (%edx,%eax), %esi
; CHECK-NEXT:    movl $11, 2020(%esi,%ecx,4)
; CHECK-NEXT:    movl $22, 2080(%edx,%eax)
; CHECK-NEXT:    movl $33, 10080(%edx,%eax)
; CHECK-NEXT:    popl %esi
; CHECK-NEXT:    retl
entry:
  ; LOC = lll + 5; used as both the row and the column index of the first store
  ; and as the row index of the second.
  %add = add nsw i32 %lll, 5
  %arrayidx1 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add, i32 %add
  store i32 11, i32* %arrayidx1, align 4
  %arrayidx3 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add, i32 20
  store i32 22, i32* %arrayidx3, align 4
  ; LOC + 20, written directly as lll + 25. This second row index is what would
  ; otherwise need its own multiply by the row size.
  %add4 = add nsw i32 %lll, 25
  %arrayidx6 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add4, i32 20
  store i32 33, i32* %arrayidx6, align 4
  ret void
}


; Test for the same optimization on vector multiplies.
;
; Source looks something like this:
;
; typedef int v4int __attribute__((__vector_size__(16)));
;
; v4int x;
; v4int v2, v3;
; void testCombineMultiplies_splat(v4int v1) {
;   v2 = (v1 + (v4int){ 11, 11, 11, 11 }) * (v4int) {22, 22, 22, 22};
;   v3 = (v1 + (v4int){ 33, 33, 33, 33 }) * (v4int) {22, 22, 22, 22};
;   x = (v1 + (v4int){ 11, 11, 11, 11 });
; }
;
; Output looks something like this:
;
; testCombineMultiplies_splat:                              # @testCombineMultiplies_splat
; # %bb.0:                                 # %entry
; 	movdqa	.LCPI1_0, %xmm1         # xmm1 = [11,11,11,11]
; 	paddd	%xmm0, %xmm1
; 	movdqa	.LCPI1_1, %xmm2         # xmm2 = [22,22,22,22]
; 	pshufd	$245, %xmm0, %xmm3      # xmm3 = xmm0[1,1,3,3]
; 	pmuludq	%xmm2, %xmm0
; 	pshufd	$232, %xmm0, %xmm0      # xmm0 = xmm0[0,2,2,3]
; 	pmuludq	%xmm2, %xmm3
; 	pshufd	$232, %xmm3, %xmm2      # xmm2 = xmm3[0,2,2,3]
; 	punpckldq	%xmm2, %xmm0    # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; 	movdqa	.LCPI1_2, %xmm2         # xmm2 = [242,242,242,242]
;	paddd	%xmm0, %xmm2
;	paddd	.LCPI1_3, %xmm0
;	movdqa	%xmm2, v2
;	movdqa	%xmm0, v3
;	movdqa	%xmm1, x
;	retl
;
; Again, we want to make sure we don't generate two different multiplies.
; We should have a single multiply for "v1 * {22, 22, 22, 22}" (made up of two
; pmuludq instructions), followed by two adds. Without this optimization, we'd
; do 2 adds, followed by 2 multiplies (i.e. 4 pmuludq instructions).

; Store destinations used by the vector tests in this file: each test writes
; its first product (%mul1) to @v2, its second product (%mul2) to @v3, and the
; add result that feeds the first product (%add1) to @x.
@v2 = common global <4 x i32> zeroinitializer, align 16
@v3 = common global <4 x i32> zeroinitializer, align 16
@x = common global <4 x i32> zeroinitializer, align 16

; Function Attrs: nounwind
; Splat-vector case. %add1 has two users (%mul1 and the store to @x), and
; %mul1 / %mul2 multiply two different adds of %v1 by the same splat of 22.
; The assertions below expect a single v1 * 22 (two pmuludq plus shuffles),
; followed by two paddd that add the folded constants. The first addend is
; [242,242,242,242], i.e. 11 * 22 per lane. The second is read from a
; constant-pool entry whose contents the assertions do not pin; 33 * 22 = 726
; per lane would be the matching value.
define void @testCombineMultiplies_splat(<4 x i32> %v1) nounwind {
; CHECK-LABEL: testCombineMultiplies_splat:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [11,11,11,11]
; CHECK-NEXT:    paddd %xmm0, %xmm1
; CHECK-NEXT:    movdqa {{.*#+}} xmm2 = [22,22,22,22]
; CHECK-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; CHECK-NEXT:    pmuludq %xmm2, %xmm0
; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-NEXT:    pmuludq %xmm2, %xmm3
; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
; CHECK-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; CHECK-NEXT:    movdqa {{.*#+}} xmm2 = [242,242,242,242]
; CHECK-NEXT:    paddd %xmm0, %xmm2
; CHECK-NEXT:    paddd {{\.LCPI.*}}, %xmm0
; CHECK-NEXT:    movdqa %xmm2, v2
; CHECK-NEXT:    movdqa %xmm0, v3
; CHECK-NEXT:    movdqa %xmm1, x
; CHECK-NEXT:    retl
entry:
  ; Shared add: feeds %mul1 and is also stored to @x, so the expected code
  ; still computes it on its own (the first paddd, into xmm1).
  %add1 = add <4 x i32> %v1, <i32 11, i32 11, i32 11, i32 11>
  %mul1 = mul <4 x i32> %add1, <i32 22, i32 22, i32 22, i32 22>
  ; Second add of %v1; its only user is %mul2.
  %add2 = add <4 x i32> %v1, <i32 33, i32 33, i32 33, i32 33>
  %mul2 = mul <4 x i32> %add2, <i32 22, i32 22, i32 22, i32 22>
  store <4 x i32> %mul1, <4 x i32>* @v2, align 16
  store <4 x i32> %mul2, <4 x i32>* @v3, align 16
  store <4 x i32> %add1, <4 x i32>* @x, align 16
  ret void
}

; Finally, check the non-splatted vector case. This is very similar
; to the previous test case, except for the vector values.

; Function Attrs: nounwind
; Non-splat vector case: two different adds of %v1 are multiplied by the same
; constant vector <22,33,44,55>, and the first add (%add1) is also stored to
; @x. The assertions below expect one multiply of v1 by <22,33,44,55> (two
; pmuludq plus shuffles) followed by two paddd of folded constants. The first
; addend, [242,726,1452,2420], is the lane-wise product of <11,22,33,44> and
; <22,33,44,55>. The second comes from a constant-pool entry whose contents
; the assertions do not pin; <726,1452,2420,3630> would be the matching value.
; Because the multiplier lanes differ, the expected code also shuffles the odd
; lanes of the multiplier (pshufd on xmm2) before the second pmuludq.
define void @testCombineMultiplies_non_splat(<4 x i32> %v1) nounwind {
; CHECK-LABEL: testCombineMultiplies_non_splat:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [11,22,33,44]
; CHECK-NEXT:    paddd %xmm0, %xmm1
; CHECK-NEXT:    movdqa {{.*#+}} xmm2 = [22,33,44,55]
; CHECK-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; CHECK-NEXT:    pmuludq %xmm2, %xmm0
; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; CHECK-NEXT:    pmuludq %xmm3, %xmm2
; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; CHECK-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; CHECK-NEXT:    movdqa {{.*#+}} xmm2 = [242,726,1452,2420]
; CHECK-NEXT:    paddd %xmm0, %xmm2
; CHECK-NEXT:    paddd {{\.LCPI.*}}, %xmm0
; CHECK-NEXT:    movdqa %xmm2, v2
; CHECK-NEXT:    movdqa %xmm0, v3
; CHECK-NEXT:    movdqa %xmm1, x
; CHECK-NEXT:    retl
entry:
  ; Shared add: feeds %mul1 and is also stored to @x, so the expected code
  ; still computes it on its own (the first paddd, into xmm1).
  %add1 = add <4 x i32> %v1, <i32 11, i32 22, i32 33, i32 44>
  %mul1 = mul <4 x i32> %add1, <i32 22, i32 33, i32 44, i32 55>
  ; Second add of %v1; its only user is %mul2.
  %add2 = add <4 x i32> %v1, <i32 33, i32 44, i32 55, i32 66>
  %mul2 = mul <4 x i32> %add2, <i32 22, i32 33, i32 44, i32 55>
  store <4 x i32> %mul1, <4 x i32>* @v2, align 16
  store <4 x i32> %mul2, <4 x i32>* @v3, align 16
  store <4 x i32> %add1, <4 x i32>* @x, align 16
  ret void
}
[X86][SSE] Regenerate multiple combine tests llvm-svn: 281973 2016-09-20 22:42:45 +08:00			`; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py`
			`; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=sse2 \| FileCheck %s`
[X86] - Catch extra combine opportunities for redundant imuls. When we fold "mul ((add x, c1), c1)" -> "add ((mul x, c2), c1*c2)", we bail if (add x, c1) has multiple users which would result in an extra add instruction. In such cases, this patch adds a check to see if we can eliminate a multiply instruction in exchange for the extra add. I also added the capability of doing the existing optimization with non-splatted vectors (splatted also works). Differential Revision: http://reviews.llvm.org/D13740 llvm-svn: 251028 2015-10-23 00:14:45 +08:00
			`; Source file looks something like this:`
			`;`
			`; typedef int AAA[100][100];`
			`;`
			`; void testCombineMultiplies(AAA a,int lll)`
			`; {`
			`; int LOC = lll + 5;`
			`;`
			`; a[LOC][LOC] = 11;`
			`;`
			`; a[LOC][20] = 22;`
			`; a[LOC+20][20] = 33;`
			`; }`
			`;`
			`; We want to make sure we don't generate 2 multiply instructions,`
			`; one for a[LOC][] and one for a[LOC+20]. visitMUL in DAGCombiner.cpp`
			`; should combine the instructions in such a way to avoid the extra`
			`; multiply.`
			`;`
			`; Output looks roughly like this:`
			`;`
			`; movl 8(%esp), %eax`
			`; movl 12(%esp), %ecx`
			`; imull $400, %ecx, %edx # imm = 0x190`
			`; leal (%edx,%eax), %esi`
			`; movl $11, 2020(%esi,%ecx,4)`
			`; movl $22, 2080(%edx,%eax)`
			`; movl $33, 10080(%edx,%eax)`

			`; Function Attrs: nounwind`
[X86][SSE] Regenerate multiple combine tests llvm-svn: 281973 2016-09-20 22:42:45 +08:00			`define void @testCombineMultiplies([100 x i32]* nocapture %a, i32 %lll) nounwind {`
			`; CHECK-LABEL: testCombineMultiplies:`
[CodeGen] Unify MBB reference format in both MIR and debug output As part of the unification of the debug format and the MIR format, print MBB references as '%bb.5'. The MIR printer prints the IR name of a MBB only for block definitions. * find . \( -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" \) -type f -print0 \| xargs -0 sed -i '' -E 's/BB#" << ([a-zA-Z0-9_]+)->getNumber\(\)/" << printMBBReference(\1)/g' find . \( -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" \) -type f -print0 \| xargs -0 sed -i '' -E 's/BB#" << ([a-zA-Z0-9_]+)\.getNumber\(\)/" << printMBBReference(\1)/g' * find . \( -name ".txt" -o -name ".s" -o -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" \) -type f -print0 \| xargs -0 sed -i '' -E 's/BB#([0-9]+)/%bb.\1/g' * grep -nr 'BB#' and fix Differential Revision: https://reviews.llvm.org/D40422 llvm-svn: 319665 2017-12-05 01:18:51 +08:00			`; CHECK: # %bb.0: # %entry`
[X86][SSE] Regenerate multiple combine tests llvm-svn: 281973 2016-09-20 22:42:45 +08:00			`; CHECK-NEXT: pushl %esi`
			`; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax`
			`; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx`
			`; CHECK-NEXT: imull $400, %ecx, %edx # imm = 0x190`
[DAG] Refactor DAGCombiner::ReassociateOps Summary: Extract the logic for doing reassociations from DAGCombiner::reassociateOps into a helper function DAGCombiner::reassociateOpsCommutative, and use that helper to trigger reassociation on the original operand order, or the commuted operand order. Codegen is not identical since the operand order will be different when doing the reassociations for the commuted case. That causes some unfortunate churn in some test cases. Apart from that this should be NFC. Reviewers: spatel, craig.topper, tstellar Reviewed By: spatel Subscribers: dmgreen, dschuff, jvesely, nhaehnle, javed.absar, sbc100, jgravelle-google, hiraditya, aheejin, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D61199 llvm-svn: 359476 2019-04-30 01:50:10 +08:00			`; CHECK-NEXT: leal (%edx,%eax), %esi`
[X86][SSE] Regenerate multiple combine tests llvm-svn: 281973 2016-09-20 22:42:45 +08:00			`; CHECK-NEXT: movl $11, 2020(%esi,%ecx,4)`
[DAG] Refactor DAGCombiner::ReassociateOps Summary: Extract the logic for doing reassociations from DAGCombiner::reassociateOps into a helper function DAGCombiner::reassociateOpsCommutative, and use that helper to trigger reassociation on the original operand order, or the commuted operand order. Codegen is not identical since the operand order will be different when doing the reassociations for the commuted case. That causes some unfortunate churn in some test cases. Apart from that this should be NFC. Reviewers: spatel, craig.topper, tstellar Reviewed By: spatel Subscribers: dmgreen, dschuff, jvesely, nhaehnle, javed.absar, sbc100, jgravelle-google, hiraditya, aheejin, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D61199 llvm-svn: 359476 2019-04-30 01:50:10 +08:00			`; CHECK-NEXT: movl $22, 2080(%edx,%eax)`
			`; CHECK-NEXT: movl $33, 10080(%edx,%eax)`
[X86][SSE] Regenerate multiple combine tests llvm-svn: 281973 2016-09-20 22:42:45 +08:00			`; CHECK-NEXT: popl %esi`
			`; CHECK-NEXT: retl`
[X86] - Catch extra combine opportunities for redundant imuls. When we fold "mul ((add x, c1), c1)" -> "add ((mul x, c2), c1*c2)", we bail if (add x, c1) has multiple users which would result in an extra add instruction. In such cases, this patch adds a check to see if we can eliminate a multiply instruction in exchange for the extra add. I also added the capability of doing the existing optimization with non-splatted vectors (splatted also works). Differential Revision: http://reviews.llvm.org/D13740 llvm-svn: 251028 2015-10-23 00:14:45 +08:00			`entry:`
			`%add = add nsw i32 %lll, 5`
			`%arrayidx1 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add, i32 %add`
			`store i32 11, i32* %arrayidx1, align 4`
			`%arrayidx3 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add, i32 20`
			`store i32 22, i32* %arrayidx3, align 4`
			`%add4 = add nsw i32 %lll, 25`
			`%arrayidx6 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add4, i32 20`
			`store i32 33, i32* %arrayidx6, align 4`
			`ret void`
			`}`


			`; Test for the same optimization on vector multiplies.`
			`;`
			`; Source looks something like this:`
			`;`
			`; typedef int v4int __attribute__((__vector_size__(16)));`
			`;`
			`; v4int x;`
			`; v4int v2, v3;`
			`; void testCombineMultiplies_splat(v4int v1) {`
			`; v2 = (v1 + (v4int){ 11, 11, 11, 11 }) * (v4int) {22, 22, 22, 22};`
			`; v3 = (v1 + (v4int){ 33, 33, 33, 33 }) * (v4int) {22, 22, 22, 22};`
			`; x = (v1 + (v4int){ 11, 11, 11, 11 });`
			`; }`
			`;`
			`; Output looks something like this:`
			`;`
			`; testCombineMultiplies_splat: # @testCombineMultiplies_splat`
[CodeGen] Unify MBB reference format in both MIR and debug output As part of the unification of the debug format and the MIR format, print MBB references as '%bb.5'. The MIR printer prints the IR name of a MBB only for block definitions. * find . \( -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" \) -type f -print0 \| xargs -0 sed -i '' -E 's/BB#" << ([a-zA-Z0-9_]+)->getNumber\(\)/" << printMBBReference(\1)/g' find . \( -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" \) -type f -print0 \| xargs -0 sed -i '' -E 's/BB#" << ([a-zA-Z0-9_]+)\.getNumber\(\)/" << printMBBReference(\1)/g' * find . \( -name ".txt" -o -name ".s" -o -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" \) -type f -print0 \| xargs -0 sed -i '' -E 's/BB#([0-9]+)/%bb.\1/g' * grep -nr 'BB#' and fix Differential Revision: https://reviews.llvm.org/D40422 llvm-svn: 319665 2017-12-05 01:18:51 +08:00			`; # %bb.0: # %entry`
[X86] - Catch extra combine opportunities for redundant imuls. When we fold "mul ((add x, c1), c1)" -> "add ((mul x, c2), c1*c2)", we bail if (add x, c1) has multiple users which would result in an extra add instruction. In such cases, this patch adds a check to see if we can eliminate a multiply instruction in exchange for the extra add. I also added the capability of doing the existing optimization with non-splatted vectors (splatted also works). Differential Revision: http://reviews.llvm.org/D13740 llvm-svn: 251028 2015-10-23 00:14:45 +08:00			`; movdqa .LCPI1_0, %xmm1 # xmm1 = [11,11,11,11]`
			`; paddd %xmm0, %xmm1`
			`; movdqa .LCPI1_1, %xmm2 # xmm2 = [22,22,22,22]`
			`; pshufd $245, %xmm0, %xmm3 # xmm3 = xmm0[1,1,3,3]`
			`; pmuludq %xmm2, %xmm0`
			`; pshufd $232, %xmm0, %xmm0 # xmm0 = xmm0[0,2,2,3]`
			`; pmuludq %xmm2, %xmm3`
			`; pshufd $232, %xmm3, %xmm2 # xmm2 = xmm3[0,2,2,3]`
			`; punpckldq %xmm2, %xmm0 # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]`
			`; movdqa .LCPI1_2, %xmm2 # xmm2 = [242,242,242,242]`
			`; paddd %xmm0, %xmm2`
			`; paddd .LCPI1_3, %xmm0`
			`; movdqa %xmm2, v2`
			`; movdqa %xmm0, v3`
			`; movdqa %xmm1, x`
			`; retl`
			`;`
			`; Again, we want to make sure we don't generate two different multiplies.`
			`; We should have a single multiply for "v1 * {22, 22, 22, 22}" (made up of two`
			`; pmuludq instructions), followed by two adds. Without this optimization, we'd`
			`; do 2 adds, followed by 2 multiplies (i.e. 4 pmuludq instructions).`

			`@v2 = common global <4 x i32> zeroinitializer, align 16`
			`@v3 = common global <4 x i32> zeroinitializer, align 16`
			`@x = common global <4 x i32> zeroinitializer, align 16`

			`; Function Attrs: nounwind`
[X86][SSE] Regenerate multiple combine tests llvm-svn: 281973 2016-09-20 22:42:45 +08:00			`define void @testCombineMultiplies_splat(<4 x i32> %v1) nounwind {`
			`; CHECK-LABEL: testCombineMultiplies_splat:`
[CodeGen] Unify MBB reference format in both MIR and debug output As part of the unification of the debug format and the MIR format, print MBB references as '%bb.5'. The MIR printer prints the IR name of a MBB only for block definitions. * find . \( -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" \) -type f -print0 \| xargs -0 sed -i '' -E 's/BB#" << ([a-zA-Z0-9_]+)->getNumber\(\)/" << printMBBReference(\1)/g' find . \( -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" \) -type f -print0 \| xargs -0 sed -i '' -E 's/BB#" << ([a-zA-Z0-9_]+)\.getNumber\(\)/" << printMBBReference(\1)/g' * find . \( -name ".txt" -o -name ".s" -o -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" \) -type f -print0 \| xargs -0 sed -i '' -E 's/BB#([0-9]+)/%bb.\1/g' * grep -nr 'BB#' and fix Differential Revision: https://reviews.llvm.org/D40422 llvm-svn: 319665 2017-12-05 01:18:51 +08:00			`; CHECK: # %bb.0: # %entry`
[X86][SSE] Regenerate multiple combine tests llvm-svn: 281973 2016-09-20 22:42:45 +08:00			`; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [11,11,11,11]`
			`; CHECK-NEXT: paddd %xmm0, %xmm1`
			`; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [22,22,22,22]`
			`; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]`
			`; CHECK-NEXT: pmuludq %xmm2, %xmm0`
			`; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]`
			`; CHECK-NEXT: pmuludq %xmm2, %xmm3`
			`; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]`
			`; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]`
			`; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [242,242,242,242]`
			`; CHECK-NEXT: paddd %xmm0, %xmm2`
			`; CHECK-NEXT: paddd {{\.LCPI.*}}, %xmm0`
			`; CHECK-NEXT: movdqa %xmm2, v2`
			`; CHECK-NEXT: movdqa %xmm0, v3`
			`; CHECK-NEXT: movdqa %xmm1, x`
			`; CHECK-NEXT: retl`
[X86] - Catch extra combine opportunities for redundant imuls. When we fold "mul ((add x, c1), c1)" -> "add ((mul x, c2), c1*c2)", we bail if (add x, c1) has multiple users which would result in an extra add instruction. In such cases, this patch adds a check to see if we can eliminate a multiply instruction in exchange for the extra add. I also added the capability of doing the existing optimization with non-splatted vectors (splatted also works). Differential Revision: http://reviews.llvm.org/D13740 llvm-svn: 251028 2015-10-23 00:14:45 +08:00			`entry:`
			`%add1 = add <4 x i32> %v1, <i32 11, i32 11, i32 11, i32 11>`
			`%mul1 = mul <4 x i32> %add1, <i32 22, i32 22, i32 22, i32 22>`
			`%add2 = add <4 x i32> %v1, <i32 33, i32 33, i32 33, i32 33>`
			`%mul2 = mul <4 x i32> %add2, <i32 22, i32 22, i32 22, i32 22>`
			`store <4 x i32> %mul1, <4 x i32>* @v2, align 16`
			`store <4 x i32> %mul2, <4 x i32>* @v3, align 16`
			`store <4 x i32> %add1, <4 x i32>* @x, align 16`
			`ret void`
			`}`

			`; Finally, check the non-splatted vector case. This is very similar`
			`; to the previous test case, except for the vector values.`
[X86][SSE] Regenerate multiple combine tests llvm-svn: 281973 2016-09-20 22:42:45 +08:00
[X86] - Catch extra combine opportunities for redundant imuls. When we fold "mul ((add x, c1), c1)" -> "add ((mul x, c2), c1*c2)", we bail if (add x, c1) has multiple users which would result in an extra add instruction. In such cases, this patch adds a check to see if we can eliminate a multiply instruction in exchange for the extra add. I also added the capability of doing the existing optimization with non-splatted vectors (splatted also works). Differential Revision: http://reviews.llvm.org/D13740 llvm-svn: 251028 2015-10-23 00:14:45 +08:00			`; Function Attrs: nounwind`
[X86][SSE] Regenerate multiple combine tests llvm-svn: 281973 2016-09-20 22:42:45 +08:00			`define void @testCombineMultiplies_non_splat(<4 x i32> %v1) nounwind {`
			`; CHECK-LABEL: testCombineMultiplies_non_splat:`
[CodeGen] Unify MBB reference format in both MIR and debug output As part of the unification of the debug format and the MIR format, print MBB references as '%bb.5'. The MIR printer prints the IR name of a MBB only for block definitions. * find . \( -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" \) -type f -print0 \| xargs -0 sed -i '' -E 's/BB#" << ([a-zA-Z0-9_]+)->getNumber\(\)/" << printMBBReference(\1)/g' find . \( -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" \) -type f -print0 \| xargs -0 sed -i '' -E 's/BB#" << ([a-zA-Z0-9_]+)\.getNumber\(\)/" << printMBBReference(\1)/g' * find . \( -name ".txt" -o -name ".s" -o -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" \) -type f -print0 \| xargs -0 sed -i '' -E 's/BB#([0-9]+)/%bb.\1/g' * grep -nr 'BB#' and fix Differential Revision: https://reviews.llvm.org/D40422 llvm-svn: 319665 2017-12-05 01:18:51 +08:00			`; CHECK: # %bb.0: # %entry`
[X86][SSE] Regenerate multiple combine tests llvm-svn: 281973 2016-09-20 22:42:45 +08:00			`; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [11,22,33,44]`
			`; CHECK-NEXT: paddd %xmm0, %xmm1`
			`; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [22,33,44,55]`
			`; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]`
			`; CHECK-NEXT: pmuludq %xmm2, %xmm0`
			`; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]`
			`; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]`
			`; CHECK-NEXT: pmuludq %xmm3, %xmm2`
			`; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]`
			`; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]`
			`; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [242,726,1452,2420]`
			`; CHECK-NEXT: paddd %xmm0, %xmm2`
			`; CHECK-NEXT: paddd {{\.LCPI.*}}, %xmm0`
			`; CHECK-NEXT: movdqa %xmm2, v2`
			`; CHECK-NEXT: movdqa %xmm0, v3`
			`; CHECK-NEXT: movdqa %xmm1, x`
			`; CHECK-NEXT: retl`
[X86] - Catch extra combine opportunities for redundant imuls. When we fold "mul ((add x, c1), c1)" -> "add ((mul x, c2), c1*c2)", we bail if (add x, c1) has multiple users which would result in an extra add instruction. In such cases, this patch adds a check to see if we can eliminate a multiply instruction in exchange for the extra add. I also added the capability of doing the existing optimization with non-splatted vectors (splatted also works). Differential Revision: http://reviews.llvm.org/D13740 llvm-svn: 251028 2015-10-23 00:14:45 +08:00			`entry:`
			`%add1 = add <4 x i32> %v1, <i32 11, i32 22, i32 33, i32 44>`
			`%mul1 = mul <4 x i32> %add1, <i32 22, i32 33, i32 44, i32 55>`
			`%add2 = add <4 x i32> %v1, <i32 33, i32 44, i32 55, i32 66>`
			`%mul2 = mul <4 x i32> %add2, <i32 22, i32 33, i32 44, i32 55>`
			`store <4 x i32> %mul1, <4 x i32>* @v2, align 16`
			`store <4 x i32> %mul2, <4 x i32>* @v3, align 16`
			`store <4 x i32> %add1, <4 x i32>* @x, align 16`
			`ret void`
			`}`