; llvm-project/llvm/test/CodeGen/X86/combine-multiplies.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=sse2 | FileCheck %s

; Source file looks something like this:
;
; typedef int AAA[100][100];
;
; void testCombineMultiplies(AAA a,int lll)
; {
;   int LOC = lll + 5;
;
;   a[LOC][LOC] = 11;
;
;   a[LOC][20] = 22;
;   a[LOC+20][20] = 33;
; }
;
; We want to make sure we don't generate 2 multiply instructions,
; one for a[LOC][...] and one for a[LOC+20][...]. visitMUL in DAGCombiner.cpp
; should combine the instructions in such a way as to avoid the extra
; multiply.
;
; Output looks roughly like this:
;
;	movl	8(%esp), %eax
;	movl	12(%esp), %ecx
;	imull	$400, %ecx, %edx        # imm = 0x190
;	leal	(%edx,%eax), %esi
;	movl	$11, 2020(%esi,%ecx,4)
;	movl	$22, 2080(%edx,%eax)
;	movl	$33, 10080(%edx,%eax)

; Function Attrs: nounwind
; Scalar case: three i32 stores into an array of [100 x i32] rows, at rows
; lll+5 and lll+25. One row is 100 * 4 = 400 bytes, so every address below is
; a + lll*400 plus a constant displacement (plus lll*4 for the first store).
; The assertions pin the whole function body and contain exactly one imull.
define void @testCombineMultiplies([100 x i32]* nocapture %a, i32 %lll) nounwind {
; CHECK-LABEL: testCombineMultiplies:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    pushl %esi
; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT:    imull $400, %ecx, %edx # imm = 0x190
; CHECK-NEXT:    leal (%eax,%edx), %esi
; CHECK-NEXT:    movl $11, 2020(%esi,%ecx,4)
; CHECK-NEXT:    movl $22, 2080(%eax,%edx)
; CHECK-NEXT:    movl $33, 10080(%eax,%edx)
; CHECK-NEXT:    popl %esi
; CHECK-NEXT:    retl
entry:
  ; LOC = lll + 5; used below as both a row index and a column index.
  %add = add nsw i32 %lll, 5
  ; a[LOC][LOC] = 11: constant part of the offset is 5*400 + 5*4 = 2020.
  %arrayidx1 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add, i32 %add
  store i32 11, i32* %arrayidx1, align 4
  ; a[LOC][20] = 22: constant part of the offset is 5*400 + 20*4 = 2080.
  %arrayidx3 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add, i32 20
  store i32 22, i32* %arrayidx3, align 4
  ; LOC + 20, expressed directly as lll + 25 rather than as %add + 20.
  %add4 = add nsw i32 %lll, 25
  ; a[LOC+20][20] = 33: constant part of the offset is 25*400 + 20*4 = 10080.
  %arrayidx6 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add4, i32 20
  store i32 33, i32* %arrayidx6, align 4
  ret void
}


; Test for the same optimization on vector multiplies.
;
; Source looks something like this:
;
; typedef int v4int __attribute__((__vector_size__(16)));
;
; v4int x;
; v4int v2, v3;
; void testCombineMultiplies_splat(v4int v1) {
;   v2 = (v1 + (v4int){ 11, 11, 11, 11 }) * (v4int) {22, 22, 22, 22};
;   v3 = (v1 + (v4int){ 33, 33, 33, 33 }) * (v4int) {22, 22, 22, 22};
;   x = (v1 + (v4int){ 11, 11, 11, 11 });
; }
;
; Output looks something like this:
;
; testCombineMultiplies_splat:                              # @testCombineMultiplies_splat
; # %bb.0:                                 # %entry
; 	movdqa	.LCPI1_0, %xmm1         # xmm1 = [11,11,11,11]
; 	paddd	%xmm0, %xmm1
; 	movdqa	.LCPI1_1, %xmm2         # xmm2 = [22,22,22,22]
; 	pshufd	$245, %xmm0, %xmm3      # xmm3 = xmm0[1,1,3,3]
; 	pmuludq	%xmm2, %xmm0
; 	pshufd	$232, %xmm0, %xmm0      # xmm0 = xmm0[0,2,2,3]
; 	pmuludq	%xmm2, %xmm3
; 	pshufd	$232, %xmm3, %xmm2      # xmm2 = xmm3[0,2,2,3]
; 	punpckldq	%xmm2, %xmm0    # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; 	movdqa	.LCPI1_2, %xmm2         # xmm2 = [242,242,242,242]
;	paddd	%xmm0, %xmm2
;	paddd	.LCPI1_3, %xmm0
;	movdqa	%xmm2, v2
;	movdqa	%xmm0, v3
;	movdqa	%xmm1, x
;	retl
;
; Again, we want to make sure we don't generate two different multiplies.
; We should have a single multiply for "v1 * {22, 22, 22, 22}" (made up of two
; pmuludq instructions), followed by two adds. Without this optimization, we'd
; do 2 adds, followed by 2 multiplies (i.e. 4 pmuludq instructions).

; Store destinations shared by both vector tests below: @v2 and @v3 receive
; the two products (%mul1, %mul2) and @x receives the first sum (%add1).
@v2 = common global <4 x i32> zeroinitializer, align 16
@v3 = common global <4 x i32> zeroinitializer, align 16
@x = common global <4 x i32> zeroinitializer, align 16

; Function Attrs: nounwind
; Splat-vector case: (v1 + 11) * 22 and (v1 + 33) * 22, where the first sum is
; also stored on its own and therefore has more than one user. The assertions
; pin the whole body: one v1 * 22 multiply (the two pmuludq instructions)
; whose result feeds two paddd instructions that add the folded constants.
define void @testCombineMultiplies_splat(<4 x i32> %v1) nounwind {
; CHECK-LABEL: testCombineMultiplies_splat:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [11,11,11,11]
; CHECK-NEXT:    paddd %xmm0, %xmm1
; CHECK-NEXT:    movdqa {{.*#+}} xmm2 = [22,22,22,22]
; CHECK-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; CHECK-NEXT:    pmuludq %xmm2, %xmm0
; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-NEXT:    pmuludq %xmm2, %xmm3
; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
; CHECK-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; CHECK-NEXT:    movdqa {{.*#+}} xmm2 = [242,242,242,242]
; CHECK-NEXT:    paddd %xmm0, %xmm2
; CHECK-NEXT:    paddd {{\.LCPI.*}}, %xmm0
; CHECK-NEXT:    movdqa %xmm2, v2
; CHECK-NEXT:    movdqa %xmm0, v3
; CHECK-NEXT:    movdqa %xmm1, x
; CHECK-NEXT:    retl
entry:
  ; %add1 is used twice: by %mul1 and by the store to @x at the end.
  %add1 = add <4 x i32> %v1, <i32 11, i32 11, i32 11, i32 11>
  ; (v1 + 11) * 22 == v1*22 + 242; 242 is the splat constant expected above.
  %mul1 = mul <4 x i32> %add1, <i32 22, i32 22, i32 22, i32 22>
  %add2 = add <4 x i32> %v1, <i32 33, i32 33, i32 33, i32 33>
  ; (v1 + 33) * 22 == v1*22 + 726; the expected output adds this constant as a
  ; constant-pool memory operand, so its value is not spelled out there.
  %mul2 = mul <4 x i32> %add2, <i32 22, i32 22, i32 22, i32 22>
  store <4 x i32> %mul1, <4 x i32>* @v2, align 16
  store <4 x i32> %mul2, <4 x i32>* @v3, align 16
  store <4 x i32> %add1, <4 x i32>* @x, align 16
  ret void
}

; Finally, check the non-splatted vector case. This is very similar
; to the previous test case, except for the vector values.

; Function Attrs: nounwind
; Non-splat vector case: same shape as the splat test, but every constant
; vector has distinct lanes. The assertions pin the whole body: a single
; v1 * <22,33,44,55> multiply (two pmuludq instructions) followed by two paddd
; instructions that add the per-lane folded constants.
define void @testCombineMultiplies_non_splat(<4 x i32> %v1) nounwind {
; CHECK-LABEL: testCombineMultiplies_non_splat:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [11,22,33,44]
; CHECK-NEXT:    paddd %xmm0, %xmm1
; CHECK-NEXT:    movdqa {{.*#+}} xmm2 = [22,33,44,55]
; CHECK-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; CHECK-NEXT:    pmuludq %xmm2, %xmm0
; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; CHECK-NEXT:    pmuludq %xmm3, %xmm2
; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; CHECK-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; CHECK-NEXT:    movdqa {{.*#+}} xmm2 = [242,726,1452,2420]
; CHECK-NEXT:    paddd %xmm0, %xmm2
; CHECK-NEXT:    paddd {{\.LCPI.*}}, %xmm0
; CHECK-NEXT:    movdqa %xmm2, v2
; CHECK-NEXT:    movdqa %xmm0, v3
; CHECK-NEXT:    movdqa %xmm1, x
; CHECK-NEXT:    retl
entry:
  ; %add1 is used twice: by %mul1 and by the store to @x at the end.
  %add1 = add <4 x i32> %v1, <i32 11, i32 22, i32 33, i32 44>
  ; Per-lane folded constant: 11*22, 22*33, 33*44, 44*55 = 242, 726, 1452, 2420,
  ; which is the vector expected above.
  %mul1 = mul <4 x i32> %add1, <i32 22, i32 33, i32 44, i32 55>
  %add2 = add <4 x i32> %v1, <i32 33, i32 44, i32 55, i32 66>
  ; Per-lane folded constant: 33*22, 44*33, 55*44, 66*55 = 726, 1452, 2420, 3630;
  ; the expected output adds it as a constant-pool memory operand, so the
  ; value is not spelled out there.
  %mul2 = mul <4 x i32> %add2, <i32 22, i32 33, i32 44, i32 55>
  store <4 x i32> %mul1, <4 x i32>* @v2, align 16
  store <4 x i32> %mul2, <4 x i32>* @v3, align 16
  store <4 x i32> %add1, <4 x i32>* @x, align 16
  ret void
}
[X86][SSE] Regenerate multiple combine tests llvm-svn: 281973 2016-09-20 22:42:45 +08:00			`; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py`
			`; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=sse2 \| FileCheck %s`
[X86] - Catch extra combine opportunities for redundant imuls. When we fold "mul ((add x, c1), c1)" -> "add ((mul x, c2), c1*c2)", we bail if (add x, c1) has multiple users which would result in an extra add instruction. In such cases, this patch adds a check to see if we can eliminate a multiply instruction in exchange for the extra add. I also added the capability of doing the existing optimization with non-splatted vectors (splatted also works). Differential Revision: http://reviews.llvm.org/D13740 llvm-svn: 251028 2015-10-23 00:14:45 +08:00
			`; Source file looks something like this:`
			`;`
			`; typedef int AAA[100][100];`
			`;`
			`; void testCombineMultiplies(AAA a,int lll)`
			`; {`
			`; int LOC = lll + 5;`
			`;`
			`; a[LOC][LOC] = 11;`
			`;`
			`; a[LOC][20] = 22;`
			`; a[LOC+20][20] = 33;`
			`; }`
			`;`
			`; We want to make sure we don't generate 2 multiply instructions,`
			`; one for a[LOC][] and one for a[LOC+20]. visitMUL in DAGCombiner.cpp`
			`; should combine the instructions in such a way to avoid the extra`
			`; multiply.`
			`;`
			`; Output looks roughly like this:`
			`;`
			`; movl 8(%esp), %eax`
			`; movl 12(%esp), %ecx`
			`; imull $400, %ecx, %edx # imm = 0x190`
			`; leal (%edx,%eax), %esi`
			`; movl $11, 2020(%esi,%ecx,4)`
			`; movl $22, 2080(%edx,%eax)`
			`; movl $33, 10080(%edx,%eax)`

			`; Function Attrs: nounwind`
[X86][SSE] Regenerate multiple combine tests llvm-svn: 281973 2016-09-20 22:42:45 +08:00			`define void @testCombineMultiplies([100 x i32]* nocapture %a, i32 %lll) nounwind {`
			`; CHECK-LABEL: testCombineMultiplies:`
[CodeGen] Unify MBB reference format in both MIR and debug output As part of the unification of the debug format and the MIR format, print MBB references as '%bb.5'. The MIR printer prints the IR name of a MBB only for block definitions. * find . \( -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" \) -type f -print0 \| xargs -0 sed -i '' -E 's/BB#" << ([a-zA-Z0-9_]+)->getNumber\(\)/" << printMBBReference(\1)/g' find . \( -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" \) -type f -print0 \| xargs -0 sed -i '' -E 's/BB#" << ([a-zA-Z0-9_]+)\.getNumber\(\)/" << printMBBReference(\1)/g' * find . \( -name ".txt" -o -name ".s" -o -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" \) -type f -print0 \| xargs -0 sed -i '' -E 's/BB#([0-9]+)/%bb.\1/g' * grep -nr 'BB#' and fix Differential Revision: https://reviews.llvm.org/D40422 llvm-svn: 319665 2017-12-05 01:18:51 +08:00			`; CHECK: # %bb.0: # %entry`
[X86][SSE] Regenerate multiple combine tests llvm-svn: 281973 2016-09-20 22:42:45 +08:00			`; CHECK-NEXT: pushl %esi`
			`; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax`
			`; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx`
			`; CHECK-NEXT: imull $400, %ecx, %edx # imm = 0x190`
			`; CHECK-NEXT: leal (%eax,%edx), %esi`
			`; CHECK-NEXT: movl $11, 2020(%esi,%ecx,4)`
			`; CHECK-NEXT: movl $22, 2080(%eax,%edx)`
			`; CHECK-NEXT: movl $33, 10080(%eax,%edx)`
			`; CHECK-NEXT: popl %esi`
			`; CHECK-NEXT: retl`
[X86] - Catch extra combine opportunities for redundant imuls. When we fold "mul ((add x, c1), c1)" -> "add ((mul x, c2), c1*c2)", we bail if (add x, c1) has multiple users which would result in an extra add instruction. In such cases, this patch adds a check to see if we can eliminate a multiply instruction in exchange for the extra add. I also added the capability of doing the existing optimization with non-splatted vectors (splatted also works). Differential Revision: http://reviews.llvm.org/D13740 llvm-svn: 251028 2015-10-23 00:14:45 +08:00			`entry:`
			`%add = add nsw i32 %lll, 5`
			`%arrayidx1 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add, i32 %add`
			`store i32 11, i32* %arrayidx1, align 4`
			`%arrayidx3 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add, i32 20`
			`store i32 22, i32* %arrayidx3, align 4`
			`%add4 = add nsw i32 %lll, 25`
			`%arrayidx6 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add4, i32 20`
			`store i32 33, i32* %arrayidx6, align 4`
			`ret void`
			`}`


			`; Test for the same optimization on vector multiplies.`
			`;`
			`; Source looks something like this:`
			`;`
			`; typedef int v4int __attribute__((__vector_size__(16)));`
			`;`
			`; v4int x;`
			`; v4int v2, v3;`
			`; void testCombineMultiplies_splat(v4int v1) {`
			`; v2 = (v1 + (v4int){ 11, 11, 11, 11 }) * (v4int) {22, 22, 22, 22};`
			`; v3 = (v1 + (v4int){ 33, 33, 33, 33 }) * (v4int) {22, 22, 22, 22};`
			`; x = (v1 + (v4int){ 11, 11, 11, 11 });`
			`; }`
			`;`
			`; Output looks something like this:`
			`;`
			`; testCombineMultiplies_splat: # @testCombineMultiplies_splat`
[CodeGen] Unify MBB reference format in both MIR and debug output As part of the unification of the debug format and the MIR format, print MBB references as '%bb.5'. The MIR printer prints the IR name of a MBB only for block definitions. * find . \( -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" \) -type f -print0 \| xargs -0 sed -i '' -E 's/BB#" << ([a-zA-Z0-9_]+)->getNumber\(\)/" << printMBBReference(\1)/g' find . \( -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" \) -type f -print0 \| xargs -0 sed -i '' -E 's/BB#" << ([a-zA-Z0-9_]+)\.getNumber\(\)/" << printMBBReference(\1)/g' * find . \( -name ".txt" -o -name ".s" -o -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" \) -type f -print0 \| xargs -0 sed -i '' -E 's/BB#([0-9]+)/%bb.\1/g' * grep -nr 'BB#' and fix Differential Revision: https://reviews.llvm.org/D40422 llvm-svn: 319665 2017-12-05 01:18:51 +08:00			`; # %bb.0: # %entry`
[X86] - Catch extra combine opportunities for redundant imuls. When we fold "mul ((add x, c1), c1)" -> "add ((mul x, c2), c1*c2)", we bail if (add x, c1) has multiple users which would result in an extra add instruction. In such cases, this patch adds a check to see if we can eliminate a multiply instruction in exchange for the extra add. I also added the capability of doing the existing optimization with non-splatted vectors (splatted also works). Differential Revision: http://reviews.llvm.org/D13740 llvm-svn: 251028 2015-10-23 00:14:45 +08:00			`; movdqa .LCPI1_0, %xmm1 # xmm1 = [11,11,11,11]`
			`; paddd %xmm0, %xmm1`
			`; movdqa .LCPI1_1, %xmm2 # xmm2 = [22,22,22,22]`
			`; pshufd $245, %xmm0, %xmm3 # xmm3 = xmm0[1,1,3,3]`
			`; pmuludq %xmm2, %xmm0`
			`; pshufd $232, %xmm0, %xmm0 # xmm0 = xmm0[0,2,2,3]`
			`; pmuludq %xmm2, %xmm3`
			`; pshufd $232, %xmm3, %xmm2 # xmm2 = xmm3[0,2,2,3]`
			`; punpckldq %xmm2, %xmm0 # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]`
			`; movdqa .LCPI1_2, %xmm2 # xmm2 = [242,242,242,242]`
			`; paddd %xmm0, %xmm2`
			`; paddd .LCPI1_3, %xmm0`
			`; movdqa %xmm2, v2`
			`; movdqa %xmm0, v3`
			`; movdqa %xmm1, x`
			`; retl`
			`;`
			`; Again, we want to make sure we don't generate two different multiplies.`
			`; We should have a single multiply for "v1 * {22, 22, 22, 22}" (made up of two`
			`; pmuludq instructions), followed by two adds. Without this optimization, we'd`
			`; do 2 adds, followed by 2 multiplies (i.e. 4 pmuludq instructions).`

			`@v2 = common global <4 x i32> zeroinitializer, align 16`
			`@v3 = common global <4 x i32> zeroinitializer, align 16`
			`@x = common global <4 x i32> zeroinitializer, align 16`

			`; Function Attrs: nounwind`
[X86][SSE] Regenerate multiple combine tests llvm-svn: 281973 2016-09-20 22:42:45 +08:00			`define void @testCombineMultiplies_splat(<4 x i32> %v1) nounwind {`
			`; CHECK-LABEL: testCombineMultiplies_splat:`
[CodeGen] Unify MBB reference format in both MIR and debug output As part of the unification of the debug format and the MIR format, print MBB references as '%bb.5'. The MIR printer prints the IR name of a MBB only for block definitions. * find . \( -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" \) -type f -print0 \| xargs -0 sed -i '' -E 's/BB#" << ([a-zA-Z0-9_]+)->getNumber\(\)/" << printMBBReference(\1)/g' find . \( -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" \) -type f -print0 \| xargs -0 sed -i '' -E 's/BB#" << ([a-zA-Z0-9_]+)\.getNumber\(\)/" << printMBBReference(\1)/g' * find . \( -name ".txt" -o -name ".s" -o -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" \) -type f -print0 \| xargs -0 sed -i '' -E 's/BB#([0-9]+)/%bb.\1/g' * grep -nr 'BB#' and fix Differential Revision: https://reviews.llvm.org/D40422 llvm-svn: 319665 2017-12-05 01:18:51 +08:00			`; CHECK: # %bb.0: # %entry`
[X86][SSE] Regenerate multiple combine tests llvm-svn: 281973 2016-09-20 22:42:45 +08:00			`; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [11,11,11,11]`
			`; CHECK-NEXT: paddd %xmm0, %xmm1`
			`; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [22,22,22,22]`
			`; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]`
			`; CHECK-NEXT: pmuludq %xmm2, %xmm0`
			`; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]`
			`; CHECK-NEXT: pmuludq %xmm2, %xmm3`
			`; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]`
			`; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]`
			`; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [242,242,242,242]`
			`; CHECK-NEXT: paddd %xmm0, %xmm2`
			`; CHECK-NEXT: paddd {{\.LCPI.*}}, %xmm0`
			`; CHECK-NEXT: movdqa %xmm2, v2`
			`; CHECK-NEXT: movdqa %xmm0, v3`
			`; CHECK-NEXT: movdqa %xmm1, x`
			`; CHECK-NEXT: retl`
[X86] - Catch extra combine opportunities for redundant imuls. When we fold "mul ((add x, c1), c1)" -> "add ((mul x, c2), c1*c2)", we bail if (add x, c1) has multiple users which would result in an extra add instruction. In such cases, this patch adds a check to see if we can eliminate a multiply instruction in exchange for the extra add. I also added the capability of doing the existing optimization with non-splatted vectors (splatted also works). Differential Revision: http://reviews.llvm.org/D13740 llvm-svn: 251028 2015-10-23 00:14:45 +08:00			`entry:`
			`%add1 = add <4 x i32> %v1, <i32 11, i32 11, i32 11, i32 11>`
			`%mul1 = mul <4 x i32> %add1, <i32 22, i32 22, i32 22, i32 22>`
			`%add2 = add <4 x i32> %v1, <i32 33, i32 33, i32 33, i32 33>`
			`%mul2 = mul <4 x i32> %add2, <i32 22, i32 22, i32 22, i32 22>`
			`store <4 x i32> %mul1, <4 x i32>* @v2, align 16`
			`store <4 x i32> %mul2, <4 x i32>* @v3, align 16`
			`store <4 x i32> %add1, <4 x i32>* @x, align 16`
			`ret void`
			`}`

			`; Finally, check the non-splatted vector case. This is very similar`
			`; to the previous test case, except for the vector values.`
[X86][SSE] Regenerate multiple combine tests llvm-svn: 281973 2016-09-20 22:42:45 +08:00
[X86] - Catch extra combine opportunities for redundant imuls. When we fold "mul ((add x, c1), c1)" -> "add ((mul x, c2), c1*c2)", we bail if (add x, c1) has multiple users which would result in an extra add instruction. In such cases, this patch adds a check to see if we can eliminate a multiply instruction in exchange for the extra add. I also added the capability of doing the existing optimization with non-splatted vectors (splatted also works). Differential Revision: http://reviews.llvm.org/D13740 llvm-svn: 251028 2015-10-23 00:14:45 +08:00			`; Function Attrs: nounwind`
[X86][SSE] Regenerate multiple combine tests llvm-svn: 281973 2016-09-20 22:42:45 +08:00			`define void @testCombineMultiplies_non_splat(<4 x i32> %v1) nounwind {`
			`; CHECK-LABEL: testCombineMultiplies_non_splat:`
[CodeGen] Unify MBB reference format in both MIR and debug output As part of the unification of the debug format and the MIR format, print MBB references as '%bb.5'. The MIR printer prints the IR name of a MBB only for block definitions. * find . \( -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" \) -type f -print0 \| xargs -0 sed -i '' -E 's/BB#" << ([a-zA-Z0-9_]+)->getNumber\(\)/" << printMBBReference(\1)/g' find . \( -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" \) -type f -print0 \| xargs -0 sed -i '' -E 's/BB#" << ([a-zA-Z0-9_]+)\.getNumber\(\)/" << printMBBReference(\1)/g' * find . \( -name ".txt" -o -name ".s" -o -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" \) -type f -print0 \| xargs -0 sed -i '' -E 's/BB#([0-9]+)/%bb.\1/g' * grep -nr 'BB#' and fix Differential Revision: https://reviews.llvm.org/D40422 llvm-svn: 319665 2017-12-05 01:18:51 +08:00			`; CHECK: # %bb.0: # %entry`
[X86][SSE] Regenerate multiple combine tests llvm-svn: 281973 2016-09-20 22:42:45 +08:00			`; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [11,22,33,44]`
			`; CHECK-NEXT: paddd %xmm0, %xmm1`
			`; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [22,33,44,55]`
			`; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]`
			`; CHECK-NEXT: pmuludq %xmm2, %xmm0`
			`; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]`
			`; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]`
			`; CHECK-NEXT: pmuludq %xmm3, %xmm2`
			`; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]`
			`; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]`
			`; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [242,726,1452,2420]`
			`; CHECK-NEXT: paddd %xmm0, %xmm2`
			`; CHECK-NEXT: paddd {{\.LCPI.*}}, %xmm0`
			`; CHECK-NEXT: movdqa %xmm2, v2`
			`; CHECK-NEXT: movdqa %xmm0, v3`
			`; CHECK-NEXT: movdqa %xmm1, x`
			`; CHECK-NEXT: retl`
[X86] - Catch extra combine opportunities for redundant imuls. When we fold "mul ((add x, c1), c1)" -> "add ((mul x, c2), c1*c2)", we bail if (add x, c1) has multiple users which would result in an extra add instruction. In such cases, this patch adds a check to see if we can eliminate a multiply instruction in exchange for the extra add. I also added the capability of doing the existing optimization with non-splatted vectors (splatted also works). Differential Revision: http://reviews.llvm.org/D13740 llvm-svn: 251028 2015-10-23 00:14:45 +08:00			`entry:`
			`%add1 = add <4 x i32> %v1, <i32 11, i32 22, i32 33, i32 44>`
			`%mul1 = mul <4 x i32> %add1, <i32 22, i32 33, i32 44, i32 55>`
			`%add2 = add <4 x i32> %v1, <i32 33, i32 44, i32 55, i32 66>`
			`%mul2 = mul <4 x i32> %add2, <i32 22, i32 33, i32 44, i32 55>`
			`store <4 x i32> %mul1, <4 x i32>* @v2, align 16`
			`store <4 x i32> %mul2, <4 x i32>* @v3, align 16`
			`store <4 x i32> %add1, <4 x i32>* @x, align 16`
			`ret void`
			`}`