2016-09-20 22:42:45 +08:00
|
|
|
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
|
|
|
; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=sse2 | FileCheck %s
|
2015-10-23 00:14:45 +08:00
|
|
|
|
|
|
|
; Source file looks something like this:
|
|
|
|
;
|
|
|
|
; typedef int AAA[100][100];
|
|
|
|
;
|
|
|
|
; void testCombineMultiplies(AAA a,int lll)
|
|
|
|
; {
|
|
|
|
; int LOC = lll + 5;
|
|
|
|
;
|
|
|
|
; a[LOC][LOC] = 11;
|
|
|
|
;
|
|
|
|
; a[LOC][20] = 22;
|
|
|
|
; a[LOC+20][20] = 33;
|
|
|
|
; }
|
|
|
|
;
|
|
|
|
; We want to make sure we don't generate 2 multiply instructions,
|
|
|
|
; one for a[LOC][] and one for a[LOC+20][]. visitMUL in DAGCombiner.cpp
|
|
|
|
; should combine the instructions in such a way to avoid the extra
|
|
|
|
; multiply.
|
|
|
|
;
|
|
|
|
; Output looks roughly like this:
|
|
|
|
;
|
|
|
|
; movl 8(%esp), %eax
|
|
|
|
; movl 12(%esp), %ecx
|
|
|
|
; imull $400, %ecx, %edx # imm = 0x190
|
|
|
|
; leal (%edx,%eax), %esi
|
|
|
|
; movl $11, 2020(%esi,%ecx,4)
|
|
|
|
; movl $22, 2080(%edx,%eax)
|
|
|
|
; movl $33, 10080(%edx,%eax)
|
|
|
|
|
|
|
|
; Function Attrs: nounwind
|
2016-09-20 22:42:45 +08:00
|
|
|
; Scalar case: three stores into rows of a [100 x i32] array whose row indices
; are %lll + 5, %lll + 5 and %lll + 25. Each row is 100 * 4 = 400 bytes, so all
; three addresses share the term %lll * 400. The assertions below expect exactly
; one "imull $400" with the remaining constant parts folded into displacements.
define void @testCombineMultiplies([100 x i32]* nocapture %a, i32 %lll) nounwind {
|
|
|
|
; CHECK-LABEL: testCombineMultiplies:
|
2017-12-05 01:18:51 +08:00
|
|
|
; CHECK: # %bb.0: # %entry
|
2016-09-20 22:42:45 +08:00
|
|
|
; CHECK-NEXT: pushl %esi
|
|
|
|
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
|
|
|
|
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
|
|
|
|
; CHECK-NEXT: imull $400, %ecx, %edx # imm = 0x190
|
|
|
|
; CHECK-NEXT: leal (%eax,%edx), %esi
|
|
|
|
; CHECK-NEXT: movl $11, 2020(%esi,%ecx,4)
|
|
|
|
; CHECK-NEXT: movl $22, 2080(%eax,%edx)
|
|
|
|
; CHECK-NEXT: movl $33, 10080(%eax,%edx)
|
|
|
|
; CHECK-NEXT: popl %esi
|
|
|
|
; CHECK-NEXT: retl
|
2015-10-23 00:14:45 +08:00
|
|
|
entry:
|
|
|
|
; LOC = lll + 5 (see the C sketch in the file header).
%add = add nsw i32 %lll, 5
|
|
|
|
; &a[LOC][LOC]: byte offset (lll+5)*400 + (lll+5)*4 = lll*400 + lll*4 + 2020.
%arrayidx1 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add, i32 %add
|
|
|
|
store i32 11, i32* %arrayidx1, align 4
|
|
|
|
; &a[LOC][20]: byte offset (lll+5)*400 + 20*4 = lll*400 + 2080.
%arrayidx3 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add, i32 20
|
|
|
|
store i32 22, i32* %arrayidx3, align 4
|
|
|
|
; Row index LOC + 20, written here directly as lll + 25.
%add4 = add nsw i32 %lll, 25
|
|
|
|
; &a[LOC+20][20]: byte offset (lll+25)*400 + 20*4 = lll*400 + 10080.
%arrayidx6 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add4, i32 20
|
|
|
|
store i32 33, i32* %arrayidx6, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
; Test for the same optimization on vector multiplies.
|
|
|
|
;
|
|
|
|
; Source looks something like this:
|
|
|
|
;
|
|
|
|
; typedef int v4int __attribute__((__vector_size__(16)));
|
|
|
|
;
|
|
|
|
; v4int x;
|
|
|
|
; v4int v2, v3;
|
|
|
|
; void testCombineMultiplies_splat(v4int v1) {
|
|
|
|
; v2 = (v1 + (v4int){ 11, 11, 11, 11 }) * (v4int) {22, 22, 22, 22};
|
|
|
|
; v3 = (v1 + (v4int){ 33, 33, 33, 33 }) * (v4int) {22, 22, 22, 22};
|
|
|
|
; x = (v1 + (v4int){ 11, 11, 11, 11 });
|
|
|
|
; }
|
|
|
|
;
|
|
|
|
; Output looks something like this:
|
|
|
|
;
|
|
|
|
; testCombineMultiplies_splat: # @testCombineMultiplies_splat
|
2017-12-05 01:18:51 +08:00
|
|
|
; # %bb.0: # %entry
|
2015-10-23 00:14:45 +08:00
|
|
|
; movdqa .LCPI1_0, %xmm1 # xmm1 = [11,11,11,11]
|
|
|
|
; paddd %xmm0, %xmm1
|
|
|
|
; movdqa .LCPI1_1, %xmm2 # xmm2 = [22,22,22,22]
|
|
|
|
; pshufd $245, %xmm0, %xmm3 # xmm3 = xmm0[1,1,3,3]
|
|
|
|
; pmuludq %xmm2, %xmm0
|
|
|
|
; pshufd $232, %xmm0, %xmm0 # xmm0 = xmm0[0,2,2,3]
|
|
|
|
; pmuludq %xmm2, %xmm3
|
|
|
|
; pshufd $232, %xmm3, %xmm2 # xmm2 = xmm3[0,2,2,3]
|
|
|
|
; punpckldq %xmm2, %xmm0 # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
|
|
|
|
; movdqa .LCPI1_2, %xmm2 # xmm2 = [242,242,242,242]
|
|
|
|
; paddd %xmm0, %xmm2
|
|
|
|
; paddd .LCPI1_3, %xmm0
|
|
|
|
; movdqa %xmm2, v2
|
|
|
|
; movdqa %xmm0, v3
|
|
|
|
; movdqa %xmm1, x
|
|
|
|
; retl
|
|
|
|
;
|
|
|
|
; Again, we want to make sure we don't generate two different multiplies.
|
|
|
|
; We should have a single multiply for "v1 * {22, 22, 22, 22}" (made up of two
|
|
|
|
; pmuludq instructions), followed by two adds. Without this optimization, we'd
|
|
|
|
; do 2 adds, followed by 2 multiplies (i.e. 4 pmuludq instructions).
|
|
|
|
|
|
|
|
; Destination globals for the vector tests in this file. In the functions
; visible here, @v2 receives %mul1, @v3 receives %mul2 and @x receives %add1.
; Storing %add1 to @x gives the add a use besides the multiply.
@v2 = common global <4 x i32> zeroinitializer, align 16
|
|
|
|
@v3 = common global <4 x i32> zeroinitializer, align 16
|
|
|
|
@x = common global <4 x i32> zeroinitializer, align 16
|
|
|
|
|
|
|
|
; Function Attrs: nounwind
|
2016-09-20 22:42:45 +08:00
|
|
|
; Splat-vector case: (v1 + 11) * 22 and (v1 + 33) * 22 share the product
; v1 * 22. The assertions below expect that product to be computed once (two
; pmuludq, one per pair of lanes) and then adjusted by two paddd instructions.
define void @testCombineMultiplies_splat(<4 x i32> %v1) nounwind {
|
|
|
|
; CHECK-LABEL: testCombineMultiplies_splat:
|
2017-12-05 01:18:51 +08:00
|
|
|
; CHECK: # %bb.0: # %entry
|
2016-09-20 22:42:45 +08:00
|
|
|
; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [11,11,11,11]
|
|
|
|
; CHECK-NEXT: paddd %xmm0, %xmm1
|
|
|
|
; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [22,22,22,22]
|
|
|
|
; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
|
|
|
|
; CHECK-NEXT: pmuludq %xmm2, %xmm0
|
|
|
|
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
|
|
|
|
; CHECK-NEXT: pmuludq %xmm2, %xmm3
|
|
|
|
; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
|
|
|
|
; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
|
|
|
|
; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [242,242,242,242]
|
|
|
|
; CHECK-NEXT: paddd %xmm0, %xmm2
|
|
|
|
; CHECK-NEXT: paddd {{\.LCPI.*}}, %xmm0
|
|
|
|
; CHECK-NEXT: movdqa %xmm2, v2
|
|
|
|
; CHECK-NEXT: movdqa %xmm0, v3
|
|
|
|
; CHECK-NEXT: movdqa %xmm1, x
|
|
|
|
; CHECK-NEXT: retl
|
2015-10-23 00:14:45 +08:00
|
|
|
entry:
|
|
|
|
; %add1 is also stored to @x below, so the add itself must still be emitted.
%add1 = add <4 x i32> %v1, <i32 11, i32 11, i32 11, i32 11>
|
|
|
|
; (v1 + 11) * 22 = v1 * 22 + 242, matching the [242,242,242,242] constant above.
%mul1 = mul <4 x i32> %add1, <i32 22, i32 22, i32 22, i32 22>
|
|
|
|
%add2 = add <4 x i32> %v1, <i32 33, i32 33, i32 33, i32 33>
|
|
|
|
; (v1 + 33) * 22 = v1 * 22 + 726. NOTE(review): presumably the constant-pool
; operand of the final paddd holds 726 per lane; the assertions only match the
; pool label, so that value is not pinned by this test.
%mul2 = mul <4 x i32> %add2, <i32 22, i32 22, i32 22, i32 22>
|
|
|
|
store <4 x i32> %mul1, <4 x i32>* @v2, align 16
|
|
|
|
store <4 x i32> %mul2, <4 x i32>* @v3, align 16
|
|
|
|
store <4 x i32> %add1, <4 x i32>* @x, align 16
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
|
|
|
; Finally, check the non-splatted vector case. This is very similar
|
|
|
|
; to the previous test case, except for the vector values.
|
2016-09-20 22:42:45 +08:00
|
|
|
|
2015-10-23 00:14:45 +08:00
|
|
|
; Function Attrs: nounwind
|
2016-09-20 22:42:45 +08:00
|
|
|
; Non-splat vector case: same shape as the splat test, but every lane uses a
; different addend and multiplier. The assertions below still expect a single
; v1 * <22,33,44,55> product (two pmuludq) followed by two paddd instructions.
define void @testCombineMultiplies_non_splat(<4 x i32> %v1) nounwind {
|
|
|
|
; CHECK-LABEL: testCombineMultiplies_non_splat:
|
2017-12-05 01:18:51 +08:00
|
|
|
; CHECK: # %bb.0: # %entry
|
2016-09-20 22:42:45 +08:00
|
|
|
; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [11,22,33,44]
|
|
|
|
; CHECK-NEXT: paddd %xmm0, %xmm1
|
|
|
|
; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [22,33,44,55]
|
|
|
|
; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
|
|
|
|
; CHECK-NEXT: pmuludq %xmm2, %xmm0
|
|
|
|
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
|
|
|
|
; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
|
|
|
|
; CHECK-NEXT: pmuludq %xmm3, %xmm2
|
|
|
|
; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
|
|
|
|
; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
|
|
|
|
; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [242,726,1452,2420]
|
|
|
|
; CHECK-NEXT: paddd %xmm0, %xmm2
|
|
|
|
; CHECK-NEXT: paddd {{\.LCPI.*}}, %xmm0
|
|
|
|
; CHECK-NEXT: movdqa %xmm2, v2
|
|
|
|
; CHECK-NEXT: movdqa %xmm0, v3
|
|
|
|
; CHECK-NEXT: movdqa %xmm1, x
|
|
|
|
; CHECK-NEXT: retl
|
2015-10-23 00:14:45 +08:00
|
|
|
entry:
|
|
|
|
; %add1 is also stored to @x below, so the add itself must still be emitted.
%add1 = add <4 x i32> %v1, <i32 11, i32 22, i32 33, i32 44>
|
|
|
|
; Per lane: 11*22 = 242, 22*33 = 726, 33*44 = 1452, 44*55 = 2420, matching the
; [242,726,1452,2420] constant in the assertions above.
%mul1 = mul <4 x i32> %add1, <i32 22, i32 33, i32 44, i32 55>
|
|
|
|
%add2 = add <4 x i32> %v1, <i32 33, i32 44, i32 55, i32 66>
|
|
|
|
; Per lane: 33*22 = 726, 44*33 = 1452, 55*44 = 2420, 66*55 = 3630.
; NOTE(review): presumably these are the values behind the constant-pool operand
; of the final paddd; the assertions only match the pool label, not its contents.
%mul2 = mul <4 x i32> %add2, <i32 22, i32 33, i32 44, i32 55>
|
|
|
|
store <4 x i32> %mul1, <4 x i32>* @v2, align 16
|
|
|
|
store <4 x i32> %mul2, <4 x i32>* @v3, align 16
|
|
|
|
store <4 x i32> %add1, <4 x i32>* @x, align 16
|
|
|
|
ret void
|
|
|
|
}
|