llvm-project/llvm/test/Transforms/InstCombine/insert-extract-shuffle.ll

; RUN: opt -S -instcombine %s | FileCheck %s

; test1: one extractelement/insertelement pair whose source (<8 x i8>) and
; destination (<1 x i8>) vectors have different element counts.
; Expected result: a single shufflevector that selects lane 5 of %in.
define <1 x i8> @test1(<8 x i8> %in) {
; CHECK-LABEL: @test1(
; CHECK: shufflevector <8 x i8> %in, <8 x i8> undef, <1 x i32> <i32 5>
  %val = extractelement <8 x i8> %in, i32 5
  %vec = insertelement <1 x i8> undef, i8 %val, i32 0
  ret <1 x i8> %vec
}

; test2: a <4 x i16> result is assembled lane by lane from elements of two
; <8 x i16> inputs. Expected result: a single two-source shufflevector.
; In the expected mask, indices 0-7 address the first operand (%in2) and
; indices 8-15 address the second operand (%in), so <11, 9, 0, 10> reads
; %in[3], %in[1], %in2[0], %in[2] -- the same lanes the extracts below read.
define <4 x i16> @test2(<8 x i16> %in, <8 x i16> %in2) {
; CHECK-LABEL: @test2(
; CHECK: shufflevector <8 x i16> %in2, <8 x i16> %in, <4 x i32> <i32 11, i32 9, i32 0, i32 10>
  ; Source lanes, in destination-lane order.
  %elt0 = extractelement <8 x i16> %in, i32 3
  %elt1 = extractelement <8 x i16> %in, i32 1
  %elt2 = extractelement <8 x i16> %in2, i32 0
  %elt3 = extractelement <8 x i16> %in, i32 2

  ; Insert chain starting from undef; every destination lane is written once.
  %vec.0 = insertelement <4 x i16> undef, i16 %elt0, i32 0
  %vec.1 = insertelement <4 x i16> %vec.0, i16 %elt1, i32 1
  %vec.2 = insertelement <4 x i16> %vec.1, i16 %elt2, i32 2
  %vec.3 = insertelement <4 x i16> %vec.2, i16 %elt3, i32 3

  ret <4 x i16> %vec.3
}

; test_vcopyq_lane_p64: the only lane of a <1 x i64> vector is inserted into
; lane 1 of a <2 x i64> vector. Expected result: %b is first widened to
; <2 x i64> (lane 1 undef), then a second shuffle keeps lane 0 of %a and takes
; mask index 2, i.e. lane 0 of the widened vector.
define <2 x i64> @test_vcopyq_lane_p64(<2 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: @test_vcopyq_lane_p64(
; CHECK-NEXT: %[[WIDEVEC:.*]] = shufflevector <1 x i64> %b, <1 x i64> undef, <2 x i32> <i32 0, i32 undef>
; CHECK-NEXT: shufflevector <2 x i64> %a, <2 x i64> %[[WIDEVEC]], <2 x i32> <i32 0, i32 2>
; CHECK-NEXT: ret <2 x i64> %res
  %elt = extractelement <1 x i64> %b, i32 0
  %res = insertelement <2 x i64> %a, i64 %elt, i32 1
  ret <2 x i64> %res
}

; PR2109: https://llvm.org/bugs/show_bug.cgi?id=2109

; widen_extract2: both lanes of a <2 x float> vector are inserted into lanes 1
; and 3 of a <4 x float> vector. Expected result: %ext is widened to four lanes
; (upper two undef) and the two inserts collapse into one shufflevector whose
; mask indices 4 and 5 name lanes 0 and 1 of the widened vector.
define <4 x float> @widen_extract2(<4 x float> %ins, <2 x float> %ext) {
; CHECK-LABEL: @widen_extract2(
; CHECK-NEXT: %[[WIDEVEC:.*]] = shufflevector <2 x float> %ext, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; CHECK-NEXT: shufflevector <4 x float> %ins, <4 x float> %[[WIDEVEC]], <4 x i32> <i32 0, i32 4, i32 2, i32 5>
; CHECK-NEXT: ret <4 x float> %i2
  %e1 = extractelement <2 x float> %ext, i32 0
  %e2 = extractelement <2 x float> %ext, i32 1
  %i1 = insertelement <4 x float> %ins, float %e1, i32 1
  %i2 = insertelement <4 x float> %i1, float %e2, i32 3
  ret <4 x float> %i2
}

; widen_extract3: all three lanes of a <3 x float> vector are inserted into
; lanes 2, 1 and 0 of a <4 x float> vector (reversed order). Expected result:
; %ext is widened to four lanes and the three inserts collapse into one
; shufflevector; mask <6, 5, 4, 3> takes widened lanes 2, 1, 0 and keeps
; lane 3 of %ins.
define <4 x float> @widen_extract3(<4 x float> %ins, <3 x float> %ext) {
; CHECK-LABEL: @widen_extract3(
; CHECK-NEXT: %[[WIDEVEC:.*]] = shufflevector <3 x float> %ext, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
; CHECK-NEXT: shufflevector <4 x float> %ins, <4 x float> %[[WIDEVEC]], <4 x i32> <i32 6, i32 5, i32 4, i32 3>
; CHECK-NEXT: ret <4 x float> %i3
  %e1 = extractelement <3 x float> %ext, i32 0
  %e2 = extractelement <3 x float> %ext, i32 1
  %e3 = extractelement <3 x float> %ext, i32 2
  %i1 = insertelement <4 x float> %ins, float %e1, i32 2
  %i2 = insertelement <4 x float> %i1, float %e2, i32 1
  %i3 = insertelement <4 x float> %i2, float %e3, i32 0
  ret <4 x float> %i3
}

; widen_extract4: lane 0 of a <2 x float> vector is inserted into lane 2 of an
; <8 x float> vector, so the source must grow from two to eight lanes.
; Expected result: the widening shuffle carries only the lane that is actually
; used (lane 0; everything else undef), and the insert becomes a shufflevector
; whose mask index 8 names lane 0 of the widened vector.
define <8 x float> @widen_extract4(<8 x float> %ins, <2 x float> %ext) {
; CHECK-LABEL: @widen_extract4(
; CHECK-NEXT: %[[WIDEVEC:.*]] = shufflevector <2 x float> %ext, <2 x float> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: shufflevector <8 x float> %ins, <8 x float> %[[WIDEVEC]], <8 x i32> <i32 0, i32 1, i32 8, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: ret <8 x float> %i1
  %e1 = extractelement <2 x float> %ext, i32 0
  %i1 = insertelement <8 x float> %ins, float %e1, i32 2
  ret <8 x float> %i1
}

; PR26015: https://llvm.org/bugs/show_bug.cgi?id=26015
; The widening shuffle must be inserted before any uses.

; pr26015: two lanes of the same <4 x i16> source feed one insertelement
; chain, and the extract of lane 3 (%t4) appears after earlier chain members.
; Expected result: the widening shuffle is the first instruction of the
; function, the lane-2 extract and the first two inserts remain
; insertelements, and only the final insert becomes a shufflevector
; (mask index 11 = lane 3 of the widened second operand).
define <8 x i16> @pr26015(<4 x i16> %t0) {
; CHECK-LABEL: @pr26015(
; CHECK-NEXT:  %[[WIDEVEC:.*]] = shufflevector <4 x i16> %t0, <4 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:  %[[EXT:.*]] = extractelement <4 x i16> %t0, i32 2
; CHECK-NEXT:  %t2 = insertelement <8 x i16> <i16 0, i16 0, i16 0, i16 undef, i16 0, i16 0, i16 undef, i16 undef>, i16 %[[EXT]], i32 3
; CHECK-NEXT:  %t3 = insertelement <8 x i16> %t2, i16 0, i32 6
; CHECK-NEXT:  %t5 = shufflevector <8 x i16> %t3, <8 x i16> %[[WIDEVEC]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 11>
; CHECK-NEXT:  ret <8 x i16> %t5
  %t1 = extractelement <4 x i16> %t0, i32 2
  ; Lanes 3, 6 and 7 of the zero vector are overwritten by the inserts below;
  ; the expected output shows those lanes of the constant as undef.
  %t2 = insertelement <8 x i16> zeroinitializer, i16 %t1, i32 3
  %t3 = insertelement <8 x i16> %t2, i16 0, i32 6
  %t4 = extractelement <4 x i16> %t0, i32 3
  %t5 = insertelement <8 x i16> %t3, i16 %t4, i32 7
  ret <8 x i16> %t5
}

; PR25999: https://llvm.org/bugs/show_bug.cgi?id=25999
; TODO: The widening shuffle could be inserted at the start of the function to allow the first extract to use it.

; pr25999: like pr26015, but the lane-2 extract (%t1) sits in the entry block
; and is used in both successors, while the lane-3 extract and the insert
; chain sit in %if. Expected result: %t1 stays an extractelement in the entry
; block, the widening shuffle is created at the top of %if, and only the last
; insert in %if becomes a shufflevector. The %end block keeps its add and
; insertelement.
define <8 x i16> @pr25999(<4 x i16> %t0, i1 %b) {
; CHECK-LABEL: @pr25999(
; CHECK-NEXT:  %t1 = extractelement <4 x i16> %t0, i32 2
; CHECK-NEXT:  br i1 %b, label %if, label %end
; CHECK:       if:
; CHECK-NEXT:  %[[WIDEVEC:.*]] = shufflevector <4 x i16> %t0, <4 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:  %t2 = insertelement <8 x i16> <i16 0, i16 0, i16 0, i16 undef, i16 0, i16 0, i16 undef, i16 undef>, i16 %t1, i32 3
; CHECK-NEXT:  %t3 = insertelement <8 x i16> %t2, i16 0, i32 6
; CHECK-NEXT:  %t5 = shufflevector <8 x i16> %t3, <8 x i16> %[[WIDEVEC]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 11>
; CHECK-NEXT:  ret <8 x i16> %t5
; CHECK:       end:
; CHECK-NEXT:  %a1 = add i16 %t1, 4
; CHECK-NEXT:  %t6 = insertelement <8 x i16> <i16 undef, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, i16 %a1, i32 0
; CHECK-NEXT:  ret <8 x i16> %t6

  ; Entry block: this extract is shared by %if and %end.
  %t1 = extractelement <4 x i16> %t0, i32 2
  br i1 %b, label %if, label %end

if:
  ; Same insert chain as in pr26015.
  %t2 = insertelement <8 x i16> zeroinitializer, i16 %t1, i32 3
  %t3 = insertelement <8 x i16> %t2, i16 0, i32 6
  %t4 = extractelement <4 x i16> %t0, i32 3
  %t5 = insertelement <8 x i16> %t3, i16 %t4, i32 7
  ret <8 x i16> %t5

end:
  ; The inserted value here is an add result, not a direct extract.
  %a1 = add i16 %t1, 4
  %t6 = insertelement <8 x i16> zeroinitializer, i16 %a1, i32 0
  ret <8 x i16> %t6
}

; The widening shuffle must be inserted at a valid point (after the PHIs). 

; pr25999_phis1: the vector being extracted from (%tmp1) is itself a PHI, and
; a second PHI follows it in the same block. Expected result: the widening
; shuffle appears immediately after both PHIs in %bb3, followed by the
; shufflevector that replaces the insert.
define <4 x double> @pr25999_phis1(i1 %c, <2 x double> %a, <4 x double> %b) {
; CHECK-LABEL: @pr25999_phis1(
; CHECK:       %tmp1 = phi <2 x double> [ %a, %bb1 ], [ %r, %bb2 ]
; CHECK-NEXT:  %tmp2 = phi <4 x double> [ %b, %bb1 ], [ zeroinitializer, %bb2 ]
; CHECK-NEXT:  %[[WIDEVEC:.*]] = shufflevector <2 x double> %tmp1, <2 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:  %tmp4 = shufflevector <4 x double> %tmp2, <4 x double> %[[WIDEVEC]], <4 x i32> <i32 0, i32 1, i32 4, i32 3>
; CHECK-NEXT:  ret <4 x double> %tmp4
bb1:
  br i1 %c, label %bb2, label %bb3

bb2:
  ; Call to an external function; its result is one incoming value of %tmp1.
  %r = call <2 x double> @dummy(<2 x double> %a)
  br label %bb3

bb3:
  %tmp1 = phi <2 x double> [ %a, %bb1 ], [ %r, %bb2 ]
  %tmp2 = phi <4 x double> [ %b, %bb1 ], [ zeroinitializer, %bb2 ]
  ; Lane 0 of the narrow PHI goes into lane 2 of the wide PHI.
  %tmp3 = extractelement <2 x double> %tmp1, i32 0
  %tmp4 = insertelement <4 x double> %tmp2, double %tmp3, i32 2
  ret <4 x double> %tmp4
}

; External function called from %bb2 in the pr25999_phis tests; only its
; declaration is provided in this file.
declare <2 x double> @dummy(<2 x double>)

; pr25999_phis2: same control flow as pr25999_phis1, but the extracted vector
; is %d, an fadd defined after the PHIs. Expected result: the widening shuffle
; appears immediately after %d (its operand), not directly after the PHIs.
define <4 x double> @pr25999_phis2(i1 %c, <2 x double> %a, <4 x double> %b) {
; CHECK-LABEL: @pr25999_phis2(
; CHECK:       %tmp1 = phi <2 x double> [ %a, %bb1 ], [ %r, %bb2 ]
; CHECK-NEXT:  %tmp2 = phi <4 x double> [ %b, %bb1 ], [ zeroinitializer, %bb2 ]
; CHECK-NEXT:  %d = fadd <2 x double> %tmp1, %tmp1
; CHECK-NEXT:  %[[WIDEVEC:.*]] = shufflevector <2 x double> %d, <2 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:  %tmp4 = shufflevector <4 x double> %tmp2, <4 x double> %[[WIDEVEC]], <4 x i32> <i32 0, i32 1, i32 4, i32 3>
; CHECK-NEXT:  ret <4 x double> %tmp4
bb1:
  br i1 %c, label %bb2, label %bb3

bb2:
  ; Call to an external function; its result is one incoming value of %tmp1.
  %r = call <2 x double> @dummy(<2 x double> %a)
  br label %bb3

bb3:
  %tmp1 = phi <2 x double> [ %a, %bb1 ], [ %r, %bb2 ]
  %tmp2 = phi <4 x double> [ %b, %bb1 ], [ zeroinitializer, %bb2 ]
  ; Non-PHI definition of the vector that is extracted from.
  %d = fadd <2 x double> %tmp1, %tmp1
  %tmp3 = extractelement <2 x double> %d, i32 0
  %tmp4 = insertelement <4 x double> %tmp2, double %tmp3, i32 2
  ret <4 x double> %tmp4
}
InstCombine: form shuffles from wider range of insert/extractelements Sequences of insertelement/extractelements are sometimes used to build vectors; this code tries to put them back together into shuffles, but could only produce completely uniform shuffle types (<N x T> from two <N x T> sources). This should allow shuffles with different numbers of elements on the input and output sides as well. llvm-svn: 203229 2014-03-07 18:24:44 +08:00			`; RUN: opt -S -instcombine %s \| FileCheck %s`

			`define <1 x i8> @test1(<8 x i8> %in) {`
			`; CHECK-LABEL: @test1`
			`; CHECK: shufflevector <8 x i8> %in, <8 x i8> undef, <1 x i32> <i32 5>`
			`%val = extractelement <8 x i8> %in, i32 5`
			`%vec = insertelement <1 x i8> undef, i8 %val, i32 0`
			`ret <1 x i8> %vec`
			`}`

			`define <4 x i16> @test2(<8 x i16> %in, <8 x i16> %in2) {`
			`; CHECK-LABEL: @test2`
			`; CHECK: shufflevector <8 x i16> %in2, <8 x i16> %in, <4 x i32> <i32 11, i32 9, i32 0, i32 10>`
			`%elt0 = extractelement <8 x i16> %in, i32 3`
			`%elt1 = extractelement <8 x i16> %in, i32 1`
			`%elt2 = extractelement <8 x i16> %in2, i32 0`
			`%elt3 = extractelement <8 x i16> %in, i32 2`

			`%vec.0 = insertelement <4 x i16> undef, i16 %elt0, i32 0`
			`%vec.1 = insertelement <4 x i16> %vec.0, i16 %elt1, i32 1`
			`%vec.2 = insertelement <4 x i16> %vec.1, i16 %elt2, i32 2`
			`%vec.3 = insertelement <4 x i16> %vec.2, i16 %elt3, i32 3`

			`ret <4 x i16> %vec.3`
			`}`

[InstCombine] add tests to show potential vector IR shuffle transforms llvm-svn: 254342 2015-12-01 06:39:36 +08:00			`define <2 x i64> @test_vcopyq_lane_p64(<2 x i64> %a, <1 x i64> %b) {`
InstCombine: form shuffles from wider range of insert/extractelements Sequences of insertelement/extractelements are sometimes used to build vectors; this code tries to put them back together into shuffles, but could only produce completely uniform shuffle types (<N x T> from two <N x T> sources). This should allow shuffles with different numbers of elements on the input and output sides as well. llvm-svn: 203229 2014-03-07 18:24:44 +08:00			`; CHECK-LABEL: @test_vcopyq_lane_p64`
[InstCombine] transform more extract/insert pairs into shuffles (PR2109) This is an extension of the shuffle combining from r203229: http://reviews.llvm.org/rL203229 The idea is to widen a short input vector with undef elements so the existing shuffle transform for extract/insert can kick in. The motivation is to finally solve PR2109: https://llvm.org/bugs/show_bug.cgi?id=2109 For that example, the IR becomes: %1 = bitcast <2 x i32>* %P to <2 x float>* %ld1 = load <2 x float>, <2 x float>* %1, align 8 %2 = shufflevector <2 x float> %ld1, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> %i2 = shufflevector <4 x float> %A, <4 x float> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 5> ret <4 x float> %i2 And x86 SSE output improves from: movq (%rdi), %xmm1 ## xmm1 = mem[0],zero movdqa %xmm1, %xmm2 shufps $229, %xmm2, %xmm2 ## xmm2 = xmm2[1,1,2,3] shufps $48, %xmm0, %xmm1 ## xmm1 = xmm1[0,0],xmm0[3,0] shufps $132, %xmm1, %xmm0 ## xmm0 = xmm0[0,1],xmm1[0,2] shufps $32, %xmm0, %xmm2 ## xmm2 = xmm2[0,0],xmm0[2,0] shufps $36, %xmm2, %xmm0 ## xmm0 = xmm0[0,1],xmm2[2,0] retq To the almost optimal: movhpd (%rdi), %xmm0 Note: There's a tension in the existing transform related to generating arbitrary shufflevector masks. We avoid that in other places in InstCombine because we're scared that codegen can't handle strange masks, but it looks like we're ok with producing those here. I purposely chose weird insert/extract indexes for the regression tests to see the effect in these cases. For PowerPC+Altivec, AArch64, and X86+SSE/AVX, I think the codegen is equal or better for these examples. Differential Revision: http://reviews.llvm.org/D15096 llvm-svn: 256394 2015-12-25 05:17:56 +08:00			`; CHECK-NEXT: %[[WIDEVEC:.*]] = shufflevector <1 x i64> %b, <1 x i64> undef, <2 x i32> <i32 0, i32 undef>`
			`; CHECK-NEXT: shufflevector <2 x i64> %a, <2 x i64> %[[WIDEVEC]], <2 x i32> <i32 0, i32 2>`
[InstCombine] add tests to show potential vector IR shuffle transforms llvm-svn: 254342 2015-12-01 06:39:36 +08:00			`; CHECK-NEXT: ret <2 x i64> %res`
InstCombine: form shuffles from wider range of insert/extractelements Sequences of insertelement/extractelements are sometimes used to build vectors; this code tries to put them back together into shuffles, but could only produce completely uniform shuffle types (<N x T> from two <N x T> sources). This should allow shuffles with different numbers of elements on the input and output sides as well. llvm-svn: 203229 2014-03-07 18:24:44 +08:00			`%elt = extractelement <1 x i64> %b, i32 0`
			`%res = insertelement <2 x i64> %a, i64 %elt, i32 1`
			`ret <2 x i64> %res`
			`}`

[InstCombine] add tests to show potential vector IR shuffle transforms llvm-svn: 254342 2015-12-01 06:39:36 +08:00			`; PR2109: https://llvm.org/bugs/show_bug.cgi?id=2109`

			`define <4 x float> @widen_extract2(<4 x float> %ins, <2 x float> %ext) {`
			`; CHECK-LABEL: @widen_extract2(`
[InstCombine] transform more extract/insert pairs into shuffles (PR2109) This is an extension of the shuffle combining from r203229: http://reviews.llvm.org/rL203229 The idea is to widen a short input vector with undef elements so the existing shuffle transform for extract/insert can kick in. The motivation is to finally solve PR2109: https://llvm.org/bugs/show_bug.cgi?id=2109 For that example, the IR becomes: %1 = bitcast <2 x i32>* %P to <2 x float>* %ld1 = load <2 x float>, <2 x float>* %1, align 8 %2 = shufflevector <2 x float> %ld1, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> %i2 = shufflevector <4 x float> %A, <4 x float> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 5> ret <4 x float> %i2 And x86 SSE output improves from: movq (%rdi), %xmm1 ## xmm1 = mem[0],zero movdqa %xmm1, %xmm2 shufps $229, %xmm2, %xmm2 ## xmm2 = xmm2[1,1,2,3] shufps $48, %xmm0, %xmm1 ## xmm1 = xmm1[0,0],xmm0[3,0] shufps $132, %xmm1, %xmm0 ## xmm0 = xmm0[0,1],xmm1[0,2] shufps $32, %xmm0, %xmm2 ## xmm2 = xmm2[0,0],xmm0[2,0] shufps $36, %xmm2, %xmm0 ## xmm0 = xmm0[0,1],xmm2[2,0] retq To the almost optimal: movhpd (%rdi), %xmm0 Note: There's a tension in the existing transform related to generating arbitrary shufflevector masks. We avoid that in other places in InstCombine because we're scared that codegen can't handle strange masks, but it looks like we're ok with producing those here. I purposely chose weird insert/extract indexes for the regression tests to see the effect in these cases. For PowerPC+Altivec, AArch64, and X86+SSE/AVX, I think the codegen is equal or better for these examples. Differential Revision: http://reviews.llvm.org/D15096 llvm-svn: 256394 2015-12-25 05:17:56 +08:00			`; CHECK-NEXT: %[[WIDEVEC:.*]] = shufflevector <2 x float> %ext, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>`
			`; CHECK-NEXT: shufflevector <4 x float> %ins, <4 x float> %[[WIDEVEC]], <4 x i32> <i32 0, i32 4, i32 2, i32 5>`
[InstCombine] add tests to show potential vector IR shuffle transforms llvm-svn: 254342 2015-12-01 06:39:36 +08:00			`; CHECK-NEXT: ret <4 x float> %i2`
			`%e1 = extractelement <2 x float> %ext, i32 0`
			`%e2 = extractelement <2 x float> %ext, i32 1`
			`%i1 = insertelement <4 x float> %ins, float %e1, i32 1`
			`%i2 = insertelement <4 x float> %i1, float %e2, i32 3`
			`ret <4 x float> %i2`
			`}`

			`define <4 x float> @widen_extract3(<4 x float> %ins, <3 x float> %ext) {`
			`; CHECK-LABEL: @widen_extract3(`
[InstCombine] transform more extract/insert pairs into shuffles (PR2109) This is an extension of the shuffle combining from r203229: http://reviews.llvm.org/rL203229 The idea is to widen a short input vector with undef elements so the existing shuffle transform for extract/insert can kick in. The motivation is to finally solve PR2109: https://llvm.org/bugs/show_bug.cgi?id=2109 For that example, the IR becomes: %1 = bitcast <2 x i32>* %P to <2 x float>* %ld1 = load <2 x float>, <2 x float>* %1, align 8 %2 = shufflevector <2 x float> %ld1, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> %i2 = shufflevector <4 x float> %A, <4 x float> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 5> ret <4 x float> %i2 And x86 SSE output improves from: movq (%rdi), %xmm1 ## xmm1 = mem[0],zero movdqa %xmm1, %xmm2 shufps $229, %xmm2, %xmm2 ## xmm2 = xmm2[1,1,2,3] shufps $48, %xmm0, %xmm1 ## xmm1 = xmm1[0,0],xmm0[3,0] shufps $132, %xmm1, %xmm0 ## xmm0 = xmm0[0,1],xmm1[0,2] shufps $32, %xmm0, %xmm2 ## xmm2 = xmm2[0,0],xmm0[2,0] shufps $36, %xmm2, %xmm0 ## xmm0 = xmm0[0,1],xmm2[2,0] retq To the almost optimal: movhpd (%rdi), %xmm0 Note: There's a tension in the existing transform related to generating arbitrary shufflevector masks. We avoid that in other places in InstCombine because we're scared that codegen can't handle strange masks, but it looks like we're ok with producing those here. I purposely chose weird insert/extract indexes for the regression tests to see the effect in these cases. For PowerPC+Altivec, AArch64, and X86+SSE/AVX, I think the codegen is equal or better for these examples. Differential Revision: http://reviews.llvm.org/D15096 llvm-svn: 256394 2015-12-25 05:17:56 +08:00			`; CHECK-NEXT: %[[WIDEVEC:.*]] = shufflevector <3 x float> %ext, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>`
			`; CHECK-NEXT: shufflevector <4 x float> %ins, <4 x float> %[[WIDEVEC]], <4 x i32> <i32 6, i32 5, i32 4, i32 3>`
[InstCombine] add tests to show potential vector IR shuffle transforms llvm-svn: 254342 2015-12-01 06:39:36 +08:00			`; CHECK-NEXT: ret <4 x float> %i3`
			`%e1 = extractelement <3 x float> %ext, i32 0`
			`%e2 = extractelement <3 x float> %ext, i32 1`
			`%e3 = extractelement <3 x float> %ext, i32 2`
			`%i1 = insertelement <4 x float> %ins, float %e1, i32 2`
			`%i2 = insertelement <4 x float> %i1, float %e2, i32 1`
			`%i3 = insertelement <4 x float> %i2, float %e3, i32 0`
			`ret <4 x float> %i3`
			`}`

[InstCombine] transform more extract/insert pairs into shuffles (PR2109) This is an extension of the shuffle combining from r203229: http://reviews.llvm.org/rL203229 The idea is to widen a short input vector with undef elements so the existing shuffle transform for extract/insert can kick in. The motivation is to finally solve PR2109: https://llvm.org/bugs/show_bug.cgi?id=2109 For that example, the IR becomes: %1 = bitcast <2 x i32>* %P to <2 x float>* %ld1 = load <2 x float>, <2 x float>* %1, align 8 %2 = shufflevector <2 x float> %ld1, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> %i2 = shufflevector <4 x float> %A, <4 x float> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 5> ret <4 x float> %i2 And x86 SSE output improves from: movq (%rdi), %xmm1 ## xmm1 = mem[0],zero movdqa %xmm1, %xmm2 shufps $229, %xmm2, %xmm2 ## xmm2 = xmm2[1,1,2,3] shufps $48, %xmm0, %xmm1 ## xmm1 = xmm1[0,0],xmm0[3,0] shufps $132, %xmm1, %xmm0 ## xmm0 = xmm0[0,1],xmm1[0,2] shufps $32, %xmm0, %xmm2 ## xmm2 = xmm2[0,0],xmm0[2,0] shufps $36, %xmm2, %xmm0 ## xmm0 = xmm0[0,1],xmm2[2,0] retq To the almost optimal: movhpd (%rdi), %xmm0 Note: There's a tension in the existing transform related to generating arbitrary shufflevector masks. We avoid that in other places in InstCombine because we're scared that codegen can't handle strange masks, but it looks like we're ok with producing those here. I purposely chose weird insert/extract indexes for the regression tests to see the effect in these cases. For PowerPC+Altivec, AArch64, and X86+SSE/AVX, I think the codegen is equal or better for these examples. Differential Revision: http://reviews.llvm.org/D15096 llvm-svn: 256394 2015-12-25 05:17:56 +08:00			`define <8 x float> @widen_extract4(<8 x float> %ins, <2 x float> %ext) {`
			`; CHECK-LABEL: @widen_extract4(`
			`; CHECK-NEXT: %[[WIDEVEC:.*]] = shufflevector <2 x float> %ext, <2 x float> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>`
			`; CHECK-NEXT: shufflevector <8 x float> %ins, <8 x float> %[[WIDEVEC]], <8 x i32> <i32 0, i32 1, i32 8, i32 3, i32 4, i32 5, i32 6, i32 7>`
[InstCombine] add tests to show potential vector IR shuffle transforms llvm-svn: 254342 2015-12-01 06:39:36 +08:00			`; CHECK-NEXT: ret <8 x float> %i1`
			`%e1 = extractelement <2 x float> %ext, i32 0`
			`%i1 = insertelement <8 x float> %ins, float %e1, i32 2`
			`ret <8 x float> %i1`
			`}`

[InstCombine] insert a new shuffle before its uses (PR26015) Although this solves the test case in PR26015: https://llvm.org/bugs/show_bug.cgi?id=26015 And may solve PR25999: https://llvm.org/bugs/show_bug.cgi?id=25999 ...I suspect this is not the best solution. I think we want to insert the new shuffle just ahead of the earliest ExtractElementInst that we're replacing, but I don't know how that should be implemented. Differential Revision: http://reviews.llvm.org/D15878 llvm-svn: 256857 2016-01-06 03:09:47 +08:00			`; PR26015: https://llvm.org/bugs/show_bug.cgi?id=26015`
			`; The widening shuffle must be inserted before any uses.`

			`define <8 x i16> @pr26015(<4 x i16> %t0) {`
			`; CHECK-LABEL: @pr26015(`
			`; CHECK-NEXT: %[[WIDEVEC:.*]] = shufflevector <4 x i16> %t0, <4 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>`
			`; CHECK-NEXT: %[[EXT:.*]] = extractelement <4 x i16> %t0, i32 2`
			`; CHECK-NEXT: %t2 = insertelement <8 x i16> <i16 0, i16 0, i16 0, i16 undef, i16 0, i16 0, i16 undef, i16 undef>, i16 %[[EXT]], i32 3`
			`; CHECK-NEXT: %t3 = insertelement <8 x i16> %t2, i16 0, i32 6`
			`; CHECK-NEXT: %t5 = shufflevector <8 x i16> %t3, <8 x i16> %[[WIDEVEC]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 11>`
			`; CHECK-NEXT: ret <8 x i16> %t5`
			`%t1 = extractelement <4 x i16> %t0, i32 2`
			`%t2 = insertelement <8 x i16> zeroinitializer, i16 %t1, i32 3`
			`%t3 = insertelement <8 x i16> %t2, i16 0, i32 6`
			`%t4 = extractelement <4 x i16> %t0, i32 3`
			`%t5 = insertelement <8 x i16> %t3, i16 %t4, i32 7`
			`ret <8 x i16> %t5`
			`}`

			`; PR25999: https://llvm.org/bugs/show_bug.cgi?id=25999`
			`; TODO: The widening shuffle could be inserted at the start of the function to allow the first extract to use it.`

			`define <8 x i16> @pr25999(<4 x i16> %t0, i1 %b) {`
			`; CHECK-LABEL: @pr25999(`
			`; CHECK-NEXT: %t1 = extractelement <4 x i16> %t0, i32 2`
			`; CHECK-NEXT: br i1 %b, label %if, label %end`
			`; CHECK: if:`
			`; CHECK-NEXT: %[[WIDEVEC:.*]] = shufflevector <4 x i16> %t0, <4 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>`
			`; CHECK-NEXT: %t2 = insertelement <8 x i16> <i16 0, i16 0, i16 0, i16 undef, i16 0, i16 0, i16 undef, i16 undef>, i16 %t1, i32 3`
			`; CHECK-NEXT: %t3 = insertelement <8 x i16> %t2, i16 0, i32 6`
			`; CHECK-NEXT: %t5 = shufflevector <8 x i16> %t3, <8 x i16> %[[WIDEVEC]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 11>`
			`; CHECK-NEXT: ret <8 x i16> %t5`
			`; CHECK: end:`
			`; CHECK-NEXT: %a1 = add i16 %t1, 4`
			`; CHECK-NEXT: %t6 = insertelement <8 x i16> <i16 undef, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, i16 %a1, i32 0`
			`; CHECK-NEXT: ret <8 x i16> %t6`

			`%t1 = extractelement <4 x i16> %t0, i32 2`
			`br i1 %b, label %if, label %end`

			`if:`
			`%t2 = insertelement <8 x i16> zeroinitializer, i16 %t1, i32 3`
			`%t3 = insertelement <8 x i16> %t2, i16 0, i32 6`
			`%t4 = extractelement <4 x i16> %t0, i32 3`
			`%t5 = insertelement <8 x i16> %t3, i16 %t4, i32 7`
			`ret <8 x i16> %t5`

			`end:`
			`%a1 = add i16 %t1, 4`
			`%t6 = insertelement <8 x i16> zeroinitializer, i16 %a1, i32 0`
			`ret <8 x i16> %t6`
			`}`

[InstCombine] insert a new shuffle in a safe place (PR25999) Limit this transform to a basic block and guard against PHIs. Hopefully, this fixes the remaining failures in PR25999: https://llvm.org/bugs/show_bug.cgi?id=25999 llvm-svn: 257133 2016-01-08 09:39:16 +08:00			`; The widening shuffle must be inserted at a valid point (after the PHIs).`

			`define <4 x double> @pr25999_phis1(i1 %c, <2 x double> %a, <4 x double> %b) {`
			`; CHECK-LABEL: @pr25999_phis1(`
			`; CHECK: %tmp1 = phi <2 x double> [ %a, %bb1 ], [ %r, %bb2 ]`
			`; CHECK-NEXT: %tmp2 = phi <4 x double> [ %b, %bb1 ], [ zeroinitializer, %bb2 ]`
			`; CHECK-NEXT: %[[WIDEVEC:.*]] = shufflevector <2 x double> %tmp1, <2 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>`
			`; CHECK-NEXT: %tmp4 = shufflevector <4 x double> %tmp2, <4 x double> %[[WIDEVEC]], <4 x i32> <i32 0, i32 1, i32 4, i32 3>`
			`; CHECK-NEXT: ret <4 x double> %tmp4`
			`bb1:`
			`br i1 %c, label %bb2, label %bb3`

			`bb2:`
			`%r = call <2 x double> @dummy(<2 x double> %a)`
			`br label %bb3`

			`bb3:`
			`%tmp1 = phi <2 x double> [ %a, %bb1 ], [ %r, %bb2 ]`
			`%tmp2 = phi <4 x double> [ %b, %bb1 ], [ zeroinitializer, %bb2 ]`
			`%tmp3 = extractelement <2 x double> %tmp1, i32 0`
			`%tmp4 = insertelement <4 x double> %tmp2, double %tmp3, i32 2`
			`ret <4 x double> %tmp4`
			`}`

			`declare <2 x double> @dummy(<2 x double>)`

			`define <4 x double> @pr25999_phis2(i1 %c, <2 x double> %a, <4 x double> %b) {`
			`; CHECK-LABEL: @pr25999_phis2(`
			`; CHECK: %tmp1 = phi <2 x double> [ %a, %bb1 ], [ %r, %bb2 ]`
			`; CHECK-NEXT: %tmp2 = phi <4 x double> [ %b, %bb1 ], [ zeroinitializer, %bb2 ]`
			`; CHECK-NEXT: %d = fadd <2 x double> %tmp1, %tmp1`
			`; CHECK-NEXT: %[[WIDEVEC:.*]] = shufflevector <2 x double> %d, <2 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>`
			`; CHECK-NEXT: %tmp4 = shufflevector <4 x double> %tmp2, <4 x double> %[[WIDEVEC]], <4 x i32> <i32 0, i32 1, i32 4, i32 3>`
			`; CHECK-NEXT: ret <4 x double> %tmp4`
			`bb1:`
			`br i1 %c, label %bb2, label %bb3`

			`bb2:`
			`%r = call <2 x double> @dummy(<2 x double> %a)`
			`br label %bb3`

			`bb3:`
			`%tmp1 = phi <2 x double> [ %a, %bb1 ], [ %r, %bb2 ]`
			`%tmp2 = phi <4 x double> [ %b, %bb1 ], [ zeroinitializer, %bb2 ]`
			`%d = fadd <2 x double> %tmp1, %tmp1`
			`%tmp3 = extractelement <2 x double> %d, i32 0`
			`%tmp4 = insertelement <4 x double> %tmp2, double %tmp3, i32 2`
			`ret <4 x double> %tmp4`
			`}`