llvm-project/llvm/test/CodeGen/X86/masked_gather_scatter.ll

2233 lines
86 KiB
LLVM
Raw Normal View History

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=ALL --check-prefix=KNL_64
; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=ALL --check-prefix=KNL_32
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX
; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX_32
; RUN: opt -mtriple=x86_64-apple-darwin -scalarize-masked-mem-intrin -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=SCALAR
; RUN: llc -O0 -mtriple=x86_64-unknown-linux-gnu -mcpu=skx < %s -o /dev/null
; SCALAR-LABEL: test1
; SCALAR: extractelement <16 x float*>
; SCALAR-NEXT: load float
; SCALAR-NEXT: insertelement <16 x float>
; SCALAR-NEXT: extractelement <16 x float*>
; SCALAR-NEXT: load float
define <16 x float> @test1(float* %base, <16 x i32> %ind) {
; KNL_64-LABEL: test1:
; KNL_64: # BB#0:
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; KNL_64-NEXT: vmovaps %zmm1, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test1:
; KNL_32: # BB#0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
; KNL_32-NEXT: vmovaps %zmm1, %zmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test1:
; SKX: # BB#0:
; SKX-NEXT: kxnorw %k0, %k0, %k1
; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; SKX-NEXT: vmovaps %zmm1, %zmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test1:
; SKX_32: # BB#0:
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
; SKX_32-NEXT: vmovaps %zmm1, %zmm0
; SKX_32-NEXT: retl
%broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
%broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
%sext_ind = sext <16 x i32> %ind to <16 x i64>
%gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
%res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
ret <16 x float>%res
}
declare <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*>, i32, <16 x i1>, <16 x i32>)
declare <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*>, i32, <16 x i1>, <16 x float>)
declare <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> , i32, <8 x i1> , <8 x i32> )
; SCALAR-LABEL: test2
; SCALAR: extractelement <16 x float*>
; SCALAR-NEXT: load float
; SCALAR-NEXT: insertelement <16 x float>
; SCALAR-NEXT: br label %else
; SCALAR: else:
; SCALAR-NEXT: %res.phi.else = phi
; SCALAR-NEXT: %Mask1 = extractelement <16 x i1> %imask, i32 1
; SCALAR-NEXT: %ToLoad1 = icmp eq i1 %Mask1, true
; SCALAR-NEXT: br i1 %ToLoad1, label %cond.load1, label %else2
define <16 x float> @test2(float* %base, <16 x i32> %ind, i16 %mask) {
; KNL_64-LABEL: test2:
; KNL_64: # BB#0:
; KNL_64-NEXT: kmovw %esi, %k1
; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; KNL_64-NEXT: vmovaps %zmm1, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test2:
; KNL_32: # BB#0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
; KNL_32-NEXT: vmovaps %zmm1, %zmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test2:
; SKX: # BB#0:
; SKX-NEXT: kmovw %esi, %k1
; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; SKX-NEXT: vmovaps %zmm1, %zmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test2:
; SKX_32: # BB#0:
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
; SKX_32-NEXT: vmovaps %zmm1, %zmm0
; SKX_32-NEXT: retl
%broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
%broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
%sext_ind = sext <16 x i32> %ind to <16 x i64>
%gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
%imask = bitcast i16 %mask to <16 x i1>
%res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> %imask, <16 x float>undef)
ret <16 x float> %res
}
define <16 x i32> @test3(i32* %base, <16 x i32> %ind, i16 %mask) {
; KNL_64-LABEL: test3:
; KNL_64: # BB#0:
; KNL_64-NEXT: kmovw %esi, %k1
; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
; KNL_64-NEXT: vmovdqa64 %zmm1, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test3:
; KNL_32: # BB#0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test3:
; SKX: # BB#0:
; SKX-NEXT: kmovw %esi, %k1
; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test3:
; SKX_32: # BB#0:
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; SKX_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm0
; SKX_32-NEXT: retl
%broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
%broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
%sext_ind = sext <16 x i32> %ind to <16 x i64>
%gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i64> %sext_ind
%imask = bitcast i16 %mask to <16 x i1>
%res = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
ret <16 x i32> %res
}
define <16 x i32> @test4(i32* %base, <16 x i32> %ind, i16 %mask) {
; KNL_64-LABEL: test4:
; KNL_64: # BB#0:
; KNL_64-NEXT: kmovw %esi, %k1
; KNL_64-NEXT: kmovw %k1, %k2
; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
; KNL_64-NEXT: vmovdqa64 %zmm1, %zmm2
; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
; KNL_64-NEXT: vpaddd %zmm2, %zmm1, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test4:
; KNL_32: # BB#0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; KNL_32-NEXT: kmovw %k1, %k2
; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2}
; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm2
; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1}
; KNL_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test4:
; SKX: # BB#0:
; SKX-NEXT: kmovw %esi, %k1
; SKX-NEXT: kmovw %k1, %k2
; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
; SKX-NEXT: vmovdqa64 %zmm1, %zmm2
; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
; SKX-NEXT: vpaddd %zmm2, %zmm1, %zmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test4:
; SKX_32: # BB#0:
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; SKX_32-NEXT: kmovw %k1, %k2
; SKX_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2}
; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm2
; SKX_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1}
; SKX_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
; SKX_32-NEXT: retl
%broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
%broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
%gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
%imask = bitcast i16 %mask to <16 x i1>
%gt1 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
%gt2 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1)
%res = add <16 x i32> %gt1, %gt2
ret <16 x i32> %res
}
; SCALAR-LABEL: test5
; SCALAR: %Mask0 = extractelement <16 x i1> %imask, i32 0
; SCALAR-NEXT: %ToStore0 = icmp eq i1 %Mask0, true
; SCALAR-NEXT: br i1 %ToStore0, label %cond.store, label %else
; SCALAR: cond.store:
; SCALAR-NEXT: %Elt0 = extractelement <16 x i32> %val, i32 0
; SCALAR-NEXT: %Ptr0 = extractelement <16 x i32*> %gep.random, i32 0
; SCALAR-NEXT: store i32 %Elt0, i32* %Ptr0, align 4
; SCALAR-NEXT: br label %else
; SCALAR: else:
; SCALAR-NEXT: %Mask1 = extractelement <16 x i1> %imask, i32 1
; SCALAR-NEXT: %ToStore1 = icmp eq i1 %Mask1, true
; SCALAR-NEXT: br i1 %ToStore1, label %cond.store1, label %else2
define void @test5(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) {
; KNL_64-LABEL: test5:
; KNL_64: # BB#0:
; KNL_64-NEXT: kmovw %esi, %k1
; KNL_64-NEXT: kmovw %k1, %k2
; KNL_64-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2}
; KNL_64-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test5:
; KNL_32: # BB#0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; KNL_32-NEXT: kmovw %k1, %k2
; KNL_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k2}
; KNL_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test5:
; SKX: # BB#0:
; SKX-NEXT: kmovw %esi, %k1
; SKX-NEXT: kmovw %k1, %k2
; SKX-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2}
; SKX-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; SKX_32-LABEL: test5:
; SKX_32: # BB#0:
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; SKX_32-NEXT: kmovw %k1, %k2
; SKX_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k2}
; SKX_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
; SKX_32-NEXT: vzeroupper
; SKX_32-NEXT: retl
%broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
%broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
%gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
%imask = bitcast i16 %mask to <16 x i1>
call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
ret void
}
declare void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> , <8 x i32*> , i32 , <8 x i1> )
declare void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> , <16 x i32*> , i32 , <16 x i1> )
; SCALAR-LABEL: test6
; SCALAR: store i32 %Elt0, i32* %Ptr01, align 4
; SCALAR-NEXT: %Elt1 = extractelement <8 x i32> %a1, i32 1
; SCALAR-NEXT: %Ptr12 = extractelement <8 x i32*> %ptr, i32 1
; SCALAR-NEXT: store i32 %Elt1, i32* %Ptr12, align 4
; SCALAR-NEXT: %Elt2 = extractelement <8 x i32> %a1, i32 2
; SCALAR-NEXT: %Ptr23 = extractelement <8 x i32*> %ptr, i32 2
; SCALAR-NEXT: store i32 %Elt2, i32* %Ptr23, align 4
define <8 x i32> @test6(<8 x i32>%a1, <8 x i32*> %ptr) {
; KNL_64-LABEL: test6:
; KNL_64: # BB#0:
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
; KNL_64-NEXT: kxnorw %k0, %k0, %k2
; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
; KNL_64-NEXT: vmovdqa %ymm2, %ymm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test6:
; KNL_32: # BB#0:
; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm2
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
; KNL_32-NEXT: kxnorw %k0, %k0, %k2
; KNL_32-NEXT: vpgatherqd (,%zmm2), %ymm1 {%k2}
; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm2) {%k1}
; KNL_32-NEXT: vmovdqa %ymm1, %ymm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test6:
; SKX: # BB#0:
; SKX-NEXT: kxnorw %k0, %k0, %k1
; SKX-NEXT: kxnorw %k0, %k0, %k2
; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
; SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
; SKX-NEXT: vmovdqa %ymm2, %ymm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test6:
; SKX_32: # BB#0:
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
; SKX_32-NEXT: kxnorw %k0, %k0, %k2
; SKX_32-NEXT: vpgatherdd (,%ymm1), %ymm2 {%k2}
; SKX_32-NEXT: vpscatterdd %ymm0, (,%ymm1) {%k1}
; SKX_32-NEXT: vmovdqa %ymm2, %ymm0
; SKX_32-NEXT: retl
%a = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
ret <8 x i32>%a
}
define <8 x i32> @test7(i32* %base, <8 x i32> %ind, i8 %mask) {
;
; KNL_64-LABEL: test7:
; KNL_64: # BB#0:
; KNL_64-NEXT: kmovw %esi, %k1
; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0
; KNL_64-NEXT: kmovw %k1, %k2
; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm1 {%k2}
; KNL_64-NEXT: vmovdqa %ymm1, %ymm2
; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm2 {%k1}
; KNL_64-NEXT: vpaddd %ymm2, %ymm1, %ymm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test7:
; KNL_32: # BB#0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; KNL_32-NEXT: kmovw %ecx, %k1
; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0
; KNL_32-NEXT: kmovw %k1, %k2
; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm1 {%k2}
; KNL_32-NEXT: vmovdqa %ymm1, %ymm2
; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm2 {%k1}
; KNL_32-NEXT: vpaddd %ymm2, %ymm1, %ymm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test7:
; SKX: # BB#0:
; SKX-NEXT: kmovw %esi, %k1
; SKX-NEXT: kmovw %k1, %k2
; SKX-NEXT: vpgatherdd (%rdi,%ymm0,4), %ymm1 {%k2}
; SKX-NEXT: vmovdqa %ymm1, %ymm2
; SKX-NEXT: vpgatherdd (%rdi,%ymm0,4), %ymm2 {%k1}
; SKX-NEXT: vpaddd %ymm2, %ymm1, %ymm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test7:
; SKX_32: # BB#0:
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: kmovb {{[0-9]+}}(%esp), %k1
; SKX_32-NEXT: kmovw %k1, %k2
; SKX_32-NEXT: vpgatherdd (%eax,%ymm0,4), %ymm1 {%k2}
; SKX_32-NEXT: vmovdqa %ymm1, %ymm2
; SKX_32-NEXT: vpgatherdd (%eax,%ymm0,4), %ymm2 {%k1}
; SKX_32-NEXT: vpaddd %ymm2, %ymm1, %ymm0
; SKX_32-NEXT: retl
%broadcast.splatinsert = insertelement <8 x i32*> undef, i32* %base, i32 0
%broadcast.splat = shufflevector <8 x i32*> %broadcast.splatinsert, <8 x i32*> undef, <8 x i32> zeroinitializer
%gep.random = getelementptr i32, <8 x i32*> %broadcast.splat, <8 x i32> %ind
%imask = bitcast i8 %mask to <8 x i1>
%gt1 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %gep.random, i32 4, <8 x i1> %imask, <8 x i32>undef)
%gt2 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %gep.random, i32 4, <8 x i1> %imask, <8 x i32>%gt1)
%res = add <8 x i32> %gt1, %gt2
ret <8 x i32> %res
}
; No uniform base in this case, index <8 x i64> contains addresses,
; each gather call will be split into two
define <16 x i32> @test8(<16 x i32*> %ptr.random, <16 x i32> %ind, i16 %mask) {
; KNL_64-LABEL: test8:
; KNL_64: # BB#0:
; KNL_64-NEXT: kmovw %edi, %k1
; KNL_64-NEXT: kshiftrw $8, %k1, %k2
; KNL_64-NEXT: kmovw %k2, %k3
; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k3}
; KNL_64-NEXT: kmovw %k1, %k3
; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k3}
; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm4
; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
; KNL_64-NEXT: vpaddd %zmm0, %zmm4, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test8:
; KNL_32: # BB#0:
; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; KNL_32-NEXT: kmovw %k1, %k2
; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k2}
; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm2
; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
; KNL_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test8:
; SKX: # BB#0:
; SKX-NEXT: kmovw %edi, %k1
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: kmovw %k2, %k3
; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k3}
; SKX-NEXT: kmovw %k1, %k3
; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k3}
; SKX-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm4
; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
; SKX-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm0
; SKX-NEXT: vpaddd %zmm0, %zmm4, %zmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test8:
; SKX_32: # BB#0:
; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; SKX_32-NEXT: kmovw %k1, %k2
; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k2}
; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm2
; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
; SKX_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
; SKX_32-NEXT: retl
%imask = bitcast i16 %mask to <16 x i1>
%gt1 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
%gt2 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1)
%res = add <16 x i32> %gt1, %gt2
ret <16 x i32> %res
}
%struct.RT = type { i8, [10 x [20 x i32]], i8 }
%struct.ST = type { i32, double, %struct.RT }
; Masked gather for agregate types
; Test9 and Test10 should give the same result (scalar and vector indices in GEP)
define <8 x i32> @test9(%struct.ST* %base, <8 x i64> %ind1, <8 x i32>%ind5) {
; KNL_64-LABEL: test9:
; KNL_64: # BB#0: # %entry
; KNL_64-NEXT: vpbroadcastq %rdi, %zmm2
; KNL_64-NEXT: vpbroadcastq {{.*#+}} zmm3 = [824,824,824,824,824,824,824,824]
; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm4
; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0
; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0
; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0
; KNL_64-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1
; KNL_64-NEXT: vpmuldq {{.*}}(%rip){1to8}, %zmm1, %zmm1
; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0
; KNL_64-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test9:
; KNL_32: # BB#0: # %entry
; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm2
; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm3 = [80,80,80,80,80,80,80,80]
; KNL_32-NEXT: vpmulld %ymm3, %ymm1, %ymm1
; KNL_32-NEXT: vpmovqd %zmm0, %ymm0
; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm3 = [820,820,820,820,820,820,820,820]
; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0
; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [68,68,68,68,68,68,68,68]
; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
; KNL_32-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
; KNL_32-NEXT: retl
;
; SKX-LABEL: test9:
; SKX: # BB#0: # %entry
; SKX-NEXT: vpbroadcastq %rdi, %zmm2
; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; SKX-NEXT: vpmovsxdq %ymm1, %zmm1
; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm1, %zmm1
; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; SKX-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
; SKX-NEXT: kxnorw %k0, %k0, %k1
; SKX-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
; SKX-NEXT: retq
;
; SKX_32-LABEL: test9:
; SKX_32: # BB#0: # %entry
; SKX_32-NEXT: vpmulld {{\.LCPI.*}}{1to8}, %ymm1, %ymm1
; SKX_32-NEXT: vpmovqd %zmm0, %ymm0
; SKX_32-NEXT: vpmulld {{\.LCPI.*}}{1to8}, %ymm0, %ymm0
; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to8}, %ymm0, %ymm0
; SKX_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; SKX_32-NEXT: vpaddd {{\.LCPI.*}}{1to8}, %ymm0, %ymm1
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
; SKX_32-NEXT: vpgatherdd (,%ymm1), %ymm0 {%k1}
; SKX_32-NEXT: retl
entry:
%broadcast.splatinsert = insertelement <8 x %struct.ST*> undef, %struct.ST* %base, i32 0
%broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer
%arrayidx = getelementptr %struct.ST, <8 x %struct.ST*> %broadcast.splat, <8 x i64> %ind1, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>, <8 x i32><i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, <8 x i32> %ind5, <8 x i64> <i64 13, i64 13, i64 13, i64 13, i64 13, i64 13, i64 13, i64 13>
%res = call <8 x i32 > @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*>%arrayidx, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
ret <8 x i32> %res
}
define <8 x i32> @test10(%struct.ST* %base, <8 x i64> %i1, <8 x i32>%ind5) {
; KNL_64-LABEL: test10:
; KNL_64: # BB#0: # %entry
; KNL_64-NEXT: vpbroadcastq %rdi, %zmm2
; KNL_64-NEXT: vpbroadcastq {{.*#+}} zmm3 = [824,824,824,824,824,824,824,824]
; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm4
; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0
; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0
; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0
; KNL_64-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1
; KNL_64-NEXT: vpmuldq {{.*}}(%rip){1to8}, %zmm1, %zmm1
; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0
; KNL_64-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test10:
; KNL_32: # BB#0: # %entry
; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm2
; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm3 = [80,80,80,80,80,80,80,80]
; KNL_32-NEXT: vpmulld %ymm3, %ymm1, %ymm1
; KNL_32-NEXT: vpmovqd %zmm0, %ymm0
; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm3 = [820,820,820,820,820,820,820,820]
; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0
; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [68,68,68,68,68,68,68,68]
; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
; KNL_32-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
; KNL_32-NEXT: retl
;
; SKX-LABEL: test10:
; SKX: # BB#0: # %entry
; SKX-NEXT: vpbroadcastq %rdi, %zmm2
; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; SKX-NEXT: vpmovsxdq %ymm1, %zmm1
; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm1, %zmm1
; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; SKX-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
; SKX-NEXT: kxnorw %k0, %k0, %k1
; SKX-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
; SKX-NEXT: retq
;
; SKX_32-LABEL: test10:
; SKX_32: # BB#0: # %entry
; SKX_32-NEXT: vpmulld {{\.LCPI.*}}{1to8}, %ymm1, %ymm1
; SKX_32-NEXT: vpmovqd %zmm0, %ymm0
; SKX_32-NEXT: vpmulld {{\.LCPI.*}}{1to8}, %ymm0, %ymm0
; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to8}, %ymm0, %ymm0
; SKX_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; SKX_32-NEXT: vpaddd {{\.LCPI.*}}{1to8}, %ymm0, %ymm1
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
; SKX_32-NEXT: vpgatherdd (,%ymm1), %ymm0 {%k1}
; SKX_32-NEXT: retl
entry:
%broadcast.splatinsert = insertelement <8 x %struct.ST*> undef, %struct.ST* %base, i32 0
%broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer
%arrayidx = getelementptr %struct.ST, <8 x %struct.ST*> %broadcast.splat, <8 x i64> %i1, i32 2, i32 1, <8 x i32> %ind5, i64 13
%res = call <8 x i32 > @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*>%arrayidx, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
ret <8 x i32> %res
}
; Splat index in GEP, requires broadcast
define <16 x float> @test11(float* %base, i32 %ind) {
; KNL_64-LABEL: test11:
; KNL_64: # BB#0:
; KNL_64-NEXT: vpbroadcastd %esi, %zmm1
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test11:
; KNL_32: # BB#0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vbroadcastss {{[0-9]+}}(%esp), %zmm1
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
; KNL_32-NEXT: retl
;
; SKX-LABEL: test11:
; SKX: # BB#0:
; SKX-NEXT: vpbroadcastd %esi, %zmm1
; SKX-NEXT: kxnorw %k0, %k0, %k1
; SKX-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
; SKX-NEXT: retq
;
; SKX_32-LABEL: test11:
; SKX_32: # BB#0:
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vbroadcastss {{[0-9]+}}(%esp), %zmm1
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
; SKX_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
; SKX_32-NEXT: retl
%broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
%broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
%gep.random = getelementptr float, <16 x float*> %broadcast.splat, i32 %ind
%res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
ret <16 x float>%res
}
; We are checking the uniform base here. It is taken directly from input to vgatherdps
define <16 x float> @test12(float* %base, <16 x i32> %ind) {
; KNL_64-LABEL: test12:
; KNL_64: # BB#0:
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; KNL_64-NEXT: vmovaps %zmm1, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test12:
; KNL_32: # BB#0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
; KNL_32-NEXT: vmovaps %zmm1, %zmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test12:
; SKX: # BB#0:
; SKX-NEXT: kxnorw %k0, %k0, %k1
; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; SKX-NEXT: vmovaps %zmm1, %zmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test12:
; SKX_32: # BB#0:
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
; SKX_32-NEXT: vmovaps %zmm1, %zmm0
; SKX_32-NEXT: retl
%sext_ind = sext <16 x i32> %ind to <16 x i64>
%gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind
%res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
ret <16 x float>%res
}
; The same as the previous, but the mask is undefined
define <16 x float> @test13(float* %base, <16 x i32> %ind) {
; KNL_64-LABEL: test13:
; KNL_64: # BB#0:
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; KNL_64-NEXT: vmovaps %zmm1, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test13:
; KNL_32: # BB#0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
; KNL_32-NEXT: vmovaps %zmm1, %zmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test13:
; SKX: # BB#0:
; SKX-NEXT: kxnorw %k0, %k0, %k1
; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; SKX-NEXT: vmovaps %zmm1, %zmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test13:
; SKX_32: # BB#0:
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
; SKX_32-NEXT: vmovaps %zmm1, %zmm0
; SKX_32-NEXT: retl
%sext_ind = sext <16 x i32> %ind to <16 x i64>
%gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind
%res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
ret <16 x float>%res
}
; The base pointer is not splat, can't find unform base
define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) {
; KNL_64-LABEL: test14:
; KNL_64: # BB#0:
[X86] Don't create VBROADCAST nodes with 256-bit or 512-bit input types Summary: We don't seem to have great rules on what a valid VBROADCAST node looks like. And as a consequence we end up with a lot of patterns to try to catch everything. We have patterns with scalar inputs, 128-bit vector inputs, 256-bit vector inputs, and 512-bit vector inputs. As you can see from the things improved here we are currently missing patterns for 128-bit loads being extended to 256-bit before the vbroadcast. I'd like to propose that VBROADCAST should always take a 128-bit vector type as input. As a first step towards that this patch adds an EXTRACT_SUBVECTOR in front of VBROADCAST when the input is 256 or 512-bits. In the future I would like to add scalar_to_vector around all the scalar operations. And maybe we should consider adding a VBROADCAST+load node to avoid separating loads from the broadcasting operation when the load itself isn't foldable. This requires an additional change in target shuffle combining to look for the extract subvector and look through it to find the original operand. I'm sure this change isn't perfect but was enough to fix a few test failures that were being caused. Another interesting thing I noticed is that the changes in masked_gather_scatter.ll show cases were we don't remove a useless insert into element 1 before broadcasting element 0. Reviewers: delena, RKSimon, zvi Reviewed By: zvi Subscribers: igorb, llvm-commits Differential Revision: https://reviews.llvm.org/D28747 llvm-svn: 295155
2017-02-15 14:58:47 +08:00
; KNL_64-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0
; KNL_64-NEXT: vpbroadcastq %xmm0, %zmm0
; KNL_64-NEXT: vmovd %esi, %xmm1
; KNL_64-NEXT: vpbroadcastd %xmm1, %ymm1
; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1
; KNL_64-NEXT: vpsllq $2, %zmm1, %zmm1
; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
; KNL_64-NEXT: kshiftrw $8, %k1, %k2
; KNL_64-NEXT: vgatherqps (,%zmm0), %ymm1 {%k2}
; KNL_64-NEXT: vgatherqps (,%zmm0), %ymm2 {%k1}
; KNL_64-NEXT: vinsertf64x4 $1, %ymm1, %zmm2, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test14:
; KNL_32: # BB#0:
[X86] Don't create VBROADCAST nodes with 256-bit or 512-bit input types Summary: We don't seem to have great rules on what a valid VBROADCAST node looks like. And as a consequence we end up with a lot of patterns to try to catch everything. We have patterns with scalar inputs, 128-bit vector inputs, 256-bit vector inputs, and 512-bit vector inputs. As you can see from the things improved here we are currently missing patterns for 128-bit loads being extended to 256-bit before the vbroadcast. I'd like to propose that VBROADCAST should always take a 128-bit vector type as input. As a first step towards that this patch adds an EXTRACT_SUBVECTOR in front of VBROADCAST when the input is 256 or 512-bits. In the future I would like to add scalar_to_vector around all the scalar operations. And maybe we should consider adding a VBROADCAST+load node to avoid separating loads from the broadcasting operation when the load itself isn't foldable. This requires an additional change in target shuffle combining to look for the extract subvector and look through it to find the original operand. I'm sure this change isn't perfect but was enough to fix a few test failures that were being caused. Another interesting thing I noticed is that the changes in masked_gather_scatter.ll show cases were we don't remove a useless insert into element 1 before broadcasting element 0. Reviewers: delena, RKSimon, zvi Reviewed By: zvi Subscribers: igorb, llvm-commits Differential Revision: https://reviews.llvm.org/D28747 llvm-svn: 295155
2017-02-15 14:58:47 +08:00
; KNL_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; KNL_32-NEXT: vpbroadcastd %xmm0, %zmm0
; KNL_32-NEXT: vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
; KNL_32-NEXT: vpaddd %zmm1, %zmm0, %zmm1
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
; KNL_32-NEXT: vgatherdps (,%zmm1), %zmm0 {%k1}
; KNL_32-NEXT: retl
;
; SKX-LABEL: test14:
; SKX: # BB#0:
[X86] Don't create VBROADCAST nodes with 256-bit or 512-bit input types Summary: We don't seem to have great rules on what a valid VBROADCAST node looks like. And as a consequence we end up with a lot of patterns to try to catch everything. We have patterns with scalar inputs, 128-bit vector inputs, 256-bit vector inputs, and 512-bit vector inputs. As you can see from the things improved here we are currently missing patterns for 128-bit loads being extended to 256-bit before the vbroadcast. I'd like to propose that VBROADCAST should always take a 128-bit vector type as input. As a first step towards that this patch adds an EXTRACT_SUBVECTOR in front of VBROADCAST when the input is 256 or 512-bits. In the future I would like to add scalar_to_vector around all the scalar operations. And maybe we should consider adding a VBROADCAST+load node to avoid separating loads from the broadcasting operation when the load itself isn't foldable. This requires an additional change in target shuffle combining to look for the extract subvector and look through it to find the original operand. I'm sure this change isn't perfect but was enough to fix a few test failures that were being caused. Another interesting thing I noticed is that the changes in masked_gather_scatter.ll show cases were we don't remove a useless insert into element 1 before broadcasting element 0. Reviewers: delena, RKSimon, zvi Reviewed By: zvi Subscribers: igorb, llvm-commits Differential Revision: https://reviews.llvm.org/D28747 llvm-svn: 295155
2017-02-15 14:58:47 +08:00
; SKX-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0
; SKX-NEXT: vpbroadcastq %xmm0, %zmm0
; SKX-NEXT: vpbroadcastd %esi, %ymm1
; SKX-NEXT: vpmovsxdq %ymm1, %zmm1
; SKX-NEXT: vpsllq $2, %zmm1, %zmm1
; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; SKX-NEXT: kxnorw %k0, %k0, %k1
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: vgatherqps (,%zmm0), %ymm1 {%k2}
; SKX-NEXT: vgatherqps (,%zmm0), %ymm2 {%k1}
; SKX-NEXT: vinsertf32x8 $1, %ymm1, %zmm2, %zmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test14:
; SKX_32: # BB#0:
[X86] Don't create VBROADCAST nodes with 256-bit or 512-bit input types Summary: We don't seem to have great rules on what a valid VBROADCAST node looks like. And as a consequence we end up with a lot of patterns to try to catch everything. We have patterns with scalar inputs, 128-bit vector inputs, 256-bit vector inputs, and 512-bit vector inputs. As you can see from the things improved here we are currently missing patterns for 128-bit loads being extended to 256-bit before the vbroadcast. I'd like to propose that VBROADCAST should always take a 128-bit vector type as input. As a first step towards that this patch adds an EXTRACT_SUBVECTOR in front of VBROADCAST when the input is 256 or 512-bits. In the future I would like to add scalar_to_vector around all the scalar operations. And maybe we should consider adding a VBROADCAST+load node to avoid separating loads from the broadcasting operation when the load itself isn't foldable. This requires an additional change in target shuffle combining to look for the extract subvector and look through it to find the original operand. I'm sure this change isn't perfect but was enough to fix a few test failures that were being caused. Another interesting thing I noticed is that the changes in masked_gather_scatter.ll show cases were we don't remove a useless insert into element 1 before broadcasting element 0. Reviewers: delena, RKSimon, zvi Reviewed By: zvi Subscribers: igorb, llvm-commits Differential Revision: https://reviews.llvm.org/D28747 llvm-svn: 295155
2017-02-15 14:58:47 +08:00
; SKX_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; SKX_32-NEXT: vpbroadcastd %xmm0, %zmm0
; SKX_32-NEXT: vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
; SKX_32-NEXT: vpaddd %zmm1, %zmm0, %zmm1
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
; SKX_32-NEXT: vgatherdps (,%zmm1), %zmm0 {%k1}
; SKX_32-NEXT: retl
%broadcast.splatinsert = insertelement <16 x float*> %vec, float* %base, i32 1
%broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
%gep.random = getelementptr float, <16 x float*> %broadcast.splat, i32 %ind
%res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
ret <16 x float>%res
}
declare <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*>, i32, <4 x i1>, <4 x float>)
declare <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*>, i32, <4 x i1>, <4 x double>)
declare <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*>, i32, <2 x i1>, <2 x double>)
; Gather smaller than existing instruction
define <4 x float> @test15(float* %base, <4 x i32> %ind, <4 x i1> %mask) {
; KNL_64-LABEL: test15:
; KNL_64: # BB#0:
; KNL_64-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
; KNL_64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; KNL_64-NEXT: vpxor %ymm2, %ymm2, %ymm2
; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm2
; KNL_64-NEXT: vpslld $31, %ymm1, %ymm0
; KNL_64-NEXT: vptestmd %zmm0, %zmm0, %k1
; KNL_64-NEXT: vgatherqps (%rdi,%zmm2,4), %ymm0 {%k1}
; KNL_64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test15:
; KNL_32: # BB#0:
; KNL_32-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
; KNL_32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; KNL_32-NEXT: vpxor %ymm2, %ymm2, %ymm2
; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm2
; KNL_32-NEXT: vpslld $31, %ymm1, %ymm0
; KNL_32-NEXT: vptestmd %zmm0, %zmm0, %k1
; KNL_32-NEXT: vgatherqps (%eax,%zmm2,4), %ymm0 {%k1}
; KNL_32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test15:
; SKX: # BB#0:
; SKX-NEXT: vpslld $31, %xmm1, %xmm1
; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm1 {%k1}
; SKX-NEXT: vmovaps %xmm1, %xmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test15:
; SKX_32: # BB#0:
; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
; SKX_32-NEXT: vptestmd %xmm1, %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vgatherdps (%eax,%xmm0,4), %xmm1 {%k1}
; SKX_32-NEXT: vmovaps %xmm1, %xmm0
; SKX_32-NEXT: retl
%sext_ind = sext <4 x i32> %ind to <4 x i64>
%gep.random = getelementptr float, float* %base, <4 x i64> %sext_ind
%res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.random, i32 4, <4 x i1> %mask, <4 x float> undef)
ret <4 x float>%res
}
; Gather smaller than existing instruction
define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x double> %src0) {
; KNL_64-LABEL: test16:
; KNL_64: # BB#0:
; KNL_64-NEXT: # kill: %YMM2<def> %YMM2<kill> %ZMM2<def>
; KNL_64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
; KNL_64-NEXT: vpsrad $31, %xmm1, %xmm1
; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1
; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
; KNL_64-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0
; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_64-NEXT: vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1}
; KNL_64-NEXT: vmovapd %ymm2, %ymm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test16:
; KNL_32: # BB#0:
; KNL_32-NEXT: # kill: %YMM2<def> %YMM2<kill> %ZMM2<def>
; KNL_32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
; KNL_32-NEXT: vpsrad $31, %xmm1, %xmm1
; KNL_32-NEXT: vpmovsxdq %xmm1, %ymm1
; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
; KNL_32-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0
; KNL_32-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_32-NEXT: vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1}
; KNL_32-NEXT: vmovapd %ymm2, %ymm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test16:
; SKX: # BB#0:
; SKX-NEXT: vpslld $31, %xmm1, %xmm1
; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
; SKX-NEXT: vgatherdpd (%rdi,%xmm0,8), %ymm2 {%k1}
; SKX-NEXT: vmovapd %ymm2, %ymm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test16:
; SKX_32: # BB#0:
; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
; SKX_32-NEXT: vptestmd %xmm1, %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vgatherdpd (%eax,%xmm0,8), %ymm2 {%k1}
; SKX_32-NEXT: vmovapd %ymm2, %ymm0
; SKX_32-NEXT: retl
%sext_ind = sext <4 x i32> %ind to <4 x i64>
%gep.random = getelementptr double, double* %base, <4 x i64> %sext_ind
%res = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> %gep.random, i32 4, <4 x i1> %mask, <4 x double> %src0)
ret <4 x double>%res
}
define <2 x double> @test17(double* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x double> %src0) {
; KNL_64-LABEL: test17:
; KNL_64: # BB#0:
; KNL_64-NEXT: # kill: %XMM2<def> %XMM2<kill> %ZMM2<def>
; KNL_64-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_64-NEXT: vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1}
; KNL_64-NEXT: vmovapd %xmm2, %xmm0
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test17:
; KNL_32: # BB#0:
; KNL_32-NEXT: # kill: %XMM2<def> %XMM2<kill> %ZMM2<def>
; KNL_32-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_32-NEXT: vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1}
; KNL_32-NEXT: vmovapd %xmm2, %xmm0
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test17:
; SKX: # BB#0:
; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
; SKX-NEXT: vptestmq %xmm1, %xmm1, %k1
; SKX-NEXT: vgatherqpd (%rdi,%xmm0,8), %xmm2 {%k1}
; SKX-NEXT: vmovapd %xmm2, %xmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test17:
; SKX_32: # BB#0:
; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vgatherqpd (%eax,%xmm0,8), %xmm2 {%k1}
; SKX_32-NEXT: vmovapd %xmm2, %xmm0
; SKX_32-NEXT: retl
%sext_ind = sext <2 x i32> %ind to <2 x i64>
%gep.random = getelementptr double, double* %base, <2 x i64> %sext_ind
%res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %gep.random, i32 4, <2 x i1> %mask, <2 x double> %src0)
ret <2 x double>%res
}
declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> , <4 x i32*> , i32 , <4 x i1> )
declare void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> , <4 x double*> , i32 , <4 x i1> )
declare void @llvm.masked.scatter.v2i64.v2p0i64(<2 x i64> , <2 x i64*> , i32 , <2 x i1> )
declare void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> , <2 x i32*> , i32 , <2 x i1> )
declare void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> , <2 x float*> , i32 , <2 x i1> )
define void @test18(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) {
; KNL_64-LABEL: test18:
; KNL_64: # BB#0:
; KNL_64-NEXT: # kill: %XMM2<def> %XMM2<kill> %YMM2<def>
; KNL_64-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
; KNL_64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3
; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
; KNL_64-NEXT: vpslld $31, %ymm2, %ymm2
; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test18:
; KNL_32: # BB#0:
; KNL_32-NEXT: # kill: %XMM2<def> %XMM2<kill> %YMM2<def>
; KNL_32-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
; KNL_32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3
; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1
; KNL_32-NEXT: vpslld $31, %ymm2, %ymm2
; KNL_32-NEXT: vptestmd %zmm2, %zmm2, %k1
; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test18:
; SKX: # BB#0:
; SKX-NEXT: vpslld $31, %xmm2, %xmm2
; SKX-NEXT: vptestmd %xmm2, %xmm2, %k1
; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; SKX_32-LABEL: test18:
; SKX_32: # BB#0:
; SKX_32-NEXT: vpslld $31, %xmm2, %xmm2
; SKX_32-NEXT: vptestmd %xmm2, %xmm2, %k1
; SKX_32-NEXT: vpscatterdd %xmm0, (,%xmm1) {%k1}
; SKX_32-NEXT: retl
call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask)
ret void
}
define void @test19(<4 x double>%a1, double* %ptr, <4 x i1>%mask, <4 x i64> %ind) {
; KNL_64-LABEL: test19:
; KNL_64: # BB#0:
; KNL_64-NEXT: # kill: %YMM2<def> %YMM2<kill> %ZMM2<def>
; KNL_64-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
; KNL_64-NEXT: vpsrad $31, %xmm1, %xmm1
; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1
; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
; KNL_64-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_64-NEXT: vscatterqpd %zmm0, (%rdi,%zmm2,8) {%k1}
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test19:
; KNL_32: # BB#0:
; KNL_32-NEXT: # kill: %YMM2<def> %YMM2<kill> %ZMM2<def>
; KNL_32-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
; KNL_32-NEXT: vpsrad $31, %xmm1, %xmm1
; KNL_32-NEXT: vpmovsxdq %xmm1, %ymm1
; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
; KNL_32-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_32-NEXT: vscatterqpd %zmm0, (%eax,%zmm2,8) {%k1}
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test19:
; SKX: # BB#0:
; SKX-NEXT: vpslld $31, %xmm1, %xmm1
; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
; SKX-NEXT: vscatterqpd %ymm0, (%rdi,%ymm2,8) {%k1}
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; SKX_32-LABEL: test19:
; SKX_32: # BB#0:
; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
; SKX_32-NEXT: vptestmd %xmm1, %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vscatterqpd %ymm0, (%eax,%ymm2,8) {%k1}
; SKX_32-NEXT: vzeroupper
; SKX_32-NEXT: retl
%gep = getelementptr double, double* %ptr, <4 x i64> %ind
call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> %a1, <4 x double*> %gep, i32 8, <4 x i1> %mask)
ret void
}
; Data type requires widening
define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) {
; KNL_64-LABEL: test20:
; KNL_64: # BB#0:
; KNL_64-NEXT: # kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
; KNL_64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; KNL_64-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,2],zero,zero
; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3
; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
; KNL_64-NEXT: vpslld $31, %ymm2, %ymm2
; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT: vscatterqps %ymm0, (,%zmm1) {%k1}
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test20:
; KNL_32: # BB#0:
; KNL_32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; KNL_32-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,2],zero,zero
; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3
; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1
; KNL_32-NEXT: vpslld $31, %ymm2, %ymm2
; KNL_32-NEXT: vptestmd %zmm2, %zmm2, %k1
; KNL_32-NEXT: vscatterqps %ymm0, (,%zmm1) {%k1}
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test20:
; SKX: # BB#0:
; SKX-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
; SKX-NEXT: vpsllq $63, %xmm2, %xmm2
; SKX-NEXT: vptestmq %xmm2, %xmm2, %k0
; SKX-NEXT: kshiftlb $6, %k0, %k0
; SKX-NEXT: kshiftrb $6, %k0, %k1
; SKX-NEXT: vscatterqps %xmm0, (,%ymm1) {%k1}
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; SKX_32-LABEL: test20:
; SKX_32: # BB#0:
; SKX_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2
; SKX_32-NEXT: vptestmq %xmm2, %xmm2, %k0
; SKX_32-NEXT: kshiftlb $6, %k0, %k0
; SKX_32-NEXT: kshiftrb $6, %k0, %k1
; SKX_32-NEXT: vscatterdps %xmm0, (,%xmm1) {%k1}
; SKX_32-NEXT: retl
call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> %a1, <2 x float*> %ptr, i32 4, <2 x i1> %mask)
ret void
}
; Data type requires promotion
define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) {
; KNL_64-LABEL: test21:
; KNL_64: # BB#0:
; KNL_64-NEXT: # kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
; KNL_64-NEXT: vinserti32x4 $0, %xmm2, %zmm3, %zmm2
; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_64-NEXT: vpsllq $63, %zmm2, %zmm2
; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k1
; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test21:
; KNL_32: # BB#0:
; KNL_32-NEXT: # kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
; KNL_32-NEXT: vinserti32x4 $0, %xmm2, %zmm3, %zmm2
; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_32-NEXT: vpsllq $63, %zmm2, %zmm2
; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test21:
; SKX: # BB#0:
; SKX-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
; SKX-NEXT: vpsllq $63, %xmm2, %xmm2
; SKX-NEXT: vptestmq %xmm2, %xmm2, %k0
; SKX-NEXT: kshiftlb $6, %k0, %k0
; SKX-NEXT: kshiftrb $6, %k0, %k1
; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; SKX_32-LABEL: test21:
; SKX_32: # BB#0:
; SKX_32-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2
; SKX_32-NEXT: vptestmq %xmm2, %xmm2, %k0
; SKX_32-NEXT: kshiftlb $6, %k0, %k0
; SKX_32-NEXT: kshiftrb $6, %k0, %k1
; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX_32-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
; SKX_32-NEXT: vzeroupper
; SKX_32-NEXT: retl
call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> %mask)
ret void
}
; The result type requires widening
declare <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*>, i32, <2 x i1>, <2 x float>)
define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x float> %src0) {
; KNL_64-LABEL: test22:
; KNL_64: # BB#0:
; KNL_64-NEXT: # kill: %XMM2<def> %XMM2<kill> %YMM2<def>
; KNL_64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3
; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0
; KNL_64-NEXT: vpslld $31, %ymm1, %ymm1
; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL_64-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm2 {%k1}
; KNL_64-NEXT: vmovaps %xmm2, %xmm0
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test22:
; KNL_32: # BB#0:
; KNL_32-NEXT: # kill: %XMM2<def> %XMM2<kill> %YMM2<def>
; KNL_32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3
; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0
; KNL_32-NEXT: vpslld $31, %ymm1, %ymm1
; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT: vgatherqps (%eax,%zmm0,4), %ymm2 {%k1}
; KNL_32-NEXT: vmovaps %xmm2, %xmm0
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test22:
; SKX: # BB#0:
; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
; SKX-NEXT: vptestmq %xmm1, %xmm1, %k0
; SKX-NEXT: kshiftlb $6, %k0, %k0
; SKX-NEXT: kshiftrb $6, %k0, %k1
; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm2 {%k1}
; SKX-NEXT: vmovaps %xmm2, %xmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test22:
; SKX_32: # BB#0:
; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k0
; SKX_32-NEXT: kshiftlb $6, %k0, %k0
; SKX_32-NEXT: kshiftrb $6, %k0, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vgatherdps (%eax,%xmm0,4), %xmm2 {%k1}
; SKX_32-NEXT: vmovaps %xmm2, %xmm0
; SKX_32-NEXT: retl
%sext_ind = sext <2 x i32> %ind to <2 x i64>
%gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind
%res = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> %gep.random, i32 4, <2 x i1> %mask, <2 x float> %src0)
ret <2 x float>%res
}
define <2 x float> @test22a(float* %base, <2 x i64> %ind, <2 x i1> %mask, <2 x float> %src0) {
; KNL_64-LABEL: test22a:
; KNL_64: # BB#0:
; KNL_64-NEXT: # kill: %XMM2<def> %XMM2<kill> %YMM2<def>
; KNL_64-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; KNL_64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3
; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
; KNL_64-NEXT: vpslld $31, %ymm1, %ymm1
; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL_64-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm2 {%k1}
; KNL_64-NEXT: vmovaps %xmm2, %xmm0
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test22a:
; KNL_32: # BB#0:
; KNL_32-NEXT: # kill: %XMM2<def> %XMM2<kill> %YMM2<def>
; KNL_32-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; KNL_32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3
; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpslld $31, %ymm1, %ymm1
; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT: vgatherqps (%eax,%zmm0,4), %ymm2 {%k1}
; KNL_32-NEXT: vmovaps %xmm2, %xmm0
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test22a:
; SKX: # BB#0:
; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
; SKX-NEXT: vptestmq %xmm1, %xmm1, %k1
; SKX-NEXT: vgatherqps (%rdi,%xmm0,4), %xmm2 {%k1}
; SKX-NEXT: vmovaps %xmm2, %xmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test22a:
; SKX_32: # BB#0:
; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vgatherqps (%eax,%xmm0,4), %xmm2 {%k1}
; SKX_32-NEXT: vmovaps %xmm2, %xmm0
; SKX_32-NEXT: retl
%gep.random = getelementptr float, float* %base, <2 x i64> %ind
%res = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> %gep.random, i32 4, <2 x i1> %mask, <2 x float> %src0)
ret <2 x float>%res
}
declare <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>)
declare <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*>, i32, <2 x i1>, <2 x i64>)
define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %src0) {
; KNL_64-LABEL: test23:
; KNL_64: # BB#0:
; KNL_64-NEXT: # kill: %XMM2<def> %XMM2<kill> %ZMM2<def>
; KNL_64-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1}
; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test23:
; KNL_32: # BB#0:
; KNL_32-NEXT: # kill: %XMM2<def> %XMM2<kill> %ZMM2<def>
; KNL_32-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1}
; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test23:
; SKX: # BB#0:
; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
; SKX-NEXT: vptestmq %xmm1, %xmm1, %k1
; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SKX-NEXT: vpgatherqd (%rdi,%xmm0,4), %xmm1 {%k1}
; SKX-NEXT: vpmovsxdq %xmm1, %xmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test23:
; SKX_32: # BB#0:
; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SKX_32-NEXT: vpgatherqd (%eax,%xmm0,4), %xmm1 {%k1}
; SKX_32-NEXT: vpmovsxdq %xmm1, %xmm0
; SKX_32-NEXT: retl
%sext_ind = sext <2 x i32> %ind to <2 x i64>
%gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind
%res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0)
ret <2 x i32>%res
}
define <2 x i32> @test24(i32* %base, <2 x i32> %ind) {
; KNL_64-LABEL: test24:
; KNL_64: # BB#0:
; KNL_64-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; KNL_64-NEXT: movb $3, %al
; KNL_64-NEXT: kmovw %eax, %k1
; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1}
; KNL_64-NEXT: vmovdqa %xmm1, %xmm0
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test24:
; KNL_32: # BB#0:
; KNL_32-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpxord %zmm1, %zmm1, %zmm1
; KNL_32-NEXT: vinserti32x4 $0, {{\.LCPI.*}}, %zmm1, %zmm1
; KNL_32-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1}
; KNL_32-NEXT: vmovdqa %xmm1, %xmm0
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test24:
; SKX: # BB#0:
; SKX-NEXT: kxnorw %k0, %k0, %k1
; SKX-NEXT: vpgatherqd (%rdi,%xmm0,4), %xmm1 {%k1}
; SKX-NEXT: vpmovsxdq %xmm1, %xmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test24:
; SKX_32: # BB#0:
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
; SKX_32-NEXT: vpgatherqd (%eax,%xmm0,4), %xmm1 {%k1}
; SKX_32-NEXT: vpmovsxdq %xmm1, %xmm0
; SKX_32-NEXT: retl
%sext_ind = sext <2 x i32> %ind to <2 x i64>
%gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind
%res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
ret <2 x i32>%res
}
define <2 x i64> @test25(i64* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> %src0) {
; KNL_64-LABEL: test25:
; KNL_64: # BB#0:
; KNL_64-NEXT: # kill: %XMM2<def> %XMM2<kill> %ZMM2<def>
; KNL_64-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1}
; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test25:
; KNL_32: # BB#0:
; KNL_32-NEXT: # kill: %XMM2<def> %XMM2<kill> %ZMM2<def>
; KNL_32-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1}
; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test25:
; SKX: # BB#0:
; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
; SKX-NEXT: vptestmq %xmm1, %xmm1, %k1
; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm2 {%k1}
; SKX-NEXT: vmovdqa %xmm2, %xmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test25:
; SKX_32: # BB#0:
; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vpgatherqq (%eax,%xmm0,8), %xmm2 {%k1}
; SKX_32-NEXT: vmovdqa %xmm2, %xmm0
; SKX_32-NEXT: retl
%sext_ind = sext <2 x i32> %ind to <2 x i64>
%gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind
%res = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> %gep.random, i32 8, <2 x i1> %mask, <2 x i64> %src0)
ret <2 x i64>%res
}
define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) {
; KNL_64-LABEL: test26:
; KNL_64: # BB#0:
; KNL_64-NEXT: # kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
; KNL_64-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; KNL_64-NEXT: movb $3, %al
; KNL_64-NEXT: kmovw %eax, %k1
; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1}
; KNL_64-NEXT: vmovdqa %xmm1, %xmm0
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test26:
; KNL_32: # BB#0:
; KNL_32-NEXT: # kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
; KNL_32-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpxord %zmm2, %zmm2, %zmm2
; KNL_32-NEXT: vinserti32x4 $0, {{\.LCPI.*}}, %zmm2, %zmm2
; KNL_32-NEXT: vpsllq $63, %zmm2, %zmm2
; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1}
; KNL_32-NEXT: vmovdqa %xmm1, %xmm0
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test26:
; SKX: # BB#0:
; SKX-NEXT: kxnorw %k0, %k0, %k1
; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm1 {%k1}
; SKX-NEXT: vmovdqa %xmm1, %xmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test26:
; SKX_32: # BB#0:
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
; SKX_32-NEXT: vpgatherqq (%eax,%xmm0,8), %xmm1 {%k1}
; SKX_32-NEXT: vmovdqa %xmm1, %xmm0
; SKX_32-NEXT: retl
%sext_ind = sext <2 x i32> %ind to <2 x i64>
%gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind
%res = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> %gep.random, i32 8, <2 x i1> <i1 true, i1 true>, <2 x i64> %src0)
ret <2 x i64>%res
}
; Result type requires widening; all-ones mask
define <2 x float> @test27(float* %base, <2 x i32> %ind) {
; KNL_64-LABEL: test27:
; KNL_64: # BB#0:
; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm1
; KNL_64-NEXT: movb $3, %al
; KNL_64-NEXT: kmovw %eax, %k1
; KNL_64-NEXT: vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
; KNL_64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test27:
; KNL_32: # BB#0:
; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1
; KNL_32-NEXT: movb $3, %cl
; KNL_32-NEXT: kmovw %ecx, %k1
; KNL_32-NEXT: vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
; KNL_32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test27:
; SKX: # BB#0:
; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
; SKX-NEXT: movb $3, %al
; SKX-NEXT: kmovw %eax, %k1
; SKX-NEXT: vgatherdps (%rdi,%xmm1,4), %xmm0 {%k1}
; SKX-NEXT: retq
;
; SKX_32-LABEL: test27:
; SKX_32: # BB#0:
; SKX_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: movb $3, %cl
; SKX_32-NEXT: kmovw %ecx, %k1
; SKX_32-NEXT: vgatherdps (%eax,%xmm1,4), %xmm0 {%k1}
; SKX_32-NEXT: retl
%sext_ind = sext <2 x i32> %ind to <2 x i64>
%gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind
%res = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x float> undef)
ret <2 x float>%res
}
; Data type requires promotion, mask is all-ones
define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) {
; KNL_64-LABEL: test28:
; KNL_64: # BB#0:
; KNL_64-NEXT: # kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_64-NEXT: movb $3, %al
; KNL_64-NEXT: kmovw %eax, %k1
; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test28:
; KNL_32: # BB#0:
; KNL_32-NEXT: # kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_32-NEXT: vpxord %zmm2, %zmm2, %zmm2
; KNL_32-NEXT: vinserti32x4 $0, {{\.LCPI.*}}, %zmm2, %zmm2
; KNL_32-NEXT: vpsllq $63, %zmm2, %zmm2
; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test28:
; SKX: # BB#0:
; SKX-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
; SKX-NEXT: movb $3, %al
; SKX-NEXT: kmovw %eax, %k1
; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; SKX_32-LABEL: test28:
; SKX_32: # BB#0:
; SKX_32-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
; SKX_32-NEXT: movb $3, %al
; SKX_32-NEXT: kmovw %eax, %k1
; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX_32-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
; SKX_32-NEXT: vzeroupper
; SKX_32-NEXT: retl
call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> <i1 true, i1 true>)
ret void
}
; SCALAR-LABEL: test29
; SCALAR: extractelement <16 x float*>
; SCALAR-NEXT: load float
; SCALAR-NEXT: insertelement <16 x float>
; SCALAR-NEXT: extractelement <16 x float*>
; SCALAR-NEXT: load float
define <16 x float> @test29(float* %base, <16 x i32> %ind) {
; KNL_64-LABEL: test29:
; KNL_64: # BB#0:
; KNL_64-NEXT: movw $44, %ax
; KNL_64-NEXT: kmovw %eax, %k1
; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; KNL_64-NEXT: vmovaps %zmm1, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test29:
; KNL_32: # BB#0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: movw $44, %cx
; KNL_32-NEXT: kmovw %ecx, %k1
; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
; KNL_32-NEXT: vmovaps %zmm1, %zmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test29:
; SKX: # BB#0:
; SKX-NEXT: movw $44, %ax
; SKX-NEXT: kmovw %eax, %k1
; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; SKX-NEXT: vmovaps %zmm1, %zmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test29:
; SKX_32: # BB#0:
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: movw $44, %cx
; SKX_32-NEXT: kmovw %ecx, %k1
; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
; SKX_32-NEXT: vmovaps %zmm1, %zmm0
; SKX_32-NEXT: retl
%broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
%broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
%sext_ind = sext <16 x i32> %ind to <16 x i64>
%gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
%res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float> undef)
ret <16 x float>%res
}
; Check non-power-of-2 case. It should be scalarized.
declare <3 x i32> @llvm.masked.gather.v3i32.v3p0i32(<3 x i32*>, i32, <3 x i1>, <3 x i32>)
define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> %src0) {
; ALL-LABEL: test30
; ALL-NOT: gather
%sext_ind = sext <3 x i32> %ind to <3 x i64>
%gep.random = getelementptr i32, <3 x i32*> %base, <3 x i64> %sext_ind
%res = call <3 x i32> @llvm.masked.gather.v3i32.v3p0i32(<3 x i32*> %gep.random, i32 4, <3 x i1> %mask, <3 x i32> %src0)
ret <3 x i32>%res
}
declare <16 x float*> @llvm.masked.gather.v16p0f32.v16p0p0f32(<16 x float**>, i32, <16 x i1>, <16 x float*>)
define <16 x float*> @test31(<16 x float**> %ptrs) {
; KNL_64-LABEL: test31:
; KNL_64: # BB#0:
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
; KNL_64-NEXT: kxnorw %k0, %k0, %k2
; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm2 {%k2}
; KNL_64-NEXT: kshiftrw $8, %k1, %k1
; KNL_64-NEXT: vpgatherqq (,%zmm1), %zmm3 {%k1}
; KNL_64-NEXT: vmovdqa64 %zmm2, %zmm0
; KNL_64-NEXT: vmovdqa64 %zmm3, %zmm1
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test31:
; KNL_32: # BB#0:
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1}
; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test31:
; SKX: # BB#0:
; SKX-NEXT: kxnorw %k0, %k0, %k1
; SKX-NEXT: kxnorw %k0, %k0, %k2
; SKX-NEXT: vpgatherqq (,%zmm0), %zmm2 {%k2}
; SKX-NEXT: kshiftrw $8, %k1, %k1
; SKX-NEXT: vpgatherqq (,%zmm1), %zmm3 {%k1}
; SKX-NEXT: vmovdqa64 %zmm2, %zmm0
; SKX-NEXT: vmovdqa64 %zmm3, %zmm1
; SKX-NEXT: retq
;
; SKX_32-LABEL: test31:
; SKX_32: # BB#0:
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1}
; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm0
; SKX_32-NEXT: retl
%res = call <16 x float*> @llvm.masked.gather.v16p0f32.v16p0p0f32(<16 x float**> %ptrs, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float*> undef)
ret <16 x float*>%res
}
define <16 x i32> @test_gather_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %src0) {
; KNL_64-LABEL: test_gather_16i32:
; KNL_64: # BB#0:
; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT: vextracti64x4 $1, %zmm3, %ymm2
; KNL_64-NEXT: kshiftrw $8, %k1, %k2
; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test_gather_16i32:
; KNL_32: # BB#0:
; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
; KNL_32-NEXT: vmovdqa64 %zmm2, %zmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test_gather_16i32:
; SKX: # BB#0:
; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
; SKX-NEXT: vpslld $31, %zmm2, %zmm2
; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT: vextracti32x8 $1, %zmm3, %ymm2
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
; SKX-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test_gather_16i32:
; SKX_32: # BB#0:
; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
; SKX_32-NEXT: vmovdqa64 %zmm2, %zmm0
; SKX_32-NEXT: retl
%res = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %ptrs, i32 4, <16 x i1> %mask, <16 x i32> %src0)
ret <16 x i32> %res
}
define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
; KNL_64-LABEL: test_gather_16i64:
; KNL_64: # BB#0:
; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT: kshiftrw $8, %k1, %k2
; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k1}
; KNL_64-NEXT: vpgatherqq (,%zmm1), %zmm4 {%k2}
; KNL_64-NEXT: vmovdqa64 %zmm3, %zmm0
; KNL_64-NEXT: vmovdqa64 %zmm4, %zmm1
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test_gather_16i64:
; KNL_32: # BB#0:
; KNL_32-NEXT: pushl %ebp
; KNL_32-NEXT: .Lcfi0:
; KNL_32-NEXT: .cfi_def_cfa_offset 8
; KNL_32-NEXT: .Lcfi1:
; KNL_32-NEXT: .cfi_offset %ebp, -8
; KNL_32-NEXT: movl %esp, %ebp
; KNL_32-NEXT: .Lcfi2:
; KNL_32-NEXT: .cfi_def_cfa_register %ebp
; KNL_32-NEXT: andl $-64, %esp
; KNL_32-NEXT: subl $64, %esp
; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT: vmovdqa64 8(%ebp), %zmm1
; KNL_32-NEXT: kshiftrw $8, %k1, %k2
; KNL_32-NEXT: vpgatherdq (,%ymm0), %zmm2 {%k1}
; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; KNL_32-NEXT: vpgatherdq (,%ymm0), %zmm1 {%k2}
; KNL_32-NEXT: vmovdqa64 %zmm2, %zmm0
; KNL_32-NEXT: movl %ebp, %esp
; KNL_32-NEXT: popl %ebp
; KNL_32-NEXT: retl
;
; SKX-LABEL: test_gather_16i64:
; SKX: # BB#0:
; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
; SKX-NEXT: vpslld $31, %zmm2, %zmm2
; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k1}
; SKX-NEXT: vpgatherqq (,%zmm1), %zmm4 {%k2}
; SKX-NEXT: vmovdqa64 %zmm3, %zmm0
; SKX-NEXT: vmovdqa64 %zmm4, %zmm1
; SKX-NEXT: retq
;
; SKX_32-LABEL: test_gather_16i64:
; SKX_32: # BB#0:
; SKX_32-NEXT: pushl %ebp
; SKX_32-NEXT: .Lcfi1:
[X86] Correct dwarf unwind information in function epilogue CFI instructions that set appropriate cfa offset and cfa register are now inserted in emitEpilogue() in X86FrameLowering. Majority of the changes in this patch: 1. Ensure that CFI instructions do not affect code generation. 2. Enable maintaining correct information about cfa offset and cfa register in a function when basic blocks are reordered, merged, split, duplicated. These changes are target independent and described below. Changed CFI instructions so that they: 1. are duplicable 2. are not counted as instructions when tail duplicating or tail merging 3. can be compared as equal Add information to each MachineBasicBlock about cfa offset and cfa register that are valid at its entry and exit (incoming and outgoing CFI info). Add support for updating this information when basic blocks are merged, split, duplicated, created. Add a verification pass (CFIInfoVerifier) that checks that outgoing cfa offset and register of predecessor blocks match incoming values of their successors. Incoming and outgoing CFI information is used by a late pass (CFIInstrInserter) that corrects CFA calculation rule for a basic block if needed. That means that additional CFI instructions get inserted at basic block beginning to correct the rule for calculating CFA. Having CFI instructions in function epilogue can cause incorrect CFA calculation rule for some basic blocks. This can happen if, due to basic block reordering, or the existence of multiple epilogue blocks, some of the blocks have wrong cfa offset and register values set by the epilogue block above them. Patch by Violeta Vukobrat. Differential Revision: https://reviews.llvm.org/D18046 llvm-svn: 306529
2017-06-28 18:21:17 +08:00
; SKX_32-NEXT: .cfi_def_cfa_offset 8
; SKX_32-NEXT: .Lcfi2:
; SKX_32-NEXT: .cfi_offset %ebp, -8
; SKX_32-NEXT: movl %esp, %ebp
; SKX_32-NEXT: .Lcfi3:
; SKX_32-NEXT: .cfi_def_cfa_register %ebp
; SKX_32-NEXT: andl $-64, %esp
; SKX_32-NEXT: subl $64, %esp
; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; SKX_32-NEXT: vmovdqa64 8(%ebp), %zmm1
; SKX_32-NEXT: kshiftrw $8, %k1, %k2
; SKX_32-NEXT: vpgatherdq (,%ymm0), %zmm2 {%k1}
; SKX_32-NEXT: vextracti32x8 $1, %zmm0, %ymm0
; SKX_32-NEXT: vpgatherdq (,%ymm0), %zmm1 {%k2}
; SKX_32-NEXT: vmovdqa64 %zmm2, %zmm0
; SKX_32-NEXT: movl %ebp, %esp
; SKX_32-NEXT: popl %ebp
; SKX_32-NEXT: retl
%res = call <16 x i64> @llvm.masked.gather.v16i64.v16p0i64(<16 x i64*> %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0)
ret <16 x i64> %res
}
declare <16 x i64> @llvm.masked.gather.v16i64.v16p0i64(<16 x i64*> %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0)
define <16 x float> @test_gather_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0) {
; KNL_64-LABEL: test_gather_16f32:
; KNL_64: # BB#0:
; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT: vextractf64x4 $1, %zmm3, %ymm2
; KNL_64-NEXT: kshiftrw $8, %k1, %k2
; KNL_64-NEXT: vgatherqps (,%zmm1), %ymm2 {%k2}
; KNL_64-NEXT: vgatherqps (,%zmm0), %ymm3 {%k1}
; KNL_64-NEXT: vinsertf64x4 $1, %ymm2, %zmm3, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test_gather_16f32:
; KNL_32: # BB#0:
; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT: vgatherdps (,%zmm0), %zmm2 {%k1}
; KNL_32-NEXT: vmovaps %zmm2, %zmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test_gather_16f32:
; SKX: # BB#0:
; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
; SKX-NEXT: vpslld $31, %zmm2, %zmm2
; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT: vextractf32x8 $1, %zmm3, %ymm2
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: vgatherqps (,%zmm1), %ymm2 {%k2}
; SKX-NEXT: vgatherqps (,%zmm0), %ymm3 {%k1}
; SKX-NEXT: vinsertf32x8 $1, %ymm2, %zmm3, %zmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test_gather_16f32:
; SKX_32: # BB#0:
; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; SKX_32-NEXT: vgatherdps (,%zmm0), %zmm2 {%k1}
; SKX_32-NEXT: vmovaps %zmm2, %zmm0
; SKX_32-NEXT: retl
%res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %ptrs, i32 4, <16 x i1> %mask, <16 x float> %src0)
ret <16 x float> %res
}
define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0) {
; KNL_64-LABEL: test_gather_16f64:
; KNL_64: # BB#0:
; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT: kshiftrw $8, %k1, %k2
; KNL_64-NEXT: vgatherqpd (,%zmm0), %zmm3 {%k1}
; KNL_64-NEXT: vgatherqpd (,%zmm1), %zmm4 {%k2}
; KNL_64-NEXT: vmovapd %zmm3, %zmm0
; KNL_64-NEXT: vmovapd %zmm4, %zmm1
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test_gather_16f64:
; KNL_32: # BB#0:
; KNL_32-NEXT: pushl %ebp
; KNL_32-NEXT: .Lcfi3:
[X86] Correct dwarf unwind information in function epilogue CFI instructions that set appropriate cfa offset and cfa register are now inserted in emitEpilogue() in X86FrameLowering. Majority of the changes in this patch: 1. Ensure that CFI instructions do not affect code generation. 2. Enable maintaining correct information about cfa offset and cfa register in a function when basic blocks are reordered, merged, split, duplicated. These changes are target independent and described below. Changed CFI instructions so that they: 1. are duplicable 2. are not counted as instructions when tail duplicating or tail merging 3. can be compared as equal Add information to each MachineBasicBlock about cfa offset and cfa register that are valid at its entry and exit (incoming and outgoing CFI info). Add support for updating this information when basic blocks are merged, split, duplicated, created. Add a verification pass (CFIInfoVerifier) that checks that outgoing cfa offset and register of predecessor blocks match incoming values of their successors. Incoming and outgoing CFI information is used by a late pass (CFIInstrInserter) that corrects CFA calculation rule for a basic block if needed. That means that additional CFI instructions get inserted at basic block beginning to correct the rule for calculating CFA. Having CFI instructions in function epilogue can cause incorrect CFA calculation rule for some basic blocks. This can happen if, due to basic block reordering, or the existence of multiple epilogue blocks, some of the blocks have wrong cfa offset and register values set by the epilogue block above them. Patch by Violeta Vukobrat. Differential Revision: https://reviews.llvm.org/D18046 llvm-svn: 306529
2017-06-28 18:21:17 +08:00
; KNL_32-NEXT: .cfi_def_cfa_offset 8
; KNL_32-NEXT: .Lcfi4:
; KNL_32-NEXT: .cfi_offset %ebp, -8
; KNL_32-NEXT: movl %esp, %ebp
; KNL_32-NEXT: .Lcfi5:
; KNL_32-NEXT: .cfi_def_cfa_register %ebp
; KNL_32-NEXT: andl $-64, %esp
; KNL_32-NEXT: subl $64, %esp
; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT: vmovapd 8(%ebp), %zmm1
; KNL_32-NEXT: kshiftrw $8, %k1, %k2
; KNL_32-NEXT: vgatherdpd (,%ymm0), %zmm2 {%k1}
; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; KNL_32-NEXT: vgatherdpd (,%ymm0), %zmm1 {%k2}
; KNL_32-NEXT: vmovapd %zmm2, %zmm0
; KNL_32-NEXT: movl %ebp, %esp
; KNL_32-NEXT: popl %ebp
; KNL_32-NEXT: retl
;
; SKX-LABEL: test_gather_16f64:
; SKX: # BB#0:
; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
; SKX-NEXT: vpslld $31, %zmm2, %zmm2
; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: vgatherqpd (,%zmm0), %zmm3 {%k1}
; SKX-NEXT: vgatherqpd (,%zmm1), %zmm4 {%k2}
; SKX-NEXT: vmovapd %zmm3, %zmm0
; SKX-NEXT: vmovapd %zmm4, %zmm1
; SKX-NEXT: retq
;
; SKX_32-LABEL: test_gather_16f64:
; SKX_32: # BB#0:
; SKX_32-NEXT: pushl %ebp
; SKX_32-NEXT: .Lcfi4:
; SKX_32-NEXT: .cfi_def_cfa_offset 8
; SKX_32-NEXT: .Lcfi5:
; SKX_32-NEXT: .cfi_offset %ebp, -8
; SKX_32-NEXT: movl %esp, %ebp
; SKX_32-NEXT: .Lcfi6:
; SKX_32-NEXT: .cfi_def_cfa_register %ebp
; SKX_32-NEXT: andl $-64, %esp
; SKX_32-NEXT: subl $64, %esp
; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; SKX_32-NEXT: vmovapd 8(%ebp), %zmm1
; SKX_32-NEXT: kshiftrw $8, %k1, %k2
; SKX_32-NEXT: vgatherdpd (,%ymm0), %zmm2 {%k1}
; SKX_32-NEXT: vextracti32x8 $1, %zmm0, %ymm0
; SKX_32-NEXT: vgatherdpd (,%ymm0), %zmm1 {%k2}
; SKX_32-NEXT: vmovapd %zmm2, %zmm0
; SKX_32-NEXT: movl %ebp, %esp
; SKX_32-NEXT: popl %ebp
; SKX_32-NEXT: retl
%res = call <16 x double> @llvm.masked.gather.v16f64.v16p0f64(<16 x double*> %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0)
ret <16 x double> %res
}
declare <16 x double> @llvm.masked.gather.v16f64.v16p0f64(<16 x double*> %ptrs, i32, <16 x i1> %mask, <16 x double> %src0)
define void @test_scatter_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %src0) {
; KNL_64-LABEL: test_scatter_16i32:
; KNL_64: # BB#0:
; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT: kshiftrw $8, %k1, %k2
; KNL_64-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1}
; KNL_64-NEXT: vextracti64x4 $1, %zmm3, %ymm0
; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k2}
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test_scatter_16i32:
; KNL_32: # BB#0:
; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1}
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test_scatter_16i32:
; SKX: # BB#0:
; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
; SKX-NEXT: vpslld $31, %zmm2, %zmm2
; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1}
; SKX-NEXT: vextracti32x8 $1, %zmm3, %ymm0
; SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k2}
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; SKX_32-LABEL: test_scatter_16i32:
; SKX_32: # BB#0:
; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; SKX_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1}
; SKX_32-NEXT: vzeroupper
; SKX_32-NEXT: retl
call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %src0, <16 x i32*> %ptrs, i32 4, <16 x i1> %mask)
ret void
}
define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
; KNL_64-LABEL: test_scatter_16i64:
; KNL_64: # BB#0:
; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT: kshiftrw $8, %k1, %k2
; KNL_64-NEXT: vpscatterqq %zmm3, (,%zmm0) {%k1}
; KNL_64-NEXT: vpscatterqq %zmm4, (,%zmm1) {%k2}
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test_scatter_16i64:
; KNL_32: # BB#0:
; KNL_32-NEXT: pushl %ebp
; KNL_32-NEXT: .Lcfi6:
; KNL_32-NEXT: .cfi_def_cfa_offset 8
; KNL_32-NEXT: .Lcfi7:
; KNL_32-NEXT: .cfi_offset %ebp, -8
; KNL_32-NEXT: movl %esp, %ebp
; KNL_32-NEXT: .Lcfi8:
; KNL_32-NEXT: .cfi_def_cfa_register %ebp
; KNL_32-NEXT: andl $-64, %esp
; KNL_32-NEXT: subl $64, %esp
; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT: vmovdqa64 8(%ebp), %zmm1
; KNL_32-NEXT: kshiftrw $8, %k1, %k2
; KNL_32-NEXT: vpscatterdq %zmm2, (,%ymm0) {%k1}
; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; KNL_32-NEXT: vpscatterdq %zmm1, (,%ymm0) {%k2}
; KNL_32-NEXT: movl %ebp, %esp
; KNL_32-NEXT: popl %ebp
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test_scatter_16i64:
; SKX: # BB#0:
; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
; SKX-NEXT: vpslld $31, %zmm2, %zmm2
; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: vpscatterqq %zmm3, (,%zmm0) {%k1}
; SKX-NEXT: vpscatterqq %zmm4, (,%zmm1) {%k2}
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; SKX_32-LABEL: test_scatter_16i64:
; SKX_32: # BB#0:
; SKX_32-NEXT: pushl %ebp
; SKX_32-NEXT: .Lcfi7:
; SKX_32-NEXT: .cfi_def_cfa_offset 8
; SKX_32-NEXT: .Lcfi8:
; SKX_32-NEXT: .cfi_offset %ebp, -8
; SKX_32-NEXT: movl %esp, %ebp
; SKX_32-NEXT: .Lcfi9:
; SKX_32-NEXT: .cfi_def_cfa_register %ebp
; SKX_32-NEXT: andl $-64, %esp
; SKX_32-NEXT: subl $64, %esp
; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; SKX_32-NEXT: vmovdqa64 8(%ebp), %zmm1
; SKX_32-NEXT: kshiftrw $8, %k1, %k2
; SKX_32-NEXT: vpscatterdq %zmm2, (,%ymm0) {%k1}
; SKX_32-NEXT: vextracti32x8 $1, %zmm0, %ymm0
; SKX_32-NEXT: vpscatterdq %zmm1, (,%ymm0) {%k2}
; SKX_32-NEXT: movl %ebp, %esp
; SKX_32-NEXT: popl %ebp
; SKX_32-NEXT: vzeroupper
; SKX_32-NEXT: retl
call void @llvm.masked.scatter.v16i64.v16p0i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32 4, <16 x i1> %mask)
ret void
}
declare void @llvm.masked.scatter.v16i64.v16p0i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32, <16 x i1> %mask)
define void @test_scatter_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0) {
; KNL_64-LABEL: test_scatter_16f32:
; KNL_64: # BB#0:
; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT: kshiftrw $8, %k1, %k2
; KNL_64-NEXT: vscatterqps %ymm3, (,%zmm0) {%k1}
; KNL_64-NEXT: vextractf64x4 $1, %zmm3, %ymm0
; KNL_64-NEXT: vscatterqps %ymm0, (,%zmm1) {%k2}
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test_scatter_16f32:
; KNL_32: # BB#0:
; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT: vscatterdps %zmm2, (,%zmm0) {%k1}
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test_scatter_16f32:
; SKX: # BB#0:
; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
; SKX-NEXT: vpslld $31, %zmm2, %zmm2
; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: vscatterqps %ymm3, (,%zmm0) {%k1}
; SKX-NEXT: vextractf32x8 $1, %zmm3, %ymm0
; SKX-NEXT: vscatterqps %ymm0, (,%zmm1) {%k2}
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; SKX_32-LABEL: test_scatter_16f32:
; SKX_32: # BB#0:
; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; SKX_32-NEXT: vscatterdps %zmm2, (,%zmm0) {%k1}
; SKX_32-NEXT: vzeroupper
; SKX_32-NEXT: retl
call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> %src0, <16 x float*> %ptrs, i32 4, <16 x i1> %mask)
ret void
}
declare void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> %src0, <16 x float*> %ptrs, i32, <16 x i1> %mask)
define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0) {
; KNL_64-LABEL: test_scatter_16f64:
; KNL_64: # BB#0:
; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT: kshiftrw $8, %k1, %k2
; KNL_64-NEXT: vscatterqpd %zmm3, (,%zmm0) {%k1}
; KNL_64-NEXT: vscatterqpd %zmm4, (,%zmm1) {%k2}
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test_scatter_16f64:
; KNL_32: # BB#0:
; KNL_32-NEXT: pushl %ebp
; KNL_32-NEXT: .Lcfi9:
; KNL_32-NEXT: .cfi_def_cfa_offset 8
; KNL_32-NEXT: .Lcfi10:
; KNL_32-NEXT: .cfi_offset %ebp, -8
; KNL_32-NEXT: movl %esp, %ebp
; KNL_32-NEXT: .Lcfi11:
; KNL_32-NEXT: .cfi_def_cfa_register %ebp
; KNL_32-NEXT: andl $-64, %esp
; KNL_32-NEXT: subl $64, %esp
; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT: vmovapd 8(%ebp), %zmm1
; KNL_32-NEXT: kshiftrw $8, %k1, %k2
; KNL_32-NEXT: vscatterdpd %zmm2, (,%ymm0) {%k1}
; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; KNL_32-NEXT: vscatterdpd %zmm1, (,%ymm0) {%k2}
; KNL_32-NEXT: movl %ebp, %esp
; KNL_32-NEXT: popl %ebp
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test_scatter_16f64:
; SKX: # BB#0:
; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
; SKX-NEXT: vpslld $31, %zmm2, %zmm2
; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: vscatterqpd %zmm3, (,%zmm0) {%k1}
; SKX-NEXT: vscatterqpd %zmm4, (,%zmm1) {%k2}
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; SKX_32-LABEL: test_scatter_16f64:
; SKX_32: # BB#0:
; SKX_32-NEXT: pushl %ebp
; SKX_32-NEXT: .Lcfi10:
; SKX_32-NEXT: .cfi_def_cfa_offset 8
; SKX_32-NEXT: .Lcfi11:
; SKX_32-NEXT: .cfi_offset %ebp, -8
; SKX_32-NEXT: movl %esp, %ebp
; SKX_32-NEXT: .Lcfi12:
; SKX_32-NEXT: .cfi_def_cfa_register %ebp
; SKX_32-NEXT: andl $-64, %esp
; SKX_32-NEXT: subl $64, %esp
; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; SKX_32-NEXT: vmovapd 8(%ebp), %zmm1
; SKX_32-NEXT: kshiftrw $8, %k1, %k2
; SKX_32-NEXT: vscatterdpd %zmm2, (,%ymm0) {%k1}
; SKX_32-NEXT: vextracti32x8 $1, %zmm0, %ymm0
; SKX_32-NEXT: vscatterdpd %zmm1, (,%ymm0) {%k2}
; SKX_32-NEXT: movl %ebp, %esp
; SKX_32-NEXT: popl %ebp
; SKX_32-NEXT: vzeroupper
; SKX_32-NEXT: retl
call void @llvm.masked.scatter.v16f64.v16p0f64(<16 x double> %src0, <16 x double*> %ptrs, i32 4, <16 x i1> %mask)
ret void
}
declare void @llvm.masked.scatter.v16f64.v16p0f64(<16 x double> %src0, <16 x double*> %ptrs, i32, <16 x i1> %mask)
define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i64> %d) {
; KNL_64-LABEL: test_pr28312:
; KNL_64: # BB#0:
; KNL_64-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
; KNL_64-NEXT: vpsrad $31, %xmm1, %xmm1
; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1
; KNL_64-NEXT: vpxord %zmm2, %zmm2, %zmm2
; KNL_64-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm1 {%k1}
; KNL_64-NEXT: vpaddq %ymm1, %ymm1, %ymm0
; KNL_64-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test_pr28312:
; KNL_32: # BB#0:
; KNL_32-NEXT: pushl %ebp
; KNL_32-NEXT: .Lcfi12:
; KNL_32-NEXT: .cfi_def_cfa_offset 8
; KNL_32-NEXT: .Lcfi13:
; KNL_32-NEXT: .cfi_offset %ebp, -8
; KNL_32-NEXT: movl %esp, %ebp
; KNL_32-NEXT: .Lcfi14:
; KNL_32-NEXT: .cfi_def_cfa_register %ebp
; KNL_32-NEXT: andl $-32, %esp
; KNL_32-NEXT: subl $32, %esp
; KNL_32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
; KNL_32-NEXT: vpsrad $31, %xmm1, %xmm1
; KNL_32-NEXT: vpmovsxdq %xmm1, %ymm1
; KNL_32-NEXT: vpxord %zmm2, %zmm2, %zmm2
; KNL_32-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0
; KNL_32-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_32-NEXT: vpgatherqq (,%zmm0), %zmm1 {%k1}
; KNL_32-NEXT: vpaddq %ymm1, %ymm1, %ymm0
; KNL_32-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; KNL_32-NEXT: movl %ebp, %esp
; KNL_32-NEXT: popl %ebp
; KNL_32-NEXT: retl
;
; SKX-LABEL: test_pr28312:
; SKX: # BB#0:
; SKX-NEXT: vpslld $31, %xmm1, %xmm1
; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
; SKX-NEXT: vpgatherqq (,%ymm0), %ymm1 {%k1}
; SKX-NEXT: vpaddq %ymm1, %ymm1, %ymm0
; SKX-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test_pr28312:
; SKX_32: # BB#0:
; SKX_32-NEXT: pushl %ebp
; SKX_32-NEXT: .Lcfi13:
; SKX_32-NEXT: .cfi_def_cfa_offset 8
; SKX_32-NEXT: .Lcfi14:
; SKX_32-NEXT: .cfi_offset %ebp, -8
; SKX_32-NEXT: movl %esp, %ebp
; SKX_32-NEXT: .Lcfi15:
; SKX_32-NEXT: .cfi_def_cfa_register %ebp
; SKX_32-NEXT: andl $-32, %esp
; SKX_32-NEXT: subl $32, %esp
; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
; SKX_32-NEXT: vptestmd %xmm1, %xmm1, %k1
; SKX_32-NEXT: vpgatherdq (,%xmm0), %ymm1 {%k1}
; SKX_32-NEXT: vpaddq %ymm1, %ymm1, %ymm0
; SKX_32-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; SKX_32-NEXT: movl %ebp, %esp
; SKX_32-NEXT: popl %ebp
; SKX_32-NEXT: retl
%g1 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
%g2 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
%g3 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
%a = add <4 x i64> %g1, %g2
%b = add <4 x i64> %a, %g3
ret <4 x i64> %b
}
declare <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*>, i32, <4 x i1>, <4 x i64>)