; llvm-project/llvm/test/CodeGen/X86/avx512-insert-extract.ll
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck --check-prefix=CHECK --check-prefix=KNL %s
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck --check-prefix=CHECK --check-prefix=SKX --check-prefix=SKX_ONLY %s
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq,+avx512vbmi | FileCheck --check-prefix=CHECK --check-prefix=SKX --check-prefix=SKX_VBMI %s
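
; Scalar inserts at constant indices into 512-bit vectors go through the
; containing 128-bit lane (vextract/vinsert around a scalar insert).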
define <16 x float> @test1(<16 x float> %x, float* %br, float %y) nounwind {
; CHECK-LABEL: test1:
; CHECK: ## %bb.0:
; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],mem[0],xmm0[2,3]
; CHECK-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm2
; CHECK-NEXT: vextractf32x4 $3, %zmm0, %xmm0
; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; CHECK-NEXT: vinsertf32x4 $3, %xmm0, %zmm2, %zmm0
; CHECK-NEXT: retq
%rrr = load float, float* %br
%rrr2 = insertelement <16 x float> %x, float %rrr, i32 1
%rrr3 = insertelement <16 x float> %rrr2, float %y, i32 14
ret <16 x float> %rrr3
}
define <8 x double> @test2(<8 x double> %x, double* %br, double %y) nounwind {
; CHECK-LABEL: test2:
; CHECK: ## %bb.0:
; CHECK-NEXT: vmovhpd {{.*#+}} xmm2 = xmm0[0],mem[0]
; CHECK-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm2
; CHECK-NEXT: vextractf32x4 $3, %zmm0, %xmm0
; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; CHECK-NEXT: vinsertf32x4 $3, %xmm0, %zmm2, %zmm0
; CHECK-NEXT: retq
%rrr = load double, double* %br
%rrr2 = insertelement <8 x double> %x, double %rrr, i32 1
%rrr3 = insertelement <8 x double> %rrr2, double %y, i32 6
ret <8 x double> %rrr3
}
define <16 x float> @test3(<16 x float> %x) nounwind {
; CHECK-LABEL: test3:
; CHECK: ## %bb.0:
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[2,3]
; CHECK-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%eee = extractelement <16 x float> %x, i32 4
%rrr2 = insertelement <16 x float> %x, float %eee, i32 1
ret <16 x float> %rrr2
}
define <8 x i64> @test4(<8 x i64> %x) nounwind {
; CHECK-LABEL: test4:
; CHECK: ## %bb.0:
; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm1
; CHECK-NEXT: vmovq %xmm1, %rax
; CHECK-NEXT: vpinsrq $1, %rax, %xmm0, %xmm1
; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%eee = extractelement <8 x i64> %x, i32 4
%rrr2 = insertelement <8 x i64> %x, i64 %eee, i32 1
ret <8 x i64> %rrr2
}
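
; Constant-index float extracts can use vextractps directly, either to a GPR
; (test5, via bitcast) or straight to memory (test6).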
define i32 @test5(<4 x float> %x) nounwind {
; CHECK-LABEL: test5:
; CHECK: ## %bb.0:
; CHECK-NEXT: vextractps $3, %xmm0, %eax
; CHECK-NEXT: retq
%ef = extractelement <4 x float> %x, i32 3
%ei = bitcast float %ef to i32
ret i32 %ei
}
define void @test6(<4 x float> %x, float* %out) nounwind {
; CHECK-LABEL: test6:
; CHECK: ## %bb.0:
; CHECK-NEXT: vextractps $3, %xmm0, (%rdi)
; CHECK-NEXT: retq
%ef = extractelement <4 x float> %x, i32 3
store float %ef, float* %out, align 4
ret void
}
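
; Extracts at a variable index spill the vector to an aligned stack slot and
; load the selected element back from memory.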
define float @test7(<16 x float> %x, i32 %ind) nounwind {
; CHECK-LABEL: test7:
; CHECK: ## %bb.0:
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: movq %rsp, %rbp
; CHECK-NEXT: andq $-64, %rsp
; CHECK-NEXT: subq $128, %rsp
; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT: vmovaps %zmm0, (%rsp)
; CHECK-NEXT: andl $15, %edi
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: movq %rbp, %rsp
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%e = extractelement <16 x float> %x, i32 %ind
ret float %e
}
define double @test8(<8 x double> %x, i32 %ind) nounwind {
; CHECK-LABEL: test8:
; CHECK: ## %bb.0:
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: movq %rsp, %rbp
; CHECK-NEXT: andq $-64, %rsp
; CHECK-NEXT: subq $128, %rsp
; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT: vmovaps %zmm0, (%rsp)
; CHECK-NEXT: andl $7, %edi
; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movq %rbp, %rsp
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%e = extractelement <8 x double> %x, i32 %ind
ret double %e
}
define float @test9(<8 x float> %x, i32 %ind) nounwind {
; CHECK-LABEL: test9:
; CHECK: ## %bb.0:
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: movq %rsp, %rbp
; CHECK-NEXT: andq $-32, %rsp
; CHECK-NEXT: subq $64, %rsp
; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT: vmovaps %ymm0, (%rsp)
; CHECK-NEXT: andl $7, %edi
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: movq %rbp, %rsp
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%e = extractelement <8 x float> %x, i32 %ind
ret float %e
}
define i32 @test10(<16 x i32> %x, i32 %ind) nounwind {
; CHECK-LABEL: test10:
; CHECK: ## %bb.0:
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: movq %rsp, %rbp
; CHECK-NEXT: andq $-64, %rsp
; CHECK-NEXT: subq $128, %rsp
; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT: vmovaps %zmm0, (%rsp)
; CHECK-NEXT: andl $15, %edi
; CHECK-NEXT: movl (%rsp,%rdi,4), %eax
; CHECK-NEXT: movq %rbp, %rsp
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%e = extractelement <16 x i32> %x, i32 %ind
ret i32 %e
}
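
; Vector compares produce k-register masks; extracting a single i1 from the
; mask uses kshift/kmov followed by a scalar test.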
define <16 x i32> @test11(<16 x i32>%a, <16 x i32>%b) {
; KNL-LABEL: test11:
; KNL: ## %bb.0:
; KNL-NEXT: vpcmpltud %zmm1, %zmm0, %k0
; KNL-NEXT: kshiftrw $4, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: testb $1, %al
; KNL-NEXT: je LBB10_2
; KNL-NEXT: ## %bb.1: ## %A
; KNL-NEXT: vmovdqa64 %zmm1, %zmm0
; KNL-NEXT: retq
; KNL-NEXT: LBB10_2: ## %B
; KNL-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: test11:
; SKX: ## %bb.0:
; SKX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
; SKX-NEXT: kshiftrw $4, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: testb $1, %al
; SKX-NEXT: je LBB10_2
; SKX-NEXT: ## %bb.1: ## %A
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0
; SKX-NEXT: retq
; SKX-NEXT: LBB10_2: ## %B
; SKX-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; SKX-NEXT: retq
%cmp_res = icmp ult <16 x i32> %a, %b
%ia = extractelement <16 x i1> %cmp_res, i32 4
br i1 %ia, label %A, label %B
A:
ret <16 x i32>%b
B:
%c = add <16 x i32>%b, %a
ret <16 x i32>%c
}
define i64 @test12(<16 x i64>%a, <16 x i64>%b, i64 %a1, i64 %b1) {
; KNL-LABEL: test12:
; KNL: ## %bb.0:
; KNL-NEXT: vpcmpgtq %zmm0, %zmm2, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: testb $1, %al
; KNL-NEXT: cmoveq %rsi, %rdi
; KNL-NEXT: movq %rdi, %rax
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test12:
; SKX: ## %bb.0:
; SKX-NEXT: vpcmpgtq %zmm0, %zmm2, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: testb $1, %al
; SKX-NEXT: cmoveq %rsi, %rdi
; SKX-NEXT: movq %rdi, %rax
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%cmpvector_func.i = icmp slt <16 x i64> %a, %b
%extract24vector_func.i = extractelement <16 x i1> %cmpvector_func.i, i32 0
%res = select i1 %extract24vector_func.i, i64 %a1, i64 %b1
ret i64 %res
}
define i16 @test13(i32 %a, i32 %b) {
; KNL-LABEL: test13:
; KNL: ## %bb.0:
; KNL-NEXT: cmpl %esi, %edi
; KNL-NEXT: setb %al
; KNL-NEXT: movw $-4, %cx
; KNL-NEXT: kmovw %ecx, %k0
; KNL-NEXT: kshiftrw $1, %k0, %k0
; KNL-NEXT: kshiftlw $1, %k0, %k0
; KNL-NEXT: andl $1, %eax
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: korw %k1, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: ## kill: def $ax killed $ax killed $eax
; KNL-NEXT: retq
;
; SKX-LABEL: test13:
; SKX: ## %bb.0:
; SKX-NEXT: cmpl %esi, %edi
; SKX-NEXT: setb %al
; SKX-NEXT: movw $-4, %cx
; SKX-NEXT: kmovd %ecx, %k0
; SKX-NEXT: kshiftrw $1, %k0, %k0
; SKX-NEXT: kshiftlw $1, %k0, %k0
; SKX-NEXT: andl $1, %eax
; SKX-NEXT: kmovw %eax, %k1
; SKX-NEXT: korw %k1, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: ## kill: def $ax killed $ax killed $eax
; SKX-NEXT: retq
%cmp_res = icmp ult i32 %a, %b
%maskv = insertelement <16 x i1> <i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i1 %cmp_res, i32 0
%res = bitcast <16 x i1> %maskv to i16
ret i16 %res
}
define i64 @test14(<8 x i64>%a, <8 x i64>%b, i64 %a1, i64 %b1) {
; KNL-LABEL: test14:
; KNL: ## %bb.0:
; KNL-NEXT: vpcmpgtq %zmm0, %zmm1, %k0
; KNL-NEXT: kshiftrw $4, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: testb $1, %al
; KNL-NEXT: cmoveq %rsi, %rdi
; KNL-NEXT: movq %rdi, %rax
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test14:
; SKX: ## %bb.0:
; SKX-NEXT: vpcmpgtq %zmm0, %zmm1, %k0
; SKX-NEXT: kshiftrw $4, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: testb $1, %al
; SKX-NEXT: cmoveq %rsi, %rdi
; SKX-NEXT: movq %rdi, %rax
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%cmpvector_func.i = icmp slt <8 x i64> %a, %b
%extract24vector_func.i = extractelement <8 x i1> %cmpvector_func.i, i32 4
%res = select i1 %extract24vector_func.i, i64 %a1, i64 %b1
ret i64 %res
}
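
; Inserting an i1 loaded from memory into a mask: with a live source mask the
; bit is merged via kshift/kxor (test16, test17); inserting into undef folds
; to a simple select (test15).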
define i16 @test15(i1 *%addr) {
; CHECK-LABEL: test15:
; CHECK: ## %bb.0:
; CHECK-NEXT: xorl %ecx, %ecx
; CHECK-NEXT: cmpb $0, (%rdi)
; CHECK-NEXT: movl $65535, %eax ## imm = 0xFFFF
; CHECK-NEXT: cmovel %ecx, %eax
; CHECK-NEXT: ## kill: def $ax killed $ax killed $eax
; CHECK-NEXT: retq
%x = load i1 , i1 * %addr, align 1
%x1 = insertelement <16 x i1> undef, i1 %x, i32 10
%x2 = bitcast <16 x i1>%x1 to i16
ret i16 %x2
}
define i16 @test16(i1 *%addr, i16 %a) {
; KNL-LABEL: test16:
; KNL: ## %bb.0:
; KNL-NEXT: movb (%rdi), %al
; KNL-NEXT: kmovw %esi, %k0
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: kshiftrw $10, %k0, %k2
; KNL-NEXT: kxorw %k1, %k2, %k1
; KNL-NEXT: kshiftlw $15, %k1, %k1
; KNL-NEXT: kshiftrw $5, %k1, %k1
; KNL-NEXT: kxorw %k1, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: ## kill: def $ax killed $ax killed $eax
; KNL-NEXT: retq
;
; SKX-LABEL: test16:
; SKX: ## %bb.0:
; SKX-NEXT: kmovb (%rdi), %k0
; SKX-NEXT: kmovd %esi, %k1
; SKX-NEXT: kshiftrw $10, %k1, %k2
; SKX-NEXT: kxorw %k0, %k2, %k0
; SKX-NEXT: kshiftlw $15, %k0, %k0
; SKX-NEXT: kshiftrw $5, %k0, %k0
; SKX-NEXT: kxorw %k0, %k1, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: ## kill: def $ax killed $ax killed $eax
; SKX-NEXT: retq
%x = load i1 , i1 * %addr, align 128
%a1 = bitcast i16 %a to <16 x i1>
%x1 = insertelement <16 x i1> %a1, i1 %x, i32 10
%x2 = bitcast <16 x i1>%x1 to i16
ret i16 %x2
}
define i8 @test17(i1 *%addr, i8 %a) {
; KNL-LABEL: test17:
; KNL: ## %bb.0:
; KNL-NEXT: movb (%rdi), %al
; KNL-NEXT: kmovw %esi, %k0
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: kshiftrw $4, %k0, %k2
; KNL-NEXT: kxorw %k1, %k2, %k1
; KNL-NEXT: kshiftlw $15, %k1, %k1
; KNL-NEXT: kshiftrw $11, %k1, %k1
; KNL-NEXT: kxorw %k1, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: ## kill: def $al killed $al killed $eax
; KNL-NEXT: retq
;
; SKX-LABEL: test17:
; SKX: ## %bb.0:
; SKX-NEXT: kmovb (%rdi), %k0
; SKX-NEXT: kmovd %esi, %k1
; SKX-NEXT: kshiftrb $4, %k1, %k2
; SKX-NEXT: kxorb %k0, %k2, %k0
; SKX-NEXT: kshiftlb $7, %k0, %k0
; SKX-NEXT: kshiftrb $3, %k0, %k0
; SKX-NEXT: kxorb %k0, %k1, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: ## kill: def $al killed $al killed $eax
; SKX-NEXT: retq
%x = load i1 , i1 * %addr, align 128
%a1 = bitcast i8 %a to <8 x i1>
%x1 = insertelement <8 x i1> %a1, i1 %x, i32 4
%x2 = bitcast <8 x i1>%x1 to i8
ret i8 %x2
}
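
; Paired constant-index extracts across element widths: one element is
; returned in a register, the other stored, covering both the register and
; memory forms of vpextr*/vextractps.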
define i64 @extract_v8i64(<8 x i64> %x, i64* %dst) {
; CHECK-LABEL: extract_v8i64:
; CHECK: ## %bb.0:
; CHECK-NEXT: vpextrq $1, %xmm0, %rax
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpextrq $1, %xmm0, (%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%r1 = extractelement <8 x i64> %x, i32 1
%r2 = extractelement <8 x i64> %x, i32 3
store i64 %r2, i64* %dst, align 1
ret i64 %r1
}
define i64 @extract_v4i64(<4 x i64> %x, i64* %dst) {
; CHECK-LABEL: extract_v4i64:
; CHECK: ## %bb.0:
; CHECK-NEXT: vpextrq $1, %xmm0, %rax
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpextrq $1, %xmm0, (%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%r1 = extractelement <4 x i64> %x, i32 1
%r2 = extractelement <4 x i64> %x, i32 3
store i64 %r2, i64* %dst, align 1
ret i64 %r1
}
define i64 @extract_v2i64(<2 x i64> %x, i64* %dst) {
; CHECK-LABEL: extract_v2i64:
; CHECK: ## %bb.0:
; CHECK-NEXT: vmovq %xmm0, %rax
; CHECK-NEXT: vpextrq $1, %xmm0, (%rdi)
; CHECK-NEXT: retq
%r1 = extractelement <2 x i64> %x, i32 0
%r2 = extractelement <2 x i64> %x, i32 1
store i64 %r2, i64* %dst, align 1
ret i64 %r1
}
define i32 @extract_v16i32(<16 x i32> %x, i32* %dst) {
; CHECK-LABEL: extract_v16i32:
; CHECK: ## %bb.0:
; CHECK-NEXT: vextractps $1, %xmm0, %eax
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT: vextractps $1, %xmm0, (%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%r1 = extractelement <16 x i32> %x, i32 1
%r2 = extractelement <16 x i32> %x, i32 5
store i32 %r2, i32* %dst, align 1
ret i32 %r1
}
define i32 @extract_v8i32(<8 x i32> %x, i32* %dst) {
; CHECK-LABEL: extract_v8i32:
; CHECK: ## %bb.0:
; CHECK-NEXT: vextractps $1, %xmm0, %eax
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT: vextractps $1, %xmm0, (%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%r1 = extractelement <8 x i32> %x, i32 1
%r2 = extractelement <8 x i32> %x, i32 5
store i32 %r2, i32* %dst, align 1
ret i32 %r1
}
define i32 @extract_v4i32(<4 x i32> %x, i32* %dst) {
; CHECK-LABEL: extract_v4i32:
; CHECK: ## %bb.0:
; CHECK-NEXT: vextractps $1, %xmm0, %eax
; CHECK-NEXT: vextractps $3, %xmm0, (%rdi)
; CHECK-NEXT: retq
%r1 = extractelement <4 x i32> %x, i32 1
%r2 = extractelement <4 x i32> %x, i32 3
store i32 %r2, i32* %dst, align 1
ret i32 %r1
}
define i16 @extract_v32i16(<32 x i16> %x, i16* %dst) {
; CHECK-LABEL: extract_v32i16:
; CHECK: ## %bb.0:
; CHECK-NEXT: vpextrw $1, %xmm0, %eax
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpextrw $1, %xmm0, (%rdi)
; CHECK-NEXT: ## kill: def $ax killed $ax killed $eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%r1 = extractelement <32 x i16> %x, i32 1
%r2 = extractelement <32 x i16> %x, i32 9
store i16 %r2, i16* %dst, align 1
ret i16 %r1
}
define i16 @extract_v16i16(<16 x i16> %x, i16* %dst) {
; CHECK-LABEL: extract_v16i16:
; CHECK: ## %bb.0:
; CHECK-NEXT: vpextrw $1, %xmm0, %eax
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpextrw $1, %xmm0, (%rdi)
; CHECK-NEXT: ## kill: def $ax killed $ax killed $eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%r1 = extractelement <16 x i16> %x, i32 1
%r2 = extractelement <16 x i16> %x, i32 9
store i16 %r2, i16* %dst, align 1
ret i16 %r1
}
define i16 @extract_v8i16(<8 x i16> %x, i16* %dst) {
; CHECK-LABEL: extract_v8i16:
; CHECK: ## %bb.0:
; CHECK-NEXT: vpextrw $1, %xmm0, %eax
; CHECK-NEXT: vpextrw $3, %xmm0, (%rdi)
; CHECK-NEXT: ## kill: def $ax killed $ax killed $eax
; CHECK-NEXT: retq
%r1 = extractelement <8 x i16> %x, i32 1
%r2 = extractelement <8 x i16> %x, i32 3
store i16 %r2, i16* %dst, align 1
ret i16 %r1
}
define i8 @extract_v64i8(<64 x i8> %x, i8* %dst) {
; CHECK-LABEL: extract_v64i8:
; CHECK: ## %bb.0:
; CHECK-NEXT: vpextrb $1, %xmm0, %eax
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpextrb $1, %xmm0, (%rdi)
; CHECK-NEXT: ## kill: def $al killed $al killed $eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%r1 = extractelement <64 x i8> %x, i32 1
%r2 = extractelement <64 x i8> %x, i32 17
store i8 %r2, i8* %dst, align 1
ret i8 %r1
}
define i8 @extract_v32i8(<32 x i8> %x, i8* %dst) {
; CHECK-LABEL: extract_v32i8:
; CHECK: ## %bb.0:
; CHECK-NEXT: vpextrb $1, %xmm0, %eax
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpextrb $1, %xmm0, (%rdi)
; CHECK-NEXT: ## kill: def $al killed $al killed $eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%r1 = extractelement <32 x i8> %x, i32 1
%r2 = extractelement <32 x i8> %x, i32 17
store i8 %r2, i8* %dst, align 1
ret i8 %r1
}
define i8 @extract_v16i8(<16 x i8> %x, i8* %dst) {
; CHECK-LABEL: extract_v16i8:
; CHECK: ## %bb.0:
; CHECK-NEXT: vpextrb $1, %xmm0, %eax
; CHECK-NEXT: vpextrb $3, %xmm0, (%rdi)
; CHECK-NEXT: ## kill: def $al killed $al killed $eax
; CHECK-NEXT: retq
%r1 = extractelement <16 x i8> %x, i32 1
%r2 = extractelement <16 x i8> %x, i32 3
store i8 %r2, i8* %dst, align 1
ret i8 %r1
}
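
; Paired constant-index inserts: one element loaded from memory, one passed in
; a GPR. Elements above the low 128 bits are inserted via their subvector.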
define <8 x i64> @insert_v8i64(<8 x i64> %x, i64 %y , i64* %ptr) {
; CHECK-LABEL: insert_v8i64:
; CHECK: ## %bb.0:
; CHECK-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1
; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0
; CHECK-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%val = load i64, i64* %ptr
%r1 = insertelement <8 x i64> %x, i64 %val, i32 1
%r2 = insertelement <8 x i64> %r1, i64 %y, i32 3
ret <8 x i64> %r2
}
define <4 x i64> @insert_v4i64(<4 x i64> %x, i64 %y , i64* %ptr) {
; CHECK-LABEL: insert_v4i64:
; CHECK: ## %bb.0:
; CHECK-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1
; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0
; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; CHECK-NEXT: retq
%val = load i64, i64* %ptr
%r1 = insertelement <4 x i64> %x, i64 %val, i32 1
%r2 = insertelement <4 x i64> %r1, i64 %y, i32 3
ret <4 x i64> %r2
}
define <2 x i64> @insert_v2i64(<2 x i64> %x, i64 %y , i64* %ptr) {
; CHECK-LABEL: insert_v2i64:
; CHECK: ## %bb.0:
; CHECK-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm0
; CHECK-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm0
; CHECK-NEXT: retq
%val = load i64, i64* %ptr
%r1 = insertelement <2 x i64> %x, i64 %val, i32 1
%r2 = insertelement <2 x i64> %r1, i64 %y, i32 0
ret <2 x i64> %r2
}
define <16 x i32> @insert_v16i32(<16 x i32> %x, i32 %y, i32* %ptr) {
; CHECK-LABEL: insert_v16i32:
; CHECK: ## %bb.0:
; CHECK-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm1
; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0
; CHECK-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%val = load i32, i32* %ptr
%r1 = insertelement <16 x i32> %x, i32 %val, i32 1
%r2 = insertelement <16 x i32> %r1, i32 %y, i32 5
ret <16 x i32> %r2
}
define <8 x i32> @insert_v8i32(<8 x i32> %x, i32 %y, i32* %ptr) {
; CHECK-LABEL: insert_v8i32:
; CHECK: ## %bb.0:
; CHECK-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm1
; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0
; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; CHECK-NEXT: retq
%val = load i32, i32* %ptr
%r1 = insertelement <8 x i32> %x, i32 %val, i32 1
%r2 = insertelement <8 x i32> %r1, i32 %y, i32 5
ret <8 x i32> %r2
}
define <4 x i32> @insert_v4i32(<4 x i32> %x, i32 %y, i32* %ptr) {
; CHECK-LABEL: insert_v4i32:
; CHECK: ## %bb.0:
; CHECK-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm0
; CHECK-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0
; CHECK-NEXT: retq
%val = load i32, i32* %ptr
%r1 = insertelement <4 x i32> %x, i32 %val, i32 1
%r2 = insertelement <4 x i32> %r1, i32 %y, i32 3
ret <4 x i32> %r2
}
define <32 x i16> @insert_v32i16(<32 x i16> %x, i16 %y, i16* %ptr) {
; KNL-LABEL: insert_v32i16:
; KNL: ## %bb.0:
; KNL-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm2
; KNL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0
; KNL-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
; KNL-NEXT: retq
;
; SKX-LABEL: insert_v32i16:
; SKX: ## %bb.0:
; SKX-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm1
; SKX-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1
; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
; SKX-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0
; SKX-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
; SKX-NEXT: retq
%val = load i16, i16* %ptr
%r1 = insertelement <32 x i16> %x, i16 %val, i32 1
%r2 = insertelement <32 x i16> %r1, i16 %y, i32 9
ret <32 x i16> %r2
}
define <16 x i16> @insert_v16i16(<16 x i16> %x, i16 %y, i16* %ptr) {
; CHECK-LABEL: insert_v16i16:
; CHECK: ## %bb.0:
; CHECK-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm1
; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0
; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; CHECK-NEXT: retq
%val = load i16, i16* %ptr
%r1 = insertelement <16 x i16> %x, i16 %val, i32 1
%r2 = insertelement <16 x i16> %r1, i16 %y, i32 9
ret <16 x i16> %r2
}
define <8 x i16> @insert_v8i16(<8 x i16> %x, i16 %y, i16* %ptr) {
; CHECK-LABEL: insert_v8i16:
; CHECK: ## %bb.0:
; CHECK-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm0
; CHECK-NEXT: vpinsrw $5, %edi, %xmm0, %xmm0
; CHECK-NEXT: retq
%val = load i16, i16* %ptr
%r1 = insertelement <8 x i16> %x, i16 %val, i32 1
%r2 = insertelement <8 x i16> %r1, i16 %y, i32 5
ret <8 x i16> %r2
}
define <64 x i8> @insert_v64i8(<64 x i8> %x, i8 %y, i8* %ptr) {
; KNL-LABEL: insert_v64i8:
; KNL: ## %bb.0:
; KNL-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm2
; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; KNL-NEXT: vextracti128 $1, %ymm1, %xmm2
; KNL-NEXT: vpinsrb $2, %edi, %xmm2, %xmm2
; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; KNL-NEXT: retq
;
; SKX-LABEL: insert_v64i8:
; SKX: ## %bb.0:
; SKX-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm1
; SKX-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1
; SKX-NEXT: vextracti32x4 $3, %zmm0, %xmm0
; SKX-NEXT: vpinsrb $2, %edi, %xmm0, %xmm0
; SKX-NEXT: vinserti32x4 $3, %xmm0, %zmm1, %zmm0
; SKX-NEXT: retq
%val = load i8, i8* %ptr
%r1 = insertelement <64 x i8> %x, i8 %val, i32 1
%r2 = insertelement <64 x i8> %r1, i8 %y, i32 50
ret <64 x i8> %r2
}
define <32 x i8> @insert_v32i8(<32 x i8> %x, i8 %y, i8* %ptr) {
; CHECK-LABEL: insert_v32i8:
; CHECK: ## %bb.0:
; CHECK-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm1
; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0
; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; CHECK-NEXT: retq
%val = load i8, i8* %ptr
%r1 = insertelement <32 x i8> %x, i8 %val, i32 1
%r2 = insertelement <32 x i8> %r1, i8 %y, i32 17
ret <32 x i8> %r2
}
define <16 x i8> @insert_v16i8(<16 x i8> %x, i8 %y, i8* %ptr) {
; CHECK-LABEL: insert_v16i8:
; CHECK: ## %bb.0:
; CHECK-NEXT: vpinsrb $3, (%rsi), %xmm0, %xmm0
; CHECK-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; CHECK-NEXT: retq
%val = load i8, i8* %ptr
%r1 = insertelement <16 x i8> %x, i8 %val, i32 3
%r2 = insertelement <16 x i8> %r1, i8 %y, i32 10
ret <16 x i8> %r2
}
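
; A single insert at a constant index only rebuilds the one 128-bit subvector
; that contains the element.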
define <8 x i64> @test_insert_128_v8i64(<8 x i64> %x, i64 %y) {
; CHECK-LABEL: test_insert_128_v8i64:
; CHECK: ## %bb.0:
; CHECK-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm1
; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%r = insertelement <8 x i64> %x, i64 %y, i32 1
ret <8 x i64> %r
}
define <16 x i32> @test_insert_128_v16i32(<16 x i32> %x, i32 %y) {
; CHECK-LABEL: test_insert_128_v16i32:
; CHECK: ## %bb.0:
; CHECK-NEXT: vpinsrd $1, %edi, %xmm0, %xmm1
; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%r = insertelement <16 x i32> %x, i32 %y, i32 1
ret <16 x i32> %r
}
define <8 x double> @test_insert_128_v8f64(<8 x double> %x, double %y) {
; CHECK-LABEL: test_insert_128_v8f64:
; CHECK: ## %bb.0:
; CHECK-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm1[0]
; CHECK-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%r = insertelement <8 x double> %x, double %y, i32 1
ret <8 x double> %r
}
define <16 x float> @test_insert_128_v16f32(<16 x float> %x, float %y) {
; CHECK-LABEL: test_insert_128_v16f32:
; CHECK: ## %bb.0:
; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[2,3]
; CHECK-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%r = insertelement <16 x float> %x, float %y, i32 1
ret <16 x float> %r
}
define <16 x i16> @test_insert_128_v16i16(<16 x i16> %x, i16 %y) {
; CHECK-LABEL: test_insert_128_v16i16:
; CHECK: ## %bb.0:
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpinsrw $2, %edi, %xmm1, %xmm1
; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT: retq
%r = insertelement <16 x i16> %x, i16 %y, i32 10
ret <16 x i16> %r
}
define <32 x i8> @test_insert_128_v32i8(<32 x i8> %x, i8 %y) {
; CHECK-LABEL: test_insert_128_v32i8:
; CHECK: ## %bb.0:
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpinsrb $4, %edi, %xmm1, %xmm1
; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT: retq
%r = insertelement <32 x i8> %x, i8 %y, i32 20
ret <32 x i8> %r
}
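
; Inserting a scalar compare result (i1) into a vector compare mask and
; bitcasting the mask back to an integer.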
define i32 @test_insertelement_v32i1(i32 %a, i32 %b, <32 x i32> %x , <32 x i32> %y) {
; KNL-LABEL: test_insertelement_v32i1:
; KNL: ## %bb.0:
; KNL-NEXT: cmpl %esi, %edi
; KNL-NEXT: setb %al
; KNL-NEXT: vpcmpltud %zmm3, %zmm1, %k0
; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: shll $16, %ecx
; KNL-NEXT: vpcmpltud %zmm2, %zmm0, %k0
; KNL-NEXT: kshiftrw $4, %k0, %k1
; KNL-NEXT: kmovw %eax, %k2
; KNL-NEXT: kxorw %k2, %k1, %k1
; KNL-NEXT: kshiftlw $15, %k1, %k1
; KNL-NEXT: kshiftrw $11, %k1, %k1
; KNL-NEXT: kxorw %k1, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: orl %ecx, %eax
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_insertelement_v32i1:
; SKX: ## %bb.0:
; SKX-NEXT: cmpl %esi, %edi
; SKX-NEXT: setb %al
; SKX-NEXT: vpcmpltud %zmm2, %zmm0, %k0
; SKX-NEXT: vpcmpltud %zmm3, %zmm1, %k1
; SKX-NEXT: kunpckwd %k0, %k1, %k0
; SKX-NEXT: kshiftrd $4, %k0, %k1
; SKX-NEXT: kmovd %eax, %k2
; SKX-NEXT: kxord %k2, %k1, %k1
; SKX-NEXT: kshiftld $31, %k1, %k1
; SKX-NEXT: kshiftrd $27, %k1, %k1
; SKX-NEXT: kxord %k1, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%cmp_res_i1 = icmp ult i32 %a, %b
%cmp_cmp_vec = icmp ult <32 x i32> %x, %y
%maskv = insertelement <32 x i1> %cmp_cmp_vec, i1 %cmp_res_i1, i32 4
%res = bitcast <32 x i1> %maskv to i32
ret i32 %res
}
define i8 @test_iinsertelement_v4i1(i32 %a, i32 %b, <4 x i32> %x , <4 x i32> %y) {
; KNL-LABEL: test_iinsertelement_v4i1:
; KNL: ## %bb.0:
; KNL-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; KNL-NEXT: cmpl %esi, %edi
; KNL-NEXT: setb %al
; KNL-NEXT: vpcmpltud %zmm1, %zmm0, %k0
; KNL-NEXT: kshiftrw $2, %k0, %k1
; KNL-NEXT: kmovw %eax, %k2
; KNL-NEXT: kxorw %k2, %k1, %k1
; KNL-NEXT: kshiftlw $15, %k1, %k1
; KNL-NEXT: kshiftrw $13, %k1, %k1
; KNL-NEXT: kxorw %k1, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: ## kill: def $al killed $al killed $eax
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_iinsertelement_v4i1:
; SKX: ## %bb.0:
; SKX-NEXT: cmpl %esi, %edi
; SKX-NEXT: setb %al
; SKX-NEXT: vpcmpltud %xmm1, %xmm0, %k0
; SKX-NEXT: kshiftrb $2, %k0, %k1
; SKX-NEXT: kmovd %eax, %k2
; SKX-NEXT: kxorb %k2, %k1, %k1
; SKX-NEXT: kshiftlb $7, %k1, %k1
; SKX-NEXT: kshiftrb $5, %k1, %k1
; SKX-NEXT: kxorb %k1, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: ## kill: def $al killed $al killed $eax
; SKX-NEXT: retq
%cmp_res_i1 = icmp ult i32 %a, %b
%cmp_cmp_vec = icmp ult <4 x i32> %x, %y
%maskv = insertelement <4 x i1> %cmp_cmp_vec, i1 %cmp_res_i1, i32 2
%res0 = shufflevector <4 x i1> %maskv, <4 x i1> undef , <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
%res = bitcast <8 x i1> %res0 to i8
ret i8 %res
}
define i8 @test_iinsertelement_v2i1(i32 %a, i32 %b, <2 x i64> %x , <2 x i64> %y) {
; KNL-LABEL: test_iinsertelement_v2i1:
; KNL: ## %bb.0:
; KNL-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; KNL-NEXT: cmpl %esi, %edi
; KNL-NEXT: setb %al
; KNL-NEXT: vpcmpltuq %zmm1, %zmm0, %k0
; KNL-NEXT: kshiftlw $15, %k0, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: kshiftlw $1, %k1, %k1
; KNL-NEXT: korw %k1, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: ## kill: def $al killed $al killed $eax
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_iinsertelement_v2i1:
; SKX: ## %bb.0:
; SKX-NEXT: cmpl %esi, %edi
; SKX-NEXT: setb %al
; SKX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0
; SKX-NEXT: kshiftlb $7, %k0, %k0
; SKX-NEXT: kshiftrb $7, %k0, %k0
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: kshiftlb $1, %k1, %k1
; SKX-NEXT: korb %k1, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: ## kill: def $al killed $al killed $eax
; SKX-NEXT: retq
%cmp_res_i1 = icmp ult i32 %a, %b
%cmp_cmp_vec = icmp ult <2 x i64> %x, %y
%maskv = insertelement <2 x i1> %cmp_cmp_vec, i1 %cmp_res_i1, i32 1
%res0 = shufflevector <2 x i1> %maskv, <2 x i1> undef , <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
%res = bitcast <8 x i1> %res0 to i8
ret i8 %res
}
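
; Extracting a single i1 from a vector compare result for scalar use
; (select / zext / sext).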
define zeroext i8 @test_extractelement_v2i1(<2 x i64> %a, <2 x i64> %b) {
; KNL-LABEL: test_extractelement_v2i1:
; KNL: ## %bb.0:
; KNL-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; KNL-NEXT: vpcmpnleuq %zmm1, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: andb $1, %al
; KNL-NEXT: movb $4, %cl
; KNL-NEXT: subb %al, %cl
; KNL-NEXT: movzbl %cl, %eax
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_v2i1:
; SKX: ## %bb.0:
; SKX-NEXT: vpcmpnleuq %xmm1, %xmm0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: andb $1, %al
; SKX-NEXT: movb $4, %cl
; SKX-NEXT: subb %al, %cl
; SKX-NEXT: movzbl %cl, %eax
; SKX-NEXT: retq
%t1 = icmp ugt <2 x i64> %a, %b
%t2 = extractelement <2 x i1> %t1, i32 0
%res = select i1 %t2, i8 3, i8 4
ret i8 %res
}
define zeroext i8 @extractelement_v2i1_alt(<2 x i64> %a, <2 x i64> %b) {
; KNL-LABEL: extractelement_v2i1_alt:
; KNL: ## %bb.0:
; KNL-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; KNL-NEXT: vpcmpnleuq %zmm1, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: andb $1, %al
; KNL-NEXT: movb $4, %cl
; KNL-NEXT: subb %al, %cl
; KNL-NEXT: movzbl %cl, %eax
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: extractelement_v2i1_alt:
; SKX: ## %bb.0:
; SKX-NEXT: vpcmpnleuq %xmm1, %xmm0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: andb $1, %al
; SKX-NEXT: movb $4, %cl
; SKX-NEXT: subb %al, %cl
; SKX-NEXT: movzbl %cl, %eax
; SKX-NEXT: retq
%t1 = icmp ugt <2 x i64> %a, %b
%t2 = extractelement <2 x i1> %t1, i32 0
%sext = sext i1 %t2 to i8
%res = add i8 %sext, 4
ret i8 %res
}
define zeroext i8 @test_extractelement_v4i1(<4 x i32> %a, <4 x i32> %b) {
; KNL-LABEL: test_extractelement_v4i1:
; KNL: ## %bb.0:
; KNL-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; KNL-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
; KNL-NEXT: kshiftrw $3, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: andl $1, %eax
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_v4i1:
; SKX: ## %bb.0:
; SKX-NEXT: vpcmpnleud %xmm1, %xmm0, %k0
; SKX-NEXT: kshiftrw $3, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: andl $1, %eax
; SKX-NEXT: retq
%t1 = icmp ugt <4 x i32> %a, %b
%t2 = extractelement <4 x i1> %t1, i32 3
%res = zext i1 %t2 to i8
ret i8 %res
}
define zeroext i8 @test_extractelement_v32i1(<32 x i8> %a, <32 x i8> %b) {
; KNL-LABEL: test_extractelement_v32i1:
; KNL: ## %bb.0:
; KNL-NEXT: vpminub %ymm1, %ymm0, %ymm1
; KNL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kshiftrw $2, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: andl $1, %eax
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_v32i1:
; SKX: ## %bb.0:
; SKX-NEXT: vpcmpnleub %ymm1, %ymm0, %k0
; SKX-NEXT: kshiftrd $2, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: andl $1, %eax
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t1 = icmp ugt <32 x i8> %a, %b
%t2 = extractelement <32 x i1> %t1, i32 2
%res = zext i1 %t2 to i8
ret i8 %res
}
define zeroext i8 @test_extractelement_v64i1(<64 x i8> %a, <64 x i8> %b) {
; KNL-LABEL: test_extractelement_v64i1:
; KNL: ## %bb.0:
; KNL-NEXT: vpminub %ymm3, %ymm1, %ymm0
; KNL-NEXT: vpcmpeqb %ymm0, %ymm1, %ymm0
; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: andb $1, %al
; KNL-NEXT: movb $4, %cl
; KNL-NEXT: subb %al, %cl
; KNL-NEXT: movzbl %cl, %eax
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_v64i1:
; SKX: ## %bb.0:
; SKX-NEXT: vpcmpnleub %zmm1, %zmm0, %k0
; SKX-NEXT: kshiftrq $63, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: andb $1, %al
; SKX-NEXT: movb $4, %cl
; SKX-NEXT: subb %al, %cl
; SKX-NEXT: movzbl %cl, %eax
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t1 = icmp ugt <64 x i8> %a, %b
%t2 = extractelement <64 x i1> %t1, i32 63
%res = select i1 %t2, i8 3, i8 4
ret i8 %res
}
define zeroext i8 @extractelement_v64i1_alt(<64 x i8> %a, <64 x i8> %b) {
; KNL-LABEL: extractelement_v64i1_alt:
; KNL: ## %bb.0:
; KNL-NEXT: vpminub %ymm3, %ymm1, %ymm0
; KNL-NEXT: vpcmpeqb %ymm0, %ymm1, %ymm0
; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: andb $1, %al
; KNL-NEXT: movb $4, %cl
; KNL-NEXT: subb %al, %cl
; KNL-NEXT: movzbl %cl, %eax
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: extractelement_v64i1_alt:
; SKX: ## %bb.0:
; SKX-NEXT: vpcmpnleub %zmm1, %zmm0, %k0
; SKX-NEXT: kshiftrq $63, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: andb $1, %al
; SKX-NEXT: movb $4, %cl
; SKX-NEXT: subb %al, %cl
; SKX-NEXT: movzbl %cl, %eax
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t1 = icmp ugt <64 x i8> %a, %b
%t2 = extractelement <64 x i1> %t1, i32 63
%sext = sext i1 %t2 to i8
%res = add i8 %sext, 4
ret i8 %res
}
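
; Variable-index extracts over each element type and vector width: the vector
; is spilled to an aligned stack slot and the element loaded back.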
define i64 @test_extractelement_variable_v2i64(<2 x i64> %t1, i32 %index) {
; CHECK-LABEL: test_extractelement_variable_v2i64:
; CHECK: ## %bb.0:
; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: movq -24(%rsp,%rdi,8), %rax
; CHECK-NEXT: retq
%t2 = extractelement <2 x i64> %t1, i32 %index
ret i64 %t2
}
define i64 @test_extractelement_variable_v4i64(<4 x i64> %t1, i32 %index) {
; CHECK-LABEL: test_extractelement_variable_v4i64:
; CHECK: ## %bb.0:
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset %rbp, -16
; CHECK-NEXT: movq %rsp, %rbp
; CHECK-NEXT: .cfi_def_cfa_register %rbp
; CHECK-NEXT: andq $-32, %rsp
; CHECK-NEXT: subq $64, %rsp
; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT: vmovaps %ymm0, (%rsp)
; CHECK-NEXT: andl $3, %edi
; CHECK-NEXT: movq (%rsp,%rdi,8), %rax
; CHECK-NEXT: movq %rbp, %rsp
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%t2 = extractelement <4 x i64> %t1, i32 %index
ret i64 %t2
}
define i64 @test_extractelement_variable_v8i64(<8 x i64> %t1, i32 %index) {
; CHECK-LABEL: test_extractelement_variable_v8i64:
; CHECK: ## %bb.0:
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset %rbp, -16
; CHECK-NEXT: movq %rsp, %rbp
; CHECK-NEXT: .cfi_def_cfa_register %rbp
; CHECK-NEXT: andq $-64, %rsp
; CHECK-NEXT: subq $128, %rsp
; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT: vmovaps %zmm0, (%rsp)
; CHECK-NEXT: andl $7, %edi
; CHECK-NEXT: movq (%rsp,%rdi,8), %rax
; CHECK-NEXT: movq %rbp, %rsp
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%t2 = extractelement <8 x i64> %t1, i32 %index
ret i64 %t2
}
define double @test_extractelement_variable_v2f64(<2 x double> %t1, i32 %index) {
; CHECK-LABEL: test_extractelement_variable_v2f64:
; CHECK: ## %bb.0:
; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: retq
%t2 = extractelement <2 x double> %t1, i32 %index
ret double %t2
}
define double @test_extractelement_variable_v4f64(<4 x double> %t1, i32 %index) {
; CHECK-LABEL: test_extractelement_variable_v4f64:
; CHECK: ## %bb.0:
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset %rbp, -16
; CHECK-NEXT: movq %rsp, %rbp
; CHECK-NEXT: .cfi_def_cfa_register %rbp
; CHECK-NEXT: andq $-32, %rsp
; CHECK-NEXT: subq $64, %rsp
; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT: vmovaps %ymm0, (%rsp)
; CHECK-NEXT: andl $3, %edi
; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movq %rbp, %rsp
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%t2 = extractelement <4 x double> %t1, i32 %index
ret double %t2
}
define double @test_extractelement_variable_v8f64(<8 x double> %t1, i32 %index) {
; CHECK-LABEL: test_extractelement_variable_v8f64:
; CHECK: ## %bb.0:
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset %rbp, -16
; CHECK-NEXT: movq %rsp, %rbp
; CHECK-NEXT: .cfi_def_cfa_register %rbp
; CHECK-NEXT: andq $-64, %rsp
; CHECK-NEXT: subq $128, %rsp
; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT: vmovaps %zmm0, (%rsp)
; CHECK-NEXT: andl $7, %edi
; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movq %rbp, %rsp
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%t2 = extractelement <8 x double> %t1, i32 %index
ret double %t2
}
define i32 @test_extractelement_variable_v4i32(<4 x i32> %t1, i32 %index) {
; CHECK-LABEL: test_extractelement_variable_v4i32:
; CHECK: ## %bb.0:
; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: andl $3, %edi
; CHECK-NEXT: movl -24(%rsp,%rdi,4), %eax
; CHECK-NEXT: retq
%t2 = extractelement <4 x i32> %t1, i32 %index
ret i32 %t2
}
define i32 @test_extractelement_variable_v8i32(<8 x i32> %t1, i32 %index) {
; CHECK-LABEL: test_extractelement_variable_v8i32:
; CHECK: ## %bb.0:
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset %rbp, -16
; CHECK-NEXT: movq %rsp, %rbp
; CHECK-NEXT: .cfi_def_cfa_register %rbp
; CHECK-NEXT: andq $-32, %rsp
; CHECK-NEXT: subq $64, %rsp
; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT: vmovaps %ymm0, (%rsp)
; CHECK-NEXT: andl $7, %edi
; CHECK-NEXT: movl (%rsp,%rdi,4), %eax
; CHECK-NEXT: movq %rbp, %rsp
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%t2 = extractelement <8 x i32> %t1, i32 %index
ret i32 %t2
}
define i32 @test_extractelement_variable_v16i32(<16 x i32> %t1, i32 %index) {
; CHECK-LABEL: test_extractelement_variable_v16i32:
; CHECK: ## %bb.0:
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset %rbp, -16
; CHECK-NEXT: movq %rsp, %rbp
; CHECK-NEXT: .cfi_def_cfa_register %rbp
; CHECK-NEXT: andq $-64, %rsp
; CHECK-NEXT: subq $128, %rsp
; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT: vmovaps %zmm0, (%rsp)
; CHECK-NEXT: andl $15, %edi
; CHECK-NEXT: movl (%rsp,%rdi,4), %eax
; CHECK-NEXT: movq %rbp, %rsp
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%t2 = extractelement <16 x i32> %t1, i32 %index
ret i32 %t2
}
define float @test_extractelement_variable_v4f32(<4 x float> %t1, i32 %index) {
; CHECK-LABEL: test_extractelement_variable_v4f32:
; CHECK: ## %bb.0:
; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: andl $3, %edi
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: retq
%t2 = extractelement <4 x float> %t1, i32 %index
ret float %t2
}
define float @test_extractelement_variable_v8f32(<8 x float> %t1, i32 %index) {
; CHECK-LABEL: test_extractelement_variable_v8f32:
; CHECK: ## %bb.0:
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset %rbp, -16
; CHECK-NEXT: movq %rsp, %rbp
; CHECK-NEXT: .cfi_def_cfa_register %rbp
; CHECK-NEXT: andq $-32, %rsp
; CHECK-NEXT: subq $64, %rsp
; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT: vmovaps %ymm0, (%rsp)
; CHECK-NEXT: andl $7, %edi
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: movq %rbp, %rsp
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%t2 = extractelement <8 x float> %t1, i32 %index
ret float %t2
}
define float @test_extractelement_variable_v16f32(<16 x float> %t1, i32 %index) {
; CHECK-LABEL: test_extractelement_variable_v16f32:
; CHECK: ## %bb.0:
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset %rbp, -16
; CHECK-NEXT: movq %rsp, %rbp
; CHECK-NEXT: .cfi_def_cfa_register %rbp
; CHECK-NEXT: andq $-64, %rsp
; CHECK-NEXT: subq $128, %rsp
; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT: vmovaps %zmm0, (%rsp)
; CHECK-NEXT: andl $15, %edi
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: movq %rbp, %rsp
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%t2 = extractelement <16 x float> %t1, i32 %index
ret float %t2
}
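; i16 element extracts: the reload is a zero-extending movzwl with scale 2.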
define i16 @test_extractelement_variable_v8i16(<8 x i16> %t1, i32 %index) {
; CHECK-LABEL: test_extractelement_variable_v8i16:
; CHECK: ## %bb.0:
; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: andl $7, %edi
; CHECK-NEXT: movzwl -24(%rsp,%rdi,2), %eax
; CHECK-NEXT: retq
%t2 = extractelement <8 x i16> %t1, i32 %index
ret i16 %t2
}
define i16 @test_extractelement_variable_v16i16(<16 x i16> %t1, i32 %index) {
; CHECK-LABEL: test_extractelement_variable_v16i16:
; CHECK: ## %bb.0:
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset %rbp, -16
; CHECK-NEXT: movq %rsp, %rbp
; CHECK-NEXT: .cfi_def_cfa_register %rbp
; CHECK-NEXT: andq $-32, %rsp
; CHECK-NEXT: subq $64, %rsp
; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT: vmovaps %ymm0, (%rsp)
; CHECK-NEXT: andl $15, %edi
; CHECK-NEXT: movzwl (%rsp,%rdi,2), %eax
; CHECK-NEXT: movq %rbp, %rsp
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%t2 = extractelement <16 x i16> %t1, i32 %index
ret i16 %t2
}
define i16 @test_extractelement_variable_v32i16(<32 x i16> %t1, i32 %index) {
; KNL-LABEL: test_extractelement_variable_v32i16:
; KNL: ## %bb.0:
; KNL-NEXT: pushq %rbp
; KNL-NEXT: .cfi_def_cfa_offset 16
; KNL-NEXT: .cfi_offset %rbp, -16
; KNL-NEXT: movq %rsp, %rbp
; KNL-NEXT: .cfi_def_cfa_register %rbp
; KNL-NEXT: andq $-64, %rsp
; KNL-NEXT: subq $128, %rsp
; KNL-NEXT: ## kill: def $edi killed $edi def $rdi
; KNL-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovaps %ymm0, (%rsp)
; KNL-NEXT: andl $31, %edi
; KNL-NEXT: movzwl (%rsp,%rdi,2), %eax
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_variable_v32i16:
; SKX: ## %bb.0:
; SKX-NEXT: pushq %rbp
; SKX-NEXT: .cfi_def_cfa_offset 16
; SKX-NEXT: .cfi_offset %rbp, -16
; SKX-NEXT: movq %rsp, %rbp
; SKX-NEXT: .cfi_def_cfa_register %rbp
; SKX-NEXT: andq $-64, %rsp
; SKX-NEXT: subq $128, %rsp
; SKX-NEXT: ## kill: def $edi killed $edi def $rdi
; SKX-NEXT: vmovaps %zmm0, (%rsp)
; SKX-NEXT: andl $31, %edi
; SKX-NEXT: movzwl (%rsp,%rdi,2), %eax
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t2 = extractelement <32 x i16> %t1, i32 %index
ret i16 %t2
}
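; i8 element extracts. For v64i8 the spill differs by subtarget: KNL stores
; two ymm halves, while SKX (AVX512BW) stores a single zmm.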
define i8 @test_extractelement_variable_v16i8(<16 x i8> %t1, i32 %index) {
; CHECK-LABEL: test_extractelement_variable_v16i8:
; CHECK: ## %bb.0:
; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: andl $15, %edi
; CHECK-NEXT: movb -24(%rsp,%rdi), %al
; CHECK-NEXT: retq
%t2 = extractelement <16 x i8> %t1, i32 %index
ret i8 %t2
}
define i8 @test_extractelement_variable_v32i8(<32 x i8> %t1, i32 %index) {
; CHECK-LABEL: test_extractelement_variable_v32i8:
; CHECK: ## %bb.0:
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset %rbp, -16
; CHECK-NEXT: movq %rsp, %rbp
; CHECK-NEXT: .cfi_def_cfa_register %rbp
; CHECK-NEXT: andq $-32, %rsp
; CHECK-NEXT: subq $64, %rsp
; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT: vmovaps %ymm0, (%rsp)
; CHECK-NEXT: andl $31, %edi
; CHECK-NEXT: movb (%rsp,%rdi), %al
; CHECK-NEXT: movq %rbp, %rsp
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%t2 = extractelement <32 x i8> %t1, i32 %index
ret i8 %t2
}
define i8 @test_extractelement_variable_v64i8(<64 x i8> %t1, i32 %index) {
; KNL-LABEL: test_extractelement_variable_v64i8:
; KNL: ## %bb.0:
; KNL-NEXT: pushq %rbp
; KNL-NEXT: .cfi_def_cfa_offset 16
; KNL-NEXT: .cfi_offset %rbp, -16
; KNL-NEXT: movq %rsp, %rbp
; KNL-NEXT: .cfi_def_cfa_register %rbp
; KNL-NEXT: andq $-64, %rsp
; KNL-NEXT: subq $128, %rsp
; KNL-NEXT: ## kill: def $edi killed $edi def $rdi
; KNL-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovaps %ymm0, (%rsp)
; KNL-NEXT: andl $63, %edi
; KNL-NEXT: movb (%rsp,%rdi), %al
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_variable_v64i8:
; SKX: ## %bb.0:
; SKX-NEXT: pushq %rbp
; SKX-NEXT: .cfi_def_cfa_offset 16
; SKX-NEXT: .cfi_offset %rbp, -16
; SKX-NEXT: movq %rsp, %rbp
; SKX-NEXT: .cfi_def_cfa_register %rbp
; SKX-NEXT: andq $-64, %rsp
; SKX-NEXT: subq $128, %rsp
; SKX-NEXT: ## kill: def $edi killed $edi def $rdi
; SKX-NEXT: vmovaps %zmm0, (%rsp)
; SKX-NEXT: andl $63, %edi
; SKX-NEXT: movb (%rsp,%rdi), %al
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t2 = extractelement <64 x i8> %t1, i32 %index
ret i8 %t2
}
define i8 @test_extractelement_variable_v64i8_indexi8(<64 x i8> %t1, i8 %index) {
; KNL-LABEL: test_extractelement_variable_v64i8_indexi8:
; KNL: ## %bb.0:
; KNL-NEXT: pushq %rbp
; KNL-NEXT: .cfi_def_cfa_offset 16
; KNL-NEXT: .cfi_offset %rbp, -16
; KNL-NEXT: movq %rsp, %rbp
; KNL-NEXT: .cfi_def_cfa_register %rbp
; KNL-NEXT: andq $-64, %rsp
; KNL-NEXT: subq $128, %rsp
; KNL-NEXT: addb %dil, %dil
; KNL-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovaps %ymm0, (%rsp)
; KNL-NEXT: movzbl %dil, %eax
; KNL-NEXT: andl $63, %eax
; KNL-NEXT: movb (%rsp,%rax), %al
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_variable_v64i8_indexi8:
; SKX: ## %bb.0:
; SKX-NEXT: pushq %rbp
; SKX-NEXT: .cfi_def_cfa_offset 16
; SKX-NEXT: .cfi_offset %rbp, -16
; SKX-NEXT: movq %rsp, %rbp
; SKX-NEXT: .cfi_def_cfa_register %rbp
; SKX-NEXT: andq $-64, %rsp
; SKX-NEXT: subq $128, %rsp
; SKX-NEXT: addb %dil, %dil
; SKX-NEXT: vmovaps %zmm0, (%rsp)
; SKX-NEXT: movzbl %dil, %eax
; SKX-NEXT: andl $63, %eax
; SKX-NEXT: movb (%rsp,%rax), %al
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%i = add i8 %index, %index
%t2 = extractelement <64 x i8> %t1, i8 %i
ret i8 %t2
}
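; Variable extract of an i1 from a compare result. KNL widens the compare to
; 512 bits and spills a sign-extended mask vector; SKX expands the k-register
; directly with vpmovm2*.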
define zeroext i8 @test_extractelement_variable_v2i1(<2 x i64> %a, <2 x i64> %b, i32 %index) {
; KNL-LABEL: test_extractelement_variable_v2i1:
; KNL: ## %bb.0:
; KNL-NEXT: ## kill: def $edi killed $edi def $rdi
; KNL-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; KNL-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1
; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vextracti32x4 $0, %zmm0, -{{[0-9]+}}(%rsp)
; KNL-NEXT: andl $1, %edi
; KNL-NEXT: movzbl -24(%rsp,%rdi,8), %eax
; KNL-NEXT: andl $1, %eax
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_variable_v2i1:
; SKX: ## %bb.0:
; SKX-NEXT: ## kill: def $edi killed $edi def $rdi
; SKX-NEXT: vpcmpnleuq %xmm1, %xmm0, %k0
; SKX-NEXT: vpmovm2q %k0, %xmm0
; SKX-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
; SKX-NEXT: andl $1, %edi
; SKX-NEXT: movzbl -24(%rsp,%rdi,8), %eax
; SKX-NEXT: andl $1, %eax
; SKX-NEXT: retq
%t1 = icmp ugt <2 x i64> %a, %b
%t2 = extractelement <2 x i1> %t1, i32 %index
%res = zext i1 %t2 to i8
ret i8 %res
}
define zeroext i8 @test_extractelement_variable_v4i1(<4 x i32> %a, <4 x i32> %b, i32 %index) {
; KNL-LABEL: test_extractelement_variable_v4i1:
; KNL: ## %bb.0:
; KNL-NEXT: ## kill: def $edi killed $edi def $rdi
; KNL-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; KNL-NEXT: vpcmpnleud %zmm1, %zmm0, %k1
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vextracti32x4 $0, %zmm0, -{{[0-9]+}}(%rsp)
; KNL-NEXT: andl $3, %edi
; KNL-NEXT: movzbl -24(%rsp,%rdi,4), %eax
; KNL-NEXT: andl $1, %eax
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_variable_v4i1:
; SKX: ## %bb.0:
; SKX-NEXT: ## kill: def $edi killed $edi def $rdi
; SKX-NEXT: vpcmpnleud %xmm1, %xmm0, %k0
; SKX-NEXT: vpmovm2d %k0, %xmm0
; SKX-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
; SKX-NEXT: andl $3, %edi
; SKX-NEXT: movzbl -24(%rsp,%rdi,4), %eax
; SKX-NEXT: andl $1, %eax
; SKX-NEXT: retq
%t1 = icmp ugt <4 x i32> %a, %b
%t2 = extractelement <4 x i1> %t1, i32 %index
%res = zext i1 %t2 to i8
ret i8 %res
}
define zeroext i8 @test_extractelement_variable_v8i1(<8 x i32> %a, <8 x i32> %b, i32 %index) {
; KNL-LABEL: test_extractelement_variable_v8i1:
; KNL: ## %bb.0:
; KNL-NEXT: ## kill: def $edi killed $edi def $rdi
; KNL-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1
; KNL-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
; KNL-NEXT: vpcmpnleud %zmm1, %zmm0, %k1
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdw %zmm0, %ymm0
; KNL-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
; KNL-NEXT: andl $7, %edi
; KNL-NEXT: movzbl -24(%rsp,%rdi,2), %eax
; KNL-NEXT: andl $1, %eax
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_variable_v8i1:
; SKX: ## %bb.0:
; SKX-NEXT: ## kill: def $edi killed $edi def $rdi
; SKX-NEXT: vpcmpnleud %ymm1, %ymm0, %k0
; SKX-NEXT: vpmovm2w %k0, %xmm0
; SKX-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
; SKX-NEXT: andl $7, %edi
; SKX-NEXT: movzbl -24(%rsp,%rdi,2), %eax
; SKX-NEXT: andl $1, %eax
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t1 = icmp ugt <8 x i32> %a, %b
%t2 = extractelement <8 x i1> %t1, i32 %index
%res = zext i1 %t2 to i8
ret i8 %res
}
define zeroext i8 @test_extractelement_variable_v16i1(<16 x i32> %a, <16 x i32> %b, i32 %index) {
; KNL-LABEL: test_extractelement_variable_v16i1:
; KNL: ## %bb.0:
; KNL-NEXT: ## kill: def $edi killed $edi def $rdi
; KNL-NEXT: vpcmpnleud %zmm1, %zmm0, %k1
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm0, -{{[0-9]+}}(%rsp)
; KNL-NEXT: andl $15, %edi
; KNL-NEXT: movzbl -24(%rsp,%rdi), %eax
; KNL-NEXT: andl $1, %eax
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_variable_v16i1:
; SKX: ## %bb.0:
; SKX-NEXT: ## kill: def $edi killed $edi def $rdi
; SKX-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
; SKX-NEXT: vpmovm2b %k0, %xmm0
; SKX-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
; SKX-NEXT: andl $15, %edi
; SKX-NEXT: movzbl -24(%rsp,%rdi), %eax
; SKX-NEXT: andl $1, %eax
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t1 = icmp ugt <16 x i32> %a, %b
%t2 = extractelement <16 x i1> %t1, i32 %index
%res = zext i1 %t2 to i8
ret i8 %res
}
define zeroext i8 @test_extractelement_variable_v32i1(<32 x i8> %a, <32 x i8> %b, i32 %index) {
; KNL-LABEL: test_extractelement_variable_v32i1:
; KNL: ## %bb.0:
; KNL-NEXT: pushq %rbp
; KNL-NEXT: .cfi_def_cfa_offset 16
; KNL-NEXT: .cfi_offset %rbp, -16
; KNL-NEXT: movq %rsp, %rbp
; KNL-NEXT: .cfi_def_cfa_register %rbp
; KNL-NEXT: andq $-32, %rsp
; KNL-NEXT: subq $64, %rsp
; KNL-NEXT: ## kill: def $edi killed $edi def $rdi
; KNL-NEXT: vpminub %ymm1, %ymm0, %ymm1
; KNL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; KNL-NEXT: vmovdqa %ymm0, (%rsp)
; KNL-NEXT: andl $31, %edi
; KNL-NEXT: movzbl (%rsp,%rdi), %eax
; KNL-NEXT: andl $1, %eax
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_variable_v32i1:
; SKX: ## %bb.0:
; SKX-NEXT: pushq %rbp
; SKX-NEXT: .cfi_def_cfa_offset 16
; SKX-NEXT: .cfi_offset %rbp, -16
; SKX-NEXT: movq %rsp, %rbp
; SKX-NEXT: .cfi_def_cfa_register %rbp
; SKX-NEXT: andq $-32, %rsp
; SKX-NEXT: subq $64, %rsp
; SKX-NEXT: ## kill: def $edi killed $edi def $rdi
; SKX-NEXT: vpcmpnleub %ymm1, %ymm0, %k0
; SKX-NEXT: vpmovm2b %k0, %ymm0
; SKX-NEXT: vmovdqa %ymm0, (%rsp)
; SKX-NEXT: andl $31, %edi
; SKX-NEXT: movzbl (%rsp,%rdi), %eax
; SKX-NEXT: andl $1, %eax
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t1 = icmp ugt <32 x i8> %a, %b
%t2 = extractelement <32 x i1> %t1, i32 %index
%res = zext i1 %t2 to i8
ret i8 %res
}
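; Constant shuffle, not a variable index: inserting a v2i64 into a zeroed
; v8i64 should fold to a single vinsertf32x4 into a zero register.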
define <8 x i64> @insert_double_zero(<2 x i64> %a) nounwind {
; CHECK-LABEL: insert_double_zero:
; CHECK: ## %bb.0:
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vinsertf32x4 $2, %xmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%b = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%d = shufflevector <4 x i64> %b, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%e = shufflevector <8 x i64> %d, <8 x i64> zeroinitializer, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 3>
ret <8 x i64> %e
}
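; Variable-index inserts into i1 vectors: the mask is expanded to one byte
; per element on the stack, the new bit is written with setne, and the
; result is re-packed into a mask and moved to a GPR.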
define i32 @test_insertelement_variable_v32i1(<32 x i8> %a, i8 %b, i32 %index) {
; KNL-LABEL: test_insertelement_variable_v32i1:
; KNL: ## %bb.0:
; KNL-NEXT: pushq %rbp
; KNL-NEXT: .cfi_def_cfa_offset 16
; KNL-NEXT: .cfi_offset %rbp, -16
; KNL-NEXT: movq %rsp, %rbp
; KNL-NEXT: .cfi_def_cfa_register %rbp
; KNL-NEXT: andq $-32, %rsp
; KNL-NEXT: subq $64, %rsp
; KNL-NEXT: ## kill: def $esi killed $esi def $rsi
; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; KNL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; KNL-NEXT: andl $31, %esi
; KNL-NEXT: testb %dil, %dil
; KNL-NEXT: vmovdqa %ymm0, (%rsp)
; KNL-NEXT: setne (%rsp,%rsi)
; KNL-NEXT: vpmovsxbd (%rsp), %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: shll $16, %eax
; KNL-NEXT: orl %ecx, %eax
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_insertelement_variable_v32i1:
; SKX: ## %bb.0:
; SKX-NEXT: pushq %rbp
; SKX-NEXT: .cfi_def_cfa_offset 16
; SKX-NEXT: .cfi_offset %rbp, -16
; SKX-NEXT: movq %rsp, %rbp
; SKX-NEXT: .cfi_def_cfa_register %rbp
; SKX-NEXT: andq $-32, %rsp
; SKX-NEXT: subq $64, %rsp
; SKX-NEXT: ## kill: def $esi killed $esi def $rsi
; SKX-NEXT: vptestmb %ymm0, %ymm0, %k0
; SKX-NEXT: andl $31, %esi
; SKX-NEXT: testb %dil, %dil
; SKX-NEXT: vpmovm2b %k0, %ymm0
; SKX-NEXT: vmovdqa %ymm0, (%rsp)
; SKX-NEXT: setne (%rsp,%rsi)
; SKX-NEXT: vpsllw $7, (%rsp), %ymm0
; SKX-NEXT: vpmovb2m %ymm0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t1 = icmp ugt <32 x i8> %a, zeroinitializer
%t2 = icmp ugt i8 %b, 0
%t3 = insertelement <32 x i1> %t1, i1 %t2, i32 %index
%t4 = bitcast <32 x i1> %t3 to i32
ret i32 %t4
}
define i64 @test_insertelement_variable_v64i1(<64 x i8> %a, i8 %b, i32 %index) {
; KNL-LABEL: test_insertelement_variable_v64i1:
; KNL: ## %bb.0:
; KNL-NEXT: pushq %rbp
; KNL-NEXT: .cfi_def_cfa_offset 16
; KNL-NEXT: .cfi_offset %rbp, -16
; KNL-NEXT: movq %rsp, %rbp
; KNL-NEXT: .cfi_def_cfa_register %rbp
; KNL-NEXT: andq $-64, %rsp
; KNL-NEXT: subq $128, %rsp
; KNL-NEXT: ## kill: def $esi killed $esi def $rsi
; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; KNL-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; KNL-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1
; KNL-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1
; KNL-NEXT: andl $63, %esi
; KNL-NEXT: testb %dil, %dil
; KNL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovdqa %ymm0, (%rsp)
; KNL-NEXT: setne (%rsp,%rsi)
; KNL-NEXT: vmovdqa (%rsp), %ymm0
; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm1
; KNL-NEXT: vpmovsxbd %xmm0, %zmm2
; KNL-NEXT: vpslld $31, %zmm2, %zmm2
; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: shll $16, %ecx
; KNL-NEXT: orl %eax, %ecx
; KNL-NEXT: vpmovsxbd %xmm1, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %edx
; KNL-NEXT: vextracti128 $1, %ymm1, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: shll $16, %eax
; KNL-NEXT: orl %edx, %eax
; KNL-NEXT: shlq $32, %rax
; KNL-NEXT: orq %rcx, %rax
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_insertelement_variable_v64i1:
; SKX: ## %bb.0:
; SKX-NEXT: pushq %rbp
; SKX-NEXT: .cfi_def_cfa_offset 16
; SKX-NEXT: .cfi_offset %rbp, -16
; SKX-NEXT: movq %rsp, %rbp
; SKX-NEXT: .cfi_def_cfa_register %rbp
; SKX-NEXT: andq $-64, %rsp
; SKX-NEXT: subq $128, %rsp
; SKX-NEXT: ## kill: def $esi killed $esi def $rsi
; SKX-NEXT: vptestmb %zmm0, %zmm0, %k0
; SKX-NEXT: andl $63, %esi
; SKX-NEXT: testb %dil, %dil
; SKX-NEXT: vpmovm2b %k0, %zmm0
; SKX-NEXT: vmovdqa64 %zmm0, (%rsp)
; SKX-NEXT: setne (%rsp,%rsi)
; SKX-NEXT: vpsllw $7, (%rsp), %zmm0
; SKX-NEXT: vpmovb2m %zmm0, %k0
; SKX-NEXT: kmovq %k0, %rax
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t1 = icmp ugt <64 x i8> %a, zeroinitializer
%t2 = icmp ugt i8 %b, 0
%t3 = insertelement <64 x i1> %t1, i1 %t2, i32 %index
%t4 = bitcast <64 x i1> %t3 to i64
ret i64 %t4
}
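; <96 x i8> is presumably split into scalar i8 arguments here (note the
; vpinsrb rebuild from registers and %rbp stack slots) before the same
; stack-based insert is performed.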
define i96 @test_insertelement_variable_v96i1(<96 x i8> %a, i8 %b, i32 %index) {
; KNL-LABEL: test_insertelement_variable_v96i1:
; KNL: ## %bb.0:
; KNL-NEXT: pushq %rbp
; KNL-NEXT: .cfi_def_cfa_offset 16
; KNL-NEXT: .cfi_offset %rbp, -16
; KNL-NEXT: movq %rsp, %rbp
; KNL-NEXT: .cfi_def_cfa_register %rbp
; KNL-NEXT: andq $-128, %rsp
; KNL-NEXT: subq $256, %rsp ## imm = 0x100
; KNL-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; KNL-NEXT: vpinsrb $1, 488(%rbp), %xmm0, %xmm0
; KNL-NEXT: vpinsrb $2, 496(%rbp), %xmm0, %xmm0
; KNL-NEXT: vpinsrb $3, 504(%rbp), %xmm0, %xmm0
; KNL-NEXT: vpinsrb $4, 512(%rbp), %xmm0, %xmm0
; KNL-NEXT: vpinsrb $5, 520(%rbp), %xmm0, %xmm0
; KNL-NEXT: vpinsrb $6, 528(%rbp), %xmm0, %xmm0
; KNL-NEXT: vpinsrb $7, 536(%rbp), %xmm0, %xmm0
; KNL-NEXT: vpinsrb $8, 544(%rbp), %xmm0, %xmm0
; KNL-NEXT: vpinsrb $9, 552(%rbp), %xmm0, %xmm0
; KNL-NEXT: vpinsrb $10, 560(%rbp), %xmm0, %xmm0
; KNL-NEXT: vpinsrb $11, 568(%rbp), %xmm0, %xmm0
; KNL-NEXT: vpinsrb $12, 576(%rbp), %xmm0, %xmm0
; KNL-NEXT: vpinsrb $13, 584(%rbp), %xmm0, %xmm0
; KNL-NEXT: vpinsrb $14, 592(%rbp), %xmm0, %xmm0
; KNL-NEXT: vpinsrb $15, 600(%rbp), %xmm0, %xmm0
; KNL-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; KNL-NEXT: vpinsrb $1, 616(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $2, 624(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $3, 632(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $4, 640(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $5, 648(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $6, 656(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $7, 664(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $8, 672(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $9, 680(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $10, 688(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $11, 696(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $12, 704(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $13, 712(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $14, 720(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $15, 728(%rbp), %xmm1, %xmm1
; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; KNL-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; KNL-NEXT: vpinsrb $1, 232(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $2, 240(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $3, 248(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $4, 256(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $5, 264(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $6, 272(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $7, 280(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $8, 288(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $9, 296(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $10, 304(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $11, 312(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $12, 320(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $13, 328(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $14, 336(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $15, 344(%rbp), %xmm1, %xmm1
; KNL-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; KNL-NEXT: vpinsrb $1, 360(%rbp), %xmm2, %xmm2
; KNL-NEXT: vpinsrb $2, 368(%rbp), %xmm2, %xmm2
; KNL-NEXT: vpinsrb $3, 376(%rbp), %xmm2, %xmm2
; KNL-NEXT: vpinsrb $4, 384(%rbp), %xmm2, %xmm2
; KNL-NEXT: vpinsrb $5, 392(%rbp), %xmm2, %xmm2
; KNL-NEXT: vpinsrb $6, 400(%rbp), %xmm2, %xmm2
; KNL-NEXT: vpinsrb $7, 408(%rbp), %xmm2, %xmm2
; KNL-NEXT: vpinsrb $8, 416(%rbp), %xmm2, %xmm2
; KNL-NEXT: vpinsrb $9, 424(%rbp), %xmm2, %xmm2
; KNL-NEXT: vpinsrb $10, 432(%rbp), %xmm2, %xmm2
; KNL-NEXT: vpinsrb $11, 440(%rbp), %xmm2, %xmm2
; KNL-NEXT: vpinsrb $12, 448(%rbp), %xmm2, %xmm2
; KNL-NEXT: vpinsrb $13, 456(%rbp), %xmm2, %xmm2
; KNL-NEXT: vpinsrb $14, 464(%rbp), %xmm2, %xmm2
; KNL-NEXT: vpinsrb $15, 472(%rbp), %xmm2, %xmm2
; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; KNL-NEXT: vmovd %edi, %xmm2
; KNL-NEXT: vpinsrb $1, %esi, %xmm2, %xmm2
; KNL-NEXT: vpinsrb $2, %edx, %xmm2, %xmm2
; KNL-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2
; KNL-NEXT: vpinsrb $4, %r8d, %xmm2, %xmm2
; KNL-NEXT: vpinsrb $5, %r9d, %xmm2, %xmm2
; KNL-NEXT: vpinsrb $6, 16(%rbp), %xmm2, %xmm2
; KNL-NEXT: vpinsrb $7, 24(%rbp), %xmm2, %xmm2
; KNL-NEXT: vpinsrb $8, 32(%rbp), %xmm2, %xmm2
; KNL-NEXT: vpinsrb $9, 40(%rbp), %xmm2, %xmm2
; KNL-NEXT: vpinsrb $10, 48(%rbp), %xmm2, %xmm2
; KNL-NEXT: vpinsrb $11, 56(%rbp), %xmm2, %xmm2
; KNL-NEXT: vpinsrb $12, 64(%rbp), %xmm2, %xmm2
; KNL-NEXT: vpinsrb $13, 72(%rbp), %xmm2, %xmm2
; KNL-NEXT: vpinsrb $14, 80(%rbp), %xmm2, %xmm2
; KNL-NEXT: vpinsrb $15, 88(%rbp), %xmm2, %xmm2
; KNL-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; KNL-NEXT: vpinsrb $1, 104(%rbp), %xmm3, %xmm3
; KNL-NEXT: vpinsrb $2, 112(%rbp), %xmm3, %xmm3
; KNL-NEXT: vpinsrb $3, 120(%rbp), %xmm3, %xmm3
; KNL-NEXT: vpinsrb $4, 128(%rbp), %xmm3, %xmm3
; KNL-NEXT: vpinsrb $5, 136(%rbp), %xmm3, %xmm3
; KNL-NEXT: vpinsrb $6, 144(%rbp), %xmm3, %xmm3
; KNL-NEXT: vpinsrb $7, 152(%rbp), %xmm3, %xmm3
; KNL-NEXT: vpinsrb $8, 160(%rbp), %xmm3, %xmm3
; KNL-NEXT: vpinsrb $9, 168(%rbp), %xmm3, %xmm3
; KNL-NEXT: vpinsrb $10, 176(%rbp), %xmm3, %xmm3
; KNL-NEXT: vpinsrb $11, 184(%rbp), %xmm3, %xmm3
; KNL-NEXT: vpinsrb $12, 192(%rbp), %xmm3, %xmm3
; KNL-NEXT: vpinsrb $13, 200(%rbp), %xmm3, %xmm3
; KNL-NEXT: vpinsrb $14, 208(%rbp), %xmm3, %xmm3
; KNL-NEXT: vpinsrb $15, 216(%rbp), %xmm3, %xmm3
; KNL-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; KNL-NEXT: movl 744(%rbp), %eax
; KNL-NEXT: andl $127, %eax
; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3
; KNL-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2
; KNL-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2
; KNL-NEXT: vpcmpeqb %ymm3, %ymm1, %ymm1
; KNL-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1
; KNL-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0
; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; KNL-NEXT: cmpb $0, 736(%rbp)
; KNL-NEXT: vmovdqa %ymm3, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovdqa %ymm2, (%rsp)
; KNL-NEXT: setne (%rsp,%rax)
; KNL-NEXT: vmovdqa (%rsp), %ymm2
; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm3
; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm1
; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm0
; KNL-NEXT: vpmovsxbd %xmm2, %zmm4
; KNL-NEXT: vpslld $31, %zmm4, %zmm4
; KNL-NEXT: vptestmd %zmm4, %zmm4, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: vextracti128 $1, %ymm2, %xmm2
; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
; KNL-NEXT: vpslld $31, %zmm2, %zmm2
; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: shll $16, %ecx
; KNL-NEXT: orl %eax, %ecx
; KNL-NEXT: vpmovsxbd %xmm3, %zmm2
; KNL-NEXT: vpslld $31, %zmm2, %zmm2
; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
; KNL-NEXT: kmovw %k0, %edx
; KNL-NEXT: vextracti128 $1, %ymm3, %xmm2
; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
; KNL-NEXT: vpslld $31, %zmm2, %zmm2
; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: shll $16, %eax
; KNL-NEXT: orl %edx, %eax
; KNL-NEXT: shlq $32, %rax
; KNL-NEXT: orq %rcx, %rax
; KNL-NEXT: vpmovsxbd %xmm1, %zmm2
; KNL-NEXT: vpslld $31, %zmm2, %zmm2
; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: vextracti128 $1, %ymm1, %xmm1
; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL-NEXT: vpslld $31, %zmm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
; KNL-NEXT: kmovw %k0, %esi
; KNL-NEXT: shll $16, %esi
; KNL-NEXT: orl %ecx, %esi
; KNL-NEXT: vpmovsxbd %xmm0, %zmm1
; KNL-NEXT: vpslld $31, %zmm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %edx
; KNL-NEXT: shll $16, %edx
; KNL-NEXT: orl %ecx, %edx
; KNL-NEXT: shlq $32, %rdx
; KNL-NEXT: orq %rsi, %rdx
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_insertelement_variable_v96i1:
; SKX: ## %bb.0:
; SKX-NEXT: pushq %rbp
; SKX-NEXT: .cfi_def_cfa_offset 16
; SKX-NEXT: .cfi_offset %rbp, -16
; SKX-NEXT: movq %rsp, %rbp
; SKX-NEXT: .cfi_def_cfa_register %rbp
; SKX-NEXT: andq $-128, %rsp
; SKX-NEXT: subq $256, %rsp ## imm = 0x100
; SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SKX-NEXT: vpinsrb $1, 232(%rbp), %xmm0, %xmm0
; SKX-NEXT: vpinsrb $2, 240(%rbp), %xmm0, %xmm0
; SKX-NEXT: vpinsrb $3, 248(%rbp), %xmm0, %xmm0
; SKX-NEXT: vpinsrb $4, 256(%rbp), %xmm0, %xmm0
; SKX-NEXT: vpinsrb $5, 264(%rbp), %xmm0, %xmm0
; SKX-NEXT: vpinsrb $6, 272(%rbp), %xmm0, %xmm0
; SKX-NEXT: vpinsrb $7, 280(%rbp), %xmm0, %xmm0
; SKX-NEXT: vpinsrb $8, 288(%rbp), %xmm0, %xmm0
; SKX-NEXT: vpinsrb $9, 296(%rbp), %xmm0, %xmm0
; SKX-NEXT: vpinsrb $10, 304(%rbp), %xmm0, %xmm0
; SKX-NEXT: vpinsrb $11, 312(%rbp), %xmm0, %xmm0
; SKX-NEXT: vpinsrb $12, 320(%rbp), %xmm0, %xmm0
; SKX-NEXT: vpinsrb $13, 328(%rbp), %xmm0, %xmm0
; SKX-NEXT: vpinsrb $14, 336(%rbp), %xmm0, %xmm0
; SKX-NEXT: vpinsrb $15, 344(%rbp), %xmm0, %xmm0
; SKX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SKX-NEXT: vpinsrb $1, 360(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $2, 368(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $3, 376(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $4, 384(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $5, 392(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $6, 400(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $7, 408(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $8, 416(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $9, 424(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $10, 432(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $11, 440(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $12, 448(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $13, 456(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $14, 464(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $15, 472(%rbp), %xmm1, %xmm1
; SKX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; SKX-NEXT: vmovd %edi, %xmm1
; SKX-NEXT: vpinsrb $1, %esi, %xmm1, %xmm1
; SKX-NEXT: vpinsrb $2, %edx, %xmm1, %xmm1
; SKX-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1
; SKX-NEXT: vpinsrb $4, %r8d, %xmm1, %xmm1
; SKX-NEXT: vpinsrb $5, %r9d, %xmm1, %xmm1
; SKX-NEXT: vpinsrb $6, 16(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $7, 24(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $8, 32(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $9, 40(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $10, 48(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $11, 56(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $12, 64(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $13, 72(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $14, 80(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $15, 88(%rbp), %xmm1, %xmm1
; SKX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SKX-NEXT: vpinsrb $1, 104(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $2, 112(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $3, 120(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $4, 128(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $5, 136(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $6, 144(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $7, 152(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $8, 160(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $9, 168(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $10, 176(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $11, 184(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $12, 192(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $13, 200(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $14, 208(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $15, 216(%rbp), %xmm2, %xmm2
; SKX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; SKX-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; SKX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SKX-NEXT: vpinsrb $1, 488(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $2, 496(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $3, 504(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $4, 512(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $5, 520(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $6, 528(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $7, 536(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $8, 544(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $9, 552(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $10, 560(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $11, 568(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $12, 576(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $13, 584(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $14, 592(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $15, 600(%rbp), %xmm1, %xmm1
; SKX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SKX-NEXT: vpinsrb $1, 616(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $2, 624(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $3, 632(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $4, 640(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $5, 648(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $6, 656(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $7, 664(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $8, 672(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $9, 680(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $10, 688(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $11, 696(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $12, 704(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $13, 712(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $14, 720(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $15, 728(%rbp), %xmm2, %xmm2
; SKX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; SKX-NEXT: movl 744(%rbp), %eax
; SKX-NEXT: andl $127, %eax
; SKX-NEXT: vptestmb %zmm0, %zmm0, %k0
; SKX-NEXT: vptestmb %zmm1, %zmm1, %k1
; SKX-NEXT: cmpb $0, 736(%rbp)
; SKX-NEXT: vpmovm2b %k1, %zmm0
; SKX-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; SKX-NEXT: vpmovm2b %k0, %zmm0
; SKX-NEXT: vmovdqa64 %zmm0, (%rsp)
; SKX-NEXT: setne (%rsp,%rax)
; SKX-NEXT: vpsllw $7, {{[0-9]+}}(%rsp), %zmm0
; SKX-NEXT: vpmovb2m %zmm0, %k0
; SKX-NEXT: vpsllw $7, (%rsp), %zmm0
; SKX-NEXT: vpmovb2m %zmm0, %k1
; SKX-NEXT: kmovq %k1, %rax
; SKX-NEXT: kmovq %k0, %rdx
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t1 = icmp ugt <96 x i8> %a, zeroinitializer
%t2 = icmp ugt i8 %b, 0
%t3 = insertelement <96 x i1> %t1, i1 %t2, i32 %index
%t4 = bitcast <96 x i1> %t3 to i96
ret i96 %t4
}
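; The v128i1 case takes <128 x i8> in vector registers (four ymm on KNL,
; two zmm on SKX), so no vpinsrb reassembly is needed.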
define i128 @test_insertelement_variable_v128i1(<128 x i8> %a, i8 %b, i32 %index) {
; KNL-LABEL: test_insertelement_variable_v128i1:
; KNL: ## %bb.0:
; KNL-NEXT: pushq %rbp
; KNL-NEXT: .cfi_def_cfa_offset 16
; KNL-NEXT: .cfi_offset %rbp, -16
; KNL-NEXT: movq %rsp, %rbp
; KNL-NEXT: .cfi_def_cfa_register %rbp
; KNL-NEXT: andq $-128, %rsp
; KNL-NEXT: subq $256, %rsp ## imm = 0x100
; KNL-NEXT: ## kill: def $esi killed $esi def $rsi
; KNL-NEXT: vpxor %xmm4, %xmm4, %xmm4
; KNL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0
; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; KNL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm1
; KNL-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1
; KNL-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm2
; KNL-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2
; KNL-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm3
; KNL-NEXT: vpternlogq $15, %zmm3, %zmm3, %zmm3
; KNL-NEXT: andl $127, %esi
; KNL-NEXT: testb %dil, %dil
; KNL-NEXT: vmovdqa %ymm3, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovdqa %ymm2, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovdqa %ymm0, (%rsp)
; KNL-NEXT: setne (%rsp,%rsi)
; KNL-NEXT: vmovdqa (%rsp), %ymm2
; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm3
; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm1
; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm0
; KNL-NEXT: vpmovsxbd %xmm2, %zmm4
; KNL-NEXT: vpslld $31, %zmm4, %zmm4
; KNL-NEXT: vptestmd %zmm4, %zmm4, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: vextracti128 $1, %ymm2, %xmm2
; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
; KNL-NEXT: vpslld $31, %zmm2, %zmm2
; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: shll $16, %ecx
; KNL-NEXT: orl %eax, %ecx
; KNL-NEXT: vpmovsxbd %xmm3, %zmm2
; KNL-NEXT: vpslld $31, %zmm2, %zmm2
; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
; KNL-NEXT: kmovw %k0, %edx
; KNL-NEXT: vextracti128 $1, %ymm3, %xmm2
; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
; KNL-NEXT: vpslld $31, %zmm2, %zmm2
; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: shll $16, %eax
; KNL-NEXT: orl %edx, %eax
; KNL-NEXT: shlq $32, %rax
; KNL-NEXT: orq %rcx, %rax
; KNL-NEXT: vpmovsxbd %xmm1, %zmm2
; KNL-NEXT: vpslld $31, %zmm2, %zmm2
; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: vextracti128 $1, %ymm1, %xmm1
; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL-NEXT: vpslld $31, %zmm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
; KNL-NEXT: kmovw %k0, %esi
; KNL-NEXT: shll $16, %esi
; KNL-NEXT: orl %ecx, %esi
; KNL-NEXT: vpmovsxbd %xmm0, %zmm1
; KNL-NEXT: vpslld $31, %zmm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %edx
; KNL-NEXT: shll $16, %edx
; KNL-NEXT: orl %ecx, %edx
; KNL-NEXT: shlq $32, %rdx
; KNL-NEXT: orq %rsi, %rdx
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_insertelement_variable_v128i1:
; SKX: ## %bb.0:
; SKX-NEXT: pushq %rbp
; SKX-NEXT: .cfi_def_cfa_offset 16
; SKX-NEXT: .cfi_offset %rbp, -16
; SKX-NEXT: movq %rsp, %rbp
; SKX-NEXT: .cfi_def_cfa_register %rbp
; SKX-NEXT: andq $-128, %rsp
; SKX-NEXT: subq $256, %rsp ## imm = 0x100
; SKX-NEXT: ## kill: def $esi killed $esi def $rsi
; SKX-NEXT: vptestmb %zmm0, %zmm0, %k0
; SKX-NEXT: vptestmb %zmm1, %zmm1, %k1
; SKX-NEXT: andl $127, %esi
; SKX-NEXT: testb %dil, %dil
; SKX-NEXT: vpmovm2b %k1, %zmm0
; SKX-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; SKX-NEXT: vpmovm2b %k0, %zmm0
; SKX-NEXT: vmovdqa64 %zmm0, (%rsp)
; SKX-NEXT: setne (%rsp,%rsi)
; SKX-NEXT: vpsllw $7, {{[0-9]+}}(%rsp), %zmm0
; SKX-NEXT: vpmovb2m %zmm0, %k0
; SKX-NEXT: vpsllw $7, (%rsp), %zmm0
; SKX-NEXT: vpmovb2m %zmm0, %k1
; SKX-NEXT: kmovq %k1, %rax
; SKX-NEXT: kmovq %k0, %rdx
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t1 = icmp ugt <128 x i8> %a, zeroinitializer
%t2 = icmp ugt i8 %b, 0
%t3 = insertelement <128 x i1> %t1, i1 %t2, i32 %index
%t4 = bitcast <128 x i1> %t3 to i128
ret i128 %t4
}