; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck --check-prefix=CHECK --check-prefix=KNL %s
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck --check-prefix=CHECK --check-prefix=SKX --check-prefix=SKX_ONLY %s
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq,+avx512vbmi | FileCheck --check-prefix=CHECK --check-prefix=SKX --check-prefix=SKX_VBMI %s

define <16 x float> @test1(<16 x float> %x, float* %br, float %y) nounwind {
; CHECK-LABEL: test1:
; CHECK: ## %bb.0:
; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],mem[0],xmm0[2,3]
; CHECK-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm2
; CHECK-NEXT: vextractf32x4 $3, %zmm0, %xmm0
; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; CHECK-NEXT: vinsertf32x4 $3, %xmm0, %zmm2, %zmm0
; CHECK-NEXT: retq
  %rrr = load float, float* %br
  %rrr2 = insertelement <16 x float> %x, float %rrr, i32 1
  %rrr3 = insertelement <16 x float> %rrr2, float %y, i32 14
  ret <16 x float> %rrr3
}

define <8 x double> @test2(<8 x double> %x, double* %br, double %y) nounwind {
; CHECK-LABEL: test2:
; CHECK: ## %bb.0:
; CHECK-NEXT: vmovhpd {{.*#+}} xmm2 = xmm0[0],mem[0]
; CHECK-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm2
; CHECK-NEXT: vextractf32x4 $3, %zmm0, %xmm0
; CHECK-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; CHECK-NEXT: vinsertf32x4 $3, %xmm0, %zmm2, %zmm0
; CHECK-NEXT: retq
  %rrr = load double, double* %br
  %rrr2 = insertelement <8 x double> %x, double %rrr, i32 1
  %rrr3 = insertelement <8 x double> %rrr2, double %y, i32 6
  ret <8 x double> %rrr3
}

define <16 x float> @test3(<16 x float> %x) nounwind {
; CHECK-LABEL: test3:
; CHECK: ## %bb.0:
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[2,3]
; CHECK-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
  %eee = extractelement <16 x float> %x, i32 4
  %rrr2 = insertelement <16 x float> %x, float %eee, i32 1
  ret <16 x float> %rrr2
}

define <8 x i64> @test4(<8 x i64> %x) nounwind {
; CHECK-LABEL: test4:
; CHECK: ## %bb.0:
; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm1
; CHECK-NEXT: vmovq %xmm1, %rax
; CHECK-NEXT: vpinsrq $1, %rax, %xmm0, %xmm1
; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
  %eee = extractelement <8 x i64> %x, i32 4
  %rrr2 = insertelement <8 x i64> %x, i64 %eee, i32 1
  ret <8 x i64> %rrr2
}

define i32 @test5(<4 x float> %x) nounwind {
; CHECK-LABEL: test5:
; CHECK: ## %bb.0:
; CHECK-NEXT: vextractps $3, %xmm0, %eax
; CHECK-NEXT: retq
  %ef = extractelement <4 x float> %x, i32 3
  %ei = bitcast float %ef to i32
  ret i32 %ei
}

define void @test6(<4 x float> %x, float* %out) nounwind {
; CHECK-LABEL: test6:
; CHECK: ## %bb.0:
; CHECK-NEXT: vextractps $3, %xmm0, (%rdi)
; CHECK-NEXT: retq
  %ef = extractelement <4 x float> %x, i32 3
  store float %ef, float* %out, align 4
  ret void
}

define float @test7(<16 x float> %x, i32 %ind) nounwind {
; CHECK-LABEL: test7:
; CHECK: ## %bb.0:
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: movq %rsp, %rbp
; CHECK-NEXT: andq $-64, %rsp
; CHECK-NEXT: subq $128, %rsp
; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT: vmovaps %zmm0, (%rsp)
; CHECK-NEXT: andl $15, %edi
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: movq %rbp, %rsp
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %e = extractelement <16 x float> %x, i32 %ind
  ret float %e
}

define double @test8(<8 x double> %x, i32 %ind) nounwind {
; CHECK-LABEL: test8:
; CHECK: ## %bb.0:
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: movq %rsp, %rbp
; CHECK-NEXT: andq $-64, %rsp
; CHECK-NEXT: subq $128, %rsp
; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT: vmovaps %zmm0, (%rsp)
; CHECK-NEXT: andl $7, %edi
; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movq %rbp, %rsp
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %e = extractelement <8 x double> %x, i32 %ind
  ret double %e
}

define float @test9(<8 x float> %x, i32 %ind) nounwind {
; CHECK-LABEL: test9:
; CHECK: ## %bb.0:
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: movq %rsp, %rbp
; CHECK-NEXT: andq $-32, %rsp
; CHECK-NEXT: subq $64, %rsp
; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT: vmovaps %ymm0, (%rsp)
; CHECK-NEXT: andl $7, %edi
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: movq %rbp, %rsp
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %e = extractelement <8 x float> %x, i32 %ind
  ret float %e
}

define i32 @test10(<16 x i32> %x, i32 %ind) nounwind {
; CHECK-LABEL: test10:
; CHECK: ## %bb.0:
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: movq %rsp, %rbp
; CHECK-NEXT: andq $-64, %rsp
; CHECK-NEXT: subq $128, %rsp
; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT: vmovaps %zmm0, (%rsp)
; CHECK-NEXT: andl $15, %edi
; CHECK-NEXT: movl (%rsp,%rdi,4), %eax
; CHECK-NEXT: movq %rbp, %rsp
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %e = extractelement <16 x i32> %x, i32 %ind
  ret i32 %e
}

define <16 x i32> @test11(<16 x i32>%a, <16 x i32>%b) {
; KNL-LABEL: test11:
; KNL: ## %bb.0:
; KNL-NEXT: vpcmpltud %zmm1, %zmm0, %k0
; KNL-NEXT: kshiftrw $4, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: testb $1, %al
; KNL-NEXT: je LBB10_2
; KNL-NEXT: ## %bb.1: ## %A
; KNL-NEXT: vmovdqa64 %zmm1, %zmm0
; KNL-NEXT: retq
; KNL-NEXT: LBB10_2: ## %B
; KNL-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: test11:
; SKX: ## %bb.0:
; SKX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
; SKX-NEXT: kshiftrw $4, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: testb $1, %al
; SKX-NEXT: je LBB10_2
; SKX-NEXT: ## %bb.1: ## %A
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0
; SKX-NEXT: retq
; SKX-NEXT: LBB10_2: ## %B
; SKX-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; SKX-NEXT: retq
  %cmp_res = icmp ult <16 x i32> %a, %b
  %ia = extractelement <16 x i1> %cmp_res, i32 4
  br i1 %ia, label %A, label %B
A:
  ret <16 x i32>%b
B:
  %c = add <16 x i32>%b, %a
  ret <16 x i32>%c
}

define i64 @test12(<16 x i64>%a, <16 x i64>%b, i64 %a1, i64 %b1) {
; KNL-LABEL: test12:
; KNL: ## %bb.0:
; KNL-NEXT: movq %rdi, %rax
; KNL-NEXT: vpcmpgtq %zmm0, %zmm2, %k0
; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: testb $1, %cl
; KNL-NEXT: cmoveq %rsi, %rax
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test12:
; SKX: ## %bb.0:
; SKX-NEXT: movq %rdi, %rax
; SKX-NEXT: vpcmpgtq %zmm0, %zmm2, %k0
; SKX-NEXT: kmovd %k0, %ecx
; SKX-NEXT: testb $1, %cl
; SKX-NEXT: cmoveq %rsi, %rax
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
  %cmpvector_func.i = icmp slt <16 x i64> %a, %b
  %extract24vector_func.i = extractelement <16 x i1> %cmpvector_func.i, i32 0
  %res = select i1 %extract24vector_func.i, i64 %a1, i64 %b1
  ret i64 %res
}

define i16 @test13(i32 %a, i32 %b) {
; KNL-LABEL: test13:
; KNL: ## %bb.0:
; KNL-NEXT: cmpl %esi, %edi
; KNL-NEXT: setb %al
; KNL-NEXT: movw $-4, %cx
; KNL-NEXT: kmovw %ecx, %k0
; KNL-NEXT: kshiftrw $1, %k0, %k0
; KNL-NEXT: kshiftlw $1, %k0, %k0
; KNL-NEXT: andl $1, %eax
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: korw %k1, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: ## kill: def $ax killed $ax killed $eax
; KNL-NEXT: retq
;
; SKX-LABEL: test13:
; SKX: ## %bb.0:
; SKX-NEXT: cmpl %esi, %edi
; SKX-NEXT: setb %al
; SKX-NEXT: movw $-4, %cx
; SKX-NEXT: kmovd %ecx, %k0
; SKX-NEXT: kshiftrw $1, %k0, %k0
; SKX-NEXT: kshiftlw $1, %k0, %k0
; SKX-NEXT: andl $1, %eax
; SKX-NEXT: kmovw %eax, %k1
; SKX-NEXT: korw %k1, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: ## kill: def $ax killed $ax killed $eax
; SKX-NEXT: retq
  %cmp_res = icmp ult i32 %a, %b
  %maskv = insertelement <16 x i1> <i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i1 %cmp_res, i32 0
  %res = bitcast <16 x i1> %maskv to i16
  ret i16 %res
}

define i64 @test14(<8 x i64>%a, <8 x i64>%b, i64 %a1, i64 %b1) {
; KNL-LABEL: test14:
; KNL: ## %bb.0:
; KNL-NEXT: movq %rdi, %rax
; KNL-NEXT: vpcmpgtq %zmm0, %zmm1, %k0
; KNL-NEXT: kshiftrw $4, %k0, %k0
; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: testb $1, %cl
; KNL-NEXT: cmoveq %rsi, %rax
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test14:
; SKX: ## %bb.0:
; SKX-NEXT: movq %rdi, %rax
; SKX-NEXT: vpcmpgtq %zmm0, %zmm1, %k0
; SKX-NEXT: kshiftrw $4, %k0, %k0
; SKX-NEXT: kmovd %k0, %ecx
; SKX-NEXT: testb $1, %cl
; SKX-NEXT: cmoveq %rsi, %rax
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
  %cmpvector_func.i = icmp slt <8 x i64> %a, %b
  %extract24vector_func.i = extractelement <8 x i1> %cmpvector_func.i, i32 4
  %res = select i1 %extract24vector_func.i, i64 %a1, i64 %b1
  ret i64 %res
}

define i16 @test15(i1 *%addr) {
; CHECK-LABEL: test15:
; CHECK: ## %bb.0:
; CHECK-NEXT: xorl %ecx, %ecx
; CHECK-NEXT: cmpb $0, (%rdi)
; CHECK-NEXT: movl $65535, %eax ## imm = 0xFFFF
; CHECK-NEXT: cmovel %ecx, %eax
; CHECK-NEXT: ## kill: def $ax killed $ax killed $eax
; CHECK-NEXT: retq
  %x = load i1 , i1 * %addr, align 1
  %x1 = insertelement <16 x i1> undef, i1 %x, i32 10
  %x2 = bitcast <16 x i1>%x1 to i16
  ret i16 %x2
}

define i16 @test16(i1 *%addr, i16 %a) {
; KNL-LABEL: test16:
; KNL: ## %bb.0:
; KNL-NEXT: movb (%rdi), %al
; KNL-NEXT: kmovw %esi, %k0
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: kshiftrw $10, %k0, %k2
; KNL-NEXT: kxorw %k1, %k2, %k1
; KNL-NEXT: kshiftlw $15, %k1, %k1
; KNL-NEXT: kshiftrw $5, %k1, %k1
; KNL-NEXT: kxorw %k1, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: ## kill: def $ax killed $ax killed $eax
; KNL-NEXT: retq
;
; SKX-LABEL: test16:
; SKX: ## %bb.0:
; SKX-NEXT: kmovb (%rdi), %k0
; SKX-NEXT: kmovd %esi, %k1
; SKX-NEXT: kshiftrw $10, %k1, %k2
; SKX-NEXT: kxorw %k0, %k2, %k0
; SKX-NEXT: kshiftlw $15, %k0, %k0
; SKX-NEXT: kshiftrw $5, %k0, %k0
; SKX-NEXT: kxorw %k0, %k1, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: ## kill: def $ax killed $ax killed $eax
; SKX-NEXT: retq
  %x = load i1 , i1 * %addr, align 128
  %a1 = bitcast i16 %a to <16 x i1>
  %x1 = insertelement <16 x i1> %a1, i1 %x, i32 10
  %x2 = bitcast <16 x i1>%x1 to i16
  ret i16 %x2
}

define i8 @test17(i1 *%addr, i8 %a) {
; KNL-LABEL: test17:
; KNL: ## %bb.0:
; KNL-NEXT: movb (%rdi), %al
; KNL-NEXT: kmovw %esi, %k0
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: kshiftrw $4, %k0, %k2
; KNL-NEXT: kxorw %k1, %k2, %k1
; KNL-NEXT: kshiftlw $15, %k1, %k1
; KNL-NEXT: kshiftrw $11, %k1, %k1
; KNL-NEXT: kxorw %k1, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: ## kill: def $al killed $al killed $eax
; KNL-NEXT: retq
;
; SKX-LABEL: test17:
; SKX: ## %bb.0:
; SKX-NEXT: kmovb (%rdi), %k0
; SKX-NEXT: kmovd %esi, %k1
; SKX-NEXT: kshiftrb $4, %k1, %k2
; SKX-NEXT: kxorb %k0, %k2, %k0
; SKX-NEXT: kshiftlb $7, %k0, %k0
; SKX-NEXT: kshiftrb $3, %k0, %k0
; SKX-NEXT: kxorb %k0, %k1, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: ## kill: def $al killed $al killed $eax
; SKX-NEXT: retq
  %x = load i1 , i1 * %addr, align 128
  %a1 = bitcast i8 %a to <8 x i1>
  %x1 = insertelement <8 x i1> %a1, i1 %x, i32 4
  %x2 = bitcast <8 x i1>%x1 to i8
  ret i8 %x2
}

define i64 @extract_v8i64(<8 x i64> %x, i64* %dst) {
; CHECK-LABEL: extract_v8i64:
; CHECK: ## %bb.0:
; CHECK-NEXT: vpextrq $1, %xmm0, %rax
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpextrq $1, %xmm0, (%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %r1 = extractelement <8 x i64> %x, i32 1
  %r2 = extractelement <8 x i64> %x, i32 3
  store i64 %r2, i64* %dst, align 1
  ret i64 %r1
}

define i64 @extract_v4i64(<4 x i64> %x, i64* %dst) {
; CHECK-LABEL: extract_v4i64:
; CHECK: ## %bb.0:
; CHECK-NEXT: vpextrq $1, %xmm0, %rax
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpextrq $1, %xmm0, (%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %r1 = extractelement <4 x i64> %x, i32 1
  %r2 = extractelement <4 x i64> %x, i32 3
  store i64 %r2, i64* %dst, align 1
  ret i64 %r1
}

define i64 @extract_v2i64(<2 x i64> %x, i64* %dst) {
; CHECK-LABEL: extract_v2i64:
; CHECK: ## %bb.0:
; CHECK-NEXT: vmovq %xmm0, %rax
; CHECK-NEXT: vpextrq $1, %xmm0, (%rdi)
; CHECK-NEXT: retq
  %r1 = extractelement <2 x i64> %x, i32 0
  %r2 = extractelement <2 x i64> %x, i32 1
  store i64 %r2, i64* %dst, align 1
  ret i64 %r1
}

define i32 @extract_v16i32(<16 x i32> %x, i32* %dst) {
; CHECK-LABEL: extract_v16i32:
; CHECK: ## %bb.0:
; CHECK-NEXT: vextractps $1, %xmm0, %eax
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT: vextractps $1, %xmm0, (%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %r1 = extractelement <16 x i32> %x, i32 1
  %r2 = extractelement <16 x i32> %x, i32 5
  store i32 %r2, i32* %dst, align 1
  ret i32 %r1
}

define i32 @extract_v8i32(<8 x i32> %x, i32* %dst) {
; CHECK-LABEL: extract_v8i32:
; CHECK: ## %bb.0:
; CHECK-NEXT: vextractps $1, %xmm0, %eax
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT: vextractps $1, %xmm0, (%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %r1 = extractelement <8 x i32> %x, i32 1
  %r2 = extractelement <8 x i32> %x, i32 5
  store i32 %r2, i32* %dst, align 1
  ret i32 %r1
}

define i32 @extract_v4i32(<4 x i32> %x, i32* %dst) {
; CHECK-LABEL: extract_v4i32:
; CHECK: ## %bb.0:
; CHECK-NEXT: vextractps $1, %xmm0, %eax
; CHECK-NEXT: vextractps $3, %xmm0, (%rdi)
; CHECK-NEXT: retq
  %r1 = extractelement <4 x i32> %x, i32 1
  %r2 = extractelement <4 x i32> %x, i32 3
  store i32 %r2, i32* %dst, align 1
  ret i32 %r1
}

define i16 @extract_v32i16(<32 x i16> %x, i16* %dst) {
; CHECK-LABEL: extract_v32i16:
; CHECK: ## %bb.0:
; CHECK-NEXT: vpextrw $1, %xmm0, %eax
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpextrw $1, %xmm0, (%rdi)
; CHECK-NEXT: ## kill: def $ax killed $ax killed $eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %r1 = extractelement <32 x i16> %x, i32 1
  %r2 = extractelement <32 x i16> %x, i32 9
  store i16 %r2, i16* %dst, align 1
  ret i16 %r1
}

define i16 @extract_v16i16(<16 x i16> %x, i16* %dst) {
; CHECK-LABEL: extract_v16i16:
; CHECK: ## %bb.0:
; CHECK-NEXT: vpextrw $1, %xmm0, %eax
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpextrw $1, %xmm0, (%rdi)
; CHECK-NEXT: ## kill: def $ax killed $ax killed $eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %r1 = extractelement <16 x i16> %x, i32 1
  %r2 = extractelement <16 x i16> %x, i32 9
  store i16 %r2, i16* %dst, align 1
  ret i16 %r1
}

define i16 @extract_v8i16(<8 x i16> %x, i16* %dst) {
; CHECK-LABEL: extract_v8i16:
; CHECK: ## %bb.0:
; CHECK-NEXT: vpextrw $1, %xmm0, %eax
; CHECK-NEXT: vpextrw $3, %xmm0, (%rdi)
; CHECK-NEXT: ## kill: def $ax killed $ax killed $eax
; CHECK-NEXT: retq
  %r1 = extractelement <8 x i16> %x, i32 1
  %r2 = extractelement <8 x i16> %x, i32 3
  store i16 %r2, i16* %dst, align 1
  ret i16 %r1
}

define i8 @extract_v64i8(<64 x i8> %x, i8* %dst) {
; CHECK-LABEL: extract_v64i8:
; CHECK: ## %bb.0:
; CHECK-NEXT: vpextrb $1, %xmm0, %eax
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpextrb $1, %xmm0, (%rdi)
; CHECK-NEXT: ## kill: def $al killed $al killed $eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %r1 = extractelement <64 x i8> %x, i32 1
  %r2 = extractelement <64 x i8> %x, i32 17
  store i8 %r2, i8* %dst, align 1
  ret i8 %r1
}

define i8 @extract_v32i8(<32 x i8> %x, i8* %dst) {
; CHECK-LABEL: extract_v32i8:
; CHECK: ## %bb.0:
; CHECK-NEXT: vpextrb $1, %xmm0, %eax
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpextrb $1, %xmm0, (%rdi)
; CHECK-NEXT: ## kill: def $al killed $al killed $eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %r1 = extractelement <32 x i8> %x, i32 1
  %r2 = extractelement <32 x i8> %x, i32 17
  store i8 %r2, i8* %dst, align 1
  ret i8 %r1
}

define i8 @extract_v16i8(<16 x i8> %x, i8* %dst) {
; CHECK-LABEL: extract_v16i8:
; CHECK: ## %bb.0:
; CHECK-NEXT: vpextrb $1, %xmm0, %eax
; CHECK-NEXT: vpextrb $3, %xmm0, (%rdi)
; CHECK-NEXT: ## kill: def $al killed $al killed $eax
; CHECK-NEXT: retq
  %r1 = extractelement <16 x i8> %x, i32 1
  %r2 = extractelement <16 x i8> %x, i32 3
  store i8 %r2, i8* %dst, align 1
  ret i8 %r1
}

define <8 x i64> @insert_v8i64(<8 x i64> %x, i64 %y , i64* %ptr) {
; CHECK-LABEL: insert_v8i64:
; CHECK: ## %bb.0:
; CHECK-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1
; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0
; CHECK-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
  %val = load i64, i64* %ptr
  %r1 = insertelement <8 x i64> %x, i64 %val, i32 1
  %r2 = insertelement <8 x i64> %r1, i64 %y, i32 3
  ret <8 x i64> %r2
}

define <4 x i64> @insert_v4i64(<4 x i64> %x, i64 %y , i64* %ptr) {
; CHECK-LABEL: insert_v4i64:
; CHECK: ## %bb.0:
; CHECK-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1
; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0
; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; CHECK-NEXT: retq
  %val = load i64, i64* %ptr
  %r1 = insertelement <4 x i64> %x, i64 %val, i32 1
  %r2 = insertelement <4 x i64> %r1, i64 %y, i32 3
  ret <4 x i64> %r2
}

define <2 x i64> @insert_v2i64(<2 x i64> %x, i64 %y , i64* %ptr) {
; CHECK-LABEL: insert_v2i64:
; CHECK: ## %bb.0:
; CHECK-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm0
; CHECK-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm0
; CHECK-NEXT: retq
  %val = load i64, i64* %ptr
  %r1 = insertelement <2 x i64> %x, i64 %val, i32 1
  %r2 = insertelement <2 x i64> %r1, i64 %y, i32 0
  ret <2 x i64> %r2
}

define <16 x i32> @insert_v16i32(<16 x i32> %x, i32 %y, i32* %ptr) {
; CHECK-LABEL: insert_v16i32:
; CHECK: ## %bb.0:
; CHECK-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm1
; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0
; CHECK-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
  %val = load i32, i32* %ptr
  %r1 = insertelement <16 x i32> %x, i32 %val, i32 1
  %r2 = insertelement <16 x i32> %r1, i32 %y, i32 5
  ret <16 x i32> %r2
}

define <8 x i32> @insert_v8i32(<8 x i32> %x, i32 %y, i32* %ptr) {
; CHECK-LABEL: insert_v8i32:
; CHECK: ## %bb.0:
; CHECK-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm1
; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0
; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; CHECK-NEXT: retq
  %val = load i32, i32* %ptr
  %r1 = insertelement <8 x i32> %x, i32 %val, i32 1
  %r2 = insertelement <8 x i32> %r1, i32 %y, i32 5
  ret <8 x i32> %r2
}

define <4 x i32> @insert_v4i32(<4 x i32> %x, i32 %y, i32* %ptr) {
; CHECK-LABEL: insert_v4i32:
; CHECK: ## %bb.0:
; CHECK-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm0
; CHECK-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0
; CHECK-NEXT: retq
  %val = load i32, i32* %ptr
  %r1 = insertelement <4 x i32> %x, i32 %val, i32 1
  %r2 = insertelement <4 x i32> %r1, i32 %y, i32 3
  ret <4 x i32> %r2
}

define <32 x i16> @insert_v32i16(<32 x i16> %x, i16 %y, i16* %ptr) {
; KNL-LABEL: insert_v32i16:
; KNL: ## %bb.0:
; KNL-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm2
; KNL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0
; KNL-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
; KNL-NEXT: retq
;
; SKX-LABEL: insert_v32i16:
; SKX: ## %bb.0:
; SKX-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm1
; SKX-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1
; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
; SKX-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0
; SKX-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
; SKX-NEXT: retq
  %val = load i16, i16* %ptr
  %r1 = insertelement <32 x i16> %x, i16 %val, i32 1
  %r2 = insertelement <32 x i16> %r1, i16 %y, i32 9
  ret <32 x i16> %r2
}

define <16 x i16> @insert_v16i16(<16 x i16> %x, i16 %y, i16* %ptr) {
; CHECK-LABEL: insert_v16i16:
; CHECK: ## %bb.0:
; CHECK-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm1
; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0
; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; CHECK-NEXT: retq
  %val = load i16, i16* %ptr
  %r1 = insertelement <16 x i16> %x, i16 %val, i32 1
  %r2 = insertelement <16 x i16> %r1, i16 %y, i32 9
  ret <16 x i16> %r2
}

define <8 x i16> @insert_v8i16(<8 x i16> %x, i16 %y, i16* %ptr) {
; CHECK-LABEL: insert_v8i16:
; CHECK: ## %bb.0:
; CHECK-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm0
; CHECK-NEXT: vpinsrw $5, %edi, %xmm0, %xmm0
; CHECK-NEXT: retq
  %val = load i16, i16* %ptr
  %r1 = insertelement <8 x i16> %x, i16 %val, i32 1
  %r2 = insertelement <8 x i16> %r1, i16 %y, i32 5
  ret <8 x i16> %r2
}

define <64 x i8> @insert_v64i8(<64 x i8> %x, i8 %y, i8* %ptr) {
; KNL-LABEL: insert_v64i8:
; KNL: ## %bb.0:
; KNL-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm2
; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; KNL-NEXT: vextracti128 $1, %ymm1, %xmm2
; KNL-NEXT: vpinsrb $2, %edi, %xmm2, %xmm2
; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; KNL-NEXT: retq
;
; SKX-LABEL: insert_v64i8:
; SKX: ## %bb.0:
; SKX-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm1
; SKX-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1
; SKX-NEXT: vextracti32x4 $3, %zmm0, %xmm0
; SKX-NEXT: vpinsrb $2, %edi, %xmm0, %xmm0
; SKX-NEXT: vinserti32x4 $3, %xmm0, %zmm1, %zmm0
; SKX-NEXT: retq
  %val = load i8, i8* %ptr
  %r1 = insertelement <64 x i8> %x, i8 %val, i32 1
  %r2 = insertelement <64 x i8> %r1, i8 %y, i32 50
  ret <64 x i8> %r2
}

define <32 x i8> @insert_v32i8(<32 x i8> %x, i8 %y, i8* %ptr) {
; CHECK-LABEL: insert_v32i8:
; CHECK: ## %bb.0:
; CHECK-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm1
; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0
; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; CHECK-NEXT: retq
  %val = load i8, i8* %ptr
  %r1 = insertelement <32 x i8> %x, i8 %val, i32 1
  %r2 = insertelement <32 x i8> %r1, i8 %y, i32 17
  ret <32 x i8> %r2
}

define <16 x i8> @insert_v16i8(<16 x i8> %x, i8 %y, i8* %ptr) {
; CHECK-LABEL: insert_v16i8:
; CHECK: ## %bb.0:
; CHECK-NEXT: vpinsrb $3, (%rsi), %xmm0, %xmm0
; CHECK-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; CHECK-NEXT: retq
  %val = load i8, i8* %ptr
  %r1 = insertelement <16 x i8> %x, i8 %val, i32 3
  %r2 = insertelement <16 x i8> %r1, i8 %y, i32 10
  ret <16 x i8> %r2
}

define <8 x i64> @test_insert_128_v8i64(<8 x i64> %x, i64 %y) {
; CHECK-LABEL: test_insert_128_v8i64:
; CHECK: ## %bb.0:
; CHECK-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm1
; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
  %r = insertelement <8 x i64> %x, i64 %y, i32 1
  ret <8 x i64> %r
}

define <16 x i32> @test_insert_128_v16i32(<16 x i32> %x, i32 %y) {
; CHECK-LABEL: test_insert_128_v16i32:
; CHECK: ## %bb.0:
; CHECK-NEXT: vpinsrd $1, %edi, %xmm0, %xmm1
; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
  %r = insertelement <16 x i32> %x, i32 %y, i32 1
  ret <16 x i32> %r
}

define <8 x double> @test_insert_128_v8f64(<8 x double> %x, double %y) {
; CHECK-LABEL: test_insert_128_v8f64:
; CHECK: ## %bb.0:
; CHECK-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm1[0]
; CHECK-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
  %r = insertelement <8 x double> %x, double %y, i32 1
  ret <8 x double> %r
}

define <16 x float> @test_insert_128_v16f32(<16 x float> %x, float %y) {
; CHECK-LABEL: test_insert_128_v16f32:
; CHECK: ## %bb.0:
; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[2,3]
; CHECK-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
  %r = insertelement <16 x float> %x, float %y, i32 1
  ret <16 x float> %r
}

define <16 x i16> @test_insert_128_v16i16(<16 x i16> %x, i16 %y) {
; CHECK-LABEL: test_insert_128_v16i16:
; CHECK: ## %bb.0:
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpinsrw $2, %edi, %xmm1, %xmm1
; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT: retq
  %r = insertelement <16 x i16> %x, i16 %y, i32 10
  ret <16 x i16> %r
}

define <32 x i8> @test_insert_128_v32i8(<32 x i8> %x, i8 %y) {
; CHECK-LABEL: test_insert_128_v32i8:
; CHECK: ## %bb.0:
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpinsrb $4, %edi, %xmm1, %xmm1
; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT: retq
  %r = insertelement <32 x i8> %x, i8 %y, i32 20
  ret <32 x i8> %r
}

define i32 @test_insertelement_v32i1(i32 %a, i32 %b, <32 x i32> %x , <32 x i32> %y) {
; KNL-LABEL: test_insertelement_v32i1:
; KNL: ## %bb.0:
; KNL-NEXT: cmpl %esi, %edi
; KNL-NEXT: setb %al
; KNL-NEXT: vpcmpltud %zmm3, %zmm1, %k0
; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: shll $16, %ecx
; KNL-NEXT: vpcmpltud %zmm2, %zmm0, %k0
; KNL-NEXT: kshiftrw $4, %k0, %k1
; KNL-NEXT: kmovw %eax, %k2
; KNL-NEXT: kxorw %k2, %k1, %k1
; KNL-NEXT: kshiftlw $15, %k1, %k1
; KNL-NEXT: kshiftrw $11, %k1, %k1
; KNL-NEXT: kxorw %k1, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: orl %ecx, %eax
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_insertelement_v32i1:
; SKX: ## %bb.0:
; SKX-NEXT: cmpl %esi, %edi
; SKX-NEXT: setb %al
; SKX-NEXT: vpcmpltud %zmm2, %zmm0, %k0
; SKX-NEXT: vpcmpltud %zmm3, %zmm1, %k1
; SKX-NEXT: kunpckwd %k0, %k1, %k0
; SKX-NEXT: kshiftrd $4, %k0, %k1
; SKX-NEXT: kmovd %eax, %k2
; SKX-NEXT: kxord %k2, %k1, %k1
; SKX-NEXT: kshiftld $31, %k1, %k1
; SKX-NEXT: kshiftrd $27, %k1, %k1
; SKX-NEXT: kxord %k1, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
  %cmp_res_i1 = icmp ult i32 %a, %b
  %cmp_cmp_vec = icmp ult <32 x i32> %x, %y
  %maskv = insertelement <32 x i1> %cmp_cmp_vec, i1 %cmp_res_i1, i32 4
  %res = bitcast <32 x i1> %maskv to i32
  ret i32 %res
}

define i8 @test_iinsertelement_v4i1(i32 %a, i32 %b, <4 x i32> %x , <4 x i32> %y) {
; KNL-LABEL: test_iinsertelement_v4i1:
; KNL: ## %bb.0:
; KNL-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; KNL-NEXT: cmpl %esi, %edi
; KNL-NEXT: setb %al
; KNL-NEXT: vpcmpltud %zmm1, %zmm0, %k0
; KNL-NEXT: kshiftrw $2, %k0, %k1
; KNL-NEXT: kmovw %eax, %k2
; KNL-NEXT: kxorw %k2, %k1, %k1
; KNL-NEXT: kshiftlw $15, %k1, %k1
; KNL-NEXT: kshiftrw $13, %k1, %k1
; KNL-NEXT: kxorw %k1, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: ## kill: def $al killed $al killed $eax
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_iinsertelement_v4i1:
; SKX: ## %bb.0:
; SKX-NEXT: cmpl %esi, %edi
; SKX-NEXT: setb %al
; SKX-NEXT: vpcmpltud %xmm1, %xmm0, %k0
; SKX-NEXT: kshiftrb $2, %k0, %k1
; SKX-NEXT: kmovd %eax, %k2
; SKX-NEXT: kxorb %k2, %k1, %k1
; SKX-NEXT: kshiftlb $7, %k1, %k1
; SKX-NEXT: kshiftrb $5, %k1, %k1
; SKX-NEXT: kxorw %k1, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: ## kill: def $al killed $al killed $eax
; SKX-NEXT: retq
  %cmp_res_i1 = icmp ult i32 %a, %b
  %cmp_cmp_vec = icmp ult <4 x i32> %x, %y
  %maskv = insertelement <4 x i1> %cmp_cmp_vec, i1 %cmp_res_i1, i32 2
  %res0 = shufflevector <4 x i1> %maskv, <4 x i1> undef , <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
  %res = bitcast <8 x i1> %res0 to i8
  ret i8 %res
}

define i8 @test_iinsertelement_v2i1(i32 %a, i32 %b, <2 x i64> %x , <2 x i64> %y) {
; KNL-LABEL: test_iinsertelement_v2i1:
; KNL: ## %bb.0:
; KNL-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; KNL-NEXT: cmpl %esi, %edi
; KNL-NEXT: setb %al
; KNL-NEXT: vpcmpltuq %zmm1, %zmm0, %k0
; KNL-NEXT: kshiftlw $15, %k0, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: kshiftlw $1, %k1, %k1
; KNL-NEXT: korw %k1, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: ## kill: def $al killed $al killed $eax
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_iinsertelement_v2i1:
; SKX: ## %bb.0:
; SKX-NEXT: cmpl %esi, %edi
; SKX-NEXT: setb %al
; SKX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0
; SKX-NEXT: kshiftlb $7, %k0, %k0
; SKX-NEXT: kshiftrb $7, %k0, %k0
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: kshiftlb $1, %k1, %k1
; SKX-NEXT: korw %k1, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: ## kill: def $al killed $al killed $eax
; SKX-NEXT: retq
  %cmp_res_i1 = icmp ult i32 %a, %b
  %cmp_cmp_vec = icmp ult <2 x i64> %x, %y
  %maskv = insertelement <2 x i1> %cmp_cmp_vec, i1 %cmp_res_i1, i32 1
  %res0 = shufflevector <2 x i1> %maskv, <2 x i1> undef , <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %res = bitcast <8 x i1> %res0 to i8
  ret i8 %res
}

define zeroext i8 @test_extractelement_v2i1(<2 x i64> %a, <2 x i64> %b) {
; KNL-LABEL: test_extractelement_v2i1:
; KNL: ## %bb.0:
; KNL-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; KNL-NEXT: vpcmpnleuq %zmm1, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: andb $1, %al
; KNL-NEXT: movb $4, %cl
; KNL-NEXT: subb %al, %cl
; KNL-NEXT: movzbl %cl, %eax
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_v2i1:
; SKX: ## %bb.0:
; SKX-NEXT: vpcmpnleuq %xmm1, %xmm0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: andb $1, %al
; SKX-NEXT: movb $4, %cl
; SKX-NEXT: subb %al, %cl
; SKX-NEXT: movzbl %cl, %eax
; SKX-NEXT: retq
  %t1 = icmp ugt <2 x i64> %a, %b
  %t2 = extractelement <2 x i1> %t1, i32 0
  %res = select i1 %t2, i8 3, i8 4
  ret i8 %res
}

define zeroext i8 @extractelement_v2i1_alt(<2 x i64> %a, <2 x i64> %b) {
; KNL-LABEL: extractelement_v2i1_alt:
; KNL: ## %bb.0:
; KNL-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; KNL-NEXT: vpcmpnleuq %zmm1, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: andb $1, %al
; KNL-NEXT: movb $4, %cl
; KNL-NEXT: subb %al, %cl
; KNL-NEXT: movzbl %cl, %eax
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: extractelement_v2i1_alt:
; SKX: ## %bb.0:
; SKX-NEXT: vpcmpnleuq %xmm1, %xmm0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: andb $1, %al
; SKX-NEXT: movb $4, %cl
; SKX-NEXT: subb %al, %cl
; SKX-NEXT: movzbl %cl, %eax
; SKX-NEXT: retq
  %t1 = icmp ugt <2 x i64> %a, %b
  %t2 = extractelement <2 x i1> %t1, i32 0
  %sext = sext i1 %t2 to i8
  %res = add i8 %sext, 4
  ret i8 %res
}

define zeroext i8 @test_extractelement_v4i1(<4 x i32> %a, <4 x i32> %b) {
; KNL-LABEL: test_extractelement_v4i1:
; KNL: ## %bb.0:
; KNL-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; KNL-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
; KNL-NEXT: kshiftrw $3, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: andl $1, %eax
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_v4i1:
; SKX: ## %bb.0:
; SKX-NEXT: vpcmpnleud %xmm1, %xmm0, %k0
; SKX-NEXT: kshiftrw $3, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: andl $1, %eax
; SKX-NEXT: retq
  %t1 = icmp ugt <4 x i32> %a, %b
  %t2 = extractelement <4 x i1> %t1, i32 3
  %res = zext i1 %t2 to i8
  ret i8 %res
}

define zeroext i8 @test_extractelement_v32i1(<32 x i8> %a, <32 x i8> %b) {
; KNL-LABEL: test_extractelement_v32i1:
; KNL: ## %bb.0:
; KNL-NEXT: vpminub %ymm1, %ymm0, %ymm1
; KNL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kshiftrw $2, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: andl $1, %eax
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_v32i1:
; SKX: ## %bb.0:
; SKX-NEXT: vpcmpnleub %ymm1, %ymm0, %k0
; SKX-NEXT: kshiftrd $2, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: andl $1, %eax
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
  %t1 = icmp ugt <32 x i8> %a, %b
  %t2 = extractelement <32 x i1> %t1, i32 2
  %res = zext i1 %t2 to i8
  ret i8 %res
}

define zeroext i8 @test_extractelement_v64i1(<64 x i8> %a, <64 x i8> %b) {
; KNL-LABEL: test_extractelement_v64i1:
; KNL: ## %bb.0:
; KNL-NEXT: vpminub %ymm3, %ymm1, %ymm0
; KNL-NEXT: vpcmpeqb %ymm0, %ymm1, %ymm0
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: andb $1, %al
; KNL-NEXT: movb $4, %cl
; KNL-NEXT: subb %al, %cl
; KNL-NEXT: movzbl %cl, %eax
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_v64i1:
; SKX: ## %bb.0:
; SKX-NEXT: vpcmpnleub %zmm1, %zmm0, %k0
; SKX-NEXT: kshiftrq $63, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: andb $1, %al
; SKX-NEXT: movb $4, %cl
; SKX-NEXT: subb %al, %cl
; SKX-NEXT: movzbl %cl, %eax
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
  %t1 = icmp ugt <64 x i8> %a, %b
  %t2 = extractelement <64 x i1> %t1, i32 63
  %res = select i1 %t2, i8 3, i8 4
  ret i8 %res
}

define zeroext i8 @extractelement_v64i1_alt(<64 x i8> %a, <64 x i8> %b) {
; KNL-LABEL: extractelement_v64i1_alt:
; KNL: ## %bb.0:
; KNL-NEXT: vpminub %ymm3, %ymm1, %ymm0
; KNL-NEXT: vpcmpeqb %ymm0, %ymm1, %ymm0
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: andb $1, %al
; KNL-NEXT: movb $4, %cl
; KNL-NEXT: subb %al, %cl
; KNL-NEXT: movzbl %cl, %eax
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: extractelement_v64i1_alt:
; SKX: ## %bb.0:
; SKX-NEXT: vpcmpnleub %zmm1, %zmm0, %k0
; SKX-NEXT: kshiftrq $63, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: andb $1, %al
; SKX-NEXT: movb $4, %cl
; SKX-NEXT: subb %al, %cl
; SKX-NEXT: movzbl %cl, %eax
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
  %t1 = icmp ugt <64 x i8> %a, %b
  %t2 = extractelement <64 x i1> %t1, i32 63
  %sext = sext i1 %t2 to i8
  %res = add i8 %sext, 4
  ret i8 %res
}

define i64 @test_extractelement_variable_v2i64(<2 x i64> %t1, i32 %index) {
; CHECK-LABEL: test_extractelement_variable_v2i64:
; CHECK: ## %bb.0:
; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: movq -24(%rsp,%rdi,8), %rax
; CHECK-NEXT: retq
  %t2 = extractelement <2 x i64> %t1, i32 %index
  ret i64 %t2
}

define i64 @test_extractelement_variable_v4i64(<4 x i64> %t1, i32 %index) {
; CHECK-LABEL: test_extractelement_variable_v4i64:
; CHECK: ## %bb.0:
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset %rbp, -16
; CHECK-NEXT: movq %rsp, %rbp
; CHECK-NEXT: .cfi_def_cfa_register %rbp
; CHECK-NEXT: andq $-32, %rsp
; CHECK-NEXT: subq $64, %rsp
; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT: vmovaps %ymm0, (%rsp)
; CHECK-NEXT: andl $3, %edi
; CHECK-NEXT: movq (%rsp,%rdi,8), %rax
; CHECK-NEXT: movq %rbp, %rsp
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %t2 = extractelement <4 x i64> %t1, i32 %index
  ret i64 %t2
}

define i64 @test_extractelement_variable_v8i64(<8 x i64> %t1, i32 %index) {
; CHECK-LABEL: test_extractelement_variable_v8i64:
; CHECK: ## %bb.0:
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset %rbp, -16
; CHECK-NEXT: movq %rsp, %rbp
; CHECK-NEXT: .cfi_def_cfa_register %rbp
; CHECK-NEXT: andq $-64, %rsp
; CHECK-NEXT: subq $128, %rsp
; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT: vmovaps %zmm0, (%rsp)
; CHECK-NEXT: andl $7, %edi
; CHECK-NEXT: movq (%rsp,%rdi,8), %rax
; CHECK-NEXT: movq %rbp, %rsp
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %t2 = extractelement <8 x i64> %t1, i32 %index
  ret i64 %t2
}

define double @test_extractelement_variable_v2f64(<2 x double> %t1, i32 %index) {
; CHECK-LABEL: test_extractelement_variable_v2f64:
; CHECK: ## %bb.0:
; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: retq
  %t2 = extractelement <2 x double> %t1, i32 %index
  ret double %t2
}

define double @test_extractelement_variable_v4f64(<4 x double> %t1, i32 %index) {
; CHECK-LABEL: test_extractelement_variable_v4f64:
; CHECK: ## %bb.0:
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset %rbp, -16
; CHECK-NEXT: movq %rsp, %rbp
; CHECK-NEXT: .cfi_def_cfa_register %rbp
; CHECK-NEXT: andq $-32, %rsp
; CHECK-NEXT: subq $64, %rsp
; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT: vmovaps %ymm0, (%rsp)
; CHECK-NEXT: andl $3, %edi
; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movq %rbp, %rsp
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %t2 = extractelement <4 x double> %t1, i32 %index
  ret double %t2
}

define double @test_extractelement_variable_v8f64(<8 x double> %t1, i32 %index) {
; CHECK-LABEL: test_extractelement_variable_v8f64:
; CHECK: ## %bb.0:
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset %rbp, -16
; CHECK-NEXT: movq %rsp, %rbp
; CHECK-NEXT: .cfi_def_cfa_register %rbp
; CHECK-NEXT: andq $-64, %rsp
; CHECK-NEXT: subq $128, %rsp
; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT: vmovaps %zmm0, (%rsp)
; CHECK-NEXT: andl $7, %edi
; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movq %rbp, %rsp
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %t2 = extractelement <8 x double> %t1, i32 %index
  ret double %t2
}

define i32 @test_extractelement_variable_v4i32(<4 x i32> %t1, i32 %index) {
; CHECK-LABEL: test_extractelement_variable_v4i32:
; CHECK: ## %bb.0:
; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: andl $3, %edi
; CHECK-NEXT: movl -24(%rsp,%rdi,4), %eax
; CHECK-NEXT: retq
  %t2 = extractelement <4 x i32> %t1, i32 %index
  ret i32 %t2
}

define i32 @test_extractelement_variable_v8i32(<8 x i32> %t1, i32 %index) {
; CHECK-LABEL: test_extractelement_variable_v8i32:
; CHECK: ## %bb.0:
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset %rbp, -16
; CHECK-NEXT: movq %rsp, %rbp
; CHECK-NEXT: .cfi_def_cfa_register %rbp
; CHECK-NEXT: andq $-32, %rsp
; CHECK-NEXT: subq $64, %rsp
; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT: vmovaps %ymm0, (%rsp)
; CHECK-NEXT: andl $7, %edi
; CHECK-NEXT: movl (%rsp,%rdi,4), %eax
; CHECK-NEXT: movq %rbp, %rsp
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %t2 = extractelement <8 x i32> %t1, i32 %index
  ret i32 %t2
}

define i32 @test_extractelement_variable_v16i32(<16 x i32> %t1, i32 %index) {
; CHECK-LABEL: test_extractelement_variable_v16i32:
; CHECK: ## %bb.0:
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset %rbp, -16
; CHECK-NEXT: movq %rsp, %rbp
; CHECK-NEXT: .cfi_def_cfa_register %rbp
; CHECK-NEXT: andq $-64, %rsp
; CHECK-NEXT: subq $128, %rsp
; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT: vmovaps %zmm0, (%rsp)
; CHECK-NEXT: andl $15, %edi
; CHECK-NEXT: movl (%rsp,%rdi,4), %eax
; CHECK-NEXT: movq %rbp, %rsp
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %t2 = extractelement <16 x i32> %t1, i32 %index
  ret i32 %t2
}

define float @test_extractelement_variable_v4f32(<4 x float> %t1, i32 %index) {
; CHECK-LABEL: test_extractelement_variable_v4f32:
; CHECK: ## %bb.0:
; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: andl $3, %edi
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: retq
  %t2 = extractelement <4 x float> %t1, i32 %index
  ret float %t2
}

define float @test_extractelement_variable_v8f32(<8 x float> %t1, i32 %index) {
; CHECK-LABEL: test_extractelement_variable_v8f32:
; CHECK: ## %bb.0:
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset %rbp, -16
; CHECK-NEXT: movq %rsp, %rbp
; CHECK-NEXT: .cfi_def_cfa_register %rbp
; CHECK-NEXT: andq $-32, %rsp
; CHECK-NEXT: subq $64, %rsp
; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT: vmovaps %ymm0, (%rsp)
; CHECK-NEXT: andl $7, %edi
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: movq %rbp, %rsp
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %t2 = extractelement <8 x float> %t1, i32 %index
  ret float %t2
}

define float @test_extractelement_variable_v16f32(<16 x float> %t1, i32 %index) {
; CHECK-LABEL: test_extractelement_variable_v16f32:
; CHECK: ## %bb.0:
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset %rbp, -16
; CHECK-NEXT: movq %rsp, %rbp
; CHECK-NEXT: .cfi_def_cfa_register %rbp
; CHECK-NEXT: andq $-64, %rsp
; CHECK-NEXT: subq $128, %rsp
; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT: vmovaps %zmm0, (%rsp)
; CHECK-NEXT: andl $15, %edi
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: movq %rbp, %rsp
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %t2 = extractelement <16 x float> %t1, i32 %index
  ret float %t2
}

define i16 @test_extractelement_variable_v8i16(<8 x i16> %t1, i32 %index) {
; CHECK-LABEL: test_extractelement_variable_v8i16:
; CHECK: ## %bb.0:
; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: andl $7, %edi
; CHECK-NEXT: movzwl -24(%rsp,%rdi,2), %eax
; CHECK-NEXT: retq
  %t2 = extractelement <8 x i16> %t1, i32 %index
  ret i16 %t2
}

define i16 @test_extractelement_variable_v16i16(<16 x i16> %t1, i32 %index) {
; CHECK-LABEL: test_extractelement_variable_v16i16:
; CHECK: ## %bb.0:
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset %rbp, -16
; CHECK-NEXT: movq %rsp, %rbp
; CHECK-NEXT: .cfi_def_cfa_register %rbp
; CHECK-NEXT: andq $-32, %rsp
; CHECK-NEXT: subq $64, %rsp
; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT: vmovaps %ymm0, (%rsp)
; CHECK-NEXT: andl $15, %edi
; CHECK-NEXT: movzwl (%rsp,%rdi,2), %eax
; CHECK-NEXT: movq %rbp, %rsp
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %t2 = extractelement <16 x i16> %t1, i32 %index
  ret i16 %t2
}

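; Without AVX-512BW the <32 x i16> and <64 x i8> arguments arrive on KNL as two
; ymm halves and are spilled with two 256-bit stores, while SKX spills a single zmm.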
define i16 @test_extractelement_variable_v32i16(<32 x i16> %t1, i32 %index) {
; KNL-LABEL: test_extractelement_variable_v32i16:
; KNL: ## %bb.0:
; KNL-NEXT: pushq %rbp
; KNL-NEXT: .cfi_def_cfa_offset 16
; KNL-NEXT: .cfi_offset %rbp, -16
; KNL-NEXT: movq %rsp, %rbp
; KNL-NEXT: .cfi_def_cfa_register %rbp
; KNL-NEXT: andq $-64, %rsp
; KNL-NEXT: subq $128, %rsp
; KNL-NEXT: ## kill: def $edi killed $edi def $rdi
; KNL-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovaps %ymm0, (%rsp)
; KNL-NEXT: andl $31, %edi
; KNL-NEXT: movzwl (%rsp,%rdi,2), %eax
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_variable_v32i16:
; SKX: ## %bb.0:
; SKX-NEXT: pushq %rbp
; SKX-NEXT: .cfi_def_cfa_offset 16
; SKX-NEXT: .cfi_offset %rbp, -16
; SKX-NEXT: movq %rsp, %rbp
; SKX-NEXT: .cfi_def_cfa_register %rbp
; SKX-NEXT: andq $-64, %rsp
; SKX-NEXT: subq $128, %rsp
; SKX-NEXT: ## kill: def $edi killed $edi def $rdi
; SKX-NEXT: vmovaps %zmm0, (%rsp)
; SKX-NEXT: andl $31, %edi
; SKX-NEXT: movzwl (%rsp,%rdi,2), %eax
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
  %t2 = extractelement <32 x i16> %t1, i32 %index
  ret i16 %t2
}

define i8 @test_extractelement_variable_v16i8(<16 x i8> %t1, i32 %index) {
; CHECK-LABEL: test_extractelement_variable_v16i8:
; CHECK: ## %bb.0:
; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: andl $15, %edi
; CHECK-NEXT: movb -24(%rsp,%rdi), %al
; CHECK-NEXT: retq
  %t2 = extractelement <16 x i8> %t1, i32 %index
  ret i8 %t2
}

define i8 @test_extractelement_variable_v32i8(<32 x i8> %t1, i32 %index) {
; CHECK-LABEL: test_extractelement_variable_v32i8:
; CHECK: ## %bb.0:
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset %rbp, -16
; CHECK-NEXT: movq %rsp, %rbp
; CHECK-NEXT: .cfi_def_cfa_register %rbp
; CHECK-NEXT: andq $-32, %rsp
; CHECK-NEXT: subq $64, %rsp
; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT: vmovaps %ymm0, (%rsp)
; CHECK-NEXT: andl $31, %edi
; CHECK-NEXT: movb (%rsp,%rdi), %al
; CHECK-NEXT: movq %rbp, %rsp
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %t2 = extractelement <32 x i8> %t1, i32 %index
  ret i8 %t2
}

define i8 @test_extractelement_variable_v64i8(<64 x i8> %t1, i32 %index) {
; KNL-LABEL: test_extractelement_variable_v64i8:
; KNL: ## %bb.0:
; KNL-NEXT: pushq %rbp
; KNL-NEXT: .cfi_def_cfa_offset 16
; KNL-NEXT: .cfi_offset %rbp, -16
; KNL-NEXT: movq %rsp, %rbp
; KNL-NEXT: .cfi_def_cfa_register %rbp
; KNL-NEXT: andq $-64, %rsp
; KNL-NEXT: subq $128, %rsp
; KNL-NEXT: ## kill: def $edi killed $edi def $rdi
; KNL-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovaps %ymm0, (%rsp)
; KNL-NEXT: andl $63, %edi
; KNL-NEXT: movb (%rsp,%rdi), %al
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_variable_v64i8:
; SKX: ## %bb.0:
; SKX-NEXT: pushq %rbp
; SKX-NEXT: .cfi_def_cfa_offset 16
; SKX-NEXT: .cfi_offset %rbp, -16
; SKX-NEXT: movq %rsp, %rbp
; SKX-NEXT: .cfi_def_cfa_register %rbp
; SKX-NEXT: andq $-64, %rsp
; SKX-NEXT: subq $128, %rsp
; SKX-NEXT: ## kill: def $edi killed $edi def $rdi
; SKX-NEXT: vmovaps %zmm0, (%rsp)
; SKX-NEXT: andl $63, %edi
; SKX-NEXT: movb (%rsp,%rdi), %al
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
  %t2 = extractelement <64 x i8> %t1, i32 %index
  ret i8 %t2
}

define i8 @test_extractelement_variable_v64i8_indexi8(<64 x i8> %t1, i8 %index) {
; KNL-LABEL: test_extractelement_variable_v64i8_indexi8:
; KNL: ## %bb.0:
; KNL-NEXT: pushq %rbp
; KNL-NEXT: .cfi_def_cfa_offset 16
; KNL-NEXT: .cfi_offset %rbp, -16
; KNL-NEXT: movq %rsp, %rbp
; KNL-NEXT: .cfi_def_cfa_register %rbp
; KNL-NEXT: andq $-64, %rsp
; KNL-NEXT: subq $128, %rsp
; KNL-NEXT: addb %dil, %dil
; KNL-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovaps %ymm0, (%rsp)
; KNL-NEXT: movzbl %dil, %eax
; KNL-NEXT: andl $63, %eax
; KNL-NEXT: movb (%rsp,%rax), %al
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_variable_v64i8_indexi8:
; SKX: ## %bb.0:
; SKX-NEXT: pushq %rbp
; SKX-NEXT: .cfi_def_cfa_offset 16
; SKX-NEXT: .cfi_offset %rbp, -16
; SKX-NEXT: movq %rsp, %rbp
; SKX-NEXT: .cfi_def_cfa_register %rbp
; SKX-NEXT: andq $-64, %rsp
; SKX-NEXT: subq $128, %rsp
; SKX-NEXT: addb %dil, %dil
; SKX-NEXT: vmovaps %zmm0, (%rsp)
; SKX-NEXT: movzbl %dil, %eax
; SKX-NEXT: andl $63, %eax
; SKX-NEXT: movb (%rsp,%rax), %al
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
  %i = add i8 %index, %index
  %t2 = extractelement <64 x i8> %t1, i8 %i
  ret i8 %t2
}

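; Variable-index extracts from compare results first materialize the <N x i1>
; mask as a vector in memory (vpternlog into a zmm on KNL, vpmovm2* from the
; mask register on SKX) and then reuse the same indexed load from the stack.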
define zeroext i8 @test_extractelement_varible_v2i1(<2 x i64> %a, <2 x i64> %b, i32 %index) {
; KNL-LABEL: test_extractelement_varible_v2i1:
; KNL: ## %bb.0:
; KNL-NEXT: ## kill: def $edi killed $edi def $rdi
; KNL-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; KNL-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1
; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vextracti32x4 $0, %zmm0, -{{[0-9]+}}(%rsp)
; KNL-NEXT: andl $1, %edi
; KNL-NEXT: movzbl -24(%rsp,%rdi,8), %eax
; KNL-NEXT: andl $1, %eax
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_varible_v2i1:
; SKX: ## %bb.0:
; SKX-NEXT: ## kill: def $edi killed $edi def $rdi
; SKX-NEXT: vpcmpnleuq %xmm1, %xmm0, %k0
; SKX-NEXT: vpmovm2q %k0, %xmm0
; SKX-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
; SKX-NEXT: andl $1, %edi
; SKX-NEXT: movzbl -24(%rsp,%rdi,8), %eax
; SKX-NEXT: andl $1, %eax
; SKX-NEXT: retq
  %t1 = icmp ugt <2 x i64> %a, %b
  %t2 = extractelement <2 x i1> %t1, i32 %index
  %res = zext i1 %t2 to i8
  ret i8 %res
}

define zeroext i8 @test_extractelement_varible_v4i1(<4 x i32> %a, <4 x i32> %b, i32 %index) {
; KNL-LABEL: test_extractelement_varible_v4i1:
; KNL: ## %bb.0:
; KNL-NEXT: ## kill: def $edi killed $edi def $rdi
; KNL-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; KNL-NEXT: vpcmpnleud %zmm1, %zmm0, %k1
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vextracti32x4 $0, %zmm0, -{{[0-9]+}}(%rsp)
; KNL-NEXT: andl $3, %edi
; KNL-NEXT: movzbl -24(%rsp,%rdi,4), %eax
; KNL-NEXT: andl $1, %eax
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_varible_v4i1:
; SKX: ## %bb.0:
; SKX-NEXT: ## kill: def $edi killed $edi def $rdi
; SKX-NEXT: vpcmpnleud %xmm1, %xmm0, %k0
; SKX-NEXT: vpmovm2d %k0, %xmm0
; SKX-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
; SKX-NEXT: andl $3, %edi
; SKX-NEXT: movzbl -24(%rsp,%rdi,4), %eax
; SKX-NEXT: andl $1, %eax
; SKX-NEXT: retq
  %t1 = icmp ugt <4 x i32> %a, %b
  %t2 = extractelement <4 x i1> %t1, i32 %index
  %res = zext i1 %t2 to i8
  ret i8 %res
}

define zeroext i8 @test_extractelement_varible_v8i1(<8 x i32> %a, <8 x i32> %b, i32 %index) {
; KNL-LABEL: test_extractelement_varible_v8i1:
; KNL: ## %bb.0:
; KNL-NEXT: ## kill: def $edi killed $edi def $rdi
; KNL-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1
; KNL-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
; KNL-NEXT: vpcmpnleud %zmm1, %zmm0, %k1
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdw %zmm0, %ymm0
; KNL-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
; KNL-NEXT: andl $7, %edi
; KNL-NEXT: movzbl -24(%rsp,%rdi,2), %eax
; KNL-NEXT: andl $1, %eax
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_varible_v8i1:
; SKX: ## %bb.0:
; SKX-NEXT: ## kill: def $edi killed $edi def $rdi
; SKX-NEXT: vpcmpnleud %ymm1, %ymm0, %k0
; SKX-NEXT: vpmovm2w %k0, %xmm0
; SKX-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
; SKX-NEXT: andl $7, %edi
; SKX-NEXT: movzbl -24(%rsp,%rdi,2), %eax
; SKX-NEXT: andl $1, %eax
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
  %t1 = icmp ugt <8 x i32> %a, %b
  %t2 = extractelement <8 x i1> %t1, i32 %index
  %res = zext i1 %t2 to i8
  ret i8 %res
}

define zeroext i8 @test_extractelement_varible_v16i1(<16 x i32> %a, <16 x i32> %b, i32 %index) {
; KNL-LABEL: test_extractelement_varible_v16i1:
; KNL: ## %bb.0:
; KNL-NEXT: ## kill: def $edi killed $edi def $rdi
; KNL-NEXT: vpcmpnleud %zmm1, %zmm0, %k1
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm0, -{{[0-9]+}}(%rsp)
; KNL-NEXT: andl $15, %edi
; KNL-NEXT: movzbl -24(%rsp,%rdi), %eax
; KNL-NEXT: andl $1, %eax
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_varible_v16i1:
; SKX: ## %bb.0:
; SKX-NEXT: ## kill: def $edi killed $edi def $rdi
; SKX-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
; SKX-NEXT: vpmovm2b %k0, %xmm0
; SKX-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
; SKX-NEXT: andl $15, %edi
; SKX-NEXT: movzbl -24(%rsp,%rdi), %eax
; SKX-NEXT: andl $1, %eax
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
  %t1 = icmp ugt <16 x i32> %a, %b
  %t2 = extractelement <16 x i1> %t1, i32 %index
  %res = zext i1 %t2 to i8
  ret i8 %res
}

define zeroext i8 @test_extractelement_varible_v32i1(<32 x i8> %a, <32 x i8> %b, i32 %index) {
; KNL-LABEL: test_extractelement_varible_v32i1:
; KNL: ## %bb.0:
; KNL-NEXT: pushq %rbp
; KNL-NEXT: .cfi_def_cfa_offset 16
; KNL-NEXT: .cfi_offset %rbp, -16
; KNL-NEXT: movq %rsp, %rbp
; KNL-NEXT: .cfi_def_cfa_register %rbp
; KNL-NEXT: andq $-32, %rsp
; KNL-NEXT: subq $64, %rsp
; KNL-NEXT: ## kill: def $edi killed $edi def $rdi
; KNL-NEXT: vpminub %ymm1, %ymm0, %ymm1
; KNL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; KNL-NEXT: vmovdqa %ymm0, (%rsp)
; KNL-NEXT: andl $31, %edi
; KNL-NEXT: movzbl (%rsp,%rdi), %eax
; KNL-NEXT: andl $1, %eax
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_varible_v32i1:
; SKX: ## %bb.0:
; SKX-NEXT: pushq %rbp
; SKX-NEXT: .cfi_def_cfa_offset 16
; SKX-NEXT: .cfi_offset %rbp, -16
; SKX-NEXT: movq %rsp, %rbp
; SKX-NEXT: .cfi_def_cfa_register %rbp
; SKX-NEXT: andq $-32, %rsp
; SKX-NEXT: subq $64, %rsp
; SKX-NEXT: ## kill: def $edi killed $edi def $rdi
; SKX-NEXT: vpcmpnleub %ymm1, %ymm0, %k0
; SKX-NEXT: vpmovm2b %k0, %ymm0
; SKX-NEXT: vmovdqa %ymm0, (%rsp)
; SKX-NEXT: andl $31, %edi
; SKX-NEXT: movzbl (%rsp,%rdi), %eax
; SKX-NEXT: andl $1, %eax
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
  %t1 = icmp ugt <32 x i8> %a, %b
  %t2 = extractelement <32 x i1> %t1, i32 %index
  %res = zext i1 %t2 to i8
  ret i8 %res
}

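; A chain of widening shuffles whose other operands are zero folds to a single
; vinsertf32x4 of the 128-bit source into a zeroed zmm.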
define <8 x i64> @insert_double_zero(<2 x i64> %a) nounwind {
; CHECK-LABEL: insert_double_zero:
; CHECK: ## %bb.0:
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vinsertf32x4 $2, %xmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
  %b = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %d = shufflevector <4 x i64> %b, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e = shufflevector <8 x i64> %d, <8 x i64> zeroinitializer, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i64> %e
}

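; Variable-index inserts into i1 vectors also go through memory: the source
; mask is expanded to one byte per element on the stack, setne writes the new
; element's byte at the masked index, and the bytes are re-packed into a mask.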
define i32 @test_insertelement_variable_v32i1(<32 x i8> %a, i8 %b, i32 %index) {
; KNL-LABEL: test_insertelement_variable_v32i1:
; KNL: ## %bb.0:
; KNL-NEXT: pushq %rbp
; KNL-NEXT: .cfi_def_cfa_offset 16
; KNL-NEXT: .cfi_offset %rbp, -16
; KNL-NEXT: movq %rsp, %rbp
; KNL-NEXT: .cfi_def_cfa_register %rbp
; KNL-NEXT: andq $-32, %rsp
; KNL-NEXT: subq $64, %rsp
; KNL-NEXT: ## kill: def $esi killed $esi def $rsi
; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; KNL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; KNL-NEXT: andl $31, %esi
; KNL-NEXT: testb %dil, %dil
; KNL-NEXT: vmovdqa %ymm0, (%rsp)
; KNL-NEXT: setne (%rsp,%rsi)
; KNL-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: shll $16, %eax
; KNL-NEXT: orl %ecx, %eax
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_insertelement_variable_v32i1:
; SKX: ## %bb.0:
; SKX-NEXT: pushq %rbp
; SKX-NEXT: .cfi_def_cfa_offset 16
; SKX-NEXT: .cfi_offset %rbp, -16
; SKX-NEXT: movq %rsp, %rbp
; SKX-NEXT: .cfi_def_cfa_register %rbp
; SKX-NEXT: andq $-32, %rsp
; SKX-NEXT: subq $64, %rsp
; SKX-NEXT: ## kill: def $esi killed $esi def $rsi
; SKX-NEXT: vptestmb %ymm0, %ymm0, %k0
; SKX-NEXT: andl $31, %esi
; SKX-NEXT: testb %dil, %dil
; SKX-NEXT: vpmovm2b %k0, %ymm0
; SKX-NEXT: vmovdqa %ymm0, (%rsp)
; SKX-NEXT: setne (%rsp,%rsi)
; SKX-NEXT: vpsllw $7, (%rsp), %ymm0
; SKX-NEXT: vpmovb2m %ymm0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
  %t1 = icmp ugt <32 x i8> %a, zeroinitializer
  %t2 = icmp ugt i8 %b, 0
  %t3 = insertelement <32 x i1> %t1, i1 %t2, i32 %index
  %t4 = bitcast <32 x i1> %t3 to i32
  ret i32 %t4
}

define i64 @test_insertelement_variable_v64i1(<64 x i8> %a, i8 %b, i32 %index) {
; KNL-LABEL: test_insertelement_variable_v64i1:
; KNL: ## %bb.0:
; KNL-NEXT: pushq %rbp
; KNL-NEXT: .cfi_def_cfa_offset 16
; KNL-NEXT: .cfi_offset %rbp, -16
; KNL-NEXT: movq %rsp, %rbp
; KNL-NEXT: .cfi_def_cfa_register %rbp
; KNL-NEXT: andq $-64, %rsp
; KNL-NEXT: subq $128, %rsp
; KNL-NEXT: ## kill: def $esi killed $esi def $rsi
; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; KNL-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; KNL-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1
; KNL-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1
; KNL-NEXT: andl $63, %esi
; KNL-NEXT: testb %dil, %dil
; KNL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovdqa %ymm0, (%rsp)
; KNL-NEXT: setne (%rsp,%rsi)
; KNL-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: shll $16, %ecx
; KNL-NEXT: orl %eax, %ecx
; KNL-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %edx
; KNL-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: shll $16, %eax
; KNL-NEXT: orl %edx, %eax
; KNL-NEXT: shlq $32, %rax
; KNL-NEXT: orq %rcx, %rax
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_insertelement_variable_v64i1:
; SKX: ## %bb.0:
; SKX-NEXT: pushq %rbp
; SKX-NEXT: .cfi_def_cfa_offset 16
; SKX-NEXT: .cfi_offset %rbp, -16
; SKX-NEXT: movq %rsp, %rbp
; SKX-NEXT: .cfi_def_cfa_register %rbp
; SKX-NEXT: andq $-64, %rsp
; SKX-NEXT: subq $128, %rsp
; SKX-NEXT: ## kill: def $esi killed $esi def $rsi
; SKX-NEXT: vptestmb %zmm0, %zmm0, %k0
; SKX-NEXT: andl $63, %esi
; SKX-NEXT: testb %dil, %dil
; SKX-NEXT: vpmovm2b %k0, %zmm0
; SKX-NEXT: vmovdqa64 %zmm0, (%rsp)
; SKX-NEXT: setne (%rsp,%rsi)
; SKX-NEXT: vpsllw $7, (%rsp), %zmm0
; SKX-NEXT: vpmovb2m %zmm0, %k0
; SKX-NEXT: kmovq %k0, %rax
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
  %t1 = icmp ugt <64 x i8> %a, zeroinitializer
  %t2 = icmp ugt i8 %b, 0
  %t3 = insertelement <64 x i1> %t1, i1 %t2, i32 %index
  %t4 = bitcast <64 x i1> %t3 to i64
  ret i64 %t4
}

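; The <96 x i8> argument arrives in individual stack slots, so the vector is
; first rebuilt with a long chain of vpinsrb loads from the caller's frame.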
define i96 @test_insertelement_variable_v96i1(<96 x i8> %a, i8 %b, i32 %index) {
|
|
|
|
; KNL-LABEL: test_insertelement_variable_v96i1:
|
|
|
|
; KNL: ## %bb.0:
|
|
|
|
; KNL-NEXT: pushq %rbp
|
|
|
|
; KNL-NEXT: .cfi_def_cfa_offset 16
|
|
|
|
; KNL-NEXT: .cfi_offset %rbp, -16
|
|
|
|
; KNL-NEXT: movq %rsp, %rbp
|
|
|
|
; KNL-NEXT: .cfi_def_cfa_register %rbp
|
|
|
|
; KNL-NEXT: andq $-128, %rsp
|
2018-01-23 22:25:39 +08:00
|
|
|
; KNL-NEXT: subq $256, %rsp ## imm = 0x100
; KNL-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; KNL-NEXT: vpinsrb $1, 488(%rbp), %xmm0, %xmm0
; KNL-NEXT: vpinsrb $2, 496(%rbp), %xmm0, %xmm0
; KNL-NEXT: vpinsrb $3, 504(%rbp), %xmm0, %xmm0
; KNL-NEXT: vpinsrb $4, 512(%rbp), %xmm0, %xmm0
; KNL-NEXT: vpinsrb $5, 520(%rbp), %xmm0, %xmm0
; KNL-NEXT: vpinsrb $6, 528(%rbp), %xmm0, %xmm0
; KNL-NEXT: vpinsrb $7, 536(%rbp), %xmm0, %xmm0
; KNL-NEXT: vpinsrb $8, 544(%rbp), %xmm0, %xmm0
; KNL-NEXT: vpinsrb $9, 552(%rbp), %xmm0, %xmm0
; KNL-NEXT: vpinsrb $10, 560(%rbp), %xmm0, %xmm0
; KNL-NEXT: vpinsrb $11, 568(%rbp), %xmm0, %xmm0
; KNL-NEXT: vpinsrb $12, 576(%rbp), %xmm0, %xmm0
; KNL-NEXT: vpinsrb $13, 584(%rbp), %xmm0, %xmm0
; KNL-NEXT: vpinsrb $14, 592(%rbp), %xmm0, %xmm0
; KNL-NEXT: vpinsrb $15, 600(%rbp), %xmm0, %xmm0
; KNL-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; KNL-NEXT: vpinsrb $1, 616(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $2, 624(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $3, 632(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $4, 640(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $5, 648(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $6, 656(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $7, 664(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $8, 672(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $9, 680(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $10, 688(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $11, 696(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $12, 704(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $13, 712(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $14, 720(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $15, 728(%rbp), %xmm1, %xmm1
; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; KNL-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; KNL-NEXT: vpinsrb $1, 232(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $2, 240(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $3, 248(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $4, 256(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $5, 264(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $6, 272(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $7, 280(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $8, 288(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $9, 296(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $10, 304(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $11, 312(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $12, 320(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $13, 328(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $14, 336(%rbp), %xmm1, %xmm1
; KNL-NEXT: vpinsrb $15, 344(%rbp), %xmm1, %xmm1
; KNL-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; KNL-NEXT: vpinsrb $1, 360(%rbp), %xmm2, %xmm2
; KNL-NEXT: vpinsrb $2, 368(%rbp), %xmm2, %xmm2
; KNL-NEXT: vpinsrb $3, 376(%rbp), %xmm2, %xmm2
; KNL-NEXT: vpinsrb $4, 384(%rbp), %xmm2, %xmm2
; KNL-NEXT: vpinsrb $5, 392(%rbp), %xmm2, %xmm2
; KNL-NEXT: vpinsrb $6, 400(%rbp), %xmm2, %xmm2
; KNL-NEXT: vpinsrb $7, 408(%rbp), %xmm2, %xmm2
; KNL-NEXT: vpinsrb $8, 416(%rbp), %xmm2, %xmm2
; KNL-NEXT: vpinsrb $9, 424(%rbp), %xmm2, %xmm2
; KNL-NEXT: vpinsrb $10, 432(%rbp), %xmm2, %xmm2
; KNL-NEXT: vpinsrb $11, 440(%rbp), %xmm2, %xmm2
; KNL-NEXT: vpinsrb $12, 448(%rbp), %xmm2, %xmm2
; KNL-NEXT: vpinsrb $13, 456(%rbp), %xmm2, %xmm2
; KNL-NEXT: vpinsrb $14, 464(%rbp), %xmm2, %xmm2
; KNL-NEXT: vpinsrb $15, 472(%rbp), %xmm2, %xmm2
; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; KNL-NEXT: vmovd %edi, %xmm2
; KNL-NEXT: vpinsrb $1, %esi, %xmm2, %xmm2
; KNL-NEXT: vpinsrb $2, %edx, %xmm2, %xmm2
; KNL-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2
; KNL-NEXT: vpinsrb $4, %r8d, %xmm2, %xmm2
; KNL-NEXT: vpinsrb $5, %r9d, %xmm2, %xmm2
; KNL-NEXT: vpinsrb $6, 16(%rbp), %xmm2, %xmm2
; KNL-NEXT: vpinsrb $7, 24(%rbp), %xmm2, %xmm2
; KNL-NEXT: vpinsrb $8, 32(%rbp), %xmm2, %xmm2
; KNL-NEXT: vpinsrb $9, 40(%rbp), %xmm2, %xmm2
; KNL-NEXT: vpinsrb $10, 48(%rbp), %xmm2, %xmm2
; KNL-NEXT: vpinsrb $11, 56(%rbp), %xmm2, %xmm2
; KNL-NEXT: vpinsrb $12, 64(%rbp), %xmm2, %xmm2
; KNL-NEXT: vpinsrb $13, 72(%rbp), %xmm2, %xmm2
; KNL-NEXT: vpinsrb $14, 80(%rbp), %xmm2, %xmm2
; KNL-NEXT: vpinsrb $15, 88(%rbp), %xmm2, %xmm2
; KNL-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; KNL-NEXT: vpinsrb $1, 104(%rbp), %xmm3, %xmm3
; KNL-NEXT: vpinsrb $2, 112(%rbp), %xmm3, %xmm3
; KNL-NEXT: vpinsrb $3, 120(%rbp), %xmm3, %xmm3
; KNL-NEXT: vpinsrb $4, 128(%rbp), %xmm3, %xmm3
; KNL-NEXT: vpinsrb $5, 136(%rbp), %xmm3, %xmm3
; KNL-NEXT: vpinsrb $6, 144(%rbp), %xmm3, %xmm3
; KNL-NEXT: vpinsrb $7, 152(%rbp), %xmm3, %xmm3
; KNL-NEXT: vpinsrb $8, 160(%rbp), %xmm3, %xmm3
; KNL-NEXT: vpinsrb $9, 168(%rbp), %xmm3, %xmm3
; KNL-NEXT: vpinsrb $10, 176(%rbp), %xmm3, %xmm3
; KNL-NEXT: vpinsrb $11, 184(%rbp), %xmm3, %xmm3
; KNL-NEXT: vpinsrb $12, 192(%rbp), %xmm3, %xmm3
; KNL-NEXT: vpinsrb $13, 200(%rbp), %xmm3, %xmm3
; KNL-NEXT: vpinsrb $14, 208(%rbp), %xmm3, %xmm3
; KNL-NEXT: vpinsrb $15, 216(%rbp), %xmm3, %xmm3
; KNL-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; KNL-NEXT: movl 744(%rbp), %eax
; KNL-NEXT: andl $127, %eax
; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3
; KNL-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2
; KNL-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2
; KNL-NEXT: vpcmpeqb %ymm3, %ymm1, %ymm1
; KNL-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1
; KNL-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0
; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; KNL-NEXT: cmpb $0, 736(%rbp)
; KNL-NEXT: vmovdqa %ymm3, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovdqa %ymm2, (%rsp)
; KNL-NEXT: setne (%rsp,%rax)
; KNL-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: shll $16, %ecx
; KNL-NEXT: orl %eax, %ecx
; KNL-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %edx
; KNL-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: shll $16, %eax
; KNL-NEXT: orl %edx, %eax
; KNL-NEXT: shlq $32, %rax
; KNL-NEXT: orq %rcx, %rax
; KNL-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %esi
; KNL-NEXT: shll $16, %esi
; KNL-NEXT: orl %ecx, %esi
; KNL-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %edx
; KNL-NEXT: shll $16, %edx
; KNL-NEXT: orl %ecx, %edx
; KNL-NEXT: shlq $32, %rdx
; KNL-NEXT: orq %rsi, %rdx
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
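; Note: on SKX the mask is materialized with vptestmb into two k-registers,
; spilled via vpmovm2b/vmovdqa64, patched with setne, and reloaded with
; vpsllw+vpmovb2m; the i96 result is returned in %rax:%rdx.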
; SKX-LABEL: test_insertelement_variable_v96i1:
; SKX: ## %bb.0:
; SKX-NEXT: pushq %rbp
; SKX-NEXT: .cfi_def_cfa_offset 16
; SKX-NEXT: .cfi_offset %rbp, -16
; SKX-NEXT: movq %rsp, %rbp
; SKX-NEXT: .cfi_def_cfa_register %rbp
; SKX-NEXT: andq $-128, %rsp
; SKX-NEXT: subq $256, %rsp ## imm = 0x100
; SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SKX-NEXT: vpinsrb $1, 232(%rbp), %xmm0, %xmm0
; SKX-NEXT: vpinsrb $2, 240(%rbp), %xmm0, %xmm0
; SKX-NEXT: vpinsrb $3, 248(%rbp), %xmm0, %xmm0
; SKX-NEXT: vpinsrb $4, 256(%rbp), %xmm0, %xmm0
; SKX-NEXT: vpinsrb $5, 264(%rbp), %xmm0, %xmm0
; SKX-NEXT: vpinsrb $6, 272(%rbp), %xmm0, %xmm0
; SKX-NEXT: vpinsrb $7, 280(%rbp), %xmm0, %xmm0
; SKX-NEXT: vpinsrb $8, 288(%rbp), %xmm0, %xmm0
; SKX-NEXT: vpinsrb $9, 296(%rbp), %xmm0, %xmm0
; SKX-NEXT: vpinsrb $10, 304(%rbp), %xmm0, %xmm0
; SKX-NEXT: vpinsrb $11, 312(%rbp), %xmm0, %xmm0
; SKX-NEXT: vpinsrb $12, 320(%rbp), %xmm0, %xmm0
; SKX-NEXT: vpinsrb $13, 328(%rbp), %xmm0, %xmm0
; SKX-NEXT: vpinsrb $14, 336(%rbp), %xmm0, %xmm0
; SKX-NEXT: vpinsrb $15, 344(%rbp), %xmm0, %xmm0
; SKX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SKX-NEXT: vpinsrb $1, 360(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $2, 368(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $3, 376(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $4, 384(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $5, 392(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $6, 400(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $7, 408(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $8, 416(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $9, 424(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $10, 432(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $11, 440(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $12, 448(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $13, 456(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $14, 464(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $15, 472(%rbp), %xmm1, %xmm1
; SKX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; SKX-NEXT: vmovd %edi, %xmm1
; SKX-NEXT: vpinsrb $1, %esi, %xmm1, %xmm1
; SKX-NEXT: vpinsrb $2, %edx, %xmm1, %xmm1
; SKX-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1
; SKX-NEXT: vpinsrb $4, %r8d, %xmm1, %xmm1
; SKX-NEXT: vpinsrb $5, %r9d, %xmm1, %xmm1
; SKX-NEXT: vpinsrb $6, 16(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $7, 24(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $8, 32(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $9, 40(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $10, 48(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $11, 56(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $12, 64(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $13, 72(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $14, 80(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $15, 88(%rbp), %xmm1, %xmm1
; SKX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SKX-NEXT: vpinsrb $1, 104(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $2, 112(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $3, 120(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $4, 128(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $5, 136(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $6, 144(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $7, 152(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $8, 160(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $9, 168(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $10, 176(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $11, 184(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $12, 192(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $13, 200(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $14, 208(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $15, 216(%rbp), %xmm2, %xmm2
; SKX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; SKX-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; SKX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SKX-NEXT: vpinsrb $1, 488(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $2, 496(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $3, 504(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $4, 512(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $5, 520(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $6, 528(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $7, 536(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $8, 544(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $9, 552(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $10, 560(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $11, 568(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $12, 576(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $13, 584(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $14, 592(%rbp), %xmm1, %xmm1
; SKX-NEXT: vpinsrb $15, 600(%rbp), %xmm1, %xmm1
; SKX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SKX-NEXT: vpinsrb $1, 616(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $2, 624(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $3, 632(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $4, 640(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $5, 648(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $6, 656(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $7, 664(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $8, 672(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $9, 680(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $10, 688(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $11, 696(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $12, 704(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $13, 712(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $14, 720(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $15, 728(%rbp), %xmm2, %xmm2
; SKX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; SKX-NEXT: movl 744(%rbp), %eax
; SKX-NEXT: andl $127, %eax
; SKX-NEXT: vptestmb %zmm0, %zmm0, %k0
; SKX-NEXT: vptestmb %zmm1, %zmm1, %k1
; SKX-NEXT: cmpb $0, 736(%rbp)
; SKX-NEXT: vpmovm2b %k1, %zmm0
; SKX-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; SKX-NEXT: vpmovm2b %k0, %zmm0
; SKX-NEXT: vmovdqa64 %zmm0, (%rsp)
; SKX-NEXT: setne (%rsp,%rax)
; SKX-NEXT: vpsllw $7, {{[0-9]+}}(%rsp), %zmm0
; SKX-NEXT: vpmovb2m %zmm0, %k0
; SKX-NEXT: vpsllw $7, (%rsp), %zmm0
; SKX-NEXT: vpmovb2m %zmm0, %k1
; SKX-NEXT: kmovq %k1, %rax
; SKX-NEXT: kmovq %k0, %rdx
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
  %t1 = icmp ugt <96 x i8> %a, zeroinitializer
  %t2 = icmp ugt i8 %b, 0
  %t3 = insertelement <96 x i1> %t1, i1 %t2, i32 %index
  %t4 = bitcast <96 x i1> %t3 to i96
  ret i96 %t4
}
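
; Same pattern for a 128-bit mask: the index is masked with $127 and the two
; 64-bit halves are reassembled after the stack round trip.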
define i128 @test_insertelement_variable_v128i1(<128 x i8> %a, i8 %b, i32 %index) {
; KNL-LABEL: test_insertelement_variable_v128i1:
; KNL: ## %bb.0:
; KNL-NEXT: pushq %rbp
; KNL-NEXT: .cfi_def_cfa_offset 16
; KNL-NEXT: .cfi_offset %rbp, -16
; KNL-NEXT: movq %rsp, %rbp
; KNL-NEXT: .cfi_def_cfa_register %rbp
; KNL-NEXT: andq $-128, %rsp
; KNL-NEXT: subq $256, %rsp ## imm = 0x100
; KNL-NEXT: ## kill: def $esi killed $esi def $rsi
; KNL-NEXT: vpxor %xmm4, %xmm4, %xmm4
; KNL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0
; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; KNL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm1
; KNL-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1
; KNL-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm2
; KNL-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2
; KNL-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm3
; KNL-NEXT: vpternlogq $15, %zmm3, %zmm3, %zmm3
; KNL-NEXT: andl $127, %esi
; KNL-NEXT: testb %dil, %dil
; KNL-NEXT: vmovdqa %ymm3, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovdqa %ymm2, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovdqa %ymm0, (%rsp)
; KNL-NEXT: setne (%rsp,%rsi)
; KNL-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: shll $16, %ecx
; KNL-NEXT: orl %eax, %ecx
; KNL-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %edx
; KNL-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: shll $16, %eax
; KNL-NEXT: orl %edx, %eax
; KNL-NEXT: shlq $32, %rax
; KNL-NEXT: orq %rcx, %rax
; KNL-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %esi
; KNL-NEXT: shll $16, %esi
; KNL-NEXT: orl %ecx, %esi
; KNL-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %edx
; KNL-NEXT: shll $16, %edx
; KNL-NEXT: orl %ecx, %edx
; KNL-NEXT: shlq $32, %rdx
; KNL-NEXT: orq %rsi, %rdx
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
|
|
|
|
;
; SKX-LABEL: test_insertelement_variable_v128i1:
; SKX: ## %bb.0:
; SKX-NEXT: pushq %rbp
; SKX-NEXT: .cfi_def_cfa_offset 16
; SKX-NEXT: .cfi_offset %rbp, -16
; SKX-NEXT: movq %rsp, %rbp
; SKX-NEXT: .cfi_def_cfa_register %rbp
; SKX-NEXT: andq $-128, %rsp
; SKX-NEXT: subq $256, %rsp ## imm = 0x100
; SKX-NEXT: ## kill: def $esi killed $esi def $rsi
; SKX-NEXT: vptestmb %zmm0, %zmm0, %k0
; SKX-NEXT: vptestmb %zmm1, %zmm1, %k1
; SKX-NEXT: andl $127, %esi
; SKX-NEXT: testb %dil, %dil
; SKX-NEXT: vpmovm2b %k1, %zmm0
; SKX-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; SKX-NEXT: vpmovm2b %k0, %zmm0
; SKX-NEXT: vmovdqa64 %zmm0, (%rsp)
; SKX-NEXT: setne (%rsp,%rsi)
; SKX-NEXT: vpsllw $7, {{[0-9]+}}(%rsp), %zmm0
; SKX-NEXT: vpmovb2m %zmm0, %k0
; SKX-NEXT: vpsllw $7, (%rsp), %zmm0
; SKX-NEXT: vpmovb2m %zmm0, %k1
; SKX-NEXT: kmovq %k1, %rax
; SKX-NEXT: kmovq %k0, %rdx
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t1 = icmp ugt <128 x i8> %a, zeroinitializer
%t2 = icmp ugt i8 %b, 0
%t3 = insertelement <128 x i1> %t1, i1 %t2, i32 %index
%t4 = bitcast <128 x i1> %t3 to i128
ret i128 %t4
}
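
; The SKX sequence above shows how a variable-index insert into a wide i1 mask is
; lowered: the compare masks are expanded to byte vectors (vpmovm2b), spilled to an
; aligned stack slot, the inserted bit is written with setne at the variable offset,
; and the halves are reloaded and converted back to masks (vpsllw + vpmovb2m).
;
; A minimal IR sketch of the same pattern for a single v64i1 mask follows. It is
; illustrative only: the function name and value names are assumptions, and it has
; no autogenerated CHECK lines.
define i64 @insert_v64i1_variable_sketch(<64 x i8> %vec, i8 %val, i32 %index) nounwind {
  %mask = icmp ugt <64 x i8> %vec, zeroinitializer           ; compare produces a <64 x i1> mask
  %bit = icmp ugt i8 %val, 0                                  ; scalar i1 to insert
  %ins = insertelement <64 x i1> %mask, i1 %bit, i32 %index   ; non-constant insertion index
  %res = bitcast <64 x i1> %ins to i64                        ; return the mask as an integer
  ret i64 %res
}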