2016-02-15 21:50:48 +08:00
|
|
|
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
2016-09-27 14:44:27 +08:00
|
|
|
; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X32-SSE
|
|
|
|
; RUN: llc < %s -mtriple=i386-unknown -mattr=+avx | FileCheck %s --check-prefix=X32-AVX --check-prefix=X32-AVX1
|
|
|
|
; RUN: llc < %s -mtriple=i386-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X32-AVX --check-prefix=X32-AVX256 --check-prefix=X32-AVX2
|
|
|
|
; RUN: llc < %s -mtriple=i386-unknown -mcpu=knl | FileCheck %s --check-prefix=X32-AVX --check-prefix=X32-AVX256 --check-prefix=X32-AVX512 --check-prefix=X32-KNL
|
|
|
|
; RUN: llc < %s -mtriple=i386-unknown -mcpu=skx | FileCheck %s --check-prefix=X32-AVX --check-prefix=X32-AVX256 --check-prefix=X32-AVX512 --check-prefix=X32-SKX
|
|
|
|
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64-SSE
|
|
|
|
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=X64-AVX --check-prefix=X64-AVX1
|
|
|
|
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64-AVX --check-prefix=X64-AVX256 --check-prefix=X64-AVX2
|
|
|
|
; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=knl | FileCheck %s --check-prefix=X64-AVX --check-prefix=X64-AVX256 --check-prefix=X64-AVX512 --check-prefix=X64-KNL
|
|
|
|
; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=skx | FileCheck %s --check-prefix=X64-AVX --check-prefix=X64-AVX256 --check-prefix=X64-AVX512 --check-prefix=X64-SKX
|
2016-02-15 21:50:48 +08:00
|
|
|
|
2017-04-28 19:12:30 +08:00
|
|
|
define <16 x i8> @allones_v16i8() nounwind {
|
|
|
|
; X32-SSE-LABEL: allones_v16i8:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-SSE: # %bb.0:
|
2016-02-15 21:50:48 +08:00
|
|
|
; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0
|
|
|
|
; X32-SSE-NEXT: retl
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X32-AVX-LABEL: allones_v16i8:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-AVX: # %bb.0:
|
2016-02-15 21:50:48 +08:00
|
|
|
; X32-AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
|
|
|
|
; X32-AVX-NEXT: retl
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-SSE-LABEL: allones_v16i8:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-SSE: # %bb.0:
|
2016-02-15 21:50:48 +08:00
|
|
|
; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm0
|
|
|
|
; X64-SSE-NEXT: retq
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-AVX-LABEL: allones_v16i8:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-AVX: # %bb.0:
|
2016-02-15 21:50:48 +08:00
|
|
|
; X64-AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
|
|
|
|
; X64-AVX-NEXT: retq
|
|
|
|
ret <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
|
|
|
|
}
|
|
|
|
|
2017-04-28 19:12:30 +08:00
|
|
|
define <8 x i16> @allones_v8i16() nounwind {
|
|
|
|
; X32-SSE-LABEL: allones_v8i16:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-SSE: # %bb.0:
|
2016-02-15 21:50:48 +08:00
|
|
|
; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0
|
|
|
|
; X32-SSE-NEXT: retl
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X32-AVX-LABEL: allones_v8i16:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-AVX: # %bb.0:
|
2016-02-15 21:50:48 +08:00
|
|
|
; X32-AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
|
|
|
|
; X32-AVX-NEXT: retl
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-SSE-LABEL: allones_v8i16:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-SSE: # %bb.0:
|
2016-02-15 21:50:48 +08:00
|
|
|
; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm0
|
|
|
|
; X64-SSE-NEXT: retq
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-AVX-LABEL: allones_v8i16:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-AVX: # %bb.0:
|
2016-02-15 21:50:48 +08:00
|
|
|
; X64-AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
|
|
|
|
; X64-AVX-NEXT: retq
|
|
|
|
ret <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
|
|
|
|
}
|
2008-03-12 05:37:00 +08:00
|
|
|
|
2017-04-28 19:12:30 +08:00
|
|
|
define <4 x i32> @allones_v4i32() nounwind {
|
|
|
|
; X32-SSE-LABEL: allones_v4i32:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-SSE: # %bb.0:
|
2016-02-15 21:50:48 +08:00
|
|
|
; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0
|
|
|
|
; X32-SSE-NEXT: retl
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X32-AVX-LABEL: allones_v4i32:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-AVX: # %bb.0:
|
2016-02-15 21:50:48 +08:00
|
|
|
; X32-AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
|
|
|
|
; X32-AVX-NEXT: retl
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-SSE-LABEL: allones_v4i32:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-SSE: # %bb.0:
|
2016-02-15 21:50:48 +08:00
|
|
|
; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm0
|
|
|
|
; X64-SSE-NEXT: retq
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-AVX-LABEL: allones_v4i32:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-AVX: # %bb.0:
|
2016-02-15 21:50:48 +08:00
|
|
|
; X64-AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
|
|
|
|
; X64-AVX-NEXT: retq
|
|
|
|
ret <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
|
2008-03-12 05:37:00 +08:00
|
|
|
}
|
2016-02-15 21:50:48 +08:00
|
|
|
|
2017-04-28 19:12:30 +08:00
|
|
|
define <2 x i64> @allones_v2i64() nounwind {
|
|
|
|
; X32-SSE-LABEL: allones_v2i64:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-SSE: # %bb.0:
|
2016-02-15 21:50:48 +08:00
|
|
|
; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0
|
|
|
|
; X32-SSE-NEXT: retl
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X32-AVX-LABEL: allones_v2i64:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-AVX: # %bb.0:
|
2016-02-15 21:50:48 +08:00
|
|
|
; X32-AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
|
|
|
|
; X32-AVX-NEXT: retl
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-SSE-LABEL: allones_v2i64:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-SSE: # %bb.0:
|
2016-02-15 21:50:48 +08:00
|
|
|
; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm0
|
|
|
|
; X64-SSE-NEXT: retq
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-AVX-LABEL: allones_v2i64:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-AVX: # %bb.0:
|
2016-02-15 21:50:48 +08:00
|
|
|
; X64-AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
|
|
|
|
; X64-AVX-NEXT: retq
|
|
|
|
ret <2 x i64> <i64 -1, i64 -1>
|
2008-03-12 05:37:00 +08:00
|
|
|
}
|
2016-02-15 21:50:48 +08:00
|
|
|
|
2017-04-28 19:12:30 +08:00
|
|
|
define <2 x double> @allones_v2f64() nounwind {
|
|
|
|
; X32-SSE-LABEL: allones_v2f64:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-SSE: # %bb.0:
|
2016-02-15 21:50:48 +08:00
|
|
|
; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0
|
|
|
|
; X32-SSE-NEXT: retl
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X32-AVX-LABEL: allones_v2f64:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-AVX: # %bb.0:
|
2016-02-15 21:50:48 +08:00
|
|
|
; X32-AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
|
|
|
|
; X32-AVX-NEXT: retl
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-SSE-LABEL: allones_v2f64:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-SSE: # %bb.0:
|
2016-02-15 21:50:48 +08:00
|
|
|
; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm0
|
|
|
|
; X64-SSE-NEXT: retq
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-AVX-LABEL: allones_v2f64:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-AVX: # %bb.0:
|
2016-02-15 21:50:48 +08:00
|
|
|
; X64-AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
|
|
|
|
; X64-AVX-NEXT: retq
|
|
|
|
ret <2 x double> <double 0xffffffffffffffff, double 0xffffffffffffffff>
|
2008-03-12 05:37:00 +08:00
|
|
|
}
|
2016-02-15 21:50:48 +08:00
|
|
|
|
2017-04-28 19:12:30 +08:00
|
|
|
define <4 x float> @allones_v4f32() nounwind {
|
|
|
|
; X32-SSE-LABEL: allones_v4f32:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-SSE: # %bb.0:
|
2016-02-15 21:50:48 +08:00
|
|
|
; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0
|
|
|
|
; X32-SSE-NEXT: retl
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X32-AVX-LABEL: allones_v4f32:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-AVX: # %bb.0:
|
2016-02-15 21:50:48 +08:00
|
|
|
; X32-AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
|
|
|
|
; X32-AVX-NEXT: retl
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-SSE-LABEL: allones_v4f32:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-SSE: # %bb.0:
|
2016-02-15 21:50:48 +08:00
|
|
|
; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm0
|
|
|
|
; X64-SSE-NEXT: retq
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-AVX-LABEL: allones_v4f32:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-AVX: # %bb.0:
|
2016-02-15 21:50:48 +08:00
|
|
|
; X64-AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
|
|
|
|
; X64-AVX-NEXT: retq
|
|
|
|
ret <4 x float> <float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000>
|
2008-03-12 05:37:00 +08:00
|
|
|
}
|
2016-09-27 14:44:27 +08:00
|
|
|
|
2017-04-28 19:12:30 +08:00
|
|
|
define <32 x i8> @allones_v32i8() nounwind {
|
|
|
|
; X32-SSE-LABEL: allones_v32i8:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-SSE: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0
|
|
|
|
; X32-SSE-NEXT: pcmpeqd %xmm1, %xmm1
|
|
|
|
; X32-SSE-NEXT: retl
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X32-AVX1-LABEL: allones_v32i8:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-AVX1: # %bb.0:
|
2017-07-28 01:47:01 +08:00
|
|
|
; X32-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
|
[x86, SSE] AVX1 PR28129 (256-bit all-ones rematerialization)
Further perf tests on Jaguar indicate that:
vxorps %ymm0, %ymm0, %ymm0
vcmpps $15, %ymm0, %ymm0, %ymm0
is consistently faster (by about 9%) than:
vpcmpeqd %xmm0, %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm0, %ymm0
Testing equivalent code on a SandyBridge (E5-2640) puts it slightly (~3%) faster as well.
Committed on behalf of @dtemirbulatov
Differential Revision: https://reviews.llvm.org/D32416
llvm-svn: 302989
2017-05-13 21:42:35 +08:00
|
|
|
; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
|
2016-09-27 14:44:27 +08:00
|
|
|
; X32-AVX1-NEXT: retl
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X32-AVX256-LABEL: allones_v32i8:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-AVX256: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X32-AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
|
|
|
|
; X32-AVX256-NEXT: retl
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-SSE-LABEL: allones_v32i8:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-SSE: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm0
|
|
|
|
; X64-SSE-NEXT: pcmpeqd %xmm1, %xmm1
|
|
|
|
; X64-SSE-NEXT: retq
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-AVX1-LABEL: allones_v32i8:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-AVX1: # %bb.0:
|
2017-07-28 01:47:01 +08:00
|
|
|
; X64-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
|
[x86, SSE] AVX1 PR28129 (256-bit all-ones rematerialization)
Further perf tests on Jaguar indicate that:
vxorps %ymm0, %ymm0, %ymm0
vcmpps $15, %ymm0, %ymm0, %ymm0
is consistently faster (by about 9%) than:
vpcmpeqd %xmm0, %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm0, %ymm0
Testing equivalent code on a SandyBridge (E5-2640) puts it slightly (~3%) faster as well.
Committed on behalf of @dtemirbulatov
Differential Revision: https://reviews.llvm.org/D32416
llvm-svn: 302989
2017-05-13 21:42:35 +08:00
|
|
|
; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
|
2016-09-27 14:44:27 +08:00
|
|
|
; X64-AVX1-NEXT: retq
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-AVX256-LABEL: allones_v32i8:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-AVX256: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X64-AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
|
|
|
|
; X64-AVX256-NEXT: retq
|
|
|
|
ret <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
|
|
|
|
}
|
|
|
|
|
2017-04-28 19:12:30 +08:00
|
|
|
define <16 x i16> @allones_v16i16() nounwind {
|
|
|
|
; X32-SSE-LABEL: allones_v16i16:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-SSE: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0
|
|
|
|
; X32-SSE-NEXT: pcmpeqd %xmm1, %xmm1
|
|
|
|
; X32-SSE-NEXT: retl
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X32-AVX1-LABEL: allones_v16i16:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-AVX1: # %bb.0:
|
2017-07-28 01:47:01 +08:00
|
|
|
; X32-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
|
[x86, SSE] AVX1 PR28129 (256-bit all-ones rematerialization)
Further perf tests on Jaguar indicate that:
vxorps %ymm0, %ymm0, %ymm0
vcmpps $15, %ymm0, %ymm0, %ymm0
is consistently faster (by about 9%) than:
vpcmpeqd %xmm0, %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm0, %ymm0
Testing equivalent code on a SandyBridge (E5-2640) puts it slightly (~3%) faster as well.
Committed on behalf of @dtemirbulatov
Differential Revision: https://reviews.llvm.org/D32416
llvm-svn: 302989
2017-05-13 21:42:35 +08:00
|
|
|
; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
|
2016-09-27 14:44:27 +08:00
|
|
|
; X32-AVX1-NEXT: retl
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X32-AVX256-LABEL: allones_v16i16:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-AVX256: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X32-AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
|
|
|
|
; X32-AVX256-NEXT: retl
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-SSE-LABEL: allones_v16i16:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-SSE: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm0
|
|
|
|
; X64-SSE-NEXT: pcmpeqd %xmm1, %xmm1
|
|
|
|
; X64-SSE-NEXT: retq
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-AVX1-LABEL: allones_v16i16:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-AVX1: # %bb.0:
|
2017-07-28 01:47:01 +08:00
|
|
|
; X64-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
|
[x86, SSE] AVX1 PR28129 (256-bit all-ones rematerialization)
Further perf tests on Jaguar indicate that:
vxorps %ymm0, %ymm0, %ymm0
vcmpps $15, %ymm0, %ymm0, %ymm0
is consistently faster (by about 9%) than:
vpcmpeqd %xmm0, %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm0, %ymm0
Testing equivalent code on a SandyBridge (E5-2640) puts it slightly (~3%) faster as well.
Committed on behalf of @dtemirbulatov
Differential Revision: https://reviews.llvm.org/D32416
llvm-svn: 302989
2017-05-13 21:42:35 +08:00
|
|
|
; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
|
2016-09-27 14:44:27 +08:00
|
|
|
; X64-AVX1-NEXT: retq
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-AVX256-LABEL: allones_v16i16:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-AVX256: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X64-AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
|
|
|
|
; X64-AVX256-NEXT: retq
|
|
|
|
ret <16 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
|
|
|
|
}
|
|
|
|
|
2017-04-28 19:12:30 +08:00
|
|
|
define <8 x i32> @allones_v8i32() nounwind {
|
|
|
|
; X32-SSE-LABEL: allones_v8i32:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-SSE: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0
|
|
|
|
; X32-SSE-NEXT: pcmpeqd %xmm1, %xmm1
|
|
|
|
; X32-SSE-NEXT: retl
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X32-AVX1-LABEL: allones_v8i32:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-AVX1: # %bb.0:
|
2017-07-28 01:47:01 +08:00
|
|
|
; X32-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
|
[x86, SSE] AVX1 PR28129 (256-bit all-ones rematerialization)
Further perf tests on Jaguar indicate that:
vxorps %ymm0, %ymm0, %ymm0
vcmpps $15, %ymm0, %ymm0, %ymm0
is consistently faster (by about 9%) than:
vpcmpeqd %xmm0, %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm0, %ymm0
Testing equivalent code on a SandyBridge (E5-2640) puts it slightly (~3%) faster as well.
Committed on behalf of @dtemirbulatov
Differential Revision: https://reviews.llvm.org/D32416
llvm-svn: 302989
2017-05-13 21:42:35 +08:00
|
|
|
; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
|
2016-09-27 14:44:27 +08:00
|
|
|
; X32-AVX1-NEXT: retl
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X32-AVX256-LABEL: allones_v8i32:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-AVX256: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X32-AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
|
|
|
|
; X32-AVX256-NEXT: retl
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-SSE-LABEL: allones_v8i32:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-SSE: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm0
|
|
|
|
; X64-SSE-NEXT: pcmpeqd %xmm1, %xmm1
|
|
|
|
; X64-SSE-NEXT: retq
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-AVX1-LABEL: allones_v8i32:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-AVX1: # %bb.0:
|
2017-07-28 01:47:01 +08:00
|
|
|
; X64-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
|
[x86, SSE] AVX1 PR28129 (256-bit all-ones rematerialization)
Further perf tests on Jaguar indicate that:
vxorps %ymm0, %ymm0, %ymm0
vcmpps $15, %ymm0, %ymm0, %ymm0
is consistently faster (by about 9%) than:
vpcmpeqd %xmm0, %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm0, %ymm0
Testing equivalent code on a SandyBridge (E5-2640) puts it slightly (~3%) faster as well.
Committed on behalf of @dtemirbulatov
Differential Revision: https://reviews.llvm.org/D32416
llvm-svn: 302989
2017-05-13 21:42:35 +08:00
|
|
|
; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
|
2016-09-27 14:44:27 +08:00
|
|
|
; X64-AVX1-NEXT: retq
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-AVX256-LABEL: allones_v8i32:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-AVX256: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X64-AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
|
|
|
|
; X64-AVX256-NEXT: retq
|
|
|
|
ret <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
|
|
|
|
}
|
|
|
|
|
2017-04-28 19:12:30 +08:00
|
|
|
define <4 x i64> @allones_v4i64() nounwind {
|
|
|
|
; X32-SSE-LABEL: allones_v4i64:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-SSE: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0
|
|
|
|
; X32-SSE-NEXT: pcmpeqd %xmm1, %xmm1
|
|
|
|
; X32-SSE-NEXT: retl
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X32-AVX1-LABEL: allones_v4i64:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-AVX1: # %bb.0:
|
2017-07-28 01:47:01 +08:00
|
|
|
; X32-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
|
[x86, SSE] AVX1 PR28129 (256-bit all-ones rematerialization)
Further perf tests on Jaguar indicate that:
vxorps %ymm0, %ymm0, %ymm0
vcmpps $15, %ymm0, %ymm0, %ymm0
is consistently faster (by about 9%) than:
vpcmpeqd %xmm0, %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm0, %ymm0
Testing equivalent code on a SandyBridge (E5-2640) puts it slightly (~3%) faster as well.
Committed on behalf of @dtemirbulatov
Differential Revision: https://reviews.llvm.org/D32416
llvm-svn: 302989
2017-05-13 21:42:35 +08:00
|
|
|
; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
|
2016-09-27 14:44:27 +08:00
|
|
|
; X32-AVX1-NEXT: retl
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X32-AVX256-LABEL: allones_v4i64:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-AVX256: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X32-AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
|
|
|
|
; X32-AVX256-NEXT: retl
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-SSE-LABEL: allones_v4i64:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-SSE: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm0
|
|
|
|
; X64-SSE-NEXT: pcmpeqd %xmm1, %xmm1
|
|
|
|
; X64-SSE-NEXT: retq
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-AVX1-LABEL: allones_v4i64:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-AVX1: # %bb.0:
|
2017-07-28 01:47:01 +08:00
|
|
|
; X64-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
|
[x86, SSE] AVX1 PR28129 (256-bit all-ones rematerialization)
Further perf tests on Jaguar indicate that:
vxorps %ymm0, %ymm0, %ymm0
vcmpps $15, %ymm0, %ymm0, %ymm0
is consistently faster (by about 9%) than:
vpcmpeqd %xmm0, %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm0, %ymm0
Testing equivalent code on a SandyBridge (E5-2640) puts it slightly (~3%) faster as well.
Committed on behalf of @dtemirbulatov
Differential Revision: https://reviews.llvm.org/D32416
llvm-svn: 302989
2017-05-13 21:42:35 +08:00
|
|
|
; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
|
2016-09-27 14:44:27 +08:00
|
|
|
; X64-AVX1-NEXT: retq
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-AVX256-LABEL: allones_v4i64:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-AVX256: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X64-AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
|
|
|
|
; X64-AVX256-NEXT: retq
|
|
|
|
ret <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>
|
|
|
|
}
|
|
|
|
|
2017-04-28 19:12:30 +08:00
|
|
|
define <4 x double> @allones_v4f64() nounwind {
|
|
|
|
; X32-SSE-LABEL: allones_v4f64:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-SSE: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0
|
|
|
|
; X32-SSE-NEXT: pcmpeqd %xmm1, %xmm1
|
|
|
|
; X32-SSE-NEXT: retl
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X32-AVX1-LABEL: allones_v4f64:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-AVX1: # %bb.0:
|
2017-07-28 01:47:01 +08:00
|
|
|
; X32-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
|
[x86, SSE] AVX1 PR28129 (256-bit all-ones rematerialization)
Further perf tests on Jaguar indicate that:
vxorps %ymm0, %ymm0, %ymm0
vcmpps $15, %ymm0, %ymm0, %ymm0
is consistently faster (by about 9%) than:
vpcmpeqd %xmm0, %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm0, %ymm0
Testing equivalent code on a SandyBridge (E5-2640) puts it slightly (~3%) faster as well.
Committed on behalf of @dtemirbulatov
Differential Revision: https://reviews.llvm.org/D32416
llvm-svn: 302989
2017-05-13 21:42:35 +08:00
|
|
|
; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
|
2016-09-27 14:44:27 +08:00
|
|
|
; X32-AVX1-NEXT: retl
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X32-AVX256-LABEL: allones_v4f64:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-AVX256: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X32-AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
|
|
|
|
; X32-AVX256-NEXT: retl
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-SSE-LABEL: allones_v4f64:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-SSE: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm0
|
|
|
|
; X64-SSE-NEXT: pcmpeqd %xmm1, %xmm1
|
|
|
|
; X64-SSE-NEXT: retq
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-AVX1-LABEL: allones_v4f64:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-AVX1: # %bb.0:
|
2017-07-28 01:47:01 +08:00
|
|
|
; X64-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
|
[x86, SSE] AVX1 PR28129 (256-bit all-ones rematerialization)
Further perf tests on Jaguar indicate that:
vxorps %ymm0, %ymm0, %ymm0
vcmpps $15, %ymm0, %ymm0, %ymm0
is consistently faster (by about 9%) than:
vpcmpeqd %xmm0, %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm0, %ymm0
Testing equivalent code on a SandyBridge (E5-2640) puts it slightly (~3%) faster as well.
Committed on behalf of @dtemirbulatov
Differential Revision: https://reviews.llvm.org/D32416
llvm-svn: 302989
2017-05-13 21:42:35 +08:00
|
|
|
; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
|
2016-09-27 14:44:27 +08:00
|
|
|
; X64-AVX1-NEXT: retq
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-AVX256-LABEL: allones_v4f64:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-AVX256: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X64-AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
|
|
|
|
; X64-AVX256-NEXT: retq
|
|
|
|
ret <4 x double> <double 0xffffffffffffffff, double 0xffffffffffffffff, double 0xffffffffffffffff, double 0xffffffffffffffff>
|
|
|
|
}
|
|
|
|
|
2017-04-28 19:12:30 +08:00
|
|
|
define <4 x double> @allones_v4f64_optsize() nounwind optsize {
|
|
|
|
; X32-SSE-LABEL: allones_v4f64_optsize:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-SSE: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0
|
|
|
|
; X32-SSE-NEXT: pcmpeqd %xmm1, %xmm1
|
|
|
|
; X32-SSE-NEXT: retl
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X32-AVX1-LABEL: allones_v4f64_optsize:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-AVX1: # %bb.0:
|
2017-07-28 01:47:01 +08:00
|
|
|
; X32-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
|
[x86, SSE] AVX1 PR28129 (256-bit all-ones rematerialization)
Further perf tests on Jaguar indicate that:
vxorps %ymm0, %ymm0, %ymm0
vcmpps $15, %ymm0, %ymm0, %ymm0
is consistently faster (by about 9%) than:
vpcmpeqd %xmm0, %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm0, %ymm0
Testing equivalent code on a SandyBridge (E5-2640) puts it slightly (~3%) faster as well.
Committed on behalf of @dtemirbulatov
Differential Revision: https://reviews.llvm.org/D32416
llvm-svn: 302989
2017-05-13 21:42:35 +08:00
|
|
|
; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
|
2016-09-27 14:44:27 +08:00
|
|
|
; X32-AVX1-NEXT: retl
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X32-AVX256-LABEL: allones_v4f64_optsize:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-AVX256: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X32-AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
|
|
|
|
; X32-AVX256-NEXT: retl
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-SSE-LABEL: allones_v4f64_optsize:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-SSE: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm0
|
|
|
|
; X64-SSE-NEXT: pcmpeqd %xmm1, %xmm1
|
|
|
|
; X64-SSE-NEXT: retq
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-AVX1-LABEL: allones_v4f64_optsize:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-AVX1: # %bb.0:
|
2017-07-28 01:47:01 +08:00
|
|
|
; X64-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
|
[x86, SSE] AVX1 PR28129 (256-bit all-ones rematerialization)
Further perf tests on Jaguar indicate that:
vxorps %ymm0, %ymm0, %ymm0
vcmpps $15, %ymm0, %ymm0, %ymm0
is consistently faster (by about 9%) than:
vpcmpeqd %xmm0, %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm0, %ymm0
Testing equivalent code on a SandyBridge (E5-2640) puts it slightly (~3%) faster as well.
Committed on behalf of @dtemirbulatov
Differential Revision: https://reviews.llvm.org/D32416
llvm-svn: 302989
2017-05-13 21:42:35 +08:00
|
|
|
; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
|
2016-09-27 14:44:27 +08:00
|
|
|
; X64-AVX1-NEXT: retq
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-AVX256-LABEL: allones_v4f64_optsize:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-AVX256: # %bb.0:
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
|
|
|
|
; X64-AVX256-NEXT: retq
|
|
|
|
ret <4 x double> <double 0xffffffffffffffff, double 0xffffffffffffffff, double 0xffffffffffffffff, double 0xffffffffffffffff>
|
|
|
|
}
|
|
|
|
|
|
|
|
define <8 x float> @allones_v8f32() nounwind {
|
|
|
|
; X32-SSE-LABEL: allones_v8f32:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-SSE: # %bb.0:
|
2017-04-28 19:12:30 +08:00
|
|
|
; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0
|
|
|
|
; X32-SSE-NEXT: pcmpeqd %xmm1, %xmm1
|
|
|
|
; X32-SSE-NEXT: retl
|
|
|
|
;
|
|
|
|
; X32-AVX1-LABEL: allones_v8f32:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-AVX1: # %bb.0:
|
2017-07-28 01:47:01 +08:00
|
|
|
; X32-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
|
[x86, SSE] AVX1 PR28129 (256-bit all-ones rematerialization)
Further perf tests on Jaguar indicate that:
vxorps %ymm0, %ymm0, %ymm0
vcmpps $15, %ymm0, %ymm0, %ymm0
is consistently faster (by about 9%) than:
vpcmpeqd %xmm0, %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm0, %ymm0
Testing equivalent code on a SandyBridge (E5-2640) puts it slightly (~3%) faster as well.
Committed on behalf of @dtemirbulatov
Differential Revision: https://reviews.llvm.org/D32416
llvm-svn: 302989
2017-05-13 21:42:35 +08:00
|
|
|
; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
|
2017-04-28 19:12:30 +08:00
|
|
|
; X32-AVX1-NEXT: retl
|
|
|
|
;
|
|
|
|
; X32-AVX256-LABEL: allones_v8f32:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-AVX256: # %bb.0:
|
2017-04-28 19:12:30 +08:00
|
|
|
; X32-AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
|
|
|
|
; X32-AVX256-NEXT: retl
|
|
|
|
;
|
|
|
|
; X64-SSE-LABEL: allones_v8f32:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-SSE: # %bb.0:
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm0
|
|
|
|
; X64-SSE-NEXT: pcmpeqd %xmm1, %xmm1
|
|
|
|
; X64-SSE-NEXT: retq
|
|
|
|
;
|
|
|
|
; X64-AVX1-LABEL: allones_v8f32:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-AVX1: # %bb.0:
|
2017-07-28 01:47:01 +08:00
|
|
|
; X64-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
|
[x86, SSE] AVX1 PR28129 (256-bit all-ones rematerialization)
Further perf tests on Jaguar indicate that:
vxorps %ymm0, %ymm0, %ymm0
vcmpps $15, %ymm0, %ymm0, %ymm0
is consistently faster (by about 9%) than:
vpcmpeqd %xmm0, %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm0, %ymm0
Testing equivalent code on a SandyBridge (E5-2640) puts it slightly (~3%) faster as well.
Committed on behalf of @dtemirbulatov
Differential Revision: https://reviews.llvm.org/D32416
llvm-svn: 302989
2017-05-13 21:42:35 +08:00
|
|
|
; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-AVX1-NEXT: retq
|
|
|
|
;
|
|
|
|
; X64-AVX256-LABEL: allones_v8f32:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-AVX256: # %bb.0:
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
|
|
|
|
; X64-AVX256-NEXT: retq
|
|
|
|
ret <8 x float> <float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000>
|
|
|
|
}
|
|
|
|
|
|
|
|
define <8 x float> @allones_v8f32_optsize() nounwind optsize {
|
|
|
|
; X32-SSE-LABEL: allones_v8f32_optsize:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-SSE: # %bb.0:
|
2017-04-28 19:12:30 +08:00
|
|
|
; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0
|
|
|
|
; X32-SSE-NEXT: pcmpeqd %xmm1, %xmm1
|
|
|
|
; X32-SSE-NEXT: retl
|
|
|
|
;
|
|
|
|
; X32-AVX1-LABEL: allones_v8f32_optsize:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-AVX1: # %bb.0:
|
2017-07-28 01:47:01 +08:00
|
|
|
; X32-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
|
[x86, SSE] AVX1 PR28129 (256-bit all-ones rematerialization)
Further perf tests on Jaguar indicate that:
vxorps %ymm0, %ymm0, %ymm0
vcmpps $15, %ymm0, %ymm0, %ymm0
is consistently faster (by about 9%) than:
vpcmpeqd %xmm0, %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm0, %ymm0
Testing equivalent code on a SandyBridge (E5-2640) puts it slightly (~3%) faster as well.
Committed on behalf of @dtemirbulatov
Differential Revision: https://reviews.llvm.org/D32416
llvm-svn: 302989
2017-05-13 21:42:35 +08:00
|
|
|
; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
|
2017-04-28 19:12:30 +08:00
|
|
|
; X32-AVX1-NEXT: retl
|
|
|
|
;
|
|
|
|
; X32-AVX256-LABEL: allones_v8f32_optsize:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-AVX256: # %bb.0:
|
2017-04-28 19:12:30 +08:00
|
|
|
; X32-AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
|
|
|
|
; X32-AVX256-NEXT: retl
|
|
|
|
;
|
|
|
|
; X64-SSE-LABEL: allones_v8f32_optsize:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-SSE: # %bb.0:
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm0
|
|
|
|
; X64-SSE-NEXT: pcmpeqd %xmm1, %xmm1
|
|
|
|
; X64-SSE-NEXT: retq
|
|
|
|
;
|
|
|
|
; X64-AVX1-LABEL: allones_v8f32_optsize:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-AVX1: # %bb.0:
|
2017-07-28 01:47:01 +08:00
|
|
|
; X64-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
|
[x86, SSE] AVX1 PR28129 (256-bit all-ones rematerialization)
Further perf tests on Jaguar indicate that:
vxorps %ymm0, %ymm0, %ymm0
vcmpps $15, %ymm0, %ymm0, %ymm0
is consistently faster (by about 9%) than:
vpcmpeqd %xmm0, %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm0, %ymm0
Testing equivalent code on a SandyBridge (E5-2640) puts it slightly (~3%) faster as well.
Committed on behalf of @dtemirbulatov
Differential Revision: https://reviews.llvm.org/D32416
llvm-svn: 302989
2017-05-13 21:42:35 +08:00
|
|
|
; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-AVX1-NEXT: retq
|
|
|
|
;
|
|
|
|
; X64-AVX256-LABEL: allones_v8f32_optsize:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-AVX256: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X64-AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
|
|
|
|
; X64-AVX256-NEXT: retq
|
|
|
|
ret <8 x float> <float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000>
|
|
|
|
}
|
|
|
|
|
2017-04-28 19:12:30 +08:00
|
|
|
define <64 x i8> @allones_v64i8() nounwind {
|
|
|
|
; X32-SSE-LABEL: allones_v64i8:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-SSE: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0
|
|
|
|
; X32-SSE-NEXT: pcmpeqd %xmm1, %xmm1
|
|
|
|
; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2
|
|
|
|
; X32-SSE-NEXT: pcmpeqd %xmm3, %xmm3
|
|
|
|
; X32-SSE-NEXT: retl
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X32-AVX1-LABEL: allones_v64i8:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-AVX1: # %bb.0:
|
2017-07-28 01:47:01 +08:00
|
|
|
; X32-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
|
[x86, SSE] AVX1 PR28129 (256-bit all-ones rematerialization)
Further perf tests on Jaguar indicate that:
vxorps %ymm0, %ymm0, %ymm0
vcmpps $15, %ymm0, %ymm0, %ymm0
is consistently faster (by about 9%) than:
vpcmpeqd %xmm0, %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm0, %ymm0
Testing equivalent code on a SandyBridge (E5-2640) puts it slightly (~3%) faster as well.
Committed on behalf of @dtemirbulatov
Differential Revision: https://reviews.llvm.org/D32416
llvm-svn: 302989
2017-05-13 21:42:35 +08:00
|
|
|
; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
|
2016-09-27 14:44:27 +08:00
|
|
|
; X32-AVX1-NEXT: vmovaps %ymm0, %ymm1
|
|
|
|
; X32-AVX1-NEXT: retl
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X32-AVX2-LABEL: allones_v64i8:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-AVX2: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X32-AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
|
|
|
|
; X32-AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
|
|
|
|
; X32-AVX2-NEXT: retl
|
|
|
|
;
|
2019-08-31 01:35:08 +08:00
|
|
|
; X32-AVX512-LABEL: allones_v64i8:
|
|
|
|
; X32-AVX512: # %bb.0:
|
|
|
|
; X32-AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
|
|
|
|
; X32-AVX512-NEXT: retl
|
2016-09-27 14:44:27 +08:00
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-SSE-LABEL: allones_v64i8:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-SSE: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm0
|
|
|
|
; X64-SSE-NEXT: pcmpeqd %xmm1, %xmm1
|
|
|
|
; X64-SSE-NEXT: pcmpeqd %xmm2, %xmm2
|
|
|
|
; X64-SSE-NEXT: pcmpeqd %xmm3, %xmm3
|
|
|
|
; X64-SSE-NEXT: retq
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-AVX1-LABEL: allones_v64i8:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-AVX1: # %bb.0:
|
2017-07-28 01:47:01 +08:00
|
|
|
; X64-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
|
[x86, SSE] AVX1 PR28129 (256-bit all-ones rematerialization)
Further perf tests on Jaguar indicate that:
vxorps %ymm0, %ymm0, %ymm0
vcmpps $15, %ymm0, %ymm0, %ymm0
is consistently faster (by about 9%) than:
vpcmpeqd %xmm0, %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm0, %ymm0
Testing equivalent code on a SandyBridge (E5-2640) puts it slightly (~3%) faster as well.
Committed on behalf of @dtemirbulatov
Differential Revision: https://reviews.llvm.org/D32416
llvm-svn: 302989
2017-05-13 21:42:35 +08:00
|
|
|
; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
|
2016-09-27 14:44:27 +08:00
|
|
|
; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1
|
|
|
|
; X64-AVX1-NEXT: retq
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-AVX2-LABEL: allones_v64i8:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-AVX2: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X64-AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
|
|
|
|
; X64-AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
|
|
|
|
; X64-AVX2-NEXT: retq
|
|
|
|
;
|
2019-08-31 01:35:08 +08:00
|
|
|
; X64-AVX512-LABEL: allones_v64i8:
|
|
|
|
; X64-AVX512: # %bb.0:
|
|
|
|
; X64-AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
|
|
|
|
; X64-AVX512-NEXT: retq
|
2016-09-27 14:44:27 +08:00
|
|
|
ret <64 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
|
|
|
|
}
|
|
|
|
|
2017-04-28 19:12:30 +08:00
|
|
|
; Returns a 512-bit all-ones constant typed as <32 x i16> (every lane is i16 -1).
; Expected lowering per the checks below, identical for the 32-bit and 64-bit
; triples apart from retl/retq: SSE issues four pcmpeqd (xmm0-xmm3); AVX1 issues
; vxorps + vcmptrueps and copies ymm0 into ymm1; AVX2 issues two vpcmpeqd
; (ymm0, ymm1); AVX512 issues a single vpternlogd $255 on zmm0.
define <32 x i16> @allones_v32i16() nounwind {
|
|
|
|
; X32-SSE-LABEL: allones_v32i16:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-SSE: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0
|
|
|
|
; X32-SSE-NEXT: pcmpeqd %xmm1, %xmm1
|
|
|
|
; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2
|
|
|
|
; X32-SSE-NEXT: pcmpeqd %xmm3, %xmm3
|
|
|
|
; X32-SSE-NEXT: retl
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X32-AVX1-LABEL: allones_v32i16:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-AVX1: # %bb.0:
|
2017-07-28 01:47:01 +08:00
|
|
|
; X32-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
|
[x86, SSE] AVX1 PR28129 (256-bit all-ones rematerialization)
Further perf tests on Jaguar indicate that:
vxorps %ymm0, %ymm0, %ymm0
vcmpps $15, %ymm0, %ymm0, %ymm0
is consistently faster (by about 9%) than:
vpcmpeqd %xmm0, %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm0, %ymm0
Testing equivalent code on a SandyBridge (E5-2640) puts it slightly (~3%) faster as well.
Committed on behalf of @dtemirbulatov
Differential Revision: https://reviews.llvm.org/D32416
llvm-svn: 302989
2017-05-13 21:42:35 +08:00
|
|
|
; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
|
2016-09-27 14:44:27 +08:00
|
|
|
; X32-AVX1-NEXT: vmovaps %ymm0, %ymm1
|
|
|
|
; X32-AVX1-NEXT: retl
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X32-AVX2-LABEL: allones_v32i16:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-AVX2: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X32-AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
|
|
|
|
; X32-AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
|
|
|
|
; X32-AVX2-NEXT: retl
|
|
|
|
;
|
2019-08-31 01:35:08 +08:00
|
|
|
; X32-AVX512-LABEL: allones_v32i16:
|
|
|
|
; X32-AVX512: # %bb.0:
|
|
|
|
; X32-AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
|
|
|
|
; X32-AVX512-NEXT: retl
|
2016-09-27 14:44:27 +08:00
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-SSE-LABEL: allones_v32i16:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-SSE: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm0
|
|
|
|
; X64-SSE-NEXT: pcmpeqd %xmm1, %xmm1
|
|
|
|
; X64-SSE-NEXT: pcmpeqd %xmm2, %xmm2
|
|
|
|
; X64-SSE-NEXT: pcmpeqd %xmm3, %xmm3
|
|
|
|
; X64-SSE-NEXT: retq
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-AVX1-LABEL: allones_v32i16:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-AVX1: # %bb.0:
|
2017-07-28 01:47:01 +08:00
|
|
|
; X64-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
|
[x86, SSE] AVX1 PR28129 (256-bit all-ones rematerialization)
Further perf tests on Jaguar indicate that:
vxorps %ymm0, %ymm0, %ymm0
vcmpps $15, %ymm0, %ymm0, %ymm0
is consistently faster (by about 9%) than:
vpcmpeqd %xmm0, %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm0, %ymm0
Testing equivalent code on a SandyBridge (E5-2640) puts it slightly (~3%) faster as well.
Committed on behalf of @dtemirbulatov
Differential Revision: https://reviews.llvm.org/D32416
llvm-svn: 302989
2017-05-13 21:42:35 +08:00
|
|
|
; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
|
2016-09-27 14:44:27 +08:00
|
|
|
; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1
|
|
|
|
; X64-AVX1-NEXT: retq
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-AVX2-LABEL: allones_v32i16:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-AVX2: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X64-AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
|
|
|
|
; X64-AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
|
|
|
|
; X64-AVX2-NEXT: retq
|
|
|
|
;
|
2019-08-31 01:35:08 +08:00
|
|
|
; X64-AVX512-LABEL: allones_v32i16:
|
|
|
|
; X64-AVX512: # %bb.0:
|
|
|
|
; X64-AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
|
|
|
|
; X64-AVX512-NEXT: retq
|
2016-09-27 14:44:27 +08:00
|
|
|
ret <32 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
|
|
|
|
}
|
|
|
|
|
2017-04-28 19:12:30 +08:00
|
|
|
; Returns a 512-bit all-ones constant typed as <16 x i32> (every lane is i32 -1).
; Expected lowering per the checks below, identical for the 32-bit and 64-bit
; triples apart from retl/retq: SSE issues four pcmpeqd (xmm0-xmm3); AVX1 issues
; vxorps + vcmptrueps and copies ymm0 into ymm1; AVX2 issues two vpcmpeqd
; (ymm0, ymm1); AVX512 issues a single vpternlogd $255 on zmm0.
define <16 x i32> @allones_v16i32() nounwind {
|
|
|
|
; X32-SSE-LABEL: allones_v16i32:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-SSE: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0
|
|
|
|
; X32-SSE-NEXT: pcmpeqd %xmm1, %xmm1
|
|
|
|
; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2
|
|
|
|
; X32-SSE-NEXT: pcmpeqd %xmm3, %xmm3
|
|
|
|
; X32-SSE-NEXT: retl
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X32-AVX1-LABEL: allones_v16i32:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-AVX1: # %bb.0:
|
2017-07-28 01:47:01 +08:00
|
|
|
; X32-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
|
[x86, SSE] AVX1 PR28129 (256-bit all-ones rematerialization)
Further perf tests on Jaguar indicate that:
vxorps %ymm0, %ymm0, %ymm0
vcmpps $15, %ymm0, %ymm0, %ymm0
is consistently faster (by about 9%) than:
vpcmpeqd %xmm0, %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm0, %ymm0
Testing equivalent code on a SandyBridge (E5-2640) puts it slightly (~3%) faster as well.
Committed on behalf of @dtemirbulatov
Differential Revision: https://reviews.llvm.org/D32416
llvm-svn: 302989
2017-05-13 21:42:35 +08:00
|
|
|
; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
|
2016-09-27 14:44:27 +08:00
|
|
|
; X32-AVX1-NEXT: vmovaps %ymm0, %ymm1
|
|
|
|
; X32-AVX1-NEXT: retl
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X32-AVX2-LABEL: allones_v16i32:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-AVX2: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X32-AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
|
|
|
|
; X32-AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
|
|
|
|
; X32-AVX2-NEXT: retl
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X32-AVX512-LABEL: allones_v16i32:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-AVX512: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X32-AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
|
|
|
|
; X32-AVX512-NEXT: retl
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-SSE-LABEL: allones_v16i32:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-SSE: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm0
|
|
|
|
; X64-SSE-NEXT: pcmpeqd %xmm1, %xmm1
|
|
|
|
; X64-SSE-NEXT: pcmpeqd %xmm2, %xmm2
|
|
|
|
; X64-SSE-NEXT: pcmpeqd %xmm3, %xmm3
|
|
|
|
; X64-SSE-NEXT: retq
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-AVX1-LABEL: allones_v16i32:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-AVX1: # %bb.0:
|
2017-07-28 01:47:01 +08:00
|
|
|
; X64-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
|
[x86, SSE] AVX1 PR28129 (256-bit all-ones rematerialization)
Further perf tests on Jaguar indicate that:
vxorps %ymm0, %ymm0, %ymm0
vcmpps $15, %ymm0, %ymm0, %ymm0
is consistently faster (by about 9%) than:
vpcmpeqd %xmm0, %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm0, %ymm0
Testing equivalent code on a SandyBridge (E5-2640) puts it slightly (~3%) faster as well.
Committed on behalf of @dtemirbulatov
Differential Revision: https://reviews.llvm.org/D32416
llvm-svn: 302989
2017-05-13 21:42:35 +08:00
|
|
|
; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
|
2016-09-27 14:44:27 +08:00
|
|
|
; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1
|
|
|
|
; X64-AVX1-NEXT: retq
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-AVX2-LABEL: allones_v16i32:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-AVX2: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X64-AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
|
|
|
|
; X64-AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
|
|
|
|
; X64-AVX2-NEXT: retq
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-AVX512-LABEL: allones_v16i32:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-AVX512: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X64-AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
|
|
|
|
; X64-AVX512-NEXT: retq
|
|
|
|
ret <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
|
|
|
|
}
|
|
|
|
|
2017-04-28 19:12:30 +08:00
|
|
|
; Returns a 512-bit all-ones constant typed as <8 x i64> (every lane is i64 -1).
; Expected lowering per the checks below, identical for the 32-bit and 64-bit
; triples apart from retl/retq: SSE issues four pcmpeqd (xmm0-xmm3); AVX1 issues
; vxorps + vcmptrueps and copies ymm0 into ymm1; AVX2 issues two vpcmpeqd
; (ymm0, ymm1); AVX512 issues a single vpternlogd $255 on zmm0.
define <8 x i64> @allones_v8i64() nounwind {
|
|
|
|
; X32-SSE-LABEL: allones_v8i64:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-SSE: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0
|
|
|
|
; X32-SSE-NEXT: pcmpeqd %xmm1, %xmm1
|
|
|
|
; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2
|
|
|
|
; X32-SSE-NEXT: pcmpeqd %xmm3, %xmm3
|
|
|
|
; X32-SSE-NEXT: retl
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X32-AVX1-LABEL: allones_v8i64:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-AVX1: # %bb.0:
|
2017-07-28 01:47:01 +08:00
|
|
|
; X32-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
|
[x86, SSE] AVX1 PR28129 (256-bit all-ones rematerialization)
Further perf tests on Jaguar indicate that:
vxorps %ymm0, %ymm0, %ymm0
vcmpps $15, %ymm0, %ymm0, %ymm0
is consistently faster (by about 9%) than:
vpcmpeqd %xmm0, %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm0, %ymm0
Testing equivalent code on a SandyBridge (E5-2640) puts it slightly (~3%) faster as well.
Committed on behalf of @dtemirbulatov
Differential Revision: https://reviews.llvm.org/D32416
llvm-svn: 302989
2017-05-13 21:42:35 +08:00
|
|
|
; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
|
2016-09-27 14:44:27 +08:00
|
|
|
; X32-AVX1-NEXT: vmovaps %ymm0, %ymm1
|
|
|
|
; X32-AVX1-NEXT: retl
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X32-AVX2-LABEL: allones_v8i64:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-AVX2: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X32-AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
|
|
|
|
; X32-AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
|
|
|
|
; X32-AVX2-NEXT: retl
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X32-AVX512-LABEL: allones_v8i64:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-AVX512: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X32-AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
|
|
|
|
; X32-AVX512-NEXT: retl
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-SSE-LABEL: allones_v8i64:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-SSE: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm0
|
|
|
|
; X64-SSE-NEXT: pcmpeqd %xmm1, %xmm1
|
|
|
|
; X64-SSE-NEXT: pcmpeqd %xmm2, %xmm2
|
|
|
|
; X64-SSE-NEXT: pcmpeqd %xmm3, %xmm3
|
|
|
|
; X64-SSE-NEXT: retq
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-AVX1-LABEL: allones_v8i64:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-AVX1: # %bb.0:
|
2017-07-28 01:47:01 +08:00
|
|
|
; X64-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
|
[x86, SSE] AVX1 PR28129 (256-bit all-ones rematerialization)
Further perf tests on Jaguar indicate that:
vxorps %ymm0, %ymm0, %ymm0
vcmpps $15, %ymm0, %ymm0, %ymm0
is consistently faster (by about 9%) than:
vpcmpeqd %xmm0, %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm0, %ymm0
Testing equivalent code on a SandyBridge (E5-2640) puts it slightly (~3%) faster as well.
Committed on behalf of @dtemirbulatov
Differential Revision: https://reviews.llvm.org/D32416
llvm-svn: 302989
2017-05-13 21:42:35 +08:00
|
|
|
; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
|
2016-09-27 14:44:27 +08:00
|
|
|
; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1
|
|
|
|
; X64-AVX1-NEXT: retq
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-AVX2-LABEL: allones_v8i64:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-AVX2: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X64-AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
|
|
|
|
; X64-AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
|
|
|
|
; X64-AVX2-NEXT: retq
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-AVX512-LABEL: allones_v8i64:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-AVX512: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X64-AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
|
|
|
|
; X64-AVX512-NEXT: retq
|
|
|
|
ret <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
|
|
|
|
}
|
|
|
|
|
2017-04-28 19:12:30 +08:00
|
|
|
; Returns an <8 x double> constant in which every lane is written as
; 0xffffffffffffffff, i.e. a 512-bit all-ones bit pattern.
; Expected lowering per the checks below, identical for the 32-bit and 64-bit
; triples apart from retl/retq: SSE issues four pcmpeqd (xmm0-xmm3); AVX1 issues
; vxorps + vcmptrueps and copies ymm0 into ymm1; AVX2 issues two vpcmpeqd
; (ymm0, ymm1); AVX512 issues a single vpternlogd $255 on zmm0.
define <8 x double> @allones_v8f64() nounwind {
|
|
|
|
; X32-SSE-LABEL: allones_v8f64:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-SSE: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0
|
|
|
|
; X32-SSE-NEXT: pcmpeqd %xmm1, %xmm1
|
|
|
|
; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2
|
|
|
|
; X32-SSE-NEXT: pcmpeqd %xmm3, %xmm3
|
|
|
|
; X32-SSE-NEXT: retl
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X32-AVX1-LABEL: allones_v8f64:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-AVX1: # %bb.0:
|
2017-07-28 01:47:01 +08:00
|
|
|
; X32-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
|
[x86, SSE] AVX1 PR28129 (256-bit all-ones rematerialization)
Further perf tests on Jaguar indicate that:
vxorps %ymm0, %ymm0, %ymm0
vcmpps $15, %ymm0, %ymm0, %ymm0
is consistently faster (by about 9%) than:
vpcmpeqd %xmm0, %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm0, %ymm0
Testing equivalent code on a SandyBridge (E5-2640) puts it slightly (~3%) faster as well.
Committed on behalf of @dtemirbulatov
Differential Revision: https://reviews.llvm.org/D32416
llvm-svn: 302989
2017-05-13 21:42:35 +08:00
|
|
|
; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
|
2016-09-27 14:44:27 +08:00
|
|
|
; X32-AVX1-NEXT: vmovaps %ymm0, %ymm1
|
|
|
|
; X32-AVX1-NEXT: retl
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X32-AVX2-LABEL: allones_v8f64:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-AVX2: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X32-AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
|
|
|
|
; X32-AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
|
|
|
|
; X32-AVX2-NEXT: retl
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X32-AVX512-LABEL: allones_v8f64:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-AVX512: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X32-AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
|
|
|
|
; X32-AVX512-NEXT: retl
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-SSE-LABEL: allones_v8f64:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-SSE: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm0
|
|
|
|
; X64-SSE-NEXT: pcmpeqd %xmm1, %xmm1
|
|
|
|
; X64-SSE-NEXT: pcmpeqd %xmm2, %xmm2
|
|
|
|
; X64-SSE-NEXT: pcmpeqd %xmm3, %xmm3
|
|
|
|
; X64-SSE-NEXT: retq
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-AVX1-LABEL: allones_v8f64:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-AVX1: # %bb.0:
|
2017-07-28 01:47:01 +08:00
|
|
|
; X64-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
|
[x86, SSE] AVX1 PR28129 (256-bit all-ones rematerialization)
Further perf tests on Jaguar indicate that:
vxorps %ymm0, %ymm0, %ymm0
vcmpps $15, %ymm0, %ymm0, %ymm0
is consistently faster (by about 9%) than:
vpcmpeqd %xmm0, %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm0, %ymm0
Testing equivalent code on a SandyBridge (E5-2640) puts it slightly (~3%) faster as well.
Committed on behalf of @dtemirbulatov
Differential Revision: https://reviews.llvm.org/D32416
llvm-svn: 302989
2017-05-13 21:42:35 +08:00
|
|
|
; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
|
2016-09-27 14:44:27 +08:00
|
|
|
; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1
|
|
|
|
; X64-AVX1-NEXT: retq
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-AVX2-LABEL: allones_v8f64:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-AVX2: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X64-AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
|
|
|
|
; X64-AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
|
|
|
|
; X64-AVX2-NEXT: retq
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-AVX512-LABEL: allones_v8f64:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-AVX512: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X64-AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
|
|
|
|
; X64-AVX512-NEXT: retq
|
|
|
|
ret <8 x double> <double 0xffffffffffffffff, double 0xffffffffffffffff, double 0xffffffffffffffff, double 0xffffffffffffffff, double 0xffffffffffffffff, double 0xffffffffffffffff, double 0xffffffffffffffff, double 0xffffffffffffffff>
|
|
|
|
}
|
|
|
|
|
2017-04-28 19:12:30 +08:00
|
|
|
; Returns a <16 x float> constant in which every lane is written as
; 0xffffffffe0000000 in the IR.
; NOTE(review): presumably this is the hex spelling of a float whose 32 bits are
; all ones; that matches the function name and the all-ones materialization
; sequences checked below, but confirm against the LangRef.
; Expected lowering per the checks below, identical for the 32-bit and 64-bit
; triples apart from retl/retq: SSE issues four pcmpeqd (xmm0-xmm3); AVX1 issues
; vxorps + vcmptrueps and copies ymm0 into ymm1; AVX2 issues two vpcmpeqd
; (ymm0, ymm1); AVX512 issues a single vpternlogd $255 on zmm0.
define <16 x float> @allones_v16f32() nounwind {
|
|
|
|
; X32-SSE-LABEL: allones_v16f32:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-SSE: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0
|
|
|
|
; X32-SSE-NEXT: pcmpeqd %xmm1, %xmm1
|
|
|
|
; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2
|
|
|
|
; X32-SSE-NEXT: pcmpeqd %xmm3, %xmm3
|
|
|
|
; X32-SSE-NEXT: retl
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X32-AVX1-LABEL: allones_v16f32:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-AVX1: # %bb.0:
|
2017-07-28 01:47:01 +08:00
|
|
|
; X32-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
|
[x86, SSE] AVX1 PR28129 (256-bit all-ones rematerialization)
Further perf tests on Jaguar indicate that:
vxorps %ymm0, %ymm0, %ymm0
vcmpps $15, %ymm0, %ymm0, %ymm0
is consistently faster (by about 9%) than:
vpcmpeqd %xmm0, %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm0, %ymm0
Testing equivalent code on a SandyBridge (E5-2640) puts it slightly (~3%) faster as well.
Committed on behalf of @dtemirbulatov
Differential Revision: https://reviews.llvm.org/D32416
llvm-svn: 302989
2017-05-13 21:42:35 +08:00
|
|
|
; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
|
2016-09-27 14:44:27 +08:00
|
|
|
; X32-AVX1-NEXT: vmovaps %ymm0, %ymm1
|
|
|
|
; X32-AVX1-NEXT: retl
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X32-AVX2-LABEL: allones_v16f32:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-AVX2: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X32-AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
|
|
|
|
; X32-AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
|
|
|
|
; X32-AVX2-NEXT: retl
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X32-AVX512-LABEL: allones_v16f32:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X32-AVX512: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X32-AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
|
|
|
|
; X32-AVX512-NEXT: retl
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-SSE-LABEL: allones_v16f32:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-SSE: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm0
|
|
|
|
; X64-SSE-NEXT: pcmpeqd %xmm1, %xmm1
|
|
|
|
; X64-SSE-NEXT: pcmpeqd %xmm2, %xmm2
|
|
|
|
; X64-SSE-NEXT: pcmpeqd %xmm3, %xmm3
|
|
|
|
; X64-SSE-NEXT: retq
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-AVX1-LABEL: allones_v16f32:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-AVX1: # %bb.0:
|
2017-07-28 01:47:01 +08:00
|
|
|
; X64-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
|
[x86, SSE] AVX1 PR28129 (256-bit all-ones rematerialization)
Further perf tests on Jaguar indicate that:
vxorps %ymm0, %ymm0, %ymm0
vcmpps $15, %ymm0, %ymm0, %ymm0
is consistently faster (by about 9%) than:
vpcmpeqd %xmm0, %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm0, %ymm0
Testing equivalent code on a SandyBridge (E5-2640) puts it slightly (~3%) faster as well.
Committed on behalf of @dtemirbulatov
Differential Revision: https://reviews.llvm.org/D32416
llvm-svn: 302989
2017-05-13 21:42:35 +08:00
|
|
|
; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
|
2016-09-27 14:44:27 +08:00
|
|
|
; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1
|
|
|
|
; X64-AVX1-NEXT: retq
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-AVX2-LABEL: allones_v16f32:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-AVX2: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X64-AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
|
|
|
|
; X64-AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
|
|
|
|
; X64-AVX2-NEXT: retq
|
|
|
|
;
|
2017-04-28 19:12:30 +08:00
|
|
|
; X64-AVX512-LABEL: allones_v16f32:
|
2017-12-05 01:18:51 +08:00
|
|
|
; X64-AVX512: # %bb.0:
|
2016-09-27 14:44:27 +08:00
|
|
|
; X64-AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
|
|
|
|
; X64-AVX512-NEXT: retq
|
|
|
|
ret <16 x float> <float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000>
|
|
|
|
}
|