From 879c5b15c404fc48dde24ea292a4c07646ff8963 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sun, 5 Nov 2017 19:48:24 +0000 Subject: [PATCH] [X86][SSE] Tests for integer min/max horizontal reductions Matching patterns that vectorizers should have created for us. The experimental intrinsics should probably be added as well. llvm-svn: 317439 --- .../CodeGen/X86/horizontal-reduce-smax.ll | 1896 ++++++++++++++ .../CodeGen/X86/horizontal-reduce-smin.ll | 1898 ++++++++++++++ .../CodeGen/X86/horizontal-reduce-umax.ll | 2203 ++++++++++++++++ .../CodeGen/X86/horizontal-reduce-umin.ll | 2207 +++++++++++++++++ 4 files changed, 8204 insertions(+) create mode 100644 llvm/test/CodeGen/X86/horizontal-reduce-smax.ll create mode 100644 llvm/test/CodeGen/X86/horizontal-reduce-smin.ll create mode 100644 llvm/test/CodeGen/X86/horizontal-reduce-umax.ll create mode 100644 llvm/test/CodeGen/X86/horizontal-reduce-umin.ll diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll b/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll new file mode 100644 index 000000000000..8f5aac493b54 --- /dev/null +++ b/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll @@ -0,0 +1,1896 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE2 +; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE42 +; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX1 +; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX2 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE2 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE42 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX512 + +; +; 128-bit Vectors +; + +define i64 @test_reduce_v2i64(<2 x i64> %a0) { +; X86-SSE2-LABEL: test_reduce_v2i64: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: pxor %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm4 +; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm5, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; X86-SSE2-NEXT: por %xmm2, %xmm3 +; X86-SSE2-NEXT: pand %xmm3, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm3 +; X86-SSE2-NEXT: por %xmm0, %xmm3 +; X86-SSE2-NEXT: movd %xmm3, %eax +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] +; X86-SSE2-NEXT: movd %xmm0, %edx +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: 
test_reduce_v2i64: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; X86-SSE42-NEXT: movd %xmm2, %eax +; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx +; X86-SSE42-NEXT: retl +; +; X86-AVX-LABEL: test_reduce_v2i64: +; X86-AVX: ## BB#0: +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; X86-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX-NEXT: vmovd %xmm0, %eax +; X86-AVX-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v2i64: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE2-NEXT: pxor %xmm2, %xmm3 +; X64-SSE2-NEXT: pxor %xmm1, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm3, %xmm4 +; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm5, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; X64-SSE2-NEXT: por %xmm2, %xmm3 +; X64-SSE2-NEXT: pand %xmm3, %xmm0 +; X64-SSE2-NEXT: pandn %xmm1, %xmm3 +; X64-SSE2-NEXT: por %xmm0, %xmm3 +; X64-SSE2-NEXT: movq %xmm3, %rax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v2i64: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; X64-SSE42-NEXT: movq %xmm2, %rax +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v2i64: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v2i64: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v2i64: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovq %xmm0, %rax +; X64-AVX512-NEXT: retq + %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> <i32 1, i32 undef> + %2 = icmp sgt <2 x i64> %a0, %1 + %3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %1 + %4 = extractelement <2 x i64> %3, i32 0 + ret i64 %4 +} + +define i32 @test_reduce_v4i32(<4 x i32> %a0) { +; X86-SSE2-LABEL: test_reduce_v4i32: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v4i32: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +;
X86-SSE42-NEXT: pmaxsd %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0 +; X86-SSE42-NEXT: movd %xmm0, %eax +; X86-SSE42-NEXT: retl +; +; X86-AVX-LABEL: test_reduce_v4i32: +; X86-AVX: ## BB#0: +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovd %xmm0, %eax +; X86-AVX-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v4i32: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm0 +; X64-SSE2-NEXT: pandn %xmm1, %xmm2 +; X64-SSE2-NEXT: por %xmm0, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X64-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm2, %xmm1 +; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v4i32: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pmaxsd %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0 +; X64-SSE42-NEXT: movd %xmm0, %eax +; X64-SSE42-NEXT: retq +; +; X64-AVX-LABEL: test_reduce_v4i32: +; X64-AVX: ## BB#0: +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vmovd %xmm0, %eax +; X64-AVX-NEXT: retq + %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> + %2 = icmp sgt <4 x i32> %a0, %1 + %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %1 + %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> + %5 = icmp sgt <4 x i32> %3, %4 + %6 = select <4 x i1> %5, <4 x i32> %3, <4 x i32> %4 + %7 = extractelement <4 x i32> %6, i32 0 + ret i32 %7 +} + +define i16 @test_reduce_v8i16(<8 x i16> %a0) { +; X86-SSE-LABEL: test_reduce_v8i16: +; X86-SSE: ## BB#0: +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE-NEXT: pmaxsw %xmm0, %xmm1 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE-NEXT: pmaxsw %xmm1, %xmm0 +; X86-SSE-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE-NEXT: psrld $16, %xmm1 +; X86-SSE-NEXT: pmaxsw %xmm0, %xmm1 +; X86-SSE-NEXT: movd %xmm1, %eax +; X86-SSE-NEXT: ## kill: %AX %AX %EAX +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: test_reduce_v8i16: +; X86-AVX: ## BB#0: +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovd %xmm0, %eax +; X86-AVX-NEXT: ## kill: %AX %AX %EAX +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: test_reduce_v8i16: +; X64-SSE: ## BB#0: +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE-NEXT: pmaxsw %xmm0, %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE-NEXT: pmaxsw %xmm1, %xmm0 +; X64-SSE-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE-NEXT: psrld $16, %xmm1 +; X64-SSE-NEXT: pmaxsw %xmm0, %xmm1 +; X64-SSE-NEXT: movd %xmm1, %eax +; X64-SSE-NEXT: ## kill: %AX %AX %EAX +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: test_reduce_v8i16: +; X64-AVX: ## BB#0: +;
X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vmovd %xmm0, %eax +; X64-AVX-NEXT: ## kill: %AX %AX %EAX +; X64-AVX-NEXT: retq + %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp sgt <8 x i16> %a0, %1 + %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %1 + %4 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp sgt <8 x i16> %3, %4 + %6 = select <8 x i1> %5, <8 x i16> %3, <8 x i16> %4 + %7 = shufflevector <8 x i16> %6, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp sgt <8 x i16> %6, %7 + %9 = select <8 x i1> %8, <8 x i16> %6, <8 x i16> %7 + %10 = extractelement <8 x i16> %9, i32 0 + ret i16 %10 +} + +define i8 @test_reduce_v16i8(<16 x i8> %a0) { +; X86-SSE2-LABEL: test_reduce_v16i8: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: psrld $16, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pandn %xmm0, %xmm2 +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: psrlw $8, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: ## kill: %AL %AL %EAX +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v16i8: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE42-NEXT: psrld $16, %xmm1 +; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1 +; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE42-NEXT: psrlw $8, %xmm0 +; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0 +; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax +; X86-SSE42-NEXT: ## kill: %AL %AL %EAX +; X86-SSE42-NEXT: retl +; +; X86-AVX-LABEL: test_reduce_v16i8: +; X86-AVX: ## BB#0: +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax +; X86-AVX-NEXT: ## kill: %AL %AL %EAX +; X86-AVX-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v16i8: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm0 +; X64-SSE2-NEXT: pandn %xmm1, %xmm2 +; X64-SSE2-NEXT: por %xmm0, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}}
xmm0 = xmm2[1,1,2,3] +; X64-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm2, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE2-NEXT: psrld $16, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm1 +; X64-SSE2-NEXT: pandn %xmm0, %xmm2 +; X64-SSE2-NEXT: por %xmm1, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X64-SSE2-NEXT: psrlw $8, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm2, %xmm1 +; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: ## kill: %AL %AL %EAX +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v16i8: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE42-NEXT: psrld $16, %xmm1 +; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1 +; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE42-NEXT: psrlw $8, %xmm0 +; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0 +; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax +; X64-SSE42-NEXT: ## kill: %AL %AL %EAX +; X64-SSE42-NEXT: retq +; +; X64-AVX-LABEL: test_reduce_v16i8: +; X64-AVX: ## BB#0: +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX-NEXT: ## kill: %AL %AL %EAX +; X64-AVX-NEXT: retq + %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp sgt <16 x i8> %a0, %1 + %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %1 + %4 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp sgt <16 x i8> %3, %4 + %6 = select <16 x i1> %5, <16 x i8> %3, <16 x i8> %4 + %7 = shufflevector <16 x i8> %6, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp sgt <16 x i8> %6, %7 + %9 = select <16 x i1> %8, <16 x i8> %6, <16 x i8> %7 + %10 = shufflevector <16 x i8> %9, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %11 = icmp sgt <16 x i8> %9, %10 + %12 = select <16 x i1> %11, <16 x i8> %9, <16 x i8> %10 + %13 = extractelement <16 x i8> %12, i32 0 + ret i8 %13 +} + +; +; 256-bit Vectors +; + +define i64 @test_reduce_v4i64(<4 x i64> %a0) { +; X86-SSE2-LABEL: test_reduce_v4i64: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0] +; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 +; X86-SSE2-NEXT: pxor %xmm2, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm5 +; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm6, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; X86-SSE2-NEXT: por %xmm3, %xmm4 +; X86-SSE2-NEXT: pand %xmm4, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm4 +; X86-SSE2-NEXT: por %xmm0, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X86-SSE2-NEXT: pxor %xmm2,
%xmm1 +; X86-SSE2-NEXT: pxor %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm5, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm4 +; X86-SSE2-NEXT: pandn %xmm0, %xmm2 +; X86-SSE2-NEXT: por %xmm4, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X86-SSE2-NEXT: movd %xmm0, %edx +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v4i64: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; X86-SSE42-NEXT: movd %xmm2, %eax +; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx +; X86-SSE42-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v4i64: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v4i64: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 +; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 +; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v4i64: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0] +; X64-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X64-SSE2-NEXT: pxor %xmm2, %xmm3 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm4 +; X64-SSE2-NEXT: pxor %xmm2, %xmm4 +; X64-SSE2-NEXT: movdqa %xmm4, %xmm5 +; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm4 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm6, %xmm3 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; X64-SSE2-NEXT: por %xmm3, %xmm4 +; X64-SSE2-NEXT: pand %xmm4, %xmm0 +; X64-SSE2-NEXT: pandn %xmm1, %xmm4 +; X64-SSE2-NEXT: por %xmm0, %xmm4 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X64-SSE2-NEXT: pxor %xmm2, %xmm1 +; X64-SSE2-NEXT: pxor %xmm0, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm5, %xmm1 +; 
X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; X64-SSE2-NEXT: por %xmm1, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm4 +; X64-SSE2-NEXT: pandn %xmm0, %xmm2 +; X64-SSE2-NEXT: por %xmm4, %xmm2 +; X64-SSE2-NEXT: movq %xmm2, %rax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v4i64: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; X64-SSE42-NEXT: movq %xmm2, %rax +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v4i64: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v4i64: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 +; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 +; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v4i64: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vmovq %xmm0, %rax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> + %2 = icmp sgt <4 x i64> %a0, %1 + %3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %1 + %4 = shufflevector <4 x i64> %3, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> + %5 = icmp sgt <4 x i64> %3, %4 + %6 = select <4 x i1> %5, <4 x i64> %3, <4 x i64> %4 + %7 = extractelement <4 x i64> %6, i32 0 + ret i64 %7 +} + +define i32 @test_reduce_v8i32(<8 x i32> %a0) { +; X86-SSE2-LABEL: test_reduce_v8i32: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pandn %xmm0, %xmm2 +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v8i32: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0 +; X86-SSE42-NEXT: pshufd
{{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pmaxsd %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0 +; X86-SSE42-NEXT: movd %xmm0, %eax +; X86-SSE42-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v8i32: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v8i32: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v8i32: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm0 +; X64-SSE2-NEXT: pandn %xmm1, %xmm2 +; X64-SSE2-NEXT: por %xmm0, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm2, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm1 +; X64-SSE2-NEXT: pandn %xmm0, %xmm2 +; X64-SSE2-NEXT: por %xmm1, %xmm2 +; X64-SSE2-NEXT: movd %xmm2, %eax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v8i32: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pmaxsd %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0 +; X64-SSE42-NEXT: movd %xmm0, %eax +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v8i32: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v8i32: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v8i32: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX512-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; 
X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp sgt <8 x i32> %a0, %1 + %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %1 + %4 = shufflevector <8 x i32> %3, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp sgt <8 x i32> %3, %4 + %6 = select <8 x i1> %5, <8 x i32> %3, <8 x i32> %4 + %7 = shufflevector <8 x i32> %6, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp sgt <8 x i32> %6, %7 + %9 = select <8 x i1> %8, <8 x i32> %6, <8 x i32> %7 + %10 = extractelement <8 x i32> %9, i32 0 + ret i32 %10 +} + +define i16 @test_reduce_v16i16(<16 x i16> %a0) { +; X86-SSE-LABEL: test_reduce_v16i16: +; X86-SSE: ## BB#0: +; X86-SSE-NEXT: pmaxsw %xmm1, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE-NEXT: pmaxsw %xmm0, %xmm1 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE-NEXT: pmaxsw %xmm1, %xmm0 +; X86-SSE-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE-NEXT: psrld $16, %xmm1 +; X86-SSE-NEXT: pmaxsw %xmm0, %xmm1 +; X86-SSE-NEXT: movd %xmm1, %eax +; X86-SSE-NEXT: ## kill: %AX %AX %EAX +; X86-SSE-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v16i16: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: ## kill: %AX %AX %EAX +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v16i16: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: %AX %AX %EAX +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE-LABEL: test_reduce_v16i16: +; X64-SSE: ## BB#0: +; X64-SSE-NEXT: pmaxsw %xmm1, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE-NEXT: pmaxsw %xmm0, %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE-NEXT: pmaxsw %xmm1, %xmm0 +; X64-SSE-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE-NEXT: psrld $16, %xmm1 +; X64-SSE-NEXT: pmaxsw %xmm0, %xmm1 +; X64-SSE-NEXT: movd %xmm1, %eax +; X64-SSE-NEXT: ## kill: %AX %AX %EAX +; X64-SSE-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v16i16: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: ## kill: %AX %AX %EAX +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v16i16: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 =
xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: ## kill: %AX %AX %EAX +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v16i16: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: ## kill: %AX %AX %EAX +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp sgt <16 x i16> %a0, %1 + %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %1 + %4 = shufflevector <16 x i16> %3, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp sgt <16 x i16> %3, %4 + %6 = select <16 x i1> %5, <16 x i16> %3, <16 x i16> %4 + %7 = shufflevector <16 x i16> %6, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp sgt <16 x i16> %6, %7 + %9 = select <16 x i1> %8, <16 x i16> %6, <16 x i16> %7 + %10 = shufflevector <16 x i16> %9, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %11 = icmp sgt <16 x i16> %9, %10 + %12 = select <16 x i1> %11, <16 x i16> %9, <16 x i16> %10 + %13 = extractelement <16 x i16> %12, i32 0 + ret i16 %13 +} + +define i8 @test_reduce_v32i8(<32 x i8> %a0) { +; X86-SSE2-LABEL: test_reduce_v32i8: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pandn %xmm0, %xmm2 +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: psrld $16, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: psrlw $8, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pandn %xmm0, %xmm2 +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: ## kill: %AL %AL %EAX +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v32i8: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE42-NEXT: psrld $16, %xmm1 +; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1 +; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE42-NEXT: psrlw $8, %xmm0 +;
X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0 +; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax +; X86-SSE42-NEXT: ## kill: %AL %AL %EAX +; X86-SSE42-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v32i8: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax +; X86-AVX1-NEXT: ## kill: %AL %AL %EAX +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v32i8: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax +; X86-AVX2-NEXT: ## kill: %AL %AL %EAX +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v32i8: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm0 +; X64-SSE2-NEXT: pandn %xmm1, %xmm2 +; X64-SSE2-NEXT: por %xmm0, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm2, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm1 +; X64-SSE2-NEXT: pandn %xmm0, %xmm2 +; X64-SSE2-NEXT: por %xmm1, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X64-SSE2-NEXT: psrld $16, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm2, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE2-NEXT: psrlw $8, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm1 +; X64-SSE2-NEXT: pandn %xmm0, %xmm2 +; X64-SSE2-NEXT: por %xmm1, %xmm2 +; X64-SSE2-NEXT: movd %xmm2, %eax +; X64-SSE2-NEXT: ## kill: %AL %AL %EAX +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v32i8: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE42-NEXT: psrld $16, %xmm1 +; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1 +; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE42-NEXT: psrlw $8, %xmm0 +; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0 +; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax +; X64-SSE42-NEXT: ## kill: %AL %AL %EAX +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v32i8: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, 
%ymm0, %xmm1 +; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX1-NEXT: ## kill: %AL %AL %EAX +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v32i8: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX2-NEXT: ## kill: %AL %AL %EAX +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v32i8: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX512-NEXT: ## kill: %AL %AL %EAX +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp sgt <32 x i8> %a0, %1 + %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %1 + %4 = shufflevector <32 x i8> %3, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp sgt <32 x i8> %3, %4 + %6 = select <32 x i1> %5, <32 x i8> %3, <32 x i8> %4 + %7 = shufflevector <32 x i8> %6, <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp sgt <32 x i8> %6, %7 + %9 = select <32 x i1> %8, <32 x i8> %6, <32 x i8> %7 + %10 = shufflevector <32 x i8> %9, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %11 = icmp sgt <32 x i8> %9, %10 + %12 = select <32 x i1> %11, <32 x i8> %9, <32 x i8> %10 + %13 = shufflevector <32 x i8> %12, <32 x i8> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %14 = icmp sgt <32 x i8> %12, %13 + %15 = select <32 x i1> %14, <32 x i8> %12, <32 x i8> %13 + %16 = extractelement <32 x i8> %15, i32 0 + ret i8 %16 +} + +; +; 512-bit Vectors +; + +define i64 @test_reduce_v8i64(<8 x i64> %a0) { +; X86-SSE2-LABEL: test_reduce_v8i64: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: subl $28, %esp +; X86-SSE2-NEXT: .cfi_def_cfa_offset 32 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm5, (%esp) ## 16-byte Spill +; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,0,2147483648,0] +; X86-SSE2-NEXT: pxor %xmm4, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm6 +; X86-SSE2-NEXT: pxor %xmm4, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm6, %xmm7 +; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; X86-SSE2-NEXT: pcmpeqd %xmm5, %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] +;
X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm5, %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; X86-SSE2-NEXT: por %xmm6, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm6 +; X86-SSE2-NEXT: pxor %xmm4, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm7 +; X86-SSE2-NEXT: pxor %xmm4, %xmm7 +; X86-SSE2-NEXT: movdqa %xmm7, %xmm0 +; X86-SSE2-NEXT: pcmpgtd %xmm6, %xmm0 +; X86-SSE2-NEXT: pcmpeqd %xmm6, %xmm7 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] +; X86-SSE2-NEXT: pand %xmm6, %xmm7 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; X86-SSE2-NEXT: por %xmm7, %xmm6 +; X86-SSE2-NEXT: pand %xmm6, %xmm1 +; X86-SSE2-NEXT: pandn %xmm3, %xmm6 +; X86-SSE2-NEXT: por %xmm1, %xmm6 +; X86-SSE2-NEXT: pand %xmm5, %xmm2 +; X86-SSE2-NEXT: pandn (%esp), %xmm5 ## 16-byte Folded Reload +; X86-SSE2-NEXT: por %xmm2, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm0 +; X86-SSE2-NEXT: pxor %xmm4, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm6, %xmm1 +; X86-SSE2-NEXT: pxor %xmm4, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: por %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm0, %xmm6 +; X86-SSE2-NEXT: pandn %xmm5, %xmm0 +; X86-SSE2-NEXT: por %xmm6, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pxor %xmm4, %xmm2 +; X86-SSE2-NEXT: pxor %xmm1, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 +; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm3 +; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm2, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; X86-SSE2-NEXT: por %xmm4, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X86-SSE2-NEXT: movd %xmm0, %edx +; X86-SSE2-NEXT: addl $28, %esp +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v8i64: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: movdqa %xmm0, %xmm4 +; X86-SSE42-NEXT: movdqa %xmm4, %xmm5 +; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm5 +; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3 +; X86-SSE42-NEXT: movdqa %xmm5, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2 +; X86-SSE42-NEXT: movapd %xmm2, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] +; X86-SSE42-NEXT: movdqa %xmm3, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 +; X86-SSE42-NEXT: movd %xmm1, %eax +; X86-SSE42-NEXT: pextrd $1, %xmm1, %edx +; X86-SSE42-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v8i64: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 
+; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v8i64: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 +; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 +; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 +; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v8i64: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,0,2147483648,0] +; X64-SSE2-NEXT: movdqa %xmm3, %xmm5 +; X64-SSE2-NEXT: pxor %xmm4, %xmm5 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm6 +; X64-SSE2-NEXT: pxor %xmm4, %xmm6 +; X64-SSE2-NEXT: movdqa %xmm6, %xmm7 +; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm5, %xmm6 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm8, %xmm6 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3] +; X64-SSE2-NEXT: por %xmm6, %xmm8 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm6 +; X64-SSE2-NEXT: pxor %xmm4, %xmm6 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm7 +; X64-SSE2-NEXT: pxor %xmm4, %xmm7 +; X64-SSE2-NEXT: movdqa %xmm7, %xmm5 +; X64-SSE2-NEXT: pcmpgtd %xmm6, %xmm5 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm6, %xmm7 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm9, %xmm7 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] +; X64-SSE2-NEXT: por %xmm7, %xmm6 +; X64-SSE2-NEXT: pand %xmm6, %xmm0 +; X64-SSE2-NEXT: pandn %xmm2, %xmm6 +; X64-SSE2-NEXT: por %xmm0, %xmm6 +; X64-SSE2-NEXT: pand %xmm8, %xmm1 +; X64-SSE2-NEXT: pandn %xmm3, %xmm8 +; X64-SSE2-NEXT: por %xmm1, %xmm8 +; X64-SSE2-NEXT: movdqa %xmm8, %xmm0 +; X64-SSE2-NEXT: pxor %xmm4, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm6, %xmm1 +; X64-SSE2-NEXT: pxor %xmm4, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm3, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; X64-SSE2-NEXT: por %xmm0, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm6 +; X64-SSE2-NEXT: pandn %xmm8, %xmm1 +; X64-SSE2-NEXT: por %xmm6, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE2-NEXT: pxor %xmm4, %xmm2 +; X64-SSE2-NEXT: pxor %xmm0, %xmm4 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm3 +; X64-SSE2-NEXT: pcmpgtd %xmm4, %xmm3 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm2, %xmm4 +; 
X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm5, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; X64-SSE2-NEXT: por %xmm2, %xmm3 +; X64-SSE2-NEXT: pand %xmm3, %xmm1 +; X64-SSE2-NEXT: pandn %xmm0, %xmm3 +; X64-SSE2-NEXT: por %xmm1, %xmm3 +; X64-SSE2-NEXT: movq %xmm3, %rax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v8i64: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: movdqa %xmm0, %xmm4 +; X64-SSE42-NEXT: movdqa %xmm4, %xmm5 +; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm5 +; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3 +; X64-SSE42-NEXT: movdqa %xmm5, %xmm0 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2 +; X64-SSE42-NEXT: movapd %xmm2, %xmm0 +; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] +; X64-SSE42-NEXT: movdqa %xmm3, %xmm0 +; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 +; X64-SSE42-NEXT: movq %xmm1, %rax +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v8i64: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v8i64: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 +; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 +; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 +; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v8i64: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; X64-AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vmovq %xmm0, %rax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp sgt <8 x i64> %a0, %1 + %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %1 + %4 = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp sgt <8 x i64> %3, %4 + %6 = select <8 x i1> %5, <8 x i64> %3, <8 x i64> %4 + %7 = shufflevector <8 x i64> %6, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp sgt <8 x i64> %6, %7 + %9 =
select <8 x i1> %8, <8 x i64> %6, <8 x i64> %7 + %10 = extractelement <8 x i64> %9, i32 0 + ret i64 %10 +} + +define i32 @test_reduce_v16i32(<16 x i32> %a0) { +; X86-SSE2-LABEL: test_reduce_v16i32: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 +; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm5 +; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; X86-SSE2-NEXT: pand %xmm5, %xmm1 +; X86-SSE2-NEXT: pandn %xmm3, %xmm5 +; X86-SSE2-NEXT: por %xmm1, %xmm5 +; X86-SSE2-NEXT: pand %xmm4, %xmm0 +; X86-SSE2-NEXT: pandn %xmm2, %xmm4 +; X86-SSE2-NEXT: por %xmm0, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm0 +; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm0 +; X86-SSE2-NEXT: pand %xmm0, %xmm4 +; X86-SSE2-NEXT: pandn %xmm5, %xmm0 +; X86-SSE2-NEXT: por %xmm4, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v16i32: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pmaxsd %xmm3, %xmm1 +; X86-SSE42-NEXT: pmaxsd %xmm2, %xmm0 +; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pmaxsd %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0 +; X86-SSE42-NEXT: movd %xmm0, %eax +; X86-SSE42-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v16i32: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-AVX1-NEXT: vpmaxsd %xmm2, %xmm3, %xmm2 +; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpmaxsd %xmm2, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v16i32: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v16i32: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: movdqa %xmm0, %xmm4 +; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm5 +; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; X64-SSE2-NEXT: pand %xmm5, %xmm1 +; X64-SSE2-NEXT: pandn %xmm3, %xmm5 +; X64-SSE2-NEXT: por %xmm1, %xmm5 +; X64-SSE2-NEXT: pand %xmm4, %xmm0 +; X64-SSE2-NEXT: pandn %xmm2, %xmm4 +; X64-SSE2-NEXT: por %xmm0, %xmm4 +; X64-SSE2-NEXT: movdqa %xmm4, %xmm0 +; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm0 +; X64-SSE2-NEXT: pand %xmm0, %xmm4 +; X64-SSE2-NEXT: pandn %xmm5, %xmm0 +; X64-SSE2-NEXT: por %xmm4, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 +; 
X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm0 +; X64-SSE2-NEXT: pandn %xmm1, %xmm2 +; X64-SSE2-NEXT: por %xmm0, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X64-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm2, %xmm1 +; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v16i32: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pmaxsd %xmm3, %xmm1 +; X64-SSE42-NEXT: pmaxsd %xmm2, %xmm0 +; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pmaxsd %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0 +; X64-SSE42-NEXT: movd %xmm0, %eax +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v16i32: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X64-AVX1-NEXT: vpmaxsd %xmm2, %xmm3, %xmm2 +; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpmaxsd %xmm2, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v16i32: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v16i32: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; X64-AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp sgt <16 x i32> %a0, %1 + %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %1 + %4 = shufflevector <16 x i32> %3, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp sgt <16 x i32> %3, %4 + %6 = select <16 x i1> %5, <16 x i32> %3, <16 x i32> %4 + %7 = shufflevector <16 x i32> %6, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp sgt <16 x i32> %6, %7 + %9 = select <16 x i1> %8, <16 x i32> %6, <16 x i32> %7 + %10 = shufflevector <16 x i32> %9, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %11 = icmp sgt <16 x i32> %9, %10 + %12 = select <16 x i1> %11, <16 x i32> %9, <16 x i32> %10 + %13 = extractelement <16 x i32> %12, i32 0 + ret i32 %13 +} + +define i16 @test_reduce_v32i16(<32 x i16> %a0) { +; X86-SSE-LABEL: test_reduce_v32i16: +; X86-SSE: ## BB#0: +; X86-SSE-NEXT: pmaxsw %xmm3, %xmm1 +; X86-SSE-NEXT: pmaxsw %xmm2, %xmm0 +; X86-SSE-NEXT: pmaxsw %xmm1, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE-NEXT: pmaxsw %xmm0, %xmm1 +;
X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE-NEXT: pmaxsw %xmm1, %xmm0 +; X86-SSE-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE-NEXT: psrld $16, %xmm1 +; X86-SSE-NEXT: pmaxsw %xmm0, %xmm1 +; X86-SSE-NEXT: movd %xmm1, %eax +; X86-SSE-NEXT: ## kill: %AX %AX %EAX +; X86-SSE-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v32i16: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-AVX1-NEXT: vpmaxsw %xmm2, %xmm3, %xmm2 +; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpmaxsw %xmm2, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: ## kill: %AX %AX %EAX +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v32i16: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: %AX %AX %EAX +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE-LABEL: test_reduce_v32i16: +; X64-SSE: ## BB#0: +; X64-SSE-NEXT: pmaxsw %xmm3, %xmm1 +; X64-SSE-NEXT: pmaxsw %xmm2, %xmm0 +; X64-SSE-NEXT: pmaxsw %xmm1, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE-NEXT: pmaxsw %xmm0, %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE-NEXT: pmaxsw %xmm1, %xmm0 +; X64-SSE-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE-NEXT: psrld $16, %xmm1 +; X64-SSE-NEXT: pmaxsw %xmm0, %xmm1 +; X64-SSE-NEXT: movd %xmm1, %eax +; X64-SSE-NEXT: ## kill: %AX %AX %EAX +; X64-SSE-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v32i16: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X64-AVX1-NEXT: vpmaxsw %xmm2, %xmm3, %xmm2 +; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpmaxsw %xmm2, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: ## kill: %AX %AX %EAX +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v32i16: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: ## kill: %AX %AX %EAX +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; 
X64-AVX512-LABEL: test_reduce_v32i16: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: ## kill: %AX %AX %EAX +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp sgt <32 x i16> %a0, %1 + %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %1 + %4 = shufflevector <32 x i16> %3, <32 x i16> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp sgt <32 x i16> %3, %4 + %6 = select <32 x i1> %5, <32 x i16> %3, <32 x i16> %4 + %7 = shufflevector <32 x i16> %6, <32 x i16> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp sgt <32 x i16> %6, %7 + %9 = select <32 x i1> %8, <32 x i16> %6, <32 x i16> %7 + %10 = shufflevector <32 x i16> %9, <32 x i16> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %11 = icmp sgt <32 x i16> %9, %10 + %12 = select <32 x i1> %11, <32 x i16> %9, <32 x i16> %10 + %13 = shufflevector <32 x i16> %12, <32 x i16> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %14 = icmp sgt <32 x i16> %12, %13 + %15 = select <32 x i1> %14, <32 x i16> %12, <32 x i16> %13 + %16 = extractelement <32 x i16> %15, i32 0 + ret i16 %16 +} + +define i8 @test_reduce_v64i8(<64 x i8> %a0) { +; X86-SSE2-LABEL: test_reduce_v64i8: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 +; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm5 +; X86-SSE2-NEXT: pcmpgtb %xmm3, %xmm5 +; X86-SSE2-NEXT: pand %xmm5, %xmm1 +; X86-SSE2-NEXT: pandn %xmm3, %xmm5 +; X86-SSE2-NEXT: por %xmm1, %xmm5 +; X86-SSE2-NEXT: pand %xmm4, %xmm0 +; X86-SSE2-NEXT: pandn %xmm2, %xmm4 +; X86-SSE2-NEXT: por %xmm0, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm0 +; X86-SSE2-NEXT: pcmpgtb %xmm5, %xmm0 +; X86-SSE2-NEXT: pand %xmm0, %xmm4 +; X86-SSE2-NEXT: pandn %xmm5, %xmm0 +; X86-SSE2-NEXT: por %xmm4, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: psrld $16, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pandn %xmm0, %xmm2 +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: psrlw $8, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: ## kill: %AL %AL %EAX +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v64i8: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pmaxsb %xmm3, %xmm1 +; X86-SSE42-NEXT: pmaxsb %xmm2, %xmm0 +; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE42-NEXT: psrld $16, %xmm1 +; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1 +; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE42-NEXT: psrlw $8, %xmm0 +; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0 +; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax +; X86-SSE42-NEXT: ## kill: %AL %AL %EAX +; X86-SSE42-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v64i8: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2 +; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpmaxsb %xmm2, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax +; X86-AVX1-NEXT: ## kill: %AL %AL %EAX +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v64i8: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax +; X86-AVX2-NEXT: ## kill: %AL %AL %EAX +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v64i8: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: movdqa %xmm0, %xmm4 +; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm4 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm5 +; X64-SSE2-NEXT: pcmpgtb %xmm3, %xmm5 +; X64-SSE2-NEXT: pand %xmm5, %xmm1 +; X64-SSE2-NEXT: pandn %xmm3, %xmm5 +; X64-SSE2-NEXT: por %xmm1, %xmm5 +; X64-SSE2-NEXT: pand %xmm4, %xmm0 +; X64-SSE2-NEXT: pandn %xmm2, %xmm4 +; X64-SSE2-NEXT: por %xmm0, %xmm4 +; X64-SSE2-NEXT: movdqa %xmm4, %xmm0 +; X64-SSE2-NEXT: pcmpgtb %xmm5, %xmm0 +; X64-SSE2-NEXT: pand %xmm0, %xmm4 +; X64-SSE2-NEXT: pandn %xmm5, %xmm0 +; X64-SSE2-NEXT: por %xmm4, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm0 +; X64-SSE2-NEXT: pandn %xmm1, %xmm2 +; X64-SSE2-NEXT: por %xmm0, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X64-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm2, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE2-NEXT: psrld $16, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm1 +; X64-SSE2-NEXT: pandn %xmm0, %xmm2 +; X64-SSE2-NEXT: por %xmm1, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X64-SSE2-NEXT: psrlw $8, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; 
X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm2, %xmm1 +; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: ## kill: %AL %AL %EAX +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v64i8: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pmaxsb %xmm3, %xmm1 +; X64-SSE42-NEXT: pmaxsb %xmm2, %xmm0 +; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE42-NEXT: psrld $16, %xmm1 +; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1 +; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE42-NEXT: psrlw $8, %xmm0 +; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0 +; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax +; X64-SSE42-NEXT: ## kill: %AL %AL %EAX +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v64i8: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X64-AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2 +; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpmaxsb %xmm2, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX1-NEXT: ## kill: %AL %AL %EAX +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v64i8: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX2-NEXT: ## kill: %AL %AL %EAX +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v64i8: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX512-NEXT: ## kill: %AL %AL %EAX +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp sgt <64 x i8> %a0, %1 + %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %1 + %4 = shufflevector <64 x i8> %3, <64 x i8> undef, <64 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp sgt <64 x i8> %3, %4 + %6 = select <64 x i1> %5, <64 x i8> %3, <64 x i8> %4 + %7 = shufflevector <64 x i8> %6, <64 x i8> undef, <64 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp sgt <64 x i8> %6, %7 + %9 = select <64 x i1> %8, <64 x i8> %6, <64 x i8> %7 + %10 = shufflevector <64 x i8> %9, <64 x i8> undef, <64 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %11 = icmp sgt <64 x i8> %9, %10 + %12 = select <64 x i1> %11, <64 x i8> %9, <64 x i8> %10 + %13 = shufflevector <64 x i8> %12, <64 x i8> undef, <64 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %14 = icmp sgt <64 x i8> %12, %13 + %15 = select <64 x i1> %14, <64 x i8> %12, <64 x i8> %13 + %16 = shufflevector <64 x i8> %15, <64 x i8> undef, <64 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %17 = icmp sgt <64 x i8> %15, %16 + %18 = select <64 x i1> %17, <64 x i8> %15, <64 x i8> %16 + %19 = extractelement <64 x i8> %18, i32 0 + ret i8 %19 +} diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll b/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll new file mode 100644 index 000000000000..6feb963426bb --- /dev/null +++ b/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll @@ -0,0 +1,1898 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE2 +; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE42 +; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX1 +; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX2 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE2 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE42 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX512 + +; +; 128-bit Vectors +; + +define i64 @test_reduce_v2i64(<2 x i64> %a0) { +; X86-SSE2-LABEL: test_reduce_v2i64: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: pxor %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm5, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; X86-SSE2-NEXT: por %xmm2, %xmm3 +; X86-SSE2-NEXT: pand %xmm3, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm3 +; X86-SSE2-NEXT: por %xmm0, %xmm3 +; X86-SSE2-NEXT: movd %xmm3, %eax +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] +; X86-SSE2-NEXT: movd %xmm0, %edx +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v2i64: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; X86-SSE42-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; X86-SSE42-NEXT:
movd %xmm2, %eax +; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx +; X86-SSE42-NEXT: retl +; +; X86-AVX-LABEL: test_reduce_v2i64: +; X86-AVX: ## BB#0: +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; X86-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX-NEXT: vmovd %xmm0, %eax +; X86-AVX-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v2i64: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE2-NEXT: pxor %xmm2, %xmm3 +; X64-SSE2-NEXT: pxor %xmm1, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm4 +; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm5, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; X64-SSE2-NEXT: por %xmm2, %xmm3 +; X64-SSE2-NEXT: pand %xmm3, %xmm0 +; X64-SSE2-NEXT: pandn %xmm1, %xmm3 +; X64-SSE2-NEXT: por %xmm0, %xmm3 +; X64-SSE2-NEXT: movq %xmm3, %rax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v2i64: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; X64-SSE42-NEXT: movdqa %xmm2, %xmm0 +; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; X64-SSE42-NEXT: movq %xmm2, %rax +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v2i64: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v2i64: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v2i64: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpminsq %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovq %xmm0, %rax +; X64-AVX512-NEXT: retq + %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> <i32 1, i32 undef> + %2 = icmp slt <2 x i64> %a0, %1 + %3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %1 + %4 = extractelement <2 x i64> %3, i32 0 + ret i64 %4 +} + +define i32 @test_reduce_v4i32(<4 x i32> %a0) { +; X86-SSE2-LABEL: test_reduce_v4i32: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v4i32: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pminsd %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE42-NEXT: pminsd %xmm1, %xmm0 +; X86-SSE42-NEXT: movd %xmm0, %eax +; X86-SSE42-NEXT: retl +; +; X86-AVX-LABEL:
test_reduce_v4i32: +; X86-AVX: ## BB#0: +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovd %xmm0, %eax +; X86-AVX-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v4i32: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm0 +; X64-SSE2-NEXT: pandn %xmm1, %xmm2 +; X64-SSE2-NEXT: por %xmm0, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm2, %xmm1 +; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v4i32: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pminsd %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE42-NEXT: pminsd %xmm1, %xmm0 +; X64-SSE42-NEXT: movd %xmm0, %eax +; X64-SSE42-NEXT: retq +; +; X64-AVX-LABEL: test_reduce_v4i32: +; X64-AVX: ## BB#0: +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vmovd %xmm0, %eax +; X64-AVX-NEXT: retq + %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> + %2 = icmp slt <4 x i32> %a0, %1 + %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %1 + %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> + %5 = icmp slt <4 x i32> %3, %4 + %6 = select <4 x i1> %5, <4 x i32> %3, <4 x i32> %4 + %7 = extractelement <4 x i32> %6, i32 0 + ret i32 %7 +} + +define i16 @test_reduce_v8i16(<8 x i16> %a0) { +; X86-SSE-LABEL: test_reduce_v8i16: +; X86-SSE: ## BB#0: +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE-NEXT: pminsw %xmm0, %xmm1 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE-NEXT: pminsw %xmm1, %xmm0 +; X86-SSE-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE-NEXT: psrld $16, %xmm1 +; X86-SSE-NEXT: pminsw %xmm0, %xmm1 +; X86-SSE-NEXT: movd %xmm1, %eax +; X86-SSE-NEXT: ## kill: %AX %AX %EAX +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: test_reduce_v8i16: +; X86-AVX: ## BB#0: +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovd %xmm0, %eax +; X86-AVX-NEXT: ## kill: %AX %AX %EAX +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: test_reduce_v8i16: +; X64-SSE: ## BB#0: +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE-NEXT: pminsw %xmm0, %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE-NEXT: pminsw %xmm1, %xmm0 +; X64-SSE-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE-NEXT: psrld $16, %xmm1 +; X64-SSE-NEXT: pminsw %xmm0, %xmm1 +; X64-SSE-NEXT: movd %xmm1, %eax +; X64-SSE-NEXT: ## kill: %AX %AX %EAX +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: test_reduce_v8i16: +; X64-AVX: ## BB#0: +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT:
vpsrld $16, %xmm0, %xmm1 +; X64-AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vmovd %xmm0, %eax +; X64-AVX-NEXT: ## kill: %AX %AX %EAX +; X64-AVX-NEXT: retq + %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp slt <8 x i16> %a0, %1 + %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %1 + %4 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp slt <8 x i16> %3, %4 + %6 = select <8 x i1> %5, <8 x i16> %3, <8 x i16> %4 + %7 = shufflevector <8 x i16> %6, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp slt <8 x i16> %6, %7 + %9 = select <8 x i1> %8, <8 x i16> %6, <8 x i16> %7 + %10 = extractelement <8 x i16> %9, i32 0 + ret i16 %10 +} + +define i8 @test_reduce_v16i8(<16 x i8> %a0) { +; X86-SSE2-LABEL: test_reduce_v16i8: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: psrld $16, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pandn %xmm0, %xmm2 +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: psrlw $8, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: ## kill: %AL %AL %EAX +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v16i8: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pminsb %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE42-NEXT: pminsb %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE42-NEXT: psrld $16, %xmm1 +; X86-SSE42-NEXT: pminsb %xmm0, %xmm1 +; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE42-NEXT: psrlw $8, %xmm0 +; X86-SSE42-NEXT: pminsb %xmm1, %xmm0 +; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax +; X86-SSE42-NEXT: ## kill: %AL %AL %EAX +; X86-SSE42-NEXT: retl +; +; X86-AVX-LABEL: test_reduce_v16i8: +; X86-AVX: ## BB#0: +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax +; X86-AVX-NEXT: ## kill: %AL %AL %EAX +; X86-AVX-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v16i8: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm0 +; X64-SSE2-NEXT: pandn %xmm1, %xmm2 +; X64-SSE2-NEXT: por %xmm0, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm2, %xmm1 +;
X64-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE2-NEXT: psrld $16, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm1 +; X64-SSE2-NEXT: pandn %xmm0, %xmm2 +; X64-SSE2-NEXT: por %xmm1, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X64-SSE2-NEXT: psrlw $8, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm2, %xmm1 +; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: ## kill: %AL %AL %EAX +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v16i8: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pminsb %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE42-NEXT: pminsb %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE42-NEXT: psrld $16, %xmm1 +; X64-SSE42-NEXT: pminsb %xmm0, %xmm1 +; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE42-NEXT: psrlw $8, %xmm0 +; X64-SSE42-NEXT: pminsb %xmm1, %xmm0 +; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax +; X64-SSE42-NEXT: ## kill: %AL %AL %EAX +; X64-SSE42-NEXT: retq +; +; X64-AVX-LABEL: test_reduce_v16i8: +; X64-AVX: ## BB#0: +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX-NEXT: ## kill: %AL %AL %EAX +; X64-AVX-NEXT: retq + %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp slt <16 x i8> %a0, %1 + %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %1 + %4 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp slt <16 x i8> %3, %4 + %6 = select <16 x i1> %5, <16 x i8> %3, <16 x i8> %4 + %7 = shufflevector <16 x i8> %6, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp slt <16 x i8> %6, %7 + %9 = select <16 x i1> %8, <16 x i8> %6, <16 x i8> %7 + %10 = shufflevector <16 x i8> %9, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %11 = icmp slt <16 x i8> %9, %10 + %12 = select <16 x i1> %11, <16 x i8> %9, <16 x i8> %10 + %13 = extractelement <16 x i8> %12, i32 0 + ret i8 %13 +} + +; +; 256-bit Vectors +; + +define i64 @test_reduce_v4i64(<4 x i64> %a0) { +; X86-SSE2-LABEL: test_reduce_v4i64: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 +; X86-SSE2-NEXT: pxor %xmm2, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm5 +; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm6, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; X86-SSE2-NEXT: por %xmm3, %xmm4 +; X86-SSE2-NEXT: pand %xmm4, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm4 +; X86-SSE2-NEXT: por %xmm0, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: pxor %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 +; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; X86-SSE2-NEXT: pcmpeqd %xmm1,
%xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm5, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm4 +; X86-SSE2-NEXT: pandn %xmm0, %xmm2 +; X86-SSE2-NEXT: por %xmm4, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X86-SSE2-NEXT: movd %xmm0, %edx +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v4i64: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; X86-SSE42-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; X86-SSE42-NEXT: movd %xmm2, %eax +; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx +; X86-SSE42-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v4i64: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v4i64: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 +; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 +; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v4i64: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE2-NEXT: pxor %xmm2, %xmm3 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm4 +; X64-SSE2-NEXT: pxor %xmm2, %xmm4 +; X64-SSE2-NEXT: movdqa %xmm4, %xmm5 +; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm4 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm6, %xmm3 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; X64-SSE2-NEXT: por %xmm3, %xmm4 +; X64-SSE2-NEXT: pand %xmm4, %xmm0 +; X64-SSE2-NEXT: pandn %xmm1, %xmm4 +; X64-SSE2-NEXT: por %xmm0, %xmm4 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X64-SSE2-NEXT: pxor %xmm2, %xmm1 +; X64-SSE2-NEXT: pxor %xmm0, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm3 +; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm5, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; X64-SSE2-NEXT: por %xmm1, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm4 +; X64-SSE2-NEXT: pandn %xmm0, %xmm2 +; 
X64-SSE2-NEXT: por %xmm4, %xmm2 +; X64-SSE2-NEXT: movq %xmm2, %rax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v4i64: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; X64-SSE42-NEXT: movdqa %xmm2, %xmm0 +; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; X64-SSE42-NEXT: movq %xmm2, %rax +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v4i64: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v4i64: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 +; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 +; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v4i64: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpminsq %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpminsq %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vmovq %xmm0, %rax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> + %2 = icmp slt <4 x i64> %a0, %1 + %3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %1 + %4 = shufflevector <4 x i64> %3, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> + %5 = icmp slt <4 x i64> %3, %4 + %6 = select <4 x i1> %5, <4 x i64> %3, <4 x i64> %4 + %7 = extractelement <4 x i64> %6, i32 0 + ret i64 %7 +} + +define i32 @test_reduce_v8i32(<8 x i32> %a0) { +; X86-SSE2-LABEL: test_reduce_v8i32: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pandn %xmm0, %xmm2 +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v8i32: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pminsd %xmm1, %xmm0 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pminsd %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +;
X86-SSE42-NEXT: pminsd %xmm1, %xmm0 +; X86-SSE42-NEXT: movd %xmm0, %eax +; X86-SSE42-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v8i32: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v8i32: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v8i32: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm0 +; X64-SSE2-NEXT: pandn %xmm1, %xmm2 +; X64-SSE2-NEXT: por %xmm0, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm2, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm1 +; X64-SSE2-NEXT: pandn %xmm0, %xmm2 +; X64-SSE2-NEXT: por %xmm1, %xmm2 +; X64-SSE2-NEXT: movd %xmm2, %eax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v8i32: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pminsd %xmm1, %xmm0 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pminsd %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE42-NEXT: pminsd %xmm1, %xmm0 +; X64-SSE42-NEXT: movd %xmm0, %eax +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v8i32: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v8i32: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v8i32: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX512-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <8 x i32> %a0, <8 x i32> 
undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp slt <8 x i32> %a0, %1 + %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %1 + %4 = shufflevector <8 x i32> %3, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp slt <8 x i32> %3, %4 + %6 = select <8 x i1> %5, <8 x i32> %3, <8 x i32> %4 + %7 = shufflevector <8 x i32> %6, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp slt <8 x i32> %6, %7 + %9 = select <8 x i1> %8, <8 x i32> %6, <8 x i32> %7 + %10 = extractelement <8 x i32> %9, i32 0 + ret i32 %10 +} + +define i16 @test_reduce_v16i16(<16 x i16> %a0) { +; X86-SSE-LABEL: test_reduce_v16i16: +; X86-SSE: ## BB#0: +; X86-SSE-NEXT: pminsw %xmm1, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE-NEXT: pminsw %xmm0, %xmm1 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE-NEXT: pminsw %xmm1, %xmm0 +; X86-SSE-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE-NEXT: psrld $16, %xmm1 +; X86-SSE-NEXT: pminsw %xmm0, %xmm1 +; X86-SSE-NEXT: movd %xmm1, %eax +; X86-SSE-NEXT: ## kill: %AX %AX %EAX +; X86-SSE-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v16i16: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: ## kill: %AX %AX %EAX +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v16i16: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: %AX %AX %EAX +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE-LABEL: test_reduce_v16i16: +; X64-SSE: ## BB#0: +; X64-SSE-NEXT: pminsw %xmm1, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE-NEXT: pminsw %xmm0, %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE-NEXT: pminsw %xmm1, %xmm0 +; X64-SSE-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE-NEXT: psrld $16, %xmm1 +; X64-SSE-NEXT: pminsw %xmm0, %xmm1 +; X64-SSE-NEXT: movd %xmm1, %eax +; X64-SSE-NEXT: ## kill: %AX %AX %EAX +; X64-SSE-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v16i16: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: ## kill: %AX %AX %EAX +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v16i16: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX2-NEXT:
vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: ## kill: %AX %AX %EAX +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v16i16: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: ## kill: %AX %AX %EAX +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp slt <16 x i16> %a0, %1 + %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %1 + %4 = shufflevector <16 x i16> %3, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp slt <16 x i16> %3, %4 + %6 = select <16 x i1> %5, <16 x i16> %3, <16 x i16> %4 + %7 = shufflevector <16 x i16> %6, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp slt <16 x i16> %6, %7 + %9 = select <16 x i1> %8, <16 x i16> %6, <16 x i16> %7 + %10 = shufflevector <16 x i16> %9, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %11 = icmp slt <16 x i16> %9, %10 + %12 = select <16 x i1> %11, <16 x i16> %9, <16 x i16> %10 + %13 = extractelement <16 x i16> %12, i32 0 + ret i16 %13 +} + +define i8 @test_reduce_v32i8(<32 x i8> %a0) { +; X86-SSE2-LABEL: test_reduce_v32i8: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pandn %xmm0, %xmm2 +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: psrld $16, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: psrlw $8, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pandn %xmm0, %xmm2 +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: ## kill: %AL %AL %EAX +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v32i8: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pminsb %xmm1, %xmm0 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pminsb %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE42-NEXT: pminsb %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE42-NEXT: psrld $16, %xmm1 +; X86-SSE42-NEXT: pminsb %xmm0, %xmm1 +; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE42-NEXT: psrlw $8, %xmm0 +; X86-SSE42-NEXT: pminsb %xmm1, %xmm0 +; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax +; X86-SSE42-NEXT: ## kill: %AL %AL %EAX +;
X86-SSE42-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v32i8: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax +; X86-AVX1-NEXT: ## kill: %AL %AL %EAX +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v32i8: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax +; X86-AVX2-NEXT: ## kill: %AL %AL %EAX +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v32i8: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm0 +; X64-SSE2-NEXT: pandn %xmm1, %xmm2 +; X64-SSE2-NEXT: por %xmm0, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm2, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm1 +; X64-SSE2-NEXT: pandn %xmm0, %xmm2 +; X64-SSE2-NEXT: por %xmm1, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X64-SSE2-NEXT: psrld $16, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm2, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE2-NEXT: psrlw $8, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm1 +; X64-SSE2-NEXT: pandn %xmm0, %xmm2 +; X64-SSE2-NEXT: por %xmm1, %xmm2 +; X64-SSE2-NEXT: movd %xmm2, %eax +; X64-SSE2-NEXT: ## kill: %AL %AL %EAX +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v32i8: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pminsb %xmm1, %xmm0 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pminsb %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE42-NEXT: pminsb %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE42-NEXT: psrld $16, %xmm1 +; X64-SSE42-NEXT: pminsb %xmm0, %xmm1 +; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE42-NEXT: psrlw $8, %xmm0 +; X64-SSE42-NEXT: pminsb %xmm1, %xmm0 +; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax +; X64-SSE42-NEXT: ## kill: %AL %AL %EAX +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v32i8: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; 
X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX1-NEXT: ## kill: %AL %AL %EAX +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v32i8: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX2-NEXT: ## kill: %AL %AL %EAX +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v32i8: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX512-NEXT: ## kill: %AL %AL %EAX +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp slt <32 x i8> %a0, %1 + %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %1 + %4 = shufflevector <32 x i8> %3, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp slt <32 x i8> %3, %4 + %6 = select <32 x i1> %5, <32 x i8> %3, <32 x i8> %4 + %7 = shufflevector <32 x i8> %6, <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp slt <32 x i8> %6, %7 + %9 = select <32 x i1> %8, <32 x i8> %6, <32 x i8> %7 + %10 = shufflevector <32 x i8> %9, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %11 = icmp slt <32 x i8> %9, %10 + %12 = select <32 x i1> %11, <32 x i8> %9, <32 x i8> %10 + %13 = shufflevector <32 x i8> %12, <32 x i8> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %14 = icmp slt <32 x i8> %12, %13 + %15 = select <32 x i1> %14, <32 x i8> %12, <32 x i8> %13 + %16 = extractelement <32 x i8> %15, i32 0 + ret i8 %16 +} + +; +; 512-bit Vectors +; + +define i64 @test_reduce_v8i64(<8 x i64> %a0) { +; X86-SSE2-LABEL: test_reduce_v8i64: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: subl $28, %esp +; X86-SSE2-NEXT: .cfi_def_cfa_offset 32 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm6, (%esp) ## 16-byte Spill +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,0,2147483648,0] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 +; X86-SSE2-NEXT: pxor %xmm4, %xmm5 +; X86-SSE2-NEXT: pxor %xmm4, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm6, %xmm7 +; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; X86-SSE2-NEXT: pcmpeqd %xmm5, %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm5, %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; X86-SSE2-NEXT: por %xmm6, %xmm5 +;
X86-SSE2-NEXT: movdqa %xmm1, %xmm6 +; X86-SSE2-NEXT: pxor %xmm4, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm7 +; X86-SSE2-NEXT: pxor %xmm4, %xmm7 +; X86-SSE2-NEXT: movdqa %xmm7, %xmm0 +; X86-SSE2-NEXT: pcmpgtd %xmm6, %xmm0 +; X86-SSE2-NEXT: pcmpeqd %xmm6, %xmm7 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] +; X86-SSE2-NEXT: pand %xmm6, %xmm7 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; X86-SSE2-NEXT: por %xmm7, %xmm6 +; X86-SSE2-NEXT: pand %xmm6, %xmm1 +; X86-SSE2-NEXT: pandn %xmm3, %xmm6 +; X86-SSE2-NEXT: por %xmm1, %xmm6 +; X86-SSE2-NEXT: pand %xmm5, %xmm2 +; X86-SSE2-NEXT: pandn (%esp), %xmm5 ## 16-byte Folded Reload +; X86-SSE2-NEXT: por %xmm2, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm0 +; X86-SSE2-NEXT: pxor %xmm4, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm6, %xmm1 +; X86-SSE2-NEXT: pxor %xmm4, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: por %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm0, %xmm5 +; X86-SSE2-NEXT: pandn %xmm6, %xmm0 +; X86-SSE2-NEXT: por %xmm5, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pxor %xmm4, %xmm2 +; X86-SSE2-NEXT: pxor %xmm1, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm3 +; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm2, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; X86-SSE2-NEXT: por %xmm4, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X86-SSE2-NEXT: movd %xmm0, %edx +; X86-SSE2-NEXT: addl $28, %esp +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v8i64: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: movdqa %xmm0, %xmm4 +; X86-SSE42-NEXT: movdqa %xmm3, %xmm5 +; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm5 +; X86-SSE42-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2 +; X86-SSE42-NEXT: movdqa %xmm5, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3 +; X86-SSE42-NEXT: movapd %xmm3, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] +; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 +; X86-SSE42-NEXT: movd %xmm1, %eax +; X86-SSE42-NEXT: pextrd $1, %xmm1, %edx +; X86-SSE42-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v8i64: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; X86-AVX1-NEXT: vblendvpd %ymm2, 
%ymm0, %ymm1, %ymm0 +; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v8i64: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 +; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 +; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 +; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v8i64: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,0,2147483648,0] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm5 +; X64-SSE2-NEXT: pxor %xmm9, %xmm5 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm6 +; X64-SSE2-NEXT: pxor %xmm9, %xmm6 +; X64-SSE2-NEXT: movdqa %xmm6, %xmm7 +; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm5, %xmm6 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm8, %xmm6 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; X64-SSE2-NEXT: por %xmm6, %xmm5 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm6 +; X64-SSE2-NEXT: pxor %xmm9, %xmm6 +; X64-SSE2-NEXT: movdqa %xmm3, %xmm7 +; X64-SSE2-NEXT: pxor %xmm9, %xmm7 +; X64-SSE2-NEXT: movdqa %xmm7, %xmm4 +; X64-SSE2-NEXT: pcmpgtd %xmm6, %xmm4 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm6, %xmm7 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm8, %xmm7 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] +; X64-SSE2-NEXT: por %xmm7, %xmm6 +; X64-SSE2-NEXT: pand %xmm6, %xmm1 +; X64-SSE2-NEXT: pandn %xmm3, %xmm6 +; X64-SSE2-NEXT: por %xmm1, %xmm6 +; X64-SSE2-NEXT: pand %xmm5, %xmm0 +; X64-SSE2-NEXT: pandn %xmm2, %xmm5 +; X64-SSE2-NEXT: por %xmm0, %xmm5 +; X64-SSE2-NEXT: movdqa %xmm5, %xmm0 +; X64-SSE2-NEXT: pxor %xmm9, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm6, %xmm1 +; X64-SSE2-NEXT: pxor %xmm9, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm3, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; X64-SSE2-NEXT: por %xmm0, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm5 +; X64-SSE2-NEXT: pandn %xmm6, %xmm1 +; X64-SSE2-NEXT: por %xmm5, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE2-NEXT: pxor %xmm9, %xmm2 +; X64-SSE2-NEXT: pxor %xmm0, %xmm9 +; X64-SSE2-NEXT: movdqa %xmm9, %xmm3 +; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm2, %xmm9 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm4, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; X64-SSE2-NEXT: por %xmm2, %xmm3 +; X64-SSE2-NEXT: 
pand %xmm3, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm3
+; X64-SSE2-NEXT: por %xmm1, %xmm3
+; X64-SSE2-NEXT: movq %xmm3, %rax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i64:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm4
+; X64-SSE42-NEXT: movdqa %xmm3, %xmm5
+; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm5
+; X64-SSE42-NEXT: movdqa %xmm2, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm4, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2
+; X64-SSE42-NEXT: movdqa %xmm5, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3
+; X64-SSE42-NEXT: movapd %xmm3, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
+; X64-SSE42-NEXT: movq %xmm1, %rax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v8i64:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vmovq %xmm0, %rax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v8i64:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vmovq %xmm0, %rax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v8i64:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpminsq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminsq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminsq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vmovq %xmm0, %rax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+  %1 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %2 = icmp slt <8 x i64> %a0, %1
+  %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %1
+  %4 = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %5 = icmp slt <8 x i64> %3, %4
+  %6 = select <8 x i1> %5, <8 x i64> %3, <8 x i64> %4
+  %7 = shufflevector <8 x i64> %6, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %8 = icmp slt <8 x i64> %6, %7
+  %9 = select <8 x i1> %8, <8 x i64> %6, <8 x i64> %7
+  %10 = extractelement <8 x i64> %9, i32 0
+  ret i64 %10
+}
+
+define i32 @test_reduce_v16i32(<16 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i32:
+; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: movdqa %xmm3, %xmm4 +; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm5 +; X86-SSE2-NEXT: pand %xmm5, %xmm0 +; X86-SSE2-NEXT: pandn %xmm2, %xmm5 +; X86-SSE2-NEXT: por %xmm0, %xmm5 +; X86-SSE2-NEXT: pand %xmm4, %xmm1 +; X86-SSE2-NEXT: pandn %xmm3, %xmm4 +; X86-SSE2-NEXT: por %xmm1, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm0 +; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm0 +; X86-SSE2-NEXT: pand %xmm0, %xmm5 +; X86-SSE2-NEXT: pandn %xmm4, %xmm0 +; X86-SSE2-NEXT: por %xmm5, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v16i32: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pminsd %xmm3, %xmm1 +; X86-SSE42-NEXT: pminsd %xmm2, %xmm0 +; X86-SSE42-NEXT: pminsd %xmm1, %xmm0 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pminsd %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE42-NEXT: pminsd %xmm1, %xmm0 +; X86-SSE42-NEXT: movd %xmm0, %eax +; X86-SSE42-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v16i32: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-AVX1-NEXT: vpminsd %xmm2, %xmm3, %xmm2 +; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpminsd %xmm2, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v16i32: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v16i32: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: movdqa %xmm3, %xmm4 +; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm4 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm5 +; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm5 +; X64-SSE2-NEXT: pand %xmm5, %xmm0 +; X64-SSE2-NEXT: pandn %xmm2, %xmm5 +; X64-SSE2-NEXT: por %xmm0, %xmm5 +; X64-SSE2-NEXT: pand %xmm4, %xmm1 +; X64-SSE2-NEXT: pandn %xmm3, %xmm4 +; X64-SSE2-NEXT: por %xmm1, %xmm4 +; X64-SSE2-NEXT: movdqa %xmm4, %xmm0 +; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm0 +; X64-SSE2-NEXT: pand %xmm0, %xmm5 +; X64-SSE2-NEXT: pandn %xmm4, %xmm0 +; X64-SSE2-NEXT: por %xmm5, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm0 +; X64-SSE2-NEXT: pandn %xmm1, %xmm2 +; X64-SSE2-NEXT: por %xmm0, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; 
X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i32:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pminsd %xmm3, %xmm1
+; X64-SSE42-NEXT: pminsd %xmm2, %xmm0
+; X64-SSE42-NEXT: pminsd %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminsd %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminsd %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v16i32:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpminsd %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpminsd %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v16i32:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v16i32:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+  %1 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %2 = icmp slt <16 x i32> %a0, %1
+  %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %1
+  %4 = shufflevector <16 x i32> %3, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %5 = icmp slt <16 x i32> %3, %4
+  %6 = select <16 x i1> %5, <16 x i32> %3, <16 x i32> %4
+  %7 = shufflevector <16 x i32> %6, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %8 = icmp slt <16 x i32> %6, %7
+  %9 = select <16 x i1> %8, <16 x i32> %6, <16 x i32> %7
+  %10 = shufflevector <16 x i32> %9, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %11 = icmp slt <16 x i32> %9, %10
+  %12 = select <16 x i1> %11, <16 x i32> %9, <16 x i32> %10
+  %13 = extractelement <16 x i32> %12, i32 0
+  ret i32 %13
+}
+
+define i16 @test_reduce_v32i16(<32 x i16> %a0) {
+; X86-SSE-LABEL: test_reduce_v32i16:
+; X86-SSE: ## BB#0:
+; X86-SSE-NEXT: pminsw %xmm3, %xmm1
+; X86-SSE-NEXT: pminsw %xmm2, %xmm0
+; X86-SSE-NEXT: pminsw %xmm1, %xmm0
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE-NEXT: pminsw %xmm0, %xmm1
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE-NEXT: pminsw %xmm1, %xmm0
+; X86-SSE-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE-NEXT: psrld $16, %xmm1
+; X86-SSE-NEXT: pminsw %xmm0, %xmm1
+; X86-SSE-NEXT: movd %xmm1, %eax
+; X86-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v32i16:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpminsw %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpminsw %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v32i16:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE-LABEL: test_reduce_v32i16:
+; X64-SSE: ## BB#0:
+; X64-SSE-NEXT: pminsw %xmm3, %xmm1
+; X64-SSE-NEXT: pminsw %xmm2, %xmm0
+; X64-SSE-NEXT: pminsw %xmm1, %xmm0
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE-NEXT: pminsw %xmm0, %xmm1
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE-NEXT: pminsw %xmm1, %xmm0
+; X64-SSE-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE-NEXT: psrld $16, %xmm1
+; X64-SSE-NEXT: pminsw %xmm0, %xmm1
+; X64-SSE-NEXT: movd %xmm1, %eax
+; X64-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v32i16:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpminsw %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpminsw %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v32i16:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v32i16:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+  %1 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %2 = icmp slt <32 x i16> %a0, %1
+  %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %1
+  %4 = shufflevector <32 x i16> %3, <32 x i16> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %5 = icmp slt <32 x i16> %3, %4
+  %6 = select <32 x i1> %5, <32 x i16> %3, <32 x i16> %4
+  %7 = shufflevector <32 x i16> %6, <32 x i16> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %8 = icmp slt <32 x i16> %6, %7
+  %9 = select <32 x i1> %8, <32 x i16> %6, <32 x i16> %7
+  %10 = shufflevector <32 x i16> %9, <32 x i16> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %11 = icmp slt <32 x i16> %9, %10
+  %12 = select <32 x i1> %11, <32 x i16> %9, <32 x i16> %10
+  %13 = shufflevector <32 x i16> %12, <32 x i16> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %14 = icmp slt <32 x i16> %12, %13
+  %15 = select <32 x i1> %14, <32 x i16> %12, <32 x i16> %13
+  %16 = extractelement <32 x i16> %15, i32 0
+  ret i16 %16
+}
+
+define i8 @test_reduce_v64i8(<64 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v64i8:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm4
+; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm5
+; X86-SSE2-NEXT: pand %xmm5, %xmm0
+; X86-SSE2-NEXT: pandn %xmm2, %xmm5
+; X86-SSE2-NEXT: por %xmm0, %xmm5
+; X86-SSE2-NEXT: pand %xmm4, %xmm1
+; X86-SSE2-NEXT: pandn %xmm3, %xmm4
+; X86-SSE2-NEXT: por %xmm1, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm0
+; X86-SSE2-NEXT: pcmpgtb %xmm5, %xmm0
+; X86-SSE2-NEXT: pand %xmm0, %xmm5
+; X86-SSE2-NEXT: pandn %xmm4, %xmm0
+; X86-SSE2-NEXT: por %xmm5, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v64i8:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pminsb %xmm3, %xmm1
+; X86-SSE42-NEXT: pminsb %xmm2, %xmm0
+; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminsb %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pminsb %xmm0, %xmm1
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: psrlw $8, %xmm0
+; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v64i8:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpminsb %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v64i8:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v64i8:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm4
+; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm5
+; X64-SSE2-NEXT: pand %xmm5, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm5
+; X64-SSE2-NEXT: por %xmm0, %xmm5
+; X64-SSE2-NEXT: pand %xmm4, %xmm1
+; X64-SSE2-NEXT: pandn %xmm3, %xmm4
+; X64-SSE2-NEXT: por %xmm1, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm0
+; X64-SSE2-NEXT: pcmpgtb %xmm5, %xmm0
+; X64-SSE2-NEXT: pand %xmm0, %xmm5
+; X64-SSE2-NEXT: pandn %xmm4, %xmm0
+; X64-SSE2-NEXT: por %xmm5, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm2
+; X64-SSE2-NEXT: por %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v64i8:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pminsb %xmm3, %xmm1
+; X64-SSE42-NEXT: pminsb %xmm2, %xmm0
+; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminsb %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pminsb %xmm0, %xmm1
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: psrlw $8, %xmm0
+; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v64i8:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpminsb %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v64i8:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v64i8:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+  %1 = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %2 = icmp slt <64 x i8> %a0, %1
+  %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %1
+  %4 = shufflevector <64 x i8> %3, <64 x i8> undef, <64 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %5 = icmp slt <64 x i8> %3, %4
+  %6 = select <64 x i1> %5, <64 x i8> %3, <64 x i8> %4
+  %7 = shufflevector <64 x i8> %6, <64 x i8> undef, <64 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %8 = icmp slt <64 x i8> %6, %7
+  %9 = select <64 x i1> %8, <64 x i8> %6, <64 x i8> %7
+  %10 = shufflevector <64 x i8> %9, <64 x i8> undef, <64 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %11 = icmp slt <64 x i8> %9, %10
+  %12 = select <64 x i1> %11, <64 x i8> %9, <64 x i8> %10
+  %13 = shufflevector <64 x i8> %12, <64 x i8> undef, <64 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %14 = icmp slt <64 x i8> %12, %13
+  %15 = select <64 x i1> %14, <64 x i8> %12, <64 x i8> %13
+  %16 = shufflevector <64 x i8> %15, <64 x i8> undef, <64 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %17 = icmp slt <64 x i8> %15, %16
+  %18 = select <64 x i1> %17, <64 x i8> %15, <64 x i8> %16
+  %19 = extractelement <64 x i8> %18, i32 0
+  ret i8 %19
+}
diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll
new file mode 100644
index 000000000000..ee9d8955cb56
--- /dev/null
+++ b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll
@@ -0,0 +1,2203 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE2
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE42
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX1
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE42
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX512
+
+;
+; 128-bit Vectors
+;
+
+define i64 @test_reduce_v2i64(<2 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v2i64:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: pxor %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm5, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm2, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm3
+; X86-SSE2-NEXT: por %xmm0, %xmm3
+; X86-SSE2-NEXT: movd %xmm3, %eax
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %edx
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v2i64:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648]
+; X86-SSE42-NEXT: pxor %xmm3, %xmm0
+; X86-SSE42-NEXT: pxor %xmm2, %xmm3
+; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X86-SSE42-NEXT: movd %xmm2, %eax
+; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v2i64:
+; X86-AVX: ## BB#0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
+; X86-AVX-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; X86-AVX-NEXT: vpxor %xmm2, %xmm1, %xmm2
+; X86-AVX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; X86-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, %eax
+; X86-AVX-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v2i64:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: pxor %xmm1, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm5, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm2, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm3
+; X64-SSE2-NEXT: por %xmm0, %xmm3
+; X64-SSE2-NEXT: movq %xmm3, %rax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v2i64:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; X64-SSE42-NEXT: pxor %xmm3, %xmm0
+; X64-SSE42-NEXT: pxor %xmm2, %xmm3
+; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X64-SSE42-NEXT: movq %xmm2, %rax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v2i64:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; X64-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2
+; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X64-AVX1-NEXT: vmovq %xmm0, %rax
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v2i64:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X64-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2
+; X64-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X64-AVX2-NEXT: vmovq %xmm0, %rax
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v2i64:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vmovq %xmm0, %rax
+; X64-AVX512-NEXT: retq
+  %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+  %2 = icmp ugt <2 x i64> %a0, %1
+  %3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %1
+  %4 = extractelement <2 x i64> %3, i32 0
+  ret i64 %4
+}
+
+define i32 @test_reduce_v4i32(<4 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v4i32:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X86-SSE2-NEXT: pxor %xmm2, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm3
+; X86-SSE2-NEXT: por %xmm0, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm3
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm3, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v4i32:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxud %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxud %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v4i32:
+; X86-AVX: ## BB#0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, %eax
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v4i32:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X64-SSE2-NEXT: pxor %xmm2, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm4, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm3
+; X64-SSE2-NEXT: por %xmm0, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: pxor %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm3
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm3, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v4i32:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxud %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxud %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v4i32:
+; X64-AVX: ## BB#0:
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vmovd %xmm0, %eax
+; X64-AVX-NEXT: retq
+  %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %2 = icmp ugt <4 x i32> %a0, %1
+  %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %1
+  %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %5 = icmp ugt <4 x i32> %3, %4
+  %6 = select <4 x i1> %5, <4 x i32> %3, <4 x i32> %4
+  %7 = extractelement <4 x i32> %6, i32 0
+  ret i32 %7
+}
+
+define i16 @test_reduce_v8i16(<8 x i16> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i16:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm1, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
+; X86-SSE2-NEXT: pxor %xmm1, %xmm4
+; X86-SSE2-NEXT: pcmpgtw %xmm4, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm0
+; X86-SSE2-NEXT: pandn %xmm2, %xmm3
+; X86-SSE2-NEXT: por %xmm0, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm2
+; X86-SSE2-NEXT: pxor %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT: pxor %xmm1, %xmm4
+; X86-SSE2-NEXT: pcmpgtw %xmm4, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm3
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm3, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X86-SSE2-NEXT: pxor %xmm1, %xmm3
+; X86-SSE2-NEXT: pxor %xmm0, %xmm1
+; X86-SSE2-NEXT: pcmpgtw %xmm1, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm3
+; X86-SSE2-NEXT: por %xmm2, %xmm3
+; X86-SSE2-NEXT: movd %xmm3, %eax
+; X86-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i16:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxuw %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxuw %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pmaxuw %xmm0, %xmm1
+; X86-SSE42-NEXT: movd %xmm1, %eax
+; X86-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v8i16:
+; X86-AVX: ## BB#0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, %eax
+; X86-AVX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v8i16:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm1, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm4
+; X64-SSE2-NEXT: pxor %xmm1, %xmm4
+; X64-SSE2-NEXT: pcmpgtw %xmm4, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm3
+; X64-SSE2-NEXT: por %xmm0, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm2
+; X64-SSE2-NEXT: pxor %xmm1, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X64-SSE2-NEXT: pxor %xmm1, %xmm4
+; X64-SSE2-NEXT: pcmpgtw %xmm4, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm3
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm3, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X64-SSE2-NEXT: pxor %xmm1, %xmm3
+; X64-SSE2-NEXT: pxor %xmm0, %xmm1
+; X64-SSE2-NEXT: pcmpgtw %xmm1, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm3
+; X64-SSE2-NEXT: por %xmm2, %xmm3
+; X64-SSE2-NEXT: movd %xmm3, %eax
+; X64-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i16:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxuw %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxuw %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pmaxuw %xmm0, %xmm1
+; X64-SSE42-NEXT: movd %xmm1, %eax
+; X64-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v8i16:
+; X64-AVX: ## BB#0:
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vmovd %xmm0, %eax
+; X64-AVX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX-NEXT: retq
+  %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %2 = icmp ugt <8 x i16> %a0, %1
+  %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %1
+  %4 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %5 = icmp ugt <8 x i16> %3, %4
+  %6 = select <8 x i1> %5, <8 x i16> %3, <8 x i16> %4
+  %7 = shufflevector <8 x i16> %6, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %8 = icmp ugt <8 x i16> %6, %7
+  %9 = select <8 x i1> %8, <8 x i16> %6, <8 x i16> %7
+  %10 = extractelement <8 x i16> %9, i32 0
+  ret i16 %10
+}
+
+define i8 @test_reduce_v16i8(<16 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i8:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psrld $16, %xmm1
+; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE2-NEXT: movd %xmm0, %eax
+; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i8:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: psrlw $8, %xmm0
+; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v16i8:
+; X86-AVX: ## BB#0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i8:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: psrld $16, %xmm1
+; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE2-NEXT: movd %xmm0, %eax
+; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i8:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: psrlw $8, %xmm0
+; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v16i8:
+; X64-AVX: ## BB#0:
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX-NEXT: retq
+  %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %2 = icmp ugt <16 x i8> %a0, %1
+  %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %1
+  %4 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %5 = icmp ugt <16 x i8> %3, %4
+  %6 = select <16 x i1> %5, <16 x i8> %3, <16 x i8> %4
+  %7 = shufflevector <16 x i8> %6, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %8 = icmp ugt <16 x i8> %6, %7
+  %9 = select <16 x i1> %8, <16 x i8> %6, <16 x i8> %7
+  %10 = shufflevector <16 x i8> %9, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %11 = icmp ugt <16 x i8> %9, %10
+  %12 = select <16 x i1> %11, <16 x i8> %9, <16 x i8> %10
+  %13 = extractelement <16 x i8> %12, i32 0
+  ret i8 %13
+}
+
+;
+; 256-bit Vectors
+;
+
+define i64 @test_reduce_v4i64(<4 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v4i64:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT: pxor %xmm2, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm5
+; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm6, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm3, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm4
+; X86-SSE2-NEXT: por %xmm0, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm5, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm4
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm4, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %edx
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v4i64:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648]
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm4
+; X86-SSE42-NEXT: pxor %xmm3, %xmm4
+; X86-SSE42-NEXT: pxor %xmm3, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: pxor %xmm3, %xmm0
+; X86-SSE42-NEXT: pxor %xmm2, %xmm3
+; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; X86-SSE42-NEXT: movd %xmm2, %eax +; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx +; X86-SSE42-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v4i64: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] +; X86-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3 +; X86-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm4 +; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm4 +; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; X86-AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 +; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3 +; X86-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm4 +; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; X86-AVX1-NEXT: vpxor %xmm2, %xmm4, %xmm2 +; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm2 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v4i64: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648] +; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3 +; X86-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4 +; X86-AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm3 +; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3 +; X86-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2 +; X86-AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2 +; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v4i64: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; X64-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X64-SSE2-NEXT: pxor %xmm2, %xmm3 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm4 +; X64-SSE2-NEXT: pxor %xmm2, %xmm4 +; X64-SSE2-NEXT: movdqa %xmm4, %xmm5 +; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm4 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm6, %xmm3 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; X64-SSE2-NEXT: por %xmm3, %xmm4 +; X64-SSE2-NEXT: pand %xmm4, %xmm0 +; X64-SSE2-NEXT: pandn %xmm1, %xmm4 +; X64-SSE2-NEXT: por %xmm0, %xmm4 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X64-SSE2-NEXT: pxor %xmm2, %xmm1 +; X64-SSE2-NEXT: pxor %xmm0, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm5, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; X64-SSE2-NEXT: por %xmm1, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm4 +; X64-SSE2-NEXT: pandn %xmm0, %xmm2 +; X64-SSE2-NEXT: por %xmm4, %xmm2 +; X64-SSE2-NEXT: movq %xmm2, %rax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v4i64: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE42-NEXT: movdqa {{.*#+}} 
xmm3 = [9223372036854775808,9223372036854775808]
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm4
+; X64-SSE42-NEXT: pxor %xmm3, %xmm4
+; X64-SSE42-NEXT: pxor %xmm3, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm4, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: pxor %xmm3, %xmm0
+; X64-SSE42-NEXT: pxor %xmm2, %xmm3
+; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X64-SSE42-NEXT: movq %xmm2, %rax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v4i64:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X64-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
+; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm4
+; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm4
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; X64-AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3
+; X64-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm4
+; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; X64-AVX1-NEXT: vpxor %xmm2, %xmm4, %xmm2
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm2
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vmovq %xmm0, %rax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v4i64:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; X64-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3
+; X64-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4
+; X64-AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm3
+; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3
+; X64-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2
+; X64-AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vmovq %xmm0, %rax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v4i64:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxuq %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxuq %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vmovq %xmm0, %rax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+  %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %2 = icmp ugt <4 x i64> %a0, %1
+  %3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %1
+  %4 = shufflevector <4 x i64> %3, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %5 = icmp ugt <4 x i64> %3, %4
+  %6 = select <4 x i1> %5, <4 x i64> %3, <4 x i64> %4
+  %7 = extractelement <4 x i64> %6, i32 0
+  ret i64 %7
+}
+
+define i32 @test_reduce_v8i32(<8 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i32:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT: pxor %xmm2, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm4
+; X86-SSE2-NEXT: por
%xmm0, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm4 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm4, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: pxor %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; X86-SSE2-NEXT: pand %xmm3, %xmm1 +; X86-SSE2-NEXT: pandn %xmm0, %xmm3 +; X86-SSE2-NEXT: por %xmm1, %xmm3 +; X86-SSE2-NEXT: movd %xmm3, %eax +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v8i32: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pmaxud %xmm1, %xmm0 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pmaxud %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE42-NEXT: pmaxud %xmm1, %xmm0 +; X86-SSE42-NEXT: movd %xmm0, %eax +; X86-SSE42-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v8i32: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v8i32: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v8i32: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; X64-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X64-SSE2-NEXT: pxor %xmm2, %xmm3 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm4 +; X64-SSE2-NEXT: pxor %xmm2, %xmm4 +; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; X64-SSE2-NEXT: pand %xmm4, %xmm0 +; X64-SSE2-NEXT: pandn %xmm1, %xmm4 +; X64-SSE2-NEXT: por %xmm0, %xmm4 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X64-SSE2-NEXT: pxor %xmm2, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE2-NEXT: pxor %xmm2, %xmm3 +; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm4 +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm4, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X64-SSE2-NEXT: pxor %xmm2, %xmm3 +; X64-SSE2-NEXT: pxor %xmm0, %xmm2 +; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; X64-SSE2-NEXT: pand %xmm3, %xmm1 +; X64-SSE2-NEXT: pandn %xmm0, %xmm3 +; X64-SSE2-NEXT: por %xmm1, %xmm3 +; X64-SSE2-NEXT: movd %xmm3, %eax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v8i32: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pmaxud %xmm1, %xmm0 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pmaxud %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE42-NEXT: pmaxud %xmm1, %xmm0 +; X64-SSE42-NEXT: movd %xmm0, %eax +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v8i32: +; 
X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v8i32: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v8i32: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX512-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp ugt <8 x i32> %a0, %1 + %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %1 + %4 = shufflevector <8 x i32> %3, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp ugt <8 x i32> %3, %4 + %6 = select <8 x i1> %5, <8 x i32> %3, <8 x i32> %4 + %7 = shufflevector <8 x i32> %6, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp ugt <8 x i32> %6, %7 + %9 = select <8 x i1> %8, <8 x i32> %6, <8 x i32> %7 + %10 = extractelement <8 x i32> %9, i32 0 + ret i32 %10 +} + +define i16 @test_reduce_v16i16(<16 x i16> %a0) { +; X86-SSE2-LABEL: test_reduce_v16i16: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 +; X86-SSE2-NEXT: pxor %xmm2, %xmm4 +; X86-SSE2-NEXT: pcmpgtw %xmm3, %xmm4 +; X86-SSE2-NEXT: pand %xmm4, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm4 +; X86-SSE2-NEXT: por %xmm0, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: pcmpgtw %xmm3, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm4 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm4, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 +; X86-SSE2-NEXT: pxor %xmm2, %xmm4 +; X86-SSE2-NEXT: pcmpgtw %xmm4, %xmm3 +; X86-SSE2-NEXT: pand %xmm3, %xmm1 +; X86-SSE2-NEXT: pandn %xmm0, %xmm3 +; X86-SSE2-NEXT: por %xmm1, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm0 +; X86-SSE2-NEXT: psrld $16, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm1 +; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: pxor %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtw %xmm2, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm3 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm3, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: ## kill: %AX %AX %EAX +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v16i16: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pmaxuw 
%xmm1, %xmm0 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pmaxuw %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE42-NEXT: pmaxuw %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE42-NEXT: psrld $16, %xmm1 +; X86-SSE42-NEXT: pmaxuw %xmm0, %xmm1 +; X86-SSE42-NEXT: movd %xmm1, %eax +; X86-SSE42-NEXT: ## kill: %AX %AX %EAX +; X86-SSE42-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v16i16: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: ## kill: %AX %AX %EAX +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v16i16: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: %AX %AX %EAX +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v16i16: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X64-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X64-SSE2-NEXT: pxor %xmm2, %xmm3 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm4 +; X64-SSE2-NEXT: pxor %xmm2, %xmm4 +; X64-SSE2-NEXT: pcmpgtw %xmm3, %xmm4 +; X64-SSE2-NEXT: pand %xmm4, %xmm0 +; X64-SSE2-NEXT: pandn %xmm1, %xmm4 +; X64-SSE2-NEXT: por %xmm0, %xmm4 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X64-SSE2-NEXT: pxor %xmm2, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE2-NEXT: pxor %xmm2, %xmm3 +; X64-SSE2-NEXT: pcmpgtw %xmm3, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm4 +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm4, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X64-SSE2-NEXT: pxor %xmm2, %xmm3 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm4 +; X64-SSE2-NEXT: pxor %xmm2, %xmm4 +; X64-SSE2-NEXT: pcmpgtw %xmm4, %xmm3 +; X64-SSE2-NEXT: pand %xmm3, %xmm1 +; X64-SSE2-NEXT: pandn %xmm0, %xmm3 +; X64-SSE2-NEXT: por %xmm1, %xmm3 +; X64-SSE2-NEXT: movdqa %xmm3, %xmm0 +; X64-SSE2-NEXT: psrld $16, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm3, %xmm1 +; X64-SSE2-NEXT: pxor %xmm2, %xmm1 +; X64-SSE2-NEXT: pxor %xmm0, %xmm2 +; X64-SSE2-NEXT: pcmpgtw %xmm2, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm3 +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm3, %xmm1 +; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: ## kill: %AX %AX %EAX +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v16i16: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pmaxuw %xmm1, %xmm0 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pmaxuw %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE42-NEXT: pmaxuw %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE42-NEXT: psrld $16, %xmm1 +; X64-SSE42-NEXT: pmaxuw 
%xmm0, %xmm1 +; X64-SSE42-NEXT: movd %xmm1, %eax +; X64-SSE42-NEXT: ## kill: %AX %AX %EAX +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v16i16: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: ## kill: %AX %AX %EAX +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v16i16: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: ## kill: %AX %AX %EAX +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v16i16: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: ## kill: %AX %AX %EAX +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp ugt <16 x i16> %a0, %1 + %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %1 + %4 = shufflevector <16 x i16> %3, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp ugt <16 x i16> %3, %4 + %6 = select <16 x i1> %5, <16 x i16> %3, <16 x i16> %4 + %7 = shufflevector <16 x i16> %6, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp ugt <16 x i16> %6, %7 + %9 = select <16 x i1> %8, <16 x i16> %6, <16 x i16> %7 + %10 = shufflevector <16 x i16> %9, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %11 = icmp ugt <16 x i16> %9, %10 + %12 = select <16 x i1> %11, <16 x i16> %9, <16 x i16> %10 + %13 = extractelement <16 x i16> %12, i32 0 + ret i16 %13 +} + +define i8 @test_reduce_v32i8(<32 x i8> %a0) { +; X86-SSE2-LABEL: test_reduce_v32i8: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: psrlw $8, %xmm0 +; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0 +; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: ## kill: %AL %AL %EAX +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v32i8: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, 
%xmm1 +; X86-SSE42-NEXT: psrld $16, %xmm1 +; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1 +; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE42-NEXT: psrlw $8, %xmm0 +; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0 +; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax +; X86-SSE42-NEXT: ## kill: %AL %AL %EAX +; X86-SSE42-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v32i8: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax +; X86-AVX1-NEXT: ## kill: %AL %AL %EAX +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v32i8: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax +; X86-AVX2-NEXT: ## kill: %AL %AL %EAX +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v32i8: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: psrld $16, %xmm1 +; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE2-NEXT: psrlw $8, %xmm0 +; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0 +; X64-SSE2-NEXT: movd %xmm0, %eax +; X64-SSE2-NEXT: ## kill: %AL %AL %EAX +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v32i8: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE42-NEXT: psrld $16, %xmm1 +; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1 +; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE42-NEXT: psrlw $8, %xmm0 +; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0 +; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax +; X64-SSE42-NEXT: ## kill: %AL %AL %EAX +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v32i8: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX1-NEXT: ## kill: %AL %AL %EAX +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; 
+; X64-AVX2-LABEL: test_reduce_v32i8: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX2-NEXT: ## kill: %AL %AL %EAX +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v32i8: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX512-NEXT: ## kill: %AL %AL %EAX +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp ugt <32 x i8> %a0, %1 + %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %1 + %4 = shufflevector <32 x i8> %3, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp ugt <32 x i8> %3, %4 + %6 = select <32 x i1> %5, <32 x i8> %3, <32 x i8> %4 + %7 = shufflevector <32 x i8> %6, <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp ugt <32 x i8> %6, %7 + %9 = select <32 x i1> %8, <32 x i8> %6, <32 x i8> %7 + %10 = shufflevector <32 x i8> %9, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %11 = icmp ugt <32 x i8> %9, %10 + %12 = select <32 x i1> %11, <32 x i8> %9, <32 x i8> %10 + %13 = shufflevector <32 x i8> %12, <32 x i8> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %14 = icmp ugt <32 x i8> %12, %13 + %15 = select <32 x i1> %14, <32 x i8> %12, <32 x i8> %13 + %16 = extractelement <32 x i8> %15, i32 0 + ret i8 %16 +} + +; +; 512-bit Vectors +; + +define i64 @test_reduce_v8i64(<8 x i64> %a0) { +; X86-SSE2-LABEL: test_reduce_v8i64: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: subl $28, %esp +; X86-SSE2-NEXT: .cfi_def_cfa_offset 32 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm5, (%esp) ## 16-byte Spill +; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; X86-SSE2-NEXT: pxor %xmm4, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm6 +; X86-SSE2-NEXT: pxor %xmm4, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm6, %xmm7 +; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; X86-SSE2-NEXT: pcmpeqd %xmm5, %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm5, %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; X86-SSE2-NEXT: por %xmm6, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm6 +; X86-SSE2-NEXT: pxor %xmm4, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm7 +; X86-SSE2-NEXT: pxor %xmm4, %xmm7 +; X86-SSE2-NEXT: movdqa %xmm7, %xmm0 +; X86-SSE2-NEXT: pcmpgtd %xmm6, %xmm0 +; X86-SSE2-NEXT: pcmpeqd %xmm6, %xmm7 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] +; 
X86-SSE2-NEXT: pand %xmm6, %xmm7 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; X86-SSE2-NEXT: por %xmm7, %xmm6 +; X86-SSE2-NEXT: pand %xmm6, %xmm1 +; X86-SSE2-NEXT: pandn %xmm3, %xmm6 +; X86-SSE2-NEXT: por %xmm1, %xmm6 +; X86-SSE2-NEXT: pand %xmm5, %xmm2 +; X86-SSE2-NEXT: pandn (%esp), %xmm5 ## 16-byte Folded Reload +; X86-SSE2-NEXT: por %xmm2, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm0 +; X86-SSE2-NEXT: pxor %xmm4, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm6, %xmm1 +; X86-SSE2-NEXT: pxor %xmm4, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: por %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm0, %xmm6 +; X86-SSE2-NEXT: pandn %xmm5, %xmm0 +; X86-SSE2-NEXT: por %xmm6, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pxor %xmm4, %xmm2 +; X86-SSE2-NEXT: pxor %xmm1, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 +; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm3 +; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm2, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; X86-SSE2-NEXT: por %xmm4, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X86-SSE2-NEXT: movd %xmm0, %edx +; X86-SSE2-NEXT: addl $28, %esp +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v8i64: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: movdqa %xmm0, %xmm4 +; X86-SSE42-NEXT: movdqa {{.*#+}} xmm6 = [0,2147483648,0,2147483648] +; X86-SSE42-NEXT: movdqa %xmm3, %xmm0 +; X86-SSE42-NEXT: pxor %xmm6, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm1, %xmm5 +; X86-SSE42-NEXT: pxor %xmm6, %xmm5 +; X86-SSE42-NEXT: pcmpgtq %xmm0, %xmm5 +; X86-SSE42-NEXT: movdqa %xmm2, %xmm7 +; X86-SSE42-NEXT: pxor %xmm6, %xmm7 +; X86-SSE42-NEXT: movdqa %xmm4, %xmm0 +; X86-SSE42-NEXT: pxor %xmm6, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm7, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2 +; X86-SSE42-NEXT: movdqa %xmm5, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3 +; X86-SSE42-NEXT: movapd %xmm3, %xmm1 +; X86-SSE42-NEXT: xorpd %xmm6, %xmm1 +; X86-SSE42-NEXT: movapd %xmm2, %xmm0 +; X86-SSE42-NEXT: xorpd %xmm6, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] +; X86-SSE42-NEXT: movdqa %xmm3, %xmm0 +; X86-SSE42-NEXT: pxor %xmm6, %xmm0 +; X86-SSE42-NEXT: pxor %xmm1, %xmm6 +; X86-SSE42-NEXT: pcmpgtq %xmm6, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 +; X86-SSE42-NEXT: movd %xmm1, %eax +; X86-SSE42-NEXT: pextrd $1, %xmm1, %edx +; X86-SSE42-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v8i64: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648] +; X86-AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; X86-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 +; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 +; X86-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4 +; X86-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm5 +; X86-AVX1-NEXT: vpcmpgtq 
%xmm4, %xmm5, %xmm4 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm2 +; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm4 +; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm4 +; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm2 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2 +; X86-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm4 +; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; X86-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm3 +; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v8i64: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648] +; X86-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3 +; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm4 +; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 +; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3 +; X86-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm4 +; X86-AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm3 +; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3 +; X86-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2 +; X86-AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2 +; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v8i64: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; X64-SSE2-NEXT: movdqa %xmm3, %xmm5 +; X64-SSE2-NEXT: pxor %xmm4, %xmm5 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm6 +; X64-SSE2-NEXT: pxor %xmm4, %xmm6 +; X64-SSE2-NEXT: movdqa %xmm6, %xmm7 +; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm5, %xmm6 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm8, %xmm6 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3] +; X64-SSE2-NEXT: por %xmm6, %xmm8 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm6 +; X64-SSE2-NEXT: pxor %xmm4, %xmm6 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm7 +; X64-SSE2-NEXT: pxor %xmm4, %xmm7 +; X64-SSE2-NEXT: movdqa %xmm7, %xmm5 +; X64-SSE2-NEXT: pcmpgtd %xmm6, %xmm5 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm6, %xmm7 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm9, %xmm7 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] +; X64-SSE2-NEXT: por %xmm7, %xmm6 +; X64-SSE2-NEXT: pand %xmm6, %xmm0 +; X64-SSE2-NEXT: pandn %xmm2, %xmm6 +; X64-SSE2-NEXT: por %xmm0, %xmm6 +; X64-SSE2-NEXT: pand %xmm8, %xmm1 +; X64-SSE2-NEXT: pandn %xmm3, %xmm8 +; X64-SSE2-NEXT: por %xmm1, %xmm8 +; X64-SSE2-NEXT: movdqa %xmm8, %xmm0 +; X64-SSE2-NEXT: pxor %xmm4, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm6, %xmm1 +; X64-SSE2-NEXT: pxor %xmm4, %xmm1 +; X64-SSE2-NEXT: movdqa 
%xmm1, %xmm2 +; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm3, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; X64-SSE2-NEXT: por %xmm0, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm6 +; X64-SSE2-NEXT: pandn %xmm8, %xmm1 +; X64-SSE2-NEXT: por %xmm6, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE2-NEXT: pxor %xmm4, %xmm2 +; X64-SSE2-NEXT: pxor %xmm0, %xmm4 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm3 +; X64-SSE2-NEXT: pcmpgtd %xmm4, %xmm3 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm2, %xmm4 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm5, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; X64-SSE2-NEXT: por %xmm2, %xmm3 +; X64-SSE2-NEXT: pand %xmm3, %xmm1 +; X64-SSE2-NEXT: pandn %xmm0, %xmm3 +; X64-SSE2-NEXT: por %xmm1, %xmm3 +; X64-SSE2-NEXT: movq %xmm3, %rax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v8i64: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: movdqa %xmm0, %xmm4 +; X64-SSE42-NEXT: movdqa {{.*#+}} xmm6 = [9223372036854775808,9223372036854775808] +; X64-SSE42-NEXT: movdqa %xmm3, %xmm0 +; X64-SSE42-NEXT: pxor %xmm6, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm1, %xmm5 +; X64-SSE42-NEXT: pxor %xmm6, %xmm5 +; X64-SSE42-NEXT: pcmpgtq %xmm0, %xmm5 +; X64-SSE42-NEXT: movdqa %xmm2, %xmm7 +; X64-SSE42-NEXT: pxor %xmm6, %xmm7 +; X64-SSE42-NEXT: movdqa %xmm4, %xmm0 +; X64-SSE42-NEXT: pxor %xmm6, %xmm0 +; X64-SSE42-NEXT: pcmpgtq %xmm7, %xmm0 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2 +; X64-SSE42-NEXT: movdqa %xmm5, %xmm0 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3 +; X64-SSE42-NEXT: movapd %xmm3, %xmm1 +; X64-SSE42-NEXT: xorpd %xmm6, %xmm1 +; X64-SSE42-NEXT: movapd %xmm2, %xmm0 +; X64-SSE42-NEXT: xorpd %xmm6, %xmm0 +; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] +; X64-SSE42-NEXT: movdqa %xmm3, %xmm0 +; X64-SSE42-NEXT: pxor %xmm6, %xmm0 +; X64-SSE42-NEXT: pxor %xmm1, %xmm6 +; X64-SSE42-NEXT: pcmpgtq %xmm6, %xmm0 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 +; X64-SSE42-NEXT: movq %xmm1, %rax +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v8i64: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; X64-AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; X64-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 +; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 +; X64-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4 +; X64-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm5 +; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm2 +; X64-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm4 +; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm4 +; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm2 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2 +; X64-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm4 +; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm2 +; 
X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; X64-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm3 +; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v8i64: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; X64-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3 +; X64-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm4 +; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 +; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3 +; X64-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm4 +; X64-AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm3 +; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3 +; X64-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2 +; X64-AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2 +; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v8i64: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; X64-AVX512-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vmovq %xmm0, %rax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp ugt <8 x i64> %a0, %1 + %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %1 + %4 = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp ugt <8 x i64> %3, %4 + %6 = select <8 x i1> %5, <8 x i64> %3, <8 x i64> %4 + %7 = shufflevector <8 x i64> %6, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp ugt <8 x i64> %6, %7 + %9 = select <8 x i1> %8, <8 x i64> %6, <8 x i64> %7 + %10 = extractelement <8 x i64> %9, i32 0 + ret i64 %10 +} + +define i32 @test_reduce_v16i32(<16 x i32> %a0) { +; X86-SSE2-LABEL: test_reduce_v16i32: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; X86-SSE2-NEXT: movdqa %xmm3, %xmm5 +; X86-SSE2-NEXT: pxor %xmm4, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm6 +; X86-SSE2-NEXT: pxor %xmm4, %xmm6 +; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 +; X86-SSE2-NEXT: pxor %xmm4, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm7 +; X86-SSE2-NEXT: pxor %xmm4, %xmm7 +; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; X86-SSE2-NEXT: pand %xmm7, %xmm0 +; X86-SSE2-NEXT: pandn %xmm2, %xmm7 +; X86-SSE2-NEXT: por %xmm0, %xmm7 +; X86-SSE2-NEXT: pand %xmm6, %xmm1 +; X86-SSE2-NEXT: pandn %xmm3, %xmm6 +; X86-SSE2-NEXT: por %xmm1, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm6, %xmm0 +; X86-SSE2-NEXT: pxor %xmm4, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm7, %xmm1 +; X86-SSE2-NEXT: pxor %xmm4, %xmm1 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm7 +; X86-SSE2-NEXT: pandn %xmm6, %xmm1 +; X86-SSE2-NEXT: por %xmm7, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pxor %xmm4, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: pxor %xmm4, %xmm3 
+; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pandn %xmm0, %xmm2 +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pxor %xmm4, %xmm1 +; X86-SSE2-NEXT: pxor %xmm0, %xmm4 +; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v16i32: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pmaxud %xmm3, %xmm1 +; X86-SSE42-NEXT: pmaxud %xmm2, %xmm0 +; X86-SSE42-NEXT: pmaxud %xmm1, %xmm0 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pmaxud %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE42-NEXT: pmaxud %xmm1, %xmm0 +; X86-SSE42-NEXT: movd %xmm0, %eax +; X86-SSE42-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v16i32: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-AVX1-NEXT: vpmaxud %xmm2, %xmm3, %xmm2 +; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpmaxud %xmm2, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v16i32: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v16i32: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; X64-SSE2-NEXT: movdqa %xmm3, %xmm5 +; X64-SSE2-NEXT: pxor %xmm4, %xmm5 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm6 +; X64-SSE2-NEXT: pxor %xmm4, %xmm6 +; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm6 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm5 +; X64-SSE2-NEXT: pxor %xmm4, %xmm5 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm7 +; X64-SSE2-NEXT: pxor %xmm4, %xmm7 +; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; X64-SSE2-NEXT: pand %xmm7, %xmm0 +; X64-SSE2-NEXT: pandn %xmm2, %xmm7 +; X64-SSE2-NEXT: por %xmm0, %xmm7 +; X64-SSE2-NEXT: pand %xmm6, %xmm1 +; X64-SSE2-NEXT: pandn %xmm3, %xmm6 +; X64-SSE2-NEXT: por %xmm1, %xmm6 +; X64-SSE2-NEXT: movdqa %xmm6, %xmm0 +; X64-SSE2-NEXT: pxor %xmm4, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm7, %xmm1 +; X64-SSE2-NEXT: pxor %xmm4, %xmm1 +; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm7 +; X64-SSE2-NEXT: pandn %xmm6, %xmm1 +; X64-SSE2-NEXT: por %xmm7, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE2-NEXT: pxor %xmm4, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE2-NEXT: pxor %xmm4, %xmm3 +; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm1 +; X64-SSE2-NEXT: pandn %xmm0, %xmm2 +; X64-SSE2-NEXT: por %xmm1, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X64-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X64-SSE2-NEXT: pxor %xmm4, %xmm1 +; 
X64-SSE2-NEXT: pxor %xmm0, %xmm4 +; X64-SSE2-NEXT: pcmpgtd %xmm4, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm2, %xmm1 +; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v16i32: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pmaxud %xmm3, %xmm1 +; X64-SSE42-NEXT: pmaxud %xmm2, %xmm0 +; X64-SSE42-NEXT: pmaxud %xmm1, %xmm0 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pmaxud %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE42-NEXT: pmaxud %xmm1, %xmm0 +; X64-SSE42-NEXT: movd %xmm0, %eax +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v16i32: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X64-AVX1-NEXT: vpmaxud %xmm2, %xmm3, %xmm2 +; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpmaxud %xmm2, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v16i32: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v16i32: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; X64-AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp ugt <16 x i32> %a0, %1 + %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %1 + %4 = shufflevector <16 x i32> %3, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp ugt <16 x i32> %3, %4 + %6 = select <16 x i1> %5, <16 x i32> %3, <16 x i32> %4 + %7 = shufflevector <16 x i32> %6, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp ugt <16 x i32> %6, %7 + %9 = select <16 x i1> %8, <16 x i32> %6, <16 x i32> %7 + %10 = shufflevector <16 x i32> %9, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %11 = icmp ugt <16 x i32> %9, %10 + %12 = select <16 x i1> %11, <16 x i32> %9, <16 x i32> %10 + %13 = extractelement <16 x i32> %12, i32 0 + ret i32 %13 +} + +define i16 @test_reduce_v32i16(<32 x i16> %a0) { +; X86-SSE2-LABEL: test_reduce_v32i16: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X86-SSE2-NEXT: movdqa %xmm3, %xmm5 +; X86-SSE2-NEXT: pxor %xmm4, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm6 +; X86-SSE2-NEXT: pxor %xmm4, %xmm6 +; X86-SSE2-NEXT: pcmpgtw %xmm5, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 +; X86-SSE2-NEXT: pxor %xmm4, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm7 +; X86-SSE2-NEXT: pxor 
%xmm4, %xmm7 +; X86-SSE2-NEXT: pcmpgtw %xmm5, %xmm7 +; X86-SSE2-NEXT: pand %xmm7, %xmm0 +; X86-SSE2-NEXT: pandn %xmm2, %xmm7 +; X86-SSE2-NEXT: por %xmm0, %xmm7 +; X86-SSE2-NEXT: pand %xmm6, %xmm1 +; X86-SSE2-NEXT: pandn %xmm3, %xmm6 +; X86-SSE2-NEXT: por %xmm1, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm6, %xmm0 +; X86-SSE2-NEXT: pxor %xmm4, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm7, %xmm1 +; X86-SSE2-NEXT: pxor %xmm4, %xmm1 +; X86-SSE2-NEXT: pcmpgtw %xmm0, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm7 +; X86-SSE2-NEXT: pandn %xmm6, %xmm1 +; X86-SSE2-NEXT: por %xmm7, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pxor %xmm4, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: pxor %xmm4, %xmm3 +; X86-SSE2-NEXT: pcmpgtw %xmm3, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pandn %xmm0, %xmm2 +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pxor %xmm4, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: pxor %xmm4, %xmm3 +; X86-SSE2-NEXT: pcmpgtw %xmm3, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: psrld $16, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pxor %xmm4, %xmm2 +; X86-SSE2-NEXT: pxor %xmm0, %xmm4 +; X86-SSE2-NEXT: pcmpgtw %xmm4, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pandn %xmm0, %xmm2 +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: ## kill: %AX %AX %EAX +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v32i16: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pmaxuw %xmm3, %xmm1 +; X86-SSE42-NEXT: pmaxuw %xmm2, %xmm0 +; X86-SSE42-NEXT: pmaxuw %xmm1, %xmm0 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pmaxuw %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE42-NEXT: pmaxuw %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE42-NEXT: psrld $16, %xmm1 +; X86-SSE42-NEXT: pmaxuw %xmm0, %xmm1 +; X86-SSE42-NEXT: movd %xmm1, %eax +; X86-SSE42-NEXT: ## kill: %AX %AX %EAX +; X86-SSE42-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v32i16: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-AVX1-NEXT: vpmaxuw %xmm2, %xmm3, %xmm2 +; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpmaxuw %xmm2, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: ## kill: %AX %AX %EAX +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v32i16: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## 
kill: %AX %AX %EAX +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v32i16: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X64-SSE2-NEXT: movdqa %xmm3, %xmm5 +; X64-SSE2-NEXT: pxor %xmm4, %xmm5 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm6 +; X64-SSE2-NEXT: pxor %xmm4, %xmm6 +; X64-SSE2-NEXT: pcmpgtw %xmm5, %xmm6 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm5 +; X64-SSE2-NEXT: pxor %xmm4, %xmm5 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm7 +; X64-SSE2-NEXT: pxor %xmm4, %xmm7 +; X64-SSE2-NEXT: pcmpgtw %xmm5, %xmm7 +; X64-SSE2-NEXT: pand %xmm7, %xmm0 +; X64-SSE2-NEXT: pandn %xmm2, %xmm7 +; X64-SSE2-NEXT: por %xmm0, %xmm7 +; X64-SSE2-NEXT: pand %xmm6, %xmm1 +; X64-SSE2-NEXT: pandn %xmm3, %xmm6 +; X64-SSE2-NEXT: por %xmm1, %xmm6 +; X64-SSE2-NEXT: movdqa %xmm6, %xmm0 +; X64-SSE2-NEXT: pxor %xmm4, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm7, %xmm1 +; X64-SSE2-NEXT: pxor %xmm4, %xmm1 +; X64-SSE2-NEXT: pcmpgtw %xmm0, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm7 +; X64-SSE2-NEXT: pandn %xmm6, %xmm1 +; X64-SSE2-NEXT: por %xmm7, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE2-NEXT: pxor %xmm4, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE2-NEXT: pxor %xmm4, %xmm3 +; X64-SSE2-NEXT: pcmpgtw %xmm3, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm1 +; X64-SSE2-NEXT: pandn %xmm0, %xmm2 +; X64-SSE2-NEXT: por %xmm1, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X64-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X64-SSE2-NEXT: pxor %xmm4, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE2-NEXT: pxor %xmm4, %xmm3 +; X64-SSE2-NEXT: pcmpgtw %xmm3, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm2, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE2-NEXT: psrld $16, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE2-NEXT: pxor %xmm4, %xmm2 +; X64-SSE2-NEXT: pxor %xmm0, %xmm4 +; X64-SSE2-NEXT: pcmpgtw %xmm4, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm1 +; X64-SSE2-NEXT: pandn %xmm0, %xmm2 +; X64-SSE2-NEXT: por %xmm1, %xmm2 +; X64-SSE2-NEXT: movd %xmm2, %eax +; X64-SSE2-NEXT: ## kill: %AX %AX %EAX +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v32i16: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pmaxuw %xmm3, %xmm1 +; X64-SSE42-NEXT: pmaxuw %xmm2, %xmm0 +; X64-SSE42-NEXT: pmaxuw %xmm1, %xmm0 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pmaxuw %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE42-NEXT: pmaxuw %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE42-NEXT: psrld $16, %xmm1 +; X64-SSE42-NEXT: pmaxuw %xmm0, %xmm1 +; X64-SSE42-NEXT: movd %xmm1, %eax +; X64-SSE42-NEXT: ## kill: %AX %AX %EAX +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v32i16: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X64-AVX1-NEXT: vpmaxuw %xmm2, %xmm3, %xmm2 +; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpmaxuw %xmm2, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: ## kill: %AX %AX %EAX +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; 
X64-AVX2-LABEL: test_reduce_v32i16: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: ## kill: %AX %AX %EAX +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v32i16: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; X64-AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: ## kill: %AX %AX %EAX +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp ugt <32 x i16> %a0, %1 + %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %1 + %4 = shufflevector <32 x i16> %3, <32 x i16> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp ugt <32 x i16> %3, %4 + %6 = select <32 x i1> %5, <32 x i16> %3, <32 x i16> %4 + %7 = shufflevector <32 x i16> %6, <32 x i16> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp ugt <32 x i16> %6, %7 + %9 = select <32 x i1> %8, <32 x i16> %6, <32 x i16> %7 + %10 = shufflevector <32 x i16> %9, <32 x i16> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %11 = icmp ugt <32 x i16> %9, %10 + %12 = select <32 x i1> %11, <32 x i16> %9, <32 x i16> %10 + %13 = shufflevector <32 x i16> %12, <32 x i16> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %14 = icmp ugt <32 x i16> %12, %13 + %15 = select <32 x i1> %14, <32 x i16> %12, <32 x i16> %13 + %16 = extractelement <32 x i16> %15, i32 0 + ret i16 %16 +} + +define i8 @test_reduce_v64i8(<64 x i8> %a0) { +; X86-SSE2-LABEL: test_reduce_v64i8: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: pmaxub %xmm3, %xmm1 +; X86-SSE2-NEXT: pmaxub %xmm2, %xmm0 +; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: psrlw $8, %xmm0 +; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0 +; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: ## kill: %AL %AL %EAX +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v64i8: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pmaxub %xmm3, %xmm1 +; X86-SSE42-NEXT: pmaxub %xmm2, %xmm0 +; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE42-NEXT: psrld $16, %xmm1 +; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1 +; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE42-NEXT: psrlw $8, %xmm0 +; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0 +; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax 
+; X86-SSE42-NEXT: ## kill: %AL %AL %EAX +; X86-SSE42-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v64i8: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-AVX1-NEXT: vpmaxub %xmm2, %xmm3, %xmm2 +; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpmaxub %xmm2, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax +; X86-AVX1-NEXT: ## kill: %AL %AL %EAX +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v64i8: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax +; X86-AVX2-NEXT: ## kill: %AL %AL %EAX +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v64i8: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: pmaxub %xmm3, %xmm1 +; X64-SSE2-NEXT: pmaxub %xmm2, %xmm0 +; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: psrld $16, %xmm1 +; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE2-NEXT: psrlw $8, %xmm0 +; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0 +; X64-SSE2-NEXT: movd %xmm0, %eax +; X64-SSE2-NEXT: ## kill: %AL %AL %EAX +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v64i8: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pmaxub %xmm3, %xmm1 +; X64-SSE42-NEXT: pmaxub %xmm2, %xmm0 +; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE42-NEXT: psrld $16, %xmm1 +; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1 +; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE42-NEXT: psrlw $8, %xmm0 +; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0 +; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax +; X64-SSE42-NEXT: ## kill: %AL %AL %EAX +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v64i8: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X64-AVX1-NEXT: vpmaxub %xmm2, %xmm3, %xmm2 +; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpmaxub %xmm2, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpmaxub 
%xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX1-NEXT: ## kill: %AL %AL %EAX +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v64i8: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX2-NEXT: ## kill: %AL %AL %EAX +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v64i8: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX512-NEXT: ## kill: %AL %AL %EAX +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp ugt <64 x i8> %a0, %1 + %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %1 + %4 = shufflevector <64 x i8> %3, <64 x i8> undef, <64 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp ugt <64 x i8> %3, %4 + %6 = select <64 x i1> %5, <64 x i8> %3, <64 x i8> %4 + %7 = shufflevector <64 x i8> %6, <64 x i8> undef, <64 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp ugt <64 x i8> %6, %7 + %9 = select <64 x i1> %8, <64 x i8> %6, <64 x i8> %7 + %10 = shufflevector <64 x i8> %9, <64 x i8> undef, <64 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %11 = icmp ugt <64 x i8> %9, %10 + %12 = select <64 x i1> %11, <64 x i8> %9, <64 x i8> %10 + %13 = shufflevector <64 x i8> %12, <64 x i8> undef, <64 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %14 = icmp ugt <64 x i8> %12, %13 + %15 = select <64 x i1> %14, <64 x i8> %12, <64 x i8> %13 + %16 = shufflevector <64 x i8> %15, <64 x i8> undef, <64 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %17 = icmp ugt <64 x i8> %15, %16 + %18 = select <64 x i1> %17, <64 x i8> %15, <64 x i8> %16 + %19 = extractelement <64 x i8> %18, i32 0 + ret i8 %19 +} diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll new file mode 100644 index 000000000000..433696730420 --- /dev/null +++ b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll @@ -0,0 +1,2207 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE2 +; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE42 +; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX1 +; RUN: llc < %s -mtriple=i686-apple-darwin 
-mattr=+avx2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX2 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE2 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE42 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX512 + +; +; 128-bit Vectors +; + +define i64 @test_reduce_v2i64(<2 x i64> %a0) { +; X86-SSE2-LABEL: test_reduce_v2i64: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: pxor %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm5, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; X86-SSE2-NEXT: por %xmm2, %xmm3 +; X86-SSE2-NEXT: pand %xmm3, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm3 +; X86-SSE2-NEXT: por %xmm0, %xmm3 +; X86-SSE2-NEXT: movd %xmm3, %eax +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] +; X86-SSE2-NEXT: movd %xmm0, %edx +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v2i64: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; X86-SSE42-NEXT: movdqa {{.*#+}} xmm0 = [0,2147483648,0,2147483648] +; X86-SSE42-NEXT: movdqa %xmm1, %xmm3 +; X86-SSE42-NEXT: pxor %xmm0, %xmm3 +; X86-SSE42-NEXT: pxor %xmm2, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; X86-SSE42-NEXT: movd %xmm2, %eax +; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx +; X86-SSE42-NEXT: retl +; +; X86-AVX-LABEL: test_reduce_v2i64: +; X86-AVX: ## BB#0: +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] +; X86-AVX-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; X86-AVX-NEXT: vpxor %xmm2, %xmm1, %xmm2 +; X86-AVX-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 +; X86-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX-NEXT: vmovd %xmm0, %eax +; X86-AVX-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v2i64: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE2-NEXT: pxor %xmm2, %xmm3 +; X64-SSE2-NEXT: pxor %xmm1, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm4 +; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm5, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; X64-SSE2-NEXT: por %xmm2, %xmm3 +; X64-SSE2-NEXT: pand %xmm3, %xmm0 +; X64-SSE2-NEXT: 
pandn %xmm1, %xmm3 +; X64-SSE2-NEXT: por %xmm0, %xmm3 +; X64-SSE2-NEXT: movq %xmm3, %rax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v2i64: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; X64-SSE42-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808] +; X64-SSE42-NEXT: movdqa %xmm1, %xmm3 +; X64-SSE42-NEXT: pxor %xmm0, %xmm3 +; X64-SSE42-NEXT: pxor %xmm2, %xmm0 +; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; X64-SSE42-NEXT: movq %xmm2, %rax +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v2i64: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; X64-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2 +; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 +; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v2i64: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; X64-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 +; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 +; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v2i64: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpminuq %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovq %xmm0, %rax +; X64-AVX512-NEXT: retq + %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> <i32 1, i32 undef> + %2 = icmp ult <2 x i64> %a0, %1 + %3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %1 + %4 = extractelement <2 x i64> %3, i32 0 + ret i64 %4 +} + +define i32 @test_reduce_v4i32(<4 x i32> %a0) { +; X86-SSE2-LABEL: test_reduce_v4i32: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 +; X86-SSE2-NEXT: pxor %xmm2, %xmm4 +; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; X86-SSE2-NEXT: pand %xmm4, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm4 +; X86-SSE2-NEXT: por %xmm0, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,2,3] +; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: pxor %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm4 +; X86-SSE2-NEXT: pandn %xmm0, %xmm2 +; X86-SSE2-NEXT: por %xmm4, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v4i32: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pminud %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE42-NEXT: pminud %xmm1, %xmm0 +; X86-SSE42-NEXT: movd %xmm0, %eax +; X86-SSE42-NEXT: retl +; +; X86-AVX-LABEL: test_reduce_v4i32: +; X86-AVX: ## BB#0: +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovd %xmm0, %eax +; X86-AVX-NEXT: retl +; +; X64-SSE2-LABEL:
test_reduce_v4i32: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE2-NEXT: pxor %xmm2, %xmm3 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm4 +; X64-SSE2-NEXT: pxor %xmm2, %xmm4 +; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; X64-SSE2-NEXT: pand %xmm4, %xmm0 +; X64-SSE2-NEXT: pandn %xmm1, %xmm4 +; X64-SSE2-NEXT: por %xmm0, %xmm4 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,2,3] +; X64-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X64-SSE2-NEXT: pxor %xmm2, %xmm1 +; X64-SSE2-NEXT: pxor %xmm0, %xmm2 +; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm4 +; X64-SSE2-NEXT: pandn %xmm0, %xmm2 +; X64-SSE2-NEXT: por %xmm4, %xmm2 +; X64-SSE2-NEXT: movd %xmm2, %eax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v4i32: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pminud %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE42-NEXT: pminud %xmm1, %xmm0 +; X64-SSE42-NEXT: movd %xmm0, %eax +; X64-SSE42-NEXT: retq +; +; X64-AVX-LABEL: test_reduce_v4i32: +; X64-AVX: ## BB#0: +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vmovd %xmm0, %eax +; X64-AVX-NEXT: retq + %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> + %2 = icmp ult <4 x i32> %a0, %1 + %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %1 + %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> + %5 = icmp ult <4 x i32> %3, %4 + %6 = select <4 x i1> %5, <4 x i32> %3, <4 x i32> %4 + %7 = extractelement <4 x i32> %6, i32 0 + ret i32 %7 +} + +define i16 @test_reduce_v8i16(<8 x i16> %a0) { +; X86-SSE2-LABEL: test_reduce_v8i16: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: pxor %xmm1, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE2-NEXT: pxor %xmm1, %xmm4 +; X86-SSE2-NEXT: pcmpgtw %xmm3, %xmm4 +; X86-SSE2-NEXT: pand %xmm4, %xmm0 +; X86-SSE2-NEXT: pandn %xmm2, %xmm4 +; X86-SSE2-NEXT: por %xmm0, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,2,3] +; X86-SSE2-NEXT: movdqa %xmm4, %xmm2 +; X86-SSE2-NEXT: pxor %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: pxor %xmm1, %xmm3 +; X86-SSE2-NEXT: pcmpgtw %xmm2, %xmm3 +; X86-SSE2-NEXT: pand %xmm3, %xmm4 +; X86-SSE2-NEXT: pandn %xmm0, %xmm3 +; X86-SSE2-NEXT: por %xmm4, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm0 +; X86-SSE2-NEXT: psrld $16, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm2 +; X86-SSE2-NEXT: pxor %xmm1, %xmm2 +; X86-SSE2-NEXT: pxor %xmm0, %xmm1 +; X86-SSE2-NEXT: pcmpgtw %xmm2, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm3 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm3, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: ## kill: %AX %AX %EAX +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v8i16: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pminuw %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE42-NEXT: pminuw %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE42-NEXT: psrld $16, %xmm1 +; X86-SSE42-NEXT: pminuw %xmm0, %xmm1 +;
X86-SSE42-NEXT: movd %xmm1, %eax +; X86-SSE42-NEXT: ## kill: %AX %AX %EAX +; X86-SSE42-NEXT: retl +; +; X86-AVX-LABEL: test_reduce_v8i16: +; X86-AVX: ## BB#0: +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovd %xmm0, %eax +; X86-AVX-NEXT: ## kill: %AX %AX %EAX +; X86-AVX-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v8i16: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE2-NEXT: pxor %xmm1, %xmm3 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm4 +; X64-SSE2-NEXT: pxor %xmm1, %xmm4 +; X64-SSE2-NEXT: pcmpgtw %xmm3, %xmm4 +; X64-SSE2-NEXT: pand %xmm4, %xmm0 +; X64-SSE2-NEXT: pandn %xmm2, %xmm4 +; X64-SSE2-NEXT: por %xmm0, %xmm4 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,2,3] +; X64-SSE2-NEXT: movdqa %xmm4, %xmm2 +; X64-SSE2-NEXT: pxor %xmm1, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE2-NEXT: pxor %xmm1, %xmm3 +; X64-SSE2-NEXT: pcmpgtw %xmm2, %xmm3 +; X64-SSE2-NEXT: pand %xmm3, %xmm4 +; X64-SSE2-NEXT: pandn %xmm0, %xmm3 +; X64-SSE2-NEXT: por %xmm4, %xmm3 +; X64-SSE2-NEXT: movdqa %xmm3, %xmm0 +; X64-SSE2-NEXT: psrld $16, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm3, %xmm2 +; X64-SSE2-NEXT: pxor %xmm1, %xmm2 +; X64-SSE2-NEXT: pxor %xmm0, %xmm1 +; X64-SSE2-NEXT: pcmpgtw %xmm2, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm3 +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm3, %xmm1 +; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: ## kill: %AX %AX %EAX +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v8i16: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pminuw %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE42-NEXT: pminuw %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE42-NEXT: psrld $16, %xmm1 +; X64-SSE42-NEXT: pminuw %xmm0, %xmm1 +; X64-SSE42-NEXT: movd %xmm1, %eax +; X64-SSE42-NEXT: ## kill: %AX %AX %EAX +; X64-SSE42-NEXT: retq +; +; X64-AVX-LABEL: test_reduce_v8i16: +; X64-AVX: ## BB#0: +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vmovd %xmm0, %eax +; X64-AVX-NEXT: ## kill: %AX %AX %EAX +; X64-AVX-NEXT: retq + %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp ult <8 x i16> %a0, %1 + %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %1 + %4 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp ult <8 x i16> %3, %4 + %6 = select <8 x i1> %5, <8 x i16> %3, <8 x i16> %4 + %7 = shufflevector <8 x i16> %6, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp ult <8 x i16> %6, %7 + %9 = select <8 x i1> %8, <8 x i16> %6, <8 x i16> %7 + %10 = extractelement <8 x i16> %9, i32 0 + ret i16 %10 +} + +define i8 @test_reduce_v16i8(<16 x i8> %a0) { +; X86-SSE2-LABEL: test_reduce_v16i8: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pminub %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE2-NEXT: pminub %xmm1, %xmm0 +;
X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: pminub %xmm0, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: psrlw $8, %xmm0 +; X86-SSE2-NEXT: pminub %xmm1, %xmm0 +; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: ## kill: %AL %AL %EAX +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v16i8: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pminub %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE42-NEXT: pminub %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE42-NEXT: psrld $16, %xmm1 +; X86-SSE42-NEXT: pminub %xmm0, %xmm1 +; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE42-NEXT: psrlw $8, %xmm0 +; X86-SSE42-NEXT: pminub %xmm1, %xmm0 +; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax +; X86-SSE42-NEXT: ## kill: %AL %AL %EAX +; X86-SSE42-NEXT: retl +; +; X86-AVX-LABEL: test_reduce_v16i8: +; X86-AVX: ## BB#0: +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax +; X86-AVX-NEXT: ## kill: %AL %AL %EAX +; X86-AVX-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v16i8: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pminub %xmm0, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE2-NEXT: pminub %xmm1, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: psrld $16, %xmm1 +; X64-SSE2-NEXT: pminub %xmm0, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE2-NEXT: psrlw $8, %xmm0 +; X64-SSE2-NEXT: pminub %xmm1, %xmm0 +; X64-SSE2-NEXT: movd %xmm0, %eax +; X64-SSE2-NEXT: ## kill: %AL %AL %EAX +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v16i8: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pminub %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE42-NEXT: pminub %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE42-NEXT: psrld $16, %xmm1 +; X64-SSE42-NEXT: pminub %xmm0, %xmm1 +; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE42-NEXT: psrlw $8, %xmm0 +; X64-SSE42-NEXT: pminub %xmm1, %xmm0 +; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax +; X64-SSE42-NEXT: ## kill: %AL %AL %EAX +; X64-SSE42-NEXT: retq +; +; X64-AVX-LABEL: test_reduce_v16i8: +; X64-AVX: ## BB#0: +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX-NEXT: ## kill: %AL %AL %EAX +; X64-AVX-NEXT: retq + %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp ult <16 x i8> %a0, %1 + %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %1 + %4 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp ult <16 x i8> %3, %4 + %6 = select <16 x i1> %5, <16 x i8> %3, <16 x i8> %4 + %7 = shufflevector <16 x i8> %6, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp ult <16 x i8> %6, %7 + %9 = select <16 x i1> %8, <16 x i8> %6, <16 x i8> %7 + %10 = shufflevector <16 x i8> %9, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %11 = icmp ult <16 x i8> %9, %10 + %12 = select <16 x i1> %11, <16 x i8> %9, <16 x i8> %10 + %13 = extractelement <16 x i8> %12, i32 0 + ret i8 %13 +} + +; +; 256-bit Vectors +; + +define i64 @test_reduce_v4i64(<4 x i64> %a0) { +; X86-SSE2-LABEL: test_reduce_v4i64: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 +; X86-SSE2-NEXT: pxor %xmm2, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm5 +; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm6, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; X86-SSE2-NEXT: por %xmm3, %xmm4 +; X86-SSE2-NEXT: pand %xmm4, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm4 +; X86-SSE2-NEXT: por %xmm0, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: pxor %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 +; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm5, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm4 +; X86-SSE2-NEXT: pandn %xmm0, %xmm2 +; X86-SSE2-NEXT: por %xmm4, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X86-SSE2-NEXT: movd %xmm0, %edx +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v4i64: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648] +; X86-SSE42-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE42-NEXT: pxor %xmm3, %xmm4 +; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE42-NEXT: pxor %xmm3, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE42-NEXT: pxor %xmm3, %xmm0 +; X86-SSE42-NEXT: pxor %xmm2, %xmm3 +; X86-SSE42-NEXT: pcmpgtq %xmm0, %xmm3 +; X86-SSE42-NEXT: movdqa %xmm3, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; X86-SSE42-NEXT: movd %xmm2, %eax +; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx +; X86-SSE42-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v4i64: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] +; X86-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; X86-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm4 +; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 +; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm4 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; X86-AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 +; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3 +; X86-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm4 +; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; X86-AVX1-NEXT: vpxor %xmm2, %xmm4, %xmm2 +; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm2 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; X86-AVX1-NEXT:
vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v4i64: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648] +; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3 +; X86-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4 +; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 +; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3 +; X86-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2 +; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2 +; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v4i64: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE2-NEXT: pxor %xmm2, %xmm3 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm4 +; X64-SSE2-NEXT: pxor %xmm2, %xmm4 +; X64-SSE2-NEXT: movdqa %xmm4, %xmm5 +; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm4 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm6, %xmm3 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; X64-SSE2-NEXT: por %xmm3, %xmm4 +; X64-SSE2-NEXT: pand %xmm4, %xmm0 +; X64-SSE2-NEXT: pandn %xmm1, %xmm4 +; X64-SSE2-NEXT: por %xmm0, %xmm4 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X64-SSE2-NEXT: pxor %xmm2, %xmm1 +; X64-SSE2-NEXT: pxor %xmm0, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm3 +; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm5, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; X64-SSE2-NEXT: por %xmm1, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm4 +; X64-SSE2-NEXT: pandn %xmm0, %xmm2 +; X64-SSE2-NEXT: por %xmm4, %xmm2 +; X64-SSE2-NEXT: movq %xmm2, %rax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v4i64: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; X64-SSE42-NEXT: movdqa %xmm2, %xmm4 +; X64-SSE42-NEXT: pxor %xmm3, %xmm4 +; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE42-NEXT: pxor %xmm3, %xmm0 +; X64-SSE42-NEXT: pcmpgtq %xmm4, %xmm0 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE42-NEXT: pxor %xmm3, %xmm0 +; X64-SSE42-NEXT: pxor %xmm2, %xmm3 +; X64-SSE42-NEXT: pcmpgtq %xmm0, %xmm3 +; X64-SSE42-NEXT: movdqa %xmm3, %xmm0 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; X64-SSE42-NEXT: movq %xmm2, %rax +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v4i64: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; X64-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm4 +; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 +; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm4 +; 
X64-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; X64-AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 +; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3 +; X64-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm4 +; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; X64-AVX1-NEXT: vpxor %xmm2, %xmm4, %xmm2 +; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm2 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v4i64: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; X64-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3 +; X64-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4 +; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 +; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3 +; X64-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2 +; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2 +; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v4i64: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpminuq %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpminuq %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vmovq %xmm0, %rax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> + %2 = icmp ult <4 x i64> %a0, %1 + %3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %1 + %4 = shufflevector <4 x i64> %3, <4 x i64> undef, <4 x i32> + %5 = icmp ult <4 x i64> %3, %4 + %6 = select <4 x i1> %5, <4 x i64> %3, <4 x i64> %4 + %7 = extractelement <4 x i64> %6, i32 0 + ret i64 %7 +} + +define i32 @test_reduce_v8i32(<8 x i32> %a0) { +; X86-SSE2-LABEL: test_reduce_v8i32: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 +; X86-SSE2-NEXT: pxor %xmm2, %xmm4 +; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; X86-SSE2-NEXT: pand %xmm4, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm4 +; X86-SSE2-NEXT: por %xmm0, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; X86-SSE2-NEXT: pand %xmm3, %xmm4 +; X86-SSE2-NEXT: pandn %xmm0, %xmm3 +; X86-SSE2-NEXT: por %xmm4, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] +; X86-SSE2-NEXT: movdqa %xmm3, %xmm1 +; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: pxor %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm3 +; X86-SSE2-NEXT: pandn %xmm0, %xmm2 +; X86-SSE2-NEXT: por %xmm3, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v8i32: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pminud %xmm1, %xmm0 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pminud %xmm0, %xmm1 +; 
X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE42-NEXT: pminud %xmm1, %xmm0 +; X86-SSE42-NEXT: movd %xmm0, %eax +; X86-SSE42-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v8i32: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v8i32: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v8i32: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE2-NEXT: pxor %xmm2, %xmm3 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm4 +; X64-SSE2-NEXT: pxor %xmm2, %xmm4 +; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; X64-SSE2-NEXT: pand %xmm4, %xmm0 +; X64-SSE2-NEXT: pandn %xmm1, %xmm4 +; X64-SSE2-NEXT: por %xmm0, %xmm4 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X64-SSE2-NEXT: pxor %xmm2, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE2-NEXT: pxor %xmm2, %xmm3 +; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; X64-SSE2-NEXT: pand %xmm3, %xmm4 +; X64-SSE2-NEXT: pandn %xmm0, %xmm3 +; X64-SSE2-NEXT: por %xmm4, %xmm3 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] +; X64-SSE2-NEXT: movdqa %xmm3, %xmm1 +; X64-SSE2-NEXT: pxor %xmm2, %xmm1 +; X64-SSE2-NEXT: pxor %xmm0, %xmm2 +; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm3 +; X64-SSE2-NEXT: pandn %xmm0, %xmm2 +; X64-SSE2-NEXT: por %xmm3, %xmm2 +; X64-SSE2-NEXT: movd %xmm2, %eax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v8i32: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pminud %xmm1, %xmm0 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pminud %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE42-NEXT: pminud %xmm1, %xmm0 +; X64-SSE42-NEXT: movd %xmm0, %eax +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v8i32: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v8i32: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v8i32: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: 
vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX512-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> + %2 = icmp ult <8 x i32> %a0, %1 + %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %1 + %4 = shufflevector <8 x i32> %3, <8 x i32> undef, <8 x i32> + %5 = icmp ult <8 x i32> %3, %4 + %6 = select <8 x i1> %5, <8 x i32> %3, <8 x i32> %4 + %7 = shufflevector <8 x i32> %6, <8 x i32> undef, <8 x i32> + %8 = icmp ult <8 x i32> %6, %7 + %9 = select <8 x i1> %8, <8 x i32> %6, <8 x i32> %7 + %10 = extractelement <8 x i32> %9, i32 0 + ret i32 %10 +} + +define i16 @test_reduce_v16i16(<16 x i16> %a0) { +; X86-SSE2-LABEL: test_reduce_v16i16: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 +; X86-SSE2-NEXT: pxor %xmm2, %xmm4 +; X86-SSE2-NEXT: pcmpgtw %xmm3, %xmm4 +; X86-SSE2-NEXT: pand %xmm4, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm4 +; X86-SSE2-NEXT: por %xmm0, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: pcmpgtw %xmm1, %xmm3 +; X86-SSE2-NEXT: pand %xmm3, %xmm4 +; X86-SSE2-NEXT: pandn %xmm0, %xmm3 +; X86-SSE2-NEXT: por %xmm4, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] +; X86-SSE2-NEXT: movdqa %xmm3, %xmm1 +; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 +; X86-SSE2-NEXT: pxor %xmm2, %xmm4 +; X86-SSE2-NEXT: pcmpgtw %xmm1, %xmm4 +; X86-SSE2-NEXT: pand %xmm4, %xmm3 +; X86-SSE2-NEXT: pandn %xmm0, %xmm4 +; X86-SSE2-NEXT: por %xmm3, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm0 +; X86-SSE2-NEXT: psrld $16, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: pxor %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtw %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm4 +; X86-SSE2-NEXT: pandn %xmm0, %xmm2 +; X86-SSE2-NEXT: por %xmm4, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: ## kill: %AX %AX %EAX +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v16i16: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pminuw %xmm1, %xmm0 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pminuw %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE42-NEXT: pminuw %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE42-NEXT: psrld $16, %xmm1 +; X86-SSE42-NEXT: pminuw %xmm0, %xmm1 +; X86-SSE42-NEXT: movd %xmm1, %eax +; X86-SSE42-NEXT: ## kill: %AX %AX %EAX +; X86-SSE42-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v16i16: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: ## kill: %AX %AX 
%EAX +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v16i16: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: %AX %AX %EAX +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v16i16: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE2-NEXT: pxor %xmm2, %xmm3 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm4 +; X64-SSE2-NEXT: pxor %xmm2, %xmm4 +; X64-SSE2-NEXT: pcmpgtw %xmm3, %xmm4 +; X64-SSE2-NEXT: pand %xmm4, %xmm0 +; X64-SSE2-NEXT: pandn %xmm1, %xmm4 +; X64-SSE2-NEXT: por %xmm0, %xmm4 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X64-SSE2-NEXT: pxor %xmm2, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE2-NEXT: pxor %xmm2, %xmm3 +; X64-SSE2-NEXT: pcmpgtw %xmm1, %xmm3 +; X64-SSE2-NEXT: pand %xmm3, %xmm4 +; X64-SSE2-NEXT: pandn %xmm0, %xmm3 +; X64-SSE2-NEXT: por %xmm4, %xmm3 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] +; X64-SSE2-NEXT: movdqa %xmm3, %xmm1 +; X64-SSE2-NEXT: pxor %xmm2, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm4 +; X64-SSE2-NEXT: pxor %xmm2, %xmm4 +; X64-SSE2-NEXT: pcmpgtw %xmm1, %xmm4 +; X64-SSE2-NEXT: pand %xmm4, %xmm3 +; X64-SSE2-NEXT: pandn %xmm0, %xmm4 +; X64-SSE2-NEXT: por %xmm3, %xmm4 +; X64-SSE2-NEXT: movdqa %xmm4, %xmm0 +; X64-SSE2-NEXT: psrld $16, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X64-SSE2-NEXT: pxor %xmm2, %xmm1 +; X64-SSE2-NEXT: pxor %xmm0, %xmm2 +; X64-SSE2-NEXT: pcmpgtw %xmm1, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm4 +; X64-SSE2-NEXT: pandn %xmm0, %xmm2 +; X64-SSE2-NEXT: por %xmm4, %xmm2 +; X64-SSE2-NEXT: movd %xmm2, %eax +; X64-SSE2-NEXT: ## kill: %AX %AX %EAX +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v16i16: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pminuw %xmm1, %xmm0 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pminuw %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE42-NEXT: pminuw %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE42-NEXT: psrld $16, %xmm1 +; X64-SSE42-NEXT: pminuw %xmm0, %xmm1 +; X64-SSE42-NEXT: movd %xmm1, %eax +; X64-SSE42-NEXT: ## kill: %AX %AX %EAX +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v16i16: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: ## kill: %AX %AX %EAX +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v16i16: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpminuw %ymm1, 
%ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: ## kill: %AX %AX %EAX +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v16i16: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: ## kill: %AX %AX %EAX +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> + %2 = icmp ult <16 x i16> %a0, %1 + %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %1 + %4 = shufflevector <16 x i16> %3, <16 x i16> undef, <16 x i32> + %5 = icmp ult <16 x i16> %3, %4 + %6 = select <16 x i1> %5, <16 x i16> %3, <16 x i16> %4 + %7 = shufflevector <16 x i16> %6, <16 x i16> undef, <16 x i32> + %8 = icmp ult <16 x i16> %6, %7 + %9 = select <16 x i1> %8, <16 x i16> %6, <16 x i16> %7 + %10 = shufflevector <16 x i16> %9, <16 x i16> undef, <16 x i32> + %11 = icmp ult <16 x i16> %9, %10 + %12 = select <16 x i1> %11, <16 x i16> %9, <16 x i16> %10 + %13 = extractelement <16 x i16> %12, i32 0 + ret i16 %13 +} + +define i8 @test_reduce_v32i8(<32 x i8> %a0) { +; X86-SSE2-LABEL: test_reduce_v32i8: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: pminub %xmm1, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pminub %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE2-NEXT: pminub %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: pminub %xmm0, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: psrlw $8, %xmm0 +; X86-SSE2-NEXT: pminub %xmm1, %xmm0 +; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: ## kill: %AL %AL %EAX +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v32i8: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pminub %xmm1, %xmm0 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pminub %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE42-NEXT: pminub %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE42-NEXT: psrld $16, %xmm1 +; X86-SSE42-NEXT: pminub %xmm0, %xmm1 +; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE42-NEXT: psrlw $8, %xmm0 +; X86-SSE42-NEXT: pminub %xmm1, %xmm0 +; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax +; X86-SSE42-NEXT: ## kill: %AL %AL %EAX +; X86-SSE42-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v32i8: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax +; X86-AVX1-NEXT: ## kill: %AL %AL %EAX +; X86-AVX1-NEXT: 
vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v32i8: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax +; X86-AVX2-NEXT: ## kill: %AL %AL %EAX +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v32i8: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: pminub %xmm1, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pminub %xmm0, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE2-NEXT: pminub %xmm1, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: psrld $16, %xmm1 +; X64-SSE2-NEXT: pminub %xmm0, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE2-NEXT: psrlw $8, %xmm0 +; X64-SSE2-NEXT: pminub %xmm1, %xmm0 +; X64-SSE2-NEXT: movd %xmm0, %eax +; X64-SSE2-NEXT: ## kill: %AL %AL %EAX +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v32i8: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pminub %xmm1, %xmm0 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pminub %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE42-NEXT: pminub %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE42-NEXT: psrld $16, %xmm1 +; X64-SSE42-NEXT: pminub %xmm0, %xmm1 +; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE42-NEXT: psrlw $8, %xmm0 +; X64-SSE42-NEXT: pminub %xmm1, %xmm0 +; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax +; X64-SSE42-NEXT: ## kill: %AL %AL %EAX +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v32i8: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX1-NEXT: ## kill: %AL %AL %EAX +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v32i8: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX2-NEXT: ## kill: %AL %AL %EAX +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v32i8: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX512-NEXT: ## kill: %AL %AL %EAX +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> + %2 = icmp ult <32 x i8> %a0, %1 + %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %1 + %4 = shufflevector <32 x i8> %3, <32 x i8> undef, <32 x i32> + %5 = icmp ult <32 x i8> %3, %4 + %6 = select <32 x i1> %5, <32 x i8> %3, <32 x i8> %4 + %7 = shufflevector <32 x i8> %6, <32 x i8> undef, <32 x i32> + %8 = icmp ult <32 x i8> %6, %7 + %9 = select <32 x i1> %8, <32 x i8> %6, <32 x i8> %7 + %10 = shufflevector <32 x i8> %9, <32 x i8> undef, <32 x i32> + %11 = icmp ult <32 x i8> %9, %10 + %12 = select <32 x i1> %11, <32 x i8> %9, <32 x i8> %10 + %13 = shufflevector <32 x i8> %12, <32 x i8> undef, <32 x i32> + %14 = icmp ult <32 x i8> %12, %13 + %15 = select <32 x i1> %14, <32 x i8> %12, <32 x i8> %13 + %16 = extractelement <32 x i8> %15, i32 0 + ret i8 %16 +} + +; +; 512-bit Vectors +; + +define i64 @test_reduce_v8i64(<8 x i64> %a0) { +; X86-SSE2-LABEL: test_reduce_v8i64: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: subl $28, %esp +; X86-SSE2-NEXT: .cfi_def_cfa_offset 32 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm6, (%esp) ## 16-byte Spill +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 +; X86-SSE2-NEXT: pxor %xmm4, %xmm5 +; X86-SSE2-NEXT: pxor %xmm4, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm6, %xmm7 +; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; X86-SSE2-NEXT: pcmpeqd %xmm5, %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm5, %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; X86-SSE2-NEXT: por %xmm6, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm6 +; X86-SSE2-NEXT: pxor %xmm4, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm7 +; X86-SSE2-NEXT: pxor %xmm4, %xmm7 +; X86-SSE2-NEXT: movdqa %xmm7, %xmm0 +; X86-SSE2-NEXT: pcmpgtd %xmm6, %xmm0 +; X86-SSE2-NEXT: pcmpeqd %xmm6, %xmm7 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] +; X86-SSE2-NEXT: pand %xmm6, %xmm7 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; X86-SSE2-NEXT: por %xmm7, %xmm6 +; X86-SSE2-NEXT: pand %xmm6, %xmm1 +; X86-SSE2-NEXT: pandn %xmm3, %xmm6 +; X86-SSE2-NEXT: por %xmm1, %xmm6 +; X86-SSE2-NEXT: pand %xmm5, %xmm2 +; X86-SSE2-NEXT: pandn (%esp), %xmm5 ## 16-byte Folded Reload +; X86-SSE2-NEXT: por %xmm2, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm0 +; X86-SSE2-NEXT: pxor %xmm4, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm6, %xmm1 +; X86-SSE2-NEXT: pxor %xmm4, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: por %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm0, %xmm5 +; X86-SSE2-NEXT: pandn %xmm6, %xmm0 +; X86-SSE2-NEXT: por %xmm5, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = 
xmm0[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pxor %xmm4, %xmm2 +; X86-SSE2-NEXT: pxor %xmm1, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm3 +; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm2, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; X86-SSE2-NEXT: por %xmm4, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X86-SSE2-NEXT: movd %xmm0, %edx +; X86-SSE2-NEXT: addl $28, %esp +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v8i64: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: movdqa %xmm0, %xmm5 +; X86-SSE42-NEXT: movdqa {{.*#+}} xmm4 = [0,2147483648,0,2147483648] +; X86-SSE42-NEXT: pxor %xmm4, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm2, %xmm6 +; X86-SSE42-NEXT: pxor %xmm4, %xmm6 +; X86-SSE42-NEXT: pcmpgtq %xmm0, %xmm6 +; X86-SSE42-NEXT: movdqa %xmm1, %xmm7 +; X86-SSE42-NEXT: pxor %xmm4, %xmm7 +; X86-SSE42-NEXT: movdqa %xmm3, %xmm0 +; X86-SSE42-NEXT: pxor %xmm4, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm7, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3 +; X86-SSE42-NEXT: movdqa %xmm6, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm2 +; X86-SSE42-NEXT: movapd %xmm2, %xmm1 +; X86-SSE42-NEXT: xorpd %xmm4, %xmm1 +; X86-SSE42-NEXT: movapd %xmm3, %xmm0 +; X86-SSE42-NEXT: xorpd %xmm4, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] +; X86-SSE42-NEXT: movdqa %xmm3, %xmm0 +; X86-SSE42-NEXT: pxor %xmm4, %xmm0 +; X86-SSE42-NEXT: pxor %xmm1, %xmm4 +; X86-SSE42-NEXT: pcmpgtq %xmm0, %xmm4 +; X86-SSE42-NEXT: movdqa %xmm4, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 +; X86-SSE42-NEXT: movd %xmm1, %eax +; X86-SSE42-NEXT: pextrd $1, %xmm1, %edx +; X86-SSE42-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v8i64: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648] +; X86-AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; X86-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 +; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 +; X86-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm4 +; X86-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm5 +; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2 +; X86-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4 +; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 +; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm4 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2 +; X86-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm4 +; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; X86-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm3 +; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX1-NEXT: 
vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v8i64: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648] +; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3 +; X86-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4 +; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 +; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3 +; X86-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm4 +; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 +; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3 +; X86-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2 +; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2 +; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v8i64: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm5 +; X64-SSE2-NEXT: pxor %xmm9, %xmm5 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm6 +; X64-SSE2-NEXT: pxor %xmm9, %xmm6 +; X64-SSE2-NEXT: movdqa %xmm6, %xmm7 +; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm5, %xmm6 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm8, %xmm6 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; X64-SSE2-NEXT: por %xmm6, %xmm5 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm6 +; X64-SSE2-NEXT: pxor %xmm9, %xmm6 +; X64-SSE2-NEXT: movdqa %xmm3, %xmm7 +; X64-SSE2-NEXT: pxor %xmm9, %xmm7 +; X64-SSE2-NEXT: movdqa %xmm7, %xmm4 +; X64-SSE2-NEXT: pcmpgtd %xmm6, %xmm4 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm6, %xmm7 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm8, %xmm7 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] +; X64-SSE2-NEXT: por %xmm7, %xmm6 +; X64-SSE2-NEXT: pand %xmm6, %xmm1 +; X64-SSE2-NEXT: pandn %xmm3, %xmm6 +; X64-SSE2-NEXT: por %xmm1, %xmm6 +; X64-SSE2-NEXT: pand %xmm5, %xmm0 +; X64-SSE2-NEXT: pandn %xmm2, %xmm5 +; X64-SSE2-NEXT: por %xmm0, %xmm5 +; X64-SSE2-NEXT: movdqa %xmm5, %xmm0 +; X64-SSE2-NEXT: pxor %xmm9, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm6, %xmm1 +; X64-SSE2-NEXT: pxor %xmm9, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm3, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; X64-SSE2-NEXT: por %xmm0, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm5 +; X64-SSE2-NEXT: pandn %xmm6, %xmm1 +; X64-SSE2-NEXT: por %xmm5, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE2-NEXT: pxor %xmm9, %xmm2 +; X64-SSE2-NEXT: pxor %xmm0, %xmm9 +; X64-SSE2-NEXT: movdqa %xmm9, %xmm3 +; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm2, %xmm9 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm4, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; X64-SSE2-NEXT: por %xmm2, %xmm3 +; X64-SSE2-NEXT: pand %xmm3, %xmm1 +; X64-SSE2-NEXT: pandn 
%xmm0, %xmm3 +; X64-SSE2-NEXT: por %xmm1, %xmm3 +; X64-SSE2-NEXT: movq %xmm3, %rax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v8i64: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: movdqa %xmm0, %xmm5 +; X64-SSE42-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] +; X64-SSE42-NEXT: pxor %xmm4, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm2, %xmm6 +; X64-SSE42-NEXT: pxor %xmm4, %xmm6 +; X64-SSE42-NEXT: pcmpgtq %xmm0, %xmm6 +; X64-SSE42-NEXT: movdqa %xmm1, %xmm7 +; X64-SSE42-NEXT: pxor %xmm4, %xmm7 +; X64-SSE42-NEXT: movdqa %xmm3, %xmm0 +; X64-SSE42-NEXT: pxor %xmm4, %xmm0 +; X64-SSE42-NEXT: pcmpgtq %xmm7, %xmm0 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3 +; X64-SSE42-NEXT: movdqa %xmm6, %xmm0 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm2 +; X64-SSE42-NEXT: movapd %xmm2, %xmm1 +; X64-SSE42-NEXT: xorpd %xmm4, %xmm1 +; X64-SSE42-NEXT: movapd %xmm3, %xmm0 +; X64-SSE42-NEXT: xorpd %xmm4, %xmm0 +; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] +; X64-SSE42-NEXT: movdqa %xmm3, %xmm0 +; X64-SSE42-NEXT: pxor %xmm4, %xmm0 +; X64-SSE42-NEXT: pxor %xmm1, %xmm4 +; X64-SSE42-NEXT: pcmpgtq %xmm0, %xmm4 +; X64-SSE42-NEXT: movdqa %xmm4, %xmm0 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 +; X64-SSE42-NEXT: movq %xmm1, %rax +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v8i64: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; X64-AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 +; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; X64-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 +; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 +; X64-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm4 +; X64-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm5 +; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2 +; X64-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4 +; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 +; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm4 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2 +; X64-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm4 +; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; X64-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm3 +; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v8i64: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; X64-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3 +; X64-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4 +; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 +; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3 +; X64-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm4 +; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 +; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; 
X64-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3 +; X64-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2 +; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2 +; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v8i64: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; X64-AVX512-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vmovq %xmm0, %rax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> + %2 = icmp ult <8 x i64> %a0, %1 + %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %1 + %4 = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> + %5 = icmp ult <8 x i64> %3, %4 + %6 = select <8 x i1> %5, <8 x i64> %3, <8 x i64> %4 + %7 = shufflevector <8 x i64> %6, <8 x i64> undef, <8 x i32> + %8 = icmp ult <8 x i64> %6, %7 + %9 = select <8 x i1> %8, <8 x i64> %6, <8 x i64> %7 + %10 = extractelement <8 x i64> %9, i32 0 + ret i64 %10 +} + +define i32 @test_reduce_v16i32(<16 x i32> %a0) { +; X86-SSE2-LABEL: test_reduce_v16i32: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm6 +; X86-SSE2-NEXT: pxor %xmm4, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 +; X86-SSE2-NEXT: pxor %xmm4, %xmm5 +; X86-SSE2-NEXT: pcmpgtd %xmm6, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm6 +; X86-SSE2-NEXT: pxor %xmm4, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm7 +; X86-SSE2-NEXT: pxor %xmm4, %xmm7 +; X86-SSE2-NEXT: pcmpgtd %xmm6, %xmm7 +; X86-SSE2-NEXT: pand %xmm7, %xmm1 +; X86-SSE2-NEXT: pandn %xmm3, %xmm7 +; X86-SSE2-NEXT: por %xmm1, %xmm7 +; X86-SSE2-NEXT: pand %xmm5, %xmm0 +; X86-SSE2-NEXT: pandn %xmm2, %xmm5 +; X86-SSE2-NEXT: por %xmm0, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm0 +; X86-SSE2-NEXT: pxor %xmm4, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm7, %xmm1 +; X86-SSE2-NEXT: pxor %xmm4, %xmm1 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm5 +; X86-SSE2-NEXT: pandn %xmm7, %xmm1 +; X86-SSE2-NEXT: por %xmm5, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pxor %xmm4, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: pxor %xmm4, %xmm3 +; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; X86-SSE2-NEXT: pand %xmm3, %xmm1 +; X86-SSE2-NEXT: pandn %xmm0, %xmm3 +; X86-SSE2-NEXT: por %xmm1, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] +; X86-SSE2-NEXT: movdqa %xmm3, %xmm1 +; X86-SSE2-NEXT: pxor %xmm4, %xmm1 +; X86-SSE2-NEXT: pxor %xmm0, %xmm4 +; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm4 +; X86-SSE2-NEXT: pand %xmm4, %xmm3 +; X86-SSE2-NEXT: pandn %xmm0, %xmm4 +; X86-SSE2-NEXT: por %xmm3, %xmm4 +; X86-SSE2-NEXT: movd %xmm4, %eax +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v16i32: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pminud %xmm3, %xmm1 +; X86-SSE42-NEXT: pminud %xmm2, %xmm0 +; X86-SSE42-NEXT: pminud %xmm1, %xmm0 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pminud %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE42-NEXT: pminud %xmm1, %xmm0 +; X86-SSE42-NEXT: movd %xmm0, %eax +; X86-SSE42-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v16i32: +; X86-AVX1: ## BB#0: 
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpminud %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v16i32:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i32:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm6
+; X64-SSE2-NEXT: pxor %xmm4, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X64-SSE2-NEXT: pxor %xmm4, %xmm5
+; X64-SSE2-NEXT: pcmpgtd %xmm6, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X64-SSE2-NEXT: pxor %xmm4, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm7
+; X64-SSE2-NEXT: pxor %xmm4, %xmm7
+; X64-SSE2-NEXT: pcmpgtd %xmm6, %xmm7
+; X64-SSE2-NEXT: pand %xmm7, %xmm1
+; X64-SSE2-NEXT: pandn %xmm3, %xmm7
+; X64-SSE2-NEXT: por %xmm1, %xmm7
+; X64-SSE2-NEXT: pand %xmm5, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm5
+; X64-SSE2-NEXT: por %xmm0, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm5, %xmm0
+; X64-SSE2-NEXT: pxor %xmm4, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm7, %xmm1
+; X64-SSE2-NEXT: pxor %xmm4, %xmm1
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm5
+; X64-SSE2-NEXT: pandn %xmm7, %xmm1
+; X64-SSE2-NEXT: por %xmm5, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pxor %xmm4, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm4, %xmm3
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm3
+; X64-SSE2-NEXT: por %xmm1, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X64-SSE2-NEXT: pxor %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm0, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm3
+; X64-SSE2-NEXT: pandn %xmm0, %xmm4
+; X64-SSE2-NEXT: por %xmm3, %xmm4
+; X64-SSE2-NEXT: movd %xmm4, %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i32:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pminud %xmm3, %xmm1
+; X64-SSE42-NEXT: pminud %xmm2, %xmm0
+; X64-SSE42-NEXT: pminud %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminud %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminud %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v16i32:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpminud %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v16i32:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v16i32:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+  %1 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %2 = icmp ult <16 x i32> %a0, %1
+  %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %1
+  %4 = shufflevector <16 x i32> %3, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %5 = icmp ult <16 x i32> %3, %4
+  %6 = select <16 x i1> %5, <16 x i32> %3, <16 x i32> %4
+  %7 = shufflevector <16 x i32> %6, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %8 = icmp ult <16 x i32> %6, %7
+  %9 = select <16 x i1> %8, <16 x i32> %6, <16 x i32> %7
+  %10 = shufflevector <16 x i32> %9, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %11 = icmp ult <16 x i32> %9, %10
+  %12 = select <16 x i1> %11, <16 x i32> %9, <16 x i32> %10
+  %13 = extractelement <16 x i32> %12, i32 0
+  ret i32 %13
+}
+
+define i16 @test_reduce_v32i16(<32 x i16> %a0) {
+; X86-SSE2-LABEL: test_reduce_v32i16:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm6
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT: pxor %xmm4, %xmm5
+; X86-SSE2-NEXT: pcmpgtw %xmm6, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm7
+; X86-SSE2-NEXT: pxor %xmm4, %xmm7
+; X86-SSE2-NEXT: pcmpgtw %xmm6, %xmm7
+; X86-SSE2-NEXT: pand %xmm7, %xmm1
+; X86-SSE2-NEXT: pandn %xmm3, %xmm7
+; X86-SSE2-NEXT: por %xmm1, %xmm7
+; X86-SSE2-NEXT: pand %xmm5, %xmm0
+; X86-SSE2-NEXT: pandn %xmm2, %xmm5
+; X86-SSE2-NEXT: por %xmm0, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm5, %xmm0
+; X86-SSE2-NEXT: pxor %xmm4, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm7, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: pcmpgtw %xmm0, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm5
+; X86-SSE2-NEXT: pandn %xmm7, %xmm1
+; X86-SSE2-NEXT: por %xmm5, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pxor %xmm4, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm4, %xmm3
+; X86-SSE2-NEXT: pcmpgtw %xmm2, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm3
+; X86-SSE2-NEXT: por %xmm1, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pxor %xmm4, %xmm2
+; X86-SSE2-NEXT: pcmpgtw %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm3
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm3, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm4
+; X86-SSE2-NEXT: pcmpgtw %xmm1, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm4
+; X86-SSE2-NEXT: por %xmm2, %xmm4
+; X86-SSE2-NEXT: movd %xmm4, %eax
+; X86-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v32i16:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pminuw %xmm3, %xmm1
+; X86-SSE42-NEXT: pminuw %xmm2, %xmm0
+; X86-SSE42-NEXT: pminuw %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminuw %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminuw %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pminuw %xmm0, %xmm1
+; X86-SSE42-NEXT: movd %xmm1, %eax
+; X86-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v32i16:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpminuw %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpminuw %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v32i16:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v32i16:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm6
+; X64-SSE2-NEXT: pxor %xmm4, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X64-SSE2-NEXT: pxor %xmm4, %xmm5
+; X64-SSE2-NEXT: pcmpgtw %xmm6, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X64-SSE2-NEXT: pxor %xmm4, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm7
+; X64-SSE2-NEXT: pxor %xmm4, %xmm7
+; X64-SSE2-NEXT: pcmpgtw %xmm6, %xmm7
+; X64-SSE2-NEXT: pand %xmm7, %xmm1
+; X64-SSE2-NEXT: pandn %xmm3, %xmm7
+; X64-SSE2-NEXT: por %xmm1, %xmm7
+; X64-SSE2-NEXT: pand %xmm5, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm5
+; X64-SSE2-NEXT: por %xmm0, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm5, %xmm0
+; X64-SSE2-NEXT: pxor %xmm4, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm7, %xmm1
+; X64-SSE2-NEXT: pxor %xmm4, %xmm1
+; X64-SSE2-NEXT: pcmpgtw %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm5
+; X64-SSE2-NEXT: pandn %xmm7, %xmm1
+; X64-SSE2-NEXT: por %xmm5, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pxor %xmm4, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm4, %xmm3
+; X64-SSE2-NEXT: pcmpgtw %xmm2, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm3
+; X64-SSE2-NEXT: por %xmm1, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X64-SSE2-NEXT: pxor %xmm4, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: pxor %xmm4, %xmm2
+; X64-SSE2-NEXT: pcmpgtw %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm3
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm3, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT: pxor %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm0, %xmm4
+; X64-SSE2-NEXT: pcmpgtw %xmm1, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm4
+; X64-SSE2-NEXT: por %xmm2, %xmm4
+; X64-SSE2-NEXT: movd %xmm4, %eax
+; X64-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v32i16:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pminuw %xmm3, %xmm1
+; X64-SSE42-NEXT: pminuw %xmm2, %xmm0
+; X64-SSE42-NEXT: pminuw %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminuw %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminuw %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pminuw %xmm0, %xmm1
+; X64-SSE42-NEXT: movd %xmm1, %eax
+; X64-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v32i16:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpminuw %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpminuw %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v32i16:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v32i16:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+  %1 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %2 = icmp ult <32 x i16> %a0, %1
+  %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %1
+  %4 = shufflevector <32 x i16> %3, <32 x i16> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %5 = icmp ult <32 x i16> %3, %4
+  %6 = select <32 x i1> %5, <32 x i16> %3, <32 x i16> %4
+  %7 = shufflevector <32 x i16> %6, <32 x i16> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %8 = icmp ult <32 x i16> %6, %7
+  %9 = select <32 x i1> %8, <32 x i16> %6, <32 x i16> %7
+  %10 = shufflevector <32 x i16> %9, <32 x i16> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %11 = icmp ult <32 x i16> %9, %10
+  %12 = select <32 x i1> %11, <32 x i16> %9, <32 x i16> %10
+  %13 = shufflevector <32 x i16> %12, <32 x i16> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %14 = icmp ult <32 x i16> %12, %13
+  %15 = select <32 x i1> %14, <32 x i16> %12, <32 x i16> %13
+  %16 = extractelement <32 x i16> %15, i32 0
+  ret i16 %16
+}
+
+define i8 @test_reduce_v64i8(<64 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v64i8:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pminub %xmm3, %xmm1
+; X86-SSE2-NEXT: pminub %xmm2, %xmm0
+; X86-SSE2-NEXT: pminub %xmm1, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pminub %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: pminub %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psrld $16, %xmm1
+; X86-SSE2-NEXT: pminub %xmm0, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: pminub %xmm1, %xmm0
+; X86-SSE2-NEXT: movd %xmm0, %eax
+; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v64i8:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pminub %xmm3, %xmm1
+; X86-SSE42-NEXT: pminub %xmm2, %xmm0
+; X86-SSE42-NEXT: pminub %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminub %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminub %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pminub %xmm0, %xmm1
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: psrlw $8, %xmm0
+; X86-SSE42-NEXT: pminub %xmm1, %xmm0
+; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v64i8:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpminub %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v64i8:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v64i8:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pminub %xmm3, %xmm1
+; X64-SSE2-NEXT: pminub %xmm2, %xmm0
+; X64-SSE2-NEXT: pminub %xmm1, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pminub %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: pminub %xmm1, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: psrld $16, %xmm1
+; X64-SSE2-NEXT: pminub %xmm0, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: pminub %xmm1, %xmm0
+; X64-SSE2-NEXT: movd %xmm0, %eax
+; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v64i8:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pminub %xmm3, %xmm1
+; X64-SSE42-NEXT: pminub %xmm2, %xmm0
+; X64-SSE42-NEXT: pminub %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminub %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminub %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pminub %xmm0, %xmm1
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: psrlw $8, %xmm0
+; X64-SSE42-NEXT: pminub %xmm1, %xmm0
+; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v64i8:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpminub %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v64i8:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v64i8:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+  %1 = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %2 = icmp ult <64 x i8> %a0, %1
+  %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %1
+  %4 = shufflevector <64 x i8> %3, <64 x i8> undef, <64 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %5 = icmp ult <64 x i8> %3, %4
+  %6 = select <64 x i1> %5, <64 x i8> %3, <64 x i8> %4
+  %7 = shufflevector <64 x i8> %6, <64 x i8> undef, <64 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %8 = icmp ult <64 x i8> %6, %7
+  %9 = select <64 x i1> %8, <64 x i8> %6, <64 x i8> %7
+  %10 = shufflevector <64 x i8> %9, <64 x i8> undef, <64 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %11 = icmp ult <64 x i8> %9, %10
+  %12 = select <64 x i1> %11, <64 x i8> %9, <64 x i8> %10
+  %13 = shufflevector <64 x i8> %12, <64 x i8> undef, <64 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %14 = icmp ult <64 x i8> %12, %13
+  %15 = select <64 x i1> %14, <64 x i8> %12, <64 x i8> %13
+  %16 = shufflevector <64 x i8> %15, <64 x i8> undef, <64 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %17 = icmp ult <64 x i8> %15, %16
+  %18 = select <64 x i1> %17, <64 x i8> %15, <64 x i8> %16
+  %19 = extractelement <64 x i8> %18, i32 0
+  ret i8 %19
+}
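
Editor's note, not part of the patch: every test above follows the same log2 shuffle-reduction idiom (repeatedly fold the high half onto the low half, then extract lane 0). As an illustration only, a minimal sketch of that idiom for a 4-element unsigned-min reduction, using the hypothetical function name @reduce_umin_v4i32, looks like:

define i32 @reduce_umin_v4i32(<4 x i32> %v) {
  ; fold the high half (lanes 2,3) onto the low half (lanes 0,1)
  %s1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %c1 = icmp ult <4 x i32> %v, %s1
  %m1 = select <4 x i1> %c1, <4 x i32> %v, <4 x i32> %s1
  ; fold lane 1 onto lane 0
  %s2 = shufflevector <4 x i32> %m1, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %c2 = icmp ult <4 x i32> %m1, %s2
  %m2 = select <4 x i1> %c2, <4 x i32> %m1, <4 x i32> %s2
  ; the reduced minimum now sits in lane 0
  %r = extractelement <4 x i32> %m2, i32 0
  ret i32 %r
}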