From 1490c5eb5b5f97026d557aef89bdb5b65ea1cbf4 Mon Sep 17 00:00:00 2001
From: Elena Demikhovsky
Date: Mon, 19 Aug 2013 13:26:14 +0000
Subject: [PATCH] AVX-512: added arithmetic and logical operations. ADD, SUB,
 MUL for integer and FP types. OR, AND, XOR. Added embedded broadcast forms
 for these instructions.

llvm-svn: 188673
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 11 +-
 llvm/lib/Target/X86/X86InstrAVX512.td | 247 +++++++++++++++++--
 llvm/lib/Target/X86/X86InstrFragmentsSIMD.td | 18 +-
 llvm/test/CodeGen/X86/avx512-arith.ll | 225 +++++++++++++++++
 llvm/test/CodeGen/X86/avx512-vec-cmp.ll | 7 +-
 5 files changed, 478 insertions(+), 30 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/avx512-arith.ll

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 4e71f368e6ac..3c3f09f0fe1b 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1390,6 +1390,9 @@ void X86TargetLowering::resetOperationActions() {
     setOperationAction(ISD::AND, MVT::v8i64, Legal);
     setOperationAction(ISD::OR, MVT::v8i64, Legal);
     setOperationAction(ISD::XOR, MVT::v8i64, Legal);
+    setOperationAction(ISD::AND, MVT::v16i32, Legal);
+    setOperationAction(ISD::OR, MVT::v16i32, Legal);
+    setOperationAction(ISD::XOR, MVT::v16i32, Legal);

     // Custom lower several nodes.
     for (int i = MVT::FIRST_VECTOR_VALUETYPE;
@@ -1409,14 +1412,6 @@ void X86TargetLowering::resetOperationActions() {
       if (!VT.is512BitVector())
         continue;

-      if (VT != MVT::v8i64) {
-        setOperationAction(ISD::XOR, VT, Promote);
-        AddPromotedToType (ISD::XOR, VT, MVT::v8i64);
-        setOperationAction(ISD::OR, VT, Promote);
-        AddPromotedToType (ISD::OR, VT, MVT::v8i64);
-        setOperationAction(ISD::AND, VT, Promote);
-        AddPromotedToType (ISD::AND, VT, MVT::v8i64);
-      }
       if ( EltSize >= 32) {
         setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
         setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 760b4ed120ce..a6035af70597 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -80,6 +80,21 @@ let Predicates = [HasAVX512] in {
   def : Pat<(v16i16 (bitconvert (v32i8 VR256X:$src))), (v16i16 VR256X:$src)>;
 }
+//
+// AVX-512: the VPXOR instruction writes zero to its upper part, so it's safe to use it to build zeros.
+//
+
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+    isPseudo = 1, Predicates = [HasAVX512] in {
+def AVX512_512_SET0 : I<0, Pseudo, (outs VR512:$dst), (ins), "",
+               [(set VR512:$dst, (v16f32 immAllZerosV))]>;
+}
+
+def : Pat<(v8i64 immAllZerosV), (AVX512_512_SET0)>;
+def : Pat<(v16i32 immAllZerosV), (AVX512_512_SET0)>;
+def : Pat<(v8f64 immAllZerosV), (AVX512_512_SET0)>;
+def : Pat<(v16f32 immAllZerosV), (AVX512_512_SET0)>;
+
 //===----------------------------------------------------------------------===//
 // AVX-512 - VECTOR INSERT
 //
@@ -518,16 +533,16 @@ multiclass avx512_perm<bits<8> opc, string OpcodeStr, RegisterClass RC,
                    !strconcat(OpcodeStr,
                        "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set RC:$dst,
-                     (OpVT (X86VPermv RC:$src1,
-                            (bitconvert (mem_frag addr:$src2)))))]>, EVEX_4V;
+                     (OpVT (X86VPermv RC:$src1, (mem_frag addr:$src2))))]>,
+                   EVEX_4V;
 }

-defm VPERMDZ : avx512_perm<0x36, "vpermd", VR512, memopv8i64, i512mem,
+defm VPERMDZ : avx512_perm<0x36, "vpermd", VR512, memopv16i32, i512mem,
                            v16i32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
 defm VPERMQZ : avx512_perm<0x36, "vpermq", VR512, memopv8i64, i512mem,
                            v8i64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
 let ExeDomain = SSEPackedSingle in
-defm VPERMPSZ : avx512_perm<0x16, "vpermps", VR512, memopv8f64, f512mem,
+defm VPERMPSZ : avx512_perm<0x16, "vpermps", VR512, memopv16f32, f512mem,
                             v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
 let ExeDomain = SSEPackedDouble in
 defm VPERMPDZ : avx512_perm<0x16, "vpermpd", VR512, memopv8f64, f512mem,
@@ -552,7 +567,7 @@ let Constraints = "$src1 = $dst" in {
                    "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                    [(set RC:$dst,
                      (OpVT (X86VPermv3 RC:$src1, RC:$src2,
-                            (bitconvert (mem_frag addr:$src3)))))]>, EVEX_4V;
+                            (mem_frag addr:$src3))))]>, EVEX_4V;
   }
 }
 defm VPERMI2D : avx512_perm_3src<0x76, "vpermi2d", VR512, memopv16i32, i512mem,
@@ -631,18 +646,17 @@ multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, RegisterClass KRC,
   def rm : AVX512BI, EVEX_4V;
 }
 defm VPCMPEQDZ : avx512_icmp_packed<0x76, "vpcmpeqd", VK16, VR512, i512mem,
-                                    memopv8i64, X86pcmpeqm, v16i32>, EVEX_V512;
+                                    memopv16i32, X86pcmpeqm, v16i32>, EVEX_V512;
 defm VPCMPEQQZ : avx512_icmp_packed<0x29, "vpcmpeqq", VK8, VR512, i512mem,
                                     memopv8i64, X86pcmpeqm, v8i64>, T8, EVEX_V512, VEX_W;

 defm VPCMPGTDZ : avx512_icmp_packed<0x66, "vpcmpgtd", VK16, VR512, i512mem,
-                                    memopv8i64, X86pcmpgtm, v16i32>, EVEX_V512;
+                                    memopv16i32, X86pcmpgtm, v16i32>, EVEX_V512;
 defm VPCMPGTQZ : avx512_icmp_packed<0x37, "vpcmpgtq", VK8, VR512, i512mem,
                                     memopv8i64, X86pcmpgtm, v8i64>, T8, EVEX_V512, VEX_W;
@@ -656,7 +670,6 @@ def : Pat<(v8i1 (X86pcmpeqm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
            (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)),
            (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm))), VK8)>;
-
 multiclass avx512_icmp_cc<bits<8> opc, RegisterClass KRC,
               RegisterClass RC, X86MemOperand x86memop, PatFrag memop_frag,
               SDNode OpNode, ValueType vt, Operand CC, string asm,
@@ -667,9 +680,8 @@ multiclass avx512_icmp_cc<bits<8> opc, RegisterClass KRC,
                IIC_SSE_CMPP_RR>, EVEX_4V;
   def rmi : AVX512AIi8, EVEX_4V;
+             [(set KRC:$dst, (OpNode (vt RC:$src1), (memop_frag addr:$src2),
+              imm:$cc))], IIC_SSE_CMPP_RM>, EVEX_4V;
   // Accept explicit immediate argument form instead of comparison code.
  let neverHasSideEffects = 1 in {
    def rri_alt : AVX512AIi8 opc, RegisterClass KRC,
  }
}
-defm VPCMPDZ : avx512_icmp_cc<0x1F, VK16, VR512, i512mem, memopv8i64,
+defm VPCMPDZ : avx512_icmp_cc<0x1F, VK16, VR512, i512mem, memopv16i32,
                X86cmpm, v16i32, AVXCC,
                "vpcmp${cc}d\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                "vpcmpd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">,
                EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VPCMPUDZ : avx512_icmp_cc<0x1E, VK16, VR512, i512mem, memopv8i64,
+defm VPCMPUDZ : avx512_icmp_cc<0x1E, VK16, VR512, i512mem, memopv16i32,
                X86cmpmu, v16i32, AVXCC,
                "vpcmp${cc}ud\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                "vpcmpud\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">,
@@ -1415,3 +1427,208 @@ let AddedComplexity = 20 in {
                      (VMOVZPQILo2PQIZrr VR128X:$src)>;
 }

+//===----------------------------------------------------------------------===//
+// AVX-512 - Integer arithmetic
+//
+multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                           ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
+                           X86MemOperand x86memop, PatFrag scalar_mfrag,
+                           X86MemOperand x86scalar_mop, string BrdcstStr,
+                           OpndItins itins, bit IsCommutable = 0> {
+  let isCommutable = IsCommutable in
+  def rr : AVX512BI, EVEX_4V;
+  def rm : AVX512BI, EVEX_4V;
+  def rmb : AVX512BI, EVEX_4V, EVEX_B;
+}
+multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr,
+                            ValueType DstVT, ValueType SrcVT, RegisterClass RC,
+                            PatFrag memop_frag, X86MemOperand x86memop,
+                            OpndItins itins,
+                            bit IsCommutable = 0> {
+  let isCommutable = IsCommutable in
+  def rr : AVX512BI, EVEX_4V, VEX_W;
+  def rm : AVX512BI, EVEX_4V, VEX_W;
+}
+
+defm VPADDDZ : avx512_binop_rm<0xFE, "vpaddd", add, v16i32, VR512, memopv16i32,
+                   i512mem, loadi32, i32mem, "{1to16}", SSE_INTALU_ITINS_P, 1>,
+                   EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+defm VPSUBDZ : avx512_binop_rm<0xFA, "vpsubd", sub, v16i32, VR512, memopv16i32,
+                   i512mem, loadi32, i32mem, "{1to16}", SSE_INTALU_ITINS_P, 0>,
+                   EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+defm VPMULLDZ : avx512_binop_rm<0x40, "vpmulld", mul, v16i32, VR512, memopv16i32,
+                   i512mem, loadi32, i32mem, "{1to16}", SSE_INTALU_ITINS_P, 1>,
+                   T8, EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+defm VPADDQZ : avx512_binop_rm<0xD4, "vpaddq", add, v8i64, VR512, memopv8i64,
+                   i512mem, loadi64, i64mem, "{1to8}", SSE_INTALU_ITINS_P, 1>,
+                   EVEX_CD8<64, CD8VF>, EVEX_V512, VEX_W;
+
+defm VPSUBQZ : avx512_binop_rm<0xFB, "vpsubq", sub, v8i64, VR512, memopv8i64,
+                   i512mem, loadi64, i64mem, "{1to8}", SSE_INTALU_ITINS_P, 0>,
+                   EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VPMULDQZ : avx512_binop_rm2<0x28, "vpmuldq", v8i64, v16i32,
+                   VR512, memopv8i64, i512mem, SSE_INTALU_ITINS_P, 1>, T8,
+                   EVEX_V512, EVEX_CD8<64, CD8VF>;
+
+defm VPMULUDQZ : avx512_binop_rm2<0xF4, "vpmuludq", v8i64, v16i32,
+                   VR512, memopv8i64, i512mem, SSE_INTMUL_ITINS_P, 1>, EVEX_V512,
+                   EVEX_CD8<64, CD8VF>;
+
+def : Pat<(v8i64 (X86pmuludq (v16i32 VR512:$src1), (v16i32 VR512:$src2))),
+          (VPMULUDQZrr VR512:$src1, VR512:$src2)>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 Logical Instructions
+//===----------------------------------------------------------------------===//
+
+defm VPANDDZ : avx512_binop_rm<0xDB, "vpandd", and, v16i32, VR512, memopv16i32,
+                   i512mem, loadi32, i32mem, "{1to16}", SSE_BIT_ITINS_P, 1>,
+                   EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPANDQZ : avx512_binop_rm<0xDB, "vpandq", and, v8i64, VR512, memopv8i64,
+                   i512mem, loadi64, i64mem, "{1to8}", SSE_BIT_ITINS_P, 1>,
+                   EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPORDZ : avx512_binop_rm<0xEB, "vpord", or, v16i32, VR512, memopv16i32,
+                   i512mem, loadi32, i32mem, "{1to16}", SSE_BIT_ITINS_P, 1>,
+                   EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPORQZ : avx512_binop_rm<0xEB, "vporq", or, v8i64, VR512, memopv8i64,
+                   i512mem, loadi64, i64mem, "{1to8}", SSE_BIT_ITINS_P, 1>,
+                   EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPXORDZ : avx512_binop_rm<0xEF, "vpxord", xor, v16i32, VR512, memopv16i32,
+                   i512mem, loadi32, i32mem, "{1to16}", SSE_BIT_ITINS_P, 1>,
+                   EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPXORQZ : avx512_binop_rm<0xEF, "vpxorq", xor, v8i64, VR512, memopv8i64,
+                   i512mem, loadi64, i64mem, "{1to8}", SSE_BIT_ITINS_P, 1>,
+                   EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPANDNDZ : avx512_binop_rm<0xDF, "vpandnd", X86andnp, v16i32, VR512,
+                   memopv16i32, i512mem, loadi32, i32mem, "{1to16}",
+                   SSE_BIT_ITINS_P, 0>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPANDNQZ : avx512_binop_rm<0xDF, "vpandnq", X86andnp, v8i64, VR512, memopv8i64,
+                   i512mem, loadi64, i64mem, "{1to8}", SSE_BIT_ITINS_P, 0>,
+                   EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 FP arithmetic
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                          SizeItins itins> {
+  defm SSZ : sse12_fp_scalar, XS, EVEX_4V, VEX_LIG,
+             EVEX_CD8<32, CD8VT1>;
+  defm SDZ : sse12_fp_scalar, XD, VEX_W, EVEX_4V, VEX_LIG,
+             EVEX_CD8<64, CD8VT1>;
+}
+
+let isCommutable = 1 in {
+defm VADD : avx512_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S>;
+defm VMUL : avx512_binop_s<0x59, "mul", fmul, SSE_ALU_ITINS_S>;
+defm VMIN : avx512_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S>;
+defm VMAX : avx512_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S>;
+}
+let isCommutable = 0 in {
+defm VSUB : avx512_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S>;
+defm VDIV : avx512_binop_s<0x5E, "div", fdiv, SSE_ALU_ITINS_S>;
+}
+
+multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                            RegisterClass RC, ValueType vt,
+                            X86MemOperand x86memop, PatFrag mem_frag,
+                            X86MemOperand x86scalar_mop, PatFrag scalar_mfrag,
+                            string BrdcstStr,
+                            Domain d, OpndItins itins, bit commutable> {
+  let isCommutable = commutable in
+  def rr : PI,
+           EVEX_4V;
+  let mayLoad = 1 in {
+  def rm : PI, EVEX_4V;
+  def rmb : PI, EVEX_4V, EVEX_B;
+  }
+}
+
+defm VADDPSZ : avx512_fp_packed<0x58, "addps", fadd, VR512, v16f32, f512mem,
+                   memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle,
+                   SSE_ALU_ITINS_P.s, 1>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+defm VADDPDZ : avx512_fp_packed<0x58, "addpd", fadd, VR512, v8f64, f512mem,
+                   memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble,
+                   SSE_ALU_ITINS_P.d, 1>,
+                   EVEX_V512, OpSize, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VMULPSZ : avx512_fp_packed<0x59, "mulps", fmul, VR512, v16f32, f512mem,
+                   memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle,
+                   SSE_ALU_ITINS_P.s, 1>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VMULPDZ : avx512_fp_packed<0x59, "mulpd", fmul, VR512, v8f64, f512mem,
+                   memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble,
+                   SSE_ALU_ITINS_P.d, 1>,
+                   EVEX_V512, OpSize, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VMINPSZ : avx512_fp_packed<0x5D, "minps", X86fmin, VR512, v16f32, f512mem,
+                   memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle,
+                   SSE_ALU_ITINS_P.s, 1>,
+                   EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VMAXPSZ : avx512_fp_packed<0x5F, "maxps", X86fmax, VR512, v16f32, f512mem,
+                   memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle,
+                   SSE_ALU_ITINS_P.s, 1>,
+                   EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+defm VMINPDZ : avx512_fp_packed<0x5D, "minpd", X86fmin, VR512, v8f64, f512mem,
+                   memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble,
+                   SSE_ALU_ITINS_P.d, 1>,
+                   EVEX_V512, OpSize, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VMAXPDZ : avx512_fp_packed<0x5F, "maxpd", X86fmax, VR512, v8f64, f512mem,
+                   memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble,
+                   SSE_ALU_ITINS_P.d, 1>,
+                   EVEX_V512, OpSize, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VSUBPSZ : avx512_fp_packed<0x5C, "subps", fsub, VR512, v16f32, f512mem,
+                   memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle,
+                   SSE_ALU_ITINS_P.s, 0>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VDIVPSZ : avx512_fp_packed<0x5E, "divps", fdiv, VR512, v16f32, f512mem,
+                   memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle,
+                   SSE_ALU_ITINS_P.s, 0>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+defm VSUBPDZ : avx512_fp_packed<0x5C, "subpd", fsub, VR512, v8f64, f512mem,
+                   memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble,
+                   SSE_ALU_ITINS_P.d, 0>,
+                   EVEX_V512, OpSize, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VDIVPDZ : avx512_fp_packed<0x5E, "divpd", fdiv, VR512, v8f64, f512mem,
+                   memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble,
+                   SSE_ALU_ITINS_P.d, 0>,
+                   EVEX_V512, OpSize, VEX_W, EVEX_CD8<64, CD8VF>;
+
diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
index e6460e972bce..9f1c999cdd89 100644
--- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -366,6 +366,16 @@ def memop : PatFrag<(ops node:$ptr), (load node:$ptr), [{
            || cast<LoadSDNode>(N)->getAlignment() >= 16;
 }]>;

+def memop4 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return Subtarget->hasVectorUAMem()
+    || cast<LoadSDNode>(N)->getAlignment() >= 4;
+}]>;
+
+def memop8 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return Subtarget->hasVectorUAMem()
+    || cast<LoadSDNode>(N)->getAlignment() >= 8;
+}]>;
+
 def memopfsf32 : PatFrag<(ops node:$ptr), (f32 (memop node:$ptr))>;
 def memopfsf64 : PatFrag<(ops node:$ptr), (f64 (memop node:$ptr))>;

@@ -382,10 +392,10 @@ def memopv4f64 : PatFrag<(ops node:$ptr), (v4f64 (memop node:$ptr))>;
 def memopv4i64 : PatFrag<(ops node:$ptr), (v4i64 (memop node:$ptr))>;

 // 512-bit memop pattern fragments
-def memopv16f32 : PatFrag<(ops node:$ptr), (v16f32 (memop node:$ptr))>;
-def memopv8f64 : PatFrag<(ops node:$ptr), (v8f64 (memop node:$ptr))>;
-def memopv16i32 : PatFrag<(ops node:$ptr), (v16i32 (memop node:$ptr))>;
-def memopv8i64 : PatFrag<(ops node:$ptr), (v8i64 (memop node:$ptr))>;
+def memopv16f32 : PatFrag<(ops node:$ptr), (v16f32 (memop4 node:$ptr))>;
+def memopv8f64 : PatFrag<(ops node:$ptr), (v8f64 (memop8 node:$ptr))>;
+def memopv16i32 : PatFrag<(ops node:$ptr), (v16i32 (memop4 node:$ptr))>;
+def memopv8i64 : PatFrag<(ops node:$ptr), (v8i64 (memop8 node:$ptr))>;

 // SSSE3 uses MMX registers for some instructions. They aren't aligned on a
 // 16-byte boundary.
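Note on the X86InstrFragmentsSIMD.td change above: the new memop4/memop8 fragments relax the alignment needed to fold a 512-bit load from full vector alignment down to element alignment, so memopv16f32/memopv16i32 accept 4-byte-aligned loads and memopv8f64/memopv8i64 accept 8-byte-aligned ones. A minimal IR sketch of what this permits, written in the style of the tests below (the function name is hypothetical; this is not part of the patch):

; The load is only element-aligned, but memopv16f32 now accepts it, so it
; can fold into the vaddps memory operand instead of requiring a separate
; unaligned vector load first.
define <16 x float> @fold_elt_aligned(<16 x float> %x, <16 x float>* %p) {
entry:
  %y = load <16 x float>* %p, align 4
  %r = fadd <16 x float> %x, %y
  ret <16 x float> %r
}
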
diff --git a/llvm/test/CodeGen/X86/avx512-arith.ll b/llvm/test/CodeGen/X86/avx512-arith.ll new file mode 100644 index 000000000000..55ce9f9512d9 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512-arith.ll @@ -0,0 +1,225 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s + +; CHECK-LABEL: addpd512 +; CHECK: vaddpd +; CHECK: ret +define <8 x double> @addpd512(<8 x double> %y, <8 x double> %x) { +entry: + %add.i = fadd <8 x double> %x, %y + ret <8 x double> %add.i +} + +; CHECK-LABEL: addpd512fold +; CHECK: vaddpd LCP{{.*}}(%rip) +; CHECK: ret +define <8 x double> @addpd512fold(<8 x double> %y) { +entry: + %add.i = fadd <8 x double> %y, + ret <8 x double> %add.i +} + +; CHECK-LABEL: addps512 +; CHECK: vaddps +; CHECK: ret +define <16 x float> @addps512(<16 x float> %y, <16 x float> %x) { +entry: + %add.i = fadd <16 x float> %x, %y + ret <16 x float> %add.i +} + +; CHECK-LABEL: addps512fold +; CHECK: vaddps LCP{{.*}}(%rip) +; CHECK: ret +define <16 x float> @addps512fold(<16 x float> %y) { +entry: + %add.i = fadd <16 x float> %y, + ret <16 x float> %add.i +} + +; CHECK-LABEL: subpd512 +; CHECK: vsubpd +; CHECK: ret +define <8 x double> @subpd512(<8 x double> %y, <8 x double> %x) { +entry: + %sub.i = fsub <8 x double> %x, %y + ret <8 x double> %sub.i +} + +; CHECK-LABEL: @subpd512fold +; CHECK: vsubpd (% +; CHECK: ret +define <8 x double> @subpd512fold(<8 x double> %y, <8 x double>* %x) { +entry: + %tmp2 = load <8 x double>* %x, align 8 + %sub.i = fsub <8 x double> %y, %tmp2 + ret <8 x double> %sub.i +} + +; CHECK-LABEL: @subps512 +; CHECK: vsubps +; CHECK: ret +define <16 x float> @subps512(<16 x float> %y, <16 x float> %x) { +entry: + %sub.i = fsub <16 x float> %x, %y + ret <16 x float> %sub.i +} + +; CHECK-LABEL: subps512fold +; CHECK: vsubps (% +; CHECK: ret +define <16 x float> @subps512fold(<16 x float> %y, <16 x float>* %x) { +entry: + %tmp2 = load <16 x float>* %x, align 4 + %sub.i = fsub <16 x float> %y, %tmp2 + ret <16 x float> %sub.i +} + +; CHECK-LABEL: mulpd512 +; CHECK: vmulpd +; CHECK: ret +define <8 x double> @mulpd512(<8 x double> %y, <8 x double> %x) { +entry: + %mul.i = fmul <8 x double> %x, %y + ret <8 x double> %mul.i +} + +; CHECK-LABEL: mulpd512fold +; CHECK: vmulpd LCP{{.*}}(%rip) +; CHECK: ret +define <8 x double> @mulpd512fold(<8 x double> %y) { +entry: + %mul.i = fmul <8 x double> %y, + ret <8 x double> %mul.i +} + +; CHECK-LABEL: mulps512 +; CHECK: vmulps +; CHECK: ret +define <16 x float> @mulps512(<16 x float> %y, <16 x float> %x) { +entry: + %mul.i = fmul <16 x float> %x, %y + ret <16 x float> %mul.i +} + +; CHECK-LABEL: mulps512fold +; CHECK: vmulps LCP{{.*}}(%rip) +; CHECK: ret +define <16 x float> @mulps512fold(<16 x float> %y) { +entry: + %mul.i = fmul <16 x float> %y, + ret <16 x float> %mul.i +} + +; CHECK-LABEL: divpd512 +; CHECK: vdivpd +; CHECK: ret +define <8 x double> @divpd512(<8 x double> %y, <8 x double> %x) { +entry: + %div.i = fdiv <8 x double> %x, %y + ret <8 x double> %div.i +} + +; CHECK-LABEL: divpd512fold +; CHECK: vdivpd LCP{{.*}}(%rip) +; CHECK: ret +define <8 x double> @divpd512fold(<8 x double> %y) { +entry: + %div.i = fdiv <8 x double> %y, + ret <8 x double> %div.i +} + +; CHECK-LABEL: divps512 +; CHECK: vdivps +; CHECK: ret +define <16 x float> @divps512(<16 x float> %y, <16 x float> %x) { +entry: + %div.i = fdiv <16 x float> %x, %y + ret <16 x float> %div.i +} + +; CHECK-LABEL: divps512fold +; CHECK: vdivps LCP{{.*}}(%rip) +; CHECK: ret +define <16 x float> @divps512fold(<16 x float> %y) { +entry: + %div.i = fdiv 
<16 x float> %y, + ret <16 x float> %div.i +} + +; CHECK-LABEL: vpaddq_test +; CHECK: vpaddq %zmm +; CHECK: ret +define <8 x i64> @vpaddq_test(<8 x i64> %i, <8 x i64> %j) nounwind readnone { + %x = add <8 x i64> %i, %j + ret <8 x i64> %x +} + +; CHECK-LABEL: vpaddd_test +; CHECK: vpaddd %zmm +; CHECK: ret +define <16 x i32> @vpaddd_test(<16 x i32> %i, <16 x i32> %j) nounwind readnone { + %x = add <16 x i32> %i, %j + ret <16 x i32> %x +} + +; CHECK-LABEL: vpsubq_test +; CHECK: vpsubq %zmm +; CHECK: ret +define <8 x i64> @vpsubq_test(<8 x i64> %i, <8 x i64> %j) nounwind readnone { + %x = sub <8 x i64> %i, %j + ret <8 x i64> %x +} + +; CHECK-LABEL: vpsubd_test +; CHECK: vpsubd +; CHECK: ret +define <16 x i32> @vpsubd_test(<16 x i32> %i, <16 x i32> %j) nounwind readnone { + %x = sub <16 x i32> %i, %j + ret <16 x i32> %x +} + +; CHECK-LABEL: vpmulld_test +; CHECK: vpmulld %zmm +; CHECK: ret +define <16 x i32> @vpmulld_test(<16 x i32> %i, <16 x i32> %j) { + %x = mul <16 x i32> %i, %j + ret <16 x i32> %x +} + +; CHECK-LABEL: addq_broadcast +; CHECK: vpaddq LCP{{.*}}(%rip){1to8}, %zmm0, %zmm0 +; CHECK: ret +define <8 x i64> @addq_broadcast(<8 x i64> %a) nounwind { + %b = add <8 x i64> %a, + ret <8 x i64> %b +} + +; CHECK-LABEL: orq_broadcast +; CHECK: vporq LCP{{.*}}(%rip){1to8}, %zmm0, %zmm0 +; CHECK: ret +define <8 x i64> @orq_broadcast(<8 x i64> %a) nounwind { + %b = or <8 x i64> %a, + ret <8 x i64> %b +} + +; CHECK-LABEL: andd512fold +; CHECK: vpandd (% +; CHECK: ret +define <16 x i32> @andd512fold(<16 x i32> %y, <16 x i32>* %x) { +entry: + %a = load <16 x i32>* %x, align 4 + %b = and <16 x i32> %y, %a + ret <16 x i32> %b +} + +; CHECK-LABEL: andqbrst +; CHECK: vpandq (%rdi){1to8}, %zmm +; CHECK: ret +define <8 x i64> @andqbrst(<8 x i64> %p1, i64* %ap) { +entry: + %a = load i64* %ap, align 8 + %b = insertelement <8 x i64> undef, i64 %a, i32 0 + %c = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer + %d = and <8 x i64> %p1, %c + ret <8 x i64>%d +} \ No newline at end of file diff --git a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll index ee57af731f62..c9747a3e94f5 100644 --- a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll +++ b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll @@ -21,12 +21,13 @@ define <8 x double> @test2(<8 x double> %x, <8 x double> %y) nounwind { } ; CHECK-LABEL: test3 -; CHECK: vpcmpeqd +; CHECK: vpcmpeqd (%rdi) ; CHECK: vmovdqu32 ; CHECK: ret -define <16 x i32> @test3(<16 x i32> %x, <16 x i32> %y) nounwind { +define <16 x i32> @test3(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %yp) nounwind { + %y = load <16 x i32>* %yp, align 4 %mask = icmp eq <16 x i32> %x, %y - %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %y + %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1 ret <16 x i32> %max }
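
Postscript on the embedded-broadcast forms: the "rmb" patterns added in X86InstrAVX512.td match a scalar load splatted across the vector, which is exactly the idiom the andqbrst test above exercises for vpandq. A hedged sketch of the analogous 32-bit case (hypothetical test, not part of this commit) that the VPADDDZ rmb pattern would be expected to select as an embedded broadcast:

; Scalar load + splat feeding an add: with the rmb pattern this should
; become vpaddd (%rdi){1to16}, %zmm0, %zmm0 rather than a vpbroadcastd
; followed by a register-register vpaddd.
define <16 x i32> @addd_broadcast_sketch(<16 x i32> %p1, i32* %ap) {
entry:
  %a = load i32* %ap, align 4
  %b = insertelement <16 x i32> undef, i32 %a, i32 0
  %c = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer
  %d = add <16 x i32> %p1, %c
  ret <16 x i32> %d
}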