From e902b7d0b0f87489b52953ea83b92bf66039f452 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Fri, 3 Aug 2018 06:12:56 +0000
Subject: [PATCH] [X86] Support fp128 and/or/xor/load/store with VEX and EVEX
 encoded instructions.

Move all the patterns to X86InstrVecCompiler.td so we can keep SSE/AVX/AVX512 all in one place.

To save some patterns we'll use an existing DAG combine to convert f128 fand/for/fxor to integer when SSE2 is enabled. This allows us to reuse all the existing patterns for v2i64.

I believe this now makes SHA instructions the only case where VEX/EVEX and legacy encoded instructions could be generated simultaneously.

llvm-svn: 338821
---
 llvm/lib/Target/X86/X86ISelLowering.cpp    |   5 +-
 llvm/lib/Target/X86/X86InstrSSE.td         |  45 --
 llvm/lib/Target/X86/X86InstrVecCompiler.td |  78 ++++
 llvm/test/CodeGen/X86/fp128-i128.ll        | 463 ++++++++++++++-------
 4 files changed, 393 insertions(+), 198 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 9cb7ed0c64f4..290622566108 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -613,7 +613,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   // Long double always uses X87, except f128 in MMX.
   if (UseX87) {
     if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
-      addRegisterClass(MVT::f128, &X86::VR128RegClass);
+      addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
+                                                     : &X86::VR128RegClass);
       ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
       setOperationAction(ISD::FABS , MVT::f128, Custom);
       setOperationAction(ISD::FNEG , MVT::f128, Custom);
@@ -36981,7 +36982,7 @@ static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {
   MVT VT = N->getSimpleValueType(0);
   // If we have integer vector types available, use the integer opcodes.
-  if (VT.isVector() && Subtarget.hasSSE2()) {
+  if ((VT.isVector() || VT == MVT::f128) && Subtarget.hasSSE2()) {
     SDLoc dl(N);
     MVT IntVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 6a9b20998210..d4dca67687fe 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -8131,51 +8131,6 @@ let Predicates = [UseAVX2] in {
   }
 }
 
-//===----------------------------------------------------------------------===//
-// Extra selection patterns for f128, f128mem
-
-// movaps is shorter than movdqa. movaps is in SSE and movdqa is in SSE2.
-def : Pat<(alignedstore (f128 VR128:$src), addr:$dst),
-          (MOVAPSmr addr:$dst, (COPY_TO_REGCLASS (f128 VR128:$src), VR128))>;
-def : Pat<(store (f128 VR128:$src), addr:$dst),
-          (MOVUPSmr addr:$dst, (COPY_TO_REGCLASS (f128 VR128:$src), VR128))>;
-
-def : Pat<(alignedloadf128 addr:$src),
-          (COPY_TO_REGCLASS (MOVAPSrm addr:$src), VR128)>;
-def : Pat<(loadf128 addr:$src),
-          (COPY_TO_REGCLASS (MOVUPSrm addr:$src), VR128)>;
-
-// andps is shorter than andpd or pand. 
andps is SSE and andpd/pand are in SSE2 -def : Pat<(f128 (X86fand VR128:$src1, (memopf128 addr:$src2))), - (COPY_TO_REGCLASS - (ANDPSrm (COPY_TO_REGCLASS VR128:$src1, VR128), f128mem:$src2), - VR128)>; - -def : Pat<(f128 (X86fand VR128:$src1, VR128:$src2)), - (COPY_TO_REGCLASS - (ANDPSrr (COPY_TO_REGCLASS VR128:$src1, VR128), - (COPY_TO_REGCLASS VR128:$src2, VR128)), VR128)>; - -def : Pat<(f128 (X86for VR128:$src1, (memopf128 addr:$src2))), - (COPY_TO_REGCLASS - (ORPSrm (COPY_TO_REGCLASS VR128:$src1, VR128), f128mem:$src2), - VR128)>; - -def : Pat<(f128 (X86for VR128:$src1, VR128:$src2)), - (COPY_TO_REGCLASS - (ORPSrr (COPY_TO_REGCLASS VR128:$src1, VR128), - (COPY_TO_REGCLASS VR128:$src2, VR128)), VR128)>; - -def : Pat<(f128 (X86fxor VR128:$src1, (memopf128 addr:$src2))), - (COPY_TO_REGCLASS - (XORPSrm (COPY_TO_REGCLASS VR128:$src1, VR128), f128mem:$src2), - VR128)>; - -def : Pat<(f128 (X86fxor VR128:$src1, VR128:$src2)), - (COPY_TO_REGCLASS - (XORPSrr (COPY_TO_REGCLASS VR128:$src1, VR128), - (COPY_TO_REGCLASS VR128:$src2, VR128)), VR128)>; - //===----------------------------------------------------------------------===// // GFNI instructions //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/X86/X86InstrVecCompiler.td b/llvm/lib/Target/X86/X86InstrVecCompiler.td index 322bdb74e2de..36be0472caaa 100644 --- a/llvm/lib/Target/X86/X86InstrVecCompiler.td +++ b/llvm/lib/Target/X86/X86InstrVecCompiler.td @@ -49,6 +49,19 @@ def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>; def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>; def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>; +def : Pat<(f128 (bitconvert (v2i64 VR128:$src))), (f128 VR128:$src)>; +def : Pat<(f128 (bitconvert (v4i32 VR128:$src))), (f128 VR128:$src)>; +def : Pat<(f128 (bitconvert (v8i16 VR128:$src))), (f128 VR128:$src)>; +def : Pat<(f128 (bitconvert (v16i8 VR128:$src))), (f128 VR128:$src)>; +def : Pat<(f128 (bitconvert (v2f64 VR128:$src))), (f128 VR128:$src)>; +def : Pat<(f128 (bitconvert (v4f32 VR128:$src))), (f128 VR128:$src)>; +def : Pat<(v2i64 (bitconvert (f128 VR128:$src))), (v2i64 VR128:$src)>; +def : Pat<(v4i32 (bitconvert (f128 VR128:$src))), (v4i32 VR128:$src)>; +def : Pat<(v8i16 (bitconvert (f128 VR128:$src))), (v8i16 VR128:$src)>; +def : Pat<(v16i8 (bitconvert (f128 VR128:$src))), (v16i8 VR128:$src)>; +def : Pat<(v2f64 (bitconvert (f128 VR128:$src))), (v2f64 VR128:$src)>; +def : Pat<(v4f32 (bitconvert (f128 VR128:$src))), (v4f32 VR128:$src)>; + // Bitcasts between 256-bit vector types. Return the original type since // no instruction is needed for the conversion def : Pat<(v4i64 (bitconvert (v8i32 VR256:$src))), (v4i64 VR256:$src)>; @@ -509,3 +522,68 @@ let Predicates = [HasBWI, HasVLX] in { (KSHIFTRQri (KSHIFTLQri (COPY_TO_REGCLASS VK4:$mask, VK64), (i8 60)), (i8 60))>; } + +//===----------------------------------------------------------------------===// +// Extra selection patterns for f128, f128mem + +// movaps is shorter than movdqa. movaps is in SSE and movdqa is in SSE2. 
+let Predicates = [NoAVX] in { +def : Pat<(alignedstore (f128 VR128:$src), addr:$dst), + (MOVAPSmr addr:$dst, VR128:$src)>; +def : Pat<(store (f128 VR128:$src), addr:$dst), + (MOVUPSmr addr:$dst, VR128:$src)>; + +def : Pat<(alignedloadf128 addr:$src), + (MOVAPSrm addr:$src)>; +def : Pat<(loadf128 addr:$src), + (MOVUPSrm addr:$src)>; +} + +let Predicates = [HasAVX, NoVLX] in { +def : Pat<(alignedstore (f128 VR128:$src), addr:$dst), + (VMOVAPSmr addr:$dst, VR128:$src)>; +def : Pat<(store (f128 VR128:$src), addr:$dst), + (VMOVUPSmr addr:$dst, VR128:$src)>; + +def : Pat<(alignedloadf128 addr:$src), + (VMOVAPSrm addr:$src)>; +def : Pat<(loadf128 addr:$src), + (VMOVUPSrm addr:$src)>; +} + +let Predicates = [HasVLX] in { +def : Pat<(alignedstore (f128 VR128X:$src), addr:$dst), + (VMOVAPSZ128mr addr:$dst, VR128X:$src)>; +def : Pat<(store (f128 VR128X:$src), addr:$dst), + (VMOVUPSZ128mr addr:$dst, VR128X:$src)>; + +def : Pat<(alignedloadf128 addr:$src), + (VMOVAPSZ128rm addr:$src)>; +def : Pat<(loadf128 addr:$src), + (VMOVUPSZ128rm addr:$src)>; +} + +// With SSE2 the DAG combiner converts fp logic ops to integer logic ops to +// reduce patterns. +let Predicates = [UseSSE1] in { +// andps is shorter than andpd or pand. andps is SSE and andpd/pand are in SSE2 +def : Pat<(f128 (X86fand VR128:$src1, (memopf128 addr:$src2))), + (ANDPSrm VR128:$src1, f128mem:$src2)>; + +def : Pat<(f128 (X86fand VR128:$src1, VR128:$src2)), + (ANDPSrr VR128:$src1, VR128:$src2)>; + +def : Pat<(f128 (X86for VR128:$src1, (memopf128 addr:$src2))), + (ORPSrm VR128:$src1, f128mem:$src2)>; + +def : Pat<(f128 (X86for VR128:$src1, VR128:$src2)), + (ORPSrr VR128:$src1, VR128:$src2)>; + +def : Pat<(f128 (X86fxor VR128:$src1, (memopf128 addr:$src2))), + (XORPSrm VR128:$src1, f128mem:$src2)>; + +def : Pat<(f128 (X86fxor VR128:$src1, VR128:$src2)), + (XORPSrr VR128:$src1, VR128:$src2)>; +} + + diff --git a/llvm/test/CodeGen/X86/fp128-i128.ll b/llvm/test/CodeGen/X86/fp128-i128.ll index 5c2853581954..6bfc0e5eb519 100644 --- a/llvm/test/CodeGen/X86/fp128-i128.ll +++ b/llvm/test/CodeGen/X86/fp128-i128.ll @@ -1,6 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -O2 -mtriple=x86_64-linux-android -mattr=+mmx -enable-legalize-types-checking | FileCheck %s -; RUN: llc < %s -O2 -mtriple=x86_64-linux-gnu -mattr=+mmx -enable-legalize-types-checking | FileCheck %s +; RUN: llc < %s -O2 -verify-machineinstrs -mtriple=x86_64-linux-android -mattr=+mmx -enable-legalize-types-checking | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: llc < %s -O2 -verify-machineinstrs -mtriple=x86_64-linux-gnu -mattr=+mmx -enable-legalize-types-checking | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: llc < %s -O2 -verify-machineinstrs -mtriple=x86_64-linux-android -mattr=+mmx,avx2 -enable-legalize-types-checking | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: llc < %s -O2 -verify-machineinstrs -mtriple=x86_64-linux-gnu -mattr=+mmx,avx2 -enable-legalize-types-checking | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: llc < %s -O2 -verify-machineinstrs -mtriple=x86_64-linux-android -mattr=+mmx,avx512vl -enable-legalize-types-checking | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: llc < %s -O2 -verify-machineinstrs -mtriple=x86_64-linux-gnu -mattr=+mmx,avx512vl -enable-legalize-types-checking | FileCheck %s --check-prefixes=CHECK,AVX ; These tests were generated from simplified libm C code. 
; When compiled for the x86_64-linux-android target, @@ -42,19 +46,33 @@ ; foo(w); ; } define void @TestUnionLD1(fp128 %s, i64 %n) #0 { -; CHECK-LABEL: TestUnionLD1: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; CHECK-NEXT: movabsq $281474976710655, %rcx # imm = 0xFFFFFFFFFFFF -; CHECK-NEXT: andq %rdi, %rcx -; CHECK-NEXT: movabsq $-281474976710656, %rdx # imm = 0xFFFF000000000000 -; CHECK-NEXT: andq -{{[0-9]+}}(%rsp), %rdx -; CHECK-NEXT: orq %rcx, %rdx -; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 -; CHECK-NEXT: jmp foo # TAILCALL +; SSE-LABEL: TestUnionLD1: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movabsq $281474976710655, %rcx # imm = 0xFFFFFFFFFFFF +; SSE-NEXT: andq %rdi, %rcx +; SSE-NEXT: movabsq $-281474976710656, %rdx # imm = 0xFFFF000000000000 +; SSE-NEXT: andq -{{[0-9]+}}(%rsp), %rdx +; SSE-NEXT: orq %rcx, %rdx +; SSE-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: jmp foo # TAILCALL +; +; AVX-LABEL: TestUnionLD1: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX-NEXT: movabsq $281474976710655, %rcx # imm = 0xFFFFFFFFFFFF +; AVX-NEXT: andq %rdi, %rcx +; AVX-NEXT: movabsq $-281474976710656, %rdx # imm = 0xFFFF000000000000 +; AVX-NEXT: andq -{{[0-9]+}}(%rsp), %rdx +; AVX-NEXT: orq %rcx, %rdx +; AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; AVX-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) +; AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 +; AVX-NEXT: jmp foo # TAILCALL entry: %0 = bitcast fp128 %s to i128 %1 = zext i64 %n to i128 @@ -77,14 +95,23 @@ entry: ; return w; ; } define fp128 @TestUnionLD2(fp128 %s) #0 { -; CHECK-LABEL: TestUnionLD2: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 -; CHECK-NEXT: retq +; SSE-LABEL: TestUnionLD2: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq $0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: TestUnionLD2: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; AVX-NEXT: movq $0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 +; AVX-NEXT: retq entry: %0 = bitcast fp128 %s to i128 %bf.clear = and i128 %0, -18446744073709551616 @@ -101,25 +128,45 @@ entry: ; return (z.e < 0.1L) ? 
1.0L : 2.0L; ; } define fp128 @TestI128_1(fp128 %x) #0 { -; CHECK-LABEL: TestI128_1: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: subq $40, %rsp -; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF -; CHECK-NEXT: andq {{[0-9]+}}(%rsp), %rax -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movq %rcx, (%rsp) -; CHECK-NEXT: movaps (%rsp), %xmm0 -; CHECK-NEXT: movaps {{.*}}(%rip), %xmm1 -; CHECK-NEXT: callq __lttf2 -; CHECK-NEXT: xorl %ecx, %ecx -; CHECK-NEXT: testl %eax, %eax -; CHECK-NEXT: sets %cl -; CHECK-NEXT: shlq $4, %rcx -; CHECK-NEXT: movaps {{\.LCPI.*}}(%rcx), %xmm0 -; CHECK-NEXT: addq $40, %rsp -; CHECK-NEXT: retq +; SSE-LABEL: TestI128_1: +; SSE: # %bb.0: # %entry +; SSE-NEXT: subq $40, %rsp +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF +; SSE-NEXT: andq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; SSE-NEXT: movq %rcx, (%rsp) +; SSE-NEXT: movaps (%rsp), %xmm0 +; SSE-NEXT: movaps {{.*}}(%rip), %xmm1 +; SSE-NEXT: callq __lttf2 +; SSE-NEXT: xorl %ecx, %ecx +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: sets %cl +; SSE-NEXT: shlq $4, %rcx +; SSE-NEXT: movaps {{\.LCPI.*}}(%rcx), %xmm0 +; SSE-NEXT: addq $40, %rsp +; SSE-NEXT: retq +; +; AVX-LABEL: TestI128_1: +; AVX: # %bb.0: # %entry +; AVX-NEXT: subq $40, %rsp +; AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) +; AVX-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF +; AVX-NEXT: andq {{[0-9]+}}(%rsp), %rax +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; AVX-NEXT: movq %rcx, (%rsp) +; AVX-NEXT: vmovaps (%rsp), %xmm0 +; AVX-NEXT: vmovaps {{.*}}(%rip), %xmm1 +; AVX-NEXT: callq __lttf2 +; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: testl %eax, %eax +; AVX-NEXT: sets %cl +; AVX-NEXT: shlq $4, %rcx +; AVX-NEXT: vmovaps {{\.LCPI.*}}(%rcx), %xmm0 +; AVX-NEXT: addq $40, %rsp +; AVX-NEXT: retq entry: %0 = bitcast fp128 %x to i128 %bf.clear = and i128 %0, 170141183460469231731687303715884105727 @@ -139,15 +186,25 @@ entry: ; return (hx & 0x8000) == 0 ? 
x : y; ; } define fp128 @TestI128_2(fp128 %x, fp128 %y) #0 { -; CHECK-LABEL: TestI128_2: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: cmpq $0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: jns .LBB3_2 -; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: movaps %xmm1, %xmm0 -; CHECK-NEXT: .LBB3_2: # %entry -; CHECK-NEXT: retq +; SSE-LABEL: TestI128_2: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: cmpq $0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: jns .LBB3_2 +; SSE-NEXT: # %bb.1: # %entry +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: .LBB3_2: # %entry +; SSE-NEXT: retq +; +; AVX-LABEL: TestI128_2: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: cmpq $0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: jns .LBB3_2 +; AVX-NEXT: # %bb.1: # %entry +; AVX-NEXT: vmovaps %xmm1, %xmm0 +; AVX-NEXT: .LBB3_2: # %entry +; AVX-NEXT: retq entry: %0 = bitcast fp128 %x to i128 %cmp = icmp sgt i128 %0, -1 @@ -167,32 +224,59 @@ entry: ; return (u.e); ; } define fp128 @TestI128_3(fp128 %x, i32* nocapture readnone %ex) #0 { -; CHECK-LABEL: TestI128_3: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: subq $56, %rsp -; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax -; CHECK-NEXT: movabsq $9223090561878065152, %rcx # imm = 0x7FFF000000000000 -; CHECK-NEXT: testq %rcx, %rax -; CHECK-NEXT: je .LBB4_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; CHECK-NEXT: jmp .LBB4_3 -; CHECK-NEXT: .LBB4_2: # %if.then -; CHECK-NEXT: movaps {{.*}}(%rip), %xmm1 -; CHECK-NEXT: callq __multf3 -; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; CHECK-NEXT: movabsq $-9223090561878065153, %rdx # imm = 0x8000FFFFFFFFFFFF -; CHECK-NEXT: andq {{[0-9]+}}(%rsp), %rdx -; CHECK-NEXT: movabsq $4611123068473966592, %rax # imm = 0x3FFE000000000000 -; CHECK-NEXT: orq %rdx, %rax -; CHECK-NEXT: .LBB4_3: # %if.end -; CHECK-NEXT: movq %rcx, (%rsp) -; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movaps (%rsp), %xmm0 -; CHECK-NEXT: addq $56, %rsp -; CHECK-NEXT: retq +; SSE-LABEL: TestI128_3: +; SSE: # %bb.0: # %entry +; SSE-NEXT: subq $56, %rsp +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movabsq $9223090561878065152, %rcx # imm = 0x7FFF000000000000 +; SSE-NEXT: testq %rcx, %rax +; SSE-NEXT: je .LBB4_2 +; SSE-NEXT: # %bb.1: +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: jmp .LBB4_3 +; SSE-NEXT: .LBB4_2: # %if.then +; SSE-NEXT: movaps {{.*}}(%rip), %xmm1 +; SSE-NEXT: callq __multf3 +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: movabsq $-9223090561878065153, %rdx # imm = 0x8000FFFFFFFFFFFF +; SSE-NEXT: andq {{[0-9]+}}(%rsp), %rdx +; SSE-NEXT: movabsq $4611123068473966592, %rax # imm = 0x3FFE000000000000 +; SSE-NEXT: orq %rdx, %rax +; SSE-NEXT: .LBB4_3: # %if.end +; SSE-NEXT: movq %rcx, (%rsp) +; SSE-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps (%rsp), %xmm0 +; SSE-NEXT: addq $56, %rsp +; SSE-NEXT: retq +; +; AVX-LABEL: TestI128_3: +; AVX: # %bb.0: # %entry +; AVX-NEXT: subq $56, %rsp +; AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX-NEXT: movabsq $9223090561878065152, %rcx # imm = 0x7FFF000000000000 +; AVX-NEXT: testq %rcx, %rax +; AVX-NEXT: je .LBB4_2 +; AVX-NEXT: # %bb.1: +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: jmp .LBB4_3 +; AVX-NEXT: .LBB4_2: # %if.then +; AVX-NEXT: vmovaps {{.*}}(%rip), %xmm1 +; 
AVX-NEXT: callq __multf3 +; AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: movabsq $-9223090561878065153, %rdx # imm = 0x8000FFFFFFFFFFFF +; AVX-NEXT: andq {{[0-9]+}}(%rsp), %rdx +; AVX-NEXT: movabsq $4611123068473966592, %rax # imm = 0x3FFE000000000000 +; AVX-NEXT: orq %rdx, %rax +; AVX-NEXT: .LBB4_3: # %if.end +; AVX-NEXT: movq %rcx, (%rsp) +; AVX-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; AVX-NEXT: vmovaps (%rsp), %xmm0 +; AVX-NEXT: addq $56, %rsp +; AVX-NEXT: retq entry: %0 = bitcast fp128 %x to i128 %bf.cast = and i128 %0, 170135991163610696904058773219554885632 @@ -223,18 +307,31 @@ if.end: ; preds = %if.then, %entry ; return x + df; ; } define fp128 @TestI128_4(fp128 %x) #0 { -; CHECK-LABEL: TestI128_4: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: subq $40, %rsp -; CHECK-NEXT: movaps %xmm0, %xmm1 -; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax -; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movq $0, (%rsp) -; CHECK-NEXT: movaps (%rsp), %xmm0 -; CHECK-NEXT: callq __addtf3 -; CHECK-NEXT: addq $40, %rsp -; CHECK-NEXT: retq +; SSE-LABEL: TestI128_4: +; SSE: # %bb.0: # %entry +; SSE-NEXT: subq $40, %rsp +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; SSE-NEXT: movq $0, (%rsp) +; SSE-NEXT: movaps (%rsp), %xmm0 +; SSE-NEXT: callq __addtf3 +; SSE-NEXT: addq $40, %rsp +; SSE-NEXT: retq +; +; AVX-LABEL: TestI128_4: +; AVX: # %bb.0: # %entry +; AVX-NEXT: subq $40, %rsp +; AVX-NEXT: vmovaps %xmm0, %xmm1 +; AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; AVX-NEXT: movq $0, (%rsp) +; AVX-NEXT: vmovaps (%rsp), %xmm0 +; AVX-NEXT: callq __addtf3 +; AVX-NEXT: addq $40, %rsp +; AVX-NEXT: retq entry: %0 = bitcast fp128 %x to i128 %bf.clear = and i128 %0, -18446744073709551616 @@ -271,18 +368,31 @@ entry: } define fp128 @acosl(fp128 %x) #0 { -; CHECK-LABEL: acosl: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: subq $40, %rsp -; CHECK-NEXT: movaps %xmm0, %xmm1 -; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax -; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movq $0, (%rsp) -; CHECK-NEXT: movaps (%rsp), %xmm0 -; CHECK-NEXT: callq __addtf3 -; CHECK-NEXT: addq $40, %rsp -; CHECK-NEXT: retq +; SSE-LABEL: acosl: +; SSE: # %bb.0: # %entry +; SSE-NEXT: subq $40, %rsp +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; SSE-NEXT: movq $0, (%rsp) +; SSE-NEXT: movaps (%rsp), %xmm0 +; SSE-NEXT: callq __addtf3 +; SSE-NEXT: addq $40, %rsp +; SSE-NEXT: retq +; +; AVX-LABEL: acosl: +; AVX: # %bb.0: # %entry +; AVX-NEXT: subq $40, %rsp +; AVX-NEXT: vmovaps %xmm0, %xmm1 +; AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; AVX-NEXT: movq $0, (%rsp) +; AVX-NEXT: vmovaps (%rsp), %xmm0 +; AVX-NEXT: callq __addtf3 +; AVX-NEXT: addq $40, %rsp +; AVX-NEXT: retq entry: %0 = bitcast fp128 %x to i128 %bf.clear = and i128 %0, -18446744073709551616 @@ -293,15 +403,25 @@ entry: ; Compare i128 values and check i128 constants. 
define fp128 @TestComp(fp128 %x, fp128 %y) #0 { -; CHECK-LABEL: TestComp: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: cmpq $0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: jns .LBB8_2 -; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: movaps %xmm1, %xmm0 -; CHECK-NEXT: .LBB8_2: # %entry -; CHECK-NEXT: retq +; SSE-LABEL: TestComp: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: cmpq $0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: jns .LBB8_2 +; SSE-NEXT: # %bb.1: # %entry +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: .LBB8_2: # %entry +; SSE-NEXT: retq +; +; AVX-LABEL: TestComp: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: cmpq $0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: jns .LBB8_2 +; AVX-NEXT: # %bb.1: # %entry +; AVX-NEXT: vmovaps %xmm1, %xmm0 +; AVX-NEXT: .LBB8_2: # %entry +; AVX-NEXT: retq entry: %0 = bitcast fp128 %x to i128 %cmp = icmp sgt i128 %0, -1 @@ -313,10 +433,15 @@ declare void @foo(fp128) #1 ; Test logical operations on fp128 values. define fp128 @TestFABS_LD(fp128 %x) #0 { -; CHECK-LABEL: TestFABS_LD: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: andps {{.*}}(%rip), %xmm0 -; CHECK-NEXT: retq +; SSE-LABEL: TestFABS_LD: +; SSE: # %bb.0: # %entry +; SSE-NEXT: andps {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: TestFABS_LD: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: retq entry: %call = tail call fp128 @fabsl(fp128 %x) #2 ret fp128 %call @@ -328,43 +453,79 @@ declare fp128 @copysignl(fp128, fp128) #1 ; Test more complicated logical operations generated from copysignl. define void @TestCopySign({ fp128, fp128 }* noalias nocapture sret %agg.result, { fp128, fp128 }* byval nocapture readonly align 16 %z) #0 { -; CHECK-LABEL: TestCopySign: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: subq $40, %rsp -; CHECK-NEXT: movq %rdi, %rbx -; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 -; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: callq __gttf2 -; CHECK-NEXT: movl %eax, %ebp -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: movaps %xmm0, %xmm1 -; CHECK-NEXT: callq __subtf3 -; CHECK-NEXT: testl %ebp, %ebp -; CHECK-NEXT: jle .LBB10_1 -; CHECK-NEXT: # %bb.2: # %if.then -; CHECK-NEXT: andps {{.*}}(%rip), %xmm0 -; CHECK-NEXT: movaps %xmm0, %xmm1 -; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: movaps %xmm1, %xmm2 -; CHECK-NEXT: jmp .LBB10_3 -; CHECK-NEXT: .LBB10_1: -; CHECK-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload -; CHECK-NEXT: .LBB10_3: # %cleanup -; CHECK-NEXT: movaps {{.*}}(%rip), %xmm1 -; CHECK-NEXT: andps {{.*}}(%rip), %xmm0 -; CHECK-NEXT: andps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; CHECK-NEXT: orps %xmm1, %xmm0 -; CHECK-NEXT: movaps %xmm2, (%rbx) -; CHECK-NEXT: movaps %xmm0, 16(%rbx) -; CHECK-NEXT: movq %rbx, %rax -; CHECK-NEXT: addq $40, %rsp -; CHECK-NEXT: popq %rbx -; CHECK-NEXT: popq %rbp -; CHECK-NEXT: retq +; SSE-LABEL: TestCopySign: +; SSE: # %bb.0: # %entry +; SSE-NEXT: pushq %rbp +; SSE-NEXT: pushq %rbx +; SSE-NEXT: subq $40, %rsp +; SSE-NEXT: movq %rdi, %rbx +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: callq __gttf2 +; 
SSE-NEXT: movl %eax, %ebp +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: callq __subtf3 +; SSE-NEXT: testl %ebp, %ebp +; SSE-NEXT: jle .LBB10_1 +; SSE-NEXT: # %bb.2: # %if.then +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: andps {{.*}}(%rip), %xmm1 +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: jmp .LBB10_3 +; SSE-NEXT: .LBB10_1: +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: .LBB10_3: # %cleanup +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: andps {{.*}}(%rip), %xmm2 +; SSE-NEXT: andps {{.*}}(%rip), %xmm0 +; SSE-NEXT: orps %xmm2, %xmm0 +; SSE-NEXT: movaps %xmm1, (%rbx) +; SSE-NEXT: movaps %xmm0, 16(%rbx) +; SSE-NEXT: movq %rbx, %rax +; SSE-NEXT: addq $40, %rsp +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %rbp +; SSE-NEXT: retq +; +; AVX-LABEL: TestCopySign: +; AVX: # %bb.0: # %entry +; AVX-NEXT: pushq %rbp +; AVX-NEXT: pushq %rbx +; AVX-NEXT: subq $40, %rsp +; AVX-NEXT: movq %rdi, %rbx +; AVX-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 +; AVX-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm1 +; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: callq __gttf2 +; AVX-NEXT: movl %eax, %ebp +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vmovaps %xmm0, %xmm1 +; AVX-NEXT: callq __subtf3 +; AVX-NEXT: testl %ebp, %ebp +; AVX-NEXT: jle .LBB10_1 +; AVX-NEXT: # %bb.2: # %if.then +; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm1 +; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX-NEXT: vmovaps %xmm1, %xmm2 +; AVX-NEXT: jmp .LBB10_3 +; AVX-NEXT: .LBB10_1: +; AVX-NEXT: vmovaps (%rsp), %xmm2 # 16-byte Reload +; AVX-NEXT: .LBB10_3: # %cleanup +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vandps {{.*}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovaps %xmm2, (%rbx) +; AVX-NEXT: vmovaps %xmm0, 16(%rbx) +; AVX-NEXT: movq %rbx, %rax +; AVX-NEXT: addq $40, %rsp +; AVX-NEXT: popq %rbx +; AVX-NEXT: popq %rbp +; AVX-NEXT: retq entry: %z.realp = getelementptr inbounds { fp128, fp128 }, { fp128, fp128 }* %z, i64 0, i32 0 %z.real = load fp128, fp128* %z.realp, align 16