[X86] Support fp128 and/or/xor/load/store with VEX and EVEX encoded instructions.

Move all the patterns to X86InstrVecCompiler.td so we can keep SSE/AVX/AVX512 all in one place.

To save some patterns, we use an existing DAG combine to convert f128 fand/for/fxor to integer operations when SSE2 is enabled. This allows us to reuse all the existing patterns for v2i64.
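
A rough illustration (hypothetical function name; compare TestFABS_LD in the updated test below): fabs on an fp128 value is custom-lowered to X86ISD::FAND with a sign-clearing constant, and with SSE2 available the combine in lowerX86FPLogicOp rewrites that FAND into an AND on v2i64, so the existing vector patterns handle it:

  ; fabs is custom-lowered to X86ISD::FAND on f128; with SSE2 the combine in
  ; lowerX86FPLogicOp turns that into a v2i64 AND, reusing the vector patterns.
  define fp128 @fabs_f128(fp128 %x) {
    %r = call fp128 @llvm.fabs.f128(fp128 %x)
    ret fp128 %r
  }
  declare fp128 @llvm.fabs.f128(fp128)

Run through llc as in the RUN lines below (e.g. -mattr=+mmx,avx2), this collapses to a single vandps from the constant pool, matching the new AVX check lines for TestFABS_LD; the SSE run still gets the legacy andps.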

I believe this now makes the SHA instructions the only case where VEX/EVEX and legacy-encoded instructions could be generated simultaneously.

llvm-svn: 338821
Craig Topper 2018-08-03 06:12:56 +00:00
parent 58d837d347
commit e902b7d0b0
4 changed files with 393 additions and 198 deletions

lib/Target/X86/X86ISelLowering.cpp

@@ -613,7 +613,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// Long double always uses X87, except f128 in MMX.
if (UseX87) {
if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
addRegisterClass(MVT::f128, &X86::VR128RegClass);
addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
setOperationAction(ISD::FABS , MVT::f128, Custom);
setOperationAction(ISD::FNEG , MVT::f128, Custom);
@@ -36981,7 +36982,7 @@ static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = N->getSimpleValueType(0);
// If we have integer vector types available, use the integer opcodes.
if (VT.isVector() && Subtarget.hasSSE2()) {
if ((VT.isVector() || VT == MVT::f128) && Subtarget.hasSSE2()) {
SDLoc dl(N);
MVT IntVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);

lib/Target/X86/X86InstrSSE.td

@@ -8131,51 +8131,6 @@ let Predicates = [UseAVX2] in {
}
}
//===----------------------------------------------------------------------===//
// Extra selection patterns for f128, f128mem
// movaps is shorter than movdqa. movaps is in SSE and movdqa is in SSE2.
def : Pat<(alignedstore (f128 VR128:$src), addr:$dst),
(MOVAPSmr addr:$dst, (COPY_TO_REGCLASS (f128 VR128:$src), VR128))>;
def : Pat<(store (f128 VR128:$src), addr:$dst),
(MOVUPSmr addr:$dst, (COPY_TO_REGCLASS (f128 VR128:$src), VR128))>;
def : Pat<(alignedloadf128 addr:$src),
(COPY_TO_REGCLASS (MOVAPSrm addr:$src), VR128)>;
def : Pat<(loadf128 addr:$src),
(COPY_TO_REGCLASS (MOVUPSrm addr:$src), VR128)>;
// andps is shorter than andpd or pand. andps is SSE and andpd/pand are in SSE2
def : Pat<(f128 (X86fand VR128:$src1, (memopf128 addr:$src2))),
(COPY_TO_REGCLASS
(ANDPSrm (COPY_TO_REGCLASS VR128:$src1, VR128), f128mem:$src2),
VR128)>;
def : Pat<(f128 (X86fand VR128:$src1, VR128:$src2)),
(COPY_TO_REGCLASS
(ANDPSrr (COPY_TO_REGCLASS VR128:$src1, VR128),
(COPY_TO_REGCLASS VR128:$src2, VR128)), VR128)>;
def : Pat<(f128 (X86for VR128:$src1, (memopf128 addr:$src2))),
(COPY_TO_REGCLASS
(ORPSrm (COPY_TO_REGCLASS VR128:$src1, VR128), f128mem:$src2),
VR128)>;
def : Pat<(f128 (X86for VR128:$src1, VR128:$src2)),
(COPY_TO_REGCLASS
(ORPSrr (COPY_TO_REGCLASS VR128:$src1, VR128),
(COPY_TO_REGCLASS VR128:$src2, VR128)), VR128)>;
def : Pat<(f128 (X86fxor VR128:$src1, (memopf128 addr:$src2))),
(COPY_TO_REGCLASS
(XORPSrm (COPY_TO_REGCLASS VR128:$src1, VR128), f128mem:$src2),
VR128)>;
def : Pat<(f128 (X86fxor VR128:$src1, VR128:$src2)),
(COPY_TO_REGCLASS
(XORPSrr (COPY_TO_REGCLASS VR128:$src1, VR128),
(COPY_TO_REGCLASS VR128:$src2, VR128)), VR128)>;
//===----------------------------------------------------------------------===//
// GFNI instructions
//===----------------------------------------------------------------------===//

lib/Target/X86/X86InstrVecCompiler.td

@@ -49,6 +49,19 @@ def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>;
def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>;
def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
def : Pat<(f128 (bitconvert (v2i64 VR128:$src))), (f128 VR128:$src)>;
def : Pat<(f128 (bitconvert (v4i32 VR128:$src))), (f128 VR128:$src)>;
def : Pat<(f128 (bitconvert (v8i16 VR128:$src))), (f128 VR128:$src)>;
def : Pat<(f128 (bitconvert (v16i8 VR128:$src))), (f128 VR128:$src)>;
def : Pat<(f128 (bitconvert (v2f64 VR128:$src))), (f128 VR128:$src)>;
def : Pat<(f128 (bitconvert (v4f32 VR128:$src))), (f128 VR128:$src)>;
def : Pat<(v2i64 (bitconvert (f128 VR128:$src))), (v2i64 VR128:$src)>;
def : Pat<(v4i32 (bitconvert (f128 VR128:$src))), (v4i32 VR128:$src)>;
def : Pat<(v8i16 (bitconvert (f128 VR128:$src))), (v8i16 VR128:$src)>;
def : Pat<(v16i8 (bitconvert (f128 VR128:$src))), (v16i8 VR128:$src)>;
def : Pat<(v2f64 (bitconvert (f128 VR128:$src))), (v2f64 VR128:$src)>;
def : Pat<(v4f32 (bitconvert (f128 VR128:$src))), (v4f32 VR128:$src)>;
// Bitcasts between 256-bit vector types. Return the original type since
// no instruction is needed for the conversion
def : Pat<(v4i64 (bitconvert (v8i32 VR256:$src))), (v4i64 VR256:$src)>;
@@ -509,3 +522,68 @@ let Predicates = [HasBWI, HasVLX] in {
(KSHIFTRQri (KSHIFTLQri (COPY_TO_REGCLASS VK4:$mask, VK64),
(i8 60)), (i8 60))>;
}
//===----------------------------------------------------------------------===//
// Extra selection patterns for f128, f128mem
// movaps is shorter than movdqa. movaps is in SSE and movdqa is in SSE2.
let Predicates = [NoAVX] in {
def : Pat<(alignedstore (f128 VR128:$src), addr:$dst),
(MOVAPSmr addr:$dst, VR128:$src)>;
def : Pat<(store (f128 VR128:$src), addr:$dst),
(MOVUPSmr addr:$dst, VR128:$src)>;
def : Pat<(alignedloadf128 addr:$src),
(MOVAPSrm addr:$src)>;
def : Pat<(loadf128 addr:$src),
(MOVUPSrm addr:$src)>;
}
let Predicates = [HasAVX, NoVLX] in {
def : Pat<(alignedstore (f128 VR128:$src), addr:$dst),
(VMOVAPSmr addr:$dst, VR128:$src)>;
def : Pat<(store (f128 VR128:$src), addr:$dst),
(VMOVUPSmr addr:$dst, VR128:$src)>;
def : Pat<(alignedloadf128 addr:$src),
(VMOVAPSrm addr:$src)>;
def : Pat<(loadf128 addr:$src),
(VMOVUPSrm addr:$src)>;
}
let Predicates = [HasVLX] in {
def : Pat<(alignedstore (f128 VR128X:$src), addr:$dst),
(VMOVAPSZ128mr addr:$dst, VR128X:$src)>;
def : Pat<(store (f128 VR128X:$src), addr:$dst),
(VMOVUPSZ128mr addr:$dst, VR128X:$src)>;
def : Pat<(alignedloadf128 addr:$src),
(VMOVAPSZ128rm addr:$src)>;
def : Pat<(loadf128 addr:$src),
(VMOVUPSZ128rm addr:$src)>;
}
// With SSE2 the DAG combiner converts fp logic ops to integer logic ops to
// reduce patterns.
let Predicates = [UseSSE1] in {
// andps is shorter than andpd or pand. andps is SSE and andpd/pand are in SSE2
def : Pat<(f128 (X86fand VR128:$src1, (memopf128 addr:$src2))),
(ANDPSrm VR128:$src1, f128mem:$src2)>;
def : Pat<(f128 (X86fand VR128:$src1, VR128:$src2)),
(ANDPSrr VR128:$src1, VR128:$src2)>;
def : Pat<(f128 (X86for VR128:$src1, (memopf128 addr:$src2))),
(ORPSrm VR128:$src1, f128mem:$src2)>;
def : Pat<(f128 (X86for VR128:$src1, VR128:$src2)),
(ORPSrr VR128:$src1, VR128:$src2)>;
def : Pat<(f128 (X86fxor VR128:$src1, (memopf128 addr:$src2))),
(XORPSrm VR128:$src1, f128mem:$src2)>;
def : Pat<(f128 (X86fxor VR128:$src1, VR128:$src2)),
(XORPSrr VR128:$src1, VR128:$src2)>;
}

test/CodeGen/X86/fp128-i128.ll

@@ -1,6 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -O2 -mtriple=x86_64-linux-android -mattr=+mmx -enable-legalize-types-checking | FileCheck %s
; RUN: llc < %s -O2 -mtriple=x86_64-linux-gnu -mattr=+mmx -enable-legalize-types-checking | FileCheck %s
; RUN: llc < %s -O2 -verify-machineinstrs -mtriple=x86_64-linux-android -mattr=+mmx -enable-legalize-types-checking | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: llc < %s -O2 -verify-machineinstrs -mtriple=x86_64-linux-gnu -mattr=+mmx -enable-legalize-types-checking | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: llc < %s -O2 -verify-machineinstrs -mtriple=x86_64-linux-android -mattr=+mmx,avx2 -enable-legalize-types-checking | FileCheck %s --check-prefixes=CHECK,AVX
; RUN: llc < %s -O2 -verify-machineinstrs -mtriple=x86_64-linux-gnu -mattr=+mmx,avx2 -enable-legalize-types-checking | FileCheck %s --check-prefixes=CHECK,AVX
; RUN: llc < %s -O2 -verify-machineinstrs -mtriple=x86_64-linux-android -mattr=+mmx,avx512vl -enable-legalize-types-checking | FileCheck %s --check-prefixes=CHECK,AVX
; RUN: llc < %s -O2 -verify-machineinstrs -mtriple=x86_64-linux-gnu -mattr=+mmx,avx512vl -enable-legalize-types-checking | FileCheck %s --check-prefixes=CHECK,AVX
; These tests were generated from simplified libm C code.
; When compiled for the x86_64-linux-android target,
@@ -42,19 +46,33 @@
; foo(w);
; }
define void @TestUnionLD1(fp128 %s, i64 %n) #0 {
; CHECK-LABEL: TestUnionLD1:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; CHECK-NEXT: movabsq $281474976710655, %rcx # imm = 0xFFFFFFFFFFFF
; CHECK-NEXT: andq %rdi, %rcx
; CHECK-NEXT: movabsq $-281474976710656, %rdx # imm = 0xFFFF000000000000
; CHECK-NEXT: andq -{{[0-9]+}}(%rsp), %rdx
; CHECK-NEXT: orq %rcx, %rdx
; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
; CHECK-NEXT: jmp foo # TAILCALL
; SSE-LABEL: TestUnionLD1:
; SSE: # %bb.0: # %entry
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movabsq $281474976710655, %rcx # imm = 0xFFFFFFFFFFFF
; SSE-NEXT: andq %rdi, %rcx
; SSE-NEXT: movabsq $-281474976710656, %rdx # imm = 0xFFFF000000000000
; SSE-NEXT: andq -{{[0-9]+}}(%rsp), %rdx
; SSE-NEXT: orq %rcx, %rdx
; SSE-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT: jmp foo # TAILCALL
;
; AVX-LABEL: TestUnionLD1:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; AVX-NEXT: movabsq $281474976710655, %rcx # imm = 0xFFFFFFFFFFFF
; AVX-NEXT: andq %rdi, %rcx
; AVX-NEXT: movabsq $-281474976710656, %rdx # imm = 0xFFFF000000000000
; AVX-NEXT: andq -{{[0-9]+}}(%rsp), %rdx
; AVX-NEXT: orq %rcx, %rdx
; AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; AVX-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
; AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
; AVX-NEXT: jmp foo # TAILCALL
entry:
%0 = bitcast fp128 %s to i128
%1 = zext i64 %n to i128
@@ -77,14 +95,23 @@ entry:
; return w;
; }
define fp128 @TestUnionLD2(fp128 %s) #0 {
; CHECK-LABEL: TestUnionLD2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
; CHECK-NEXT: retq
; SSE-LABEL: TestUnionLD2:
; SSE: # %bb.0: # %entry
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: TestUnionLD2:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; AVX-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
; AVX-NEXT: retq
entry:
%0 = bitcast fp128 %s to i128
%bf.clear = and i128 %0, -18446744073709551616
@@ -101,25 +128,45 @@ entry:
; return (z.e < 0.1L) ? 1.0L : 2.0L;
; }
define fp128 @TestI128_1(fp128 %x) #0 {
; CHECK-LABEL: TestI128_1:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $40, %rsp
; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
; CHECK-NEXT: andq {{[0-9]+}}(%rsp), %rax
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movq %rcx, (%rsp)
; CHECK-NEXT: movaps (%rsp), %xmm0
; CHECK-NEXT: movaps {{.*}}(%rip), %xmm1
; CHECK-NEXT: callq __lttf2
; CHECK-NEXT: xorl %ecx, %ecx
; CHECK-NEXT: testl %eax, %eax
; CHECK-NEXT: sets %cl
; CHECK-NEXT: shlq $4, %rcx
; CHECK-NEXT: movaps {{\.LCPI.*}}(%rcx), %xmm0
; CHECK-NEXT: addq $40, %rsp
; CHECK-NEXT: retq
; SSE-LABEL: TestI128_1:
; SSE: # %bb.0: # %entry
; SSE-NEXT: subq $40, %rsp
; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
; SSE-NEXT: andq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; SSE-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; SSE-NEXT: movq %rcx, (%rsp)
; SSE-NEXT: movaps (%rsp), %xmm0
; SSE-NEXT: movaps {{.*}}(%rip), %xmm1
; SSE-NEXT: callq __lttf2
; SSE-NEXT: xorl %ecx, %ecx
; SSE-NEXT: testl %eax, %eax
; SSE-NEXT: sets %cl
; SSE-NEXT: shlq $4, %rcx
; SSE-NEXT: movaps {{\.LCPI.*}}(%rcx), %xmm0
; SSE-NEXT: addq $40, %rsp
; SSE-NEXT: retq
;
; AVX-LABEL: TestI128_1:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $40, %rsp
; AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp)
; AVX-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
; AVX-NEXT: andq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; AVX-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; AVX-NEXT: movq %rcx, (%rsp)
; AVX-NEXT: vmovaps (%rsp), %xmm0
; AVX-NEXT: vmovaps {{.*}}(%rip), %xmm1
; AVX-NEXT: callq __lttf2
; AVX-NEXT: xorl %ecx, %ecx
; AVX-NEXT: testl %eax, %eax
; AVX-NEXT: sets %cl
; AVX-NEXT: shlq $4, %rcx
; AVX-NEXT: vmovaps {{\.LCPI.*}}(%rcx), %xmm0
; AVX-NEXT: addq $40, %rsp
; AVX-NEXT: retq
entry:
%0 = bitcast fp128 %x to i128
%bf.clear = and i128 %0, 170141183460469231731687303715884105727
@@ -139,15 +186,25 @@ entry:
; return (hx & 0x8000) == 0 ? x : y;
; }
define fp128 @TestI128_2(fp128 %x, fp128 %y) #0 {
; CHECK-LABEL: TestI128_2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: cmpq $0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: jns .LBB3_2
; CHECK-NEXT: # %bb.1: # %entry
; CHECK-NEXT: movaps %xmm1, %xmm0
; CHECK-NEXT: .LBB3_2: # %entry
; CHECK-NEXT: retq
; SSE-LABEL: TestI128_2:
; SSE: # %bb.0: # %entry
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: cmpq $0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: jns .LBB3_2
; SSE-NEXT: # %bb.1: # %entry
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: .LBB3_2: # %entry
; SSE-NEXT: retq
;
; AVX-LABEL: TestI128_2:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX-NEXT: cmpq $0, -{{[0-9]+}}(%rsp)
; AVX-NEXT: jns .LBB3_2
; AVX-NEXT: # %bb.1: # %entry
; AVX-NEXT: vmovaps %xmm1, %xmm0
; AVX-NEXT: .LBB3_2: # %entry
; AVX-NEXT: retq
entry:
%0 = bitcast fp128 %x to i128
%cmp = icmp sgt i128 %0, -1
@@ -167,32 +224,59 @@ entry:
; return (u.e);
; }
define fp128 @TestI128_3(fp128 %x, i32* nocapture readnone %ex) #0 {
; CHECK-LABEL: TestI128_3:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $56, %rsp
; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
; CHECK-NEXT: movabsq $9223090561878065152, %rcx # imm = 0x7FFF000000000000
; CHECK-NEXT: testq %rcx, %rax
; CHECK-NEXT: je .LBB4_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; CHECK-NEXT: jmp .LBB4_3
; CHECK-NEXT: .LBB4_2: # %if.then
; CHECK-NEXT: movaps {{.*}}(%rip), %xmm1
; CHECK-NEXT: callq __multf3
; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; CHECK-NEXT: movabsq $-9223090561878065153, %rdx # imm = 0x8000FFFFFFFFFFFF
; CHECK-NEXT: andq {{[0-9]+}}(%rsp), %rdx
; CHECK-NEXT: movabsq $4611123068473966592, %rax # imm = 0x3FFE000000000000
; CHECK-NEXT: orq %rdx, %rax
; CHECK-NEXT: .LBB4_3: # %if.end
; CHECK-NEXT: movq %rcx, (%rsp)
; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movaps (%rsp), %xmm0
; CHECK-NEXT: addq $56, %rsp
; CHECK-NEXT: retq
; SSE-LABEL: TestI128_3:
; SSE: # %bb.0: # %entry
; SSE-NEXT: subq $56, %rsp
; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movabsq $9223090561878065152, %rcx # imm = 0x7FFF000000000000
; SSE-NEXT: testq %rcx, %rax
; SSE-NEXT: je .LBB4_2
; SSE-NEXT: # %bb.1:
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; SSE-NEXT: jmp .LBB4_3
; SSE-NEXT: .LBB4_2: # %if.then
; SSE-NEXT: movaps {{.*}}(%rip), %xmm1
; SSE-NEXT: callq __multf3
; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; SSE-NEXT: movabsq $-9223090561878065153, %rdx # imm = 0x8000FFFFFFFFFFFF
; SSE-NEXT: andq {{[0-9]+}}(%rsp), %rdx
; SSE-NEXT: movabsq $4611123068473966592, %rax # imm = 0x3FFE000000000000
; SSE-NEXT: orq %rdx, %rax
; SSE-NEXT: .LBB4_3: # %if.end
; SSE-NEXT: movq %rcx, (%rsp)
; SSE-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; SSE-NEXT: movaps (%rsp), %xmm0
; SSE-NEXT: addq $56, %rsp
; SSE-NEXT: retq
;
; AVX-LABEL: TestI128_3:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $56, %rsp
; AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp)
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: movabsq $9223090561878065152, %rcx # imm = 0x7FFF000000000000
; AVX-NEXT: testq %rcx, %rax
; AVX-NEXT: je .LBB4_2
; AVX-NEXT: # %bb.1:
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; AVX-NEXT: jmp .LBB4_3
; AVX-NEXT: .LBB4_2: # %if.then
; AVX-NEXT: vmovaps {{.*}}(%rip), %xmm1
; AVX-NEXT: callq __multf3
; AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp)
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; AVX-NEXT: movabsq $-9223090561878065153, %rdx # imm = 0x8000FFFFFFFFFFFF
; AVX-NEXT: andq {{[0-9]+}}(%rsp), %rdx
; AVX-NEXT: movabsq $4611123068473966592, %rax # imm = 0x3FFE000000000000
; AVX-NEXT: orq %rdx, %rax
; AVX-NEXT: .LBB4_3: # %if.end
; AVX-NEXT: movq %rcx, (%rsp)
; AVX-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; AVX-NEXT: vmovaps (%rsp), %xmm0
; AVX-NEXT: addq $56, %rsp
; AVX-NEXT: retq
entry:
%0 = bitcast fp128 %x to i128
%bf.cast = and i128 %0, 170135991163610696904058773219554885632
@@ -223,18 +307,31 @@ if.end: ; preds = %if.then, %entry
; return x + df;
; }
define fp128 @TestI128_4(fp128 %x) #0 {
; CHECK-LABEL: TestI128_4:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $40, %rsp
; CHECK-NEXT: movaps %xmm0, %xmm1
; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movq $0, (%rsp)
; CHECK-NEXT: movaps (%rsp), %xmm0
; CHECK-NEXT: callq __addtf3
; CHECK-NEXT: addq $40, %rsp
; CHECK-NEXT: retq
; SSE-LABEL: TestI128_4:
; SSE: # %bb.0: # %entry
; SSE-NEXT: subq $40, %rsp
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; SSE-NEXT: movq $0, (%rsp)
; SSE-NEXT: movaps (%rsp), %xmm0
; SSE-NEXT: callq __addtf3
; SSE-NEXT: addq $40, %rsp
; SSE-NEXT: retq
;
; AVX-LABEL: TestI128_4:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $40, %rsp
; AVX-NEXT: vmovaps %xmm0, %xmm1
; AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp)
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; AVX-NEXT: movq $0, (%rsp)
; AVX-NEXT: vmovaps (%rsp), %xmm0
; AVX-NEXT: callq __addtf3
; AVX-NEXT: addq $40, %rsp
; AVX-NEXT: retq
entry:
%0 = bitcast fp128 %x to i128
%bf.clear = and i128 %0, -18446744073709551616
@@ -271,18 +368,31 @@ entry:
}
define fp128 @acosl(fp128 %x) #0 {
; CHECK-LABEL: acosl:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $40, %rsp
; CHECK-NEXT: movaps %xmm0, %xmm1
; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movq $0, (%rsp)
; CHECK-NEXT: movaps (%rsp), %xmm0
; CHECK-NEXT: callq __addtf3
; CHECK-NEXT: addq $40, %rsp
; CHECK-NEXT: retq
; SSE-LABEL: acosl:
; SSE: # %bb.0: # %entry
; SSE-NEXT: subq $40, %rsp
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; SSE-NEXT: movq $0, (%rsp)
; SSE-NEXT: movaps (%rsp), %xmm0
; SSE-NEXT: callq __addtf3
; SSE-NEXT: addq $40, %rsp
; SSE-NEXT: retq
;
; AVX-LABEL: acosl:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $40, %rsp
; AVX-NEXT: vmovaps %xmm0, %xmm1
; AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp)
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; AVX-NEXT: movq $0, (%rsp)
; AVX-NEXT: vmovaps (%rsp), %xmm0
; AVX-NEXT: callq __addtf3
; AVX-NEXT: addq $40, %rsp
; AVX-NEXT: retq
entry:
%0 = bitcast fp128 %x to i128
%bf.clear = and i128 %0, -18446744073709551616
@@ -293,15 +403,25 @@ entry:
; Compare i128 values and check i128 constants.
define fp128 @TestComp(fp128 %x, fp128 %y) #0 {
; CHECK-LABEL: TestComp:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: cmpq $0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: jns .LBB8_2
; CHECK-NEXT: # %bb.1: # %entry
; CHECK-NEXT: movaps %xmm1, %xmm0
; CHECK-NEXT: .LBB8_2: # %entry
; CHECK-NEXT: retq
; SSE-LABEL: TestComp:
; SSE: # %bb.0: # %entry
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: cmpq $0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: jns .LBB8_2
; SSE-NEXT: # %bb.1: # %entry
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: .LBB8_2: # %entry
; SSE-NEXT: retq
;
; AVX-LABEL: TestComp:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX-NEXT: cmpq $0, -{{[0-9]+}}(%rsp)
; AVX-NEXT: jns .LBB8_2
; AVX-NEXT: # %bb.1: # %entry
; AVX-NEXT: vmovaps %xmm1, %xmm0
; AVX-NEXT: .LBB8_2: # %entry
; AVX-NEXT: retq
entry:
%0 = bitcast fp128 %x to i128
%cmp = icmp sgt i128 %0, -1
@@ -313,10 +433,15 @@ declare void @foo(fp128) #1
; Test logical operations on fp128 values.
define fp128 @TestFABS_LD(fp128 %x) #0 {
; CHECK-LABEL: TestFABS_LD:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: andps {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
; SSE-LABEL: TestFABS_LD:
; SSE: # %bb.0: # %entry
; SSE-NEXT: andps {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: TestFABS_LD:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
entry:
%call = tail call fp128 @fabsl(fp128 %x) #2
ret fp128 %call
@@ -328,43 +453,79 @@ declare fp128 @copysignl(fp128, fp128) #1
; Test more complicated logical operations generated from copysignl.
define void @TestCopySign({ fp128, fp128 }* noalias nocapture sret %agg.result, { fp128, fp128 }* byval nocapture readonly align 16 %z) #0 {
; CHECK-LABEL: TestCopySign:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: subq $40, %rsp
; CHECK-NEXT: movq %rdi, %rbx
; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: callq __gttf2
; CHECK-NEXT: movl %eax, %ebp
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: movaps %xmm0, %xmm1
; CHECK-NEXT: callq __subtf3
; CHECK-NEXT: testl %ebp, %ebp
; CHECK-NEXT: jle .LBB10_1
; CHECK-NEXT: # %bb.2: # %if.then
; CHECK-NEXT: andps {{.*}}(%rip), %xmm0
; CHECK-NEXT: movaps %xmm0, %xmm1
; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; CHECK-NEXT: movaps %xmm1, %xmm2
; CHECK-NEXT: jmp .LBB10_3
; CHECK-NEXT: .LBB10_1:
; CHECK-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload
; CHECK-NEXT: .LBB10_3: # %cleanup
; CHECK-NEXT: movaps {{.*}}(%rip), %xmm1
; CHECK-NEXT: andps {{.*}}(%rip), %xmm0
; CHECK-NEXT: andps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: orps %xmm1, %xmm0
; CHECK-NEXT: movaps %xmm2, (%rbx)
; CHECK-NEXT: movaps %xmm0, 16(%rbx)
; CHECK-NEXT: movq %rbx, %rax
; CHECK-NEXT: addq $40, %rsp
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: retq
; SSE-LABEL: TestCopySign:
; SSE: # %bb.0: # %entry
; SSE-NEXT: pushq %rbp
; SSE-NEXT: pushq %rbx
; SSE-NEXT: subq $40, %rsp
; SSE-NEXT: movq %rdi, %rbx
; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; SSE-NEXT: callq __gttf2
; SSE-NEXT: movl %eax, %ebp
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: callq __subtf3
; SSE-NEXT: testl %ebp, %ebp
; SSE-NEXT: jle .LBB10_1
; SSE-NEXT: # %bb.2: # %if.then
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: andps {{.*}}(%rip), %xmm1
; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; SSE-NEXT: jmp .LBB10_3
; SSE-NEXT: .LBB10_1:
; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
; SSE-NEXT: .LBB10_3: # %cleanup
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-NEXT: andps {{.*}}(%rip), %xmm2
; SSE-NEXT: andps {{.*}}(%rip), %xmm0
; SSE-NEXT: orps %xmm2, %xmm0
; SSE-NEXT: movaps %xmm1, (%rbx)
; SSE-NEXT: movaps %xmm0, 16(%rbx)
; SSE-NEXT: movq %rbx, %rax
; SSE-NEXT: addq $40, %rsp
; SSE-NEXT: popq %rbx
; SSE-NEXT: popq %rbp
; SSE-NEXT: retq
;
; AVX-LABEL: TestCopySign:
; AVX: # %bb.0: # %entry
; AVX-NEXT: pushq %rbp
; AVX-NEXT: pushq %rbx
; AVX-NEXT: subq $40, %rsp
; AVX-NEXT: movq %rdi, %rbx
; AVX-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
; AVX-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: callq __gttf2
; AVX-NEXT: movl %eax, %ebp
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, %xmm1
; AVX-NEXT: callq __subtf3
; AVX-NEXT: testl %ebp, %ebp
; AVX-NEXT: jle .LBB10_1
; AVX-NEXT: # %bb.2: # %if.then
; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm1
; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm1, %xmm2
; AVX-NEXT: jmp .LBB10_3
; AVX-NEXT: .LBB10_1:
; AVX-NEXT: vmovaps (%rsp), %xmm2 # 16-byte Reload
; AVX-NEXT: .LBB10_3: # %cleanup
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vandps {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovaps %xmm2, (%rbx)
; AVX-NEXT: vmovaps %xmm0, 16(%rbx)
; AVX-NEXT: movq %rbx, %rax
; AVX-NEXT: addq $40, %rsp
; AVX-NEXT: popq %rbx
; AVX-NEXT: popq %rbp
; AVX-NEXT: retq
entry:
%z.realp = getelementptr inbounds { fp128, fp128 }, { fp128, fp128 }* %z, i64 0, i32 0
%z.real = load fp128, fp128* %z.realp, align 16