[X86] Teach load folding to accept scalar _Int users of MOVSS/MOVSD.

The _Int instructions are special in that they operate on the full
VR128 register class instead of FR32.  When folding a load, the code looks
at the MOVSS and at its user, sees a register size mismatch, and bails out.

What we really know is that the rm_Int instructions don't load the
higher lanes, so folding is fine.

This happens for the straightforward intrinsic code, e.g.:

    _mm_add_ss(a, _mm_load_ss(p));

Fixes PR23349.

Differential Revision: http://reviews.llvm.org/D10554

llvm-svn: 240326
This commit is contained in:
Ahmed Bougacha 2015-06-22 20:51:51 +00:00
parent 8c6fe230d5
commit ed3c4d1a3d
2 changed files with 188 additions and 10 deletions

View File

@ -5295,21 +5295,57 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
Size, Alignment, /*AllowCommute=*/true);
}
// NOTE(review): the two lines below appear to be the pre-change signature
// (isPartialRegisterLoad); this diff view carries no +/- markers, so removed
// and added lines are interleaved. The definition introduced by the change
// starts at the doc comment that follows -- confirm against the repository.
static bool isPartialRegisterLoad(const MachineInstr &LoadMI,
const MachineFunction &MF) {
/// Check if \p LoadMI is a partial register load that we can't fold into
/// \p UserMI because the latter uses contents that wouldn't be defined in the
/// folded version. For instance, this transformation isn't legal:
/// movss (%rdi), %xmm0
/// addps %xmm0, %xmm0
/// ->
/// addps (%rdi), %xmm0
///
/// But this one is:
/// movss (%rdi), %xmm0
/// addss %xmm0, %xmm0
/// ->
/// addss (%rdi), %xmm0
///
/// \returns true when the fold must be rejected: \p LoadMI is a (V)MOVSS or
/// (V)MOVSD load whose destination register class is wider than the loaded
/// scalar and \p UserMI is not one of the scalar ADD/DIV/MUL/SUB "_Int"
/// opcodes listed in the switches below; false otherwise.
static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
const MachineInstr &UserMI,
const MachineFunction &MF) {
unsigned Opc = LoadMI.getOpcode();
unsigned UserOpc = UserMI.getOpcode();
// Size, in bytes, of the register class of the load's destination (operand 0).
unsigned RegSize =
MF.getRegInfo().getRegClass(LoadMI.getOperand(0).getReg())->getSize();
// NOTE(review): the unbraced `if` just below, the first two-line comment and
// the bare `return true;` look like pre-change lines of the diff; the braced
// `if` with the switch is the new code -- confirm.
if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm) && RegSize > 4)
if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm) && RegSize > 4) {
// These instructions only load 32 bits, we can't fold them if the
// destination register is wider than 32 bits (4 bytes).
return true;
// destination register is wider than 32 bits (4 bytes), and its user
// instruction isn't scalar (SS).
// Per the change description, the memory forms of these _Int opcodes do not
// load the higher lanes, so a 32-bit load can still be folded into them.
switch (UserOpc) {
case X86::ADDSSrr_Int: case X86::VADDSSrr_Int:
case X86::DIVSSrr_Int: case X86::VDIVSSrr_Int:
case X86::MULSSrr_Int: case X86::VMULSSrr_Int:
case X86::SUBSSrr_Int: case X86::VSUBSSrr_Int:
return false;
default:
return true;
}
}
// NOTE(review): as above, the unbraced `if`, the first two-line comment and
// the bare `return true;` look like pre-change lines of the diff -- confirm.
if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm) && RegSize > 8)
if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm) && RegSize > 8) {
// These instructions only load 64 bits, we can't fold them if the
// destination register is wider than 64 bits (8 bytes).
return true;
// destination register is wider than 64 bits (8 bytes), and its user
// instruction isn't scalar (SD).
// Same reasoning as the SS case, for the 64-bit scalar _Int opcodes.
switch (UserOpc) {
case X86::ADDSDrr_Int: case X86::VADDSDrr_Int:
case X86::DIVSDrr_Int: case X86::VDIVSDrr_Int:
case X86::MULSDrr_Int: case X86::VMULSDrr_Int:
case X86::SUBSDrr_Int: case X86::VSUBSDrr_Int:
return false;
default:
return true;
}
}
// Any other load opcode, or a destination no wider than the loaded scalar, is
// not rejected by this check.
return false;
}
@ -5321,7 +5357,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
unsigned NumOps = LoadMI->getDesc().getNumOperands();
int FrameIndex;
if (isLoadFromStackSlot(LoadMI, FrameIndex)) {
if (isPartialRegisterLoad(*LoadMI, MF))
if (isNonFoldablePartialRegisterLoad(*LoadMI, *MI, MF))
return nullptr;
return foldMemoryOperandImpl(MF, MI, Ops, InsertPt, FrameIndex);
}
@ -5434,7 +5470,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
break;
}
default: {
if (isPartialRegisterLoad(*LoadMI, MF))
if (isNonFoldablePartialRegisterLoad(*LoadMI, *MI, MF))
return nullptr;
// Folding a normal load. Just copy the load's address operands.

View File

@ -0,0 +1,142 @@
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse2 < %s | FileCheck %s --check-prefix=SSE
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix=AVX
; Verify that we're folding the load into the math instruction.
; This pattern is generated out of the simplest intrinsics usage:
; _mm_add_ss(a, _mm_load_ss(b));
; addss: float add of lane 0 of %va with a scalar loaded from %pb, with the
; result written back into lane 0. Both run configurations expect the load to
; be folded into the arithmetic instruction's memory operand, with nothing
; between the block label and retq except that one instruction.
define <4 x float> @addss(<4 x float> %va, float* %pb) {
; SSE-LABEL: addss:
; SSE: # BB#0:
; SSE-NEXT: addss (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: addss:
; AVX: # BB#0:
; AVX-NEXT: vaddss (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
; Take the low element of the vector operand.
%a = extractelement <4 x float> %va, i32 0
; Scalar load that the checks above expect to be folded.
%b = load float, float* %pb
%r = fadd float %a, %b
; Put the scalar result back into lane 0; the other lanes come from %va.
%vr = insertelement <4 x float> %va, float %r, i32 0
ret <4 x float> %vr
}
; addsd: double add of lane 0 of %va with a scalar loaded from %pb, with the
; result written back into lane 0. Both run configurations expect the load to
; be folded into the arithmetic instruction's memory operand, with nothing
; between the block label and retq except that one instruction.
define <2 x double> @addsd(<2 x double> %va, double* %pb) {
; SSE-LABEL: addsd:
; SSE: # BB#0:
; SSE-NEXT: addsd (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: addsd:
; AVX: # BB#0:
; AVX-NEXT: vaddsd (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
; Take the low element of the vector operand.
%a = extractelement <2 x double> %va, i32 0
; Scalar load that the checks above expect to be folded.
%b = load double, double* %pb
%r = fadd double %a, %b
; Put the scalar result back into lane 0; the other lane comes from %va.
%vr = insertelement <2 x double> %va, double %r, i32 0
ret <2 x double> %vr
}
; subss: float subtract (lane 0 of %va minus a scalar loaded from %pb), with
; the result written back into lane 0. Both run configurations expect the load
; to be folded into the arithmetic instruction's memory operand, with nothing
; between the block label and retq except that one instruction.
define <4 x float> @subss(<4 x float> %va, float* %pb) {
; SSE-LABEL: subss:
; SSE: # BB#0:
; SSE-NEXT: subss (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: subss:
; AVX: # BB#0:
; AVX-NEXT: vsubss (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
; Take the low element of the vector operand.
%a = extractelement <4 x float> %va, i32 0
; Scalar load that the checks above expect to be folded.
%b = load float, float* %pb
; The loaded value is the right-hand (subtrahend) operand.
%r = fsub float %a, %b
; Put the scalar result back into lane 0; the other lanes come from %va.
%vr = insertelement <4 x float> %va, float %r, i32 0
ret <4 x float> %vr
}
; subsd: double subtract (lane 0 of %va minus a scalar loaded from %pb), with
; the result written back into lane 0. Both run configurations expect the load
; to be folded into the arithmetic instruction's memory operand, with nothing
; between the block label and retq except that one instruction.
define <2 x double> @subsd(<2 x double> %va, double* %pb) {
; SSE-LABEL: subsd:
; SSE: # BB#0:
; SSE-NEXT: subsd (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: subsd:
; AVX: # BB#0:
; AVX-NEXT: vsubsd (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
; Take the low element of the vector operand.
%a = extractelement <2 x double> %va, i32 0
; Scalar load that the checks above expect to be folded.
%b = load double, double* %pb
; The loaded value is the right-hand (subtrahend) operand.
%r = fsub double %a, %b
; Put the scalar result back into lane 0; the other lane comes from %va.
%vr = insertelement <2 x double> %va, double %r, i32 0
ret <2 x double> %vr
}
; mulss: float multiply of lane 0 of %va by a scalar loaded from %pb, with the
; result written back into lane 0. Both run configurations expect the load to
; be folded into the arithmetic instruction's memory operand, with nothing
; between the block label and retq except that one instruction.
define <4 x float> @mulss(<4 x float> %va, float* %pb) {
; SSE-LABEL: mulss:
; SSE: # BB#0:
; SSE-NEXT: mulss (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mulss:
; AVX: # BB#0:
; AVX-NEXT: vmulss (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
; Take the low element of the vector operand.
%a = extractelement <4 x float> %va, i32 0
; Scalar load that the checks above expect to be folded.
%b = load float, float* %pb
%r = fmul float %a, %b
; Put the scalar result back into lane 0; the other lanes come from %va.
%vr = insertelement <4 x float> %va, float %r, i32 0
ret <4 x float> %vr
}
; mulsd: double multiply of lane 0 of %va by a scalar loaded from %pb, with the
; result written back into lane 0. Both run configurations expect the load to
; be folded into the arithmetic instruction's memory operand, with nothing
; between the block label and retq except that one instruction.
define <2 x double> @mulsd(<2 x double> %va, double* %pb) {
; SSE-LABEL: mulsd:
; SSE: # BB#0:
; SSE-NEXT: mulsd (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mulsd:
; AVX: # BB#0:
; AVX-NEXT: vmulsd (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
; Take the low element of the vector operand.
%a = extractelement <2 x double> %va, i32 0
; Scalar load that the checks above expect to be folded.
%b = load double, double* %pb
%r = fmul double %a, %b
; Put the scalar result back into lane 0; the other lane comes from %va.
%vr = insertelement <2 x double> %va, double %r, i32 0
ret <2 x double> %vr
}
; divss: float divide (lane 0 of %va divided by a scalar loaded from %pb), with
; the result written back into lane 0. Both run configurations expect the load
; to be folded into the arithmetic instruction's memory operand, with nothing
; between the block label and retq except that one instruction.
define <4 x float> @divss(<4 x float> %va, float* %pb) {
; SSE-LABEL: divss:
; SSE: # BB#0:
; SSE-NEXT: divss (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: divss:
; AVX: # BB#0:
; AVX-NEXT: vdivss (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
; Take the low element of the vector operand.
%a = extractelement <4 x float> %va, i32 0
; Scalar load that the checks above expect to be folded.
%b = load float, float* %pb
; The loaded value is the divisor.
%r = fdiv float %a, %b
; Put the scalar result back into lane 0; the other lanes come from %va.
%vr = insertelement <4 x float> %va, float %r, i32 0
ret <4 x float> %vr
}
; divsd: double divide (lane 0 of %va divided by a scalar loaded from %pb),
; with the result written back into lane 0. Both run configurations expect the
; load to be folded into the arithmetic instruction's memory operand, with
; nothing between the block label and retq except that one instruction.
define <2 x double> @divsd(<2 x double> %va, double* %pb) {
; SSE-LABEL: divsd:
; SSE: # BB#0:
; SSE-NEXT: divsd (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: divsd:
; AVX: # BB#0:
; AVX-NEXT: vdivsd (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
; Take the low element of the vector operand.
%a = extractelement <2 x double> %va, i32 0
; Scalar load that the checks above expect to be folded.
%b = load double, double* %pb
; The loaded value is the divisor.
%r = fdiv double %a, %b
; Put the scalar result back into lane 0; the other lane comes from %va.
%vr = insertelement <2 x double> %va, double %r, i32 0
ret <2 x double> %vr
}