[X86] Fix a bug in X86's peephole optimization.
Peephole optimization was folding MOVSDrm, which is a zero-extending double precision floating point load, into ADDPDrr, which is a SIMD add of two packed double precision floating point values.

(before)
%vreg21<def> = MOVSDrm <fi#0>, 1, %noreg, 0, %noreg; mem:LD8[%7](align=16)(tbaa=<badref>) VR128:%vreg21
%vreg23<def,tied1> = ADDPDrr %vreg20<tied0>, %vreg21; VR128:%vreg23,%vreg20,%vreg21

(after)
%vreg23<def,tied1> = ADDPDrm %vreg20<tied0>, <fi#0>, 1, %noreg, 0, %noreg; mem:LD8[%7](align=16)(tbaa=<badref>) VR128:%vreg23,%vreg20

X86InstrInfo::foldMemoryOperandImpl already had the logic that prevented this from happening. However, the check wasn't being conducted for loads from stack objects. This commit factors the logic out into a new function and uses it to check that loads from stack slots are not zero-extending loads.

rdar://problem/18236850

llvm-svn: 217799
commit 760814a7e1
parent fdacdb26af
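For readers less familiar with the SSE semantics involved: MOVSDrm loads 8 bytes and zeroes the upper 64 bits of the destination XMM register, while the ADDPDrm produced by the bad fold reads a full 16 bytes from the stack slot, so the upper lane of the addend becomes whatever sits next in memory instead of zero. The C++ sketch below is not part of the commit; it is a minimal intrinsics-level illustration of that difference, and the function names and values in it are made up.

// Illustration only (not from the LLVM tree): contrasts the zero-extending
// scalar load with the full-width packed load that the bad fold would use.
#include <cstdio>
#include <emmintrin.h>

// Mirrors the unfolded sequence MOVSDrm + ADDPDrr: load 8 bytes, zero the
// upper lane, then add packed doubles.
static __m128d add_with_scalar_load(__m128d x, const double *p) {
  __m128d lo = _mm_load_sd(p); // addend = {p[0], 0.0}
  return _mm_add_pd(x, lo);
}

// Mirrors the folded ADDPDrm: a full 16-byte load, so the addend's upper
// lane is p[1] rather than 0.0.
static __m128d add_with_packed_load(__m128d x, const double *p) {
  __m128d full = _mm_load_pd(p); // addend = {p[0], p[1]}
  return _mm_add_pd(x, full);
}

int main() {
  alignas(16) double mem[2] = {1.0, 42.0};
  __m128d x = _mm_set_pd(/*hi=*/100.0, /*lo=*/10.0);

  alignas(16) double r1[2], r2[2];
  _mm_store_pd(r1, add_with_scalar_load(x, mem)); // {11.0, 100.0}
  _mm_store_pd(r2, add_with_packed_load(x, mem)); // {11.0, 142.0}

  // The upper lanes differ, which is the kind of semantic change the
  // incorrect fold introduced.
  std::printf("scalar-load hi lane: %f, packed-load hi lane: %f\n", r1[1], r2[1]);
  return 0;
}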
@@ -4423,6 +4423,25 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
   return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, Size, Alignment);
 }
 
+static bool isPartialRegisterLoad(const MachineInstr &LoadMI,
+                                  const MachineFunction &MF) {
+  unsigned Opc = LoadMI.getOpcode();
+  unsigned RegSize =
+      MF.getRegInfo().getRegClass(LoadMI.getOperand(0).getReg())->getSize();
+
+  if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm) && RegSize > 4)
+    // These instructions only load 32 bits, we can't fold them if the
+    // destination register is wider than 32 bits (4 bytes).
+    return true;
+
+  if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm) && RegSize > 8)
+    // These instructions only load 64 bits, we can't fold them if the
+    // destination register is wider than 64 bits (8 bytes).
+    return true;
+
+  return false;
+}
+
 MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
                                                   MachineInstr *MI,
                                            const SmallVectorImpl<unsigned> &Ops,
@@ -4430,8 +4449,11 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
   // If loading from a FrameIndex, fold directly from the FrameIndex.
   unsigned NumOps = LoadMI->getDesc().getNumOperands();
   int FrameIndex;
-  if (isLoadFromStackSlot(LoadMI, FrameIndex))
+  if (isLoadFromStackSlot(LoadMI, FrameIndex)) {
+    if (isPartialRegisterLoad(*LoadMI, MF))
+      return nullptr;
     return foldMemoryOperandImpl(MF, MI, Ops, FrameIndex);
+  }
 
   // Check switch flag
   if (NoFusing) return nullptr;
@@ -4542,19 +4564,7 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
      break;
    }
    default: {
-     if ((LoadMI->getOpcode() == X86::MOVSSrm ||
-          LoadMI->getOpcode() == X86::VMOVSSrm) &&
-         MF.getRegInfo().getRegClass(LoadMI->getOperand(0).getReg())->getSize()
-           > 4)
-       // These instructions only load 32 bits, we can't fold them if the
-       // destination register is wider than 32 bits (4 bytes).
-       return nullptr;
-     if ((LoadMI->getOpcode() == X86::MOVSDrm ||
-          LoadMI->getOpcode() == X86::VMOVSDrm) &&
-         MF.getRegInfo().getRegClass(LoadMI->getOperand(0).getReg())->getSize()
-           > 8)
-       // These instructions only load 64 bits, we can't fold them if the
-       // destination register is wider than 64 bits (8 bytes).
-       return nullptr;
+     if (isPartialRegisterLoad(*LoadMI, MF))
+       return nullptr;
 
      // Folding a normal load. Just copy the load's address operands.
@@ -0,0 +1,31 @@
+; RUN: llc -march=x86-64 < %s | FileCheck %s
+;
+; Check that x86's peephole optimization doesn't fold a 64-bit load (movsd) into
+; addpd.
+; rdar://problem/18236850
+
+%struct.S1 = type { double, double }
+
+@g = common global %struct.S1 zeroinitializer, align 8
+
+declare void @foo3(%struct.S1*)
+
+; CHECK: movsd (%rsp), [[R0:%xmm[0-9]+]]
+; CHECK: addpd [[R0]], %xmm{{[0-9]+}}
+
+define void @foo1(double %a.coerce0, double %a.coerce1, double %b.coerce0, double %b.coerce1) {
+  %1 = alloca <2 x double>, align 16
+  %tmpcast = bitcast <2 x double>* %1 to %struct.S1*
+  call void @foo3(%struct.S1* %tmpcast) #2
+  %p2 = getelementptr inbounds %struct.S1* %tmpcast, i64 0, i32 0
+  %2 = load double* %p2, align 16
+  %p3 = getelementptr inbounds %struct.S1* %tmpcast, i64 0, i32 1
+  %3 = load double* %p3, align 8
+  %4 = insertelement <2 x double> undef, double %2, i32 0
+  %5 = insertelement <2 x double> %4, double 0.000000e+00, i32 1
+  %6 = insertelement <2 x double> undef, double %3, i32 1
+  %7 = insertelement <2 x double> %6, double 1.000000e+00, i32 0
+  %8 = fadd <2 x double> %5, %7
+  store <2 x double> %8, <2 x double>* bitcast (%struct.S1* @g to <2 x double>*), align 16
+  ret void
+}