[X86] Add load folding support to the custom isel we do for X86ISD::UMUL/SMUL.

The peephole pass isn't always able to fold the load because it can't commute the implicit usage of AL/AX/EAX/RAX.

llvm-svn: 350272
This commit is contained in:
Craig Topper 2019-01-02 23:24:08 +00:00
parent ce46bfa848
commit df5304d8de
3 changed files with 76 additions and 37 deletions

View File

@ -3454,31 +3454,73 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
SDValue N0 = Node->getOperand(0);
SDValue N1 = Node->getOperand(1);
unsigned LoReg, Opc;
unsigned LoReg, ROpc, MOpc;
switch (NVT.SimpleTy) {
default: llvm_unreachable("Unsupported VT!");
case MVT::i8:
LoReg = X86::AL;
Opc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r;
ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r;
MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m;
break;
case MVT::i16: LoReg = X86::AX; Opc = X86::MUL16r; break;
case MVT::i32: LoReg = X86::EAX; Opc = X86::MUL32r; break;
case MVT::i64: LoReg = X86::RAX; Opc = X86::MUL64r; break;
case MVT::i16:
LoReg = X86::AX;
ROpc = X86::MUL16r;
MOpc = X86::MUL16m;
break;
case MVT::i32:
LoReg = X86::EAX;
ROpc = X86::MUL32r;
MOpc = X86::MUL32m;
break;
case MVT::i64:
LoReg = X86::RAX;
ROpc = X86::MUL64r;
MOpc = X86::MUL64m;
break;
}
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
bool FoldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
// Multiply is commutative.
if (!FoldedLoad) {
FoldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
if (FoldedLoad)
std::swap(N0, N1);
}
SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
N0, SDValue()).getValue(1);
// i16/i32/i64 use an instruction that produces a low and high result even
// though only the low result is used.
SDVTList VTs;
if (NVT == MVT::i8)
VTs = CurDAG->getVTList(NVT, MVT::i32);
else
VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);
MachineSDNode *CNode;
if (FoldedLoad) {
// i16/i32/i64 use an instruction that produces a low and high result even
// though only the low result is used.
SDVTList VTs;
if (NVT == MVT::i8)
VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
else
VTs = CurDAG->getVTList(NVT, NVT, MVT::i32, MVT::Other);
SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
InFlag };
CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
// Update the chain.
ReplaceUses(N1.getValue(1), SDValue(CNode, NVT == MVT::i8 ? 2 : 3));
// Record the mem-refs
CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
} else {
// i16/i32/i64 use an instruction that produces a low and high result even
// though only the low result is used.
SDVTList VTs;
if (NVT == MVT::i8)
VTs = CurDAG->getVTList(NVT, MVT::i32);
else
VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);
CNode = CurDAG->getMachineNode(ROpc, dl, VTs, {N1, InFlag});
}
SDValue Ops[] = {N1, InFlag};
SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
ReplaceUses(SDValue(Node, 1), SDValue(CNode, NVT == MVT::i8 ? 1 : 2));
CurDAG->RemoveDeadNode(Node);

View File

@ -7,9 +7,8 @@ declare {i32, i1} @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
define zeroext i1 @a(i32 %x) nounwind {
; X86-LABEL: a:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl $3, %ecx
; X86-NEXT: mull %ecx
; X86-NEXT: movl $3, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: seto %al
; X86-NEXT: retl
;

View File

@ -725,8 +725,9 @@ define i1 @bug27873(i64 %c1, i1 %c2) {
define zeroext i1 @smuloi8_load(i8* %ptr1, i8 %v2, i8* %res) {
; SDAG-LABEL: smuloi8_load:
; SDAG: ## %bb.0:
; SDAG-NEXT: movb (%rdi), %al
; SDAG-NEXT: imulb %sil
; SDAG-NEXT: movl %esi, %eax
; SDAG-NEXT: ## kill: def $al killed $al killed $eax
; SDAG-NEXT: imulb (%rdi)
; SDAG-NEXT: seto %cl
; SDAG-NEXT: movb %al, (%rdx)
; SDAG-NEXT: movl %ecx, %eax
@ -753,9 +754,8 @@ define zeroext i1 @smuloi8_load2(i8 %v1, i8* %ptr2, i8* %res) {
; SDAG-LABEL: smuloi8_load2:
; SDAG: ## %bb.0:
; SDAG-NEXT: movl %edi, %eax
; SDAG-NEXT: movb (%rsi), %cl
; SDAG-NEXT: ## kill: def $al killed $al killed $eax
; SDAG-NEXT: imulb %cl
; SDAG-NEXT: imulb (%rsi)
; SDAG-NEXT: seto %cl
; SDAG-NEXT: movb %al, (%rdx)
; SDAG-NEXT: movl %ecx, %eax
@ -926,8 +926,9 @@ define zeroext i1 @smuloi64_load2(i64 %v1, i64* %ptr2, i64* %res) {
define zeroext i1 @umuloi8_load(i8* %ptr1, i8 %v2, i8* %res) {
; SDAG-LABEL: umuloi8_load:
; SDAG: ## %bb.0:
; SDAG-NEXT: movb (%rdi), %al
; SDAG-NEXT: mulb %sil
; SDAG-NEXT: movl %esi, %eax
; SDAG-NEXT: ## kill: def $al killed $al killed $eax
; SDAG-NEXT: mulb (%rdi)
; SDAG-NEXT: seto %cl
; SDAG-NEXT: movb %al, (%rdx)
; SDAG-NEXT: movl %ecx, %eax
@ -954,9 +955,8 @@ define zeroext i1 @umuloi8_load2(i8 %v1, i8* %ptr2, i8* %res) {
; SDAG-LABEL: umuloi8_load2:
; SDAG: ## %bb.0:
; SDAG-NEXT: movl %edi, %eax
; SDAG-NEXT: movb (%rsi), %cl
; SDAG-NEXT: ## kill: def $al killed $al killed $eax
; SDAG-NEXT: mulb %cl
; SDAG-NEXT: mulb (%rsi)
; SDAG-NEXT: seto %cl
; SDAG-NEXT: movb %al, (%rdx)
; SDAG-NEXT: movl %ecx, %eax
@ -984,8 +984,9 @@ define zeroext i1 @umuloi16_load(i16* %ptr1, i16 %v2, i16* %res) {
; SDAG-LABEL: umuloi16_load:
; SDAG: ## %bb.0:
; SDAG-NEXT: movq %rdx, %rcx
; SDAG-NEXT: movzwl (%rdi), %eax
; SDAG-NEXT: mulw %si
; SDAG-NEXT: movl %esi, %eax
; SDAG-NEXT: ## kill: def $ax killed $ax killed $eax
; SDAG-NEXT: mulw (%rdi)
; SDAG-NEXT: seto %dl
; SDAG-NEXT: movw %ax, (%rcx)
; SDAG-NEXT: movl %edx, %eax
@ -1014,9 +1015,8 @@ define zeroext i1 @umuloi16_load2(i16 %v1, i16* %ptr2, i16* %res) {
; SDAG: ## %bb.0:
; SDAG-NEXT: movq %rdx, %rcx
; SDAG-NEXT: movl %edi, %eax
; SDAG-NEXT: movzwl (%rsi), %edx
; SDAG-NEXT: ## kill: def $ax killed $ax killed $eax
; SDAG-NEXT: mulw %dx
; SDAG-NEXT: mulw (%rsi)
; SDAG-NEXT: seto %dl
; SDAG-NEXT: movw %ax, (%rcx)
; SDAG-NEXT: movl %edx, %eax
@ -1045,8 +1045,8 @@ define zeroext i1 @umuloi32_load(i32* %ptr1, i32 %v2, i32* %res) {
; SDAG-LABEL: umuloi32_load:
; SDAG: ## %bb.0:
; SDAG-NEXT: movq %rdx, %rcx
; SDAG-NEXT: movl (%rdi), %eax
; SDAG-NEXT: mull %esi
; SDAG-NEXT: movl %esi, %eax
; SDAG-NEXT: mull (%rdi)
; SDAG-NEXT: seto %dl
; SDAG-NEXT: movl %eax, (%rcx)
; SDAG-NEXT: movl %edx, %eax
@ -1075,8 +1075,7 @@ define zeroext i1 @umuloi32_load2(i32 %v1, i32* %ptr2, i32* %res) {
; SDAG: ## %bb.0:
; SDAG-NEXT: movq %rdx, %rcx
; SDAG-NEXT: movl %edi, %eax
; SDAG-NEXT: movl (%rsi), %edx
; SDAG-NEXT: mull %edx
; SDAG-NEXT: mull (%rsi)
; SDAG-NEXT: seto %dl
; SDAG-NEXT: movl %eax, (%rcx)
; SDAG-NEXT: movl %edx, %eax
@ -1104,8 +1103,8 @@ define zeroext i1 @umuloi64_load(i64* %ptr1, i64 %v2, i64* %res) {
; SDAG-LABEL: umuloi64_load:
; SDAG: ## %bb.0:
; SDAG-NEXT: movq %rdx, %rcx
; SDAG-NEXT: movq (%rdi), %rax
; SDAG-NEXT: mulq %rsi
; SDAG-NEXT: movq %rsi, %rax
; SDAG-NEXT: mulq (%rdi)
; SDAG-NEXT: seto %dl
; SDAG-NEXT: movq %rax, (%rcx)
; SDAG-NEXT: movl %edx, %eax
@ -1134,8 +1133,7 @@ define zeroext i1 @umuloi64_load2(i64 %v1, i64* %ptr2, i64* %res) {
; SDAG: ## %bb.0:
; SDAG-NEXT: movq %rdx, %rcx
; SDAG-NEXT: movq %rdi, %rax
; SDAG-NEXT: movq (%rsi), %rdx
; SDAG-NEXT: mulq %rdx
; SDAG-NEXT: mulq (%rsi)
; SDAG-NEXT: seto %dl
; SDAG-NEXT: movq %rax, (%rcx)
; SDAG-NEXT: movl %edx, %eax