[X86] Add pseudo instructions to use MULX with a single destination when the low result isn't used.

MULX is defined to produce only the high result when both destination
operands are the same register. We can exploit this to avoid
unnecessarily clobbering a register when the low result isn't used.

To hide this from register allocation, we use a pseudo instruction and
expand it to the regular MULX during MCInst lowering.
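For illustration, this is the shape of the change on a 64-bit high-only multiply (the same pattern appears in the test updates below; register choices are just examples):

  # before: the dead low half still claims a scratch register
  mulxq %rax, %rcx, %rax    # %rcx = low(%rdx * %rax), unused; %rax = high
  # after: name one register for both destinations; only the high half is kept
  mulxq %rax, %rax, %rax    # %rax = high(%rdx * %rax); the low half is discarded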

Differential Revision: https://reviews.llvm.org/D80500
Craig Topper 2020-05-30 15:51:56 -07:00
parent 1b6d29e06b
commit 07e8a780d8
6 changed files with 74 additions and 30 deletions

@@ -4759,20 +4759,25 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
     unsigned LoReg, HiReg;
     bool IsSigned = Opcode == ISD::SMUL_LOHI;
     bool UseMULX = !IsSigned && Subtarget->hasBMI2();
+    bool UseMULXHi = UseMULX && SDValue(Node, 0).use_empty();
     switch (NVT.SimpleTy) {
     default: llvm_unreachable("Unsupported VT!");
     case MVT::i32:
-      Opc  = UseMULX ? X86::MULX32rr :
+      Opc  = UseMULXHi ? X86::MULX32Hrr :
+             UseMULX   ? X86::MULX32rr :
              IsSigned ? X86::IMUL32r : X86::MUL32r;
-      MOpc = UseMULX ? X86::MULX32rm :
+      MOpc = UseMULXHi ? X86::MULX32Hrm :
+             UseMULX   ? X86::MULX32rm :
              IsSigned ? X86::IMUL32m : X86::MUL32m;
       LoReg = UseMULX ? X86::EDX : X86::EAX;
       HiReg = X86::EDX;
       break;
     case MVT::i64:
-      Opc  = UseMULX ? X86::MULX64rr :
+      Opc  = UseMULXHi ? X86::MULX64Hrr :
+             UseMULX   ? X86::MULX64rr :
              IsSigned ? X86::IMUL64r : X86::MUL64r;
-      MOpc = UseMULX ? X86::MULX64rm :
+      MOpc = UseMULXHi ? X86::MULX64Hrm :
+             UseMULX   ? X86::MULX64rm :
              IsSigned ? X86::IMUL64m : X86::MUL64m;
       LoReg = UseMULX ? X86::RDX : X86::RAX;
       HiReg = X86::RDX;
@@ -4796,7 +4801,12 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
       MachineSDNode *CNode = nullptr;
       SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
                         InFlag };
-      if (UseMULX) {
+      if (UseMULXHi) {
+        SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
+        CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+        ResHi = SDValue(CNode, 0);
+        Chain = SDValue(CNode, 1);
+      } else if (UseMULX) {
         SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other);
         CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
         ResHi = SDValue(CNode, 0);
@@ -4815,7 +4825,11 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
       CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
     } else {
       SDValue Ops[] = { N1, InFlag };
-      if (UseMULX) {
+      if (UseMULXHi) {
+        SDVTList VTs = CurDAG->getVTList(NVT);
+        SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
+        ResHi = SDValue(CNode, 0);
+      } else if (UseMULX) {
         SDVTList VTs = CurDAG->getVTList(NVT, NVT);
         SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
         ResHi = SDValue(CNode, 0);

@@ -1313,7 +1313,17 @@ let hasSideEffects = 0 in {
   let mayLoad = 1 in
   def rm : I<0xF6, MRMSrcMem, (outs RC:$dst1, RC:$dst2), (ins x86memop:$src),
              !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"),
              []>, T8XD, VEX_4V, Sched<[sched.Folded, WriteIMulH]>;
+
+  // Pseudo instructions to be used when the low result isn't used. The
+  // instruction is defined to keep the high if both destinations are the same.
+  def Hrr : PseudoI<(outs RC:$dst), (ins RC:$src),
+                    []>, Sched<[sched]>;
+
+  let mayLoad = 1 in
+  def Hrm : PseudoI<(outs RC:$dst), (ins x86memop:$src),
+                    []>, Sched<[sched.Folded]>;
 }
 }

@@ -509,6 +509,26 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
            "LEA has segment specified!");
     break;
+  case X86::MULX32Hrr:
+  case X86::MULX32Hrm:
+  case X86::MULX64Hrr:
+  case X86::MULX64Hrm: {
+    // Turn into regular MULX by duplicating the destination.
+    unsigned NewOpc;
+    switch (OutMI.getOpcode()) {
+    default: llvm_unreachable("Invalid opcode");
+    case X86::MULX32Hrr: NewOpc = X86::MULX32rr; break;
+    case X86::MULX32Hrm: NewOpc = X86::MULX32rm; break;
+    case X86::MULX64Hrr: NewOpc = X86::MULX64rr; break;
+    case X86::MULX64Hrm: NewOpc = X86::MULX64rm; break;
+    }
+    OutMI.setOpcode(NewOpc);
+    // Duplicate the destination.
+    unsigned DestReg = OutMI.getOperand(0).getReg();
+    OutMI.insert(OutMI.begin(), MCOperand::createReg(DestReg));
+    break;
+  }
   // Commute operands to get a smaller encoding by using VEX.R instead of VEX.B
   // if one of the registers is extended, but other isn't.
   case X86::VMOVZPQILo2PQIrr:
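As a rough sketch, the net effect of this lowering on the 64-bit register form (register names are illustrative; %rdx is the implicit multiplicand):

  # selected pseudo, single explicit destination:
  #   MULX64Hrr %rax, %rcx
  # after lowering, the destination is duplicated and the real opcode is used,
  # which prints as:
  mulxq %rcx, %rax, %rax    # %rax = high(%rdx * %rcx); the low half is discarded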

@@ -839,14 +839,14 @@ define i64 @load_fold_udiv1(i64* %p) {
 ; CHECK-O3-CUR: # %bb.0:
 ; CHECK-O3-CUR-NEXT: movq (%rdi), %rdx
 ; CHECK-O3-CUR-NEXT: movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889
-; CHECK-O3-CUR-NEXT: mulxq %rax, %rcx, %rax
+; CHECK-O3-CUR-NEXT: mulxq %rax, %rax, %rax
 ; CHECK-O3-CUR-NEXT: shrq $3, %rax
 ; CHECK-O3-CUR-NEXT: retq
 ;
 ; CHECK-O3-EX-LABEL: load_fold_udiv1:
 ; CHECK-O3-EX: # %bb.0:
 ; CHECK-O3-EX-NEXT: movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889
-; CHECK-O3-EX-NEXT: mulxq (%rdi), %rcx, %rax
+; CHECK-O3-EX-NEXT: mulxq (%rdi), %rax, %rax
 ; CHECK-O3-EX-NEXT: shrq $3, %rax
 ; CHECK-O3-EX-NEXT: retq
   %v = load atomic i64, i64* %p unordered, align 8
@@ -1034,9 +1034,9 @@ define i64 @load_fold_urem1(i64* %p) {
 ; CHECK-O3-NEXT: movq (%rdi), %rax
 ; CHECK-O3-NEXT: movabsq $-8608480567731124087, %rcx # imm = 0x8888888888888889
 ; CHECK-O3-NEXT: movq %rax, %rdx
-; CHECK-O3-NEXT: mulxq %rcx, %rcx, %rdx
-; CHECK-O3-NEXT: shrq $3, %rdx
-; CHECK-O3-NEXT: leaq (%rdx,%rdx,4), %rcx
+; CHECK-O3-NEXT: mulxq %rcx, %rcx, %rcx
+; CHECK-O3-NEXT: shrq $3, %rcx
+; CHECK-O3-NEXT: leaq (%rcx,%rcx,4), %rcx
 ; CHECK-O3-NEXT: leaq (%rcx,%rcx,2), %rcx
 ; CHECK-O3-NEXT: subq %rcx, %rax
 ; CHECK-O3-NEXT: retq
@@ -1693,7 +1693,7 @@ define void @rmw_fold_udiv1(i64* %p, i64 %v) {
 ; CHECK-O0: # %bb.0:
 ; CHECK-O0-NEXT: movq (%rdi), %rdx
 ; CHECK-O0-NEXT: movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889
-; CHECK-O0-NEXT: mulxq %rax, %rcx, %rax
+; CHECK-O0-NEXT: mulxq %rax, %rax, %rax
 ; CHECK-O0-NEXT: shrq $3, %rax
 ; CHECK-O0-NEXT: movq %rax, (%rdi)
 ; CHECK-O0-NEXT: retq
@@ -1702,17 +1702,17 @@ define void @rmw_fold_udiv1(i64* %p, i64 %v) {
 ; CHECK-O3-CUR: # %bb.0:
 ; CHECK-O3-CUR-NEXT: movq (%rdi), %rdx
 ; CHECK-O3-CUR-NEXT: movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889
-; CHECK-O3-CUR-NEXT: mulxq %rax, %rax, %rcx
-; CHECK-O3-CUR-NEXT: shrq $3, %rcx
-; CHECK-O3-CUR-NEXT: movq %rcx, (%rdi)
+; CHECK-O3-CUR-NEXT: mulxq %rax, %rax, %rax
+; CHECK-O3-CUR-NEXT: shrq $3, %rax
+; CHECK-O3-CUR-NEXT: movq %rax, (%rdi)
 ; CHECK-O3-CUR-NEXT: retq
 ;
 ; CHECK-O3-EX-LABEL: rmw_fold_udiv1:
 ; CHECK-O3-EX: # %bb.0:
 ; CHECK-O3-EX-NEXT: movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889
-; CHECK-O3-EX-NEXT: mulxq (%rdi), %rax, %rcx
-; CHECK-O3-EX-NEXT: shrq $3, %rcx
-; CHECK-O3-EX-NEXT: movq %rcx, (%rdi)
+; CHECK-O3-EX-NEXT: mulxq (%rdi), %rax, %rax
+; CHECK-O3-EX-NEXT: shrq $3, %rax
+; CHECK-O3-EX-NEXT: movq %rax, (%rdi)
 ; CHECK-O3-EX-NEXT: retq
   %prev = load atomic i64, i64* %p unordered, align 8
   %val = udiv i64 %prev, 15
@@ -1840,7 +1840,7 @@ define void @rmw_fold_urem1(i64* %p, i64 %v) {
 ; CHECK-O0-NEXT: movq (%rdi), %rax
 ; CHECK-O0-NEXT: movabsq $-8608480567731124087, %rcx # imm = 0x8888888888888889
 ; CHECK-O0-NEXT: movq %rax, %rdx
-; CHECK-O0-NEXT: mulxq %rcx, %rdx, %rcx
+; CHECK-O0-NEXT: mulxq %rcx, %rcx, %rcx
 ; CHECK-O0-NEXT: shrq $3, %rcx
 ; CHECK-O0-NEXT: leaq (%rcx,%rcx,4), %rcx
 ; CHECK-O0-NEXT: leaq (%rcx,%rcx,2), %rcx
@@ -1852,9 +1852,9 @@ define void @rmw_fold_urem1(i64* %p, i64 %v) {
 ; CHECK-O3: # %bb.0:
 ; CHECK-O3-NEXT: movq (%rdi), %rdx
 ; CHECK-O3-NEXT: movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889
-; CHECK-O3-NEXT: mulxq %rax, %rax, %rcx
-; CHECK-O3-NEXT: shrq $3, %rcx
-; CHECK-O3-NEXT: leaq (%rcx,%rcx,4), %rax
+; CHECK-O3-NEXT: mulxq %rax, %rax, %rax
+; CHECK-O3-NEXT: shrq $3, %rax
+; CHECK-O3-NEXT: leaq (%rax,%rax,4), %rax
 ; CHECK-O3-NEXT: leaq (%rax,%rax,2), %rax
 ; CHECK-O3-NEXT: subq %rax, %rdx
 ; CHECK-O3-NEXT: movq %rdx, (%rdi)

@@ -54,7 +54,7 @@ define i64 @foo(i64 %x, i64 %y) nounwind {
 ; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
 ; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edi
 ; X86-BMI-NEXT: movl %eax, %edx
-; X86-BMI-NEXT: mulxl %esi, %edx, %ebx
+; X86-BMI-NEXT: mulxl %esi, %ebx, %ebx
 ; X86-BMI-NEXT: movl %ecx, %edx
 ; X86-BMI-NEXT: mulxl %esi, %esi, %ebp
 ; X86-BMI-NEXT: addl %ebx, %esi
@@ -85,7 +85,7 @@ define i64 @foo(i64 %x, i64 %y) nounwind {
 ; X64-BMI-LABEL: foo:
 ; X64-BMI: # %bb.0:
 ; X64-BMI-NEXT: movq %rdi, %rdx
-; X64-BMI-NEXT: mulxq %rsi, %rcx, %rax
+; X64-BMI-NEXT: mulxq %rsi, %rax, %rax
 ; X64-BMI-NEXT: retq
   %tmp0 = zext i64 %x to i128
   %tmp1 = zext i64 %y to i128

@@ -7,9 +7,9 @@ define void @_Z15uint64_to_asciimPc(i64 %arg) {
 ; HSW: # %bb.0: # %bb
 ; HSW-NEXT: movabsq $811296384146066817, %rax # imm = 0xB424DC35095CD81
 ; HSW-NEXT: movq %rdi, %rdx
-; HSW-NEXT: mulxq %rax, %rax, %rcx
-; HSW-NEXT: shrq $42, %rcx
-; HSW-NEXT: imulq $281474977, %rcx, %rax # imm = 0x10C6F7A1
+; HSW-NEXT: mulxq %rax, %rax, %rax
+; HSW-NEXT: shrq $42, %rax
+; HSW-NEXT: imulq $281474977, %rax, %rax # imm = 0x10C6F7A1
 ; HSW-NEXT: shrq $20, %rax
 ; HSW-NEXT: leal (%rax,%rax,4), %eax
 ; HSW-NEXT: addl $5, %eax
@@ -24,9 +24,9 @@ define void @_Z15uint64_to_asciimPc(i64 %arg) {
 ; ZN: # %bb.0: # %bb
 ; ZN-NEXT: movabsq $811296384146066817, %rax # imm = 0xB424DC35095CD81
 ; ZN-NEXT: movq %rdi, %rdx
-; ZN-NEXT: mulxq %rax, %rax, %rcx
-; ZN-NEXT: shrq $42, %rcx
-; ZN-NEXT: imulq $281474977, %rcx, %rax # imm = 0x10C6F7A1
+; ZN-NEXT: mulxq %rax, %rax, %rax
+; ZN-NEXT: shrq $42, %rax
+; ZN-NEXT: imulq $281474977, %rax, %rax # imm = 0x10C6F7A1
 ; ZN-NEXT: shrq $20, %rax
 ; ZN-NEXT: leal 5(%rax,%rax,4), %eax
 ; ZN-NEXT: andl $134217727, %eax # imm = 0x7FFFFFF