forked from OSchip/llvm-project
[X86] Add pseudo instructions to use MULX with a single destination when the low result isn't used.
The instruction is defined to produce only the high result if both destinations are the same. We can exploit this to avoid unnecessarily clobbering a register. In order to hide this from register allocation, we use a pseudo instruction and expand the result during MCInst creation. Differential Revision: https://reviews.llvm.org/D80500
This commit is contained in:
parent
1b6d29e06b
commit
07e8a780d8
|
@ -4759,20 +4759,25 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
|
|||
unsigned LoReg, HiReg;
|
||||
bool IsSigned = Opcode == ISD::SMUL_LOHI;
|
||||
bool UseMULX = !IsSigned && Subtarget->hasBMI2();
|
||||
bool UseMULXHi = UseMULX && SDValue(Node, 0).use_empty();
|
||||
switch (NVT.SimpleTy) {
|
||||
default: llvm_unreachable("Unsupported VT!");
|
||||
case MVT::i32:
|
||||
Opc = UseMULX ? X86::MULX32rr :
|
||||
Opc = UseMULXHi ? X86::MULX32Hrr :
|
||||
UseMULX ? X86::MULX32rr :
|
||||
IsSigned ? X86::IMUL32r : X86::MUL32r;
|
||||
MOpc = UseMULX ? X86::MULX32rm :
|
||||
MOpc = UseMULXHi ? X86::MULX32Hrm :
|
||||
UseMULX ? X86::MULX32rm :
|
||||
IsSigned ? X86::IMUL32m : X86::MUL32m;
|
||||
LoReg = UseMULX ? X86::EDX : X86::EAX;
|
||||
HiReg = X86::EDX;
|
||||
break;
|
||||
case MVT::i64:
|
||||
Opc = UseMULX ? X86::MULX64rr :
|
||||
Opc = UseMULXHi ? X86::MULX64Hrr :
|
||||
UseMULX ? X86::MULX64rr :
|
||||
IsSigned ? X86::IMUL64r : X86::MUL64r;
|
||||
MOpc = UseMULX ? X86::MULX64rm :
|
||||
MOpc = UseMULXHi ? X86::MULX64Hrm :
|
||||
UseMULX ? X86::MULX64rm :
|
||||
IsSigned ? X86::IMUL64m : X86::MUL64m;
|
||||
LoReg = UseMULX ? X86::RDX : X86::RAX;
|
||||
HiReg = X86::RDX;
|
||||
|
@ -4796,7 +4801,12 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
|
|||
MachineSDNode *CNode = nullptr;
|
||||
SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
|
||||
InFlag };
|
||||
if (UseMULX) {
|
||||
if (UseMULXHi) {
|
||||
SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
|
||||
CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
|
||||
ResHi = SDValue(CNode, 0);
|
||||
Chain = SDValue(CNode, 1);
|
||||
} else if (UseMULX) {
|
||||
SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other);
|
||||
CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
|
||||
ResHi = SDValue(CNode, 0);
|
||||
|
@ -4815,7 +4825,11 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
|
|||
CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
|
||||
} else {
|
||||
SDValue Ops[] = { N1, InFlag };
|
||||
if (UseMULX) {
|
||||
if (UseMULXHi) {
|
||||
SDVTList VTs = CurDAG->getVTList(NVT);
|
||||
SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
|
||||
ResHi = SDValue(CNode, 0);
|
||||
} else if (UseMULX) {
|
||||
SDVTList VTs = CurDAG->getVTList(NVT, NVT);
|
||||
SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
|
||||
ResHi = SDValue(CNode, 0);
|
||||
|
|
|
@ -1313,7 +1313,17 @@ let hasSideEffects = 0 in {
|
|||
let mayLoad = 1 in
|
||||
def rm : I<0xF6, MRMSrcMem, (outs RC:$dst1, RC:$dst2), (ins x86memop:$src),
|
||||
!strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"),
|
||||
|
||||
[]>, T8XD, VEX_4V, Sched<[sched.Folded, WriteIMulH]>;
|
||||
|
||||
// Pseudo instructions to be used when the low result isn't used. The
|
||||
// instruction is defined to keep the high result if both destinations are the same.
|
||||
def Hrr : PseudoI<(outs RC:$dst), (ins RC:$src),
|
||||
[]>, Sched<[sched]>;
|
||||
|
||||
let mayLoad = 1 in
|
||||
def Hrm : PseudoI<(outs RC:$dst), (ins x86memop:$src),
|
||||
[]>, Sched<[sched.Folded]>;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -509,6 +509,26 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
|
|||
"LEA has segment specified!");
|
||||
break;
|
||||
|
||||
case X86::MULX32Hrr:
case X86::MULX32Hrm:
case X86::MULX64Hrr:
case X86::MULX64Hrm: {
  // The "H" pseudos carry a single destination because only the high half of
  // the product is wanted. Turn them into a regular MULX by duplicating the
  // destination: the instruction is defined to keep the high result when both
  // destinations are the same register.
  unsigned NewOpc;
  switch (OutMI.getOpcode()) {
  default: llvm_unreachable("Invalid opcode");
  // Each pseudo must map to the real opcode with the same operand form
  // (rr -> rr, rm -> rm); the source operands are left untouched below, so a
  // memory-form pseudo needs a memory-form opcode.
  case X86::MULX32Hrr: NewOpc = X86::MULX32rr; break;
  case X86::MULX32Hrm: NewOpc = X86::MULX32rm; break;
  case X86::MULX64Hrr: NewOpc = X86::MULX64rr; break;
  case X86::MULX64Hrm: NewOpc = X86::MULX64rm; break;
  }
  OutMI.setOpcode(NewOpc);
  // Duplicate the destination: insert a second copy of operand 0 at the front
  // so the MCInst has the two destination operands the real MULX expects.
  unsigned DestReg = OutMI.getOperand(0).getReg();
  OutMI.insert(OutMI.begin(), MCOperand::createReg(DestReg));
  break;
}
|
||||
|
||||
// Commute operands to get a smaller encoding by using VEX.R instead of VEX.B
|
||||
// if one of the registers is extended, but other isn't.
|
||||
case X86::VMOVZPQILo2PQIrr:
|
||||
|
|
|
@ -839,14 +839,14 @@ define i64 @load_fold_udiv1(i64* %p) {
|
|||
; CHECK-O3-CUR: # %bb.0:
|
||||
; CHECK-O3-CUR-NEXT: movq (%rdi), %rdx
|
||||
; CHECK-O3-CUR-NEXT: movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889
|
||||
; CHECK-O3-CUR-NEXT: mulxq %rax, %rcx, %rax
|
||||
; CHECK-O3-CUR-NEXT: mulxq %rax, %rax, %rax
|
||||
; CHECK-O3-CUR-NEXT: shrq $3, %rax
|
||||
; CHECK-O3-CUR-NEXT: retq
|
||||
;
|
||||
; CHECK-O3-EX-LABEL: load_fold_udiv1:
|
||||
; CHECK-O3-EX: # %bb.0:
|
||||
; CHECK-O3-EX-NEXT: movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889
|
||||
; CHECK-O3-EX-NEXT: mulxq (%rdi), %rcx, %rax
|
||||
; CHECK-O3-EX-NEXT: mulxq (%rdi), %rax, %rax
|
||||
; CHECK-O3-EX-NEXT: shrq $3, %rax
|
||||
; CHECK-O3-EX-NEXT: retq
|
||||
%v = load atomic i64, i64* %p unordered, align 8
|
||||
|
@ -1034,9 +1034,9 @@ define i64 @load_fold_urem1(i64* %p) {
|
|||
; CHECK-O3-NEXT: movq (%rdi), %rax
|
||||
; CHECK-O3-NEXT: movabsq $-8608480567731124087, %rcx # imm = 0x8888888888888889
|
||||
; CHECK-O3-NEXT: movq %rax, %rdx
|
||||
; CHECK-O3-NEXT: mulxq %rcx, %rcx, %rdx
|
||||
; CHECK-O3-NEXT: shrq $3, %rdx
|
||||
; CHECK-O3-NEXT: leaq (%rdx,%rdx,4), %rcx
|
||||
; CHECK-O3-NEXT: mulxq %rcx, %rcx, %rcx
|
||||
; CHECK-O3-NEXT: shrq $3, %rcx
|
||||
; CHECK-O3-NEXT: leaq (%rcx,%rcx,4), %rcx
|
||||
; CHECK-O3-NEXT: leaq (%rcx,%rcx,2), %rcx
|
||||
; CHECK-O3-NEXT: subq %rcx, %rax
|
||||
; CHECK-O3-NEXT: retq
|
||||
|
@ -1693,7 +1693,7 @@ define void @rmw_fold_udiv1(i64* %p, i64 %v) {
|
|||
; CHECK-O0: # %bb.0:
|
||||
; CHECK-O0-NEXT: movq (%rdi), %rdx
|
||||
; CHECK-O0-NEXT: movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889
|
||||
; CHECK-O0-NEXT: mulxq %rax, %rcx, %rax
|
||||
; CHECK-O0-NEXT: mulxq %rax, %rax, %rax
|
||||
; CHECK-O0-NEXT: shrq $3, %rax
|
||||
; CHECK-O0-NEXT: movq %rax, (%rdi)
|
||||
; CHECK-O0-NEXT: retq
|
||||
|
@ -1702,17 +1702,17 @@ define void @rmw_fold_udiv1(i64* %p, i64 %v) {
|
|||
; CHECK-O3-CUR: # %bb.0:
|
||||
; CHECK-O3-CUR-NEXT: movq (%rdi), %rdx
|
||||
; CHECK-O3-CUR-NEXT: movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889
|
||||
; CHECK-O3-CUR-NEXT: mulxq %rax, %rax, %rcx
|
||||
; CHECK-O3-CUR-NEXT: shrq $3, %rcx
|
||||
; CHECK-O3-CUR-NEXT: movq %rcx, (%rdi)
|
||||
; CHECK-O3-CUR-NEXT: mulxq %rax, %rax, %rax
|
||||
; CHECK-O3-CUR-NEXT: shrq $3, %rax
|
||||
; CHECK-O3-CUR-NEXT: movq %rax, (%rdi)
|
||||
; CHECK-O3-CUR-NEXT: retq
|
||||
;
|
||||
; CHECK-O3-EX-LABEL: rmw_fold_udiv1:
|
||||
; CHECK-O3-EX: # %bb.0:
|
||||
; CHECK-O3-EX-NEXT: movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889
|
||||
; CHECK-O3-EX-NEXT: mulxq (%rdi), %rax, %rcx
|
||||
; CHECK-O3-EX-NEXT: shrq $3, %rcx
|
||||
; CHECK-O3-EX-NEXT: movq %rcx, (%rdi)
|
||||
; CHECK-O3-EX-NEXT: mulxq (%rdi), %rax, %rax
|
||||
; CHECK-O3-EX-NEXT: shrq $3, %rax
|
||||
; CHECK-O3-EX-NEXT: movq %rax, (%rdi)
|
||||
; CHECK-O3-EX-NEXT: retq
|
||||
%prev = load atomic i64, i64* %p unordered, align 8
|
||||
%val = udiv i64 %prev, 15
|
||||
|
@ -1840,7 +1840,7 @@ define void @rmw_fold_urem1(i64* %p, i64 %v) {
|
|||
; CHECK-O0-NEXT: movq (%rdi), %rax
|
||||
; CHECK-O0-NEXT: movabsq $-8608480567731124087, %rcx # imm = 0x8888888888888889
|
||||
; CHECK-O0-NEXT: movq %rax, %rdx
|
||||
; CHECK-O0-NEXT: mulxq %rcx, %rdx, %rcx
|
||||
; CHECK-O0-NEXT: mulxq %rcx, %rcx, %rcx
|
||||
; CHECK-O0-NEXT: shrq $3, %rcx
|
||||
; CHECK-O0-NEXT: leaq (%rcx,%rcx,4), %rcx
|
||||
; CHECK-O0-NEXT: leaq (%rcx,%rcx,2), %rcx
|
||||
|
@ -1852,9 +1852,9 @@ define void @rmw_fold_urem1(i64* %p, i64 %v) {
|
|||
; CHECK-O3: # %bb.0:
|
||||
; CHECK-O3-NEXT: movq (%rdi), %rdx
|
||||
; CHECK-O3-NEXT: movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889
|
||||
; CHECK-O3-NEXT: mulxq %rax, %rax, %rcx
|
||||
; CHECK-O3-NEXT: shrq $3, %rcx
|
||||
; CHECK-O3-NEXT: leaq (%rcx,%rcx,4), %rax
|
||||
; CHECK-O3-NEXT: mulxq %rax, %rax, %rax
|
||||
; CHECK-O3-NEXT: shrq $3, %rax
|
||||
; CHECK-O3-NEXT: leaq (%rax,%rax,4), %rax
|
||||
; CHECK-O3-NEXT: leaq (%rax,%rax,2), %rax
|
||||
; CHECK-O3-NEXT: subq %rax, %rdx
|
||||
; CHECK-O3-NEXT: movq %rdx, (%rdi)
|
||||
|
|
|
@ -54,7 +54,7 @@ define i64 @foo(i64 %x, i64 %y) nounwind {
|
|||
; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
|
||||
; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edi
|
||||
; X86-BMI-NEXT: movl %eax, %edx
|
||||
; X86-BMI-NEXT: mulxl %esi, %edx, %ebx
|
||||
; X86-BMI-NEXT: mulxl %esi, %ebx, %ebx
|
||||
; X86-BMI-NEXT: movl %ecx, %edx
|
||||
; X86-BMI-NEXT: mulxl %esi, %esi, %ebp
|
||||
; X86-BMI-NEXT: addl %ebx, %esi
|
||||
|
@ -85,7 +85,7 @@ define i64 @foo(i64 %x, i64 %y) nounwind {
|
|||
; X64-BMI-LABEL: foo:
|
||||
; X64-BMI: # %bb.0:
|
||||
; X64-BMI-NEXT: movq %rdi, %rdx
|
||||
; X64-BMI-NEXT: mulxq %rsi, %rcx, %rax
|
||||
; X64-BMI-NEXT: mulxq %rsi, %rax, %rax
|
||||
; X64-BMI-NEXT: retq
|
||||
%tmp0 = zext i64 %x to i128
|
||||
%tmp1 = zext i64 %y to i128
|
||||
|
|
|
@ -7,9 +7,9 @@ define void @_Z15uint64_to_asciimPc(i64 %arg) {
|
|||
; HSW: # %bb.0: # %bb
|
||||
; HSW-NEXT: movabsq $811296384146066817, %rax # imm = 0xB424DC35095CD81
|
||||
; HSW-NEXT: movq %rdi, %rdx
|
||||
; HSW-NEXT: mulxq %rax, %rax, %rcx
|
||||
; HSW-NEXT: shrq $42, %rcx
|
||||
; HSW-NEXT: imulq $281474977, %rcx, %rax # imm = 0x10C6F7A1
|
||||
; HSW-NEXT: mulxq %rax, %rax, %rax
|
||||
; HSW-NEXT: shrq $42, %rax
|
||||
; HSW-NEXT: imulq $281474977, %rax, %rax # imm = 0x10C6F7A1
|
||||
; HSW-NEXT: shrq $20, %rax
|
||||
; HSW-NEXT: leal (%rax,%rax,4), %eax
|
||||
; HSW-NEXT: addl $5, %eax
|
||||
|
@ -24,9 +24,9 @@ define void @_Z15uint64_to_asciimPc(i64 %arg) {
|
|||
; ZN: # %bb.0: # %bb
|
||||
; ZN-NEXT: movabsq $811296384146066817, %rax # imm = 0xB424DC35095CD81
|
||||
; ZN-NEXT: movq %rdi, %rdx
|
||||
; ZN-NEXT: mulxq %rax, %rax, %rcx
|
||||
; ZN-NEXT: shrq $42, %rcx
|
||||
; ZN-NEXT: imulq $281474977, %rcx, %rax # imm = 0x10C6F7A1
|
||||
; ZN-NEXT: mulxq %rax, %rax, %rax
|
||||
; ZN-NEXT: shrq $42, %rax
|
||||
; ZN-NEXT: imulq $281474977, %rax, %rax # imm = 0x10C6F7A1
|
||||
; ZN-NEXT: shrq $20, %rax
|
||||
; ZN-NEXT: leal 5(%rax,%rax,4), %eax
|
||||
; ZN-NEXT: andl $134217727, %eax # imm = 0x7FFFFFF
|
||||
|
|
Loading…
Reference in New Issue