[TargetLowering] Improve expandFunnelShift shift amount masking
For the 'inverse shift', we currently always perform a subtraction of the original (masked) shift amount.

But for the case where we are handling power-of-2 type widths, we can replace:

(sub bw-1, (and amt, bw-1)) -> (and (xor amt, bw-1), bw-1) -> (and ~amt, bw-1)

This allows x86 shifts to fold away the and-mask.

Followup to D77301 + D80466.

http://volta.cs.utah.edu:8080/z/Nod0Gr

Differential Revision: https://reviews.llvm.org/D80489
parent 72210ce7f5
commit 1603106725
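The rewrite rests on a simple identity: for a power-of-2 bit width BW, subtracting the masked shift amount from BW-1 equals masking the bitwise-not of the amount, i.e. (BW-1) - (Z & (BW-1)) == ~Z & (BW-1). A minimal standalone check, not part of the commit (BW = 32 and the loop bound are arbitrary choices for illustration):

#include <cassert>
#include <cstdint>

// Verify (BW - 1) - (Z & (BW - 1)) == ~Z & (BW - 1) for a power-of-2 BW.
int main() {
  const uint32_t BW = 32;
  const uint32_t Mask = BW - 1;
  for (uint32_t Z = 0; Z < (1u << 20); ++Z) {
    uint32_t SubForm = (BW - 1) - (Z & Mask); // sub bw-1, (and amt, bw-1)
    uint32_t AndForm = ~Z & Mask;             // and ~amt, bw-1
    assert(SubForm == AndForm);
  }
  return 0;
}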
@@ -6008,15 +6008,17 @@ bool TargetLowering::expandFunnelShift(SDNode *Node, SDValue &Result,
   EVT ShVT = Z.getValueType();
   SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, ShVT);
-  SDValue ShAmt;
+  SDValue ShAmt, InvShAmt;
   if (isPowerOf2_32(EltSizeInBits)) {
     // Z % BW -> Z & (BW - 1)
     ShAmt = DAG.getNode(ISD::AND, DL, ShVT, Z, Mask);
+    // (BW - 1) - (Z % BW) -> ~Z & (BW - 1)
+    InvShAmt = DAG.getNode(ISD::AND, DL, ShVT, DAG.getNOT(DL, Z, ShVT), Mask);
   } else {
     SDValue BitWidthC = DAG.getConstant(EltSizeInBits, DL, ShVT);
     ShAmt = DAG.getNode(ISD::UREM, DL, ShVT, Z, BitWidthC);
+    InvShAmt = DAG.getNode(ISD::SUB, DL, ShVT, Mask, ShAmt);
   }
-  SDValue InvShAmt = DAG.getNode(ISD::SUB, DL, ShVT, Mask, ShAmt);
 
   SDValue One = DAG.getConstant(1, DL, ShVT);
   SDValue ShX, ShY;
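For orientation, here is a scalar model of what the expansion computes for a power-of-2 width once InvShAmt is built as ~Z & (BW-1). This is only a sketch in plain C++ rather than SelectionDAG nodes; the names ShAmt, InvShAmt, ShX and ShY mirror the snippet above, and the fshl formula is the usual expand-to-two-shifts form. The remaining hunks update the expected x86 assembly in the funnel-shift codegen tests (var_shift_i32/var_shift_i64 for fshl and fshr): since x86 variable shifts only read the low bits of %cl, the and-mask folds into the shift and a single not of the count remains.

#include <cstdint>

// Scalar sketch of the expanded funnel-shift-left for BW = 32.
// The extra ">> 1" keeps both shift counts inside [0, BW - 1], so no
// shift is ever performed with an out-of-range amount.
uint32_t fshl32(uint32_t X, uint32_t Y, uint32_t Z) {
  const uint32_t Mask = 31;        // BW - 1
  uint32_t ShAmt = Z & Mask;       // Z % BW
  uint32_t InvShAmt = ~Z & Mask;   // (BW - 1) - (Z % BW)
  uint32_t ShX = X << ShAmt;
  uint32_t ShY = (Y >> 1) >> InvShAmt;
  return ShX | ShY;                // fshl(X, Y, Z)
}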
@@ -106,9 +106,8 @@ define i32 @var_shift_i32(i32 %x, i32 %y, i32 %z) nounwind {
 ; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %cl
 ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-SLOW-NEXT: shll %cl, %edx
+; X86-SLOW-NEXT: notb %cl
 ; X86-SLOW-NEXT: shrl %eax
-; X86-SLOW-NEXT: andb $31, %cl
-; X86-SLOW-NEXT: xorb $31, %cl
 ; X86-SLOW-NEXT: shrl %cl, %eax
 ; X86-SLOW-NEXT: orl %edx, %eax
 ; X86-SLOW-NEXT: retl
@@ -127,8 +126,7 @@ define i32 @var_shift_i32(i32 %x, i32 %y, i32 %z) nounwind {
 ; X64-SLOW-NEXT: movl %esi, %eax
 ; X64-SLOW-NEXT: shll %cl, %edi
 ; X64-SLOW-NEXT: shrl %eax
-; X64-SLOW-NEXT: andb $31, %cl
-; X64-SLOW-NEXT: xorb $31, %cl
+; X64-SLOW-NEXT: notb %cl
 ; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
 ; X64-SLOW-NEXT: shrl %cl, %eax
 ; X64-SLOW-NEXT: orl %edi, %eax
@@ -240,7 +238,7 @@ define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind {
; X86-SLOW-NEXT: pushl %ebx
; X86-SLOW-NEXT: pushl %edi
; X86-SLOW-NEXT: pushl %esi
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx
@@ -249,32 +247,30 @@ define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind {
; X86-SLOW-NEXT: subb %bl, %ch
; X86-SLOW-NEXT: movb %ch, %cl
; X86-SLOW-NEXT: shrl %cl, %edx
; X86-SLOW-NEXT: andb $31, %cl
; X86-SLOW-NEXT: xorb $31, %cl
; X86-SLOW-NEXT: notb %cl
; X86-SLOW-NEXT: addl %eax, %eax
; X86-SLOW-NEXT: shll %cl, %eax
; X86-SLOW-NEXT: movb %bl, %cl
; X86-SLOW-NEXT: shll %cl, %ebp
; X86-SLOW-NEXT: shll %cl, %edi
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-SLOW-NEXT: movl %esi, %edi
; X86-SLOW-NEXT: shrl %edi
; X86-SLOW-NEXT: andb $31, %cl
; X86-SLOW-NEXT: xorb $31, %cl
; X86-SLOW-NEXT: shrl %cl, %edi
; X86-SLOW-NEXT: movl %esi, %ebp
; X86-SLOW-NEXT: shrl %ebp
; X86-SLOW-NEXT: notb %cl
; X86-SLOW-NEXT: shrl %cl, %ebp
; X86-SLOW-NEXT: movb %bl, %cl
; X86-SLOW-NEXT: shll %cl, %esi
; X86-SLOW-NEXT: testb $32, %bl
; X86-SLOW-NEXT: jne .LBB5_1
; X86-SLOW-NEXT: # %bb.2:
; X86-SLOW-NEXT: orl %edi, %ebp
; X86-SLOW-NEXT: orl %ebp, %edi
; X86-SLOW-NEXT: jmp .LBB5_3
; X86-SLOW-NEXT: .LBB5_1:
; X86-SLOW-NEXT: movl %esi, %ebp
; X86-SLOW-NEXT: movl %esi, %edi
; X86-SLOW-NEXT: xorl %esi, %esi
; X86-SLOW-NEXT: .LBB5_3:
; X86-SLOW-NEXT: movb %ch, %cl
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-SLOW-NEXT: shrl %cl, %edi
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-SLOW-NEXT: shrl %cl, %ebp
; X86-SLOW-NEXT: testb $32, %ch
; X86-SLOW-NEXT: jne .LBB5_4
; X86-SLOW-NEXT: # %bb.5:
@@ -282,17 +278,17 @@ define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind {
; X86-SLOW-NEXT: movl %eax, %ecx
; X86-SLOW-NEXT: jmp .LBB5_6
; X86-SLOW-NEXT: .LBB5_4:
; X86-SLOW-NEXT: movl %edi, %ecx
; X86-SLOW-NEXT: xorl %edi, %edi
; X86-SLOW-NEXT: movl %ebp, %ecx
; X86-SLOW-NEXT: xorl %ebp, %ebp
; X86-SLOW-NEXT: .LBB5_6:
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SLOW-NEXT: testl %ebx, %ebx
; X86-SLOW-NEXT: je .LBB5_8
; X86-SLOW-NEXT: # %bb.7:
; X86-SLOW-NEXT: orl %edi, %ebp
; X86-SLOW-NEXT: orl %ebp, %edi
; X86-SLOW-NEXT: orl %ecx, %esi
; X86-SLOW-NEXT: movl %ebp, %edx
; X86-SLOW-NEXT: movl %edi, %edx
; X86-SLOW-NEXT: movl %esi, %eax
; X86-SLOW-NEXT: .LBB5_8:
; X86-SLOW-NEXT: popl %esi
@@ -315,8 +311,7 @@ define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind {
 ; X64-SLOW-NEXT: movq %rsi, %rax
 ; X64-SLOW-NEXT: shlq %cl, %rdi
 ; X64-SLOW-NEXT: shrq %rax
-; X64-SLOW-NEXT: andb $63, %cl
-; X64-SLOW-NEXT: xorb $63, %cl
+; X64-SLOW-NEXT: notb %cl
 ; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $rcx
 ; X64-SLOW-NEXT: shrq %cl, %rax
 ; X64-SLOW-NEXT: orq %rdi, %rax

@@ -103,9 +103,8 @@ define i32 @var_shift_i32(i32 %x, i32 %y, i32 %z) nounwind {
 ; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %cl
 ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-SLOW-NEXT: shrl %cl, %edx
+; X86-SLOW-NEXT: notb %cl
 ; X86-SLOW-NEXT: addl %eax, %eax
-; X86-SLOW-NEXT: andb $31, %cl
-; X86-SLOW-NEXT: xorb $31, %cl
 ; X86-SLOW-NEXT: shll %cl, %eax
 ; X86-SLOW-NEXT: orl %edx, %eax
 ; X86-SLOW-NEXT: retl
@@ -124,8 +123,7 @@ define i32 @var_shift_i32(i32 %x, i32 %y, i32 %z) nounwind {
 ; X64-SLOW-NEXT: # kill: def $edi killed $edi def $rdi
 ; X64-SLOW-NEXT: shrl %cl, %esi
 ; X64-SLOW-NEXT: leal (%rdi,%rdi), %eax
-; X64-SLOW-NEXT: andb $31, %cl
-; X64-SLOW-NEXT: xorb $31, %cl
+; X64-SLOW-NEXT: notb %cl
 ; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
 ; X64-SLOW-NEXT: shll %cl, %eax
 ; X64-SLOW-NEXT: orl %esi, %eax
@@ -234,60 +232,58 @@ define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind {
; X86-SLOW-NEXT: pushl %edi
; X86-SLOW-NEXT: pushl %esi
; X86-SLOW-NEXT: subl $8, %esp
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-SLOW-NEXT: andl $63, %ebx
; X86-SLOW-NEXT: movb $64, %ch
; X86-SLOW-NEXT: subb %bl, %ch
; X86-SLOW-NEXT: movb %ch, %cl
; X86-SLOW-NEXT: shll %cl, %edx
; X86-SLOW-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SLOW-NEXT: movl %esi, %edx
; X86-SLOW-NEXT: andb $31, %cl
; X86-SLOW-NEXT: xorb $31, %cl
; X86-SLOW-NEXT: shll %cl, %eax
; X86-SLOW-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SLOW-NEXT: notb %cl
; X86-SLOW-NEXT: shrl %esi
; X86-SLOW-NEXT: shrl %cl, %esi
; X86-SLOW-NEXT: movb %bl, %cl
; X86-SLOW-NEXT: shrl %cl, %eax
; X86-SLOW-NEXT: andb $31, %cl
; X86-SLOW-NEXT: xorb $31, %cl
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-SLOW-NEXT: leal (%edi,%edi), %ebp
; X86-SLOW-NEXT: shrl %cl, %edi
; X86-SLOW-NEXT: notb %cl
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SLOW-NEXT: leal (%eax,%eax), %ebp
; X86-SLOW-NEXT: shll %cl, %ebp
; X86-SLOW-NEXT: movb %bl, %cl
; X86-SLOW-NEXT: shrl %cl, %edi
; X86-SLOW-NEXT: shrl %cl, %eax
; X86-SLOW-NEXT: testb $32, %bl
; X86-SLOW-NEXT: jne .LBB5_1
; X86-SLOW-NEXT: # %bb.2:
; X86-SLOW-NEXT: orl %eax, %ebp
; X86-SLOW-NEXT: orl %edi, %ebp
; X86-SLOW-NEXT: jmp .LBB5_3
; X86-SLOW-NEXT: .LBB5_1:
; X86-SLOW-NEXT: movl %edi, %ebp
; X86-SLOW-NEXT: xorl %edi, %edi
; X86-SLOW-NEXT: movl %eax, %ebp
; X86-SLOW-NEXT: xorl %eax, %eax
; X86-SLOW-NEXT: .LBB5_3:
; X86-SLOW-NEXT: movb %ch, %cl
; X86-SLOW-NEXT: shll %cl, %edx
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-SLOW-NEXT: shll %cl, %edi
; X86-SLOW-NEXT: testb $32, %ch
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SLOW-NEXT: movl %eax, (%esp) # 4-byte Spill
; X86-SLOW-NEXT: jne .LBB5_4
; X86-SLOW-NEXT: # %bb.5:
; X86-SLOW-NEXT: movl %edx, (%esp) # 4-byte Spill
; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SLOW-NEXT: orl %esi, %ecx
; X86-SLOW-NEXT: jmp .LBB5_6
; X86-SLOW-NEXT: .LBB5_4:
; X86-SLOW-NEXT: movl %edx, %ecx
; X86-SLOW-NEXT: movl $0, (%esp) # 4-byte Folded Spill
; X86-SLOW-NEXT: movl %edi, %ecx
; X86-SLOW-NEXT: xorl %edi, %edi
; X86-SLOW-NEXT: .LBB5_6:
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SLOW-NEXT: testl %ebx, %ebx
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SLOW-NEXT: je .LBB5_8
; X86-SLOW-NEXT: # %bb.7:
; X86-SLOW-NEXT: movl (%esp), %eax # 4-byte Reload
; X86-SLOW-NEXT: orl %ebp, %eax
; X86-SLOW-NEXT: orl %edi, %ecx
; X86-SLOW-NEXT: orl %ebp, %edi
; X86-SLOW-NEXT: orl (%esp), %ecx # 4-byte Folded Reload
; X86-SLOW-NEXT: movl %edi, %eax
; X86-SLOW-NEXT: movl %ecx, %edx
; X86-SLOW-NEXT: .LBB5_8:
; X86-SLOW-NEXT: addl $8, %esp
@@ -310,8 +306,7 @@ define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind {
 ; X64-SLOW-NEXT: movq %rdx, %rcx
 ; X64-SLOW-NEXT: shrq %cl, %rsi
 ; X64-SLOW-NEXT: leaq (%rdi,%rdi), %rax
-; X64-SLOW-NEXT: andb $63, %cl
-; X64-SLOW-NEXT: xorb $63, %cl
+; X64-SLOW-NEXT: notb %cl
 ; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $rcx
 ; X64-SLOW-NEXT: shlq %cl, %rax
 ; X64-SLOW-NEXT: orq %rsi, %rax