[DAGCombiner] fold shift-trunc-shift to shift-mask-trunc (2nd try)

The initial attempt (rG89633320) botched the logic by reversing
the source/dest types. Added x86 tests for additional coverage.
The vector tests show a potential improvement (folding the mask load
into the 'and' instead of broadcasting the constant), but that's a
known/existing problem.

This fold is done in IR by instcombine, and we already have a special
form of it here in DAGCombiner, but we want the more general transform
too:
https://rise4fun.com/Alive/3jZm

Name: general
Pre: (C1 + zext(C2) < 64)
%s = lshr i64 %x, C1
%t = trunc i64 %s to i16
%r = lshr i16 %t, C2
=>
%s2 = lshr i64 %x, C1 + zext(C2)
%a = and i64 %s2, zext((1 << (16 - C2)) - 1)
%r = trunc i64 %a to i16

Name: special
Pre: C1 == 48
%s = lshr i64 %x, C1
%t = trunc i64 %s to i16
%r = lshr i16 %t, C2
=>
%s2 = lshr i64 %x, C1 + zext(C2)
%r = trunc i64 %s2 to i16

...because D58017 exposes a regression without this fold.
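
A concrete instance of the general pattern, using C1 = 24 and C2 = 12 as in
the x86 scalar test below: the combined shift amount becomes 36 and the mask
becomes (1 << 4) - 1 = 15. A minimal C++ sketch (not part of the patch; the
function names are only for illustration) that checks the two forms agree:

  #include <cassert>
  #include <cstdint>

  // %r = lshr i16 (trunc i64 (lshr i64 %x, 24) to i16), 12
  static uint16_t beforeFold(uint64_t x) {
    uint16_t t = (uint16_t)(x >> 24); // trunc (lshr %x, C1) to i16
    return (uint16_t)(t >> 12);       // lshr i16 %t, C2
  }

  // %r = trunc i64 (and i64 (lshr i64 %x, 36), 15) to i16
  static uint16_t afterFold(uint64_t x) {
    uint64_t s2 = x >> (24 + 12);                       // lshr %x, C1 + C2
    uint64_t a = s2 & (((uint64_t)1 << (16 - 12)) - 1); // keep low (16 - C2) bits
    return (uint16_t)a;                                 // trunc to i16
  }

  int main() {
    for (uint64_t x : {0x0123456789abcdefULL, ~0ULL, 0ULL, 0xfedcba9876543210ULL})
      assert(beforeFold(x) == afterFold(x));
    return 0;
  }
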
Sanjay Patel 2019-12-13 09:40:33 -05:00
parent ed50e6060b
commit 2f0c7fd2db
5 changed files with 44 additions and 26 deletions

@@ -7943,6 +7943,20 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
         SDValue NewShift = DAG.getNode(ISD::SRL, DL, InnerShiftVT,
                                        InnerShift.getOperand(0), NewShiftAmt);
         return DAG.getNode(ISD::TRUNCATE, DL, VT, NewShift);
       }
+      // In the more general case, we can clear the high bits after the shift:
+      // srl (trunc (srl x, c1)), c2 --> trunc (and (srl x, (c1+c2)), Mask)
+      if (N0.hasOneUse() && InnerShift.hasOneUse() &&
+          c1 + c2 < InnerShiftSize) {
+        SDLoc DL(N);
+        SDValue NewShiftAmt = DAG.getConstant(c1 + c2, DL, ShiftAmtVT);
+        SDValue NewShift = DAG.getNode(ISD::SRL, DL, InnerShiftVT,
+                                       InnerShift.getOperand(0), NewShiftAmt);
+        SDValue Mask = DAG.getConstant(APInt::getLowBitsSet(InnerShiftSize,
+                                                            OpSizeInBits - c2),
+                                       DL, InnerShiftVT);
+        SDValue And = DAG.getNode(ISD::AND, DL, InnerShiftVT, NewShift, Mask);
+        return DAG.getNode(ISD::TRUNCATE, DL, VT, And);
+      }
     }
   }
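
The Mask above is APInt::getLowBitsSet(InnerShiftSize, OpSizeInBits - c2);
for the x86 scalar test below (i16 result, c2 = 12) that is a 64-bit value
with only the low 4 bits set, i.e. the 'andl $15' in that diff. A short
illustrative check, assuming LLVM's APInt header is available (this snippet
is not part of the patch):

  #include "llvm/ADT/APInt.h"
  #include <cassert>

  int main() {
    unsigned InnerShiftSize = 64; // i64 inner shift type
    unsigned OpSizeInBits = 16;   // i16 result type
    unsigned c2 = 12;             // outer shift amount
    llvm::APInt Mask =
        llvm::APInt::getLowBitsSet(InnerShiftSize, OpSizeInBits - c2);
    assert(Mask.getZExtValue() == 0xF); // emitted as 'andl $15' on x86
    return 0;
  }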

@@ -670,8 +670,7 @@ define i64 @reg64_lshr_by_masked_negated_unfolded_add_b(i64 %val, i64 %a, i64 %b
 define i32 @t(i64 %x) {
 ; CHECK-LABEL: t:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: lsr x8, x0, #13
-; CHECK-NEXT: ubfx x0, x8, #4, #28
+; CHECK-NEXT: ubfx x0, x0, #17, #28
 ; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
 ; CHECK-NEXT: ret
 %s = lshr i64 %x, 13

@@ -25,8 +25,7 @@ cond.false: ; preds = %entry
 define i32 @sh_trunc_sh(i64 %x) {
 ; CHECK-LABEL: sh_trunc_sh:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: rldicl 3, 3, 51, 13
-; CHECK-NEXT: srwi 3, 3, 4
+; CHECK-NEXT: rldicl 3, 3, 47, 36
 ; CHECK-NEXT: blr
 %s = lshr i64 %x, 13
 %t = trunc i64 %s to i32

@@ -1564,10 +1564,10 @@ define i16 @sh_trunc_sh(i64 %x) {
 ;
 ; X64-LABEL: sh_trunc_sh:
 ; X64: # %bb.0:
-; X64-NEXT: shrq $24, %rdi
-; X64-NEXT: movzwl %di, %eax
-; X64-NEXT: shrl $12, %eax
-; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: shrq $36, %rax
+; X64-NEXT: andl $15, %eax
+; X64-NEXT: # kill: def $ax killed $ax killed $rax
 ; X64-NEXT: retq
 %s = lshr i64 %x, 24
 %t = trunc i64 %s to i16

@@ -1399,71 +1399,77 @@ define <4 x i32> @sh_trunc_sh_vec(<4 x i64> %x) {
 ; AVX1-LABEL: sh_trunc_sh_vec:
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpsrlq $24, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlq $24, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlq $36, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlq $36, %xmm0, %xmm0
 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX1-NEXT: vpsrld $12, %xmm0, %xmm0
+; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: sh_trunc_sh_vec:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vpsrlq $24, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlq $36, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-NEXT: vpsrld $12, %xmm0, %xmm0
+; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [1048575,1048575,1048575,1048575]
+; AVX2-NEXT: vandps %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
 ; XOPAVX1-LABEL: sh_trunc_sh_vec:
 ; XOPAVX1: # %bb.0:
 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; XOPAVX1-NEXT: vpperm {{.*#+}} xmm0 = xmm0[3,4,5,6,11,12,13,14],xmm1[3,4,5,6,11,12,13,14]
-; XOPAVX1-NEXT: vpsrld $12, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpsrlq $36, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpsrlq $36, %xmm0, %xmm0
+; XOPAVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; XOPAVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
 ; XOPAVX1-NEXT: vzeroupper
 ; XOPAVX1-NEXT: retq
 ;
 ; XOPAVX2-LABEL: sh_trunc_sh_vec:
 ; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpsrlq $24, %ymm0, %ymm0
+; XOPAVX2-NEXT: vpsrlq $36, %ymm0, %ymm0
 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; XOPAVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; XOPAVX2-NEXT: vpsrld $12, %xmm0, %xmm0
+; XOPAVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [1048575,1048575,1048575,1048575]
+; XOPAVX2-NEXT: vandps %xmm1, %xmm0, %xmm0
 ; XOPAVX2-NEXT: vzeroupper
 ; XOPAVX2-NEXT: retq
 ;
 ; AVX512-LABEL: sh_trunc_sh_vec:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrlq $24, %ymm0, %ymm0
+; AVX512-NEXT: vpsrlq $36, %ymm0, %ymm0
 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: vpsrld $12, %xmm0, %xmm0
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1048575,1048575,1048575,1048575]
+; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
 ;
 ; AVX512VL-LABEL: sh_trunc_sh_vec:
 ; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsrlq $24, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsrlq $36, %ymm0, %ymm0
 ; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0
-; AVX512VL-NEXT: vpsrld $12, %xmm0, %xmm0
+; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm0, %xmm0
 ; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: retq
 ;
 ; X32-AVX1-LABEL: sh_trunc_sh_vec:
 ; X32-AVX1: # %bb.0:
 ; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; X32-AVX1-NEXT: vpsrlq $24, %xmm1, %xmm1
-; X32-AVX1-NEXT: vpsrlq $24, %xmm0, %xmm0
+; X32-AVX1-NEXT: vpsrlq $36, %xmm1, %xmm1
+; X32-AVX1-NEXT: vpsrlq $36, %xmm0, %xmm0
 ; X32-AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; X32-AVX1-NEXT: vpsrld $12, %xmm0, %xmm0
+; X32-AVX1-NEXT: vandps {{\.LCPI.*}}, %xmm0, %xmm0
 ; X32-AVX1-NEXT: vzeroupper
 ; X32-AVX1-NEXT: retl
 ;
 ; X32-AVX2-LABEL: sh_trunc_sh_vec:
 ; X32-AVX2: # %bb.0:
-; X32-AVX2-NEXT: vpsrlq $24, %ymm0, %ymm0
+; X32-AVX2-NEXT: vpsrlq $36, %ymm0, %ymm0
 ; X32-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; X32-AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; X32-AVX2-NEXT: vpsrld $12, %xmm0, %xmm0
+; X32-AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [1048575,1048575,1048575,1048575]
+; X32-AVX2-NEXT: vandps %xmm1, %xmm0, %xmm0
 ; X32-AVX2-NEXT: vzeroupper
 ; X32-AVX2-NEXT: retl
 %s = lshr <4 x i64> %x, <i64 24, i64 24, i64 24, i64 24>