[PowerPC] Combine 64-bit bswap(load) without LDBRX

When targeting CPUs that don't have LDBRX, we end up producing code that is
very inefficient and large for this common idiom. This patch just
optimizes it to two 32-bit LWBRX instructions along with a merge.

This fixes https://bugs.llvm.org/show_bug.cgi?id=49610

Differential revision: https://reviews.llvm.org/D104836
This commit is contained in:
Nemanja Ivanovic 2021-06-24 14:44:17 -05:00
parent 8e0ff44bf8
commit 0464586ac5
3 changed files with 96 additions and 19 deletions

View File

@ -15202,13 +15202,17 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
}
}
break;
case ISD::BSWAP:
case ISD::BSWAP: {
// Turn BSWAP (LOAD) -> lhbrx/lwbrx.
if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) &&
N->getOperand(0).hasOneUse() &&
// For subtargets without LDBRX, we can still do better than the default
// expansion even for 64-bit BSWAP (LOAD).
bool Is64BitBswapOn64BitTgt =
Subtarget.isPPC64() && N->getValueType(0) == MVT::i64;
bool IsSingleUseNormalLd = ISD::isNormalLoad(N->getOperand(0).getNode()) &&
N->getOperand(0).hasOneUse();
if (IsSingleUseNormalLd &&
(N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i16 ||
(Subtarget.hasLDBRX() && Subtarget.isPPC64() &&
N->getValueType(0) == MVT::i64))) {
(Subtarget.hasLDBRX() && Is64BitBswapOn64BitTgt))) {
SDValue Load = N->getOperand(0);
LoadSDNode *LD = cast<LoadSDNode>(Load);
// Create the byte-swapping load.
@ -15239,7 +15243,32 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
// Return N so it doesn't get rechecked!
return SDValue(N, 0);
}
break;
// Convert this to two 32-bit bswap loads and a BUILD_PAIR. Do this only
// before legalization so that the BUILD_PAIR is handled correctly.
if (!DCI.isBeforeLegalize() || !Is64BitBswapOn64BitTgt ||
!IsSingleUseNormalLd)
return SDValue();
LoadSDNode *LD = cast<LoadSDNode>(N->getOperand(0));
// Can't split volatile or atomic loads.
if (!LD->isSimple())
return SDValue();
SDValue BasePtr = LD->getBasePtr();
SDValue Lo = DAG.getLoad(MVT::i32, dl, LD->getChain(), BasePtr,
LD->getPointerInfo(), LD->getAlignment());
Lo = DAG.getNode(ISD::BSWAP, dl, MVT::i32, Lo);
BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
DAG.getIntPtrConstant(4, dl));
SDValue Hi = DAG.getLoad(MVT::i32, dl, LD->getChain(), BasePtr,
LD->getPointerInfo(), LD->getAlignment());
Hi = DAG.getNode(ISD::BSWAP, dl, MVT::i32, Hi);
SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Hi, Lo);
SDValue TF =
DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
Hi.getOperand(0).getValue(1), Lo.getOperand(0).getValue(1));
DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), TF);
return Res;
}
case PPCISD::VCMP:
// If a VCMP_rec node already exists with exactly the same operands as this
// node, use its result instead of this node (VCMP_rec computes both a CR6

View File

@ -101,6 +101,8 @@ define i16 @LHBRX(i8* %ptr, i32 %off) {
ret i16 %tmp6
}
; TODO: combine the bswap feeding a store on subtargets
; that do not have an STDBRX.
define void @STDBRX(i64 %i, i8* %ptr, i64 %off) {
; PWR7_32-LABEL: STDBRX:
; PWR7_32: # %bb.0:
@ -149,19 +151,11 @@ define i64 @LDBRX(i8* %ptr, i64 %off) {
;
; X64-LABEL: LDBRX:
; X64: # %bb.0:
; X64-NEXT: ldx r4, r3, r4
; X64-NEXT: rotldi r5, r4, 16
; X64-NEXT: rotldi r3, r4, 8
; X64-NEXT: rldimi r3, r5, 8, 48
; X64-NEXT: rotldi r5, r4, 24
; X64-NEXT: rldimi r3, r5, 16, 40
; X64-NEXT: rotldi r5, r4, 32
; X64-NEXT: rldimi r3, r5, 24, 32
; X64-NEXT: rotldi r5, r4, 48
; X64-NEXT: rldimi r3, r5, 40, 16
; X64-NEXT: rotldi r5, r4, 56
; X64-NEXT: rldimi r3, r5, 48, 8
; X64-NEXT: rldimi r3, r4, 56, 0
; X64-NEXT: li r5, 4
; X64-NEXT: lwbrx r6, r3, r4
; X64-NEXT: add r3, r3, r4
; X64-NEXT: lwbrx r3, r3, r5
; X64-NEXT: rldimi r3, r6, 32, 0
; X64-NEXT: blr
;
; PWR7_64-LABEL: LDBRX:

View File

@ -0,0 +1,54 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=powerpc64-- -mcpu=pwr5 -verify-machineinstrs < %s | \
; RUN: FileCheck %s
; Non-volatile, 8-byte-aligned i64 load whose only use is llvm.bswap.i64, with
; the swapped value stored back through the same pointer. With -mcpu=pwr5 the
; checks expect two 32-bit byte-reversed loads (lwbrx at offsets 0 and 4)
; merged by a single rldimi, followed by the 64-bit store.
; NOTE(review): the RUN line uses the powerpc64-- triple (presumably
; big-endian), and these checks place the word loaded from offset 0 into the
; upper half of the result. For a big-endian 64-bit byte swap that word would
; be expected in the lower half -- confirm the half ordering before trusting
; these autogenerated assertions.
define void @bs(i64* %p) {
; CHECK-LABEL: bs:
; CHECK: # %bb.0:
; CHECK-NEXT: li 4, 4
; CHECK-NEXT: lwbrx 5, 0, 3
; CHECK-NEXT: lwbrx 4, 3, 4
; CHECK-NEXT: rldimi 4, 5, 32, 0
; CHECK-NEXT: std 4, 0(3)
; CHECK-NEXT: blr
%x = load i64, i64* %p, align 8
%b = call i64 @llvm.bswap.i64(i64 %x)
store i64 %b, i64* %p, align 8
ret void
}
; Negative case: the i64 load feeding llvm.bswap.i64 is volatile. The checks
; expect it to remain a single 64-bit ld followed by the rotldi/rldimi
; byte-swap sequence, i.e. the load is not split into two lwbrx loads.
define i64 @volatile_ld(i64* %p) {
; CHECK-LABEL: volatile_ld:
; CHECK: # %bb.0:
; CHECK-NEXT: ld 4, 0(3)
; CHECK-NEXT: rotldi 5, 4, 16
; CHECK-NEXT: rotldi 3, 4, 8
; CHECK-NEXT: rldimi 3, 5, 8, 48
; CHECK-NEXT: rotldi 5, 4, 24
; CHECK-NEXT: rldimi 3, 5, 16, 40
; CHECK-NEXT: rotldi 5, 4, 32
; CHECK-NEXT: rldimi 3, 5, 24, 32
; CHECK-NEXT: rotldi 5, 4, 48
; CHECK-NEXT: rldimi 3, 5, 40, 16
; CHECK-NEXT: rotldi 5, 4, 56
; CHECK-NEXT: rldimi 3, 5, 48, 8
; CHECK-NEXT: rldimi 3, 4, 56, 0
; CHECK-NEXT: blr
%x = load volatile i64, i64* %p, align 8
%b = call i64 @llvm.bswap.i64(i64 %x)
ret i64 %b
}
; Same bswap(load) pattern as a plain i64 load, but the load is only 1-byte
; aligned. The checks expect the split into two lwbrx loads plus an rldimi
; merge to happen regardless of the reduced alignment.
; NOTE(review): as written, the word loaded from offset 0 lands in the upper
; half of the returned value; on the (presumably big-endian) powerpc64--
; triple that ordering looks inverted -- confirm.
define i64 @misaligned_ld(i64* %p) {
; CHECK-LABEL: misaligned_ld:
; CHECK: # %bb.0:
; CHECK-NEXT: li 4, 4
; CHECK-NEXT: lwbrx 5, 0, 3
; CHECK-NEXT: lwbrx 3, 3, 4
; CHECK-NEXT: rldimi 3, 5, 32, 0
; CHECK-NEXT: blr
%x = load i64, i64* %p, align 1
%b = call i64 @llvm.bswap.i64(i64 %x)
ret i64 %b
}
; Declaration of the 64-bit byte-swap intrinsic called by the tests in this file.
declare i64 @llvm.bswap.i64(i64) #2