From 88e205525ca3b65f9b129d87e76e6e94c7ed032f Mon Sep 17 00:00:00 2001 From: Clement Courbet Date: Fri, 22 Nov 2019 09:00:16 +0100 Subject: [PATCH] Revert "[DAGCombiner] Allow zextended load combines." Breaks some bots. --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 73 +++++-------------- .../AArch64/load-combine-big-endian.ll | 12 ++- llvm/test/CodeGen/AArch64/load-combine.ll | 12 ++- .../CodeGen/ARM/load-combine-big-endian.ll | 38 ++++++---- llvm/test/CodeGen/ARM/load-combine.ll | 36 +++++---- llvm/test/CodeGen/X86/load-combine.ll | 24 ++++-- 6 files changed, 97 insertions(+), 98 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 71ee8e141bfa..fb7ddf5b2339 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -6750,6 +6750,12 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) { return SDValue(); unsigned ByteWidth = VT.getSizeInBits() / 8; + // Before legalize we can introduce too wide illegal loads which will be later + // split into legal sized loads. This enables us to combine i64 load by i8 + // patterns to a couple of i32 loads on 32 bit targets. + if (LegalOperations && !TLI.isOperationLegal(ISD::LOAD, VT)) + return SDValue(); + bool IsBigEndianTarget = DAG.getDataLayout().isBigEndian(); auto MemoryByteOffset = [&] (ByteProvider P) { assert(P.isMemory() && "Must be a memory byte provider"); @@ -6772,21 +6778,11 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) { // Check if all the bytes of the OR we are looking at are loaded from the same // base address. Collect bytes offsets from Base address in ByteOffsets. SmallVector ByteOffsets(ByteWidth); - unsigned ZeroExtendedBytes = 0; - for (int i = ByteWidth - 1; i >= 0; --i) { + for (unsigned i = 0; i < ByteWidth; i++) { auto P = calculateByteProvider(SDValue(N, 0), i, 0, /*Root=*/true); - if (!P) + if (!P || !P->isMemory()) // All the bytes must be loaded from memory return SDValue(); - if (P->isConstantZero()) { - // It's OK for the N most significant bytes to be 0, we can just - // zero-extend the load. - if (++ZeroExtendedBytes != (ByteWidth - static_cast(i))) - return SDValue(); - continue; - } - assert(P->isMemory() && "provenance should either be memory or zero"); - LoadSDNode *L = P->Load; assert(L->hasNUsesOfValue(1, 0) && L->isSimple() && !L->isIndexed() && @@ -6825,23 +6821,9 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) { assert(Base && "Base address of the accessed memory location must be set"); assert(FirstOffset != INT64_MAX && "First byte offset must be set"); - bool NeedsZext = ZeroExtendedBytes > 0; - - EVT MemVT = - EVT::getIntegerVT(*DAG.getContext(), (ByteWidth - ZeroExtendedBytes) * 8); - - // Before legalize we can introduce too wide illegal loads which will be later - // split into legal sized loads. This enables us to combine i64 load by i8 - // patterns to a couple of i32 loads on 32 bit targets. - if (LegalOperations && - !TLI.isOperationLegal(NeedsZext ? ISD::ZEXTLOAD : ISD::NON_EXTLOAD, - MemVT)) - return SDValue(); - // Check if the bytes of the OR we are looking at match with either big or // little endian value load - Optional IsBigEndian = isBigEndian( - makeArrayRef(ByteOffsets).drop_back(ZeroExtendedBytes), FirstOffset); + Optional IsBigEndian = isBigEndian(ByteOffsets, FirstOffset); if (!IsBigEndian.hasValue()) return SDValue(); @@ -6854,8 +6836,7 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) { LoadSDNode *FirstLoad = FirstByteProvider->Load; // The node we are looking at matches with the pattern, check if we can - // replace it with a single (possibly zero-extended) load and bswap + shift if - // needed. + // replace it with a single load and bswap if needed. // If the load needs byte swap check if the target supports it bool NeedsBswap = IsBigEndianTarget != *IsBigEndian; @@ -6863,45 +6844,25 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) { // Before legalize we can introduce illegal bswaps which will be later // converted to an explicit bswap sequence. This way we end up with a single // load and byte shuffling instead of several loads and byte shuffling. - // We do not introduce illegal bswaps when zero-extending as this tends to - // introduce too many arithmetic instructions. - if (NeedsBswap && (LegalOperations || NeedsZext) && - !TLI.isOperationLegal(ISD::BSWAP, VT)) - return SDValue(); - - // If we need to bswap and zero extend, we have to insert a shift. Check that - // it is legal. - if (NeedsBswap && NeedsZext && LegalOperations && - !TLI.isOperationLegal(ISD::SHL, VT)) + if (NeedsBswap && LegalOperations && !TLI.isOperationLegal(ISD::BSWAP, VT)) return SDValue(); // Check that a load of the wide type is both allowed and fast on the target bool Fast = false; - bool Allowed = - TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT, - *FirstLoad->getMemOperand(), &Fast); + bool Allowed = TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), + VT, *FirstLoad->getMemOperand(), &Fast); if (!Allowed || !Fast) return SDValue(); - SDValue NewLoad = DAG.getExtLoad(NeedsZext ? ISD::ZEXTLOAD : ISD::NON_EXTLOAD, - SDLoc(N), VT, Chain, FirstLoad->getBasePtr(), - FirstLoad->getPointerInfo(), MemVT, - FirstLoad->getAlignment()); + SDValue NewLoad = + DAG.getLoad(VT, SDLoc(N), Chain, FirstLoad->getBasePtr(), + FirstLoad->getPointerInfo(), FirstLoad->getAlignment()); // Transfer chain users from old loads to the new load. for (LoadSDNode *L : Loads) DAG.ReplaceAllUsesOfValueWith(SDValue(L, 1), SDValue(NewLoad.getNode(), 1)); - if (!NeedsBswap) - return NewLoad; - - SDValue ShiftedLoad = - NeedsZext - ? DAG.getNode(ISD::SHL, SDLoc(N), VT, NewLoad, - DAG.getShiftAmountConstant(ZeroExtendedBytes * 8, VT, - SDLoc(N), LegalOperations)) - : NewLoad; - return DAG.getNode(ISD::BSWAP, SDLoc(N), VT, ShiftedLoad); + return NeedsBswap ? DAG.getNode(ISD::BSWAP, SDLoc(N), VT, NewLoad) : NewLoad; } // If the target has andn, bsl, or a similar bit-select instruction, diff --git a/llvm/test/CodeGen/AArch64/load-combine-big-endian.ll b/llvm/test/CodeGen/AArch64/load-combine-big-endian.ll index 19de95198c19..426bb880ed1b 100644 --- a/llvm/test/CodeGen/AArch64/load-combine-big-endian.ll +++ b/llvm/test/CodeGen/AArch64/load-combine-big-endian.ll @@ -445,9 +445,10 @@ define i32 @load_i32_by_i8_base_offset_index_2(i8* %arg, i32 %i) { define i32 @zext_load_i32_by_i8(i32* %arg) { ; CHECK-LABEL: zext_load_i32_by_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrh w8, [x0] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: rev w0, w8 +; CHECK-NEXT: ldrb w8, [x0] +; CHECK-NEXT: ldrb w9, [x0, #1] +; CHECK-NEXT: bfi w8, w9, #8, #8 +; CHECK-NEXT: mov w0, w8 ; CHECK-NEXT: ret %tmp = bitcast i32* %arg to i8* @@ -514,7 +515,10 @@ define i32 @zext_load_i32_by_i8_shl_16(i32* %arg) { define i32 @zext_load_i32_by_i8_bswap(i32* %arg) { ; CHECK-LABEL: zext_load_i32_by_i8_bswap: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrh w0, [x0] +; CHECK-NEXT: ldrb w8, [x0, #1] +; CHECK-NEXT: ldrb w9, [x0] +; CHECK-NEXT: bfi w8, w9, #8, #8 +; CHECK-NEXT: mov w0, w8 ; CHECK-NEXT: ret %tmp = bitcast i32* %arg to i8* diff --git a/llvm/test/CodeGen/AArch64/load-combine.ll b/llvm/test/CodeGen/AArch64/load-combine.ll index 066ecb21dc10..906646cda15e 100644 --- a/llvm/test/CodeGen/AArch64/load-combine.ll +++ b/llvm/test/CodeGen/AArch64/load-combine.ll @@ -431,7 +431,10 @@ define i32 @load_i32_by_i8_base_offset_index_2(i8* %arg, i32 %i) { define i32 @zext_load_i32_by_i8(i32* %arg) { ; CHECK-LABEL: zext_load_i32_by_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrh w0, [x0] +; CHECK-NEXT: ldrb w8, [x0] +; CHECK-NEXT: ldrb w9, [x0, #1] +; CHECK-NEXT: bfi w8, w9, #8, #8 +; CHECK-NEXT: mov w0, w8 ; CHECK-NEXT: ret %tmp = bitcast i32* %arg to i8* @@ -498,9 +501,10 @@ define i32 @zext_load_i32_by_i8_shl_16(i32* %arg) { define i32 @zext_load_i32_by_i8_bswap(i32* %arg) { ; CHECK-LABEL: zext_load_i32_by_i8_bswap: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrh w8, [x0] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: rev w0, w8 +; CHECK-NEXT: ldrb w8, [x0, #1] +; CHECK-NEXT: ldrb w9, [x0] +; CHECK-NEXT: bfi w8, w9, #8, #8 +; CHECK-NEXT: mov w0, w8 ; CHECK-NEXT: ret %tmp = bitcast i32* %arg to i8* diff --git a/llvm/test/CodeGen/ARM/load-combine-big-endian.ll b/llvm/test/CodeGen/ARM/load-combine-big-endian.ll index 0ed85501a7b6..d045f1f96ee3 100644 --- a/llvm/test/CodeGen/ARM/load-combine-big-endian.ll +++ b/llvm/test/CodeGen/ARM/load-combine-big-endian.ll @@ -824,23 +824,25 @@ define i32 @zext_load_i32_by_i8(i32* %arg) { ; ; CHECK-ARMv6-LABEL: zext_load_i32_by_i8: ; CHECK-ARMv6: @ %bb.0: -; CHECK-ARMv6-NEXT: ldrh r0, [r0] -; CHECK-ARMv6-NEXT: lsl r0, r0, #16 -; CHECK-ARMv6-NEXT: rev r0, r0 +; CHECK-ARMv6-NEXT: ldrb r1, [r0] +; CHECK-ARMv6-NEXT: ldrb r0, [r0, #1] +; CHECK-ARMv6-NEXT: orr r0, r1, r0, lsl #8 ; CHECK-ARMv6-NEXT: bx lr ; ; CHECK-THUMBv6-LABEL: zext_load_i32_by_i8: ; CHECK-THUMBv6: @ %bb.0: -; CHECK-THUMBv6-NEXT: ldrh r0, [r0] -; CHECK-THUMBv6-NEXT: lsls r0, r0, #16 -; CHECK-THUMBv6-NEXT: rev r0, r0 +; CHECK-THUMBv6-NEXT: ldrb r1, [r0] +; CHECK-THUMBv6-NEXT: ldrb r0, [r0, #1] +; CHECK-THUMBv6-NEXT: lsls r0, r0, #8 +; CHECK-THUMBv6-NEXT: adds r0, r0, r1 ; CHECK-THUMBv6-NEXT: bx lr ; ; CHECK-THUMBv7-LABEL: zext_load_i32_by_i8: ; CHECK-THUMBv7: @ %bb.0: -; CHECK-THUMBv7-NEXT: ldrh r0, [r0] -; CHECK-THUMBv7-NEXT: lsls r0, r0, #16 -; CHECK-THUMBv7-NEXT: rev r0, r0 +; CHECK-THUMBv7-NEXT: ldrb r1, [r0] +; CHECK-THUMBv7-NEXT: ldrb r0, [r0, #1] +; CHECK-THUMBv7-NEXT: lsls r0, r0, #8 +; CHECK-THUMBv7-NEXT: adds r0, r0, r1 ; CHECK-THUMBv7-NEXT: bx lr %tmp = bitcast i32* %arg to i8* @@ -960,22 +962,32 @@ define i32 @zext_load_i32_by_i8_shl_16(i32* %arg) { define i32 @zext_load_i32_by_i8_bswap(i32* %arg) { ; CHECK-LABEL: zext_load_i32_by_i8_bswap: ; CHECK: @ %bb.0: -; CHECK-NEXT: ldrh r0, [r0] +; CHECK-NEXT: ldrb r1, [r0] +; CHECK-NEXT: ldrb r0, [r0, #1] +; CHECK-NEXT: orr r0, r0, r1, lsl #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: zext_load_i32_by_i8_bswap: ; CHECK-ARMv6: @ %bb.0: -; CHECK-ARMv6-NEXT: ldrh r0, [r0] +; CHECK-ARMv6-NEXT: ldrb r1, [r0] +; CHECK-ARMv6-NEXT: ldrb r0, [r0, #1] +; CHECK-ARMv6-NEXT: orr r0, r0, r1, lsl #8 ; CHECK-ARMv6-NEXT: bx lr ; ; CHECK-THUMBv6-LABEL: zext_load_i32_by_i8_bswap: ; CHECK-THUMBv6: @ %bb.0: -; CHECK-THUMBv6-NEXT: ldrh r0, [r0] +; CHECK-THUMBv6-NEXT: ldrb r1, [r0, #1] +; CHECK-THUMBv6-NEXT: ldrb r0, [r0] +; CHECK-THUMBv6-NEXT: lsls r0, r0, #8 +; CHECK-THUMBv6-NEXT: adds r0, r0, r1 ; CHECK-THUMBv6-NEXT: bx lr ; ; CHECK-THUMBv7-LABEL: zext_load_i32_by_i8_bswap: ; CHECK-THUMBv7: @ %bb.0: -; CHECK-THUMBv7-NEXT: ldrh r0, [r0] +; CHECK-THUMBv7-NEXT: ldrb r1, [r0, #1] +; CHECK-THUMBv7-NEXT: ldrb r0, [r0] +; CHECK-THUMBv7-NEXT: lsls r0, r0, #8 +; CHECK-THUMBv7-NEXT: adds r0, r0, r1 ; CHECK-THUMBv7-NEXT: bx lr %tmp = bitcast i32* %arg to i8* diff --git a/llvm/test/CodeGen/ARM/load-combine.ll b/llvm/test/CodeGen/ARM/load-combine.ll index bf03898c891d..d173a098b9bf 100644 --- a/llvm/test/CodeGen/ARM/load-combine.ll +++ b/llvm/test/CodeGen/ARM/load-combine.ll @@ -734,22 +734,31 @@ define i32 @load_i32_by_i8_base_offset_index_2(i8* %arg, i32 %i) { define i32 @zext_load_i32_by_i8(i32* %arg) { ; CHECK-LABEL: zext_load_i32_by_i8: ; CHECK: @ %bb.0: -; CHECK-NEXT: ldrh r0, [r0] +; CHECK-NEXT: ldrb r1, [r0] +; CHECK-NEXT: ldrb r0, [r0, #1] +; CHECK-NEXT: orr r0, r1, r0, lsl #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: zext_load_i32_by_i8: ; CHECK-ARMv6: @ %bb.0: -; CHECK-ARMv6-NEXT: ldrh r0, [r0] +; CHECK-ARMv6-NEXT: ldrb r1, [r0] +; CHECK-ARMv6-NEXT: ldrb r0, [r0, #1] +; CHECK-ARMv6-NEXT: orr r0, r1, r0, lsl #8 ; CHECK-ARMv6-NEXT: bx lr ; ; CHECK-THUMBv6-LABEL: zext_load_i32_by_i8: ; CHECK-THUMBv6: @ %bb.0: -; CHECK-THUMBv6-NEXT: ldrh r0, [r0] +; CHECK-THUMBv6-NEXT: ldrb r1, [r0] +; CHECK-THUMBv6-NEXT: ldrb r0, [r0, #1] +; CHECK-THUMBv6-NEXT: lsls r0, r0, #8 +; CHECK-THUMBv6-NEXT: adds r0, r0, r1 ; CHECK-THUMBv6-NEXT: bx lr ; ; CHECK-THUMBv7-LABEL: zext_load_i32_by_i8: ; CHECK-THUMBv7: @ %bb.0: -; CHECK-THUMBv7-NEXT: ldrh r0, [r0] +; CHECK-THUMBv7-NEXT: ldrb r1, [r0] +; CHECK-THUMBv7-NEXT: ldrb r0, [r0, #1] +; CHECK-THUMBv7-NEXT: orr.w r0, r1, r0, lsl #8 ; CHECK-THUMBv7-NEXT: bx lr %tmp = bitcast i32* %arg to i8* @@ -874,23 +883,24 @@ define i32 @zext_load_i32_by_i8_bswap(i32* %arg) { ; ; CHECK-ARMv6-LABEL: zext_load_i32_by_i8_bswap: ; CHECK-ARMv6: @ %bb.0: -; CHECK-ARMv6-NEXT: ldrh r0, [r0] -; CHECK-ARMv6-NEXT: lsl r0, r0, #16 -; CHECK-ARMv6-NEXT: rev r0, r0 +; CHECK-ARMv6-NEXT: ldrb r1, [r0] +; CHECK-ARMv6-NEXT: ldrb r0, [r0, #1] +; CHECK-ARMv6-NEXT: orr r0, r0, r1, lsl #8 ; CHECK-ARMv6-NEXT: bx lr ; ; CHECK-THUMBv6-LABEL: zext_load_i32_by_i8_bswap: ; CHECK-THUMBv6: @ %bb.0: -; CHECK-THUMBv6-NEXT: ldrh r0, [r0] -; CHECK-THUMBv6-NEXT: lsls r0, r0, #16 -; CHECK-THUMBv6-NEXT: rev r0, r0 +; CHECK-THUMBv6-NEXT: ldrb r1, [r0, #1] +; CHECK-THUMBv6-NEXT: ldrb r0, [r0] +; CHECK-THUMBv6-NEXT: lsls r0, r0, #8 +; CHECK-THUMBv6-NEXT: adds r0, r0, r1 ; CHECK-THUMBv6-NEXT: bx lr ; ; CHECK-THUMBv7-LABEL: zext_load_i32_by_i8_bswap: ; CHECK-THUMBv7: @ %bb.0: -; CHECK-THUMBv7-NEXT: ldrh r0, [r0] -; CHECK-THUMBv7-NEXT: lsls r0, r0, #16 -; CHECK-THUMBv7-NEXT: rev r0, r0 +; CHECK-THUMBv7-NEXT: ldrb r1, [r0] +; CHECK-THUMBv7-NEXT: ldrb r0, [r0, #1] +; CHECK-THUMBv7-NEXT: orr.w r0, r0, r1, lsl #8 ; CHECK-THUMBv7-NEXT: bx lr %tmp = bitcast i32* %arg to i8* diff --git a/llvm/test/CodeGen/X86/load-combine.ll b/llvm/test/CodeGen/X86/load-combine.ll index 5184e99d0180..1d08ee065315 100644 --- a/llvm/test/CodeGen/X86/load-combine.ll +++ b/llvm/test/CodeGen/X86/load-combine.ll @@ -1119,12 +1119,18 @@ define i32 @zext_load_i32_by_i8(i32* %arg) { ; CHECK-LABEL: zext_load_i32_by_i8: ; CHECK: # %bb.0: ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movzwl (%eax), %eax +; CHECK-NEXT: movzbl (%eax), %ecx +; CHECK-NEXT: movzbl 1(%eax), %eax +; CHECK-NEXT: shll $8, %eax +; CHECK-NEXT: orl %ecx, %eax ; CHECK-NEXT: retl ; ; CHECK64-LABEL: zext_load_i32_by_i8: ; CHECK64: # %bb.0: -; CHECK64-NEXT: movzwl (%rdi), %eax +; CHECK64-NEXT: movzbl (%rdi), %ecx +; CHECK64-NEXT: movzbl 1(%rdi), %eax +; CHECK64-NEXT: shll $8, %eax +; CHECK64-NEXT: orl %ecx, %eax ; CHECK64-NEXT: retq %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0 @@ -1212,16 +1218,18 @@ define i32 @zext_load_i32_by_i8_bswap(i32* %arg) { ; CHECK-LABEL: zext_load_i32_by_i8_bswap: ; CHECK: # %bb.0: ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movzwl (%eax), %eax -; CHECK-NEXT: shll $16, %eax -; CHECK-NEXT: bswapl %eax +; CHECK-NEXT: movzbl 1(%eax), %ecx +; CHECK-NEXT: movzbl (%eax), %eax +; CHECK-NEXT: shll $8, %eax +; CHECK-NEXT: orl %ecx, %eax ; CHECK-NEXT: retl ; ; CHECK64-LABEL: zext_load_i32_by_i8_bswap: ; CHECK64: # %bb.0: -; CHECK64-NEXT: movzwl (%rdi), %eax -; CHECK64-NEXT: shll $16, %eax -; CHECK64-NEXT: bswapl %eax +; CHECK64-NEXT: movzbl 1(%rdi), %ecx +; CHECK64-NEXT: movzbl (%rdi), %eax +; CHECK64-NEXT: shll $8, %eax +; CHECK64-NEXT: orl %ecx, %eax ; CHECK64-NEXT: retq %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1