From e22cf4d7cb2e33d05d564932342023ec376a88fc Mon Sep 17 00:00:00 2001 From: Clement Courbet Date: Thu, 20 Dec 2018 09:58:33 +0000 Subject: [PATCH] Revert r349731 "[CodeGen][ExpandMemcmp] Add an option for allowing overlapping loads." Forgot to update PowerPC tests for the GEP->bitcast change. llvm-svn: 349733 --- .../llvm/Analysis/TargetTransformInfo.h | 8 +- llvm/lib/CodeGen/ExpandMemCmp.cpp | 235 ++++------ .../lib/Target/X86/X86TargetTransformInfo.cpp | 5 +- llvm/test/CodeGen/X86/memcmp-optsize.ll | 73 +-- llvm/test/CodeGen/X86/memcmp.ll | 161 +++---- .../Transforms/ExpandMemCmp/X86/memcmp.ll | 424 ++++-------------- 6 files changed, 266 insertions(+), 640 deletions(-) diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 87e3fabcf51a..6ddea8bbdce0 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -581,17 +581,13 @@ public: struct MemCmpExpansionOptions { // The list of available load sizes (in bytes), sorted in decreasing order. SmallVector LoadSizes; - // Set to true to allow overlapping loads. For example, 7-byte compares can - // be done with two 4-byte compares instead of 4+2+1-byte compares. This - // requires all loads in LoadSizes to be doable in an unaligned way. - bool AllowOverlappingLoads = false; }; const MemCmpExpansionOptions *enableMemCmpExpansion(bool IsZeroCmp) const; /// Enable matching of interleaved access groups. bool enableInterleavedAccessVectorization() const; - /// Enable matching of interleaved access groups that contain predicated + /// Enable matching of interleaved access groups that contain predicated /// accesses or gaps and therefore vectorized using masked /// vector loads/stores. bool enableMaskedInterleavedAccessVectorization() const; @@ -776,7 +772,7 @@ public: /// \return The cost of a shuffle instruction of kind Kind and of type Tp. /// The index and subtype parameters are used by the subvector insertion and /// extraction shuffle kinds to show the insert/extract point and the type of - /// the subvector being inserted/extracted. + /// the subvector being inserted/extracted. /// NOTE: For subvector extractions Tp represents the source type. int getShuffleCost(ShuffleKind Kind, Type *Tp, int Index = 0, Type *SubTp = nullptr) const; diff --git a/llvm/lib/CodeGen/ExpandMemCmp.cpp b/llvm/lib/CodeGen/ExpandMemCmp.cpp index ee7683adbcdd..d7562cbf1e90 100644 --- a/llvm/lib/CodeGen/ExpandMemCmp.cpp +++ b/llvm/lib/CodeGen/ExpandMemCmp.cpp @@ -66,18 +66,23 @@ class MemCmpExpansion { // Represents the decomposition in blocks of the expansion. For example, // comparing 33 bytes on X86+sse can be done with 2x16-byte loads and // 1x1-byte load, which would be represented as [{16, 0}, {16, 16}, {32, 1}. + // TODO(courbet): Involve the target more in this computation. On X86, 7 + // bytes can be done more efficiently with two overlaping 4-byte loads than + // covering the interval with [{4, 0},{2, 4},{1, 6}}. struct LoadEntry { LoadEntry(unsigned LoadSize, uint64_t Offset) : LoadSize(LoadSize), Offset(Offset) { + assert(Offset % LoadSize == 0 && "invalid load entry"); } + uint64_t getGEPIndex() const { return Offset / LoadSize; } + // The size of the load for this block, in bytes. - unsigned LoadSize; - // The offset of this load from the base pointer, in bytes. - uint64_t Offset; + const unsigned LoadSize; + // The offset of this load WRT the base pointer, in bytes. 
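For orientation while reading this hunk: both before and after the revert, the expansion first decomposes the compared size greedily, largest allowed load first. Below is a minimal standalone sketch of that decomposition, not the LLVM code itself; names are illustrative, and it assumes LoadSizes is sorted in decreasing order and ends with 1 so the whole size gets covered.

#include <cstdint>
#include <vector>

struct LoadEntry {
  unsigned LoadSize; // size of this load, in bytes
  uint64_t Offset;   // byte offset of this load from the base pointer
};

// Cover `Size` bytes with the largest allowed loads first. An empty result
// means "too many loads"; the caller then keeps the memcmp libcall.
std::vector<LoadEntry> greedyLoadSequence(uint64_t Size,
                                          const std::vector<unsigned> &LoadSizes,
                                          unsigned MaxNumLoads) {
  std::vector<LoadEntry> Seq;
  uint64_t Offset = 0;
  for (unsigned LoadSize : LoadSizes) {
    const uint64_t NumLoads = Size / LoadSize;
    if (Seq.size() + NumLoads > MaxNumLoads)
      return {};
    for (uint64_t I = 0; I < NumLoads; ++I) {
      Seq.push_back({LoadSize, Offset});
      Offset += LoadSize;
    }
    Size %= LoadSize;
  }
  return Seq;
}

With load sizes {4, 2, 1}, a 7-byte compare decomposes into {4, 0}, {2, 4}, {1, 6}: three loads per pointer, which is exactly the case the TODO above wants to improve.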
+ const uint64_t Offset; }; - using LoadEntryVector = SmallVector; - LoadEntryVector LoadSequence; + SmallVector LoadSequence; void createLoadCmpBlocks(); void createResultBlock(); @@ -87,23 +92,13 @@ class MemCmpExpansion { void emitLoadCompareBlock(unsigned BlockIndex); void emitLoadCompareBlockMultipleLoads(unsigned BlockIndex, unsigned &LoadIndex); - void emitLoadCompareByteBlock(unsigned BlockIndex, unsigned OffsetBytes); + void emitLoadCompareByteBlock(unsigned BlockIndex, unsigned GEPIndex); void emitMemCmpResultBlock(); Value *getMemCmpExpansionZeroCase(); Value *getMemCmpEqZeroOneBlock(); Value *getMemCmpOneBlock(); - Value *getPtrToElementAtOffset(Value *Source, Type *LoadSizeType, - uint64_t OffsetBytes); - static LoadEntryVector - computeGreedyLoadSequence(uint64_t Size, llvm::ArrayRef LoadSizes, - unsigned MaxNumLoads, unsigned &NumLoadsNonOneByte); - static LoadEntryVector - computeOverlappingLoadSequence(uint64_t Size, unsigned MaxLoadSize, - unsigned MaxNumLoads, - unsigned &NumLoadsNonOneByte); - -public: + public: MemCmpExpansion(CallInst *CI, uint64_t Size, const TargetTransformInfo::MemCmpExpansionOptions &Options, unsigned MaxNumLoads, const bool IsUsedForZeroCmp, @@ -115,76 +110,6 @@ public: Value *getMemCmpExpansion(); }; -MemCmpExpansion::LoadEntryVector MemCmpExpansion::computeGreedyLoadSequence( - uint64_t Size, llvm::ArrayRef LoadSizes, - const unsigned MaxNumLoads, unsigned &NumLoadsNonOneByte) { - NumLoadsNonOneByte = 0; - LoadEntryVector LoadSequence; - uint64_t Offset = 0; - while (Size && !LoadSizes.empty()) { - const unsigned LoadSize = LoadSizes.front(); - const uint64_t NumLoadsForThisSize = Size / LoadSize; - if (LoadSequence.size() + NumLoadsForThisSize > MaxNumLoads) { - // Do not expand if the total number of loads is larger than what the - // target allows. Note that it's important that we exit before completing - // the expansion to avoid using a ton of memory to store the expansion for - // large sizes. - return {}; - } - if (NumLoadsForThisSize > 0) { - for (uint64_t I = 0; I < NumLoadsForThisSize; ++I) { - LoadSequence.push_back({LoadSize, Offset}); - Offset += LoadSize; - } - if (LoadSize > 1) - ++NumLoadsNonOneByte; - Size = Size % LoadSize; - } - LoadSizes = LoadSizes.drop_front(); - } - return LoadSequence; -} - -MemCmpExpansion::LoadEntryVector -MemCmpExpansion::computeOverlappingLoadSequence(uint64_t Size, - const unsigned MaxLoadSize, - const unsigned MaxNumLoads, - unsigned &NumLoadsNonOneByte) { - // These are already handled by the greedy approach. - if (Size < 2 || MaxLoadSize < 2) - return {}; - - // We try to do as many non-overlapping loads as possible starting from the - // beginning. - const uint64_t NumNonOverlappingLoads = Size / MaxLoadSize; - assert(NumNonOverlappingLoads && "there must be at least one load"); - // There remain 0 to (MaxLoadSize - 1) bytes to load, this will be done with - // an overlapping load. - Size = Size - NumNonOverlappingLoads * MaxLoadSize; - // Bail if we do not need an overloapping store, this is already handled by - // the greedy approach. - if (Size == 0) - return {}; - // Bail if the number of loads (non-overlapping + potential overlapping one) - // is larger than the max allowed. - if ((NumNonOverlappingLoads + 1) > MaxNumLoads) - return {}; - - // Add non-overlapping loads. 
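The helper being deleted around this point computed the alternative covering: use only full-width loads and let the last one overlap its predecessor so it ends exactly at the last byte. A standalone sketch under the same illustrative types and assumptions as the sketch above:

// The final load is shifted left so it ends exactly at byte `Size`,
// overlapping the previous load by (MaxLoadSize - Size % MaxLoadSize) bytes.
std::vector<LoadEntry> overlappingLoadSequence(uint64_t Size,
                                               unsigned MaxLoadSize,
                                               unsigned MaxNumLoads) {
  if (Size < 2 || MaxLoadSize < 2)
    return {}; // already handled optimally by the greedy scheme
  const uint64_t NumFull = Size / MaxLoadSize;
  if (NumFull == 0)
    return {}; // the caller scales MaxLoadSize down to at most Size
  const uint64_t Rem = Size - NumFull * MaxLoadSize;
  if (Rem == 0 || NumFull + 1 > MaxNumLoads)
    return {}; // no overlapping tail needed, or too many loads
  std::vector<LoadEntry> Seq;
  for (uint64_t I = 0; I < NumFull; ++I)
    Seq.push_back({MaxLoadSize, I * MaxLoadSize});
  Seq.push_back({MaxLoadSize, Size - MaxLoadSize}); // the overlapping tail
  return Seq;
}

For a 7-byte compare with MaxLoadSize 4 this yields {4, 0}, {4, 3}: two unaligned loads per pointer instead of the greedy three, which is why the option required every size in LoadSizes to be loadable unaligned.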
- LoadEntryVector LoadSequence; - uint64_t Offset = 0; - for (uint64_t I = 0; I < NumNonOverlappingLoads; ++I) { - LoadSequence.push_back({MaxLoadSize, Offset}); - Offset += MaxLoadSize; - } - - // Add the last overlapping load. - assert(Size > 0 && Size < MaxLoadSize && "broken invariant"); - LoadSequence.push_back({MaxLoadSize, Offset - (MaxLoadSize - Size)}); - NumLoadsNonOneByte = 1; - return LoadSequence; -} - // Initialize the basic block structure required for expansion of memcmp call // with given maximum load size and memcmp size parameter. // This structure includes: @@ -208,31 +133,38 @@ MemCmpExpansion::MemCmpExpansion( Builder(CI) { assert(Size > 0 && "zero blocks"); // Scale the max size down if the target can load more bytes than we need. - llvm::ArrayRef LoadSizes(Options.LoadSizes); - while (!LoadSizes.empty() && LoadSizes.front() > Size) { - LoadSizes = LoadSizes.drop_front(); + size_t LoadSizeIndex = 0; + while (LoadSizeIndex < Options.LoadSizes.size() && + Options.LoadSizes[LoadSizeIndex] > Size) { + ++LoadSizeIndex; } - assert(!LoadSizes.empty() && "cannot load Size bytes"); - MaxLoadSize = LoadSizes.front(); + this->MaxLoadSize = Options.LoadSizes[LoadSizeIndex]; // Compute the decomposition. - unsigned GreedyNumLoadsNonOneByte = 0; - LoadSequence = computeGreedyLoadSequence(Size, LoadSizes, MaxNumLoads, - GreedyNumLoadsNonOneByte); - NumLoadsNonOneByte = GreedyNumLoadsNonOneByte; - assert(LoadSequence.size() <= MaxNumLoads && "broken invariant"); - // If we allow overlapping loads and the load sequence is not already optimal, - // use overlapping loads. - if (Options.AllowOverlappingLoads && - (LoadSequence.empty() || LoadSequence.size() > 2)) { - unsigned OverlappingNumLoadsNonOneByte = 0; - auto OverlappingLoads = computeOverlappingLoadSequence( - Size, MaxLoadSize, MaxNumLoads, OverlappingNumLoadsNonOneByte); - if (!OverlappingLoads.empty() && - (LoadSequence.empty() || - OverlappingLoads.size() < LoadSequence.size())) { - LoadSequence = OverlappingLoads; - NumLoadsNonOneByte = OverlappingNumLoadsNonOneByte; + uint64_t CurSize = Size; + uint64_t Offset = 0; + while (CurSize && LoadSizeIndex < Options.LoadSizes.size()) { + const unsigned LoadSize = Options.LoadSizes[LoadSizeIndex]; + assert(LoadSize > 0 && "zero load size"); + const uint64_t NumLoadsForThisSize = CurSize / LoadSize; + if (LoadSequence.size() + NumLoadsForThisSize > MaxNumLoads) { + // Do not expand if the total number of loads is larger than what the + // target allows. Note that it's important that we exit before completing + // the expansion to avoid using a ton of memory to store the expansion for + // large sizes. + LoadSequence.clear(); + return; } + if (NumLoadsForThisSize > 0) { + for (uint64_t I = 0; I < NumLoadsForThisSize; ++I) { + LoadSequence.push_back({LoadSize, Offset}); + Offset += LoadSize; + } + if (LoadSize > 1) { + ++NumLoadsNonOneByte; + } + CurSize = CurSize % LoadSize; + } + ++LoadSizeIndex; } assert(LoadSequence.size() <= MaxNumLoads && "broken invariant"); } @@ -257,32 +189,30 @@ void MemCmpExpansion::createResultBlock() { EndBlock->getParent(), EndBlock); } -/// Return a pointer to an element of type `LoadSizeType` at offset -/// `OffsetBytes`. 
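The constructor above, restored here to its inline greedy loop, is where the reverted change compared the two candidate sequences and kept the shorter one. A hedged sketch of that choice, reusing the illustrative helpers from the two sketches above:

// Prefer the overlapping sequence only when overlap is allowed, the greedy
// result is missing or longer than two loads, and overlap is strictly shorter.
std::vector<LoadEntry> chooseLoadSequence(uint64_t Size,
                                          const std::vector<unsigned> &LoadSizes,
                                          unsigned MaxLoadSize,
                                          unsigned MaxNumLoads,
                                          bool AllowOverlappingLoads) {
  std::vector<LoadEntry> Seq =
      greedyLoadSequence(Size, LoadSizes, MaxNumLoads);
  if (AllowOverlappingLoads && (Seq.empty() || Seq.size() > 2)) {
    std::vector<LoadEntry> Ovl =
        overlappingLoadSequence(Size, MaxLoadSize, MaxNumLoads);
    if (!Ovl.empty() && (Seq.empty() || Ovl.size() < Seq.size()))
      Seq = Ovl;
  }
  return Seq;
}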
-Value *MemCmpExpansion::getPtrToElementAtOffset(Value *Source, - Type *LoadSizeType, - uint64_t OffsetBytes) { - if (OffsetBytes > 0) { - auto *ByteType = Type::getInt8Ty(CI->getContext()); - Source = Builder.CreateGEP( - ByteType, Builder.CreateBitCast(Source, ByteType->getPointerTo()), - ConstantInt::get(ByteType, OffsetBytes)); - } - return Builder.CreateBitCast(Source, LoadSizeType->getPointerTo()); -} - // This function creates the IR instructions for loading and comparing 1 byte. // It loads 1 byte from each source of the memcmp parameters with the given // GEPIndex. It then subtracts the two loaded values and adds this result to the // final phi node for selecting the memcmp result. void MemCmpExpansion::emitLoadCompareByteBlock(unsigned BlockIndex, - unsigned OffsetBytes) { + unsigned GEPIndex) { + Value *Source1 = CI->getArgOperand(0); + Value *Source2 = CI->getArgOperand(1); + Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]); Type *LoadSizeType = Type::getInt8Ty(CI->getContext()); - Value *Source1 = - getPtrToElementAtOffset(CI->getArgOperand(0), LoadSizeType, OffsetBytes); - Value *Source2 = - getPtrToElementAtOffset(CI->getArgOperand(1), LoadSizeType, OffsetBytes); + // Cast source to LoadSizeType*. + if (Source1->getType() != LoadSizeType) + Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); + if (Source2->getType() != LoadSizeType) + Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); + + // Get the base address using the GEPIndex. + if (GEPIndex != 0) { + Source1 = Builder.CreateGEP(LoadSizeType, Source1, + ConstantInt::get(LoadSizeType, GEPIndex)); + Source2 = Builder.CreateGEP(LoadSizeType, Source2, + ConstantInt::get(LoadSizeType, GEPIndex)); + } Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); @@ -340,10 +270,24 @@ Value *MemCmpExpansion::getCompareLoadPairs(unsigned BlockIndex, IntegerType *LoadSizeType = IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8); - Value *Source1 = getPtrToElementAtOffset(CI->getArgOperand(0), LoadSizeType, - CurLoadEntry.Offset); - Value *Source2 = getPtrToElementAtOffset(CI->getArgOperand(1), LoadSizeType, - CurLoadEntry.Offset); + Value *Source1 = CI->getArgOperand(0); + Value *Source2 = CI->getArgOperand(1); + + // Cast source to LoadSizeType*. + if (Source1->getType() != LoadSizeType) + Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); + if (Source2->getType() != LoadSizeType) + Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); + + // Get the base address using a GEP. + if (CurLoadEntry.Offset != 0) { + Source1 = Builder.CreateGEP( + LoadSizeType, Source1, + ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex())); + Source2 = Builder.CreateGEP( + LoadSizeType, Source2, + ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex())); + } // Get a constant or load a value for each source address. 
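These hunks swap the byte-offset addressing of the reverted change back to the original GEP-index addressing. Roughly, in IRBuilder terms, the two forms look as follows; this is a sketch using only calls that appear in this diff, and details vary across LLVM versions.

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Reverted form: advance the pointer in i8 units, so any byte offset is
// legal. This is what overlapping loads need, e.g. offset 3 for an i32 load.
Value *byteOffsetAddr(IRBuilder<> &B, Value *Src, Type *LoadTy,
                      uint64_t OffsetBytes) {
  if (OffsetBytes > 0) {
    Type *ByteTy = Type::getInt8Ty(B.getContext());
    Src = B.CreateGEP(ByteTy, B.CreateBitCast(Src, ByteTy->getPointerTo()),
                      ConstantInt::get(ByteTy, OffsetBytes));
  }
  return B.CreateBitCast(Src, LoadTy->getPointerTo());
}

// Restored form: index in LoadTy units, which only works when the offset is
// a multiple of the load size (the Offset % LoadSize == 0 assertion above).
Value *gepIndexAddr(IRBuilder<> &B, Value *Src, Type *LoadTy,
                    uint64_t GEPIndex) {
  Src = B.CreateBitCast(Src, LoadTy->getPointerTo());
  if (GEPIndex != 0)
    Src = B.CreateGEP(LoadTy, Src, ConstantInt::get(LoadTy, GEPIndex));
  return Src;
}

An overlapping 4-byte load at byte offset 3 has no integral GEP index, so the byte-offset form is a prerequisite for the overlapping-loads option being reverted here.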
Value *LoadSrc1 = nullptr; @@ -434,7 +378,8 @@ void MemCmpExpansion::emitLoadCompareBlock(unsigned BlockIndex) { const LoadEntry &CurLoadEntry = LoadSequence[BlockIndex]; if (CurLoadEntry.LoadSize == 1) { - MemCmpExpansion::emitLoadCompareByteBlock(BlockIndex, CurLoadEntry.Offset); + MemCmpExpansion::emitLoadCompareByteBlock(BlockIndex, + CurLoadEntry.getGEPIndex()); return; } @@ -443,12 +388,25 @@ void MemCmpExpansion::emitLoadCompareBlock(unsigned BlockIndex) { Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8); assert(CurLoadEntry.LoadSize <= MaxLoadSize && "Unexpected load type"); - Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]); + Value *Source1 = CI->getArgOperand(0); + Value *Source2 = CI->getArgOperand(1); - Value *Source1 = getPtrToElementAtOffset(CI->getArgOperand(0), LoadSizeType, - CurLoadEntry.Offset); - Value *Source2 = getPtrToElementAtOffset(CI->getArgOperand(1), LoadSizeType, - CurLoadEntry.Offset); + Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]); + // Cast source to LoadSizeType*. + if (Source1->getType() != LoadSizeType) + Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); + if (Source2->getType() != LoadSizeType) + Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); + + // Get the base address using a GEP. + if (CurLoadEntry.Offset != 0) { + Source1 = Builder.CreateGEP( + LoadSizeType, Source1, + ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex())); + Source2 = Builder.CreateGEP( + LoadSizeType, Source2, + ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex())); + } // Load LoadSizeType from the base address. Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); @@ -736,6 +694,7 @@ static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI, if (SizeVal == 0) { return false; } + // TTI call to check if target would like to expand memcmp. Also, get the // available load sizes. const bool IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI); diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 401ad7979809..788932215926 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -1886,7 +1886,7 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/ }; static const CostTblEntry X64CostTbl[] = { // 64-bit targets - { ISD::BITREVERSE, MVT::i64, 14 } + { ISD::BITREVERSE, MVT::i64, 14 } }; static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets { ISD::BITREVERSE, MVT::i32, 14 }, @@ -2899,9 +2899,6 @@ X86TTIImpl::enableMemCmpExpansion(bool IsZeroCmp) const { Options.LoadSizes.push_back(4); Options.LoadSizes.push_back(2); Options.LoadSizes.push_back(1); - // All GPR and vector loads can be unaligned. SIMD compare requires integer - // vectors (SSE2/AVX2). - Options.AllowOverlappingLoads = true; return Options; }(); return IsZeroCmp ? 
&EqZeroOptions : &ThreeWayOptions; diff --git a/llvm/test/CodeGen/X86/memcmp-optsize.ll b/llvm/test/CodeGen/X86/memcmp-optsize.ll index d2b390f0943a..7683d1a4b311 100644 --- a/llvm/test/CodeGen/X86/memcmp-optsize.ll +++ b/llvm/test/CodeGen/X86/memcmp-optsize.ll @@ -639,33 +639,17 @@ define i32 @length24(i8* %X, i8* %Y) nounwind optsize { } define i1 @length24_eq(i8* %x, i8* %y) nounwind optsize { -; X86-NOSSE-LABEL: length24_eq: -; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $24 -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: sete %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE2-LABEL: length24_eq: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE2-NEXT: movdqu 8(%ecx), %xmm1 -; X86-SSE2-NEXT: movdqu (%eax), %xmm2 -; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 -; X86-SSE2-NEXT: movdqu 8(%eax), %xmm0 -; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 -; X86-SSE2-NEXT: pand %xmm2, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: sete %al -; X86-SSE2-NEXT: retl +; X86-LABEL: length24_eq: +; X86: # %bb.0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $24 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl ; ; X64-SSE2-LABEL: length24_eq: ; X64-SSE2: # %bb.0: @@ -699,30 +683,17 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind optsize { } define i1 @length24_eq_const(i8* %X) nounwind optsize { -; X86-NOSSE-LABEL: length24_eq_const: -; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $24 -; X86-NOSSE-NEXT: pushl $.L.str -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: setne %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE2-LABEL: length24_eq_const: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movdqu (%eax), %xmm0 -; X86-SSE2-NEXT: movdqu 8(%eax), %xmm1 -; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm1 -; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: setne %al -; X86-SSE2-NEXT: retl +; X86-LABEL: length24_eq_const: +; X86: # %bb.0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $24 +; X86-NEXT: pushl $.L.str +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl ; ; X64-SSE2-LABEL: length24_eq_const: ; X64-SSE2: # %bb.0: diff --git a/llvm/test/CodeGen/X86/memcmp.ll b/llvm/test/CodeGen/X86/memcmp.ll index 371c16881ff3..0bb46eeeac92 100644 --- a/llvm/test/CodeGen/X86/memcmp.ll +++ b/llvm/test/CodeGen/X86/memcmp.ll @@ -362,24 +362,24 @@ define i1 @length5_eq(i8* %X, i8* %Y) nounwind { define i1 @length7_eq(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length7_eq: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: movl 3(%ecx), %ecx -; X86-NEXT: xorl (%eax), %edx -; X86-NEXT: xorl 3(%eax), %ecx -; X86-NEXT: orl %edx, %ecx +; X86-NEXT: pushl $0 +; 
X86-NEXT: pushl $7 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: testl %eax, %eax ; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: length7_eq: ; X64: # %bb.0: -; X64-NEXT: movl (%rdi), %eax -; X64-NEXT: movl 3(%rdi), %ecx -; X64-NEXT: xorl (%rsi), %eax -; X64-NEXT: xorl 3(%rsi), %ecx -; X64-NEXT: orl %eax, %ecx +; X64-NEXT: pushq %rax +; X64-NEXT: movl $7, %edx +; X64-NEXT: callq memcmp +; X64-NEXT: testl %eax, %eax ; X64-NEXT: setne %al +; X64-NEXT: popq %rcx ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 7) nounwind %c = icmp ne i32 %m, 0 @@ -548,12 +548,12 @@ define i1 @length11_eq(i8* %X, i8* %Y) nounwind { ; ; X64-LABEL: length11_eq: ; X64: # %bb.0: -; X64-NEXT: movq (%rdi), %rax -; X64-NEXT: movq 3(%rdi), %rcx -; X64-NEXT: xorq (%rsi), %rax -; X64-NEXT: xorq 3(%rsi), %rcx -; X64-NEXT: orq %rax, %rcx +; X64-NEXT: pushq %rax +; X64-NEXT: movl $11, %edx +; X64-NEXT: callq memcmp +; X64-NEXT: testl %eax, %eax ; X64-NEXT: sete %al +; X64-NEXT: popq %rcx ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 11) nounwind %c = icmp eq i32 %m, 0 @@ -640,12 +640,12 @@ define i1 @length13_eq(i8* %X, i8* %Y) nounwind { ; ; X64-LABEL: length13_eq: ; X64: # %bb.0: -; X64-NEXT: movq (%rdi), %rax -; X64-NEXT: movq 5(%rdi), %rcx -; X64-NEXT: xorq (%rsi), %rax -; X64-NEXT: xorq 5(%rsi), %rcx -; X64-NEXT: orq %rax, %rcx +; X64-NEXT: pushq %rax +; X64-NEXT: movl $13, %edx +; X64-NEXT: callq memcmp +; X64-NEXT: testl %eax, %eax ; X64-NEXT: sete %al +; X64-NEXT: popq %rcx ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 13) nounwind %c = icmp eq i32 %m, 0 @@ -667,12 +667,12 @@ define i1 @length14_eq(i8* %X, i8* %Y) nounwind { ; ; X64-LABEL: length14_eq: ; X64: # %bb.0: -; X64-NEXT: movq (%rdi), %rax -; X64-NEXT: movq 6(%rdi), %rcx -; X64-NEXT: xorq (%rsi), %rax -; X64-NEXT: xorq 6(%rsi), %rcx -; X64-NEXT: orq %rax, %rcx +; X64-NEXT: pushq %rax +; X64-NEXT: movl $14, %edx +; X64-NEXT: callq memcmp +; X64-NEXT: testl %eax, %eax ; X64-NEXT: sete %al +; X64-NEXT: popq %rcx ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 14) nounwind %c = icmp eq i32 %m, 0 @@ -694,12 +694,12 @@ define i1 @length15_eq(i8* %X, i8* %Y) nounwind { ; ; X64-LABEL: length15_eq: ; X64: # %bb.0: -; X64-NEXT: movq (%rdi), %rax -; X64-NEXT: movq 7(%rdi), %rcx -; X64-NEXT: xorq (%rsi), %rax -; X64-NEXT: xorq 7(%rsi), %rcx -; X64-NEXT: orq %rax, %rcx +; X64-NEXT: pushq %rax +; X64-NEXT: movl $15, %edx +; X64-NEXT: callq memcmp +; X64-NEXT: testl %eax, %eax ; X64-NEXT: sete %al +; X64-NEXT: popq %rcx ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 15) nounwind %c = icmp eq i32 %m, 0 @@ -885,45 +885,17 @@ define i32 @length24(i8* %X, i8* %Y) nounwind { } define i1 @length24_eq(i8* %x, i8* %y) nounwind { -; X86-NOSSE-LABEL: length24_eq: -; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $24 -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: sete %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE1-LABEL: length24_eq: -; X86-SSE1: # %bb.0: -; X86-SSE1-NEXT: pushl $0 -; X86-SSE1-NEXT: pushl $24 -; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: calll memcmp -; X86-SSE1-NEXT: addl $16, %esp -; X86-SSE1-NEXT: testl %eax, %eax -; X86-SSE1-NEXT: sete %al -; 
X86-SSE1-NEXT: retl -; -; X86-SSE2-LABEL: length24_eq: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE2-NEXT: movdqu 8(%ecx), %xmm1 -; X86-SSE2-NEXT: movdqu (%eax), %xmm2 -; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 -; X86-SSE2-NEXT: movdqu 8(%eax), %xmm0 -; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 -; X86-SSE2-NEXT: pand %xmm2, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: sete %al -; X86-SSE2-NEXT: retl +; X86-LABEL: length24_eq: +; X86: # %bb.0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $24 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl ; ; X64-SSE2-LABEL: length24_eq: ; X64-SSE2: # %bb.0: @@ -957,42 +929,17 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind { } define i1 @length24_eq_const(i8* %X) nounwind { -; X86-NOSSE-LABEL: length24_eq_const: -; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $24 -; X86-NOSSE-NEXT: pushl $.L.str -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: setne %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE1-LABEL: length24_eq_const: -; X86-SSE1: # %bb.0: -; X86-SSE1-NEXT: pushl $0 -; X86-SSE1-NEXT: pushl $24 -; X86-SSE1-NEXT: pushl $.L.str -; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: calll memcmp -; X86-SSE1-NEXT: addl $16, %esp -; X86-SSE1-NEXT: testl %eax, %eax -; X86-SSE1-NEXT: setne %al -; X86-SSE1-NEXT: retl -; -; X86-SSE2-LABEL: length24_eq_const: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movdqu (%eax), %xmm0 -; X86-SSE2-NEXT: movdqu 8(%eax), %xmm1 -; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm1 -; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: setne %al -; X86-SSE2-NEXT: retl +; X86-LABEL: length24_eq_const: +; X86: # %bb.0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $24 +; X86-NEXT: pushl $.L.str +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl ; ; X64-SSE2-LABEL: length24_eq_const: ; X64-SSE2: # %bb.0: diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll index c1cbcc3272c7..37bd85029b9f 100644 --- a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll +++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll @@ -130,11 +130,11 @@ define i32 @cmp6(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]] ; ALL-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]] ; ALL: loadbb1: -; ALL-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i8 4 -; ALL-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to i16* -; ALL-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[Y]], i8 4 -; ALL-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to i16* -; ALL-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP11]] +; ALL-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to i16* +; ALL-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i16* +; ALL-NEXT: [[TMP12:%.*]] = getelementptr i16, i16* [[TMP10]], i16 2 +; ALL-NEXT: [[TMP13:%.*]] = getelementptr 
i16, i16* [[TMP11]], i16 2 +; ALL-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]] ; ALL-NEXT: [[TMP15:%.*]] = load i16, i16* [[TMP13]] ; ALL-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]]) ; ALL-NEXT: [[TMP17:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]]) @@ -178,11 +178,11 @@ define i32 @cmp8(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X32-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]] ; X32-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]] ; X32: loadbb1: -; X32-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i8 4 -; X32-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to i32* -; X32-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[Y]], i8 4 -; X32-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to i32* -; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP11]] +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1 +; X32-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP11]], i32 1 +; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] ; X32-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP13]] ; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) ; X32-NEXT: [[TMP17]] = call i32 @llvm.bswap.i32(i32 [[TMP15]]) @@ -272,11 +272,11 @@ define i32 @cmp10(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X64-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]] ; X64-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]] ; X64: loadbb1: -; X64-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i8 8 -; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to i16* -; X64-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[Y]], i8 8 -; X64-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to i16* -; X64-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP11]] +; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP12:%.*]] = getelementptr i16, i16* [[TMP10]], i16 4 +; X64-NEXT: [[TMP13:%.*]] = getelementptr i16, i16* [[TMP11]], i16 4 +; X64-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]] ; X64-NEXT: [[TMP15:%.*]] = load i16, i16* [[TMP13]] ; X64-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]]) ; X64-NEXT: [[TMP17:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]]) @@ -324,11 +324,11 @@ define i32 @cmp12(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X64-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]] ; X64-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]] ; X64: loadbb1: -; X64-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i8 8 -; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to i32* -; X64-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[Y]], i8 8 -; X64-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to i32* -; X64-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP11]] +; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to i32* +; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i32* +; X64-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 2 +; X64-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP11]], i32 2 +; X64-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] ; X64-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP13]] ; X64-NEXT: [[TMP16:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) ; X64-NEXT: [[TMP17:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP15]]) @@ -394,11 +394,11 @@ define i32 @cmp16(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X64-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]] ; X64-NEXT: br i1 [[TMP9]], label 
[[LOADBB1]], label [[RES_BLOCK:%.*]] ; X64: loadbb1: -; X64-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i8 8 -; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to i64* -; X64-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[Y]], i8 8 -; X64-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to i64* -; X64-NEXT: [[TMP14:%.*]] = load i64, i64* [[TMP11]] +; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to i64* +; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i64* +; X64-NEXT: [[TMP12:%.*]] = getelementptr i64, i64* [[TMP10]], i64 1 +; X64-NEXT: [[TMP13:%.*]] = getelementptr i64, i64* [[TMP11]], i64 1 +; X64-NEXT: [[TMP14:%.*]] = load i64, i64* [[TMP12]] ; X64-NEXT: [[TMP15:%.*]] = load i64, i64* [[TMP13]] ; X64-NEXT: [[TMP16]] = call i64 @llvm.bswap.i64(i64 [[TMP14]]) ; X64-NEXT: [[TMP17]] = call i64 @llvm.bswap.i64(i64 [[TMP15]]) @@ -597,11 +597,11 @@ define i32 @cmp_eq6(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] ; X32-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]] ; X32-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] -; X32-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 4 -; X32-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i16* -; X32-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 4 -; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i16* -; X32-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP7]] +; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i16* +; X32-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i16* +; X32-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 2 +; X32-NEXT: [[TMP9:%.*]] = getelementptr i16, i16* [[TMP7]], i16 2 +; X32-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]] ; X32-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP9]] ; X32-NEXT: [[TMP12:%.*]] = zext i16 [[TMP10]] to i32 ; X32-NEXT: [[TMP13:%.*]] = zext i16 [[TMP11]] to i32 @@ -625,11 +625,11 @@ define i32 @cmp_eq6(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X64_1LD-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]] ; X64_1LD-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] ; X64_1LD: loadbb1: -; X64_1LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 4 -; X64_1LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i16* -; X64_1LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 4 -; X64_1LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i16* -; X64_1LD-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP7]] +; X64_1LD-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i16* +; X64_1LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i16* +; X64_1LD-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 2 +; X64_1LD-NEXT: [[TMP9:%.*]] = getelementptr i16, i16* [[TMP7]], i16 2 +; X64_1LD-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]] ; X64_1LD-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP9]] ; X64_1LD-NEXT: [[TMP12:%.*]] = icmp ne i16 [[TMP10]], [[TMP11]] ; X64_1LD-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]] @@ -645,11 +645,11 @@ define i32 @cmp_eq6(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X64_2LD-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] ; X64_2LD-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]] ; X64_2LD-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] -; X64_2LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 4 -; X64_2LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i16* -; X64_2LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 4 -; X64_2LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i16* -; X64_2LD-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP7]] +; X64_2LD-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i16* 
+; X64_2LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i16* +; X64_2LD-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 2 +; X64_2LD-NEXT: [[TMP9:%.*]] = getelementptr i16, i16* [[TMP7]], i16 2 +; X64_2LD-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]] ; X64_2LD-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP9]] ; X64_2LD-NEXT: [[TMP12:%.*]] = zext i16 [[TMP10]] to i32 ; X64_2LD-NEXT: [[TMP13:%.*]] = zext i16 [[TMP11]] to i32 @@ -668,71 +668,11 @@ define i32 @cmp_eq6(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq7(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp_eq7( -; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* -; X32-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* -; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] -; X32-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]] -; X32-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] -; X32-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 3 -; X32-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* -; X32-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 3 -; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32* -; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP7]] -; X32-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]] -; X32-NEXT: [[TMP12:%.*]] = xor i32 [[TMP10]], [[TMP11]] -; X32-NEXT: [[TMP13:%.*]] = or i32 [[TMP5]], [[TMP12]] -; X32-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 -; X32-NEXT: [[TMP15:%.*]] = zext i1 [[TMP14]] to i32 -; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0 -; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X32-NEXT: ret i32 [[CONV]] -; -; X64_1LD-LABEL: @cmp_eq7( -; X64_1LD-NEXT: br label [[LOADBB:%.*]] -; X64_1LD: res_block: -; X64_1LD-NEXT: br label [[ENDBLOCK:%.*]] -; X64_1LD: loadbb: -; X64_1LD-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* -; X64_1LD-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* -; X64_1LD-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] -; X64_1LD-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]] -; X64_1LD-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]] -; X64_1LD-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] -; X64_1LD: loadbb1: -; X64_1LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 3 -; X64_1LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* -; X64_1LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 3 -; X64_1LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32* -; X64_1LD-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP7]] -; X64_1LD-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]] -; X64_1LD-NEXT: [[TMP12:%.*]] = icmp ne i32 [[TMP10]], [[TMP11]] -; X64_1LD-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]] -; X64_1LD: endblock: -; X64_1LD-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] -; X64_1LD-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 -; X64_1LD-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X64_1LD-NEXT: ret i32 [[CONV]] -; -; X64_2LD-LABEL: @cmp_eq7( -; X64_2LD-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* -; X64_2LD-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* -; X64_2LD-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] -; X64_2LD-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]] -; X64_2LD-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] -; X64_2LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 3 -; X64_2LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* -; X64_2LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 3 -; X64_2LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32* -; X64_2LD-NEXT: [[TMP10:%.*]] = load i32, i32* 
[[TMP7]] -; X64_2LD-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]] -; X64_2LD-NEXT: [[TMP12:%.*]] = xor i32 [[TMP10]], [[TMP11]] -; X64_2LD-NEXT: [[TMP13:%.*]] = or i32 [[TMP5]], [[TMP12]] -; X64_2LD-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 -; X64_2LD-NEXT: [[TMP15:%.*]] = zext i1 [[TMP14]] to i32 -; X64_2LD-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0 -; X64_2LD-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X64_2LD-NEXT: ret i32 [[CONV]] +; ALL-LABEL: @cmp_eq7( +; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 7) +; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; ALL-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 7) %cmp = icmp eq i32 %call, 0 @@ -747,11 +687,11 @@ define i32 @cmp_eq8(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] ; X32-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]] ; X32-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] -; X32-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 4 -; X32-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* -; X32-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 4 -; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32* -; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP7]] +; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1 +; X32-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP7]], i32 1 +; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] ; X32-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]] ; X32-NEXT: [[TMP12:%.*]] = xor i32 [[TMP10]], [[TMP11]] ; X32-NEXT: [[TMP13:%.*]] = or i32 [[TMP5]], [[TMP12]] @@ -854,11 +794,11 @@ define i32 @cmp_eq10(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X64_1LD-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]] ; X64_1LD-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] ; X64_1LD: loadbb1: -; X64_1LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 8 -; X64_1LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i16* -; X64_1LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 8 -; X64_1LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i16* -; X64_1LD-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP7]] +; X64_1LD-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i16* +; X64_1LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i16* +; X64_1LD-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 4 +; X64_1LD-NEXT: [[TMP9:%.*]] = getelementptr i16, i16* [[TMP7]], i16 4 +; X64_1LD-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]] ; X64_1LD-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP9]] ; X64_1LD-NEXT: [[TMP12:%.*]] = icmp ne i16 [[TMP10]], [[TMP11]] ; X64_1LD-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]] @@ -874,11 +814,11 @@ define i32 @cmp_eq10(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X64_2LD-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] ; X64_2LD-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] ; X64_2LD-NEXT: [[TMP5:%.*]] = xor i64 [[TMP3]], [[TMP4]] -; X64_2LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 8 -; X64_2LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i16* -; X64_2LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 8 -; X64_2LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i16* -; X64_2LD-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP7]] +; X64_2LD-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i16* +; X64_2LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to 
i16* +; X64_2LD-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 4 +; X64_2LD-NEXT: [[TMP9:%.*]] = getelementptr i16, i16* [[TMP7]], i16 4 +; X64_2LD-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]] ; X64_2LD-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP9]] ; X64_2LD-NEXT: [[TMP12:%.*]] = zext i16 [[TMP10]] to i64 ; X64_2LD-NEXT: [[TMP13:%.*]] = zext i16 [[TMP11]] to i64 @@ -897,57 +837,11 @@ define i32 @cmp_eq10(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq11(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp_eq11( -; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 11) -; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X32-NEXT: ret i32 [[CONV]] -; -; X64_1LD-LABEL: @cmp_eq11( -; X64_1LD-NEXT: br label [[LOADBB:%.*]] -; X64_1LD: res_block: -; X64_1LD-NEXT: br label [[ENDBLOCK:%.*]] -; X64_1LD: loadbb: -; X64_1LD-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64_1LD-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64_1LD-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] -; X64_1LD-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] -; X64_1LD-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]] -; X64_1LD-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] -; X64_1LD: loadbb1: -; X64_1LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 3 -; X64_1LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i64* -; X64_1LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 3 -; X64_1LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i64* -; X64_1LD-NEXT: [[TMP10:%.*]] = load i64, i64* [[TMP7]] -; X64_1LD-NEXT: [[TMP11:%.*]] = load i64, i64* [[TMP9]] -; X64_1LD-NEXT: [[TMP12:%.*]] = icmp ne i64 [[TMP10]], [[TMP11]] -; X64_1LD-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]] -; X64_1LD: endblock: -; X64_1LD-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] -; X64_1LD-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 -; X64_1LD-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X64_1LD-NEXT: ret i32 [[CONV]] -; -; X64_2LD-LABEL: @cmp_eq11( -; X64_2LD-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64_2LD-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64_2LD-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] -; X64_2LD-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] -; X64_2LD-NEXT: [[TMP5:%.*]] = xor i64 [[TMP3]], [[TMP4]] -; X64_2LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 3 -; X64_2LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i64* -; X64_2LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 3 -; X64_2LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i64* -; X64_2LD-NEXT: [[TMP10:%.*]] = load i64, i64* [[TMP7]] -; X64_2LD-NEXT: [[TMP11:%.*]] = load i64, i64* [[TMP9]] -; X64_2LD-NEXT: [[TMP12:%.*]] = xor i64 [[TMP10]], [[TMP11]] -; X64_2LD-NEXT: [[TMP13:%.*]] = or i64 [[TMP5]], [[TMP12]] -; X64_2LD-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 -; X64_2LD-NEXT: [[TMP15:%.*]] = zext i1 [[TMP14]] to i32 -; X64_2LD-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0 -; X64_2LD-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X64_2LD-NEXT: ret i32 [[CONV]] +; ALL-LABEL: @cmp_eq11( +; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 11) +; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; ALL-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 11) %cmp = icmp eq i32 %call, 0 @@ -974,11 +868,11 @@ 
define i32 @cmp_eq12(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X64_1LD-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]] ; X64_1LD-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] ; X64_1LD: loadbb1: -; X64_1LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 8 -; X64_1LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* -; X64_1LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 8 -; X64_1LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32* -; X64_1LD-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP7]] +; X64_1LD-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i32* +; X64_1LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i32* +; X64_1LD-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 2 +; X64_1LD-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP7]], i32 2 +; X64_1LD-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] ; X64_1LD-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]] ; X64_1LD-NEXT: [[TMP12:%.*]] = icmp ne i32 [[TMP10]], [[TMP11]] ; X64_1LD-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]] @@ -994,11 +888,11 @@ define i32 @cmp_eq12(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X64_2LD-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] ; X64_2LD-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] ; X64_2LD-NEXT: [[TMP5:%.*]] = xor i64 [[TMP3]], [[TMP4]] -; X64_2LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 8 -; X64_2LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* -; X64_2LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 8 -; X64_2LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32* -; X64_2LD-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP7]] +; X64_2LD-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i32* +; X64_2LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i32* +; X64_2LD-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 2 +; X64_2LD-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP7]], i32 2 +; X64_2LD-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] ; X64_2LD-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]] ; X64_2LD-NEXT: [[TMP12:%.*]] = zext i32 [[TMP10]] to i64 ; X64_2LD-NEXT: [[TMP13:%.*]] = zext i32 [[TMP11]] to i64 @@ -1017,57 +911,11 @@ define i32 @cmp_eq12(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq13(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp_eq13( -; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 13) -; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X32-NEXT: ret i32 [[CONV]] -; -; X64_1LD-LABEL: @cmp_eq13( -; X64_1LD-NEXT: br label [[LOADBB:%.*]] -; X64_1LD: res_block: -; X64_1LD-NEXT: br label [[ENDBLOCK:%.*]] -; X64_1LD: loadbb: -; X64_1LD-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64_1LD-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64_1LD-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] -; X64_1LD-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] -; X64_1LD-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]] -; X64_1LD-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] -; X64_1LD: loadbb1: -; X64_1LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 5 -; X64_1LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i64* -; X64_1LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 5 -; X64_1LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i64* -; X64_1LD-NEXT: [[TMP10:%.*]] = load i64, i64* [[TMP7]] -; X64_1LD-NEXT: [[TMP11:%.*]] = load i64, i64* [[TMP9]] -; X64_1LD-NEXT: [[TMP12:%.*]] = icmp ne i64 [[TMP10]], [[TMP11]] -; 
X64_1LD-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]] -; X64_1LD: endblock: -; X64_1LD-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] -; X64_1LD-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 -; X64_1LD-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X64_1LD-NEXT: ret i32 [[CONV]] -; -; X64_2LD-LABEL: @cmp_eq13( -; X64_2LD-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64_2LD-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64_2LD-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] -; X64_2LD-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] -; X64_2LD-NEXT: [[TMP5:%.*]] = xor i64 [[TMP3]], [[TMP4]] -; X64_2LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 5 -; X64_2LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i64* -; X64_2LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 5 -; X64_2LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i64* -; X64_2LD-NEXT: [[TMP10:%.*]] = load i64, i64* [[TMP7]] -; X64_2LD-NEXT: [[TMP11:%.*]] = load i64, i64* [[TMP9]] -; X64_2LD-NEXT: [[TMP12:%.*]] = xor i64 [[TMP10]], [[TMP11]] -; X64_2LD-NEXT: [[TMP13:%.*]] = or i64 [[TMP5]], [[TMP12]] -; X64_2LD-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 -; X64_2LD-NEXT: [[TMP15:%.*]] = zext i1 [[TMP14]] to i32 -; X64_2LD-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0 -; X64_2LD-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X64_2LD-NEXT: ret i32 [[CONV]] +; ALL-LABEL: @cmp_eq13( +; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 13) +; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; ALL-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 13) %cmp = icmp eq i32 %call, 0 @@ -1076,57 +924,11 @@ define i32 @cmp_eq13(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq14(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp_eq14( -; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 14) -; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X32-NEXT: ret i32 [[CONV]] -; -; X64_1LD-LABEL: @cmp_eq14( -; X64_1LD-NEXT: br label [[LOADBB:%.*]] -; X64_1LD: res_block: -; X64_1LD-NEXT: br label [[ENDBLOCK:%.*]] -; X64_1LD: loadbb: -; X64_1LD-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64_1LD-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64_1LD-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] -; X64_1LD-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] -; X64_1LD-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]] -; X64_1LD-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] -; X64_1LD: loadbb1: -; X64_1LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 6 -; X64_1LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i64* -; X64_1LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 6 -; X64_1LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i64* -; X64_1LD-NEXT: [[TMP10:%.*]] = load i64, i64* [[TMP7]] -; X64_1LD-NEXT: [[TMP11:%.*]] = load i64, i64* [[TMP9]] -; X64_1LD-NEXT: [[TMP12:%.*]] = icmp ne i64 [[TMP10]], [[TMP11]] -; X64_1LD-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]] -; X64_1LD: endblock: -; X64_1LD-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] -; X64_1LD-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 -; X64_1LD-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X64_1LD-NEXT: ret i32 [[CONV]] -; -; X64_2LD-LABEL: @cmp_eq14( -; X64_2LD-NEXT: [[TMP1:%.*]] = bitcast i8* 
[[X:%.*]] to i64* -; X64_2LD-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64_2LD-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] -; X64_2LD-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] -; X64_2LD-NEXT: [[TMP5:%.*]] = xor i64 [[TMP3]], [[TMP4]] -; X64_2LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 6 -; X64_2LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i64* -; X64_2LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 6 -; X64_2LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i64* -; X64_2LD-NEXT: [[TMP10:%.*]] = load i64, i64* [[TMP7]] -; X64_2LD-NEXT: [[TMP11:%.*]] = load i64, i64* [[TMP9]] -; X64_2LD-NEXT: [[TMP12:%.*]] = xor i64 [[TMP10]], [[TMP11]] -; X64_2LD-NEXT: [[TMP13:%.*]] = or i64 [[TMP5]], [[TMP12]] -; X64_2LD-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 -; X64_2LD-NEXT: [[TMP15:%.*]] = zext i1 [[TMP14]] to i32 -; X64_2LD-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0 -; X64_2LD-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X64_2LD-NEXT: ret i32 [[CONV]] +; ALL-LABEL: @cmp_eq14( +; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 14) +; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; ALL-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 14) %cmp = icmp eq i32 %call, 0 @@ -1135,57 +937,11 @@ define i32 @cmp_eq14(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq15(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp_eq15( -; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 15) -; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X32-NEXT: ret i32 [[CONV]] -; -; X64_1LD-LABEL: @cmp_eq15( -; X64_1LD-NEXT: br label [[LOADBB:%.*]] -; X64_1LD: res_block: -; X64_1LD-NEXT: br label [[ENDBLOCK:%.*]] -; X64_1LD: loadbb: -; X64_1LD-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64_1LD-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64_1LD-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] -; X64_1LD-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] -; X64_1LD-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]] -; X64_1LD-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] -; X64_1LD: loadbb1: -; X64_1LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 7 -; X64_1LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i64* -; X64_1LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 7 -; X64_1LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i64* -; X64_1LD-NEXT: [[TMP10:%.*]] = load i64, i64* [[TMP7]] -; X64_1LD-NEXT: [[TMP11:%.*]] = load i64, i64* [[TMP9]] -; X64_1LD-NEXT: [[TMP12:%.*]] = icmp ne i64 [[TMP10]], [[TMP11]] -; X64_1LD-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]] -; X64_1LD: endblock: -; X64_1LD-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] -; X64_1LD-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 -; X64_1LD-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X64_1LD-NEXT: ret i32 [[CONV]] -; -; X64_2LD-LABEL: @cmp_eq15( -; X64_2LD-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64_2LD-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64_2LD-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] -; X64_2LD-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] -; X64_2LD-NEXT: [[TMP5:%.*]] = xor i64 [[TMP3]], [[TMP4]] -; X64_2LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 7 -; X64_2LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i64* -; X64_2LD-NEXT: 
[[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 7 -; X64_2LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i64* -; X64_2LD-NEXT: [[TMP10:%.*]] = load i64, i64* [[TMP7]] -; X64_2LD-NEXT: [[TMP11:%.*]] = load i64, i64* [[TMP9]] -; X64_2LD-NEXT: [[TMP12:%.*]] = xor i64 [[TMP10]], [[TMP11]] -; X64_2LD-NEXT: [[TMP13:%.*]] = or i64 [[TMP5]], [[TMP12]] -; X64_2LD-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 -; X64_2LD-NEXT: [[TMP15:%.*]] = zext i1 [[TMP14]] to i32 -; X64_2LD-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0 -; X64_2LD-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X64_2LD-NEXT: ret i32 [[CONV]] +; ALL-LABEL: @cmp_eq15( +; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 15) +; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; ALL-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 15) %cmp = icmp eq i32 %call, 0
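For reference, the cmp_eq7 CHECK lines deleted above encode the following expansion of memcmp(x, y, 7) == 0: two overlapping 4-byte loads per pointer, at byte offsets 0 and 3, with the two comparisons folded into a single zero test. A C++-level equivalent of that deleted IR (a sketch for illustration, not compiler output):

#include <cstdint>
#include <cstring>

// Two overlapping 4-byte loads per pointer (byte offsets 0 and 3) cover all
// 7 bytes; xor/or folds both comparisons into one zero test, mirroring the
// deleted X32 and X64_2LD CHECK lines.
static bool eq7(const char *x, const char *y) {
  std::uint32_t a0, a1, b0, b1;
  std::memcpy(&a0, x, 4);
  std::memcpy(&a1, x + 3, 4);
  std::memcpy(&b0, y, 4);
  std::memcpy(&b1, y + 3, 4);
  return ((a0 ^ b0) | (a1 ^ b1)) == 0;
}

After this revert, the odd-sized @cmp_eq7, @cmp_eq11, @cmp_eq13, @cmp_eq14, and @cmp_eq15 tests expect a plain memcmp libcall again, as the restored ALL-LABEL bodies show.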