//===- InstCombineCalls.cpp -----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the visitCall, visitInvoke, and visitCallBr functions.
//
//===----------------------------------------------------------------------===//

#include "InstCombineInternal.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/APSInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/FloatingPointMode.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsPowerPC.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Statepoint.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstring>
#include <utility>
#include <vector>

using namespace llvm;
using namespace PatternMatch;

#define DEBUG_TYPE "instcombine"

STATISTIC(NumSimplified, "Number of library calls simplified");

static cl::opt<unsigned> GuardWideningWindow(
    "instcombine-guard-widening-window",
    cl::init(3),
    cl::desc("How wide an instruction window to bypass looking for "
             "another guard"));

/// Return the specified type promoted as it would be to pass though a va_arg
/// area.
static Type *getPromotedType(Type *Ty) {
  if (IntegerType* ITy = dyn_cast<IntegerType>(Ty)) {
    if (ITy->getBitWidth() < 32)
      return Type::getInt32Ty(Ty->getContext());
  }
  return Ty;
}
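
// For example: i8 and i16 promote to i32 here, while i32, i64 and all
// non-integer types are returned unchanged.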

/// Return a constant boolean vector that has true elements in all positions
/// where the input constant data vector has an element with the sign bit set.
static Constant *getNegativeIsTrueBoolVec(ConstantDataVector *V) {
  SmallVector<Constant *, 32> BoolVec;
  IntegerType *BoolTy = Type::getInt1Ty(V->getContext());
  for (unsigned I = 0, E = V->getNumElements(); I != E; ++I) {
    Constant *Elt = V->getElementAsConstant(I);
    assert((isa<ConstantInt>(Elt) || isa<ConstantFP>(Elt)) &&
           "Unexpected constant data vector element type");
    bool Sign = V->getElementType()->isIntegerTy()
                    ? cast<ConstantInt>(Elt)->isNegative()
                    : cast<ConstantFP>(Elt)->isNegative();
    BoolVec.push_back(ConstantInt::get(BoolTy, Sign));
  }
  return ConstantVector::get(BoolVec);
}
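
// For example, <4 x i32> <i32 -1, i32 0, i32 7, i32 -8> maps to
// <4 x i1> <i1 true, i1 false, i1 false, i1 true>.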

Instruction *InstCombiner::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) {
  unsigned DstAlign = getKnownAlignment(MI->getRawDest(), DL, MI, &AC, &DT);
  unsigned CopyDstAlign = MI->getDestAlignment();
  if (CopyDstAlign < DstAlign){
    MI->setDestAlignment(DstAlign);
    return MI;
  }

  unsigned SrcAlign = getKnownAlignment(MI->getRawSource(), DL, MI, &AC, &DT);
  unsigned CopySrcAlign = MI->getSourceAlignment();
  if (CopySrcAlign < SrcAlign) {
    MI->setSourceAlignment(SrcAlign);
    return MI;
  }

  // If we have a store to a location which is known constant, we can conclude
  // that the store must be storing the constant value (else the memory
  // wouldn't be constant), and this must be a noop.
  if (AA->pointsToConstantMemory(MI->getDest())) {
    // Set the size of the copy to 0, it will be deleted on the next iteration.
    MI->setLength(Constant::getNullValue(MI->getLength()->getType()));
    return MI;
  }

  // If MemCpyInst length is 1/2/4/8 bytes then replace memcpy with
  // load/store.
  ConstantInt *MemOpLength = dyn_cast<ConstantInt>(MI->getLength());
  if (!MemOpLength) return nullptr;

  // Source and destination pointer types are always "i8*" for intrinsic. See
  // if the size is something we can handle with a single primitive load/store.
  // A single load+store correctly handles overlapping memory in the memmove
  // case.
  uint64_t Size = MemOpLength->getLimitedValue();
  assert(Size && "0-sized memory transferring should be removed already.");

  if (Size > 8 || (Size&(Size-1)))
    return nullptr;  // If not 1/2/4/8 bytes, exit.

  // If it is an atomic and alignment is less than the size then we will
  // introduce the unaligned memory access which will be later transformed
  // into libcall in CodeGen. This is not evident performance gain so disable
  // it now.
  if (isa<AtomicMemTransferInst>(MI))
    if (CopyDstAlign < Size || CopySrcAlign < Size)
      return nullptr;

  // Use an integer load+store unless we can find something better.
  unsigned SrcAddrSp =
    cast<PointerType>(MI->getArgOperand(1)->getType())->getAddressSpace();
  unsigned DstAddrSp =
    cast<PointerType>(MI->getArgOperand(0)->getType())->getAddressSpace();

  IntegerType* IntType = IntegerType::get(MI->getContext(), Size<<3);
  Type *NewSrcPtrTy = PointerType::get(IntType, SrcAddrSp);
  Type *NewDstPtrTy = PointerType::get(IntType, DstAddrSp);

  // If the memcpy has metadata describing the members, see if we can get the
  // TBAA tag describing our copy.
  MDNode *CopyMD = nullptr;
  if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa)) {
    CopyMD = M;
  } else if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa_struct)) {
    if (M->getNumOperands() == 3 && M->getOperand(0) &&
        mdconst::hasa<ConstantInt>(M->getOperand(0)) &&
        mdconst::extract<ConstantInt>(M->getOperand(0))->isZero() &&
        M->getOperand(1) &&
        mdconst::hasa<ConstantInt>(M->getOperand(1)) &&
        mdconst::extract<ConstantInt>(M->getOperand(1))->getValue() ==
            Size &&
        M->getOperand(2) && isa<MDNode>(M->getOperand(2)))
      CopyMD = cast<MDNode>(M->getOperand(2));
  }

  Value *Src = Builder.CreateBitCast(MI->getArgOperand(1), NewSrcPtrTy);
  Value *Dest = Builder.CreateBitCast(MI->getArgOperand(0), NewDstPtrTy);
  LoadInst *L = Builder.CreateLoad(IntType, Src);
  // Alignment from the mem intrinsic will be better, so use it.
  L->setAlignment(
      MaybeAlign(CopySrcAlign)); // FIXME: Check if we can use Align instead.
  if (CopyMD)
    L->setMetadata(LLVMContext::MD_tbaa, CopyMD);
  MDNode *LoopMemParallelMD =
    MI->getMetadata(LLVMContext::MD_mem_parallel_loop_access);
  if (LoopMemParallelMD)
    L->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD);
  MDNode *AccessGroupMD = MI->getMetadata(LLVMContext::MD_access_group);
  if (AccessGroupMD)
    L->setMetadata(LLVMContext::MD_access_group, AccessGroupMD);

  StoreInst *S = Builder.CreateStore(L, Dest);
  // Alignment from the mem intrinsic will be better, so use it.
  S->setAlignment(
      MaybeAlign(CopyDstAlign)); // FIXME: Check if we can use Align instead.
  if (CopyMD)
    S->setMetadata(LLVMContext::MD_tbaa, CopyMD);
  if (LoopMemParallelMD)
    S->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD);
  if (AccessGroupMD)
    S->setMetadata(LLVMContext::MD_access_group, AccessGroupMD);

  if (auto *MT = dyn_cast<MemTransferInst>(MI)) {
    // non-atomics can be volatile
    L->setVolatile(MT->isVolatile());
    S->setVolatile(MT->isVolatile());
  }
  if (isa<AtomicMemTransferInst>(MI)) {
    // atomics have to be unordered
    L->setOrdering(AtomicOrdering::Unordered);
    S->setOrdering(AtomicOrdering::Unordered);
  }

  // Set the size of the copy to 0, it will be deleted on the next iteration.
  MI->setLength(Constant::getNullValue(MemOpLength->getType()));
  return MI;
}
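
// Rough sketch of the small-copy rewrite above (value names are illustrative):
// a known 4-byte copy such as
//   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 4, i1 false)
// becomes an integer load/store pair that keeps the intrinsic's alignments
// and metadata:
//   %sp = bitcast i8* %src to i32*
//   %v  = load i32, i32* %sp
//   %dp = bitcast i8* %dst to i32*
//   store i32 %v, i32* %dp
// and the intrinsic's length is set to 0 so the call itself is deleted on the
// next iteration.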

Instruction *InstCombiner::SimplifyAnyMemSet(AnyMemSetInst *MI) {
  const unsigned KnownAlignment =
      getKnownAlignment(MI->getDest(), DL, MI, &AC, &DT);
  if (MI->getDestAlignment() < KnownAlignment) {
    MI->setDestAlignment(KnownAlignment);
    return MI;
  }

  // If we have a store to a location which is known constant, we can conclude
  // that the store must be storing the constant value (else the memory
  // wouldn't be constant), and this must be a noop.
  if (AA->pointsToConstantMemory(MI->getDest())) {
    // Set the size of the copy to 0, it will be deleted on the next iteration.
    MI->setLength(Constant::getNullValue(MI->getLength()->getType()));
    return MI;
  }

  // Extract the length and alignment and fill if they are constant.
  ConstantInt *LenC = dyn_cast<ConstantInt>(MI->getLength());
  ConstantInt *FillC = dyn_cast<ConstantInt>(MI->getValue());
  if (!LenC || !FillC || !FillC->getType()->isIntegerTy(8))
    return nullptr;
  const uint64_t Len = LenC->getLimitedValue();
  assert(Len && "0-sized memory setting should be removed already.");
  const Align Alignment = assumeAligned(MI->getDestAlignment());

  // If it is an atomic and alignment is less than the size then we will
  // introduce the unaligned memory access which will be later transformed
  // into libcall in CodeGen. This is not evident performance gain so disable
  // it now.
  if (isa<AtomicMemSetInst>(MI))
    if (Alignment < Len)
      return nullptr;

  // memset(s,c,n) -> store s, c (for n=1,2,4,8)
  if (Len <= 8 && isPowerOf2_32((uint32_t)Len)) {
    Type *ITy = IntegerType::get(MI->getContext(), Len*8);  // n=1 -> i8.

    Value *Dest = MI->getDest();
    unsigned DstAddrSp = cast<PointerType>(Dest->getType())->getAddressSpace();
    Type *NewDstPtrTy = PointerType::get(ITy, DstAddrSp);
    Dest = Builder.CreateBitCast(Dest, NewDstPtrTy);

    // Extract the fill value and store.
    uint64_t Fill = FillC->getZExtValue()*0x0101010101010101ULL;
    StoreInst *S = Builder.CreateStore(ConstantInt::get(ITy, Fill), Dest,
                                       MI->isVolatile());
    S->setAlignment(Alignment);
    if (isa<AtomicMemSetInst>(MI))
      S->setOrdering(AtomicOrdering::Unordered);

    // Set the size of the copy to 0, it will be deleted on the next iteration.
    MI->setLength(Constant::getNullValue(LenC->getType()));
    return MI;
  }

  return nullptr;
}
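
// Worked example for the fill-value computation above: for a 4-byte
// memset(p, 0xAB, 4), the byte is replicated as
// 0xAB * 0x0101010101010101 = 0xABABABABABABABAB, the constant is truncated
// to i32, and a single 'store i32 0xABABABAB' with the memset's destination
// alignment replaces the intrinsic.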

static Value *simplifyX86immShift(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  bool LogicalShift = false;
  bool ShiftLeft = false;

  switch (II.getIntrinsicID()) {
  default: llvm_unreachable("Unexpected intrinsic!");
  case Intrinsic::x86_sse2_psra_d:
  case Intrinsic::x86_sse2_psra_w:
  case Intrinsic::x86_sse2_psrai_d:
  case Intrinsic::x86_sse2_psrai_w:
  case Intrinsic::x86_avx2_psra_d:
  case Intrinsic::x86_avx2_psra_w:
  case Intrinsic::x86_avx2_psrai_d:
  case Intrinsic::x86_avx2_psrai_w:
  case Intrinsic::x86_avx512_psra_q_128:
  case Intrinsic::x86_avx512_psrai_q_128:
  case Intrinsic::x86_avx512_psra_q_256:
  case Intrinsic::x86_avx512_psrai_q_256:
  case Intrinsic::x86_avx512_psra_d_512:
  case Intrinsic::x86_avx512_psra_q_512:
  case Intrinsic::x86_avx512_psra_w_512:
  case Intrinsic::x86_avx512_psrai_d_512:
  case Intrinsic::x86_avx512_psrai_q_512:
  case Intrinsic::x86_avx512_psrai_w_512:
    LogicalShift = false; ShiftLeft = false;
    break;
  case Intrinsic::x86_sse2_psrl_d:
  case Intrinsic::x86_sse2_psrl_q:
  case Intrinsic::x86_sse2_psrl_w:
  case Intrinsic::x86_sse2_psrli_d:
  case Intrinsic::x86_sse2_psrli_q:
  case Intrinsic::x86_sse2_psrli_w:
  case Intrinsic::x86_avx2_psrl_d:
  case Intrinsic::x86_avx2_psrl_q:
  case Intrinsic::x86_avx2_psrl_w:
  case Intrinsic::x86_avx2_psrli_d:
  case Intrinsic::x86_avx2_psrli_q:
  case Intrinsic::x86_avx2_psrli_w:
  case Intrinsic::x86_avx512_psrl_d_512:
  case Intrinsic::x86_avx512_psrl_q_512:
  case Intrinsic::x86_avx512_psrl_w_512:
  case Intrinsic::x86_avx512_psrli_d_512:
  case Intrinsic::x86_avx512_psrli_q_512:
  case Intrinsic::x86_avx512_psrli_w_512:
    LogicalShift = true; ShiftLeft = false;
    break;
  case Intrinsic::x86_sse2_psll_d:
  case Intrinsic::x86_sse2_psll_q:
  case Intrinsic::x86_sse2_psll_w:
  case Intrinsic::x86_sse2_pslli_d:
  case Intrinsic::x86_sse2_pslli_q:
  case Intrinsic::x86_sse2_pslli_w:
  case Intrinsic::x86_avx2_psll_d:
  case Intrinsic::x86_avx2_psll_q:
  case Intrinsic::x86_avx2_psll_w:
  case Intrinsic::x86_avx2_pslli_d:
  case Intrinsic::x86_avx2_pslli_q:
  case Intrinsic::x86_avx2_pslli_w:
  case Intrinsic::x86_avx512_psll_d_512:
  case Intrinsic::x86_avx512_psll_q_512:
  case Intrinsic::x86_avx512_psll_w_512:
  case Intrinsic::x86_avx512_pslli_d_512:
  case Intrinsic::x86_avx512_pslli_q_512:
  case Intrinsic::x86_avx512_pslli_w_512:
    LogicalShift = true; ShiftLeft = true;
    break;
  }
  assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");

  // Simplify if count is constant.
  auto Arg1 = II.getArgOperand(1);
  auto CAZ = dyn_cast<ConstantAggregateZero>(Arg1);
  auto CDV = dyn_cast<ConstantDataVector>(Arg1);
  auto CInt = dyn_cast<ConstantInt>(Arg1);
  if (!CAZ && !CDV && !CInt)
    return nullptr;

  APInt Count(64, 0);
  if (CDV) {
    // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector
    // operand to compute the shift amount.
    auto VT = cast<VectorType>(CDV->getType());
    unsigned BitWidth = VT->getElementType()->getPrimitiveSizeInBits();
    assert((64 % BitWidth) == 0 && "Unexpected packed shift size");
    unsigned NumSubElts = 64 / BitWidth;

    // Concatenate the sub-elements to create the 64-bit value.
    for (unsigned i = 0; i != NumSubElts; ++i) {
      unsigned SubEltIdx = (NumSubElts - 1) - i;
      auto SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
      Count <<= BitWidth;
      Count |= SubElt->getValue().zextOrTrunc(64);
    }
  }
  else if (CInt)
    Count = CInt->getValue();

  auto Vec = II.getArgOperand(0);
  auto VT = cast<VectorType>(Vec->getType());
  auto SVT = VT->getElementType();
  unsigned VWidth = VT->getNumElements();
  unsigned BitWidth = SVT->getPrimitiveSizeInBits();

  // If shift-by-zero then just return the original value.
  if (Count.isNullValue())
    return Vec;

  // Handle cases when Shift >= BitWidth.
  if (Count.uge(BitWidth)) {
    // If LogicalShift - just return zero.
    if (LogicalShift)
      return ConstantAggregateZero::get(VT);

    // If ArithmeticShift - clamp Shift to (BitWidth - 1).
    Count = APInt(64, BitWidth - 1);
  }

  // Get a constant vector of the same type as the first operand.
  auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth));
  auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt);

  if (ShiftLeft)
    return Builder.CreateShl(Vec, ShiftVec);

  if (LogicalShift)
    return Builder.CreateLShr(Vec, ShiftVec);

  return Builder.CreateAShr(Vec, ShiftVec);
}
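
// Rough sketch of the rewrite above (value names are illustrative): with a
// constant count,
//   %r = call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %v, i32 3)
// becomes a generic IR shift by a splatted amount,
//   %r = ashr <4 x i32> %v, <i32 3, i32 3, i32 3, i32 3>
// while counts of BitWidth or more fold to zero (logical) or are clamped to
// BitWidth - 1 (arithmetic), matching the intrinsic's out-of-range behaviour.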

// Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift.
// Unlike the generic IR shifts, the intrinsics have defined behaviour for out
// of range shift amounts (logical - set to zero, arithmetic - splat sign bit).
static Value *simplifyX86varShift(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  bool LogicalShift = false;
  bool ShiftLeft = false;

  switch (II.getIntrinsicID()) {
  default: llvm_unreachable("Unexpected intrinsic!");
  case Intrinsic::x86_avx2_psrav_d:
  case Intrinsic::x86_avx2_psrav_d_256:
  case Intrinsic::x86_avx512_psrav_q_128:
  case Intrinsic::x86_avx512_psrav_q_256:
  case Intrinsic::x86_avx512_psrav_d_512:
  case Intrinsic::x86_avx512_psrav_q_512:
  case Intrinsic::x86_avx512_psrav_w_128:
  case Intrinsic::x86_avx512_psrav_w_256:
  case Intrinsic::x86_avx512_psrav_w_512:
    LogicalShift = false;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_avx2_psrlv_d:
  case Intrinsic::x86_avx2_psrlv_d_256:
  case Intrinsic::x86_avx2_psrlv_q:
  case Intrinsic::x86_avx2_psrlv_q_256:
  case Intrinsic::x86_avx512_psrlv_d_512:
  case Intrinsic::x86_avx512_psrlv_q_512:
  case Intrinsic::x86_avx512_psrlv_w_128:
  case Intrinsic::x86_avx512_psrlv_w_256:
  case Intrinsic::x86_avx512_psrlv_w_512:
    LogicalShift = true;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_avx2_psllv_d:
  case Intrinsic::x86_avx2_psllv_d_256:
  case Intrinsic::x86_avx2_psllv_q:
  case Intrinsic::x86_avx2_psllv_q_256:
  case Intrinsic::x86_avx512_psllv_d_512:
  case Intrinsic::x86_avx512_psllv_q_512:
  case Intrinsic::x86_avx512_psllv_w_128:
  case Intrinsic::x86_avx512_psllv_w_256:
  case Intrinsic::x86_avx512_psllv_w_512:
    LogicalShift = true;
    ShiftLeft = true;
    break;
  }
  assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");

  // Simplify if all shift amounts are constant/undef.
  auto *CShift = dyn_cast<Constant>(II.getArgOperand(1));
  if (!CShift)
    return nullptr;

  auto Vec = II.getArgOperand(0);
  auto VT = cast<VectorType>(II.getType());
  auto SVT = VT->getVectorElementType();
  int NumElts = VT->getNumElements();
  int BitWidth = SVT->getIntegerBitWidth();

  // Collect each element's shift amount.
  // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.
  bool AnyOutOfRange = false;
  SmallVector<int, 8> ShiftAmts;
  for (int I = 0; I < NumElts; ++I) {
    auto *CElt = CShift->getAggregateElement(I);
    if (CElt && isa<UndefValue>(CElt)) {
      ShiftAmts.push_back(-1);
      continue;
    }

    auto *COp = dyn_cast_or_null<ConstantInt>(CElt);
    if (!COp)
      return nullptr;

    // Handle out of range shifts.
    // If LogicalShift - set to BitWidth (special case).
    // If ArithmeticShift - set to (BitWidth - 1) (sign splat).
    APInt ShiftVal = COp->getValue();
    if (ShiftVal.uge(BitWidth)) {
      AnyOutOfRange = LogicalShift;
      ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1);
      continue;
    }

    ShiftAmts.push_back((int)ShiftVal.getZExtValue());
  }

  // If all elements out of range or UNDEF, return vector of zeros/undefs.
  // ArithmeticShift should only hit this if they are all UNDEF.
  auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };
  if (llvm::all_of(ShiftAmts, OutOfRange)) {
    SmallVector<Constant *, 8> ConstantVec;
    for (int Idx : ShiftAmts) {
      if (Idx < 0) {
        ConstantVec.push_back(UndefValue::get(SVT));
      } else {
        assert(LogicalShift && "Logical shift expected");
        ConstantVec.push_back(ConstantInt::getNullValue(SVT));
      }
    }
    return ConstantVector::get(ConstantVec);
  }

  // We can't handle only some out of range values with generic logical shifts.
  if (AnyOutOfRange)
    return nullptr;

  // Build the shift amount constant vector.
  SmallVector<Constant *, 8> ShiftVecAmts;
  for (int Idx : ShiftAmts) {
    if (Idx < 0)
      ShiftVecAmts.push_back(UndefValue::get(SVT));
    else
      ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx));
  }
  auto ShiftVec = ConstantVector::get(ShiftVecAmts);

  if (ShiftLeft)
    return Builder.CreateShl(Vec, ShiftVec);

  if (LogicalShift)
    return Builder.CreateLShr(Vec, ShiftVec);

  return Builder.CreateAShr(Vec, ShiftVec);
}
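
// Rough sketch (value names are illustrative): with in-range constant amounts,
//   %r = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %v,
//                                              <4 x i32> <i32 0, i32 1, i32 2, i32 3>)
// becomes
//   %r = lshr <4 x i32> %v, <i32 0, i32 1, i32 2, i32 3>
// whereas a mix of in-range and out-of-range logical amounts is left
// unsimplified, as handled above.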

static Value *simplifyX86pack(IntrinsicInst &II,
                              InstCombiner::BuilderTy &Builder, bool IsSigned) {
  Value *Arg0 = II.getArgOperand(0);
  Value *Arg1 = II.getArgOperand(1);
  Type *ResTy = II.getType();

  // Fast all undef handling.
  if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
    return UndefValue::get(ResTy);

  Type *ArgTy = Arg0->getType();
  unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
  unsigned NumSrcElts = ArgTy->getVectorNumElements();
  assert(ResTy->getVectorNumElements() == (2 * NumSrcElts) &&
         "Unexpected packing types");

  unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
  unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
  unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits();
  assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) &&
         "Unexpected packing types");

  // Constant folding.
  if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
    return nullptr;

  // Clamp Values - signed/unsigned both use signed clamp values, but they
  // differ on the min/max values.
  APInt MinValue, MaxValue;
  if (IsSigned) {
    // PACKSS: Truncate signed value with signed saturation.
    // Source values less than dst minint are saturated to minint.
    // Source values greater than dst maxint are saturated to maxint.
    MinValue =
        APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
    MaxValue =
        APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
  } else {
    // PACKUS: Truncate signed value with unsigned saturation.
    // Source values less than zero are saturated to zero.
    // Source values greater than dst maxuint are saturated to maxuint.
    MinValue = APInt::getNullValue(SrcScalarSizeInBits);
    MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits);
  }

  auto *MinC = Constant::getIntegerValue(ArgTy, MinValue);
  auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue);
  Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0);
  Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1);
  Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0);
  Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1);

  // Shuffle clamped args together at the lane level.
  SmallVector<unsigned, 32> PackMask;
  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
    for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
      PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane));
    for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
      PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts);
  }
  auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask);

  // Truncate to dst size.
  return Builder.CreateTrunc(Shuffle, ResTy);
}
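
// Rough sketch of the constant-input path above for PACKSSWB
// (llvm.x86.sse2.packsswb.128): each <8 x i16> argument is clamped to
// [-128, 127] with icmp/select, the clamped vectors are concatenated lane by
// lane with a shufflevector, and the result is truncated to <16 x i8>.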

static Value *simplifyX86movmsk(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  Value *Arg = II.getArgOperand(0);
  Type *ResTy = II.getType();
  Type *ArgTy = Arg->getType();

  // movmsk(undef) -> zero as we must ensure the upper bits are zero.
  if (isa<UndefValue>(Arg))
    return Constant::getNullValue(ResTy);

  // We can't easily peek through x86_mmx types.
  if (!ArgTy->isVectorTy())
    return nullptr;

  // Expand MOVMSK to compare/bitcast/zext:
  // e.g. PMOVMSKB(v16i8 x):
  // %cmp = icmp slt <16 x i8> %x, zeroinitializer
  // %int = bitcast <16 x i1> %cmp to i16
  // %res = zext i16 %int to i32
  unsigned NumElts = ArgTy->getVectorNumElements();
  Type *IntegerVecTy = VectorType::getInteger(cast<VectorType>(ArgTy));
  Type *IntegerTy = Builder.getIntNTy(NumElts);

  Value *Res = Builder.CreateBitCast(Arg, IntegerVecTy);
  Res = Builder.CreateICmpSLT(Res, Constant::getNullValue(IntegerVecTy));
  Res = Builder.CreateBitCast(Res, IntegerTy);
  Res = Builder.CreateZExtOrTrunc(Res, ResTy);
  return Res;
}

static Value *simplifyX86addcarry(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  Value *CarryIn = II.getArgOperand(0);
  Value *Op1 = II.getArgOperand(1);
  Value *Op2 = II.getArgOperand(2);
  Type *RetTy = II.getType();
  Type *OpTy = Op1->getType();
  assert(RetTy->getStructElementType(0)->isIntegerTy(8) &&
         RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() &&
         "Unexpected types for x86 addcarry");

  // If carry-in is zero, this is just an unsigned add with overflow.
  if (match(CarryIn, m_ZeroInt())) {
    Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy,
                                          { Op1, Op2 });
    // The types have to be adjusted to match the x86 call types.
    Value *UAddResult = Builder.CreateExtractValue(UAdd, 0);
    Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1),
                                       Builder.getInt8Ty());
    Value *Res = UndefValue::get(RetTy);
    Res = Builder.CreateInsertValue(Res, UAddOV, 0);
    return Builder.CreateInsertValue(Res, UAddResult, 1);
  }

  return nullptr;
}
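
// Rough sketch (value names are illustrative): with a zero carry-in,
//   %r = call { i8, i32 } @llvm.x86.addcarry.32(i8 0, i32 %a, i32 %b)
// is rewritten via the generic overflow intrinsic:
//   %u   = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
//   %sum = extractvalue { i32, i1 } %u, 0
//   %ovf = extractvalue { i32, i1 } %u, 1
//   %cf  = zext i1 %ovf to i8
// and %cf/%sum are reassembled into the { i8, i32 } result with insertvalue.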

static Value *simplifyX86insertps(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
  if (!CInt)
    return nullptr;

  VectorType *VecTy = cast<VectorType>(II.getType());
  assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");

  // The immediate permute control byte looks like this:
  //  [3:0] - zero mask for each 32-bit lane
  //  [5:4] - select one 32-bit destination lane
  //  [7:6] - select one 32-bit source lane

  uint8_t Imm = CInt->getZExtValue();
  uint8_t ZMask = Imm & 0xf;
  uint8_t DestLane = (Imm >> 4) & 0x3;
  uint8_t SourceLane = (Imm >> 6) & 0x3;

  ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);

  // If all zero mask bits are set, this was just a weird way to
  // generate a zero vector.
  if (ZMask == 0xf)
    return ZeroVector;

  // Initialize by passing all of the first source bits through.
  uint32_t ShuffleMask[4] = { 0, 1, 2, 3 };

  // We may replace the second operand with the zero vector.
  Value *V1 = II.getArgOperand(1);

  if (ZMask) {
    // If the zero mask is being used with a single input or the zero mask
    // overrides the destination lane, this is a shuffle with the zero vector.
    if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
        (ZMask & (1 << DestLane))) {
      V1 = ZeroVector;
      // We may still move 32-bits of the first source vector from one lane
      // to another.
      ShuffleMask[DestLane] = SourceLane;
      // The zero mask may override the previous insert operation.
      for (unsigned i = 0; i < 4; ++i)
        if ((ZMask >> i) & 0x1)
          ShuffleMask[i] = i + 4;
    } else {
      // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
      return nullptr;
    }
  } else {
    // Replace the selected destination lane with the selected source lane.
    ShuffleMask[DestLane] = SourceLane + 4;
  }

  return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
}
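
// Worked example of the immediate decoding above: an insertps immediate of
// 0x40 selects source lane 1, destination lane 0 and no zero mask, so the
// call becomes a shufflevector of the two operands with mask <5, 1, 2, 3>
// (indices 4-7 name elements of the second operand).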

/// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding
/// or conversion to a shuffle vector.
static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
                               ConstantInt *CILength, ConstantInt *CIIndex,
                               InstCombiner::BuilderTy &Builder) {
  auto LowConstantHighUndef = [&](uint64_t Val) {
    Type *IntTy64 = Type::getInt64Ty(II.getContext());
    Constant *Args[] = {ConstantInt::get(IntTy64, Val),
                        UndefValue::get(IntTy64)};
    return ConstantVector::get(Args);
  };

  // See if we're dealing with constant values.
  Constant *C0 = dyn_cast<Constant>(Op0);
  ConstantInt *CI0 =
      C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
         : nullptr;

  // Attempt to constant fold.
  if (CILength && CIIndex) {
    // From AMD documentation: "The bit index and field length are each six
    // bits in length other bits of the field are ignored."
    APInt APIndex = CIIndex->getValue().zextOrTrunc(6);
    APInt APLength = CILength->getValue().zextOrTrunc(6);

    unsigned Index = APIndex.getZExtValue();

    // From AMD documentation: "a value of zero in the field length is
    // defined as length of 64".
    unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();

    // From AMD documentation: "If the sum of the bit index + length field
    // is greater than 64, the results are undefined".
    unsigned End = Index + Length;

    // Note that both field index and field length are 8-bit quantities.
    // Since variables 'Index' and 'Length' are unsigned values
    // obtained from zero-extending field index and field length
    // respectively, their sum should never wrap around.
    if (End > 64)
      return UndefValue::get(II.getType());

    // If we are inserting whole bytes, we can convert this to a shuffle.
    // Lowering can recognize EXTRQI shuffle masks.
    if ((Length % 8) == 0 && (Index % 8) == 0) {
      // Convert bit indices to byte indices.
      Length /= 8;
      Index /= 8;

      Type *IntTy8 = Type::getInt8Ty(II.getContext());
      Type *IntTy32 = Type::getInt32Ty(II.getContext());
      VectorType *ShufTy = VectorType::get(IntTy8, 16);

      SmallVector<Constant *, 16> ShuffleMask;
      for (int i = 0; i != (int)Length; ++i)
        ShuffleMask.push_back(
            Constant::getIntegerValue(IntTy32, APInt(32, i + Index)));
      for (int i = Length; i != 8; ++i)
        ShuffleMask.push_back(
            Constant::getIntegerValue(IntTy32, APInt(32, i + 16)));
      for (int i = 8; i != 16; ++i)
        ShuffleMask.push_back(UndefValue::get(IntTy32));

      Value *SV = Builder.CreateShuffleVector(
          Builder.CreateBitCast(Op0, ShufTy),
          ConstantAggregateZero::get(ShufTy), ConstantVector::get(ShuffleMask));
      return Builder.CreateBitCast(SV, II.getType());
    }

    // Constant Fold - shift Index'th bit to lowest position and mask off
    // Length bits.
    if (CI0) {
      APInt Elt = CI0->getValue();
      Elt.lshrInPlace(Index);
      Elt = Elt.zextOrTrunc(Length);
      return LowConstantHighUndef(Elt.getZExtValue());
    }

    // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI.
    if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) {
      Value *Args[] = {Op0, CILength, CIIndex};
      Module *M = II.getModule();
      Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi);
      return Builder.CreateCall(F, Args);
    }
  }

  // Constant Fold - extraction from zero is always {zero, undef}.
  if (CI0 && CI0->isZero())
    return LowConstantHighUndef(0);

  return nullptr;
}
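
// Worked example of the constant fold above: extracting Length = 8 bits at
// bit Index = 8 from a source whose low element is 0x1122334455667788 yields
// (0x1122334455667788 >> 8) & 0xff = 0x77, returned as {i64 0x77, i64 undef}.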

/// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant
/// folding or conversion to a shuffle vector.
static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
                                 APInt APLength, APInt APIndex,
                                 InstCombiner::BuilderTy &Builder) {
  // From AMD documentation: "The bit index and field length are each six bits
  // in length other bits of the field are ignored."
  APIndex = APIndex.zextOrTrunc(6);
  APLength = APLength.zextOrTrunc(6);

  // Attempt to constant fold.
  unsigned Index = APIndex.getZExtValue();

  // From AMD documentation: "a value of zero in the field length is
  // defined as length of 64".
  unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();

  // From AMD documentation: "If the sum of the bit index + length field
  // is greater than 64, the results are undefined".
  unsigned End = Index + Length;

  // Note that both field index and field length are 8-bit quantities.
  // Since variables 'Index' and 'Length' are unsigned values
  // obtained from zero-extending field index and field length
  // respectively, their sum should never wrap around.
  if (End > 64)
    return UndefValue::get(II.getType());

  // If we are inserting whole bytes, we can convert this to a shuffle.
  // Lowering can recognize INSERTQI shuffle masks.
  if ((Length % 8) == 0 && (Index % 8) == 0) {
    // Convert bit indices to byte indices.
    Length /= 8;
    Index /= 8;

    Type *IntTy8 = Type::getInt8Ty(II.getContext());
    Type *IntTy32 = Type::getInt32Ty(II.getContext());
    VectorType *ShufTy = VectorType::get(IntTy8, 16);

    SmallVector<Constant *, 16> ShuffleMask;
    for (int i = 0; i != (int)Index; ++i)
      ShuffleMask.push_back(Constant::getIntegerValue(IntTy32, APInt(32, i)));
    for (int i = 0; i != (int)Length; ++i)
      ShuffleMask.push_back(
          Constant::getIntegerValue(IntTy32, APInt(32, i + 16)));
    for (int i = Index + Length; i != 8; ++i)
      ShuffleMask.push_back(Constant::getIntegerValue(IntTy32, APInt(32, i)));
    for (int i = 8; i != 16; ++i)
      ShuffleMask.push_back(UndefValue::get(IntTy32));

    Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy),
                                            Builder.CreateBitCast(Op1, ShufTy),
                                            ConstantVector::get(ShuffleMask));
    return Builder.CreateBitCast(SV, II.getType());
  }

  // See if we're dealing with constant values.
  Constant *C0 = dyn_cast<Constant>(Op0);
  Constant *C1 = dyn_cast<Constant>(Op1);
  ConstantInt *CI00 =
      C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
         : nullptr;
  ConstantInt *CI10 =
      C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
         : nullptr;

  // Constant Fold - insert bottom Length bits starting at the Index'th bit.
  if (CI00 && CI10) {
    APInt V00 = CI00->getValue();
    APInt V10 = CI10->getValue();
    APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index);
    V00 = V00 & ~Mask;
    V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index);
    APInt Val = V00 | V10;
    Type *IntTy64 = Type::getInt64Ty(II.getContext());
    Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()),
                        UndefValue::get(IntTy64)};
    return ConstantVector::get(Args);
  }

  // If we were an INSERTQ call, we'll save demanded elements if we convert to
  // INSERTQI.
  if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) {
    Type *IntTy8 = Type::getInt8Ty(II.getContext());
    Constant *CILength = ConstantInt::get(IntTy8, Length, false);
    Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);

    Value *Args[] = {Op0, Op1, CILength, CIIndex};
    Module *M = II.getModule();
    Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);
    return Builder.CreateCall(F, Args);
  }

  return nullptr;
}
|
|
|
|
|
2016-04-25 01:00:34 +08:00
|
|
|
/// Attempt to convert pshufb* to shufflevector if the mask is constant.
|
|
|
|
static Value *simplifyX86pshufb(const IntrinsicInst &II,
|
|
|
|
InstCombiner::BuilderTy &Builder) {
|
2016-04-30 05:34:54 +08:00
|
|
|
Constant *V = dyn_cast<Constant>(II.getArgOperand(1));
|
|
|
|
if (!V)
|
|
|
|
return nullptr;
|
|
|
|
|
2016-05-02 03:26:21 +08:00
|
|
|
auto *VecTy = cast<VectorType>(II.getType());
|
|
|
|
auto *MaskEltTy = Type::getInt32Ty(II.getContext());
|
|
|
|
unsigned NumElts = VecTy->getNumElements();
|
2016-12-11 08:23:50 +08:00
|
|
|
assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
|
2016-04-25 01:00:34 +08:00
|
|
|
"Unexpected number of elements in shuffle mask!");
|
2016-04-30 05:34:54 +08:00
|
|
|
|
2016-05-02 03:26:21 +08:00
|
|
|
// Construct a shuffle mask from constant integers or UNDEFs.
|
2016-12-11 08:23:50 +08:00
|
|
|
Constant *Indexes[64] = {nullptr};
|
2016-04-25 01:00:34 +08:00
|
|
|
|
2016-04-30 05:34:54 +08:00
|
|
|
// Each byte in the shuffle control mask forms an index to permute the
|
|
|
|
// corresponding byte in the destination operand.
|
|
|
|
for (unsigned I = 0; I < NumElts; ++I) {
|
|
|
|
Constant *COp = V->getAggregateElement(I);
|
2016-05-02 03:26:21 +08:00
|
|
|
if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
|
2016-04-30 05:34:54 +08:00
|
|
|
return nullptr;
|
|
|
|
|
2016-05-02 03:26:21 +08:00
|
|
|
if (isa<UndefValue>(COp)) {
|
|
|
|
Indexes[I] = UndefValue::get(MaskEltTy);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2016-04-30 05:34:54 +08:00
|
|
|
int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue();
|
|
|
|
|
|
|
|
// If the most significant bit (bit[7]) of each byte of the shuffle
|
|
|
|
// control mask is set, then zero is written in the result byte.
|
|
|
|
// The zero vector is in the right-hand side of the resulting
|
|
|
|
// shufflevector.
|
|
|
|
|
2016-05-02 03:26:21 +08:00
|
|
|
// The value of each index for the high 128-bit lane is the least
|
|
|
|
// significant 4 bits of the respective shuffle control byte.
|
|
|
|
Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0);
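      // For example, in a v32i8 pshufb a control byte of 0x83 at I == 17 maps
      // to index 32 + 16 == 48 (a zero element of the RHS vector), while 0x03
      // maps to 3 + 16 == 19, i.e. byte 19 of the source's upper lane.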
|
|
|
|
Indexes[I] = ConstantInt::get(MaskEltTy, Index);
|
2016-04-30 05:34:54 +08:00
|
|
|
}
|
2016-04-25 01:00:34 +08:00
|
|
|
|
2016-05-02 03:26:21 +08:00
|
|
|
auto ShuffleMask = ConstantVector::get(makeArrayRef(Indexes, NumElts));
|
2016-04-25 01:00:34 +08:00
|
|
|
auto V1 = II.getArgOperand(0);
|
2016-05-02 03:26:21 +08:00
|
|
|
auto V2 = Constant::getNullValue(VecTy);
|
2016-04-25 01:00:34 +08:00
|
|
|
return Builder.CreateShuffleVector(V1, V2, ShuffleMask);
|
|
|
|
}
|
|
|
|
|
2016-04-25 01:23:46 +08:00
|
|
|
/// Attempt to convert vpermilvar* to shufflevector if the mask is constant.
|
|
|
|
static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
|
|
|
|
InstCombiner::BuilderTy &Builder) {
|
2016-04-30 15:23:30 +08:00
|
|
|
Constant *V = dyn_cast<Constant>(II.getArgOperand(1));
|
|
|
|
if (!V)
|
|
|
|
return nullptr;
|
2016-04-25 01:23:46 +08:00
|
|
|
|
2016-12-11 09:59:36 +08:00
|
|
|
auto *VecTy = cast<VectorType>(II.getType());
|
2016-05-02 04:22:42 +08:00
|
|
|
auto *MaskEltTy = Type::getInt32Ty(II.getContext());
|
2016-12-11 09:59:36 +08:00
|
|
|
unsigned NumElts = VecTy->getVectorNumElements();
|
|
|
|
bool IsPD = VecTy->getScalarType()->isDoubleTy();
|
|
|
|
unsigned NumLaneElts = IsPD ? 2 : 4;
|
|
|
|
assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);
|
2016-04-25 01:23:46 +08:00
|
|
|
|
2016-05-02 04:22:42 +08:00
|
|
|
// Construct a shuffle mask from constant integers or UNDEFs.
|
2016-12-11 09:59:36 +08:00
|
|
|
Constant *Indexes[16] = {nullptr};
|
2016-04-30 15:23:30 +08:00
|
|
|
|
|
|
|
  // The intrinsics only read one or two bits; clear the rest.
|
2016-05-02 04:22:42 +08:00
|
|
|
for (unsigned I = 0; I < NumElts; ++I) {
|
2016-04-30 15:23:30 +08:00
|
|
|
Constant *COp = V->getAggregateElement(I);
|
2016-05-02 04:22:42 +08:00
|
|
|
if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
|
2016-04-30 15:23:30 +08:00
|
|
|
return nullptr;
|
|
|
|
|
2016-05-02 04:22:42 +08:00
|
|
|
if (isa<UndefValue>(COp)) {
|
|
|
|
Indexes[I] = UndefValue::get(MaskEltTy);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
APInt Index = cast<ConstantInt>(COp)->getValue();
|
|
|
|
Index = Index.zextOrTrunc(32).getLoBits(2);
|
2016-04-30 15:23:30 +08:00
|
|
|
|
|
|
|
    // The PD variants use bit 1 to select the per-lane element index, so
|
|
|
|
// shift down to convert to generic shuffle mask index.
|
2016-12-11 09:59:36 +08:00
|
|
|
if (IsPD)
|
2017-04-19 01:14:21 +08:00
|
|
|
Index.lshrInPlace(1);
|
2016-05-02 04:22:42 +08:00
|
|
|
|
|
|
|
// The _256 variants are a bit trickier since the mask bits always index
|
|
|
|
// into the corresponding 128 half. In order to convert to a generic
|
|
|
|
// shuffle, we have to make that explicit.
|
2016-12-11 09:59:36 +08:00
|
|
|
Index += APInt(32, (I / NumLaneElts) * NumLaneElts);
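      // For example, for vpermilvar.ps.256 (NumElts == 8, NumLaneElts == 4) a
      // control value of 2 in element I == 5 becomes shuffle index 2 + 4 == 6.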
|
2016-04-25 01:23:46 +08:00
|
|
|
|
2016-05-02 04:22:42 +08:00
|
|
|
Indexes[I] = ConstantInt::get(MaskEltTy, Index);
|
2016-04-25 01:23:46 +08:00
|
|
|
}
|
|
|
|
|
2016-05-02 04:22:42 +08:00
|
|
|
auto ShuffleMask = ConstantVector::get(makeArrayRef(Indexes, NumElts));
|
2016-04-25 01:23:46 +08:00
|
|
|
auto V1 = II.getArgOperand(0);
|
|
|
|
auto V2 = UndefValue::get(V1->getType());
|
|
|
|
return Builder.CreateShuffleVector(V1, V2, ShuffleMask);
|
|
|
|
}
|
|
|
|
|
2016-05-02 00:41:22 +08:00
|
|
|
/// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
|
|
|
|
static Value *simplifyX86vpermv(const IntrinsicInst &II,
|
|
|
|
InstCombiner::BuilderTy &Builder) {
|
|
|
|
auto *V = dyn_cast<Constant>(II.getArgOperand(1));
|
|
|
|
if (!V)
|
|
|
|
return nullptr;
|
|
|
|
|
2016-05-02 04:43:02 +08:00
|
|
|
auto *VecTy = cast<VectorType>(II.getType());
|
|
|
|
auto *MaskEltTy = Type::getInt32Ty(II.getContext());
|
2016-05-02 00:41:22 +08:00
|
|
|
unsigned Size = VecTy->getNumElements();
|
2016-12-26 07:58:57 +08:00
|
|
|
assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&
|
|
|
|
"Unexpected shuffle mask size");
|
2016-05-02 00:41:22 +08:00
|
|
|
|
2016-05-02 04:43:02 +08:00
|
|
|
// Construct a shuffle mask from constant integers or UNDEFs.
|
2016-12-26 07:58:57 +08:00
|
|
|
Constant *Indexes[64] = {nullptr};
|
2016-05-02 00:41:22 +08:00
|
|
|
|
|
|
|
for (unsigned I = 0; I < Size; ++I) {
|
|
|
|
Constant *COp = V->getAggregateElement(I);
|
2016-05-02 04:43:02 +08:00
|
|
|
if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
|
2016-05-02 00:41:22 +08:00
|
|
|
return nullptr;
|
|
|
|
|
2016-05-02 04:43:02 +08:00
|
|
|
if (isa<UndefValue>(COp)) {
|
|
|
|
Indexes[I] = UndefValue::get(MaskEltTy);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2016-12-26 07:58:57 +08:00
|
|
|
uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
|
|
|
|
Index &= Size - 1;
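      // For example, for vpermd (Size == 8) a mask element of 11 wraps to
      // 11 & 7 == 3.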
|
2016-05-02 04:43:02 +08:00
|
|
|
Indexes[I] = ConstantInt::get(MaskEltTy, Index);
|
2016-05-02 00:41:22 +08:00
|
|
|
}
|
|
|
|
|
2016-05-02 04:43:02 +08:00
|
|
|
auto ShuffleMask = ConstantVector::get(makeArrayRef(Indexes, Size));
|
2016-05-02 00:41:22 +08:00
|
|
|
auto V1 = II.getArgOperand(0);
|
|
|
|
auto V2 = UndefValue::get(VecTy);
|
|
|
|
return Builder.CreateShuffleVector(V1, V2, ShuffleMask);
|
|
|
|
}
|
|
|
|
|
2019-03-20 11:36:05 +08:00
|
|
|
// TODO, Obvious Missing Transforms:
|
|
|
|
// * Narrow width by halves excluding zero/undef lanes
|
2019-04-25 09:18:56 +08:00
|
|
|
Value *InstCombiner::simplifyMaskedLoad(IntrinsicInst &II) {
|
2019-04-23 23:25:14 +08:00
|
|
|
Value *LoadPtr = II.getArgOperand(0);
|
|
|
|
unsigned Alignment = cast<ConstantInt>(II.getArgOperand(1))->getZExtValue();
|
|
|
|
|
2016-07-14 14:58:42 +08:00
|
|
|
// If the mask is all ones or undefs, this is a plain vector load of the 1st
|
|
|
|
// argument.
|
2019-04-23 23:25:14 +08:00
|
|
|
if (maskIsAllOneOrUndef(II.getArgOperand(2)))
|
2019-02-02 04:44:24 +08:00
|
|
|
return Builder.CreateAlignedLoad(II.getType(), LoadPtr, Alignment,
|
|
|
|
"unmaskedload");
|
2019-04-23 23:25:14 +08:00
|
|
|
|
|
|
|
// If we can unconditionally load from this address, replace with a
|
|
|
|
// load/select idiom. TODO: use DT for context sensitive query
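  // For example (illustrative IR), when %ptr is known dereferenceable:
  //   %v = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %ptr,
  //                         i32 4, <4 x i1> %mask, <4 x i32> %passthru)
  // becomes
  //   %l = load <4 x i32>, <4 x i32>* %ptr, align 4
  //   %v = select <4 x i1> %mask, <4 x i32> %l, <4 x i32> %passthru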
|
2019-10-21 23:10:26 +08:00
|
|
|
if (isDereferenceableAndAlignedPointer(
|
|
|
|
LoadPtr, II.getType(), MaybeAlign(Alignment),
|
|
|
|
II.getModule()->getDataLayout(), &II, nullptr)) {
|
2019-04-23 23:25:14 +08:00
|
|
|
Value *LI = Builder.CreateAlignedLoad(II.getType(), LoadPtr, Alignment,
|
|
|
|
"unmaskedload");
|
|
|
|
return Builder.CreateSelect(II.getArgOperand(2), LI, II.getArgOperand(3));
|
2016-02-02 01:00:10 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
|
2019-03-20 11:36:05 +08:00
|
|
|
// TODO, Obvious Missing Transforms:
|
|
|
|
// * Single constant active lane -> store
|
|
|
|
// * Narrow width by halves excluding zero/undef lanes
|
2019-03-21 02:44:58 +08:00
|
|
|
Instruction *InstCombiner::simplifyMaskedStore(IntrinsicInst &II) {
|
2016-02-02 03:39:52 +08:00
|
|
|
auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(3));
|
|
|
|
if (!ConstMask)
|
|
|
|
return nullptr;
|
|
|
|
|
|
|
|
// If the mask is all zeros, this instruction does nothing.
|
|
|
|
if (ConstMask->isNullValue())
|
2019-03-21 02:44:58 +08:00
|
|
|
return eraseInstFromFunction(II);
|
2016-02-02 03:39:52 +08:00
|
|
|
|
|
|
|
// If the mask is all ones, this is a plain vector store of the 1st argument.
|
|
|
|
if (ConstMask->isAllOnesValue()) {
|
|
|
|
Value *StorePtr = II.getArgOperand(1);
|
2019-10-22 20:55:32 +08:00
|
|
|
MaybeAlign Alignment(
|
|
|
|
cast<ConstantInt>(II.getArgOperand(2))->getZExtValue());
|
2016-02-02 03:39:52 +08:00
|
|
|
return new StoreInst(II.getArgOperand(0), StorePtr, false, Alignment);
|
|
|
|
}
|
|
|
|
|
2019-03-21 02:44:58 +08:00
|
|
|
// Use masked off lanes to simplify operands via SimplifyDemandedVectorElts
|
|
|
|
APInt DemandedElts = possiblyDemandedEltsInMask(ConstMask);
|
|
|
|
APInt UndefElts(DemandedElts.getBitWidth(), 0);
|
|
|
|
if (Value *V = SimplifyDemandedVectorElts(II.getOperand(0),
|
|
|
|
DemandedElts, UndefElts)) {
|
|
|
|
II.setOperand(0, V);
|
|
|
|
return &II;
|
|
|
|
}
|
|
|
|
|
2016-02-02 03:39:52 +08:00
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
|
2019-03-20 11:36:05 +08:00
|
|
|
// TODO, Obvious Missing Transforms:
|
|
|
|
// * Single constant active lane load -> load
|
|
|
|
// * Dereferenceable address & few lanes -> scalarize speculative load/selects
|
|
|
|
// * Adjacent vector addresses -> masked.load
|
|
|
|
// * Narrow width by halves excluding zero/undef lanes
|
2019-03-21 11:23:40 +08:00
|
|
|
// * Vector splat address w/known mask -> scalar load
|
|
|
|
// * Vector incrementing address -> vector masked load
|
2019-04-25 09:18:56 +08:00
|
|
|
Instruction *InstCombiner::simplifyMaskedGather(IntrinsicInst &II) {
|
2016-02-02 06:10:26 +08:00
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
|
2019-03-21 11:23:40 +08:00
|
|
|
// TODO, Obvious Missing Transforms:
|
|
|
|
// * Single constant active lane -> store
|
|
|
|
// * Adjacent vector addresses -> masked.store
|
|
|
|
// * Narrow store width by halves excluding zero/undef lanes
|
|
|
|
// * Vector splat address w/known mask -> scalar store
|
|
|
|
// * Vector incrementing address -> vector masked store
|
|
|
|
Instruction *InstCombiner::simplifyMaskedScatter(IntrinsicInst &II) {
|
|
|
|
auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(3));
|
|
|
|
if (!ConstMask)
|
|
|
|
return nullptr;
|
|
|
|
|
|
|
|
// If the mask is all zeros, a scatter does nothing.
|
|
|
|
if (ConstMask->isNullValue())
|
|
|
|
return eraseInstFromFunction(II);
|
|
|
|
|
|
|
|
// Use masked off lanes to simplify operands via SimplifyDemandedVectorElts
|
|
|
|
APInt DemandedElts = possiblyDemandedEltsInMask(ConstMask);
|
|
|
|
APInt UndefElts(DemandedElts.getBitWidth(), 0);
|
|
|
|
if (Value *V = SimplifyDemandedVectorElts(II.getOperand(0),
|
|
|
|
DemandedElts, UndefElts)) {
|
|
|
|
II.setOperand(0, V);
|
|
|
|
return &II;
|
|
|
|
}
|
|
|
|
if (Value *V = SimplifyDemandedVectorElts(II.getOperand(1),
|
|
|
|
DemandedElts, UndefElts)) {
|
|
|
|
II.setOperand(1, V);
|
|
|
|
return &II;
|
|
|
|
}
|
|
|
|
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
|
2018-07-13 07:55:20 +08:00
|
|
|
/// This function transforms launder.invariant.group and strip.invariant.group
|
|
|
|
/// like:
|
|
|
|
/// launder(launder(%x)) -> launder(%x) (the result is not the argument)
|
|
|
|
/// launder(strip(%x)) -> launder(%x)
|
|
|
|
/// strip(strip(%x)) -> strip(%x) (the result is not the argument)
|
|
|
|
/// strip(launder(%x)) -> strip(%x)
|
|
|
|
/// This is legal because it preserves the most recent information about
|
|
|
|
/// the presence or absence of invariant.group.
|
|
|
|
static Instruction *simplifyInvariantGroupIntrinsic(IntrinsicInst &II,
|
|
|
|
InstCombiner &IC) {
|
|
|
|
auto *Arg = II.getArgOperand(0);
|
|
|
|
auto *StrippedArg = Arg->stripPointerCasts();
|
|
|
|
auto *StrippedInvariantGroupsArg = Arg->stripPointerCastsAndInvariantGroups();
|
|
|
|
if (StrippedArg == StrippedInvariantGroupsArg)
|
|
|
|
return nullptr; // No launders/strips to remove.
|
|
|
|
|
|
|
|
Value *Result = nullptr;
|
|
|
|
|
|
|
|
if (II.getIntrinsicID() == Intrinsic::launder_invariant_group)
|
|
|
|
Result = IC.Builder.CreateLaunderInvariantGroup(StrippedInvariantGroupsArg);
|
|
|
|
else if (II.getIntrinsicID() == Intrinsic::strip_invariant_group)
|
|
|
|
Result = IC.Builder.CreateStripInvariantGroup(StrippedInvariantGroupsArg);
|
|
|
|
else
|
|
|
|
llvm_unreachable(
|
|
|
|
"simplifyInvariantGroupIntrinsic only handles launder and strip");
|
|
|
|
if (Result->getType()->getPointerAddressSpace() !=
|
|
|
|
II.getType()->getPointerAddressSpace())
|
|
|
|
Result = IC.Builder.CreateAddrSpaceCast(Result, II.getType());
|
|
|
|
if (Result->getType() != II.getType())
|
|
|
|
Result = IC.Builder.CreateBitCast(Result, II.getType());
|
|
|
|
|
|
|
|
return cast<Instruction>(Result);
|
|
|
|
}
|
|
|
|
|
2016-08-19 04:43:50 +08:00
|
|
|
static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombiner &IC) {
|
|
|
|
assert((II.getIntrinsicID() == Intrinsic::cttz ||
|
|
|
|
II.getIntrinsicID() == Intrinsic::ctlz) &&
|
|
|
|
"Expected cttz or ctlz intrinsic");
|
2019-04-03 04:13:28 +08:00
|
|
|
bool IsTZ = II.getIntrinsicID() == Intrinsic::cttz;
|
2016-08-06 06:42:46 +08:00
|
|
|
Value *Op0 = II.getArgOperand(0);
|
2019-04-03 04:13:28 +08:00
|
|
|
Value *X;
|
|
|
|
// ctlz(bitreverse(x)) -> cttz(x)
|
|
|
|
// cttz(bitreverse(x)) -> ctlz(x)
|
|
|
|
if (match(Op0, m_BitReverse(m_Value(X)))) {
|
|
|
|
Intrinsic::ID ID = IsTZ ? Intrinsic::ctlz : Intrinsic::cttz;
|
|
|
|
Function *F = Intrinsic::getDeclaration(II.getModule(), ID, II.getType());
|
|
|
|
return CallInst::Create(F, {X, II.getArgOperand(1)});
|
|
|
|
}
|
2016-08-06 06:42:46 +08:00
|
|
|
|
2019-06-21 23:26:22 +08:00
|
|
|
if (IsTZ) {
|
|
|
|
// cttz(-x) -> cttz(x)
|
|
|
|
if (match(Op0, m_Neg(m_Value(X)))) {
|
|
|
|
II.setOperand(0, X);
|
|
|
|
return &II;
|
|
|
|
}
|
|
|
|
|
|
|
|
// cttz(abs(x)) -> cttz(x)
|
|
|
|
// cttz(nabs(x)) -> cttz(x)
|
|
|
|
Value *Y;
|
|
|
|
SelectPatternFlavor SPF = matchSelectPattern(Op0, X, Y).Flavor;
|
|
|
|
if (SPF == SPF_ABS || SPF == SPF_NABS) {
|
|
|
|
II.setOperand(0, X);
|
|
|
|
return &II;
|
|
|
|
}
|
2019-06-21 01:04:14 +08:00
|
|
|
}
|
|
|
|
|
2017-05-25 00:53:07 +08:00
|
|
|
KnownBits Known = IC.computeKnownBits(Op0, 0, &II);
|
2016-08-06 06:42:46 +08:00
|
|
|
|
|
|
|
// Create a mask for bits above (ctlz) or below (cttz) the first known one.
|
2017-05-13 01:20:30 +08:00
|
|
|
unsigned PossibleZeros = IsTZ ? Known.countMaxTrailingZeros()
|
|
|
|
: Known.countMaxLeadingZeros();
|
|
|
|
unsigned DefiniteZeros = IsTZ ? Known.countMinTrailingZeros()
|
|
|
|
: Known.countMinLeadingZeros();
|
2016-08-06 06:42:46 +08:00
|
|
|
|
|
|
|
// If all bits above (ctlz) or below (cttz) the first known one are known
|
|
|
|
// zero, this value is constant.
|
|
|
|
// FIXME: This should be in InstSimplify because we're replacing an
|
|
|
|
// instruction with a constant.
|
2017-04-27 12:51:25 +08:00
|
|
|
if (PossibleZeros == DefiniteZeros) {
|
2017-06-04 02:50:32 +08:00
|
|
|
auto *C = ConstantInt::get(Op0->getType(), DefiniteZeros);
|
2016-08-19 04:43:50 +08:00
|
|
|
return IC.replaceInstUsesWith(II, C);
|
|
|
|
}
|
|
|
|
|
|
|
|
// If the input to cttz/ctlz is known to be non-zero,
|
|
|
|
// then change the 'ZeroIsUndef' parameter to 'true'
|
|
|
|
// because we know the zero behavior can't affect the result.
|
2017-06-07 15:40:37 +08:00
|
|
|
if (!Known.One.isNullValue() ||
|
2017-05-27 02:23:57 +08:00
|
|
|
isKnownNonZero(Op0, IC.getDataLayout(), 0, &IC.getAssumptionCache(), &II,
|
|
|
|
&IC.getDominatorTree())) {
|
2016-08-19 04:43:50 +08:00
|
|
|
if (!match(II.getArgOperand(1), m_One())) {
|
2017-07-08 07:16:26 +08:00
|
|
|
II.setOperand(1, IC.Builder.getTrue());
|
2016-08-19 04:43:50 +08:00
|
|
|
return &II;
|
|
|
|
}
|
|
|
|
}
|
2016-08-06 06:42:46 +08:00
|
|
|
|
2017-06-22 00:32:35 +08:00
|
|
|
// Add range metadata since known bits can't completely reflect what we know.
|
|
|
|
// TODO: Handle splat vectors.
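  // For example, for an i32 cttz where the low three bits are known zero and
  // bit 4 is known one (DefiniteZeros == 3, PossibleZeros == 4), the call
  // gets !range [3, 5).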
|
|
|
|
auto *IT = dyn_cast<IntegerType>(Op0->getType());
|
|
|
|
if (IT && IT->getBitWidth() != 1 && !II.getMetadata(LLVMContext::MD_range)) {
|
|
|
|
Metadata *LowAndHigh[] = {
|
|
|
|
ConstantAsMetadata::get(ConstantInt::get(IT, DefiniteZeros)),
|
|
|
|
ConstantAsMetadata::get(ConstantInt::get(IT, PossibleZeros + 1))};
|
|
|
|
II.setMetadata(LLVMContext::MD_range,
|
|
|
|
MDNode::get(II.getContext(), LowAndHigh));
|
|
|
|
return &II;
|
|
|
|
}
|
|
|
|
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
|
|
|
|
static Instruction *foldCtpop(IntrinsicInst &II, InstCombiner &IC) {
|
|
|
|
assert(II.getIntrinsicID() == Intrinsic::ctpop &&
|
|
|
|
"Expected ctpop intrinsic");
|
|
|
|
Value *Op0 = II.getArgOperand(0);
|
2019-04-03 16:08:44 +08:00
|
|
|
Value *X;
|
|
|
|
// ctpop(bitreverse(x)) -> ctpop(x)
|
|
|
|
// ctpop(bswap(x)) -> ctpop(x)
|
|
|
|
if (match(Op0, m_BitReverse(m_Value(X))) || match(Op0, m_BSwap(m_Value(X)))) {
|
|
|
|
II.setOperand(0, X);
|
|
|
|
return &II;
|
|
|
|
}
|
|
|
|
|
2017-06-22 00:32:35 +08:00
|
|
|
// FIXME: Try to simplify vectors of integers.
|
|
|
|
auto *IT = dyn_cast<IntegerType>(Op0->getType());
|
|
|
|
if (!IT)
|
|
|
|
return nullptr;
|
|
|
|
|
|
|
|
unsigned BitWidth = IT->getBitWidth();
|
|
|
|
KnownBits Known(BitWidth);
|
|
|
|
IC.computeKnownBits(Op0, Known, 0, &II);
|
|
|
|
|
|
|
|
unsigned MinCount = Known.countMinPopulation();
|
|
|
|
unsigned MaxCount = Known.countMaxPopulation();
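  // For example, if two bits of Op0 are known one, three bits are unknown and
  // the rest are known zero, then MinCount == 2 and MaxCount == 5, giving
  // !range [2, 6) below.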
|
|
|
|
|
|
|
|
// Add range metadata since known bits can't completely reflect what we know.
|
|
|
|
if (IT->getBitWidth() != 1 && !II.getMetadata(LLVMContext::MD_range)) {
|
|
|
|
Metadata *LowAndHigh[] = {
|
|
|
|
ConstantAsMetadata::get(ConstantInt::get(IT, MinCount)),
|
|
|
|
ConstantAsMetadata::get(ConstantInt::get(IT, MaxCount + 1))};
|
|
|
|
II.setMetadata(LLVMContext::MD_range,
|
|
|
|
MDNode::get(II.getContext(), LowAndHigh));
|
|
|
|
return &II;
|
|
|
|
}
|
|
|
|
|
2016-08-06 06:42:46 +08:00
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
|
2016-03-01 07:16:48 +08:00
|
|
|
// TODO: If the x86 backend knew how to convert a bool vector mask back to an
|
|
|
|
// XMM register mask efficiently, we could transform all x86 masked intrinsics
|
|
|
|
// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
|
|
|
|
static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
|
|
|
|
Value *Ptr = II.getOperand(0);
|
|
|
|
Value *Mask = II.getOperand(1);
|
2016-04-13 07:16:23 +08:00
|
|
|
Constant *ZeroVec = Constant::getNullValue(II.getType());
|
2016-03-01 07:16:48 +08:00
|
|
|
|
|
|
|
// Special case a zero mask since that's not a ConstantDataVector.
|
2016-04-13 07:16:23 +08:00
|
|
|
// This masked load instruction creates a zero vector.
|
2016-03-01 07:16:48 +08:00
|
|
|
if (isa<ConstantAggregateZero>(Mask))
|
2016-04-13 07:16:23 +08:00
|
|
|
return IC.replaceInstUsesWith(II, ZeroVec);
|
2016-03-01 07:16:48 +08:00
|
|
|
|
|
|
|
auto *ConstMask = dyn_cast<ConstantDataVector>(Mask);
|
|
|
|
if (!ConstMask)
|
|
|
|
return nullptr;
|
|
|
|
|
|
|
|
  // The mask is constant. Convert this x86 intrinsic to the LLVM intrinsic
|
|
|
|
// to allow target-independent optimizations.
|
|
|
|
|
|
|
|
// First, cast the x86 intrinsic scalar pointer to a vector pointer to match
|
|
|
|
// the LLVM intrinsic definition for the pointer argument.
|
|
|
|
unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
|
|
|
|
PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace);
|
2017-07-08 07:16:26 +08:00
|
|
|
Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
|
2016-03-01 07:16:48 +08:00
|
|
|
|
|
|
|
// Second, convert the x86 XMM integer vector mask to a vector of bools based
|
|
|
|
// on each element's most significant bit (the sign bit).
|
|
|
|
Constant *BoolMask = getNegativeIsTrueBoolVec(ConstMask);
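  // For example, a <4 x i32> mask of <0x80000000, 0, -1, 0> becomes
  // <true, false, true, false>: only the sign bit of each element matters.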
|
|
|
|
|
2016-04-13 07:16:23 +08:00
|
|
|
// The pass-through vector for an x86 masked load is a zero vector.
|
|
|
|
CallInst *NewMaskedLoad =
|
2020-01-21 18:21:31 +08:00
|
|
|
IC.Builder.CreateMaskedLoad(PtrCast, Align::None(), BoolMask, ZeroVec);
|
2016-03-01 07:16:48 +08:00
|
|
|
return IC.replaceInstUsesWith(II, NewMaskedLoad);
|
|
|
|
}
|
|
|
|
|
|
|
|
// TODO: If the x86 backend knew how to convert a bool vector mask back to an
|
|
|
|
// XMM register mask efficiently, we could transform all x86 masked intrinsics
|
|
|
|
// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
|
|
|
|
static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
|
|
|
|
Value *Ptr = II.getOperand(0);
|
|
|
|
Value *Mask = II.getOperand(1);
|
|
|
|
Value *Vec = II.getOperand(2);
|
|
|
|
|
|
|
|
// Special case a zero mask since that's not a ConstantDataVector:
|
|
|
|
// this masked store instruction does nothing.
|
|
|
|
if (isa<ConstantAggregateZero>(Mask)) {
|
|
|
|
IC.eraseInstFromFunction(II);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2016-03-12 23:16:59 +08:00
|
|
|
  // The SSE2 version is too weird (e.g., unaligned but non-temporal) to do
|
|
|
|
// anything else at this level.
|
|
|
|
if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
auto *ConstMask = dyn_cast<ConstantDataVector>(Mask);
|
|
|
|
if (!ConstMask)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
  // The mask is constant. Convert this x86 intrinsic to the LLVM intrinsic
|
|
|
|
// to allow target-independent optimizations.
|
|
|
|
|
|
|
|
// First, cast the x86 intrinsic scalar pointer to a vector pointer to match
|
|
|
|
// the LLVM intrinsic definition for the pointer argument.
|
|
|
|
unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
|
|
|
|
PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);
|
2017-07-08 07:16:26 +08:00
|
|
|
Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
|
|
|
|
|
|
|
|
// Second, convert the x86 XMM integer vector mask to a vector of bools based
|
|
|
|
// on each element's most significant bit (the sign bit).
|
|
|
|
Constant *BoolMask = getNegativeIsTrueBoolVec(ConstMask);
|
|
|
|
|
2017-07-08 07:16:26 +08:00
|
|
|
IC.Builder.CreateMaskedStore(Vec, PtrCast, 1, BoolMask);
|
|
|
|
|
|
|
|
// 'Replace uses' doesn't work for stores. Erase the original masked store.
|
|
|
|
IC.eraseInstFromFunction(II);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2017-02-28 07:08:49 +08:00
|
|
|
// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
|
|
|
|
//
|
|
|
|
// A single NaN input is folded to minnum, so we rely on that folding for
|
|
|
|
// handling NaNs.
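// For example, fmed3(1.0, 5.0, 3.0): Max3 == 5.0 == Src1, so the helper
// returns maxnum(1.0, 3.0) == 3.0, the median of the three inputs.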
|
|
|
|
static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
|
|
|
|
const APFloat &Src2) {
|
|
|
|
APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);
|
|
|
|
|
|
|
|
APFloat::cmpResult Cmp0 = Max3.compare(Src0);
|
|
|
|
assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
|
|
|
|
if (Cmp0 == APFloat::cmpEqual)
|
|
|
|
return maxnum(Src1, Src2);
|
|
|
|
|
|
|
|
APFloat::cmpResult Cmp1 = Max3.compare(Src1);
|
|
|
|
assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
|
|
|
|
if (Cmp1 == APFloat::cmpEqual)
|
|
|
|
return maxnum(Src0, Src2);
|
|
|
|
|
|
|
|
return maxnum(Src0, Src1);
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Convert a table lookup to shufflevector if the mask is constant.
|
|
|
|
/// This could benefit tbl1 if the mask is { 7,6,5,4,3,2,1,0 }, in
|
|
|
|
/// which case we could lower the shufflevector with rev64 instructions
|
|
|
|
/// as it's actually a byte reverse.
|
|
|
|
static Value *simplifyNeonTbl1(const IntrinsicInst &II,
|
|
|
|
InstCombiner::BuilderTy &Builder) {
|
|
|
|
// Bail out if the mask is not a constant.
|
|
|
|
auto *C = dyn_cast<Constant>(II.getArgOperand(1));
|
|
|
|
if (!C)
|
|
|
|
return nullptr;
|
|
|
|
|
|
|
|
auto *VecTy = cast<VectorType>(II.getType());
|
|
|
|
unsigned NumElts = VecTy->getNumElements();
|
|
|
|
|
|
|
|
// Only perform this transformation for <8 x i8> vector types.
|
|
|
|
if (!VecTy->getElementType()->isIntegerTy(8) || NumElts != 8)
|
|
|
|
return nullptr;
|
|
|
|
|
|
|
|
uint32_t Indexes[8];
|
|
|
|
|
|
|
|
for (unsigned I = 0; I < NumElts; ++I) {
|
|
|
|
Constant *COp = C->getAggregateElement(I);
|
|
|
|
|
|
|
|
if (!COp || !isa<ConstantInt>(COp))
|
|
|
|
return nullptr;
|
|
|
|
|
|
|
|
Indexes[I] = cast<ConstantInt>(COp)->getLimitedValue();
|
|
|
|
|
|
|
|
// Make sure the mask indices are in range.
|
|
|
|
if (Indexes[I] >= NumElts)
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
|
|
|
|
auto *ShuffleMask = ConstantDataVector::get(II.getContext(),
|
|
|
|
makeArrayRef(Indexes));
|
|
|
|
auto *V1 = II.getArgOperand(0);
|
|
|
|
auto *V2 = Constant::getNullValue(V1->getType());
|
|
|
|
return Builder.CreateShuffleVector(V1, V2, ShuffleMask);
|
|
|
|
}
|
|
|
|
|
2018-05-31 20:19:18 +08:00
|
|
|
/// Convert a vector load intrinsic into a simple llvm load instruction.
|
|
|
|
/// This is beneficial when the underlying object being addressed comes
|
|
|
|
/// from a constant, since we get constant-folding for free.
|
|
|
|
static Value *simplifyNeonVld1(const IntrinsicInst &II,
|
|
|
|
unsigned MemAlign,
|
|
|
|
InstCombiner::BuilderTy &Builder) {
|
|
|
|
auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));
|
|
|
|
|
|
|
|
if (!IntrAlign)
|
|
|
|
return nullptr;
|
|
|
|
|
|
|
|
unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign ?
|
|
|
|
MemAlign : IntrAlign->getLimitedValue();
|
|
|
|
|
|
|
|
if (!isPowerOf2_32(Alignment))
|
|
|
|
return nullptr;
|
|
|
|
|
|
|
|
auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0),
|
|
|
|
PointerType::get(II.getType(), 0));
|
2019-02-02 04:44:24 +08:00
|
|
|
return Builder.CreateAlignedLoad(II.getType(), BCastInst, Alignment);
|
2018-05-31 20:19:18 +08:00
|
|
|
}
|
|
|
|
|
2016-05-10 17:24:49 +08:00
|
|
|
// Returns true iff the 2 intrinsics have the same operands, limiting the
|
|
|
|
// comparison to the first NumOperands.
|
|
|
|
static bool haveSameOperands(const IntrinsicInst &I, const IntrinsicInst &E,
|
|
|
|
unsigned NumOperands) {
|
|
|
|
assert(I.getNumArgOperands() >= NumOperands && "Not enough operands");
|
|
|
|
assert(E.getNumArgOperands() >= NumOperands && "Not enough operands");
|
|
|
|
for (unsigned i = 0; i < NumOperands; i++)
|
|
|
|
if (I.getArgOperand(i) != E.getArgOperand(i))
|
|
|
|
return false;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Remove trivially empty start/end intrinsic ranges, i.e. a start
|
|
|
|
// immediately followed by an end (ignoring debuginfo or other
|
|
|
|
// start/end intrinsics in between). As this handles only the most trivial
|
|
|
|
// cases, tracking the nesting level is not needed:
|
|
|
|
//
|
|
|
|
// call @llvm.foo.start(i1 0) ; &I
|
|
|
|
// call @llvm.foo.start(i1 0)
|
|
|
|
// call @llvm.foo.end(i1 0) ; This one will not be skipped: it will be removed
|
|
|
|
// call @llvm.foo.end(i1 0)
|
|
|
|
static bool removeTriviallyEmptyRange(IntrinsicInst &I, unsigned StartID,
|
|
|
|
unsigned EndID, InstCombiner &IC) {
|
|
|
|
assert(I.getIntrinsicID() == StartID &&
|
|
|
|
"Start intrinsic does not have expected ID");
|
|
|
|
BasicBlock::iterator BI(I), BE(I.getParent()->end());
|
|
|
|
for (++BI; BI != BE; ++BI) {
|
|
|
|
if (auto *E = dyn_cast<IntrinsicInst>(BI)) {
|
|
|
|
if (isa<DbgInfoIntrinsic>(E) || E->getIntrinsicID() == StartID)
|
|
|
|
continue;
|
|
|
|
if (E->getIntrinsicID() == EndID &&
|
|
|
|
haveSameOperands(I, *E, E->getNumArgOperands())) {
|
|
|
|
IC.eraseInstFromFunction(*E);
|
|
|
|
IC.eraseInstFromFunction(I);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Convert NVVM intrinsics to target-generic LLVM code where possible.
|
|
|
|
static Instruction *SimplifyNVVMIntrinsic(IntrinsicInst *II, InstCombiner &IC) {
|
|
|
|
// Each NVVM intrinsic we can simplify can be replaced with one of:
|
|
|
|
//
|
|
|
|
// * an LLVM intrinsic,
|
|
|
|
// * an LLVM cast operation,
|
|
|
|
// * an LLVM binary operation, or
|
|
|
|
// * ad-hoc LLVM IR for the particular operation.
|
|
|
|
|
|
|
|
// Some transformations are only valid when the module's
|
|
|
|
// flush-denormals-to-zero (ftz) setting is true/false, whereas other
|
|
|
|
// transformations are valid regardless of the module's ftz setting.
|
|
|
|
enum FtzRequirementTy {
|
|
|
|
FTZ_Any, // Any ftz setting is ok.
|
|
|
|
FTZ_MustBeOn, // Transformation is valid only if ftz is on.
|
|
|
|
FTZ_MustBeOff, // Transformation is valid only if ftz is off.
|
|
|
|
};
|
|
|
|
// Classes of NVVM intrinsics that can't be replaced one-to-one with a
|
|
|
|
// target-generic intrinsic, cast op, or binary op but that we can nonetheless
|
|
|
|
// simplify.
|
|
|
|
enum SpecialCase {
|
|
|
|
SPC_Reciprocal,
|
|
|
|
};
|
|
|
|
|
|
|
|
// SimplifyAction is a poor-man's variant (plus an additional flag) that
|
|
|
|
// represents how to replace an NVVM intrinsic with target-generic LLVM IR.
|
|
|
|
struct SimplifyAction {
|
|
|
|
// Invariant: At most one of these Optionals has a value.
|
|
|
|
Optional<Intrinsic::ID> IID;
|
|
|
|
Optional<Instruction::CastOps> CastOp;
|
|
|
|
Optional<Instruction::BinaryOps> BinaryOp;
|
|
|
|
Optional<SpecialCase> Special;
|
|
|
|
|
|
|
|
FtzRequirementTy FtzRequirement = FTZ_Any;
|
|
|
|
|
|
|
|
SimplifyAction() = default;
|
|
|
|
|
|
|
|
SimplifyAction(Intrinsic::ID IID, FtzRequirementTy FtzReq)
|
|
|
|
: IID(IID), FtzRequirement(FtzReq) {}
|
|
|
|
|
|
|
|
// Cast operations don't have anything to do with FTZ, so we skip that
|
|
|
|
// argument.
|
|
|
|
SimplifyAction(Instruction::CastOps CastOp) : CastOp(CastOp) {}
|
|
|
|
|
|
|
|
SimplifyAction(Instruction::BinaryOps BinaryOp, FtzRequirementTy FtzReq)
|
|
|
|
: BinaryOp(BinaryOp), FtzRequirement(FtzReq) {}
|
|
|
|
|
|
|
|
SimplifyAction(SpecialCase Special, FtzRequirementTy FtzReq)
|
|
|
|
: Special(Special), FtzRequirement(FtzReq) {}
|
|
|
|
};
|
|
|
|
|
|
|
|
// Try to generate a SimplifyAction describing how to replace our
|
|
|
|
// IntrinsicInstr with target-generic LLVM IR.
|
|
|
|
const SimplifyAction Action = [II]() -> SimplifyAction {
|
|
|
|
switch (II->getIntrinsicID()) {
|
|
|
|
// NVVM intrinsics that map directly to LLVM intrinsics.
|
|
|
|
case Intrinsic::nvvm_ceil_d:
|
|
|
|
return {Intrinsic::ceil, FTZ_Any};
|
|
|
|
case Intrinsic::nvvm_ceil_f:
|
|
|
|
return {Intrinsic::ceil, FTZ_MustBeOff};
|
|
|
|
case Intrinsic::nvvm_ceil_ftz_f:
|
|
|
|
return {Intrinsic::ceil, FTZ_MustBeOn};
|
|
|
|
case Intrinsic::nvvm_fabs_d:
|
|
|
|
return {Intrinsic::fabs, FTZ_Any};
|
|
|
|
case Intrinsic::nvvm_fabs_f:
|
|
|
|
return {Intrinsic::fabs, FTZ_MustBeOff};
|
|
|
|
case Intrinsic::nvvm_fabs_ftz_f:
|
|
|
|
return {Intrinsic::fabs, FTZ_MustBeOn};
|
|
|
|
case Intrinsic::nvvm_floor_d:
|
|
|
|
return {Intrinsic::floor, FTZ_Any};
|
|
|
|
case Intrinsic::nvvm_floor_f:
|
|
|
|
return {Intrinsic::floor, FTZ_MustBeOff};
|
|
|
|
case Intrinsic::nvvm_floor_ftz_f:
|
|
|
|
return {Intrinsic::floor, FTZ_MustBeOn};
|
|
|
|
case Intrinsic::nvvm_fma_rn_d:
|
|
|
|
return {Intrinsic::fma, FTZ_Any};
|
|
|
|
case Intrinsic::nvvm_fma_rn_f:
|
|
|
|
return {Intrinsic::fma, FTZ_MustBeOff};
|
|
|
|
case Intrinsic::nvvm_fma_rn_ftz_f:
|
|
|
|
return {Intrinsic::fma, FTZ_MustBeOn};
|
|
|
|
case Intrinsic::nvvm_fmax_d:
|
|
|
|
return {Intrinsic::maxnum, FTZ_Any};
|
|
|
|
case Intrinsic::nvvm_fmax_f:
|
|
|
|
return {Intrinsic::maxnum, FTZ_MustBeOff};
|
|
|
|
case Intrinsic::nvvm_fmax_ftz_f:
|
|
|
|
return {Intrinsic::maxnum, FTZ_MustBeOn};
|
|
|
|
case Intrinsic::nvvm_fmin_d:
|
|
|
|
return {Intrinsic::minnum, FTZ_Any};
|
|
|
|
case Intrinsic::nvvm_fmin_f:
|
|
|
|
return {Intrinsic::minnum, FTZ_MustBeOff};
|
|
|
|
case Intrinsic::nvvm_fmin_ftz_f:
|
|
|
|
return {Intrinsic::minnum, FTZ_MustBeOn};
|
|
|
|
case Intrinsic::nvvm_round_d:
|
|
|
|
return {Intrinsic::round, FTZ_Any};
|
|
|
|
case Intrinsic::nvvm_round_f:
|
|
|
|
return {Intrinsic::round, FTZ_MustBeOff};
|
|
|
|
case Intrinsic::nvvm_round_ftz_f:
|
|
|
|
return {Intrinsic::round, FTZ_MustBeOn};
|
|
|
|
case Intrinsic::nvvm_sqrt_rn_d:
|
|
|
|
return {Intrinsic::sqrt, FTZ_Any};
|
|
|
|
case Intrinsic::nvvm_sqrt_f:
|
|
|
|
// nvvm_sqrt_f is a special case. For most intrinsics, foo_ftz_f is the
|
|
|
|
// ftz version, and foo_f is the non-ftz version. But nvvm_sqrt_f adopts
|
|
|
|
// the ftz-ness of the surrounding code. sqrt_rn_f and sqrt_rn_ftz_f are
|
|
|
|
// the versions with explicit ftz-ness.
|
|
|
|
return {Intrinsic::sqrt, FTZ_Any};
|
|
|
|
case Intrinsic::nvvm_sqrt_rn_f:
|
|
|
|
return {Intrinsic::sqrt, FTZ_MustBeOff};
|
|
|
|
case Intrinsic::nvvm_sqrt_rn_ftz_f:
|
|
|
|
return {Intrinsic::sqrt, FTZ_MustBeOn};
|
|
|
|
case Intrinsic::nvvm_trunc_d:
|
|
|
|
return {Intrinsic::trunc, FTZ_Any};
|
|
|
|
case Intrinsic::nvvm_trunc_f:
|
|
|
|
return {Intrinsic::trunc, FTZ_MustBeOff};
|
|
|
|
case Intrinsic::nvvm_trunc_ftz_f:
|
|
|
|
return {Intrinsic::trunc, FTZ_MustBeOn};
|
|
|
|
|
|
|
|
// NVVM intrinsics that map to LLVM cast operations.
|
|
|
|
//
|
|
|
|
// Note that llvm's target-generic conversion operators correspond to the rz
|
|
|
|
// (round to zero) versions of the nvvm conversion intrinsics, even though
|
|
|
|
// most everything else here uses the rn (round to nearest even) nvvm ops.
|
|
|
|
case Intrinsic::nvvm_d2i_rz:
|
|
|
|
case Intrinsic::nvvm_f2i_rz:
|
|
|
|
case Intrinsic::nvvm_d2ll_rz:
|
|
|
|
case Intrinsic::nvvm_f2ll_rz:
|
|
|
|
return {Instruction::FPToSI};
|
|
|
|
case Intrinsic::nvvm_d2ui_rz:
|
|
|
|
case Intrinsic::nvvm_f2ui_rz:
|
|
|
|
case Intrinsic::nvvm_d2ull_rz:
|
|
|
|
case Intrinsic::nvvm_f2ull_rz:
|
|
|
|
return {Instruction::FPToUI};
|
|
|
|
case Intrinsic::nvvm_i2d_rz:
|
|
|
|
case Intrinsic::nvvm_i2f_rz:
|
|
|
|
case Intrinsic::nvvm_ll2d_rz:
|
|
|
|
case Intrinsic::nvvm_ll2f_rz:
|
|
|
|
return {Instruction::SIToFP};
|
|
|
|
case Intrinsic::nvvm_ui2d_rz:
|
|
|
|
case Intrinsic::nvvm_ui2f_rz:
|
|
|
|
case Intrinsic::nvvm_ull2d_rz:
|
|
|
|
case Intrinsic::nvvm_ull2f_rz:
|
|
|
|
return {Instruction::UIToFP};
|
|
|
|
|
|
|
|
// NVVM intrinsics that map to LLVM binary ops.
|
|
|
|
case Intrinsic::nvvm_add_rn_d:
|
|
|
|
return {Instruction::FAdd, FTZ_Any};
|
|
|
|
case Intrinsic::nvvm_add_rn_f:
|
|
|
|
return {Instruction::FAdd, FTZ_MustBeOff};
|
|
|
|
case Intrinsic::nvvm_add_rn_ftz_f:
|
|
|
|
return {Instruction::FAdd, FTZ_MustBeOn};
|
|
|
|
case Intrinsic::nvvm_mul_rn_d:
|
|
|
|
return {Instruction::FMul, FTZ_Any};
|
|
|
|
case Intrinsic::nvvm_mul_rn_f:
|
|
|
|
return {Instruction::FMul, FTZ_MustBeOff};
|
|
|
|
case Intrinsic::nvvm_mul_rn_ftz_f:
|
|
|
|
return {Instruction::FMul, FTZ_MustBeOn};
|
|
|
|
case Intrinsic::nvvm_div_rn_d:
|
|
|
|
return {Instruction::FDiv, FTZ_Any};
|
|
|
|
case Intrinsic::nvvm_div_rn_f:
|
|
|
|
return {Instruction::FDiv, FTZ_MustBeOff};
|
|
|
|
case Intrinsic::nvvm_div_rn_ftz_f:
|
|
|
|
return {Instruction::FDiv, FTZ_MustBeOn};
|
|
|
|
|
|
|
|
// The remainder of cases are NVVM intrinsics that map to LLVM idioms, but
|
|
|
|
// need special handling.
|
|
|
|
//
|
2017-07-11 14:04:59 +08:00
|
|
|
// We seem to be missing intrinsics for rcp.approx.{ftz.}f32, which is just
|
|
|
|
// as well.
|
|
|
|
case Intrinsic::nvvm_rcp_rn_d:
|
|
|
|
return {SPC_Reciprocal, FTZ_Any};
|
|
|
|
case Intrinsic::nvvm_rcp_rn_f:
|
|
|
|
return {SPC_Reciprocal, FTZ_MustBeOff};
|
|
|
|
case Intrinsic::nvvm_rcp_rn_ftz_f:
|
|
|
|
return {SPC_Reciprocal, FTZ_MustBeOn};
|
|
|
|
|
|
|
|
// We do not currently simplify intrinsics that give an approximate answer.
|
|
|
|
// These include:
|
|
|
|
//
|
|
|
|
// - nvvm_cos_approx_{f,ftz_f}
|
|
|
|
// - nvvm_ex2_approx_{d,f,ftz_f}
|
|
|
|
// - nvvm_lg2_approx_{d,f,ftz_f}
|
|
|
|
// - nvvm_sin_approx_{f,ftz_f}
|
|
|
|
// - nvvm_sqrt_approx_{f,ftz_f}
|
|
|
|
// - nvvm_rsqrt_approx_{d,f,ftz_f}
|
|
|
|
// - nvvm_div_approx_{ftz_d,ftz_f,f}
|
|
|
|
// - nvvm_rcp_approx_ftz_d
|
|
|
|
//
|
|
|
|
// Ideally we'd encode them as e.g. "fast call @llvm.cos", where "fast"
|
|
|
|
// means that fastmath is enabled in the intrinsic. Unfortunately only
|
|
|
|
// binary operators (currently) have a fastmath bit in SelectionDAG, so this
|
|
|
|
// information gets lost and we can't select on it.
|
|
|
|
//
|
|
|
|
// TODO: div and rcp are lowered to a binary op, so these we could in theory
|
|
|
|
// lower them to "fast fdiv".
|
|
|
|
|
|
|
|
default:
|
|
|
|
return {};
|
|
|
|
}
|
|
|
|
}();
|
|
|
|
|
|
|
|
// If Action.FtzRequirementTy is not satisfied by the module's ftz state, we
|
|
|
|
// can bail out now. (Notice that in the case that IID is not an NVVM
|
|
|
|
// intrinsic, we don't have to look up any module metadata, as
|
|
|
|
// FtzRequirementTy will be FTZ_Any.)
|
|
|
|
if (Action.FtzRequirement != FTZ_Any) {
|
|
|
|
StringRef Attr = II->getFunction()
|
|
|
|
->getFnAttribute("denormal-fp-math-f32")
|
|
|
|
.getValueAsString();
|
|
|
|
bool FtzEnabled = parseDenormalFPAttribute(Attr) != DenormalMode::IEEE;
|
|
|
|
|
|
|
|
if (FtzEnabled != (Action.FtzRequirement == FTZ_MustBeOn))
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Simplify to target-generic intrinsic.
|
|
|
|
if (Action.IID) {
|
|
|
|
SmallVector<Value *, 4> Args(II->arg_operands());
|
|
|
|
// All the target-generic intrinsics currently of interest to us have one
|
|
|
|
// type argument, equal to that of the nvvm intrinsic's argument.
|
2017-01-27 09:49:39 +08:00
|
|
|
Type *Tys[] = {II->getArgOperand(0)->getType()};
|
|
|
|
return CallInst::Create(
|
|
|
|
Intrinsic::getDeclaration(II->getModule(), *Action.IID, Tys), Args);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Simplify to target-generic binary op.
|
|
|
|
if (Action.BinaryOp)
|
|
|
|
return BinaryOperator::Create(*Action.BinaryOp, II->getArgOperand(0),
|
|
|
|
II->getArgOperand(1), II->getName());
|
|
|
|
|
|
|
|
// Simplify to target-generic cast op.
|
|
|
|
if (Action.CastOp)
|
|
|
|
return CastInst::Create(*Action.CastOp, II->getArgOperand(0), II->getType(),
|
|
|
|
II->getName());
|
|
|
|
|
|
|
|
// All that's left are the special cases.
|
|
|
|
if (!Action.Special)
|
|
|
|
return nullptr;
|
|
|
|
|
|
|
|
switch (*Action.Special) {
|
|
|
|
case SPC_Reciprocal:
|
|
|
|
// Simplify reciprocal.
|
|
|
|
return BinaryOperator::Create(
|
|
|
|
Instruction::FDiv, ConstantFP::get(II->getArgOperand(0)->getType(), 1),
|
|
|
|
II->getArgOperand(0), II->getName());
|
|
|
|
}
|
2017-01-27 10:04:07 +08:00
|
|
|
llvm_unreachable("All SpecialCase enumerators should be handled in switch.");
|
|
|
|
}
|
|
|
|
|
2016-05-10 17:24:49 +08:00
|
|
|
Instruction *InstCombiner::visitVAStartInst(VAStartInst &I) {
|
|
|
|
removeTriviallyEmptyRange(I, Intrinsic::vastart, Intrinsic::vaend, *this);
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
|
|
|
|
Instruction *InstCombiner::visitVACopyInst(VACopyInst &I) {
|
|
|
|
removeTriviallyEmptyRange(I, Intrinsic::vacopy, Intrinsic::vaend, *this);
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
|
2018-11-27 06:00:41 +08:00
|
|
|
static Instruction *canonicalizeConstantArg0ToArg1(CallInst &Call) {
|
|
|
|
assert(Call.getNumArgOperands() > 1 && "Need at least 2 args to swap");
|
|
|
|
Value *Arg0 = Call.getArgOperand(0), *Arg1 = Call.getArgOperand(1);
|
|
|
|
if (isa<Constant>(Arg0) && !isa<Constant>(Arg1)) {
|
|
|
|
Call.setArgOperand(0, Arg1);
|
|
|
|
Call.setArgOperand(1, Arg0);
|
|
|
|
return &Call;
|
|
|
|
}
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
|
2019-03-07 02:30:00 +08:00
|
|
|
Instruction *InstCombiner::foldIntrinsicWithOverflowCommon(IntrinsicInst *II) {
|
2019-05-26 19:43:31 +08:00
|
|
|
WithOverflowInst *WO = cast<WithOverflowInst>(II);
|
2019-03-07 02:30:00 +08:00
|
|
|
Value *OperationResult = nullptr;
|
|
|
|
Constant *OverflowResult = nullptr;
|
2019-05-26 19:43:31 +08:00
|
|
|
if (OptimizeOverflowCheck(WO->getBinaryOp(), WO->isSigned(), WO->getLHS(),
|
|
|
|
WO->getRHS(), *WO, OperationResult, OverflowResult))
|
|
|
|
return CreateOverflowTuple(WO, OperationResult, OverflowResult);
|
2019-03-07 02:30:00 +08:00
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
|
2016-01-21 06:24:38 +08:00
|
|
|
/// CallInst simplification. This mostly only handles folding of intrinsic
|
2019-02-01 01:23:29 +08:00
|
|
|
/// instructions. For normal calls, it allows visitCallBase to do the heavy
|
2016-01-21 06:24:38 +08:00
|
|
|
/// lifting.
|
2010-01-05 15:32:13 +08:00
|
|
|
Instruction *InstCombiner::visitCallInst(CallInst &CI) {
|
2017-12-27 08:16:12 +08:00
|
|
|
if (Value *V = SimplifyCall(&CI, SQ.getWithInstruction(&CI)))
|
2016-02-02 06:23:39 +08:00
|
|
|
return replaceInstUsesWith(CI, V);
|
2015-05-22 11:56:46 +08:00
|
|
|
|
2016-08-05 09:06:44 +08:00
|
|
|
if (isFreeCall(&CI, &TLI))
|
2010-01-05 15:32:13 +08:00
|
|
|
return visitFree(CI);
|
|
|
|
|
|
|
|
// If the caller function is nounwind, mark the call as nounwind, even if the
|
|
|
|
// callee isn't.
|
2016-08-11 23:16:06 +08:00
|
|
|
if (CI.getFunction()->doesNotThrow() && !CI.doesNotThrow()) {
|
2010-01-05 15:32:13 +08:00
|
|
|
CI.setDoesNotThrow();
|
|
|
|
return &CI;
|
|
|
|
}
|
2012-02-03 08:07:04 +08:00
|
|
|
|
2010-01-05 15:32:13 +08:00
|
|
|
IntrinsicInst *II = dyn_cast<IntrinsicInst>(&CI);
|
2019-02-01 01:23:29 +08:00
|
|
|
if (!II) return visitCallBase(CI);
|
2010-06-24 20:58:35 +08:00
|
|
|
|
2019-02-09 04:48:56 +08:00
|
|
|
// Intrinsics cannot occur in an invoke or a callbr, so handle them here
|
|
|
|
// instead of in visitCallBase.
|
2018-05-11 22:30:02 +08:00
|
|
|
if (auto *MI = dyn_cast<AnyMemIntrinsic>(II)) {
|
2010-01-05 15:32:13 +08:00
|
|
|
bool Changed = false;
|
|
|
|
|
|
|
|
// memmove/cpy/set of zero bytes is a noop.
|
|
|
|
if (Constant *NumBytes = dyn_cast<Constant>(MI->getLength())) {
|
2010-10-01 13:51:02 +08:00
|
|
|
if (NumBytes->isNullValue())
|
2016-02-02 06:23:39 +08:00
|
|
|
return eraseInstFromFunction(CI);
|
2010-01-05 15:32:13 +08:00
|
|
|
|
|
|
|
if (ConstantInt *CI = dyn_cast<ConstantInt>(NumBytes))
|
|
|
|
if (CI->getZExtValue() == 1) {
|
|
|
|
// Replace the instruction with just byte operations. We would
|
|
|
|
// transform other cases to loads/stores, but we don't know if
|
|
|
|
// alignment is sufficient.
|
|
|
|
}
|
|
|
|
}
|
2012-02-03 08:07:04 +08:00
|
|
|
|
2010-10-01 13:51:02 +08:00
|
|
|
// No other transformations apply to volatile transfers.
|
2018-05-11 22:30:02 +08:00
|
|
|
if (auto *M = dyn_cast<MemIntrinsic>(MI))
|
|
|
|
if (M->isVolatile())
|
|
|
|
return nullptr;
|
2010-01-05 15:32:13 +08:00
|
|
|
|
|
|
|
// If we have a memmove and the source operation is a constant global,
|
|
|
|
// then the source and dest pointers can't alias, so we can change this
|
|
|
|
// into a call to memcpy.
|
2018-05-11 22:30:02 +08:00
|
|
|
if (auto *MMI = dyn_cast<AnyMemMoveInst>(MI)) {
|
2010-01-05 15:32:13 +08:00
|
|
|
if (GlobalVariable *GVSrc = dyn_cast<GlobalVariable>(MMI->getSource()))
|
|
|
|
if (GVSrc->isConstant()) {
|
2015-12-15 01:24:23 +08:00
|
|
|
Module *M = CI.getModule();
|
2018-05-11 22:30:02 +08:00
|
|
|
Intrinsic::ID MemCpyID =
|
|
|
|
isa<AtomicMemMoveInst>(MMI)
|
|
|
|
? Intrinsic::memcpy_element_unordered_atomic
|
|
|
|
: Intrinsic::memcpy;
|
2011-07-12 22:06:48 +08:00
|
|
|
Type *Tys[3] = { CI.getArgOperand(0)->getType(),
|
|
|
|
CI.getArgOperand(1)->getType(),
|
|
|
|
CI.getArgOperand(2)->getType() };
|
2011-07-15 01:45:39 +08:00
|
|
|
CI.setCalledFunction(Intrinsic::getDeclaration(M, MemCpyID, Tys));
|
2010-01-05 15:32:13 +08:00
|
|
|
Changed = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-05-11 22:30:02 +08:00
|
|
|
if (AnyMemTransferInst *MTI = dyn_cast<AnyMemTransferInst>(MI)) {
|
2010-01-05 15:32:13 +08:00
|
|
|
// memmove(x,x,size) -> noop.
|
|
|
|
if (MTI->getSource() == MTI->getDest())
|
2016-02-02 06:23:39 +08:00
|
|
|
return eraseInstFromFunction(CI);
|
2010-04-17 07:37:20 +08:00
|
|
|
}
|
2010-01-05 15:32:13 +08:00
|
|
|
|
2010-04-17 07:37:20 +08:00
|
|
|
// If we can determine a pointer alignment that is bigger than currently
|
|
|
|
// set, update the alignment.
|
2018-05-11 22:30:02 +08:00
|
|
|
if (auto *MTI = dyn_cast<AnyMemTransferInst>(MI)) {
|
|
|
|
if (Instruction *I = SimplifyAnyMemTransfer(MTI))
return I;
} else if (auto *MSI = dyn_cast<AnyMemSetInst>(MI)) {
if (Instruction *I = SimplifyAnyMemSet(MSI))
return I;
}
if (Changed) return II;
}
// For vector result intrinsics, use the generic demanded vector support.
if (II->getType()->isVectorTy()) {
auto VWidth = II->getType()->getVectorNumElements();
APInt UndefElts(VWidth, 0);
APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
if (Value *V = SimplifyDemandedVectorElts(II, AllOnesEltMask, UndefElts)) {
if (V != II)
return replaceInstUsesWith(*II, V);
return II;
}
}
if (Instruction *I = SimplifyNVVMIntrinsic(II, *this))
return I;
auto SimplifyDemandedVectorEltsLow = [this](Value *Op, unsigned Width,
unsigned DemandedWidth) {
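// Demand only the lowest DemandedWidth lanes of Op so that the unused upper
// lanes can be simplified away.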
APInt UndefElts(Width, 0);
APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth);
return SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
};
Intrinsic::ID IID = II->getIntrinsicID();
switch (IID) {
default: break;
case Intrinsic::objectsize:
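// Try to fold llvm.objectsize to a constant; with MustSucceed=false an
// unknown size simply leaves the call in place.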
if (Value *V = lowerObjectSizeCall(II, DL, &TLI, /*MustSucceed=*/false))
return replaceInstUsesWith(CI, V);
return nullptr;
case Intrinsic::bswap: {
Value *IIOperand = II->getArgOperand(0);
Value *X = nullptr;
// bswap(trunc(bswap(x))) -> trunc(lshr(x, c))
if (match(IIOperand, m_Trunc(m_BSwap(m_Value(X))))) {
unsigned C = X->getType()->getPrimitiveSizeInBits() -
IIOperand->getType()->getPrimitiveSizeInBits();
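// Only the high bytes of X survive the truncation, so shift them down into
// the low bits before truncating.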
Value *CV = ConstantInt::get(X->getType(), C);
Value *V = Builder.CreateLShr(X, CV);
return new TruncInst(V, IIOperand->getType());
}
break;
}
|
2016-02-02 01:00:10 +08:00
|
|
|
case Intrinsic::masked_load:
|
2019-04-25 09:18:56 +08:00
|
|
|
if (Value *SimplifiedMaskedOp = simplifyMaskedLoad(*II))
|
2016-02-02 06:23:39 +08:00
|
|
|
return replaceInstUsesWith(CI, SimplifiedMaskedOp);
|
2016-02-02 01:00:10 +08:00
|
|
|
break;
|
2016-02-02 03:39:52 +08:00
|
|
|
case Intrinsic::masked_store:
|
2019-03-21 02:44:58 +08:00
|
|
|
return simplifyMaskedStore(*II);
|
2016-02-02 06:10:26 +08:00
|
|
|
case Intrinsic::masked_gather:
|
2019-04-25 09:18:56 +08:00
|
|
|
return simplifyMaskedGather(*II);
|
2016-02-02 06:10:26 +08:00
|
|
|
case Intrinsic::masked_scatter:
|
2019-03-21 02:44:58 +08:00
|
|
|
return simplifyMaskedScatter(*II);
|
2018-07-13 07:55:20 +08:00
|
|
|
case Intrinsic::launder_invariant_group:
|
|
|
|
case Intrinsic::strip_invariant_group:
|
|
|
|
if (auto *SkippedBarrier = simplifyInvariantGroupIntrinsic(*II, *this))
|
|
|
|
return replaceInstUsesWith(*II, SkippedBarrier);
|
|
|
|
break;
|
2010-01-05 15:32:13 +08:00
|
|
|
case Intrinsic::powi:
|
2010-06-24 20:58:35 +08:00
|
|
|
if (ConstantInt *Power = dyn_cast<ConstantInt>(II->getArgOperand(1))) {
|
2017-12-27 09:14:30 +08:00
|
|
|
// 0 and 1 are handled in instsimplify
|
|
|
|
|
2010-01-05 15:32:13 +08:00
|
|
|
// powi(x, -1) -> 1/x
|
2017-07-07 02:39:47 +08:00
|
|
|
if (Power->isMinusOne())
|
2010-01-05 15:32:13 +08:00
|
|
|
return BinaryOperator::CreateFDiv(ConstantFP::get(CI.getType(), 1.0),
|
2010-06-24 20:58:35 +08:00
|
|
|
II->getArgOperand(0));
|
2017-12-27 09:30:12 +08:00
|
|
|
// powi(x, 2) -> x*x
|
|
|
|
if (Power->equalsInt(2))
|
|
|
|
return BinaryOperator::CreateFMul(II->getArgOperand(0),
|
|
|
|
II->getArgOperand(0));
|
2010-01-05 15:32:13 +08:00
|
|
|
}
|
|
|
|
break;
|
2012-02-03 08:07:04 +08:00
|
|
|
|
2016-08-06 06:42:46 +08:00
|
|
|
case Intrinsic::cttz:
|
|
|
|
case Intrinsic::ctlz:
|
2016-08-19 04:43:50 +08:00
|
|
|
if (auto *I = foldCttzCtlz(*II, *this))
|
|
|
|
return I;
|
2010-01-05 15:32:13 +08:00
|
|
|
break;
|
2015-04-08 12:27:22 +08:00
|
|
|
|
2017-06-22 00:32:35 +08:00
|
|
|
case Intrinsic::ctpop:
|
|
|
|
if (auto *I = foldCtpop(*II, *this))
|
|
|
|
return I;
|
|
|
|
break;
|
|
|
|
|
2018-11-14 07:27:23 +08:00
|
|
|
case Intrinsic::fshl:
|
|
|
|
case Intrinsic::fshr: {
|
2019-03-18 03:08:00 +08:00
|
|
|
Value *Op0 = II->getArgOperand(0), *Op1 = II->getArgOperand(1);
|
|
|
|
Type *Ty = II->getType();
|
|
|
|
unsigned BitWidth = Ty->getScalarSizeInBits();
|
2019-03-15 03:22:08 +08:00
|
|
|
Constant *ShAmtC;
|
|
|
|
if (match(II->getArgOperand(2), m_Constant(ShAmtC)) &&
|
|
|
|
!isa<ConstantExpr>(ShAmtC) && !ShAmtC->containsConstantExpression()) {
|
2019-03-18 03:08:00 +08:00
|
|
|
// Canonicalize a shift amount constant operand to modulo the bit-width.
|
|
|
|
Constant *WidthC = ConstantInt::get(Ty, BitWidth);
|
2019-03-15 03:22:08 +08:00
|
|
|
Constant *ModuloC = ConstantExpr::getURem(ShAmtC, WidthC);
|
|
|
|
if (ModuloC != ShAmtC) {
|
|
|
|
II->setArgOperand(2, ModuloC);
|
|
|
|
return II;
|
|
|
|
}
|
2019-03-18 22:27:51 +08:00
|
|
|
assert(ConstantExpr::getICmp(ICmpInst::ICMP_UGT, WidthC, ShAmtC) ==
|
|
|
|
ConstantInt::getTrue(CmpInst::makeCmpResultType(Ty)) &&
|
|
|
|
"Shift amount expected to be modulo bitwidth");
|
|
|
|
|
2019-03-18 22:10:11 +08:00
|
|
|
// Canonicalize funnel shift right by constant to funnel shift left. This
|
|
|
|
// is not entirely arbitrary. For historical reasons, the backend may
|
|
|
|
// recognize rotate left patterns but miss rotate right patterns.
|
2019-05-06 23:35:02 +08:00
|
|
|
if (IID == Intrinsic::fshr) {
|
2019-03-18 22:10:11 +08:00
|
|
|
// fshr X, Y, C --> fshl X, Y, (BitWidth - C)
|
2019-03-18 03:08:00 +08:00
|
|
|
Constant *LeftShiftC = ConstantExpr::getSub(WidthC, ShAmtC);
|
|
|
|
Module *Mod = II->getModule();
|
|
|
|
Function *Fshl = Intrinsic::getDeclaration(Mod, Intrinsic::fshl, Ty);
|
2019-03-18 22:10:11 +08:00
|
|
|
return CallInst::Create(Fshl, { Op0, Op1, LeftShiftC });
|
2019-03-18 03:08:00 +08:00
|
|
|
}
|
2019-05-06 23:35:02 +08:00
|
|
|
assert(IID == Intrinsic::fshl &&
|
2019-03-18 22:10:11 +08:00
|
|
|
"All funnel shifts by simple constants should go left");
// fshl(X, 0, C) --> shl X, C
|
|
|
|
// fshl(X, undef, C) --> shl X, C
|
|
|
|
if (match(Op1, m_ZeroInt()) || match(Op1, m_Undef()))
|
|
|
|
return BinaryOperator::CreateShl(Op0, ShAmtC);
// fshl(0, X, C) --> lshr X, (BW-C)
|
|
|
|
// fshl(undef, X, C) --> lshr X, (BW-C)
|
|
|
|
if (match(Op0, m_ZeroInt()) || match(Op0, m_Undef()))
|
|
|
|
return BinaryOperator::CreateLShr(Op1,
|
|
|
|
ConstantExpr::getSub(WidthC, ShAmtC));
|
2019-06-24 23:20:49 +08:00
|
|
|
|
|
|
|
// fshl i16 X, X, 8 --> bswap i16 X (reduce to more-specific form)
|
|
|
|
if (Op0 == Op1 && BitWidth == 16 && match(ShAmtC, m_SpecificInt(8))) {
|
|
|
|
Module *Mod = II->getModule();
|
|
|
|
Function *Bswap = Intrinsic::getDeclaration(Mod, Intrinsic::bswap, Ty);
|
|
|
|
return CallInst::Create(Bswap, { Op0 });
|
|
|
|
}
}
|
|
|
|
|
2019-04-17 03:05:49 +08:00
|
|
|
// Left or right might be masked.
|
|
|
|
if (SimplifyDemandedInstructionBits(*II))
|
|
|
|
return &CI;
|
|
|
|
|
2018-11-14 07:27:23 +08:00
|
|
|
// The shift amount (operand 2) of a funnel shift is modulo the bitwidth,
|
|
|
|
// so only the low bits of the shift amount are demanded if the bitwidth is
|
|
|
|
// a power-of-2.
|
|
|
|
if (!isPowerOf2_32(BitWidth))
|
|
|
|
break;
|
|
|
|
APInt Op2Demanded = APInt::getLowBitsSet(BitWidth, Log2_32_Ceil(BitWidth));
|
|
|
|
KnownBits Op2Known(BitWidth);
|
|
|
|
if (SimplifyDemandedBits(II, 2, Op2Demanded, Op2Known))
|
|
|
|
return &CI;
|
|
|
|
break;
|
|
|
|
}
|
2019-03-21 02:00:27 +08:00
|
|
|
case Intrinsic::uadd_with_overflow:
|
2019-03-07 02:30:00 +08:00
|
|
|
case Intrinsic::sadd_with_overflow: {
|
|
|
|
if (Instruction *I = canonicalizeConstantArg0ToArg1(CI))
|
|
|
|
return I;
|
|
|
|
if (Instruction *I = foldIntrinsicWithOverflowCommon(II))
|
|
|
|
return I;
|
|
|
|
|
|
|
|
// Given 2 constant operands whose sum does not overflow:
|
2019-03-21 02:00:27 +08:00
|
|
|
// uaddo (X +nuw C0), C1 -> uaddo X, C0 + C1
|
2019-03-07 02:30:00 +08:00
|
|
|
// saddo (X +nsw C0), C1 -> saddo X, C0 + C1
|
|
|
|
Value *X;
|
|
|
|
const APInt *C0, *C1;
|
|
|
|
Value *Arg0 = II->getArgOperand(0);
|
|
|
|
Value *Arg1 = II->getArgOperand(1);
|
2019-05-06 23:35:02 +08:00
|
|
|
bool IsSigned = IID == Intrinsic::sadd_with_overflow;
|
2019-03-21 02:00:27 +08:00
|
|
|
bool HasNWAdd = IsSigned ? match(Arg0, m_NSWAdd(m_Value(X), m_APInt(C0)))
|
|
|
|
: match(Arg0, m_NUWAdd(m_Value(X), m_APInt(C0)));
|
|
|
|
if (HasNWAdd && match(Arg1, m_APInt(C1))) {
|
2019-03-07 02:30:00 +08:00
|
|
|
bool Overflow;
|
2019-03-21 02:00:27 +08:00
|
|
|
APInt NewC =
|
|
|
|
IsSigned ? C1->sadd_ov(*C0, Overflow) : C1->uadd_ov(*C0, Overflow);
|
2019-03-07 02:30:00 +08:00
|
|
|
if (!Overflow)
|
|
|
|
return replaceInstUsesWith(
|
|
|
|
*II, Builder.CreateBinaryIntrinsic(
|
2019-05-06 23:35:02 +08:00
|
|
|
IID, X, ConstantInt::get(Arg1->getType(), NewC)));
|
2019-03-07 02:30:00 +08:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
2019-04-11 00:27:36 +08:00
|
|
|
|
2015-04-14 03:17:37 +08:00
|
|
|
case Intrinsic::umul_with_overflow:
|
|
|
|
case Intrinsic::smul_with_overflow:
|
2018-11-27 06:00:41 +08:00
|
|
|
if (Instruction *I = canonicalizeConstantArg0ToArg1(CI))
|
|
|
|
return I;
|
2016-08-18 04:30:52 +08:00
|
|
|
LLVM_FALLTHROUGH;
|
2010-01-05 15:32:13 +08:00
|
|
|
|
2015-04-14 03:17:37 +08:00
|
|
|
case Intrinsic::usub_with_overflow:
|
2019-04-11 00:27:36 +08:00
|
|
|
if (Instruction *I = foldIntrinsicWithOverflowCommon(II))
|
|
|
|
return I;
|
|
|
|
break;
|
|
|
|
|
2015-04-14 03:17:37 +08:00
|
|
|
case Intrinsic::ssub_with_overflow: {
|
2019-03-07 02:30:00 +08:00
|
|
|
if (Instruction *I = foldIntrinsicWithOverflowCommon(II))
|
|
|
|
return I;
|
2014-07-04 18:22:21 +08:00
|
|
|
|
2019-04-11 00:27:36 +08:00
|
|
|
Constant *C;
|
|
|
|
Value *Arg0 = II->getArgOperand(0);
|
|
|
|
Value *Arg1 = II->getArgOperand(1);
|
|
|
|
// Given a constant C that is not the minimum signed value
|
|
|
|
// for an integer of a given bit width:
|
|
|
|
//
|
|
|
|
// ssubo X, C -> saddo X, -C
|
|
|
|
if (match(Arg1, m_Constant(C)) && C->isNotMinSignedValue()) {
|
|
|
|
Value *NegVal = ConstantExpr::getNeg(C);
|
|
|
|
// Build a saddo call that is equivalent to the discovered
|
|
|
|
// ssubo call.
|
|
|
|
return replaceInstUsesWith(
|
|
|
|
*II, Builder.CreateBinaryIntrinsic(Intrinsic::sadd_with_overflow,
|
|
|
|
Arg0, NegVal));
|
|
|
|
}
|
|
|
|
|
2010-01-05 15:32:13 +08:00
|
|
|
break;
|
2014-12-11 16:02:30 +08:00
|
|
|
}
|
2010-01-05 15:32:13 +08:00
|
|
|
|
2018-11-29 00:36:52 +08:00
|
|
|
case Intrinsic::uadd_sat:
|
|
|
|
case Intrinsic::sadd_sat:
|
|
|
|
if (Instruction *I = canonicalizeConstantArg0ToArg1(CI))
|
|
|
|
return I;
|
2018-11-29 00:36:59 +08:00
|
|
|
LLVM_FALLTHROUGH;
|
|
|
|
case Intrinsic::usub_sat:
|
|
|
|
case Intrinsic::ssub_sat: {
|
2019-05-29 02:59:21 +08:00
|
|
|
SaturatingInst *SI = cast<SaturatingInst>(II);
|
2019-05-30 02:37:13 +08:00
|
|
|
Type *Ty = SI->getType();
|
2019-05-29 02:59:21 +08:00
|
|
|
Value *Arg0 = SI->getLHS();
|
|
|
|
Value *Arg1 = SI->getRHS();
|
2018-11-29 00:36:59 +08:00
|
|
|
|
|
|
|
// Make use of known overflow information.
|
2019-05-29 02:59:21 +08:00
|
|
|
OverflowResult OR = computeOverflow(SI->getBinaryOp(), SI->isSigned(),
|
|
|
|
Arg0, Arg1, SI);
|
|
|
|
switch (OR) {
|
|
|
|
case OverflowResult::MayOverflow:
|
|
|
|
break;
case OverflowResult::NeverOverflows:
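// The operation can never saturate, so lower it to a plain add/sub that
// carries the matching nsw/nuw flag.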
if (SI->isSigned())
|
|
|
|
return BinaryOperator::CreateNSW(SI->getBinaryOp(), Arg0, Arg1);
|
|
|
|
else
|
|
|
|
return BinaryOperator::CreateNUW(SI->getBinaryOp(), Arg0, Arg1);
case OverflowResult::AlwaysOverflowsLow: {
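// The result always overflows toward the low end, so it saturates to the
// minimum value of the type.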
unsigned BitWidth = Ty->getScalarSizeInBits();
|
|
|
|
APInt Min = APSInt::getMinValue(BitWidth, !SI->isSigned());
|
|
|
|
return replaceInstUsesWith(*SI, ConstantInt::get(Ty, Min));
|
|
|
|
}
|
|
|
|
case OverflowResult::AlwaysOverflowsHigh: {
|
|
|
|
unsigned BitWidth = Ty->getScalarSizeInBits();
|
|
|
|
APInt Max = APSInt::getMaxValue(BitWidth, !SI->isSigned());
|
|
|
|
return replaceInstUsesWith(*SI, ConstantInt::get(Ty, Max));
|
|
|
|
}
|
2018-11-29 00:36:59 +08:00
|
|
|
}
|
2018-11-29 00:37:09 +08:00
|
|
|
|
|
|
|
// ssub.sat(X, C) -> sadd.sat(X, -C) if C != MIN
|
2018-12-01 18:58:34 +08:00
|
|
|
Constant *C;
|
|
|
|
if (IID == Intrinsic::ssub_sat && match(Arg1, m_Constant(C)) &&
|
|
|
|
C->isNotMinSignedValue()) {
|
|
|
|
Value *NegVal = ConstantExpr::getNeg(C);
|
2018-11-29 00:37:09 +08:00
|
|
|
return replaceInstUsesWith(
|
|
|
|
*II, Builder.CreateBinaryIntrinsic(
|
|
|
|
Intrinsic::sadd_sat, Arg0, NegVal));
|
|
|
|
}
|
2018-11-29 00:37:15 +08:00
|
|
|
|
|
|
|
// sat(sat(X + Val2) + Val) -> sat(X + (Val+Val2))
|
|
|
|
// sat(sat(X - Val2) - Val) -> sat(X - (Val+Val2))
|
|
|
|
// if Val and Val2 have the same sign
|
|
|
|
if (auto *Other = dyn_cast<IntrinsicInst>(Arg0)) {
|
|
|
|
Value *X;
|
|
|
|
const APInt *Val, *Val2;
|
|
|
|
APInt NewVal;
|
|
|
|
bool IsUnsigned =
|
|
|
|
IID == Intrinsic::uadd_sat || IID == Intrinsic::usub_sat;
|
2019-05-06 23:35:02 +08:00
|
|
|
if (Other->getIntrinsicID() == IID &&
|
2018-11-29 00:37:15 +08:00
|
|
|
match(Arg1, m_APInt(Val)) &&
|
|
|
|
match(Other->getArgOperand(0), m_Value(X)) &&
|
|
|
|
match(Other->getArgOperand(1), m_APInt(Val2))) {
|
|
|
|
if (IsUnsigned)
|
|
|
|
NewVal = Val->uadd_sat(*Val2);
|
|
|
|
else if (Val->isNonNegative() == Val2->isNonNegative()) {
|
|
|
|
bool Overflow;
|
|
|
|
NewVal = Val->sadd_ov(*Val2, Overflow);
|
|
|
|
if (Overflow) {
|
|
|
|
// Both adds together may add more than SignedMaxValue
|
|
|
|
// without saturating the final result.
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// Cannot fold saturated addition with different signs.
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
return replaceInstUsesWith(
|
|
|
|
*II, Builder.CreateBinaryIntrinsic(
|
|
|
|
IID, X, ConstantInt::get(II->getType(), NewVal)));
|
|
|
|
}
|
|
|
|
}
|
2018-11-29 00:36:52 +08:00
|
|
|
break;
|
2018-11-29 00:36:59 +08:00
|
|
|
}
|
2018-11-29 00:36:52 +08:00
|
|
|
|
2014-10-22 07:00:20 +08:00
|
|
|
case Intrinsic::minnum:
|
2018-10-20 03:01:26 +08:00
|
|
|
case Intrinsic::maxnum:
|
|
|
|
case Intrinsic::minimum:
|
|
|
|
case Intrinsic::maximum: {
|
2018-11-27 06:00:41 +08:00
|
|
|
if (Instruction *I = canonicalizeConstantArg0ToArg1(CI))
|
|
|
|
return I;
|
2014-10-22 07:00:20 +08:00
|
|
|
Value *Arg0 = II->getArgOperand(0);
|
|
|
|
Value *Arg1 = II->getArgOperand(1);
|
2018-05-11 04:03:13 +08:00
|
|
|
Value *X, *Y;
|
|
|
|
if (match(Arg0, m_FNeg(m_Value(X))) && match(Arg1, m_FNeg(m_Value(Y))) &&
|
|
|
|
(Arg0->hasOneUse() || Arg1->hasOneUse())) {
|
|
|
|
// If both operands are negated, invert the call and negate the result:
|
2018-10-20 03:01:26 +08:00
|
|
|
// min(-X, -Y) --> -(max(X, Y))
|
|
|
|
// max(-X, -Y) --> -(min(X, Y))
|
|
|
|
Intrinsic::ID NewIID;
|
2018-11-01 01:50:52 +08:00
|
|
|
switch (IID) {
|
2018-10-20 03:01:26 +08:00
|
|
|
case Intrinsic::maxnum:
|
|
|
|
NewIID = Intrinsic::minnum;
|
|
|
|
break;
|
|
|
|
case Intrinsic::minnum:
|
|
|
|
NewIID = Intrinsic::maxnum;
|
|
|
|
break;
|
|
|
|
case Intrinsic::maximum:
|
|
|
|
NewIID = Intrinsic::minimum;
|
|
|
|
break;
|
|
|
|
case Intrinsic::minimum:
|
|
|
|
NewIID = Intrinsic::maximum;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
llvm_unreachable("unexpected intrinsic ID");
|
|
|
|
}
|
2018-10-08 18:32:33 +08:00
|
|
|
Value *NewCall = Builder.CreateBinaryIntrinsic(NewIID, X, Y, II);
|
2018-05-11 04:03:13 +08:00
|
|
|
Instruction *FNeg = BinaryOperator::CreateFNeg(NewCall);
|
|
|
|
FNeg->copyIRFlags(II);
|
|
|
|
return FNeg;
|
|
|
|
}
|
2018-11-01 01:50:52 +08:00
|
|
|
|
|
|
|
// m(m(X, C2), C1) -> m(X, C)
|
|
|
|
const APFloat *C1, *C2;
|
|
|
|
if (auto *M = dyn_cast<IntrinsicInst>(Arg0)) {
|
|
|
|
if (M->getIntrinsicID() == IID && match(Arg1, m_APFloat(C1)) &&
|
|
|
|
((match(M->getArgOperand(0), m_Value(X)) &&
|
|
|
|
match(M->getArgOperand(1), m_APFloat(C2))) ||
|
|
|
|
(match(M->getArgOperand(1), m_Value(X)) &&
|
|
|
|
match(M->getArgOperand(0), m_APFloat(C2))))) {
|
|
|
|
APFloat Res(0.0);
|
|
|
|
switch (IID) {
|
|
|
|
case Intrinsic::maxnum:
|
|
|
|
Res = maxnum(*C1, *C2);
|
|
|
|
break;
|
|
|
|
case Intrinsic::minnum:
|
|
|
|
Res = minnum(*C1, *C2);
|
|
|
|
break;
|
|
|
|
case Intrinsic::maximum:
|
|
|
|
Res = maximum(*C1, *C2);
|
|
|
|
break;
|
|
|
|
case Intrinsic::minimum:
|
|
|
|
Res = minimum(*C1, *C2);
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
llvm_unreachable("unexpected intrinsic ID");
|
|
|
|
}
|
|
|
|
Instruction *NewCall = Builder.CreateBinaryIntrinsic(
|
|
|
|
IID, X, ConstantFP::get(Arg0->getType(), Res));
|
|
|
|
NewCall->copyIRFlags(II);
|
|
|
|
return replaceInstUsesWith(*II, NewCall);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-10-22 07:00:20 +08:00
|
|
|
break;
|
|
|
|
}
|
2017-01-03 12:32:31 +08:00
|
|
|
case Intrinsic::fmuladd: {
|
2017-02-17 02:46:24 +08:00
|
|
|
// Canonicalize fast fmuladd to the separate fmul + fadd.
if (II->isFast()) {
|
2017-07-08 07:16:26 +08:00
|
|
|
BuilderTy::FastMathFlagGuard Guard(Builder);
|
|
|
|
Builder.setFastMathFlags(II->getFastMathFlags());
|
|
|
|
Value *Mul = Builder.CreateFMul(II->getArgOperand(0),
|
|
|
|
II->getArgOperand(1));
|
|
|
|
Value *Add = Builder.CreateFAdd(Mul, II->getArgOperand(2));
|
2017-02-17 02:46:24 +08:00
|
|
|
Add->takeName(II);
|
|
|
|
return replaceInstUsesWith(*II, Add);
|
|
|
|
}
|
|
|
|
|
2019-09-26 01:03:20 +08:00
|
|
|
// Try to simplify the underlying FMul.
|
|
|
|
if (Value *V = SimplifyFMulInst(II->getArgOperand(0), II->getArgOperand(1),
|
|
|
|
II->getFastMathFlags(),
|
|
|
|
SQ.getWithInstruction(II))) {
|
|
|
|
auto *FAdd = BinaryOperator::CreateFAdd(V, II->getArgOperand(2));
|
|
|
|
FAdd->copyFastMathFlags(II);
|
|
|
|
return FAdd;
|
|
|
|
}
|
|
|
|
|
2017-02-17 02:46:24 +08:00
|
|
|
LLVM_FALLTHROUGH;
|
|
|
|
}
|
|
|
|
case Intrinsic::fma: {
|
2018-11-27 06:00:41 +08:00
|
|
|
if (Instruction *I = canonicalizeConstantArg0ToArg1(CI))
|
|
|
|
return I;
|
2017-01-03 12:32:35 +08:00
|
|
|
|
2017-01-03 12:32:31 +08:00
|
|
|
// fma fneg(x), fneg(y), z -> fma x, y, z
|
2018-11-27 06:00:41 +08:00
|
|
|
Value *Src0 = II->getArgOperand(0);
|
|
|
|
Value *Src1 = II->getArgOperand(1);
|
2018-04-05 21:24:26 +08:00
|
|
|
Value *X, *Y;
|
|
|
|
if (match(Src0, m_FNeg(m_Value(X))) && match(Src1, m_FNeg(m_Value(Y)))) {
|
|
|
|
II->setArgOperand(0, X);
|
|
|
|
II->setArgOperand(1, Y);
|
2017-01-11 07:17:52 +08:00
|
|
|
return II;
|
2017-01-03 12:32:31 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// fma fabs(x), fabs(x), z -> fma x, x, z
|
2018-07-27 17:04:35 +08:00
|
|
|
if (match(Src0, m_FAbs(m_Value(X))) &&
|
|
|
|
match(Src1, m_FAbs(m_Specific(X)))) {
|
2018-04-05 21:24:26 +08:00
|
|
|
II->setArgOperand(0, X);
|
|
|
|
II->setArgOperand(1, X);
|
2017-01-11 07:17:52 +08:00
|
|
|
return II;
|
2017-01-03 12:32:31 +08:00
|
|
|
}
|
|
|
|
|
2019-09-26 01:03:20 +08:00
|
|
|
// Try to simplify the underlying FMul. We can only apply simplifications
|
|
|
|
// that do not require rounding.
|
|
|
|
if (Value *V = SimplifyFMAFMul(II->getArgOperand(0), II->getArgOperand(1),
|
|
|
|
II->getFastMathFlags(),
|
|
|
|
SQ.getWithInstruction(II))) {
|
|
|
|
auto *FAdd = BinaryOperator::CreateFAdd(V, II->getArgOperand(2));
|
2018-04-05 21:24:26 +08:00
|
|
|
FAdd->copyFastMathFlags(II);
|
|
|
|
return FAdd;
|
2017-01-03 12:32:35 +08:00
|
|
|
}
|
|
|
|
|
2017-01-03 12:32:31 +08:00
|
|
|
break;
|
|
|
|
}
|
2019-12-02 22:21:59 +08:00
|
|
|
case Intrinsic::copysign: {
|
2019-12-22 23:05:28 +08:00
|
|
|
if (SignBitMustBeZero(II->getArgOperand(1), &TLI)) {
|
|
|
|
// If we know that the sign argument is positive, reduce to FABS:
|
|
|
|
// copysign X, Pos --> fabs X
|
2019-12-02 22:21:59 +08:00
|
|
|
Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs,
|
|
|
|
II->getArgOperand(0), II);
|
|
|
|
return replaceInstUsesWith(*II, Fabs);
|
|
|
|
}
|
2019-12-22 23:05:28 +08:00
|
|
|
// TODO: There should be a ValueTracking sibling like SignBitMustBeOne.
|
|
|
|
const APFloat *C;
|
|
|
|
if (match(II->getArgOperand(1), m_APFloat(C)) && C->isNegative()) {
|
|
|
|
// If we know that the sign argument is negative, reduce to FNABS:
|
|
|
|
// copysign X, Neg --> fneg (fabs X)
|
|
|
|
Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs,
|
|
|
|
II->getArgOperand(0), II);
|
|
|
|
return replaceInstUsesWith(*II, Builder.CreateFNegFMF(Fabs, II));
|
|
|
|
}
|
2019-12-31 00:04:00 +08:00
|
|
|
|
|
|
|
// Propagate sign argument through nested calls:
|
|
|
|
// copysign X, (copysign ?, SignArg) --> copysign X, SignArg
|
|
|
|
Value *SignArg;
|
|
|
|
if (match(II->getArgOperand(1),
|
|
|
|
m_Intrinsic<Intrinsic::copysign>(m_Value(), m_Value(SignArg)))) {
|
|
|
|
II->setArgOperand(1, SignArg);
|
|
|
|
return II;
|
|
|
|
}
|
|
|
|
|
2019-12-02 22:21:59 +08:00
|
|
|
break;
|
|
|
|
}
|
2017-01-04 06:40:34 +08:00
|
|
|
case Intrinsic::fabs: {
|
|
|
|
Value *Cond;
|
|
|
|
Constant *LHS, *RHS;
|
|
|
|
if (match(II->getArgOperand(0),
m_Select(m_Value(Cond), m_Constant(LHS), m_Constant(RHS)))) {
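// fabs (select Cond, C1, C2) --> select Cond, (fabs C1), (fabs C2)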
CallInst *Call0 = Builder.CreateCall(II->getCalledFunction(), {LHS});
|
|
|
|
CallInst *Call1 = Builder.CreateCall(II->getCalledFunction(), {RHS});
|
2017-01-04 06:40:34 +08:00
|
|
|
return SelectInst::Create(Cond, Call0, Call1);
|
|
|
|
}
|
|
|
|
|
2017-01-24 07:55:08 +08:00
|
|
|
LLVM_FALLTHROUGH;
|
|
|
|
}
|
|
|
|
case Intrinsic::ceil:
|
|
|
|
case Intrinsic::floor:
|
|
|
|
case Intrinsic::round:
|
|
|
|
case Intrinsic::nearbyint:
|
2017-04-01 03:58:07 +08:00
|
|
|
case Intrinsic::rint:
|
2017-01-24 07:55:08 +08:00
|
|
|
case Intrinsic::trunc: {
|
2017-01-17 08:10:40 +08:00
|
|
|
Value *ExtSrc;
|
2018-03-24 05:18:12 +08:00
|
|
|
if (match(II->getArgOperand(0), m_OneUse(m_FPExt(m_Value(ExtSrc))))) {
|
|
|
|
// Narrow the call: intrinsic (fpext x) -> fpext (intrinsic x)
|
2019-05-06 23:35:02 +08:00
|
|
|
Value *NarrowII = Builder.CreateUnaryIntrinsic(IID, ExtSrc, II);
|
2018-03-24 05:18:12 +08:00
|
|
|
return new FPExtInst(NarrowII, II->getType());
|
2017-01-17 08:10:40 +08:00
|
|
|
}
|
2017-01-04 06:40:34 +08:00
|
|
|
break;
|
|
|
|
}
|
2017-01-05 06:49:03 +08:00
|
|
|
case Intrinsic::cos:
|
|
|
|
case Intrinsic::amdgcn_cos: {
|
2018-08-30 02:27:49 +08:00
|
|
|
Value *X;
|
2017-01-05 06:49:03 +08:00
|
|
|
Value *Src = II->getArgOperand(0);
|
2018-08-30 02:27:49 +08:00
|
|
|
if (match(Src, m_FNeg(m_Value(X))) || match(Src, m_FAbs(m_Value(X)))) {
|
2017-01-05 06:49:03 +08:00
|
|
|
// cos(-x) -> cos(x)
|
|
|
|
// cos(fabs(x)) -> cos(x)
|
2018-08-30 02:27:49 +08:00
|
|
|
II->setArgOperand(0, X);
|
2017-01-05 06:49:03 +08:00
|
|
|
return II;
|
|
|
|
}
|
2018-08-30 02:27:49 +08:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
case Intrinsic::sin: {
|
|
|
|
Value *X;
|
|
|
|
if (match(II->getArgOperand(0), m_OneUse(m_FNeg(m_Value(X))))) {
|
|
|
|
// sin(-x) --> -sin(x)
|
2018-10-08 18:32:33 +08:00
|
|
|
Value *NewSin = Builder.CreateUnaryIntrinsic(Intrinsic::sin, X, II);
|
2018-08-30 02:27:49 +08:00
|
|
|
Instruction *FNeg = BinaryOperator::CreateFNeg(NewSin);
|
|
|
|
FNeg->copyFastMathFlags(II);
|
|
|
|
return FNeg;
|
|
|
|
}
|
2017-01-05 06:49:03 +08:00
|
|
|
break;
|
|
|
|
}
|
2010-01-05 15:32:13 +08:00
|
|
|
case Intrinsic::ppc_altivec_lvx:
|
|
|
|
case Intrinsic::ppc_altivec_lvxl:
|
2011-04-13 08:36:11 +08:00
|
|
|
// Turn PPC lvx -> load if the pointer is known aligned.
|
2016-12-19 16:22:17 +08:00
|
|
|
if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, II, &AC,
|
2016-08-05 09:06:44 +08:00
|
|
|
&DT) >= 16) {
|
2017-07-08 07:16:26 +08:00
|
|
|
Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0),
|
2010-01-05 15:32:13 +08:00
|
|
|
PointerType::getUnqual(II->getType()));
|
2019-02-02 04:44:24 +08:00
|
|
|
return new LoadInst(II->getType(), Ptr);
|
2010-01-05 15:32:13 +08:00
|
|
|
}
|
|
|
|
break;
case Intrinsic::ppc_vsx_lxvw4x:
|
|
|
|
case Intrinsic::ppc_vsx_lxvd2x: {
|
|
|
|
// Turn PPC VSX loads into normal loads.
|
2017-07-08 07:16:26 +08:00
|
|
|
Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0),
|
|
|
|
PointerType::getUnqual(II->getType()));
|
2019-10-22 20:35:55 +08:00
|
|
|
return new LoadInst(II->getType(), Ptr, Twine(""), false, Align::None());
}
|
2010-01-05 15:32:13 +08:00
|
|
|
case Intrinsic::ppc_altivec_stvx:
|
|
|
|
case Intrinsic::ppc_altivec_stvxl:
|
|
|
|
// Turn stvx -> store if the pointer is known aligned.
|
2016-12-19 16:22:17 +08:00
|
|
|
if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, DL, II, &AC,
|
2016-08-05 09:06:44 +08:00
|
|
|
&DT) >= 16) {
|
2012-02-03 08:07:04 +08:00
|
|
|
Type *OpPtrTy =
|
2010-06-24 23:51:11 +08:00
|
|
|
PointerType::getUnqual(II->getArgOperand(0)->getType());
|
2017-07-08 07:16:26 +08:00
|
|
|
Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy);
|
2010-06-24 23:51:11 +08:00
|
|
|
return new StoreInst(II->getArgOperand(0), Ptr);
|
2010-01-05 15:32:13 +08:00
|
|
|
}
|
|
|
|
break;
case Intrinsic::ppc_vsx_stxvw4x:
|
|
|
|
case Intrinsic::ppc_vsx_stxvd2x: {
|
|
|
|
// Turn PPC VSX stores into normal stores.
|
|
|
|
Type *OpPtrTy = PointerType::getUnqual(II->getArgOperand(0)->getType());
|
2017-07-08 07:16:26 +08:00
|
|
|
Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy);
|
2019-10-22 20:55:32 +08:00
|
|
|
return new StoreInst(II->getArgOperand(0), Ptr, false, Align::None());
}
|
2015-02-27 02:56:03 +08:00
|
|
|
case Intrinsic::ppc_qpx_qvlfs:
|
|
|
|
// Turn PPC QPX qvlfs -> load if the pointer is known aligned.
|
2016-12-19 16:22:17 +08:00
|
|
|
if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, II, &AC,
|
2016-08-05 09:06:44 +08:00
|
|
|
&DT) >= 16) {
|
2017-07-08 07:16:26 +08:00
|
|
|
Type *VTy = VectorType::get(Builder.getFloatTy(),
|
2015-05-11 14:37:03 +08:00
|
|
|
II->getType()->getVectorNumElements());
|
2017-07-08 07:16:26 +08:00
|
|
|
Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0),
|
2015-05-11 14:37:03 +08:00
|
|
|
PointerType::getUnqual(VTy));
|
2019-02-02 04:44:24 +08:00
|
|
|
Value *Load = Builder.CreateLoad(VTy, Ptr);
|
2015-05-11 14:37:03 +08:00
|
|
|
return new FPExtInst(Load, II->getType());
|
2015-02-27 02:56:03 +08:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
case Intrinsic::ppc_qpx_qvlfd:
|
|
|
|
// Turn PPC QPX qvlfd -> load if the pointer is known aligned.
|
2016-12-19 16:22:17 +08:00
|
|
|
if (getOrEnforceKnownAlignment(II->getArgOperand(0), 32, DL, II, &AC,
|
2016-08-05 09:06:44 +08:00
|
|
|
&DT) >= 32) {
|
2017-07-08 07:16:26 +08:00
|
|
|
Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0),
|
2015-02-27 02:56:03 +08:00
|
|
|
PointerType::getUnqual(II->getType()));
|
2019-02-02 04:44:24 +08:00
|
|
|
return new LoadInst(II->getType(), Ptr);
|
2015-02-27 02:56:03 +08:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
case Intrinsic::ppc_qpx_qvstfs:
|
|
|
|
// Turn PPC QPX qvstfs -> store if the pointer is known aligned.
|
2016-12-19 16:22:17 +08:00
|
|
|
if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, DL, II, &AC,
|
2016-08-05 09:06:44 +08:00
|
|
|
&DT) >= 16) {
|
2017-07-08 07:16:26 +08:00
|
|
|
Type *VTy = VectorType::get(Builder.getFloatTy(),
|
2015-05-11 14:37:03 +08:00
|
|
|
II->getArgOperand(0)->getType()->getVectorNumElements());
|
2017-07-08 07:16:26 +08:00
|
|
|
Value *TOp = Builder.CreateFPTrunc(II->getArgOperand(0), VTy);
|
2015-05-11 14:37:03 +08:00
|
|
|
Type *OpPtrTy = PointerType::getUnqual(VTy);
|
2017-07-08 07:16:26 +08:00
|
|
|
Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy);
|
2015-05-11 14:37:03 +08:00
|
|
|
return new StoreInst(TOp, Ptr);
|
2015-02-27 02:56:03 +08:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
case Intrinsic::ppc_qpx_qvstfd:
|
|
|
|
// Turn PPC QPX qvstfd -> store if the pointer is known aligned.
|
2016-12-19 16:22:17 +08:00
|
|
|
if (getOrEnforceKnownAlignment(II->getArgOperand(1), 32, DL, II, &AC,
|
2016-08-05 09:06:44 +08:00
|
|
|
&DT) >= 32) {
|
2015-02-27 02:56:03 +08:00
|
|
|
Type *OpPtrTy =
|
|
|
|
PointerType::getUnqual(II->getArgOperand(0)->getType());
|
2017-07-08 07:16:26 +08:00
|
|
|
Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy);
|
2015-02-27 02:56:03 +08:00
|
|
|
return new StoreInst(II->getArgOperand(0), Ptr);
|
|
|
|
}
|
|
|
|
break;
|
2015-09-12 21:39:53 +08:00
|
|
|
|
2017-08-01 02:52:13 +08:00
|
|
|
case Intrinsic::x86_bmi_bextr_32:
|
|
|
|
case Intrinsic::x86_bmi_bextr_64:
|
|
|
|
case Intrinsic::x86_tbm_bextri_u32:
|
|
|
|
case Intrinsic::x86_tbm_bextri_u64:
|
|
|
|
// If the RHS is a constant we can try some simplifications.
|
|
|
|
if (auto *C = dyn_cast<ConstantInt>(II->getArgOperand(1))) {
|
|
|
|
uint64_t Shift = C->getZExtValue();
|
|
|
|
uint64_t Length = (Shift >> 8) & 0xff;
|
|
|
|
Shift &= 0xff;
|
|
|
|
unsigned BitWidth = II->getType()->getIntegerBitWidth();
|
|
|
|
// If the length is 0 or the shift is out of range, replace with zero.
|
|
|
|
if (Length == 0 || Shift >= BitWidth)
|
|
|
|
return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), 0));
|
|
|
|
// If the LHS is also a constant, we can completely constant fold this.
|
|
|
|
if (auto *InC = dyn_cast<ConstantInt>(II->getArgOperand(0))) {
|
|
|
|
uint64_t Result = InC->getZExtValue() >> Shift;
|
|
|
|
if (Length > BitWidth)
|
|
|
|
Length = BitWidth;
|
|
|
|
Result &= maskTrailingOnes<uint64_t>(Length);
|
|
|
|
return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Result));
|
|
|
|
}
|
|
|
|
// TODO should we turn this into 'and' if shift is 0? Or 'shl' if we
|
|
|
|
// are only masking bits that a shift already cleared?
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
2017-08-01 02:52:15 +08:00
|
|
|
case Intrinsic::x86_bmi_bzhi_32:
|
|
|
|
case Intrinsic::x86_bmi_bzhi_64:
|
|
|
|
// If the RHS is a constant we can try some simplifications.
|
|
|
|
if (auto *C = dyn_cast<ConstantInt>(II->getArgOperand(1))) {
|
|
|
|
uint64_t Index = C->getZExtValue() & 0xff;
|
|
|
|
unsigned BitWidth = II->getType()->getIntegerBitWidth();
|
|
|
|
if (Index >= BitWidth)
|
|
|
|
return replaceInstUsesWith(CI, II->getArgOperand(0));
|
|
|
|
if (Index == 0)
|
|
|
|
return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), 0));
|
|
|
|
// If the LHS is also a constant, we can completely constant fold this.
|
|
|
|
if (auto *InC = dyn_cast<ConstantInt>(II->getArgOperand(0))) {
|
|
|
|
uint64_t Result = InC->getZExtValue();
|
|
|
|
Result &= maskTrailingOnes<uint64_t>(Index);
|
|
|
|
return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Result));
|
|
|
|
}
|
|
|
|
// TODO should we convert this to an AND if the RHS is constant?
|
|
|
|
}
|
|
|
|
break;
|
2020-01-01 07:06:47 +08:00
|
|
|
case Intrinsic::x86_bmi_pext_32:
|
|
|
|
case Intrinsic::x86_bmi_pext_64:
|
|
|
|
if (auto *MaskC = dyn_cast<ConstantInt>(II->getArgOperand(1))) {
|
|
|
|
if (MaskC->isNullValue())
|
|
|
|
return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), 0));
|
|
|
|
if (MaskC->isAllOnesValue())
|
|
|
|
return replaceInstUsesWith(CI, II->getArgOperand(0));
|
|
|
|
|
|
|
|
if (auto *SrcC = dyn_cast<ConstantInt>(II->getArgOperand(0))) {
|
|
|
|
uint64_t Src = SrcC->getZExtValue();
|
|
|
|
uint64_t Mask = MaskC->getZExtValue();
|
|
|
|
uint64_t Result = 0;
uint64_t BitToSet = 1;
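// Emulate PEXT: walk the mask from the lowest set bit upward; each set mask
// bit selects the corresponding source bit and packs it into the next free
// low bit of the result.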
while (Mask) {
|
|
|
|
// Isolate lowest set bit.
|
|
|
|
uint64_t BitToTest = Mask & -Mask;
|
|
|
|
if (BitToTest & Src)
|
|
|
|
Result |= BitToSet;
|
|
|
|
|
|
|
|
BitToSet <<= 1;
|
|
|
|
// Clear lowest set bit.
|
|
|
|
Mask &= Mask - 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Result));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case Intrinsic::x86_bmi_pdep_32:
|
|
|
|
case Intrinsic::x86_bmi_pdep_64:
|
|
|
|
if (auto *MaskC = dyn_cast<ConstantInt>(II->getArgOperand(1))) {
|
|
|
|
if (MaskC->isNullValue())
|
|
|
|
return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), 0));
|
|
|
|
if (MaskC->isAllOnesValue())
|
|
|
|
return replaceInstUsesWith(CI, II->getArgOperand(0));
|
|
|
|
|
|
|
|
if (auto *SrcC = dyn_cast<ConstantInt>(II->getArgOperand(0))) {
|
|
|
|
uint64_t Src = SrcC->getZExtValue();
|
|
|
|
uint64_t Mask = MaskC->getZExtValue();
|
|
|
|
uint64_t Result = 0;
uint64_t BitToTest = 1;
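// Emulate PDEP: walk the mask from the lowest set bit upward; each set mask
// bit receives the next low bit of the source, scattering the source bits
// into the mask positions.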
while (Mask) {
|
|
|
|
// Isolate lowest set bit.
|
|
|
|
uint64_t BitToSet = Mask & -Mask;
|
|
|
|
if (BitToTest & Src)
|
|
|
|
Result |= BitToSet;
|
|
|
|
|
|
|
|
BitToTest <<= 1;
|
|
|
|
// Clear lowest set bit.
|
|
|
|
Mask &= Mask - 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Result));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
break;
|
2017-08-01 02:52:15 +08:00
|
|
|
|
2015-09-12 21:39:53 +08:00
|
|
|
case Intrinsic::x86_vcvtph2ps_128:
|
|
|
|
case Intrinsic::x86_vcvtph2ps_256: {
|
|
|
|
auto Arg = II->getArgOperand(0);
|
|
|
|
auto ArgType = cast<VectorType>(Arg->getType());
|
|
|
|
auto RetType = cast<VectorType>(II->getType());
|
|
|
|
unsigned ArgWidth = ArgType->getNumElements();
|
|
|
|
unsigned RetWidth = RetType->getNumElements();
|
|
|
|
assert(RetWidth <= ArgWidth && "Unexpected input/return vector widths");
|
|
|
|
assert(ArgType->isIntOrIntVectorTy() &&
|
|
|
|
ArgType->getScalarSizeInBits() == 16 &&
|
|
|
|
"CVTPH2PS input type should be 16-bit integer vector");
|
|
|
|
assert(RetType->getScalarType()->isFloatTy() &&
|
|
|
|
"CVTPH2PS output type should be 32-bit float vector");
|
|
|
|
|
|
|
|
// Constant folding: Convert to generic half to single conversion.
|
2015-09-12 22:00:17 +08:00
|
|
|
if (isa<ConstantAggregateZero>(Arg))
|
2016-02-02 06:23:39 +08:00
|
|
|
return replaceInstUsesWith(*II, ConstantAggregateZero::get(RetType));
|
2015-09-12 21:39:53 +08:00
|
|
|
|
2015-09-12 22:00:17 +08:00
|
|
|
if (isa<ConstantDataVector>(Arg)) {
|
2015-09-12 21:39:53 +08:00
|
|
|
auto VectorHalfAsShorts = Arg;
|
|
|
|
if (RetWidth < ArgWidth) {
|
2016-06-12 08:41:19 +08:00
|
|
|
SmallVector<uint32_t, 8> SubVecMask;
|
2015-09-12 21:39:53 +08:00
|
|
|
for (unsigned i = 0; i != RetWidth; ++i)
|
|
|
|
SubVecMask.push_back((int)i);
|
2017-07-08 07:16:26 +08:00
|
|
|
VectorHalfAsShorts = Builder.CreateShuffleVector(
|
2015-09-12 21:39:53 +08:00
|
|
|
Arg, UndefValue::get(ArgType), SubVecMask);
|
|
|
|
}
|
|
|
|
|
|
|
|
auto VectorHalfType =
|
|
|
|
VectorType::get(Type::getHalfTy(II->getContext()), RetWidth);
|
|
|
|
auto VectorHalfs =
|
2017-07-08 07:16:26 +08:00
|
|
|
Builder.CreateBitCast(VectorHalfAsShorts, VectorHalfType);
|
|
|
|
auto VectorFloats = Builder.CreateFPExt(VectorHalfs, RetType);
|
2016-02-02 06:23:39 +08:00
|
|
|
return replaceInstUsesWith(*II, VectorFloats);
|
2015-09-12 21:39:53 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// We only use the lowest lanes of the argument.
|
2015-09-19 19:41:53 +08:00
|
|
|
if (Value *V = SimplifyDemandedVectorEltsLow(Arg, ArgWidth, RetWidth)) {
|
2015-09-12 21:39:53 +08:00
|
|
|
II->setArgOperand(0, V);
|
|
|
|
return II;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2011-01-10 15:19:37 +08:00
|
|
|
case Intrinsic::x86_sse_cvtss2si:
|
|
|
|
case Intrinsic::x86_sse_cvtss2si64:
|
|
|
|
case Intrinsic::x86_sse_cvttss2si:
|
|
|
|
case Intrinsic::x86_sse_cvttss2si64:
|
|
|
|
case Intrinsic::x86_sse2_cvtsd2si:
|
|
|
|
case Intrinsic::x86_sse2_cvtsd2si64:
|
|
|
|
case Intrinsic::x86_sse2_cvttsd2si:
|
2016-12-14 15:46:12 +08:00
|
|
|
case Intrinsic::x86_sse2_cvttsd2si64:
|
|
|
|
case Intrinsic::x86_avx512_vcvtss2si32:
|
|
|
|
case Intrinsic::x86_avx512_vcvtss2si64:
|
|
|
|
case Intrinsic::x86_avx512_vcvtss2usi32:
|
|
|
|
case Intrinsic::x86_avx512_vcvtss2usi64:
|
|
|
|
case Intrinsic::x86_avx512_vcvtsd2si32:
|
|
|
|
case Intrinsic::x86_avx512_vcvtsd2si64:
|
|
|
|
case Intrinsic::x86_avx512_vcvtsd2usi32:
|
|
|
|
case Intrinsic::x86_avx512_vcvtsd2usi64:
|
|
|
|
case Intrinsic::x86_avx512_cvttss2si:
|
|
|
|
case Intrinsic::x86_avx512_cvttss2si64:
|
|
|
|
case Intrinsic::x86_avx512_cvttss2usi:
|
|
|
|
case Intrinsic::x86_avx512_cvttss2usi64:
|
|
|
|
case Intrinsic::x86_avx512_cvttsd2si:
|
|
|
|
case Intrinsic::x86_avx512_cvttsd2si64:
|
|
|
|
case Intrinsic::x86_avx512_cvttsd2usi:
|
|
|
|
case Intrinsic::x86_avx512_cvttsd2usi64: {
|
2011-01-10 15:19:37 +08:00
|
|
|
// These intrinsics only demand the 0th element of their input vectors. If
|
2010-01-05 15:32:13 +08:00
|
|
|
// we can simplify the input based on that, do so now.
|
2015-09-19 19:41:53 +08:00
|
|
|
Value *Arg = II->getArgOperand(0);
|
|
|
|
unsigned VWidth = Arg->getType()->getVectorNumElements();
|
|
|
|
if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) {
|
2010-06-29 00:50:57 +08:00
|
|
|
II->setArgOperand(0, V);
|
2010-01-05 15:32:13 +08:00
|
|
|
return II;
|
|
|
|
}
|
2015-08-05 16:18:00 +08:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2016-06-07 16:18:35 +08:00
|
|
|
case Intrinsic::x86_mmx_pmovmskb:
|
|
|
|
case Intrinsic::x86_sse_movmsk_ps:
|
|
|
|
case Intrinsic::x86_sse2_movmsk_pd:
|
|
|
|
case Intrinsic::x86_sse2_pmovmskb_128:
|
|
|
|
case Intrinsic::x86_avx_movmsk_pd_256:
|
|
|
|
case Intrinsic::x86_avx_movmsk_ps_256:
|
2017-10-25 05:24:53 +08:00
|
|
|
case Intrinsic::x86_avx2_pmovmskb:
|
2018-12-12 00:38:03 +08:00
|
|
|
if (Value *V = simplifyX86movmsk(*II, Builder))
|
2016-06-07 16:18:35 +08:00
|
|
|
return replaceInstUsesWith(*II, V);
|
|
|
|
break;
|
|
|
|
|
2016-02-21 07:17:35 +08:00
|
|
|
case Intrinsic::x86_sse_comieq_ss:
|
|
|
|
case Intrinsic::x86_sse_comige_ss:
|
|
|
|
case Intrinsic::x86_sse_comigt_ss:
|
|
|
|
case Intrinsic::x86_sse_comile_ss:
|
|
|
|
case Intrinsic::x86_sse_comilt_ss:
|
|
|
|
case Intrinsic::x86_sse_comineq_ss:
|
|
|
|
case Intrinsic::x86_sse_ucomieq_ss:
|
|
|
|
case Intrinsic::x86_sse_ucomige_ss:
|
|
|
|
case Intrinsic::x86_sse_ucomigt_ss:
|
|
|
|
case Intrinsic::x86_sse_ucomile_ss:
|
|
|
|
case Intrinsic::x86_sse_ucomilt_ss:
|
|
|
|
case Intrinsic::x86_sse_ucomineq_ss:
|
|
|
|
case Intrinsic::x86_sse2_comieq_sd:
|
|
|
|
case Intrinsic::x86_sse2_comige_sd:
|
|
|
|
case Intrinsic::x86_sse2_comigt_sd:
|
|
|
|
case Intrinsic::x86_sse2_comile_sd:
|
|
|
|
case Intrinsic::x86_sse2_comilt_sd:
|
|
|
|
case Intrinsic::x86_sse2_comineq_sd:
|
|
|
|
case Intrinsic::x86_sse2_ucomieq_sd:
|
|
|
|
case Intrinsic::x86_sse2_ucomige_sd:
|
|
|
|
case Intrinsic::x86_sse2_ucomigt_sd:
|
|
|
|
case Intrinsic::x86_sse2_ucomile_sd:
|
|
|
|
case Intrinsic::x86_sse2_ucomilt_sd:
|
2016-12-11 15:42:04 +08:00
|
|
|
case Intrinsic::x86_sse2_ucomineq_sd:
|
2016-12-31 08:45:06 +08:00
|
|
|
case Intrinsic::x86_avx512_vcomi_ss:
|
|
|
|
case Intrinsic::x86_avx512_vcomi_sd:
|
2016-12-11 15:42:04 +08:00
|
|
|
case Intrinsic::x86_avx512_mask_cmp_ss:
|
|
|
|
case Intrinsic::x86_avx512_mask_cmp_sd: {
|
2016-02-21 07:17:35 +08:00
|
|
|
// These intrinsics only demand the 0th element of their input vectors. If
|
|
|
|
// we can simplify the input based on that, do so now.
|
2016-04-25 01:57:27 +08:00
|
|
|
bool MadeChange = false;
|
2016-02-21 07:17:35 +08:00
|
|
|
Value *Arg0 = II->getArgOperand(0);
|
|
|
|
Value *Arg1 = II->getArgOperand(1);
|
|
|
|
unsigned VWidth = Arg0->getType()->getVectorNumElements();
|
|
|
|
if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) {
|
|
|
|
II->setArgOperand(0, V);
|
2016-04-25 01:57:27 +08:00
|
|
|
MadeChange = true;
|
2016-02-21 07:17:35 +08:00
|
|
|
}
|
|
|
|
if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {
|
|
|
|
II->setArgOperand(1, V);
|
2016-04-25 01:57:27 +08:00
|
|
|
MadeChange = true;
|
2016-02-21 07:17:35 +08:00
|
|
|
}
|
2016-04-25 01:57:27 +08:00
|
|
|
if (MadeChange)
|
|
|
|
return II;
|
2016-02-21 07:17:35 +08:00
|
|
|
break;
|
|
|
|
}
|
2018-06-27 23:57:53 +08:00
|
|
|
case Intrinsic::x86_avx512_cmp_pd_128:
|
|
|
|
case Intrinsic::x86_avx512_cmp_pd_256:
|
|
|
|
case Intrinsic::x86_avx512_cmp_pd_512:
|
|
|
|
case Intrinsic::x86_avx512_cmp_ps_128:
|
|
|
|
case Intrinsic::x86_avx512_cmp_ps_256:
|
|
|
|
case Intrinsic::x86_avx512_cmp_ps_512: {
|
2017-04-16 21:26:08 +08:00
|
|
|
// Folding cmp(sub(a,b),0) -> cmp(a,b) and cmp(0,sub(a,b)) -> cmp(b,a)
|
|
|
|
Value *Arg0 = II->getArgOperand(0);
|
|
|
|
Value *Arg1 = II->getArgOperand(1);
|
2018-03-26 05:16:33 +08:00
|
|
|
bool Arg0IsZero = match(Arg0, m_PosZeroFP());
|
2017-04-16 21:26:08 +08:00
|
|
|
if (Arg0IsZero)
|
|
|
|
std::swap(Arg0, Arg1);
|
|
|
|
Value *A, *B;
|
|
|
|
// This fold only requires the NINF (no infinities) fast-math flag, because
// inf minus inf is nan.
|
|
|
|
// NSZ(No Signed Zeros) is not needed because zeros of any sign are
|
|
|
|
// equal for both compares.
|
|
|
|
// NNAN is not needed because nans compare the same for both compares.
|
|
|
|
// The compare intrinsic uses the above assumptions and therefore
|
|
|
|
// doesn't require additional flags.
|
|
|
|
if ((match(Arg0, m_OneUse(m_FSub(m_Value(A), m_Value(B)))) &&
|
2018-03-26 05:16:33 +08:00
|
|
|
match(Arg1, m_PosZeroFP()) && isa<Instruction>(Arg0) &&
|
2017-04-16 21:26:08 +08:00
|
|
|
cast<Instruction>(Arg0)->getFastMathFlags().noInfs())) {
|
|
|
|
if (Arg0IsZero)
|
|
|
|
std::swap(A, B);
|
|
|
|
II->setArgOperand(0, A);
|
|
|
|
II->setArgOperand(1, B);
|
|
|
|
return II;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
2016-02-21 07:17:35 +08:00
|
|
|
|
2018-06-10 14:01:36 +08:00
|
|
|
case Intrinsic::x86_avx512_add_ps_512:
|
|
|
|
case Intrinsic::x86_avx512_div_ps_512:
|
|
|
|
case Intrinsic::x86_avx512_mul_ps_512:
|
|
|
|
case Intrinsic::x86_avx512_sub_ps_512:
|
|
|
|
case Intrinsic::x86_avx512_add_pd_512:
|
|
|
|
case Intrinsic::x86_avx512_div_pd_512:
|
|
|
|
case Intrinsic::x86_avx512_mul_pd_512:
|
|
|
|
case Intrinsic::x86_avx512_sub_pd_512:
|
2016-12-27 08:23:16 +08:00
|
|
|
// If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
|
|
|
|
// IR operations.
|
2018-06-10 14:01:36 +08:00
|
|
|
if (auto *R = dyn_cast<ConstantInt>(II->getArgOperand(2))) {
|
2016-12-27 08:23:16 +08:00
|
|
|
if (R->getValue() == 4) {
|
|
|
|
Value *Arg0 = II->getArgOperand(0);
|
|
|
|
Value *Arg1 = II->getArgOperand(1);
|
|
|
|
|
|
|
|
Value *V;
|
2019-05-06 23:35:02 +08:00
|
|
|
switch (IID) {
|
2016-12-27 08:23:16 +08:00
|
|
|
default: llvm_unreachable("Case stmts out of sync!");
|
2018-06-10 14:01:36 +08:00
|
|
|
case Intrinsic::x86_avx512_add_ps_512:
|
|
|
|
case Intrinsic::x86_avx512_add_pd_512:
|
2017-07-08 07:16:26 +08:00
|
|
|
V = Builder.CreateFAdd(Arg0, Arg1);
|
2016-12-27 08:23:16 +08:00
|
|
|
break;
|
2018-06-10 14:01:36 +08:00
|
|
|
case Intrinsic::x86_avx512_sub_ps_512:
|
|
|
|
case Intrinsic::x86_avx512_sub_pd_512:
|
2017-07-08 07:16:26 +08:00
|
|
|
V = Builder.CreateFSub(Arg0, Arg1);
|
2016-12-27 08:23:16 +08:00
|
|
|
break;
|
2018-06-10 14:01:36 +08:00
|
|
|
case Intrinsic::x86_avx512_mul_ps_512:
|
|
|
|
case Intrinsic::x86_avx512_mul_pd_512:
|
2017-07-08 07:16:26 +08:00
|
|
|
V = Builder.CreateFMul(Arg0, Arg1);
|
2016-12-27 08:23:16 +08:00
|
|
|
break;
|
2018-06-10 14:01:36 +08:00
|
|
|
case Intrinsic::x86_avx512_div_ps_512:
|
|
|
|
case Intrinsic::x86_avx512_div_pd_512:
|
2017-07-08 07:16:26 +08:00
|
|
|
V = Builder.CreateFDiv(Arg0, Arg1);
|
2016-12-27 08:23:16 +08:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
return replaceInstUsesWith(*II, V);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
break;
  case Intrinsic::x86_avx512_mask_add_ss_round:
  case Intrinsic::x86_avx512_mask_div_ss_round:
  case Intrinsic::x86_avx512_mask_mul_ss_round:
  case Intrinsic::x86_avx512_mask_sub_ss_round:
  case Intrinsic::x86_avx512_mask_add_sd_round:
  case Intrinsic::x86_avx512_mask_div_sd_round:
  case Intrinsic::x86_avx512_mask_mul_sd_round:
  case Intrinsic::x86_avx512_mask_sub_sd_round:
    // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
    // IR operations.
    if (auto *R = dyn_cast<ConstantInt>(II->getArgOperand(4))) {
      if (R->getValue() == 4) {
        // Extract the element as scalars.
        Value *Arg0 = II->getArgOperand(0);
        Value *Arg1 = II->getArgOperand(1);
        Value *LHS = Builder.CreateExtractElement(Arg0, (uint64_t)0);
        Value *RHS = Builder.CreateExtractElement(Arg1, (uint64_t)0);

        Value *V;
        switch (IID) {
        default: llvm_unreachable("Case stmts out of sync!");
        case Intrinsic::x86_avx512_mask_add_ss_round:
        case Intrinsic::x86_avx512_mask_add_sd_round:
          V = Builder.CreateFAdd(LHS, RHS);
          break;
        case Intrinsic::x86_avx512_mask_sub_ss_round:
        case Intrinsic::x86_avx512_mask_sub_sd_round:
          V = Builder.CreateFSub(LHS, RHS);
          break;
        case Intrinsic::x86_avx512_mask_mul_ss_round:
        case Intrinsic::x86_avx512_mask_mul_sd_round:
          V = Builder.CreateFMul(LHS, RHS);
          break;
        case Intrinsic::x86_avx512_mask_div_ss_round:
        case Intrinsic::x86_avx512_mask_div_sd_round:
          V = Builder.CreateFDiv(LHS, RHS);
          break;
        }

        // Handle the masking aspect of the intrinsic.
        Value *Mask = II->getArgOperand(3);
        auto *C = dyn_cast<ConstantInt>(Mask);
        // We don't need a select if we know the mask bit is a 1.
        if (!C || !C->getValue()[0]) {
          // Cast the mask to an i1 vector and then extract the lowest element.
          auto *MaskTy = VectorType::get(Builder.getInt1Ty(),
                             cast<IntegerType>(Mask->getType())->getBitWidth());
          Mask = Builder.CreateBitCast(Mask, MaskTy);
          Mask = Builder.CreateExtractElement(Mask, (uint64_t)0);
          // Extract the lowest element from the passthru operand.
          Value *Passthru = Builder.CreateExtractElement(II->getArgOperand(2),
                                                         (uint64_t)0);
          V = Builder.CreateSelect(Mask, V, Passthru);
        }

        // Insert the result back into the original argument 0.
        V = Builder.CreateInsertElement(Arg0, V, (uint64_t)0);

        return replaceInstUsesWith(*II, V);
      }
    }
    break;
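
  // Sketch of the fold performed by simplifyX86immShift for the cases below
  // (illustrative): a shift-by-immediate such as
  //   %r = call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %v, i32 3)
  // can be rewritten as 'lshr <4 x i32> %v, <i32 3, i32 3, i32 3, i32 3>'.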
  // Constant fold ashr( <A x Bi>, Ci ).
  // Constant fold lshr( <A x Bi>, Ci ).
  // Constant fold shl( <A x Bi>, Ci ).
  case Intrinsic::x86_sse2_psrai_d:
  case Intrinsic::x86_sse2_psrai_w:
  case Intrinsic::x86_avx2_psrai_d:
  case Intrinsic::x86_avx2_psrai_w:
  case Intrinsic::x86_avx512_psrai_q_128:
  case Intrinsic::x86_avx512_psrai_q_256:
  case Intrinsic::x86_avx512_psrai_d_512:
  case Intrinsic::x86_avx512_psrai_q_512:
  case Intrinsic::x86_avx512_psrai_w_512:
  case Intrinsic::x86_sse2_psrli_d:
  case Intrinsic::x86_sse2_psrli_q:
  case Intrinsic::x86_sse2_psrli_w:
  case Intrinsic::x86_avx2_psrli_d:
  case Intrinsic::x86_avx2_psrli_q:
  case Intrinsic::x86_avx2_psrli_w:
  case Intrinsic::x86_avx512_psrli_d_512:
  case Intrinsic::x86_avx512_psrli_q_512:
  case Intrinsic::x86_avx512_psrli_w_512:
  case Intrinsic::x86_sse2_pslli_d:
  case Intrinsic::x86_sse2_pslli_q:
  case Intrinsic::x86_sse2_pslli_w:
  case Intrinsic::x86_avx2_pslli_d:
  case Intrinsic::x86_avx2_pslli_q:
  case Intrinsic::x86_avx2_pslli_w:
  case Intrinsic::x86_avx512_pslli_d_512:
  case Intrinsic::x86_avx512_pslli_q_512:
  case Intrinsic::x86_avx512_pslli_w_512:
    if (Value *V = simplifyX86immShift(*II, Builder))
      return replaceInstUsesWith(*II, V);
    break;
  case Intrinsic::x86_sse2_psra_d:
  case Intrinsic::x86_sse2_psra_w:
  case Intrinsic::x86_avx2_psra_d:
  case Intrinsic::x86_avx2_psra_w:
  case Intrinsic::x86_avx512_psra_q_128:
  case Intrinsic::x86_avx512_psra_q_256:
  case Intrinsic::x86_avx512_psra_d_512:
  case Intrinsic::x86_avx512_psra_q_512:
  case Intrinsic::x86_avx512_psra_w_512:
  case Intrinsic::x86_sse2_psrl_d:
  case Intrinsic::x86_sse2_psrl_q:
  case Intrinsic::x86_sse2_psrl_w:
  case Intrinsic::x86_avx2_psrl_d:
  case Intrinsic::x86_avx2_psrl_q:
  case Intrinsic::x86_avx2_psrl_w:
  case Intrinsic::x86_avx512_psrl_d_512:
  case Intrinsic::x86_avx512_psrl_q_512:
  case Intrinsic::x86_avx512_psrl_w_512:
  case Intrinsic::x86_sse2_psll_d:
  case Intrinsic::x86_sse2_psll_q:
  case Intrinsic::x86_sse2_psll_w:
  case Intrinsic::x86_avx2_psll_d:
  case Intrinsic::x86_avx2_psll_q:
  case Intrinsic::x86_avx2_psll_w:
  case Intrinsic::x86_avx512_psll_d_512:
  case Intrinsic::x86_avx512_psll_q_512:
  case Intrinsic::x86_avx512_psll_w_512: {
    if (Value *V = simplifyX86immShift(*II, Builder))
      return replaceInstUsesWith(*II, V);

    // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector
    // operand to compute the shift amount.
    Value *Arg1 = II->getArgOperand(1);
    assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 &&
           "Unexpected packed shift size");
    unsigned VWidth = Arg1->getType()->getVectorNumElements();

    if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) {
      II->setArgOperand(1, V);
      return II;
    }
    break;
  }
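
  // The variable (per-element) shift intrinsics below are handled by
  // simplifyX86varShift, which can likewise replace them with generic
  // shl/lshr/ashr IR when the shift amounts are known to be in range
  // (a sketch of the helper's intent, not a full specification).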
  case Intrinsic::x86_avx2_psllv_d:
  case Intrinsic::x86_avx2_psllv_d_256:
  case Intrinsic::x86_avx2_psllv_q:
  case Intrinsic::x86_avx2_psllv_q_256:
  case Intrinsic::x86_avx512_psllv_d_512:
  case Intrinsic::x86_avx512_psllv_q_512:
  case Intrinsic::x86_avx512_psllv_w_128:
  case Intrinsic::x86_avx512_psllv_w_256:
  case Intrinsic::x86_avx512_psllv_w_512:
  case Intrinsic::x86_avx2_psrav_d:
  case Intrinsic::x86_avx2_psrav_d_256:
  case Intrinsic::x86_avx512_psrav_q_128:
  case Intrinsic::x86_avx512_psrav_q_256:
  case Intrinsic::x86_avx512_psrav_d_512:
  case Intrinsic::x86_avx512_psrav_q_512:
  case Intrinsic::x86_avx512_psrav_w_128:
  case Intrinsic::x86_avx512_psrav_w_256:
  case Intrinsic::x86_avx512_psrav_w_512:
  case Intrinsic::x86_avx2_psrlv_d:
  case Intrinsic::x86_avx2_psrlv_d_256:
  case Intrinsic::x86_avx2_psrlv_q:
  case Intrinsic::x86_avx2_psrlv_q_256:
  case Intrinsic::x86_avx512_psrlv_d_512:
  case Intrinsic::x86_avx512_psrlv_q_512:
  case Intrinsic::x86_avx512_psrlv_w_128:
  case Intrinsic::x86_avx512_psrlv_w_256:
  case Intrinsic::x86_avx512_psrlv_w_512:
    if (Value *V = simplifyX86varShift(*II, Builder))
      return replaceInstUsesWith(*II, V);
    break;

  case Intrinsic::x86_sse2_packssdw_128:
  case Intrinsic::x86_sse2_packsswb_128:
  case Intrinsic::x86_avx2_packssdw:
  case Intrinsic::x86_avx2_packsswb:
  case Intrinsic::x86_avx512_packssdw_512:
  case Intrinsic::x86_avx512_packsswb_512:
    if (Value *V = simplifyX86pack(*II, Builder, true))
      return replaceInstUsesWith(*II, V);
    break;

  case Intrinsic::x86_sse2_packuswb_128:
  case Intrinsic::x86_sse41_packusdw:
  case Intrinsic::x86_avx2_packusdw:
  case Intrinsic::x86_avx2_packuswb:
  case Intrinsic::x86_avx512_packusdw_512:
  case Intrinsic::x86_avx512_packuswb_512:
    if (Value *V = simplifyX86pack(*II, Builder, false))
      return replaceInstUsesWith(*II, V);
    break;
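
  // For PCLMULQDQ below, bit 0 of the immediate selects which 64-bit half of
  // the first operand is used and bit 4 selects the half of the second, so
  // only that element of each source vector is actually demanded.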
  case Intrinsic::x86_pclmulqdq:
  case Intrinsic::x86_pclmulqdq_256:
  case Intrinsic::x86_pclmulqdq_512: {
    if (auto *C = dyn_cast<ConstantInt>(II->getArgOperand(2))) {
      unsigned Imm = C->getZExtValue();

      bool MadeChange = false;
      Value *Arg0 = II->getArgOperand(0);
      Value *Arg1 = II->getArgOperand(1);
      unsigned VWidth = Arg0->getType()->getVectorNumElements();

      APInt UndefElts1(VWidth, 0);
      APInt DemandedElts1 = APInt::getSplat(VWidth,
                                            APInt(2, (Imm & 0x01) ? 2 : 1));
      if (Value *V = SimplifyDemandedVectorElts(Arg0, DemandedElts1,
                                                UndefElts1)) {
        II->setArgOperand(0, V);
        MadeChange = true;
      }

      APInt UndefElts2(VWidth, 0);
      APInt DemandedElts2 = APInt::getSplat(VWidth,
                                            APInt(2, (Imm & 0x10) ? 2 : 1));
      if (Value *V = SimplifyDemandedVectorElts(Arg1, DemandedElts2,
                                                UndefElts2)) {
        II->setArgOperand(1, V);
        MadeChange = true;
      }

      // If either demanded input element is undef, the result is zero.
      if (DemandedElts1.isSubsetOf(UndefElts1) ||
          DemandedElts2.isSubsetOf(UndefElts2))
        return replaceInstUsesWith(*II,
                                   ConstantAggregateZero::get(II->getType()));

      if (MadeChange)
        return II;
    }
    break;
  }

  case Intrinsic::x86_sse41_insertps:
    if (Value *V = simplifyX86insertps(*II, Builder))
      return replaceInstUsesWith(*II, V);
    break;

  case Intrinsic::x86_sse4a_extrq: {
    Value *Op0 = II->getArgOperand(0);
    Value *Op1 = II->getArgOperand(1);
    unsigned VWidth0 = Op0->getType()->getVectorNumElements();
    unsigned VWidth1 = Op1->getType()->getVectorNumElements();
    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
           Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
           VWidth1 == 16 && "Unexpected operand sizes");

    // See if we're dealing with constant values.
    Constant *C1 = dyn_cast<Constant>(Op1);
    ConstantInt *CILength =
        C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
           : nullptr;
    ConstantInt *CIIndex =
        C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
           : nullptr;

    // Attempt to simplify to a constant, shuffle vector or EXTRQI call.
    if (Value *V = simplifyX86extrq(*II, Op0, CILength, CIIndex, Builder))
      return replaceInstUsesWith(*II, V);

    // EXTRQ only uses the lowest 64-bits of the first 128-bit vector
    // operands and the lowest 16-bits of the second.
    bool MadeChange = false;
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
      II->setArgOperand(0, V);
      MadeChange = true;
    }
    if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) {
      II->setArgOperand(1, V);
      MadeChange = true;
    }
    if (MadeChange)
      return II;
    break;
  }

  case Intrinsic::x86_sse4a_extrqi: {
    // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining
    // bits of the lower 64-bits. The upper 64-bits are undefined.
    Value *Op0 = II->getArgOperand(0);
    unsigned VWidth = Op0->getType()->getVectorNumElements();
    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
           "Unexpected operand size");

    // See if we're dealing with constant values.
    ConstantInt *CILength = dyn_cast<ConstantInt>(II->getArgOperand(1));
    ConstantInt *CIIndex = dyn_cast<ConstantInt>(II->getArgOperand(2));

    // Attempt to simplify to a constant or shuffle vector.
    if (Value *V = simplifyX86extrq(*II, Op0, CILength, CIIndex, Builder))
      return replaceInstUsesWith(*II, V);

    // EXTRQI only uses the lowest 64-bits of the first 128-bit vector
    // operand.
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
      II->setArgOperand(0, V);
      return II;
    }
    break;
  }

  case Intrinsic::x86_sse4a_insertq: {
    Value *Op0 = II->getArgOperand(0);
    Value *Op1 = II->getArgOperand(1);
    unsigned VWidth = Op0->getType()->getVectorNumElements();
    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
           Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
           Op1->getType()->getVectorNumElements() == 2 &&
           "Unexpected operand size");

    // See if we're dealing with constant values.
    Constant *C1 = dyn_cast<Constant>(Op1);
    ConstantInt *CI11 =
        C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
           : nullptr;

    // Attempt to simplify to a constant, shuffle vector or INSERTQI call.
    if (CI11) {
      const APInt &V11 = CI11->getValue();
      APInt Len = V11.zextOrTrunc(6);
      APInt Idx = V11.lshr(8).zextOrTrunc(6);
      if (Value *V = simplifyX86insertq(*II, Op0, Op1, Len, Idx, Builder))
        return replaceInstUsesWith(*II, V);
    }

    // INSERTQ only uses the lowest 64-bits of the first 128-bit vector
    // operand.
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
      II->setArgOperand(0, V);
      return II;
    }
    break;
  }

  case Intrinsic::x86_sse4a_insertqi: {
    // INSERTQI: Extract lowest Length bits from lower half of second source and
    // insert over first source starting at Index bit. The upper 64-bits are
    // undefined.
    Value *Op0 = II->getArgOperand(0);
    Value *Op1 = II->getArgOperand(1);
    unsigned VWidth0 = Op0->getType()->getVectorNumElements();
    unsigned VWidth1 = Op1->getType()->getVectorNumElements();
    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
           Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
           VWidth1 == 2 && "Unexpected operand sizes");

    // See if we're dealing with constant values.
    ConstantInt *CILength = dyn_cast<ConstantInt>(II->getArgOperand(2));
    ConstantInt *CIIndex = dyn_cast<ConstantInt>(II->getArgOperand(3));

    // Attempt to simplify to a constant or shuffle vector.
    if (CILength && CIIndex) {
      APInt Len = CILength->getValue().zextOrTrunc(6);
      APInt Idx = CIIndex->getValue().zextOrTrunc(6);
      if (Value *V = simplifyX86insertq(*II, Op0, Op1, Len, Idx, Builder))
        return replaceInstUsesWith(*II, V);
    }

    // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector
    // operands.
    bool MadeChange = false;
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
      II->setArgOperand(0, V);
      MadeChange = true;
    }
    if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) {
      II->setArgOperand(1, V);
      MadeChange = true;
    }
    if (MadeChange)
      return II;
    break;
  }
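
  // Illustrative example for the blend folds below: with a compile-time mask,
  //   %r = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a,
  //                                                  <4 x float> %b,
  //                                                  <4 x float> %m)
  // becomes a plain 'select' on a <4 x i1> condition derived from the sign
  // bit of each mask lane (assuming the standard blendvps overload).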
  case Intrinsic::x86_sse41_pblendvb:
  case Intrinsic::x86_sse41_blendvps:
  case Intrinsic::x86_sse41_blendvpd:
  case Intrinsic::x86_avx_blendv_ps_256:
  case Intrinsic::x86_avx_blendv_pd_256:
  case Intrinsic::x86_avx2_pblendvb: {
    // fold (blend A, A, Mask) -> A
    Value *Op0 = II->getArgOperand(0);
    Value *Op1 = II->getArgOperand(1);
    Value *Mask = II->getArgOperand(2);
    if (Op0 == Op1)
      return replaceInstUsesWith(CI, Op0);

    // Zero Mask - select 1st argument.
    if (isa<ConstantAggregateZero>(Mask))
      return replaceInstUsesWith(CI, Op0);

    // Constant Mask - select 1st/2nd argument lane based on top bit of mask.
    if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) {
      Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask);
      return SelectInst::Create(NewSelector, Op1, Op0, "blendv");
    }

    // Convert to a vector select if we can bypass casts and find a boolean
    // vector condition value.
    Value *BoolVec;
    Mask = peekThroughBitcast(Mask);
    if (match(Mask, m_SExt(m_Value(BoolVec))) &&
        BoolVec->getType()->isVectorTy() &&
        BoolVec->getType()->getScalarSizeInBits() == 1) {
      assert(Mask->getType()->getPrimitiveSizeInBits() ==
             II->getType()->getPrimitiveSizeInBits() &&
             "Not expecting mask and operands with different sizes");

      unsigned NumMaskElts = Mask->getType()->getVectorNumElements();
      unsigned NumOperandElts = II->getType()->getVectorNumElements();
      if (NumMaskElts == NumOperandElts)
        return SelectInst::Create(BoolVec, Op1, Op0);

      // If the mask has fewer elements than the operands, each mask bit maps
      // to multiple elements of the operands. Bitcast back and forth.
      if (NumMaskElts < NumOperandElts) {
        Value *CastOp0 = Builder.CreateBitCast(Op0, Mask->getType());
        Value *CastOp1 = Builder.CreateBitCast(Op1, Mask->getType());
        Value *Sel = Builder.CreateSelect(BoolVec, CastOp1, CastOp0);
        return new BitCastInst(Sel, II->getType());
      }
    }

    break;
  }

  case Intrinsic::x86_ssse3_pshuf_b_128:
  case Intrinsic::x86_avx2_pshuf_b:
  case Intrinsic::x86_avx512_pshuf_b_512:
    if (Value *V = simplifyX86pshufb(*II, Builder))
      return replaceInstUsesWith(*II, V);
    break;

  case Intrinsic::x86_avx_vpermilvar_ps:
  case Intrinsic::x86_avx_vpermilvar_ps_256:
  case Intrinsic::x86_avx512_vpermilvar_ps_512:
  case Intrinsic::x86_avx_vpermilvar_pd:
  case Intrinsic::x86_avx_vpermilvar_pd_256:
  case Intrinsic::x86_avx512_vpermilvar_pd_512:
    if (Value *V = simplifyX86vpermilvar(*II, Builder))
      return replaceInstUsesWith(*II, V);
    break;

  case Intrinsic::x86_avx2_permd:
  case Intrinsic::x86_avx2_permps:
  case Intrinsic::x86_avx512_permvar_df_256:
  case Intrinsic::x86_avx512_permvar_df_512:
  case Intrinsic::x86_avx512_permvar_di_256:
  case Intrinsic::x86_avx512_permvar_di_512:
  case Intrinsic::x86_avx512_permvar_hi_128:
  case Intrinsic::x86_avx512_permvar_hi_256:
  case Intrinsic::x86_avx512_permvar_hi_512:
  case Intrinsic::x86_avx512_permvar_qi_128:
  case Intrinsic::x86_avx512_permvar_qi_256:
  case Intrinsic::x86_avx512_permvar_qi_512:
  case Intrinsic::x86_avx512_permvar_sf_512:
  case Intrinsic::x86_avx512_permvar_si_512:
    if (Value *V = simplifyX86vpermv(*II, Builder))
      return replaceInstUsesWith(*II, V);
    break;

  case Intrinsic::x86_avx_maskload_ps:
  case Intrinsic::x86_avx_maskload_pd:
  case Intrinsic::x86_avx_maskload_ps_256:
  case Intrinsic::x86_avx_maskload_pd_256:
  case Intrinsic::x86_avx2_maskload_d:
  case Intrinsic::x86_avx2_maskload_q:
  case Intrinsic::x86_avx2_maskload_d_256:
  case Intrinsic::x86_avx2_maskload_q_256:
    if (Instruction *I = simplifyX86MaskedLoad(*II, *this))
      return I;
    break;

  case Intrinsic::x86_sse2_maskmov_dqu:
  case Intrinsic::x86_avx_maskstore_ps:
  case Intrinsic::x86_avx_maskstore_pd:
  case Intrinsic::x86_avx_maskstore_ps_256:
  case Intrinsic::x86_avx_maskstore_pd_256:
  case Intrinsic::x86_avx2_maskstore_d:
  case Intrinsic::x86_avx2_maskstore_q:
  case Intrinsic::x86_avx2_maskstore_d_256:
  case Intrinsic::x86_avx2_maskstore_q_256:
    if (simplifyX86MaskedStore(*II, *this))
      return nullptr;
    break;

  case Intrinsic::x86_addcarry_32:
  case Intrinsic::x86_addcarry_64:
    if (Value *V = simplifyX86addcarry(*II, Builder))
      return replaceInstUsesWith(*II, V);
    break;
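
  // The vperm fold below turns a constant-mask permute into IR extractelement
  // and insertelement operations (which later combines can turn into a
  // shufflevector); the endian handling mirrors what altivec.h does for
  // vec_perm.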
  case Intrinsic::ppc_altivec_vperm:
    // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant.
    // Note that ppc_altivec_vperm has a big-endian bias, so when creating
    // a vector shuffle for little endian, we must undo the transformation
    // performed on vec_perm in altivec.h. That is, we must complement
    // the permutation mask with respect to 31 and reverse the order of
    // V1 and V2.
    if (Constant *Mask = dyn_cast<Constant>(II->getArgOperand(2))) {
      assert(Mask->getType()->getVectorNumElements() == 16 &&
             "Bad type for intrinsic!");

      // Check that all of the elements are integer constants or undefs.
      bool AllEltsOk = true;
      for (unsigned i = 0; i != 16; ++i) {
        Constant *Elt = Mask->getAggregateElement(i);
        if (!Elt || !(isa<ConstantInt>(Elt) || isa<UndefValue>(Elt))) {
          AllEltsOk = false;
          break;
        }
      }

      if (AllEltsOk) {
        // Cast the input vectors to byte vectors.
        Value *Op0 = Builder.CreateBitCast(II->getArgOperand(0),
                                           Mask->getType());
        Value *Op1 = Builder.CreateBitCast(II->getArgOperand(1),
                                           Mask->getType());
        Value *Result = UndefValue::get(Op0->getType());

        // Only extract each element once.
        Value *ExtractedElts[32];
        memset(ExtractedElts, 0, sizeof(ExtractedElts));

        for (unsigned i = 0; i != 16; ++i) {
          if (isa<UndefValue>(Mask->getAggregateElement(i)))
            continue;
          unsigned Idx =
            cast<ConstantInt>(Mask->getAggregateElement(i))->getZExtValue();
          Idx &= 31;  // Match the hardware behavior.
          if (DL.isLittleEndian())
            Idx = 31 - Idx;

          if (!ExtractedElts[Idx]) {
            Value *Op0ToUse = (DL.isLittleEndian()) ? Op1 : Op0;
            Value *Op1ToUse = (DL.isLittleEndian()) ? Op0 : Op1;
            ExtractedElts[Idx] =
              Builder.CreateExtractElement(Idx < 16 ? Op0ToUse : Op1ToUse,
                                           Builder.getInt32(Idx & 15));
          }

          // Insert this value into the result vector.
          Result = Builder.CreateInsertElement(Result, ExtractedElts[Idx],
                                               Builder.getInt32(i));
        }
        return CastInst::Create(Instruction::BitCast, Result, CI.getType());
      }
    }
    break;

  case Intrinsic::arm_neon_vld1: {
    unsigned MemAlign = getKnownAlignment(II->getArgOperand(0),
                                          DL, II, &AC, &DT);
    if (Value *V = simplifyNeonVld1(*II, MemAlign, Builder))
      return replaceInstUsesWith(*II, V);
    break;
  }

  case Intrinsic::arm_neon_vld2:
  case Intrinsic::arm_neon_vld3:
  case Intrinsic::arm_neon_vld4:
  case Intrinsic::arm_neon_vld2lane:
  case Intrinsic::arm_neon_vld3lane:
  case Intrinsic::arm_neon_vld4lane:
  case Intrinsic::arm_neon_vst1:
  case Intrinsic::arm_neon_vst2:
  case Intrinsic::arm_neon_vst3:
  case Intrinsic::arm_neon_vst4:
  case Intrinsic::arm_neon_vst2lane:
  case Intrinsic::arm_neon_vst3lane:
  case Intrinsic::arm_neon_vst4lane: {
    unsigned MemAlign =
        getKnownAlignment(II->getArgOperand(0), DL, II, &AC, &DT);
    unsigned AlignArg = II->getNumArgOperands() - 1;
    ConstantInt *IntrAlign = dyn_cast<ConstantInt>(II->getArgOperand(AlignArg));
    if (IntrAlign && IntrAlign->getZExtValue() < MemAlign) {
      II->setArgOperand(AlignArg,
                        ConstantInt::get(Type::getInt32Ty(II->getContext()),
                                         MemAlign, false));
      return II;
    }
    break;
  }

  case Intrinsic::arm_neon_vtbl1:
  case Intrinsic::aarch64_neon_tbl1:
    if (Value *V = simplifyNeonTbl1(*II, Builder))
      return replaceInstUsesWith(*II, V);
    break;

  case Intrinsic::arm_neon_vmulls:
  case Intrinsic::arm_neon_vmullu:
  case Intrinsic::aarch64_neon_smull:
  case Intrinsic::aarch64_neon_umull: {
    Value *Arg0 = II->getArgOperand(0);
    Value *Arg1 = II->getArgOperand(1);

    // Handle mul by zero first:
    if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1)) {
      return replaceInstUsesWith(CI, ConstantAggregateZero::get(II->getType()));
    }

    // Check for constant LHS & RHS - in this case we just simplify.
    bool Zext = (IID == Intrinsic::arm_neon_vmullu ||
                 IID == Intrinsic::aarch64_neon_umull);
    VectorType *NewVT = cast<VectorType>(II->getType());
    if (Constant *CV0 = dyn_cast<Constant>(Arg0)) {
      if (Constant *CV1 = dyn_cast<Constant>(Arg1)) {
        CV0 = ConstantExpr::getIntegerCast(CV0, NewVT, /*isSigned=*/!Zext);
        CV1 = ConstantExpr::getIntegerCast(CV1, NewVT, /*isSigned=*/!Zext);

        return replaceInstUsesWith(CI, ConstantExpr::getMul(CV0, CV1));
      }

      // Couldn't simplify - canonicalize constant to the RHS.
      std::swap(Arg0, Arg1);
    }

    // Handle mul by one:
    if (Constant *CV1 = dyn_cast<Constant>(Arg1))
      if (ConstantInt *Splat =
            dyn_cast_or_null<ConstantInt>(CV1->getSplatValue()))
        if (Splat->isOne())
          return CastInst::CreateIntegerCast(Arg0, II->getType(),
                                             /*isSigned=*/!Zext);

    break;
  }
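
  // AESE/AESD start by XOR-ing the data and key operands, so the fold below
  // rewrites aese(x ^ k, 0) as aese(x, k), letting the instruction's builtin
  // XOR absorb a preceding 'xor' in the IR.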
  case Intrinsic::arm_neon_aesd:
  case Intrinsic::arm_neon_aese:
  case Intrinsic::aarch64_crypto_aesd:
  case Intrinsic::aarch64_crypto_aese: {
    Value *DataArg = II->getArgOperand(0);
    Value *KeyArg = II->getArgOperand(1);

    // Try to use the builtin XOR in AESE and AESD to eliminate a prior XOR
    Value *Data, *Key;
    if (match(KeyArg, m_ZeroInt()) &&
        match(DataArg, m_Xor(m_Value(Data), m_Value(Key)))) {
      II->setArgOperand(0, Data);
      II->setArgOperand(1, Key);
      return II;
    }
    break;
  }
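
  // The MVE predicate conversions below cancel round trips such as
  // i2v(v2i(x)) -> x, and only the low 16 bits of the scalar form are
  // demanded, so the remaining bits can be simplified away.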
  case Intrinsic::arm_mve_pred_i2v: {
    Value *Arg = II->getArgOperand(0);
    Value *ArgArg;
    if (match(Arg, m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(m_Value(ArgArg))) &&
        II->getType() == ArgArg->getType())
      return replaceInstUsesWith(*II, ArgArg);
    Constant *XorMask;
    if (match(Arg,
              m_Xor(m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(m_Value(ArgArg)),
                    m_Constant(XorMask))) &&
        II->getType() == ArgArg->getType()) {
      if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
        if (CI->getValue().trunc(16).isAllOnesValue()) {
          auto TrueVector = Builder.CreateVectorSplat(
              II->getType()->getVectorNumElements(), Builder.getTrue());
          return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
        }
      }
    }
    KnownBits ScalarKnown(32);
    if (SimplifyDemandedBits(II, 0, APInt::getLowBitsSet(32, 16),
                             ScalarKnown, 0))
      return II;
    break;
  }
  case Intrinsic::arm_mve_pred_v2i: {
    Value *Arg = II->getArgOperand(0);
    Value *ArgArg;
    if (match(Arg, m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(m_Value(ArgArg))))
      return replaceInstUsesWith(*II, ArgArg);
    if (!II->getMetadata(LLVMContext::MD_range)) {
      Type *IntTy32 = Type::getInt32Ty(II->getContext());
      Metadata *M[] = {
        ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)),
        ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0xFFFF))
      };
      II->setMetadata(LLVMContext::MD_range, MDNode::get(II->getContext(), M));
      return II;
    }
    break;
  }
  case Intrinsic::arm_mve_vadc:
  case Intrinsic::arm_mve_vadc_predicated: {
    unsigned CarryOp =
        (II->getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
    assert(II->getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
           "Bad type for intrinsic!");

    KnownBits CarryKnown(32);
    if (SimplifyDemandedBits(II, CarryOp, APInt::getOneBitSet(32, 29),
                             CarryKnown))
      return II;
    break;
  }
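
  // amdgcn.rcp of a compile-time constant is folded below to 1/x, but only
  // when APFloat reports the division as exact, so the result does not depend
  // on the runtime rounding mode.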
  case Intrinsic::amdgcn_rcp: {
    Value *Src = II->getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src))
      return replaceInstUsesWith(CI, Src);

    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      const APFloat &ArgVal = C->getValueAPF();
      APFloat Val(ArgVal.getSemantics(), 1);
      APFloat::opStatus Status = Val.divide(ArgVal,
                                            APFloat::rmNearestTiesToEven);
      // Only do this if it was exact and therefore not dependent on the
      // rounding mode.
      if (Status == APFloat::opOK)
        return replaceInstUsesWith(CI, ConstantFP::get(II->getContext(), Val));
    }

    break;
  }
  case Intrinsic::amdgcn_rsq: {
    Value *Src = II->getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src))
      return replaceInstUsesWith(CI, Src);
    break;
  }
  case Intrinsic::amdgcn_frexp_mant:
  case Intrinsic::amdgcn_frexp_exp: {
    Value *Src = II->getArgOperand(0);
    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      int Exp;
      APFloat Significand = frexp(C->getValueAPF(), Exp,
                                  APFloat::rmNearestTiesToEven);

      if (IID == Intrinsic::amdgcn_frexp_mant) {
        return replaceInstUsesWith(CI, ConstantFP::get(II->getContext(),
                                                       Significand));
      }

      // Match instruction special case behavior.
      if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
        Exp = 0;

      return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Exp));
    }

    if (isa<UndefValue>(Src))
      return replaceInstUsesWith(CI, UndefValue::get(II->getType()));

    break;
  }
  case Intrinsic::amdgcn_class: {
    enum {
      S_NAN = 1 << 0,        // Signaling NaN
      Q_NAN = 1 << 1,        // Quiet NaN
      N_INFINITY = 1 << 2,   // Negative infinity
      N_NORMAL = 1 << 3,     // Negative normal
      N_SUBNORMAL = 1 << 4,  // Negative subnormal
      N_ZERO = 1 << 5,       // Negative zero
      P_ZERO = 1 << 6,       // Positive zero
      P_SUBNORMAL = 1 << 7,  // Positive subnormal
      P_NORMAL = 1 << 8,     // Positive normal
      P_INFINITY = 1 << 9    // Positive infinity
    };

    const uint32_t FullMask = S_NAN | Q_NAN | N_INFINITY | N_NORMAL |
      N_SUBNORMAL | N_ZERO | P_ZERO | P_SUBNORMAL | P_NORMAL | P_INFINITY;

    Value *Src0 = II->getArgOperand(0);
    Value *Src1 = II->getArgOperand(1);
    const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
    if (!CMask) {
      if (isa<UndefValue>(Src0))
        return replaceInstUsesWith(*II, UndefValue::get(II->getType()));

      if (isa<UndefValue>(Src1))
        return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), false));
      break;
    }

    uint32_t Mask = CMask->getZExtValue();

    // If all tests are made, it doesn't matter what the value is.
    if ((Mask & FullMask) == FullMask)
      return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), true));

    if ((Mask & FullMask) == 0)
      return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), false));

    if (Mask == (S_NAN | Q_NAN)) {
      // Equivalent of isnan. Replace with standard fcmp.
      Value *FCmp = Builder.CreateFCmpUNO(Src0, Src0);
      FCmp->takeName(II);
      return replaceInstUsesWith(*II, FCmp);
    }

    if (Mask == (N_ZERO | P_ZERO)) {
      // Equivalent of == 0.
      Value *FCmp = Builder.CreateFCmpOEQ(
          Src0, ConstantFP::get(Src0->getType(), 0.0));

      FCmp->takeName(II);
      return replaceInstUsesWith(*II, FCmp);
    }

    // fp_class (nnan x), qnan|snan|other -> fp_class (nnan x), other
    if (((Mask & S_NAN) || (Mask & Q_NAN)) && isKnownNeverNaN(Src0, &TLI)) {
      II->setArgOperand(1, ConstantInt::get(Src1->getType(),
                                            Mask & ~(S_NAN | Q_NAN)));
      return II;
    }

    const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0);
    if (!CVal) {
      if (isa<UndefValue>(Src0))
        return replaceInstUsesWith(*II, UndefValue::get(II->getType()));

      // Clamp mask to used bits
      if ((Mask & FullMask) != Mask) {
        CallInst *NewCall = Builder.CreateCall(II->getCalledFunction(),
          { Src0, ConstantInt::get(Src1->getType(), Mask & FullMask) }
        );

        NewCall->takeName(II);
        return replaceInstUsesWith(*II, NewCall);
      }

      break;
    }

    const APFloat &Val = CVal->getValueAPF();

    bool Result =
      ((Mask & S_NAN) && Val.isNaN() && Val.isSignaling()) ||
      ((Mask & Q_NAN) && Val.isNaN() && !Val.isSignaling()) ||
      ((Mask & N_INFINITY) && Val.isInfinity() && Val.isNegative()) ||
      ((Mask & N_NORMAL) && Val.isNormal() && Val.isNegative()) ||
      ((Mask & N_SUBNORMAL) && Val.isDenormal() && Val.isNegative()) ||
      ((Mask & N_ZERO) && Val.isZero() && Val.isNegative()) ||
      ((Mask & P_ZERO) && Val.isZero() && !Val.isNegative()) ||
      ((Mask & P_SUBNORMAL) && Val.isDenormal() && !Val.isNegative()) ||
      ((Mask & P_NORMAL) && Val.isNormal() && !Val.isNegative()) ||
      ((Mask & P_INFINITY) && Val.isInfinity() && !Val.isNegative());

    return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), Result));
  }
  case Intrinsic::amdgcn_cvt_pkrtz: {
    Value *Src0 = II->getArgOperand(0);
    Value *Src1 = II->getArgOperand(1);
    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        const fltSemantics &HalfSem
          = II->getType()->getScalarType()->getFltSemantics();
        bool LosesInfo;
        APFloat Val0 = C0->getValueAPF();
        APFloat Val1 = C1->getValueAPF();
        Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
        Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);

        Constant *Folded = ConstantVector::get({
            ConstantFP::get(II->getContext(), Val0),
            ConstantFP::get(II->getContext(), Val1) });
        return replaceInstUsesWith(*II, Folded);
      }
    }

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1))
      return replaceInstUsesWith(*II, UndefValue::get(II->getType()));

    break;
  }
  case Intrinsic::amdgcn_cvt_pknorm_i16:
  case Intrinsic::amdgcn_cvt_pknorm_u16:
  case Intrinsic::amdgcn_cvt_pk_i16:
  case Intrinsic::amdgcn_cvt_pk_u16: {
    Value *Src0 = II->getArgOperand(0);
    Value *Src1 = II->getArgOperand(1);

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1))
      return replaceInstUsesWith(*II, UndefValue::get(II->getType()));

    break;
  }
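
  // With constant offset/width, the bitfield extracts below decompose into a
  // shift pair, e.g. (illustrative for a 32-bit ubfe with offset 4, width 8):
  //   shl i32 %src, 20  followed by  lshr i32 %tmp, 24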
  case Intrinsic::amdgcn_ubfe:
  case Intrinsic::amdgcn_sbfe: {
    // Decompose simple cases into standard shifts.
    Value *Src = II->getArgOperand(0);
    if (isa<UndefValue>(Src))
      return replaceInstUsesWith(*II, Src);

    unsigned Width;
    Type *Ty = II->getType();
    unsigned IntSize = Ty->getIntegerBitWidth();

    ConstantInt *CWidth = dyn_cast<ConstantInt>(II->getArgOperand(2));
    if (CWidth) {
      Width = CWidth->getZExtValue();
      if ((Width & (IntSize - 1)) == 0)
        return replaceInstUsesWith(*II, ConstantInt::getNullValue(Ty));

      if (Width >= IntSize) {
        // Hardware ignores high bits, so remove those.
        II->setArgOperand(2, ConstantInt::get(CWidth->getType(),
                                              Width & (IntSize - 1)));
        return II;
      }
    }

    unsigned Offset;
    ConstantInt *COffset = dyn_cast<ConstantInt>(II->getArgOperand(1));
    if (COffset) {
      Offset = COffset->getZExtValue();
      if (Offset >= IntSize) {
        II->setArgOperand(1, ConstantInt::get(COffset->getType(),
                                              Offset & (IntSize - 1)));
        return II;
      }
    }

    bool Signed = IID == Intrinsic::amdgcn_sbfe;

    if (!CWidth || !COffset)
      break;

    // The case of Width == 0 is handled above, which makes this transformation
    // safe. If Width == 0, then the ashr and lshr instructions would become
    // poison, since the shift amount would be equal to the bit size.
    assert(Width != 0);

    // TODO: This allows folding to undef when the hardware has specific
    // behavior?
    if (Offset + Width < IntSize) {
      Value *Shl = Builder.CreateShl(Src, IntSize - Offset - Width);
      Value *RightShift = Signed ? Builder.CreateAShr(Shl, IntSize - Width)
                                 : Builder.CreateLShr(Shl, IntSize - Width);
      RightShift->takeName(II);
      return replaceInstUsesWith(*II, RightShift);
    }

    Value *RightShift = Signed ? Builder.CreateAShr(Src, Offset)
                               : Builder.CreateLShr(Src, Offset);

    RightShift->takeName(II);
    return replaceInstUsesWith(*II, RightShift);
  }
  case Intrinsic::amdgcn_exp:
  case Intrinsic::amdgcn_exp_compr: {
    ConstantInt *En = cast<ConstantInt>(II->getArgOperand(1));
    unsigned EnBits = En->getZExtValue();
    if (EnBits == 0xf)
      break; // All inputs enabled.

    bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
    bool Changed = false;
    for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
      if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
          (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
        Value *Src = II->getArgOperand(I + 2);
        if (!isa<UndefValue>(Src)) {
          II->setArgOperand(I + 2, UndefValue::get(Src->getType()));
          Changed = true;
        }
      }
    }

    if (Changed)
      return II;

    break;
  }
  case Intrinsic::amdgcn_fmed3: {
    // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
    // for the shader.

    Value *Src0 = II->getArgOperand(0);
    Value *Src1 = II->getArgOperand(1);
    Value *Src2 = II->getArgOperand(2);

    // Checking for NaN before canonicalization provides better fidelity when
    // mapping other operations onto fmed3 since the order of operands is
    // unchanged.
    CallInst *NewCall = nullptr;
    if (match(Src0, m_NaN()) || isa<UndefValue>(Src0)) {
      NewCall = Builder.CreateMinNum(Src1, Src2);
    } else if (match(Src1, m_NaN()) || isa<UndefValue>(Src1)) {
      NewCall = Builder.CreateMinNum(Src0, Src2);
    } else if (match(Src2, m_NaN()) || isa<UndefValue>(Src2)) {
      NewCall = Builder.CreateMaxNum(Src0, Src1);
    }

    if (NewCall) {
      NewCall->copyFastMathFlags(II);
      NewCall->takeName(II);
      return replaceInstUsesWith(*II, NewCall);
    }

    bool Swap = false;
    // Canonicalize constants to RHS operands.
    //
    // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
      std::swap(Src1, Src2);
      Swap = true;
    }

    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (Swap) {
      II->setArgOperand(0, Src0);
      II->setArgOperand(1, Src1);
      II->setArgOperand(2, Src2);
      return II;
    }

    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
          APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
                                       C2->getValueAPF());
          return replaceInstUsesWith(*II,
            ConstantFP::get(Builder.getContext(), Result));
        }
      }
    }

    break;
  }
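
  // For amdgcn.icmp/fcmp below: a comparison of two constants that is always
  // true is equivalent to reading the EXEC mask of live lanes, which is why
  // the fold emits llvm.read_register with the "exec" metadata argument.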
  case Intrinsic::amdgcn_icmp:
  case Intrinsic::amdgcn_fcmp: {
    const ConstantInt *CC = cast<ConstantInt>(II->getArgOperand(2));
    // Guard against invalid arguments.
    int64_t CCVal = CC->getZExtValue();
    bool IsInteger = IID == Intrinsic::amdgcn_icmp;
    if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
                       CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
        (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
                        CCVal > CmpInst::LAST_FCMP_PREDICATE)))
      break;

    Value *Src0 = II->getArgOperand(0);
    Value *Src1 = II->getArgOperand(1);

    if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
      if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
        Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1);
        if (CCmp->isNullValue()) {
          return replaceInstUsesWith(
              *II, ConstantExpr::getSExt(CCmp, II->getType()));
        }

        // The result of V_ICMP/V_FCMP assembly instructions (which this
        // intrinsic exposes) is one bit per thread, masked with the EXEC
        // register (which contains the bitmask of live threads). So a
        // comparison that always returns true is the same as a read of the
        // EXEC register.
        Function *NewF = Intrinsic::getDeclaration(
            II->getModule(), Intrinsic::read_register, II->getType());
        Metadata *MDArgs[] = {MDString::get(II->getContext(), "exec")};
        MDNode *MD = MDNode::get(II->getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II->getContext(), MD)};
        CallInst *NewCall = Builder.CreateCall(NewF, Args);
        NewCall->addAttribute(AttributeList::FunctionIndex,
                              Attribute::Convergent);
        NewCall->takeName(II);
        return replaceInstUsesWith(*II, NewCall);
      }

      // Canonicalize constants to RHS.
      CmpInst::Predicate SwapPred
        = CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
      II->setArgOperand(0, Src1);
      II->setArgOperand(1, Src0);
      II->setArgOperand(2, ConstantInt::get(CC->getType(),
                                            static_cast<int>(SwapPred)));
      return II;
    }

    if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
      break;

    // Canonicalize compare eq with true value to compare != 0
    // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
    //   -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
    // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
    //   -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
    Value *ExtSrc;
    if (CCVal == CmpInst::ICMP_EQ &&
        ((match(Src1, m_One()) && match(Src0, m_ZExt(m_Value(ExtSrc)))) ||
         (match(Src1, m_AllOnes()) && match(Src0, m_SExt(m_Value(ExtSrc))))) &&
        ExtSrc->getType()->isIntegerTy(1)) {
      II->setArgOperand(1, ConstantInt::getNullValue(Src1->getType()));
      II->setArgOperand(2, ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
      return II;
    }

    CmpInst::Predicate SrcPred;
    Value *SrcLHS;
    Value *SrcRHS;

    // Fold compare eq/ne with 0 from a compare result as the predicate to the
    // intrinsic. The typical use is a wave vote function in the library, which
    // will be fed from a user code condition compared with 0. Fold in the
    // redundant compare.

    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
    //   -> llvm.amdgcn.[if]cmp(a, b, pred)
    //
    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
    //   -> llvm.amdgcn.[if]cmp(a, b, inv pred)
    if (match(Src1, m_Zero()) &&
        match(Src0,
              m_ZExtOrSExt(m_Cmp(SrcPred, m_Value(SrcLHS), m_Value(SrcRHS))))) {
|
|
|
|
if (CCVal == CmpInst::ICMP_EQ)
|
|
|
|
SrcPred = CmpInst::getInversePredicate(SrcPred);
|
|
|
|
|
|
|
|
Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred) ?
|
|
|
|
Intrinsic::amdgcn_fcmp : Intrinsic::amdgcn_icmp;
|
|
|
|
|
2018-08-16 05:14:25 +08:00
|
|
|
Type *Ty = SrcLHS->getType();
|
|
|
|
if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
|
|
|
|
// Promote to next legal integer type.
|
|
|
|
unsigned Width = CmpType->getBitWidth();
|
|
|
|
unsigned NewWidth = Width;
|
2019-01-15 10:13:18 +08:00
|
|
|
|
|
|
|
// Don't do anything for i1 comparisons.
|
|
|
|
if (Width == 1)
|
|
|
|
break;
|
|
|
|
|
2018-08-16 05:14:25 +08:00
|
|
|
if (Width <= 16)
|
|
|
|
NewWidth = 16;
|
|
|
|
else if (Width <= 32)
|
|
|
|
NewWidth = 32;
|
|
|
|
else if (Width <= 64)
|
|
|
|
NewWidth = 64;
|
|
|
|
else if (Width > 64)
|
|
|
|
break; // Can't handle this.
|
|
|
|
|
|
|
|
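// Widen the operands with the signedness of the source predicate so the
// promoted comparison yields the same result.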
if (Width != NewWidth) {
|
|
|
|
IntegerType *CmpTy = Builder.getIntNTy(NewWidth);
|
|
|
|
if (CmpInst::isSigned(SrcPred)) {
|
|
|
|
SrcLHS = Builder.CreateSExt(SrcLHS, CmpTy);
|
|
|
|
SrcRHS = Builder.CreateSExt(SrcRHS, CmpTy);
|
|
|
|
} else {
|
|
|
|
SrcLHS = Builder.CreateZExt(SrcLHS, CmpTy);
|
|
|
|
SrcRHS = Builder.CreateZExt(SrcRHS, CmpTy);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
|
|
|
|
break;
|
|
|
|
|
2019-02-02 04:43:25 +08:00
|
|
|
Function *NewF =
|
2019-06-14 07:47:36 +08:00
|
|
|
Intrinsic::getDeclaration(II->getModule(), NewIID,
|
|
|
|
{ II->getType(),
|
|
|
|
SrcLHS->getType() });
|
2017-03-14 02:14:02 +08:00
|
|
|
Value *Args[] = { SrcLHS, SrcRHS,
|
|
|
|
ConstantInt::get(CC->getType(), SrcPred) };
|
2017-07-08 07:16:26 +08:00
|
|
|
CallInst *NewCall = Builder.CreateCall(NewF, Args);
|
2017-03-14 02:14:02 +08:00
|
|
|
NewCall->takeName(II);
|
|
|
|
return replaceInstUsesWith(*II, NewCall);
|
|
|
|
}
|
|
|
|
|
|
|
|
break;
|
|
|
|
}
|
2017-10-24 18:26:59 +08:00
|
|
|
case Intrinsic::amdgcn_wqm_vote: {
|
|
|
|
// wqm_vote is identity when the argument is constant.
|
|
|
|
if (!isa<Constant>(II->getArgOperand(0)))
|
|
|
|
break;
|
|
|
|
|
|
|
|
return replaceInstUsesWith(*II, II->getArgOperand(0));
|
|
|
|
}
|
2017-10-24 18:27:13 +08:00
|
|
|
case Intrinsic::amdgcn_kill: {
|
|
|
|
const ConstantInt *C = dyn_cast<ConstantInt>(II->getArgOperand(0));
|
|
|
|
if (!C || !C->getZExtValue())
|
|
|
|
break;
|
|
|
|
|
|
|
|
// amdgcn.kill(i1 1) is a no-op
|
|
|
|
return eraseInstFromFunction(CI);
|
|
|
|
}
|
2018-05-22 16:04:33 +08:00
|
|
|
case Intrinsic::amdgcn_update_dpp: {
|
|
|
|
Value *Old = II->getArgOperand(0);
|
|
|
|
|
2019-03-13 05:02:54 +08:00
|
|
|
auto BC = cast<ConstantInt>(II->getArgOperand(5));
|
|
|
|
auto RM = cast<ConstantInt>(II->getArgOperand(3));
|
|
|
|
auto BM = cast<ConstantInt>(II->getArgOperand(4));
|
|
|
|
if (BC->isZeroValue() ||
|
2018-05-22 16:04:33 +08:00
|
|
|
RM->getZExtValue() != 0xF ||
|
|
|
|
BM->getZExtValue() != 0xF ||
|
|
|
|
isa<UndefValue>(Old))
|
|
|
|
break;
|
|
|
|
|
|
|
|
// If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value.
|
|
|
|
II->setOperand(0, UndefValue::get(Old->getType()));
|
|
|
|
return II;
|
|
|
|
}
|
2020-01-16 22:59:56 +08:00
|
|
|
case Intrinsic::amdgcn_permlane16:
|
|
|
|
case Intrinsic::amdgcn_permlanex16: {
|
|
|
|
// Discard vdst_in if it's not going to be read.
|
|
|
|
Value *VDstIn = II->getArgOperand(0);
|
|
|
|
if (isa<UndefValue>(VDstIn))
|
|
|
|
break;
|
|
|
|
|
|
|
|
ConstantInt *FetchInvalid = cast<ConstantInt>(II->getArgOperand(4));
|
|
|
|
ConstantInt *BoundCtrl = cast<ConstantInt>(II->getArgOperand(5));
|
|
|
|
if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
|
|
|
|
break;
|
|
|
|
|
|
|
|
II->setArgOperand(0, UndefValue::get(VDstIn->getType()));
|
|
|
|
return II;
|
|
|
|
}
|
2019-06-14 22:51:26 +08:00
|
|
|
case Intrinsic::amdgcn_readfirstlane:
|
|
|
|
case Intrinsic::amdgcn_readlane: {
|
|
|
|
// A constant value is trivially uniform.
|
|
|
|
if (Constant *C = dyn_cast<Constant>(II->getArgOperand(0)))
|
|
|
|
return replaceInstUsesWith(*II, C);
|
2019-06-18 01:52:35 +08:00
|
|
|
|
|
|
|
// The rest of these may not be safe if the exec may not be the same between
|
|
|
|
// the def and use.
|
|
|
|
Value *Src = II->getArgOperand(0);
|
|
|
|
Instruction *SrcInst = dyn_cast<Instruction>(Src);
|
|
|
|
if (SrcInst && SrcInst->getParent() != II->getParent())
|
|
|
|
break;
|
|
|
|
|
|
|
|
// readfirstlane (readfirstlane x) -> readfirstlane x
|
|
|
|
// readlane (readfirstlane x), y -> readfirstlane x
|
|
|
|
if (match(Src, m_Intrinsic<Intrinsic::amdgcn_readfirstlane>()))
|
|
|
|
return replaceInstUsesWith(*II, Src);
|
|
|
|
|
|
|
|
if (IID == Intrinsic::amdgcn_readfirstlane) {
|
|
|
|
// readfirstlane (readlane x, y) -> readlane x, y
|
|
|
|
if (match(Src, m_Intrinsic<Intrinsic::amdgcn_readlane>()))
|
|
|
|
return replaceInstUsesWith(*II, Src);
|
|
|
|
} else {
|
|
|
|
// readlane (readlane x, y), y -> readlane x, y
|
|
|
|
if (match(Src, m_Intrinsic<Intrinsic::amdgcn_readlane>(
|
|
|
|
m_Value(), m_Specific(II->getArgOperand(1)))))
|
|
|
|
return replaceInstUsesWith(*II, Src);
|
|
|
|
}
|
|
|
|
|
2019-06-14 22:51:26 +08:00
|
|
|
break;
|
|
|
|
}
|
2010-01-05 15:32:13 +08:00
|
|
|
case Intrinsic::stackrestore: {
|
|
|
|
// If the save is right next to the restore, remove the restore. This can
|
|
|
|
// happen when variable allocas are DCE'd.
|
2010-06-24 20:58:35 +08:00
|
|
|
if (IntrinsicInst *SS = dyn_cast<IntrinsicInst>(II->getArgOperand(0))) {
|
2010-01-05 15:32:13 +08:00
|
|
|
if (SS->getIntrinsicID() == Intrinsic::stacksave) {
|
2018-06-20 07:42:17 +08:00
|
|
|
// Skip over debug info.
|
|
|
|
if (SS->getNextNonDebugInstruction() == II) {
|
2016-02-02 06:23:39 +08:00
|
|
|
return eraseInstFromFunction(CI);
|
2018-06-09 04:42:36 +08:00
|
|
|
}
|
2010-01-05 15:32:13 +08:00
|
|
|
}
|
|
|
|
}
|
2012-02-03 08:07:04 +08:00
|
|
|
|
2010-01-05 15:32:13 +08:00
|
|
|
// Scan down this block to see if there is another stack restore in the
|
|
|
|
// same block without an intervening call/alloca.
|
2015-10-14 00:59:33 +08:00
|
|
|
BasicBlock::iterator BI(II);
|
2018-10-15 18:04:59 +08:00
|
|
|
Instruction *TI = II->getParent()->getTerminator();
|
2010-01-05 15:32:13 +08:00
|
|
|
bool CannotRemove = false;
|
|
|
|
for (++BI; &*BI != TI; ++BI) {
|
2012-06-21 23:45:28 +08:00
|
|
|
if (isa<AllocaInst>(BI)) {
|
2010-01-05 15:32:13 +08:00
|
|
|
CannotRemove = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (CallInst *BCI = dyn_cast<CallInst>(BI)) {
|
2019-05-06 23:35:02 +08:00
|
|
|
if (auto *II2 = dyn_cast<IntrinsicInst>(BCI)) {
|
2010-01-05 15:32:13 +08:00
|
|
|
// If there is a stackrestore below this one, remove this one.
|
2019-05-06 23:35:02 +08:00
|
|
|
if (II2->getIntrinsicID() == Intrinsic::stackrestore)
|
2016-02-02 06:23:39 +08:00
|
|
|
return eraseInstFromFunction(CI);
|
2016-02-27 08:53:54 +08:00
|
|
|
|
|
|
|
// Bail if we cross over an intrinsic with side effects, such as
|
2019-12-27 15:32:53 +08:00
|
|
|
// llvm.stacksave, or llvm.read_register.
|
2019-05-06 23:35:02 +08:00
|
|
|
if (II2->mayHaveSideEffects()) {
|
2016-02-27 08:53:54 +08:00
|
|
|
CannotRemove = true;
|
|
|
|
break;
|
|
|
|
}
|
2010-01-05 15:32:13 +08:00
|
|
|
} else {
|
|
|
|
// If we found a non-intrinsic call, we can't remove the stack
|
|
|
|
// restore.
|
|
|
|
CannotRemove = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2012-02-03 08:07:04 +08:00
|
|
|
|
2011-07-31 14:30:59 +08:00
|
|
|
// If the stack restore is in a return, resume, or unwind block and if there
|
|
|
|
// are no allocas or calls between the restore and the return, nuke the
|
|
|
|
// restore.
|
2012-02-07 05:16:41 +08:00
|
|
|
if (!CannotRemove && (isa<ReturnInst>(TI) || isa<ResumeInst>(TI)))
|
2016-02-02 06:23:39 +08:00
|
|
|
return eraseInstFromFunction(CI);
|
2010-01-05 15:32:13 +08:00
|
|
|
break;
|
|
|
|
}
|
2016-07-29 06:50:48 +08:00
|
|
|
case Intrinsic::lifetime_start:
|
2016-07-29 06:59:03 +08:00
|
|
|
// Asan needs to poison memory to detect invalid access which is possible
|
|
|
|
// even for empty lifetime range.
|
2017-12-09 08:21:41 +08:00
|
|
|
if (II->getFunction()->hasFnAttribute(Attribute::SanitizeAddress) ||
|
2019-08-27 06:15:50 +08:00
|
|
|
II->getFunction()->hasFnAttribute(Attribute::SanitizeMemory) ||
|
2017-12-09 08:21:41 +08:00
|
|
|
II->getFunction()->hasFnAttribute(Attribute::SanitizeHWAddress))
|
2016-07-29 06:59:03 +08:00
|
|
|
break;
|
|
|
|
|
2016-05-10 17:24:49 +08:00
|
|
|
if (removeTriviallyEmptyRange(*II, Intrinsic::lifetime_start,
|
|
|
|
Intrinsic::lifetime_end, *this))
|
|
|
|
return nullptr;
|
2015-10-01 22:54:31 +08:00
|
|
|
break;
|
2014-07-26 05:45:17 +08:00
|
|
|
case Intrinsic::assume: {
|
2016-04-09 00:37:12 +08:00
|
|
|
Value *IIOperand = II->getArgOperand(0);
|
2018-06-20 21:22:26 +08:00
|
|
|
// Remove an assume if it is followed by an identical assume.
|
|
|
|
// TODO: Do we need this? Unless there are conflicting assumptions, the
|
|
|
|
// computeKnownBits(IIOperand) below here eliminates redundant assumes.
|
|
|
|
Instruction *Next = II->getNextNonDebugInstruction();
|
|
|
|
if (match(Next, m_Intrinsic<Intrinsic::assume>(m_Specific(IIOperand))))
|
2016-04-09 00:37:12 +08:00
|
|
|
return eraseInstFromFunction(CI);
|
|
|
|
|
2014-07-26 05:45:17 +08:00
|
|
|
// Canonicalize assume(a && b) -> assume(a); assume(b);
|
2014-09-07 20:44:26 +08:00
|
|
|
// Note: New assumption intrinsics created here are registered by
|
|
|
|
// the InstCombineIRInserter object.
|
2019-02-02 04:43:25 +08:00
|
|
|
FunctionType *AssumeIntrinsicTy = II->getFunctionType();
|
|
|
|
Value *AssumeIntrinsic = II->getCalledValue();
|
|
|
|
Value *A, *B;
|
2014-07-26 05:45:17 +08:00
|
|
|
if (match(IIOperand, m_And(m_Value(A), m_Value(B)))) {
|
2019-02-02 04:43:25 +08:00
|
|
|
Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic, A, II->getName());
|
|
|
|
Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic, B, II->getName());
|
2016-02-02 06:23:39 +08:00
|
|
|
return eraseInstFromFunction(*II);
|
2014-07-26 05:45:17 +08:00
|
|
|
}
|
|
|
|
// assume(!(a || b)) -> assume(!a); assume(!b);
|
|
|
|
if (match(IIOperand, m_Not(m_Or(m_Value(A), m_Value(B))))) {
|
2019-02-02 04:43:25 +08:00
|
|
|
Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic,
|
|
|
|
Builder.CreateNot(A), II->getName());
|
|
|
|
Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic,
|
|
|
|
Builder.CreateNot(B), II->getName());
|
2016-02-02 06:23:39 +08:00
|
|
|
return eraseInstFromFunction(*II);
|
2014-07-26 05:45:17 +08:00
|
|
|
}
|
2014-10-05 05:27:06 +08:00
|
|
|
|
2014-11-12 07:33:19 +08:00
|
|
|
// assume( (load addr) != null ) -> add 'nonnull' metadata to load
|
|
|
|
// (if assume is valid at the load)
|
2017-01-04 06:25:31 +08:00
|
|
|
CmpInst::Predicate Pred;
|
|
|
|
Instruction *LHS;
|
|
|
|
if (match(IIOperand, m_ICmp(Pred, m_Instruction(LHS), m_Zero())) &&
|
|
|
|
Pred == ICmpInst::ICMP_NE && LHS->getOpcode() == Instruction::Load &&
|
|
|
|
LHS->getType()->isPointerTy() &&
|
|
|
|
isValidAssumeForContext(II, LHS, &DT)) {
|
|
|
|
MDNode *MD = MDNode::get(II->getContext(), None);
|
|
|
|
LHS->setMetadata(LLVMContext::MD_nonnull, MD);
|
|
|
|
return eraseInstFromFunction(*II);
|
|
|
|
|
2015-02-10 16:07:32 +08:00
|
|
|
// TODO: apply nonnull return attributes to calls and invokes
|
2014-11-12 07:33:19 +08:00
|
|
|
// TODO: apply range metadata for range check patterns?
|
|
|
|
}
|
2017-01-04 06:25:31 +08:00
|
|
|
|
2014-10-05 05:27:06 +08:00
|
|
|
// If there is a dominating assume with the same condition as this one,
|
|
|
|
// then this one is redundant, and should be removed.
|
2017-04-27 00:39:58 +08:00
|
|
|
KnownBits Known(1);
|
|
|
|
computeKnownBits(IIOperand, Known, 0, II);
|
2017-05-06 01:36:09 +08:00
|
|
|
if (Known.isAllOnes())
|
2016-02-02 06:23:39 +08:00
|
|
|
return eraseInstFromFunction(*II);
|
2014-10-05 05:27:06 +08:00
|
|
|
|
2017-01-11 21:24:24 +08:00
|
|
|
// Update the cache of affected values for this assumption (we might be
|
|
|
|
// here because we just simplified the condition).
|
|
|
|
AC.updateAffectedValues(II);
|
2014-07-26 05:45:17 +08:00
|
|
|
break;
|
|
|
|
}
|
2014-12-30 07:27:30 +08:00
|
|
|
case Intrinsic::experimental_gc_relocate: {
|
2019-09-25 01:24:16 +08:00
|
|
|
auto &GCR = *cast<GCRelocateInst>(II);
|
|
|
|
|
|
|
|
// If we have two copies of the same pointer in the statepoint argument
|
|
|
|
// list, canonicalize to one. This may let us common gc.relocates.
|
|
|
|
if (GCR.getBasePtr() == GCR.getDerivedPtr() &&
|
|
|
|
GCR.getBasePtrIndex() != GCR.getDerivedPtrIndex()) {
|
|
|
|
auto *OpIntTy = GCR.getOperand(2)->getType();
|
|
|
|
II->setOperand(2, ConstantInt::get(OpIntTy, GCR.getBasePtrIndex()));
|
|
|
|
return II;
|
|
|
|
}
|
|
|
|
|
2014-12-30 07:27:30 +08:00
|
|
|
// Translate facts known about a pointer before relocating into
|
|
|
|
// facts about the relocate value, while being careful to
|
|
|
|
// preserve relocation semantics.
|
2019-09-25 01:24:16 +08:00
|
|
|
Value *DerivedPtr = GCR.getDerivedPtr();
|
2014-12-30 07:27:30 +08:00
|
|
|
|
|
|
|
// Remove the relocation if unused, note that this check is required
|
|
|
|
// to prevent the cases below from looping forever.
|
|
|
|
if (II->use_empty())
|
2016-02-02 06:23:39 +08:00
|
|
|
return eraseInstFromFunction(*II);
|
2014-12-30 07:27:30 +08:00
|
|
|
|
|
|
|
// Undef is undef, even after relocation.
|
|
|
|
// TODO: provide a hook for this in GCStrategy. This is clearly legal for
|
|
|
|
// most practical collectors, but there was discussion in the review thread
|
|
|
|
// about whether it was legal for all possible collectors.
|
2016-02-10 05:09:22 +08:00
|
|
|
if (isa<UndefValue>(DerivedPtr))
|
|
|
|
// Use undef of gc_relocate's type to replace it.
|
|
|
|
return replaceInstUsesWith(*II, UndefValue::get(II->getType()));
|
|
|
|
|
|
|
|
if (auto *PT = dyn_cast<PointerType>(II->getType())) {
|
|
|
|
// The relocation of null will be null for most any collector.
|
|
|
|
// TODO: provide a hook for this in GCStrategy. There might be some
|
|
|
|
// weird collector this property does not hold for.
|
|
|
|
if (isa<ConstantPointerNull>(DerivedPtr))
|
|
|
|
// Use null-pointer of gc_relocate's type to replace it.
|
|
|
|
return replaceInstUsesWith(*II, ConstantPointerNull::get(PT));
|
2016-04-25 01:00:34 +08:00
|
|
|
|
2016-02-10 05:09:22 +08:00
|
|
|
// isKnownNonNull -> nonnull attribute
|
2018-11-13 04:00:53 +08:00
|
|
|
if (!II->hasRetAttr(Attribute::NonNull) &&
|
|
|
|
isKnownNonZero(DerivedPtr, DL, 0, &AC, II, &DT)) {
|
2017-03-22 00:57:19 +08:00
|
|
|
II->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
|
2018-11-13 04:00:53 +08:00
|
|
|
return II;
|
|
|
|
}
|
2015-02-15 03:37:54 +08:00
|
|
|
}
|
2014-12-30 07:27:30 +08:00
|
|
|
|
|
|
|
// TODO: bitcast(relocate(p)) -> relocate(bitcast(p))
|
|
|
|
// Canonicalize on the type from the uses to the defs
|
2015-02-15 03:37:54 +08:00
|
|
|
|
2014-12-30 07:27:30 +08:00
|
|
|
// TODO: relocate((gep p, C, C2, ...)) -> gep(relocate(p), C, C2, ...)
|
2016-02-10 05:09:22 +08:00
|
|
|
break;
|
2014-12-30 07:27:30 +08:00
|
|
|
}
|
2017-01-25 22:12:12 +08:00
|
|
|
|
|
|
|
case Intrinsic::experimental_guard: {
|
2018-05-10 06:56:32 +08:00
|
|
|
// Is this guard followed by another guard? We scan forward over a small
|
|
|
|
// fixed window of instructions to handle common cases with conditions
|
|
|
|
// computed between guards.
|
2019-11-23 05:02:18 +08:00
|
|
|
Instruction *NextInst = II->getNextNonDebugInstruction();
|
2018-05-10 08:05:29 +08:00
|
|
|
for (unsigned i = 0; i < GuardWideningWindow; i++) {
|
2018-05-10 06:56:32 +08:00
|
|
|
// Note: Using context-free form to avoid compile time blow up
|
|
|
|
if (!isSafeToSpeculativelyExecute(NextInst))
|
|
|
|
break;
|
2019-11-23 05:02:18 +08:00
|
|
|
NextInst = NextInst->getNextNonDebugInstruction();
|
2018-05-10 06:56:32 +08:00
|
|
|
}
|
2017-02-02 00:34:55 +08:00
|
|
|
Value *NextCond = nullptr;
|
|
|
|
if (match(NextInst,
|
|
|
|
m_Intrinsic<Intrinsic::experimental_guard>(m_Value(NextCond)))) {
|
|
|
|
Value *CurrCond = II->getArgOperand(0);
|
2017-01-25 22:12:12 +08:00
|
|
|
|
2017-03-30 20:59:53 +08:00
|
|
|
// Remove a guard if it is immediately preceded by an identical guard.
|
2017-02-02 00:34:55 +08:00
|
|
|
// Otherwise canonicalize guard(a); guard(b) -> guard(a & b).
|
2020-01-11 22:10:50 +08:00
|
|
|
if (CurrCond != NextCond) {
|
|
|
|
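// Hoist the instructions between the two guards above the first guard so
// that the operands of the merged condition dominate it.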
Instruction *MoveI = II->getNextNonDebugInstruction();
|
|
|
|
while (MoveI != NextInst) {
|
|
|
|
auto *Temp = MoveI;
|
|
|
|
MoveI = MoveI->getNextNonDebugInstruction();
|
|
|
|
Temp->moveBefore(II);
|
|
|
|
}
|
|
|
|
II->setArgOperand(0, Builder.CreateAnd(CurrCond, NextCond));
|
2018-05-10 06:56:32 +08:00
|
|
|
}
|
2020-01-11 22:10:50 +08:00
|
|
|
eraseInstFromFunction(*NextInst);
|
|
|
|
return II;
|
2017-02-02 00:34:55 +08:00
|
|
|
}
|
2017-01-25 22:12:12 +08:00
|
|
|
break;
|
|
|
|
}
|
2010-01-05 15:32:13 +08:00
|
|
|
}
|
2019-02-01 01:23:29 +08:00
|
|
|
return visitCallBase(*II);
|
2010-01-05 15:32:13 +08:00
|
|
|
}
|
|
|
|
|
2017-02-01 02:09:05 +08:00
|
|
|
// Fence instruction simplification
|
|
|
|
Instruction *InstCombiner::visitFenceInst(FenceInst &FI) {
|
|
|
|
// Remove identical consecutive fences.
|
2018-06-20 07:42:17 +08:00
|
|
|
Instruction *Next = FI.getNextNonDebugInstruction();
|
2018-06-06 20:46:02 +08:00
|
|
|
if (auto *NFI = dyn_cast<FenceInst>(Next))
|
2017-02-01 02:09:05 +08:00
|
|
|
if (FI.isIdenticalTo(NFI))
|
|
|
|
return eraseInstFromFunction(FI);
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
|
2010-01-05 15:32:13 +08:00
|
|
|
// InvokeInst simplification
|
|
|
|
Instruction *InstCombiner::visitInvokeInst(InvokeInst &II) {
|
2019-02-01 01:23:29 +08:00
|
|
|
return visitCallBase(II);
|
2010-01-05 15:32:13 +08:00
|
|
|
}
|
|
|
|
|
2019-02-09 04:48:56 +08:00
|
|
|
// CallBrInst simplification
|
|
|
|
Instruction *InstCombiner::visitCallBrInst(CallBrInst &CBI) {
|
|
|
|
return visitCallBase(CBI);
|
|
|
|
}
|
|
|
|
|
2016-01-21 06:24:38 +08:00
|
|
|
/// If this cast does not affect the value passed through the varargs area, we
|
|
|
|
/// can eliminate the use of the cast.
|
2019-02-01 01:23:29 +08:00
|
|
|
static bool isSafeToEliminateVarargsCast(const CallBase &Call,
|
2015-03-10 10:37:25 +08:00
|
|
|
const DataLayout &DL,
|
|
|
|
const CastInst *const CI,
|
2010-01-05 15:32:13 +08:00
|
|
|
const int ix) {
|
|
|
|
if (!CI->isLosslessCast())
|
|
|
|
return false;
|
|
|
|
|
2014-12-03 02:50:36 +08:00
|
|
|
// If this is a GC intrinsic, avoid munging types. We need types for
|
|
|
|
// statepoint reconstruction in SelectionDAG.
|
|
|
|
// TODO: This is probably something which should be expanded to all
|
|
|
|
// intrinsics since the entire point of intrinsics is that
|
|
|
|
// they are understandable by the optimizer.
|
2019-02-01 01:23:29 +08:00
|
|
|
if (isStatepoint(&Call) || isGCRelocate(&Call) || isGCResult(&Call))
|
2014-12-03 02:50:36 +08:00
|
|
|
return false;
|
|
|
|
|
2014-01-28 10:38:36 +08:00
|
|
|
// The size of ByVal or InAlloca arguments is derived from the type, so we
|
2010-01-05 15:32:13 +08:00
|
|
|
// can't change to a type with a different size. If the size were
|
|
|
|
// passed explicitly we could avoid this check.
|
2019-02-01 01:23:29 +08:00
|
|
|
if (!Call.isByValOrInAllocaArgument(ix))
|
2010-01-05 15:32:13 +08:00
|
|
|
return true;
|
|
|
|
|
2012-02-03 08:07:04 +08:00
|
|
|
Type* SrcTy =
|
2010-01-05 15:32:13 +08:00
|
|
|
cast<PointerType>(CI->getOperand(0)->getType())->getElementType();
|
2019-06-06 04:38:17 +08:00
|
|
|
Type *DstTy = Call.isByValArgument(ix)
|
|
|
|
? Call.getParamByValType(ix)
|
|
|
|
: cast<PointerType>(CI->getType())->getElementType();
|
2010-01-05 15:32:13 +08:00
|
|
|
if (!SrcTy->isSized() || !DstTy->isSized())
|
|
|
|
return false;
|
2015-03-10 10:37:25 +08:00
|
|
|
if (DL.getTypeAllocSize(SrcTy) != DL.getTypeAllocSize(DstTy))
|
2010-01-05 15:32:13 +08:00
|
|
|
return false;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2015-03-10 10:37:25 +08:00
|
|
|
Instruction *InstCombiner::tryOptimizeCall(CallInst *CI) {
|
2014-04-25 13:29:35 +08:00
|
|
|
if (!CI->getCalledFunction()) return nullptr;
|
2010-03-06 18:50:38 +08:00
|
|
|
|
2015-01-21 19:23:40 +08:00
|
|
|
auto InstCombineRAUW = [this](Instruction *From, Value *With) {
|
2016-02-02 06:23:39 +08:00
|
|
|
replaceInstUsesWith(*From, With);
|
2015-01-21 19:23:40 +08:00
|
|
|
};
|
2018-10-11 22:51:11 +08:00
|
|
|
auto InstCombineErase = [this](Instruction *I) {
|
|
|
|
eraseInstFromFunction(*I);
|
|
|
|
};
|
2019-04-16 00:49:00 +08:00
|
|
|
LibCallSimplifier Simplifier(DL, &TLI, ORE, BFI, PSI, InstCombineRAUW,
|
2018-10-11 22:51:11 +08:00
|
|
|
InstCombineErase);
|
2015-01-21 19:23:40 +08:00
|
|
|
if (Value *With = Simplifier.optimizeCall(CI)) {
|
2012-11-30 12:05:06 +08:00
|
|
|
++NumSimplified;
|
2016-02-02 06:23:39 +08:00
|
|
|
return CI->use_empty() ? CI : replaceInstUsesWith(*CI, With);
|
2012-11-30 12:05:06 +08:00
|
|
|
}
|
2012-10-14 00:45:24 +08:00
|
|
|
|
2014-04-25 13:29:35 +08:00
|
|
|
return nullptr;
|
2010-03-06 18:50:38 +08:00
|
|
|
}
|
|
|
|
|
2016-01-30 07:27:03 +08:00
|
|
|
static IntrinsicInst *findInitTrampolineFromAlloca(Value *TrampMem) {
|
2011-09-06 21:37:06 +08:00
|
|
|
// Strip off at most one level of pointer casts, looking for an alloca. This
|
|
|
|
// is good enough in practice and simpler than handling any number of casts.
|
|
|
|
Value *Underlying = TrampMem->stripPointerCasts();
|
|
|
|
if (Underlying != TrampMem &&
|
2014-03-09 11:16:01 +08:00
|
|
|
(!Underlying->hasOneUse() || Underlying->user_back() != TrampMem))
|
2014-04-25 13:29:35 +08:00
|
|
|
return nullptr;
|
2011-09-06 21:37:06 +08:00
|
|
|
if (!isa<AllocaInst>(Underlying))
|
2014-04-25 13:29:35 +08:00
|
|
|
return nullptr;
|
2011-09-06 21:37:06 +08:00
|
|
|
|
2014-04-25 13:29:35 +08:00
|
|
|
IntrinsicInst *InitTrampoline = nullptr;
|
2014-03-09 11:16:01 +08:00
|
|
|
for (User *U : TrampMem->users()) {
|
|
|
|
IntrinsicInst *II = dyn_cast<IntrinsicInst>(U);
|
2011-09-06 21:37:06 +08:00
|
|
|
if (!II)
|
2014-04-25 13:29:35 +08:00
|
|
|
return nullptr;
|
2011-09-06 21:37:06 +08:00
|
|
|
if (II->getIntrinsicID() == Intrinsic::init_trampoline) {
|
|
|
|
if (InitTrampoline)
|
|
|
|
// More than one init_trampoline writes to this value. Give up.
|
2014-04-25 13:29:35 +08:00
|
|
|
return nullptr;
|
2011-09-06 21:37:06 +08:00
|
|
|
InitTrampoline = II;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (II->getIntrinsicID() == Intrinsic::adjust_trampoline)
|
|
|
|
// Allow any number of calls to adjust.trampoline.
|
|
|
|
continue;
|
2014-04-25 13:29:35 +08:00
|
|
|
return nullptr;
|
2011-09-06 21:37:06 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// No call to init.trampoline found.
|
|
|
|
if (!InitTrampoline)
|
2014-04-25 13:29:35 +08:00
|
|
|
return nullptr;
|
2011-09-06 21:37:06 +08:00
|
|
|
|
|
|
|
// Check that the alloca is being used in the expected way.
|
|
|
|
if (InitTrampoline->getOperand(0) != TrampMem)
|
2014-04-25 13:29:35 +08:00
|
|
|
return nullptr;
|
2011-09-06 21:37:06 +08:00
|
|
|
|
|
|
|
return InitTrampoline;
|
|
|
|
}
|
|
|
|
|
2016-01-30 07:27:03 +08:00
|
|
|
static IntrinsicInst *findInitTrampolineFromBB(IntrinsicInst *AdjustTramp,
|
2011-09-06 21:37:06 +08:00
|
|
|
Value *TrampMem) {
|
|
|
|
// Visit all the previous instructions in the basic block, and try to find a
|
|
|
|
// init.trampoline which has a direct path to the adjust.trampoline.
|
2015-10-14 00:59:33 +08:00
|
|
|
for (BasicBlock::iterator I = AdjustTramp->getIterator(),
|
|
|
|
E = AdjustTramp->getParent()->begin();
|
|
|
|
I != E;) {
|
|
|
|
Instruction *Inst = &*--I;
|
2011-09-06 21:37:06 +08:00
|
|
|
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
|
|
|
|
if (II->getIntrinsicID() == Intrinsic::init_trampoline &&
|
|
|
|
II->getOperand(0) == TrampMem)
|
|
|
|
return II;
|
|
|
|
if (Inst->mayWriteToMemory())
|
2014-04-25 13:29:35 +08:00
|
|
|
return nullptr;
|
2011-09-06 21:37:06 +08:00
|
|
|
}
|
2014-04-25 13:29:35 +08:00
|
|
|
return nullptr;
|
2011-09-06 21:37:06 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// Given a call to llvm.adjust.trampoline, find and return the corresponding
|
|
|
|
// call to llvm.init.trampoline if the call to the trampoline can be optimized
|
|
|
|
// to a direct call to a function. Otherwise return NULL.
|
2016-01-30 07:27:03 +08:00
|
|
|
static IntrinsicInst *findInitTrampoline(Value *Callee) {
|
2011-09-06 21:37:06 +08:00
|
|
|
Callee = Callee->stripPointerCasts();
|
|
|
|
IntrinsicInst *AdjustTramp = dyn_cast<IntrinsicInst>(Callee);
|
|
|
|
if (!AdjustTramp ||
|
|
|
|
AdjustTramp->getIntrinsicID() != Intrinsic::adjust_trampoline)
|
2014-04-25 13:29:35 +08:00
|
|
|
return nullptr;
|
2011-09-06 21:37:06 +08:00
|
|
|
|
|
|
|
Value *TrampMem = AdjustTramp->getOperand(0);
|
|
|
|
|
2016-01-30 07:27:03 +08:00
|
|
|
if (IntrinsicInst *IT = findInitTrampolineFromAlloca(TrampMem))
|
2011-09-06 21:37:06 +08:00
|
|
|
return IT;
|
2016-01-30 07:27:03 +08:00
|
|
|
if (IntrinsicInst *IT = findInitTrampolineFromBB(AdjustTramp, TrampMem))
|
2011-09-06 21:37:06 +08:00
|
|
|
return IT;
|
2014-04-25 13:29:35 +08:00
|
|
|
return nullptr;
|
2011-09-06 21:37:06 +08:00
|
|
|
}
|
|
|
|
|
2019-08-28 16:28:20 +08:00
|
|
|
static void annotateAnyAllocSite(CallBase &Call, const TargetLibraryInfo *TLI) {
|
2019-09-24 03:55:45 +08:00
|
|
|
unsigned NumArgs = Call.getNumArgOperands();
|
2019-08-28 16:28:20 +08:00
|
|
|
ConstantInt *Op0C = dyn_cast<ConstantInt>(Call.getOperand(0));
|
2019-09-24 03:55:45 +08:00
|
|
|
ConstantInt *Op1C =
|
|
|
|
(NumArgs == 1) ? nullptr : dyn_cast<ConstantInt>(Call.getOperand(1));
|
2019-08-28 23:04:48 +08:00
|
|
|
// Bail out if the allocation size is zero.
|
2019-08-28 16:28:20 +08:00
|
|
|
if ((Op0C && Op0C->isNullValue()) || (Op1C && Op1C->isNullValue()))
|
|
|
|
return;
|
2019-08-28 23:04:48 +08:00
|
|
|
|
2019-08-28 16:28:20 +08:00
|
|
|
if (isMallocLikeFn(&Call, TLI) && Op0C) {
|
2019-09-11 18:37:03 +08:00
|
|
|
if (isOpNewLikeFn(&Call, TLI))
|
|
|
|
Call.addAttribute(AttributeList::ReturnIndex,
|
|
|
|
Attribute::getWithDereferenceableBytes(
|
|
|
|
Call.getContext(), Op0C->getZExtValue()));
|
|
|
|
else
|
|
|
|
Call.addAttribute(AttributeList::ReturnIndex,
|
|
|
|
Attribute::getWithDereferenceableOrNullBytes(
|
|
|
|
Call.getContext(), Op0C->getZExtValue()));
|
2019-08-28 16:28:20 +08:00
|
|
|
} else if (isReallocLikeFn(&Call, TLI) && Op1C) {
|
|
|
|
Call.addAttribute(AttributeList::ReturnIndex,
|
|
|
|
Attribute::getWithDereferenceableOrNullBytes(
|
|
|
|
Call.getContext(), Op1C->getZExtValue()));
|
|
|
|
} else if (isCallocLikeFn(&Call, TLI) && Op0C && Op1C) {
|
|
|
|
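// For calloc-like calls the usable size is count * size; only annotate the
// return value when that multiplication does not overflow.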
bool Overflow;
|
|
|
|
const APInt &N = Op0C->getValue();
|
|
|
|
APInt Size = N.umul_ov(Op1C->getValue(), Overflow);
|
|
|
|
if (!Overflow)
|
|
|
|
Call.addAttribute(AttributeList::ReturnIndex,
|
|
|
|
Attribute::getWithDereferenceableOrNullBytes(
|
|
|
|
Call.getContext(), Size.getZExtValue()));
|
2019-09-24 03:55:45 +08:00
|
|
|
} else if (isStrdupLikeFn(&Call, TLI)) {
|
|
|
|
uint64_t Len = GetStringLength(Call.getOperand(0));
|
|
|
|
if (Len) {
|
|
|
|
// strdup
|
|
|
|
if (NumArgs == 1)
|
|
|
|
Call.addAttribute(AttributeList::ReturnIndex,
|
|
|
|
Attribute::getWithDereferenceableOrNullBytes(
|
|
|
|
Call.getContext(), Len));
|
|
|
|
// strndup
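// strndup(s, n) yields at most n characters plus a terminating nul, hence
// the min with n + 1.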
|
|
|
|
else if (NumArgs == 2 && Op1C)
|
|
|
|
Call.addAttribute(
|
|
|
|
AttributeList::ReturnIndex,
|
|
|
|
Attribute::getWithDereferenceableOrNullBytes(
|
|
|
|
Call.getContext(), std::min(Len, Op1C->getZExtValue() + 1)));
|
|
|
|
}
|
2019-08-28 16:28:20 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-02-09 04:48:56 +08:00
|
|
|
/// Improvements for call, callbr and invoke instructions.
|
2019-02-01 01:23:29 +08:00
|
|
|
Instruction *InstCombiner::visitCallBase(CallBase &Call) {
|
2019-08-28 16:28:20 +08:00
|
|
|
if (isAllocationFn(&Call, &TLI))
|
|
|
|
annotateAnyAllocSite(Call, &TLI);
|
|
|
|
|
2010-01-05 15:32:13 +08:00
|
|
|
bool Changed = false;
|
|
|
|
|
2015-06-17 04:24:25 +08:00
|
|
|
// Mark any parameters that are known to be non-null with the nonnull
|
|
|
|
// attribute. This is helpful for inlining calls to functions with null
|
|
|
|
// checks on their arguments.
|
2017-06-01 03:23:09 +08:00
|
|
|
SmallVector<unsigned, 4> ArgNos;
|
2015-06-17 04:24:25 +08:00
|
|
|
unsigned ArgNo = 0;
|
2015-12-02 14:58:49 +08:00
|
|
|
|
2019-02-01 01:23:29 +08:00
|
|
|
for (Value *V : Call.args()) {
|
2016-01-30 07:14:58 +08:00
|
|
|
if (V->getType()->isPointerTy() &&
|
2019-02-01 01:23:29 +08:00
|
|
|
!Call.paramHasAttr(ArgNo, Attribute::NonNull) &&
|
|
|
|
isKnownNonZero(V, DL, 0, &AC, &Call, &DT))
|
2017-06-01 03:23:09 +08:00
|
|
|
ArgNos.push_back(ArgNo);
|
2015-06-17 04:24:25 +08:00
|
|
|
ArgNo++;
|
|
|
|
}
|
2015-12-02 14:58:49 +08:00
|
|
|
|
2019-02-01 01:23:29 +08:00
|
|
|
assert(ArgNo == Call.arg_size() && "sanity check");
|
2015-06-17 04:24:25 +08:00
|
|
|
|
2017-06-01 03:23:09 +08:00
|
|
|
if (!ArgNos.empty()) {
|
2019-02-01 01:23:29 +08:00
|
|
|
AttributeList AS = Call.getAttributes();
|
|
|
|
LLVMContext &Ctx = Call.getContext();
|
2017-06-01 03:23:09 +08:00
|
|
|
AS = AS.addParamAttribute(Ctx, ArgNos,
|
|
|
|
Attribute::get(Ctx, Attribute::NonNull));
|
2019-02-01 01:23:29 +08:00
|
|
|
Call.setAttributes(AS);
|
2015-12-02 14:58:49 +08:00
|
|
|
Changed = true;
|
|
|
|
}
|
|
|
|
|
2010-12-20 16:25:06 +08:00
|
|
|
// If the callee is a pointer to a function, attempt to move any casts to the
|
2019-02-09 04:48:56 +08:00
|
|
|
// arguments of the call/callbr/invoke.
|
2019-02-01 01:23:29 +08:00
|
|
|
Value *Callee = Call.getCalledValue();
|
|
|
|
if (!isa<Function>(Callee) && transformConstExprCastCall(Call))
|
2014-04-25 13:29:35 +08:00
|
|
|
return nullptr;
|
2010-01-05 15:32:13 +08:00
|
|
|
|
2016-03-15 04:18:54 +08:00
|
|
|
if (Function *CalleeF = dyn_cast<Function>(Callee)) {
|
|
|
|
// Remove the convergent attr on calls when the callee is not convergent.
|
2019-02-01 01:23:29 +08:00
|
|
|
if (Call.isConvergent() && !CalleeF->isConvergent() &&
|
2016-06-21 03:04:44 +08:00
|
|
|
!CalleeF->isIntrinsic()) {
|
2019-02-01 01:23:29 +08:00
|
|
|
LLVM_DEBUG(dbgs() << "Removing convergent attr from instr " << Call
|
|
|
|
<< "\n");
|
|
|
|
Call.setNotConvergent();
|
|
|
|
return &Call;
|
2016-03-15 04:18:54 +08:00
|
|
|
}
|
|
|
|
|
2010-02-02 02:11:34 +08:00
|
|
|
// If the call and callee calling conventions don't match, this call must
|
|
|
|
// be unreachable, as the call is undefined.
|
2019-02-01 01:23:29 +08:00
|
|
|
if (CalleeF->getCallingConv() != Call.getCallingConv() &&
|
2010-02-02 02:11:34 +08:00
|
|
|
// Only do this for calls to a function with a body. A prototype may
|
|
|
|
// not actually end up matching the implementation's calling conv for a
|
|
|
|
// variety of reasons (e.g. it may be written in assembly).
|
|
|
|
!CalleeF->isDeclaration()) {
|
2019-02-01 01:23:29 +08:00
|
|
|
Instruction *OldCall = &Call;
|
2019-04-18 01:37:58 +08:00
|
|
|
CreateNonTerminatorUnreachable(OldCall);
|
2012-12-13 08:18:46 +08:00
|
|
|
// If OldCall does not return void then replaceAllUsesWith undef.
|
2010-01-05 15:32:13 +08:00
|
|
|
// This allows ValueHandlers and custom metadata to adjust itself.
|
|
|
|
if (!OldCall->getType()->isVoidTy())
|
2016-02-02 06:23:39 +08:00
|
|
|
replaceInstUsesWith(*OldCall, UndefValue::get(OldCall->getType()));
|
2010-02-02 02:04:58 +08:00
|
|
|
if (isa<CallInst>(OldCall))
|
2016-02-02 06:23:39 +08:00
|
|
|
return eraseInstFromFunction(*OldCall);
|
2012-02-03 08:07:04 +08:00
|
|
|
|
2019-02-09 04:48:56 +08:00
|
|
|
// We cannot remove an invoke or a callbr, because it would change the
|
|
|
|
// CFG, just change the callee to a null pointer.
|
|
|
|
cast<CallBase>(OldCall)->setCalledFunction(
|
2019-02-02 04:44:54 +08:00
|
|
|
CalleeF->getFunctionType(),
|
|
|
|
Constant::getNullValue(CalleeF->getType()));
|
2014-04-25 13:29:35 +08:00
|
|
|
return nullptr;
|
2010-01-05 15:32:13 +08:00
|
|
|
}
|
2016-03-15 04:18:54 +08:00
|
|
|
}
|
2010-01-05 15:32:13 +08:00
|
|
|
|
2018-07-10 06:27:23 +08:00
|
|
|
if ((isa<ConstantPointerNull>(Callee) &&
|
2019-02-01 01:23:29 +08:00
|
|
|
!NullPointerIsDefined(Call.getFunction())) ||
|
2018-07-10 06:27:23 +08:00
|
|
|
isa<UndefValue>(Callee)) {
|
2019-02-01 01:23:29 +08:00
|
|
|
// If Call does not return void then replaceAllUsesWith undef.
|
2010-01-05 15:32:13 +08:00
|
|
|
// This allows ValueHandlers and custom metadata to adjust itself.
|
2019-02-01 01:23:29 +08:00
|
|
|
if (!Call.getType()->isVoidTy())
|
|
|
|
replaceInstUsesWith(Call, UndefValue::get(Call.getType()));
|
2010-01-05 15:32:13 +08:00
|
|
|
|
2019-02-09 04:48:56 +08:00
|
|
|
if (Call.isTerminator()) {
|
|
|
|
// Can't remove an invoke or callbr because we cannot change the CFG.
|
2014-04-25 13:29:35 +08:00
|
|
|
return nullptr;
|
2010-01-05 15:32:13 +08:00
|
|
|
}
|
2012-06-22 07:52:14 +08:00
|
|
|
|
2019-04-18 01:37:58 +08:00
|
|
|
// This instruction is not reachable, just remove it.
|
|
|
|
CreateNonTerminatorUnreachable(&Call);
|
2019-02-01 01:23:29 +08:00
|
|
|
return eraseInstFromFunction(Call);
|
2010-01-05 15:32:13 +08:00
|
|
|
}
|
|
|
|
|
2016-01-30 07:27:03 +08:00
|
|
|
if (IntrinsicInst *II = findInitTrampoline(Callee))
|
2019-02-01 01:23:29 +08:00
|
|
|
return transformCallThroughTrampoline(Call, *II);
|
2010-01-05 15:32:13 +08:00
|
|
|
|
2011-07-18 12:54:35 +08:00
|
|
|
PointerType *PTy = cast<PointerType>(Callee->getType());
|
|
|
|
FunctionType *FTy = cast<FunctionType>(PTy->getElementType());
|
2010-01-05 15:32:13 +08:00
|
|
|
if (FTy->isVarArg()) {
|
2011-11-29 09:18:23 +08:00
|
|
|
int ix = FTy->getNumParams();
|
2010-01-05 15:32:13 +08:00
|
|
|
// See if we can optimize any arguments passed through the varargs area of
|
|
|
|
// the call.
|
2019-02-01 01:23:29 +08:00
|
|
|
for (auto I = Call.arg_begin() + FTy->getNumParams(), E = Call.arg_end();
|
|
|
|
I != E; ++I, ++ix) {
|
2010-01-05 15:32:13 +08:00
|
|
|
CastInst *CI = dyn_cast<CastInst>(*I);
|
2019-02-01 01:23:29 +08:00
|
|
|
if (CI && isSafeToEliminateVarargsCast(Call, DL, CI, ix)) {
|
2010-01-05 15:32:13 +08:00
|
|
|
*I = CI->getOperand(0);
|
2019-06-06 04:38:17 +08:00
|
|
|
|
|
|
|
// Update the byval type to match the argument type.
|
|
|
|
if (Call.isByValArgument(ix)) {
|
|
|
|
Call.removeParamAttr(ix, Attribute::ByVal);
|
|
|
|
Call.addParamAttr(
|
|
|
|
ix, Attribute::getWithByValType(
|
|
|
|
Call.getContext(),
|
|
|
|
CI->getOperand(0)->getType()->getPointerElementType()));
|
|
|
|
}
|
2010-01-05 15:32:13 +08:00
|
|
|
Changed = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-02-01 01:23:29 +08:00
|
|
|
if (isa<InlineAsm>(Callee) && !Call.doesNotThrow()) {
|
2010-01-05 15:32:13 +08:00
|
|
|
// Inline asm calls cannot throw - mark them 'nounwind'.
|
2019-02-01 01:23:29 +08:00
|
|
|
Call.setDoesNotThrow();
|
2010-01-05 15:32:13 +08:00
|
|
|
Changed = true;
|
|
|
|
}
|
|
|
|
|
2012-10-09 00:38:25 +08:00
|
|
|
// Try to optimize the call if possible; we require DataLayout for most of
|
2010-03-06 18:50:38 +08:00
|
|
|
// this. None of these calls are seen as possibly dead so go ahead and
|
|
|
|
// delete the instruction now.
|
2019-02-01 01:23:29 +08:00
|
|
|
if (CallInst *CI = dyn_cast<CallInst>(&Call)) {
|
2015-03-10 10:37:25 +08:00
|
|
|
Instruction *I = tryOptimizeCall(CI);
|
2010-03-06 18:59:25 +08:00
|
|
|
// If we changed something return the result, etc. Otherwise let
|
|
|
|
// the fallthrough check.
|
2016-02-02 06:23:39 +08:00
|
|
|
if (I) return eraseInstFromFunction(*I);
|
2010-03-06 18:50:38 +08:00
|
|
|
}
|
|
|
|
|
2019-09-24 02:20:01 +08:00
|
|
|
if (isAllocLikeFn(&Call, &TLI))
|
|
|
|
return visitAllocSite(Call);
|
|
|
|
|
2019-02-01 01:23:29 +08:00
|
|
|
return Changed ? &Call : nullptr;
|
2010-01-05 15:32:13 +08:00
|
|
|
}
|
|
|
|
|
2016-01-21 06:24:38 +08:00
|
|
|
/// If the callee is a constexpr cast of a function, attempt to move the cast to
|
2019-02-09 04:48:56 +08:00
|
|
|
/// the arguments of the call/callbr/invoke.
|
2019-02-01 01:23:29 +08:00
|
|
|
bool InstCombiner::transformConstExprCastCall(CallBase &Call) {
|
|
|
|
auto *Callee = dyn_cast<Function>(Call.getCalledValue()->stripPointerCasts());
|
2014-04-25 13:29:35 +08:00
|
|
|
if (!Callee)
|
2010-01-05 15:32:13 +08:00
|
|
|
return false;
|
2016-08-11 23:23:56 +08:00
|
|
|
|
2018-04-03 06:49:44 +08:00
|
|
|
// If this is a call to a thunk function, don't remove the cast. Thunks are
|
|
|
|
// used to transparently forward all incoming parameters and outgoing return
|
|
|
|
// values, so it's important to leave the cast in place.
|
2015-01-22 06:32:04 +08:00
|
|
|
if (Callee->hasFnAttribute("thunk"))
|
|
|
|
return false;
|
2016-08-11 23:23:56 +08:00
|
|
|
|
2018-04-03 06:49:44 +08:00
|
|
|
// If this is a musttail call, the callee's prototype must match the caller's
|
|
|
|
// prototype with the exception of pointee types. The code below doesn't
|
|
|
|
// implement that, so we can't do this transform.
|
|
|
|
// TODO: Do the transform if it only requires adding pointer casts.
|
2019-02-01 01:23:29 +08:00
|
|
|
if (Call.isMustTailCall())
|
2018-04-03 06:49:44 +08:00
|
|
|
return false;
|
|
|
|
|
2019-02-01 01:23:29 +08:00
|
|
|
Instruction *Caller = &Call;
|
|
|
|
const AttributeList &CallerPAL = Call.getAttributes();
|
2010-01-05 15:32:13 +08:00
|
|
|
|
|
|
|
// Okay, this is a cast from a function to a different type. Unless doing so
|
|
|
|
// would cause a type conversion of one of our arguments, change this call to
|
|
|
|
// be a direct call with arguments casted to the appropriate types.
|
2011-07-18 12:54:35 +08:00
|
|
|
FunctionType *FT = Callee->getFunctionType();
|
|
|
|
Type *OldRetTy = Caller->getType();
|
|
|
|
Type *NewRetTy = FT->getReturnType();
|
2010-01-05 15:32:13 +08:00
|
|
|
|
|
|
|
// Check to see if we are changing the return type...
|
|
|
|
if (OldRetTy != NewRetTy) {
|
2014-01-19 06:47:12 +08:00
|
|
|
|
|
|
|
if (NewRetTy->isStructTy())
|
|
|
|
return false; // TODO: Handle multiple return values.
|
|
|
|
|
2015-01-06 16:41:31 +08:00
|
|
|
if (!CastInst::isBitOrNoopPointerCastable(NewRetTy, OldRetTy, DL)) {
|
2013-09-18 05:10:14 +08:00
|
|
|
if (Callee->isDeclaration())
|
|
|
|
return false; // Cannot transform this return value.
|
2010-01-05 15:32:13 +08:00
|
|
|
|
2013-09-18 05:10:14 +08:00
|
|
|
if (!Caller->use_empty() &&
|
|
|
|
// void -> non-void is handled specially
|
|
|
|
!NewRetTy->isVoidTy())
|
2014-10-23 12:08:42 +08:00
|
|
|
return false; // Cannot transform this return value.
|
2013-09-18 05:10:14 +08:00
|
|
|
}
|
2010-01-05 15:32:13 +08:00
|
|
|
|
|
|
|
if (!CallerPAL.isEmpty() && !Caller->use_empty()) {
|
2017-03-22 00:57:19 +08:00
|
|
|
AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex);
|
2015-05-07 07:19:56 +08:00
|
|
|
if (RAttrs.overlaps(AttributeFuncs::typeIncompatible(NewRetTy)))
|
2010-01-05 15:32:13 +08:00
|
|
|
return false; // Attribute not compatible with transformed value.
|
|
|
|
}
|
|
|
|
|
2019-02-09 04:48:56 +08:00
|
|
|
// If the callbase is an invoke/callbr instruction, and the return value is
|
|
|
|
// used by a PHI node in a successor, we cannot change the return type of
|
|
|
|
// the call because there is no place to put the cast instruction (without
|
|
|
|
// breaking the critical edge). Bail out in this case.
|
|
|
|
if (!Caller->use_empty()) {
|
2010-01-05 15:32:13 +08:00
|
|
|
if (InvokeInst *II = dyn_cast<InvokeInst>(Caller))
|
2014-03-09 11:16:01 +08:00
|
|
|
for (User *U : II->users())
|
|
|
|
if (PHINode *PN = dyn_cast<PHINode>(U))
|
2010-01-05 15:32:13 +08:00
|
|
|
if (PN->getParent() == II->getNormalDest() ||
|
|
|
|
PN->getParent() == II->getUnwindDest())
|
|
|
|
return false;
|
2019-02-09 04:48:56 +08:00
|
|
|
// FIXME: Be conservative for callbr to avoid a quadratic search.
|
2019-02-10 10:21:29 +08:00
|
|
|
if (isa<CallBrInst>(Caller))
|
2019-02-09 04:48:56 +08:00
|
|
|
return false;
|
|
|
|
}
|
2010-01-05 15:32:13 +08:00
|
|
|
}
|
|
|
|
|
2019-02-01 01:23:29 +08:00
|
|
|
unsigned NumActualArgs = Call.arg_size();
|
2010-01-05 15:32:13 +08:00
|
|
|
unsigned NumCommonArgs = std::min(FT->getNumParams(), NumActualArgs);
|
|
|
|
|
2015-01-06 16:41:31 +08:00
|
|
|
// Prevent us from turning:
|
|
|
|
// declare void @takes_i32_inalloca(i32* inalloca)
|
|
|
|
// call void bitcast (void (i32*)* @takes_i32_inalloca to void (i32)*)(i32 0)
|
|
|
|
//
|
|
|
|
// into:
|
|
|
|
// call void @takes_i32_inalloca(i32* null)
|
2015-03-12 02:03:05 +08:00
|
|
|
//
|
|
|
|
// Similarly, avoid folding away bitcasts of byval calls.
|
|
|
|
if (Callee->getAttributes().hasAttrSomewhere(Attribute::InAlloca) ||
|
|
|
|
Callee->getAttributes().hasAttrSomewhere(Attribute::ByVal))
|
2015-01-06 16:41:31 +08:00
|
|
|
return false;
|
|
|
|
|
2019-02-01 01:23:29 +08:00
|
|
|
auto AI = Call.arg_begin();
|
2010-01-05 15:32:13 +08:00
|
|
|
for (unsigned i = 0, e = NumCommonArgs; i != e; ++i, ++AI) {
|
2011-07-18 12:54:35 +08:00
|
|
|
Type *ParamTy = FT->getParamType(i);
|
|
|
|
Type *ActTy = (*AI)->getType();
|
2010-01-05 15:32:13 +08:00
|
|
|
|
2015-01-06 16:41:31 +08:00
|
|
|
if (!CastInst::isBitOrNoopPointerCastable(ActTy, ParamTy, DL))
|
2010-01-05 15:32:13 +08:00
|
|
|
return false; // Cannot transform this parameter value.
|
|
|
|
|
2017-04-14 07:12:13 +08:00
|
|
|
if (AttrBuilder(CallerPAL.getParamAttributes(i))
|
|
|
|
.overlaps(AttributeFuncs::typeIncompatible(ParamTy)))
|
2010-01-05 15:32:13 +08:00
|
|
|
return false; // Attribute not compatible with transformed value.
|
2012-02-03 08:07:04 +08:00
|
|
|
|
2019-02-01 01:23:29 +08:00
|
|
|
if (Call.isInAllocaArgument(i))
|
2014-01-28 10:38:36 +08:00
|
|
|
return false; // Cannot transform to and from inalloca.
|
|
|
|
|
2010-12-20 16:36:38 +08:00
|
|
|
// If the parameter is passed as a byval argument, then we have to have a
|
|
|
|
// sized type and the sized type has to have the same size as the old type.
|
2017-04-14 07:12:13 +08:00
|
|
|
if (ParamTy != ActTy && CallerPAL.hasParamAttribute(i, Attribute::ByVal)) {
|
2011-07-18 12:54:35 +08:00
|
|
|
PointerType *ParamPTy = dyn_cast<PointerType>(ParamTy);
|
2015-03-10 10:37:25 +08:00
|
|
|
if (!ParamPTy || !ParamPTy->getElementType()->isSized())
|
2010-12-20 16:36:38 +08:00
|
|
|
return false;
|
2012-02-03 08:07:04 +08:00
|
|
|
|
2019-06-06 04:38:17 +08:00
|
|
|
Type *CurElTy = Call.getParamByValType(i);
|
2015-03-10 10:37:25 +08:00
|
|
|
if (DL.getTypeAllocSize(CurElTy) !=
|
|
|
|
DL.getTypeAllocSize(ParamPTy->getElementType()))
|
2010-12-20 16:36:38 +08:00
|
|
|
return false;
|
|
|
|
}
|
2010-01-05 15:32:13 +08:00
|
|
|
}

  if (Callee->isDeclaration()) {
    // Do not delete arguments unless we have a function body.
    if (FT->getNumParams() < NumActualArgs && !FT->isVarArg())
      return false;

    // If the callee is just a declaration, don't change the varargsness of the
    // call.  We don't want to introduce a varargs call where one doesn't
    // already exist.
    PointerType *APTy = cast<PointerType>(Call.getCalledValue()->getType());
    if (FT->isVarArg()!=cast<FunctionType>(APTy->getElementType())->isVarArg())
      return false;

    // If both the callee and the cast type are varargs, we still have to make
    // sure the number of fixed parameters are the same or we have the same
    // ABI issues as if we introduce a varargs call.
    if (FT->isVarArg() &&
        cast<FunctionType>(APTy->getElementType())->isVarArg() &&
        FT->getNumParams() !=
            cast<FunctionType>(APTy->getElementType())->getNumParams())
      return false;
  }

  if (FT->getNumParams() < NumActualArgs && FT->isVarArg() &&
      !CallerPAL.isEmpty()) {
    // In this case we have more arguments than the new function type, but we
    // won't be dropping them.  Check that these extra arguments have attributes
    // that are compatible with being a vararg call argument.
    unsigned SRetIdx;
    if (CallerPAL.hasAttrSomewhere(Attribute::StructRet, &SRetIdx) &&
        SRetIdx > FT->getNumParams())
      return false;
  }

  // Okay, we decided that this is a safe thing to do: go ahead and start
  // inserting cast instructions as necessary.
  SmallVector<Value *, 8> Args;
  SmallVector<AttributeSet, 8> ArgAttrs;
  Args.reserve(NumActualArgs);
  ArgAttrs.reserve(NumActualArgs);

  // Get any return attributes.
  AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex);

  // If the return value is not being used, the type may not be compatible
  // with the existing attributes.  Wipe out any problematic attributes.
  RAttrs.remove(AttributeFuncs::typeIncompatible(NewRetTy));

  LLVMContext &Ctx = Call.getContext();
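  // Rebuild the argument list, casting arguments to the new parameter types
  // where needed and carrying over the caller's parameter attributes.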
  AI = Call.arg_begin();
  for (unsigned i = 0; i != NumCommonArgs; ++i, ++AI) {
    Type *ParamTy = FT->getParamType(i);

    Value *NewArg = *AI;
    if ((*AI)->getType() != ParamTy)
      NewArg = Builder.CreateBitOrPointerCast(*AI, ParamTy);
    Args.push_back(NewArg);

    // Add any parameter attributes.
    if (CallerPAL.hasParamAttribute(i, Attribute::ByVal)) {
      AttrBuilder AB(CallerPAL.getParamAttributes(i));
      AB.addByValAttr(NewArg->getType()->getPointerElementType());
      ArgAttrs.push_back(AttributeSet::get(Ctx, AB));
    } else
      ArgAttrs.push_back(CallerPAL.getParamAttributes(i));
  }

  // If the function takes more arguments than the call was taking, add them
  // now.
  for (unsigned i = NumCommonArgs; i != FT->getNumParams(); ++i) {
    Args.push_back(Constant::getNullValue(FT->getParamType(i)));
    ArgAttrs.push_back(AttributeSet());
  }

  // If we are removing arguments to the function, emit an obnoxious warning.
  if (FT->getNumParams() < NumActualArgs) {
    // TODO: if (!FT->isVarArg()) this call may be unreachable. PR14722
    if (FT->isVarArg()) {
      // Add all of the arguments in their promoted form to the arg list.
      for (unsigned i = FT->getNumParams(); i != NumActualArgs; ++i, ++AI) {
        Type *PTy = getPromotedType((*AI)->getType());
        Value *NewArg = *AI;
        if (PTy != (*AI)->getType()) {
          // Must promote to pass through va_arg area!
          Instruction::CastOps opcode =
              CastInst::getCastOpcode(*AI, false, PTy, false);
          NewArg = Builder.CreateCast(opcode, *AI, PTy);
        }
        Args.push_back(NewArg);

        // Add any parameter attributes.
        ArgAttrs.push_back(CallerPAL.getParamAttributes(i));
      }
    }
  }

  AttributeSet FnAttrs = CallerPAL.getFnAttributes();

  if (NewRetTy->isVoidTy())
    Caller->setName("");   // Void type should not have a name.

  assert((ArgAttrs.size() == FT->getNumParams() || FT->isVarArg()) &&
         "missing argument attributes");
  AttributeList NewCallerPAL = AttributeList::get(
      Ctx, FnAttrs, AttributeSet::get(Ctx, RAttrs), ArgAttrs);

  SmallVector<OperandBundleDef, 1> OpBundles;
  Call.getOperandBundlesAsDefs(OpBundles);

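  // Create the new call site as the same kind of instruction as the old one
  // (plain call, invoke, or callbr), reusing its successor blocks.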
  CallBase *NewCall;
  if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
    NewCall = Builder.CreateInvoke(Callee, II->getNormalDest(),
                                   II->getUnwindDest(), Args, OpBundles);
  } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(Caller)) {
    NewCall = Builder.CreateCallBr(Callee, CBI->getDefaultDest(),
                                   CBI->getIndirectDests(), Args, OpBundles);
  } else {
    NewCall = Builder.CreateCall(Callee, Args, OpBundles);
    cast<CallInst>(NewCall)->setTailCallKind(
        cast<CallInst>(Caller)->getTailCallKind());
  }

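  // Carry over the name, calling convention, and attributes from the old
  // call site.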
  NewCall->takeName(Caller);
  NewCall->setCallingConv(Call.getCallingConv());
  NewCall->setAttributes(NewCallerPAL);

  // Preserve the weight metadata for the new call instruction. The metadata
  // is used by SamplePGO to check callsite's hotness.
  uint64_t W;
  if (Caller->extractProfTotalWeight(W))
    NewCall->setProfWeight(W);

  // Insert a cast of the return type as necessary.
  Instruction *NC = NewCall;
  Value *NV = NC;
  if (OldRetTy != NV->getType() && !Caller->use_empty()) {
    if (!NV->getType()->isVoidTy()) {
      NV = NC = CastInst::CreateBitOrPointerCast(NC, OldRetTy);
      NC->setDebugLoc(Caller->getDebugLoc());

      // If this is an invoke/callbr instruction, we should insert it after the
      // first non-phi instruction in the normal successor block.
      if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
        BasicBlock::iterator I = II->getNormalDest()->getFirstInsertionPt();
        InsertNewInstBefore(NC, *I);
      } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(Caller)) {
        BasicBlock::iterator I = CBI->getDefaultDest()->getFirstInsertionPt();
        InsertNewInstBefore(NC, *I);
      } else {
        // Otherwise, it's a call, just insert cast right after the call.
        InsertNewInstBefore(NC, *Caller);
      }
      Worklist.AddUsersToWorkList(*Caller);
    } else {
      NV = UndefValue::get(Caller->getType());
    }
  }

  if (!Caller->use_empty())
    replaceInstUsesWith(*Caller, NV);
  else if (Caller->hasValueHandle()) {
    if (OldRetTy == NV->getType())
      ValueHandleBase::ValueIsRAUWd(Caller, NV);
    else
      // We cannot call ValueIsRAUWd with a different type, and the
      // actual tracked value will disappear.
      ValueHandleBase::ValueIsDeleted(Caller);
  }

  eraseInstFromFunction(*Caller);
  return true;
}

/// Turn a call to a function created by init_trampoline / adjust_trampoline
/// intrinsic pair into a direct call to the underlying function.
Instruction *
InstCombiner::transformCallThroughTrampoline(CallBase &Call,
                                             IntrinsicInst &Tramp) {
  Value *Callee = Call.getCalledValue();
  Type *CalleeTy = Callee->getType();
  FunctionType *FTy = Call.getFunctionType();
  AttributeList Attrs = Call.getAttributes();

  // If the call already has the 'nest' attribute somewhere then give up -
  // otherwise 'nest' would occur twice after splicing in the chain.
  if (Attrs.hasAttrSomewhere(Attribute::Nest))
    return nullptr;

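  // Argument 1 of the init.trampoline intrinsic is the underlying function;
  // argument 2 (used below) is the 'nest' chain value to splice in.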
  Function *NestF = cast<Function>(Tramp.getArgOperand(1)->stripPointerCasts());
  FunctionType *NestFTy = NestF->getFunctionType();

  AttributeList NestAttrs = NestF->getAttributes();
  if (!NestAttrs.isEmpty()) {
    unsigned NestArgNo = 0;
    Type *NestTy = nullptr;
    AttributeSet NestAttr;

    // Look for a parameter marked with the 'nest' attribute.
    for (FunctionType::param_iterator I = NestFTy->param_begin(),
                                      E = NestFTy->param_end();
         I != E; ++NestArgNo, ++I) {
      AttributeSet AS = NestAttrs.getParamAttributes(NestArgNo);
      if (AS.hasAttribute(Attribute::Nest)) {
        // Record the parameter type and any other attributes.
        NestTy = *I;
        NestAttr = AS;
        break;
      }
    }

    if (NestTy) {
      std::vector<Value*> NewArgs;
      std::vector<AttributeSet> NewArgAttrs;
      NewArgs.reserve(Call.arg_size() + 1);
      NewArgAttrs.reserve(Call.arg_size());

      // Insert the nest argument into the call argument list, which may
      // mean appending it.  Likewise for attributes.
      {
        unsigned ArgNo = 0;
        auto I = Call.arg_begin(), E = Call.arg_end();
        do {
          if (ArgNo == NestArgNo) {
            // Add the chain argument and attributes.
            Value *NestVal = Tramp.getArgOperand(2);
            if (NestVal->getType() != NestTy)
              NestVal = Builder.CreateBitCast(NestVal, NestTy, "nest");
            NewArgs.push_back(NestVal);
            NewArgAttrs.push_back(NestAttr);
          }

          if (I == E)
            break;

          // Add the original argument and attributes.
          NewArgs.push_back(*I);
          NewArgAttrs.push_back(Attrs.getParamAttributes(ArgNo));

          ++ArgNo;
          ++I;
        } while (true);
      }

      // The trampoline may have been bitcast to a bogus type (FTy).
      // Handle this by synthesizing a new function type, equal to FTy
      // with the chain parameter inserted.
      std::vector<Type*> NewTypes;
      NewTypes.reserve(FTy->getNumParams()+1);

      // Insert the chain's type into the list of parameter types, which may
      // mean appending it.
      {
        unsigned ArgNo = 0;
        FunctionType::param_iterator I = FTy->param_begin(),
                                     E = FTy->param_end();

        do {
          if (ArgNo == NestArgNo)
            // Add the chain's type.
            NewTypes.push_back(NestTy);

          if (I == E)
            break;

          // Add the original type.
          NewTypes.push_back(*I);

          ++ArgNo;
          ++I;
        } while (true);
      }

      // Replace the trampoline call with a direct call.  Let the generic
      // code sort out any function type mismatches.
      FunctionType *NewFTy = FunctionType::get(FTy->getReturnType(), NewTypes,
                                               FTy->isVarArg());
      Constant *NewCallee =
        NestF->getType() == PointerType::getUnqual(NewFTy) ?
        NestF : ConstantExpr::getBitCast(NestF,
                                         PointerType::getUnqual(NewFTy));
      AttributeList NewPAL =
          AttributeList::get(FTy->getContext(), Attrs.getFnAttributes(),
                             Attrs.getRetAttributes(), NewArgAttrs);

      SmallVector<OperandBundleDef, 1> OpBundles;
      Call.getOperandBundlesAsDefs(OpBundles);

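      // Recreate the call site as the same kind of instruction it was
      // (call, invoke, or callbr), now targeting the nested function directly.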
      Instruction *NewCaller;
      if (InvokeInst *II = dyn_cast<InvokeInst>(&Call)) {
        NewCaller = InvokeInst::Create(NewFTy, NewCallee,
                                       II->getNormalDest(), II->getUnwindDest(),
                                       NewArgs, OpBundles);
        cast<InvokeInst>(NewCaller)->setCallingConv(II->getCallingConv());
        cast<InvokeInst>(NewCaller)->setAttributes(NewPAL);
      } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(&Call)) {
        NewCaller =
            CallBrInst::Create(NewFTy, NewCallee, CBI->getDefaultDest(),
                               CBI->getIndirectDests(), NewArgs, OpBundles);
        cast<CallBrInst>(NewCaller)->setCallingConv(CBI->getCallingConv());
        cast<CallBrInst>(NewCaller)->setAttributes(NewPAL);
      } else {
        NewCaller = CallInst::Create(NewFTy, NewCallee, NewArgs, OpBundles);
        cast<CallInst>(NewCaller)->setTailCallKind(
            cast<CallInst>(Call).getTailCallKind());
        cast<CallInst>(NewCaller)->setCallingConv(
            cast<CallInst>(Call).getCallingConv());
        cast<CallInst>(NewCaller)->setAttributes(NewPAL);
      }
      NewCaller->setDebugLoc(Call.getDebugLoc());

      return NewCaller;
    }
  }

  // Replace the trampoline call with a direct call.  Since there is no 'nest'
  // parameter, there is no need to adjust the argument list.  Let the generic
  // code sort out any function type mismatches.
  Constant *NewCallee = ConstantExpr::getBitCast(NestF, CalleeTy);
  Call.setCalledFunction(FTy, NewCallee);
  return &Call;
}