llvm-project/clang/lib/CodeGen/CGExprAgg.cpp

940 lines
35 KiB
C++
Raw Normal View History

2007-08-24 10:22:53 +08:00
//===--- CGExprAgg.cpp - Emit LLVM Code from Aggregate Expressions --------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This contains code to emit Aggregate Expr nodes as LLVM code.
//
//===----------------------------------------------------------------------===//
#include "CodeGenFunction.h"
#include "CodeGenModule.h"
#include "CGObjCRuntime.h"
#include "clang/AST/ASTContext.h"
#include "clang/AST/DeclCXX.h"
#include "clang/AST/StmtVisitor.h"
#include "llvm/Constants.h"
#include "llvm/Function.h"
#include "llvm/GlobalVariable.h"
#include "llvm/Intrinsics.h"
using namespace clang;
using namespace CodeGen;
//===----------------------------------------------------------------------===//
// Aggregate Expression Emitter
//===----------------------------------------------------------------------===//
namespace {
class AggExprEmitter : public StmtVisitor<AggExprEmitter> {
CodeGenFunction &CGF;
CGBuilderTy &Builder;
AggValueSlot Dest;
bool IgnoreResult;
ReturnValueSlot getReturnValueSlot() const {
// If the destination slot requires garbage collection, we can't
// use the real return value slot, because we have to use the GC
// API.
if (Dest.requiresGCollection()) return ReturnValueSlot();
return ReturnValueSlot(Dest.getAddr(), Dest.isVolatile());
}
AggValueSlot EnsureSlot(QualType T) {
if (!Dest.isIgnored()) return Dest;
return CGF.CreateAggTemp(T, "agg.tmp.ensured");
}
public:
AggExprEmitter(CodeGenFunction &cgf, AggValueSlot Dest,
bool ignore)
: CGF(cgf), Builder(CGF.Builder), Dest(Dest),
IgnoreResult(ignore) {
}
//===--------------------------------------------------------------------===//
// Utilities
//===--------------------------------------------------------------------===//
/// EmitAggLoadOfLValue - Given an expression with aggregate type that
/// represents a value lvalue, this method emits the address of the lvalue,
/// then loads the result into DestPtr.
void EmitAggLoadOfLValue(const Expr *E);
/// EmitFinalDestCopy - Perform the final copy to DestPtr, if desired.
void EmitFinalDestCopy(const Expr *E, LValue Src, bool Ignore = false);
void EmitFinalDestCopy(const Expr *E, RValue Src, bool Ignore = false);
void EmitGCMove(const Expr *E, RValue Src);
bool TypeRequiresGCollection(QualType T);
//===--------------------------------------------------------------------===//
// Visitor Methods
//===--------------------------------------------------------------------===//
void VisitStmt(Stmt *S) {
CGF.ErrorUnsupported(S, "aggregate expression");
}
void VisitParenExpr(ParenExpr *PE) { Visit(PE->getSubExpr()); }
void VisitUnaryExtension(UnaryOperator *E) { Visit(E->getSubExpr()); }
// l-values.
void VisitDeclRefExpr(DeclRefExpr *DRE) { EmitAggLoadOfLValue(DRE); }
void VisitMemberExpr(MemberExpr *ME) { EmitAggLoadOfLValue(ME); }
void VisitUnaryDeref(UnaryOperator *E) { EmitAggLoadOfLValue(E); }
void VisitStringLiteral(StringLiteral *E) { EmitAggLoadOfLValue(E); }
void VisitCompoundLiteralExpr(CompoundLiteralExpr *E) {
EmitAggLoadOfLValue(E);
}
void VisitArraySubscriptExpr(ArraySubscriptExpr *E) {
EmitAggLoadOfLValue(E);
}
void VisitBlockDeclRefExpr(const BlockDeclRefExpr *E) {
EmitAggLoadOfLValue(E);
}
void VisitPredefinedExpr(const PredefinedExpr *E) {
EmitAggLoadOfLValue(E);
}
// Operators.
void VisitCastExpr(CastExpr *E);
void VisitCallExpr(const CallExpr *E);
void VisitStmtExpr(const StmtExpr *E);
void VisitBinaryOperator(const BinaryOperator *BO);
void VisitPointerToDataMemberBinaryOperator(const BinaryOperator *BO);
void VisitBinAssign(const BinaryOperator *E);
void VisitBinComma(const BinaryOperator *E);
void VisitObjCMessageExpr(ObjCMessageExpr *E);
void VisitObjCIvarRefExpr(ObjCIvarRefExpr *E) {
EmitAggLoadOfLValue(E);
}
void VisitObjCPropertyRefExpr(ObjCPropertyRefExpr *E);
void VisitConditionalOperator(const ConditionalOperator *CO);
void VisitChooseExpr(const ChooseExpr *CE);
void VisitInitListExpr(InitListExpr *E);
void VisitImplicitValueInitExpr(ImplicitValueInitExpr *E);
void VisitCXXDefaultArgExpr(CXXDefaultArgExpr *DAE) {
Visit(DAE->getExpr());
}
void VisitCXXBindTemporaryExpr(CXXBindTemporaryExpr *E);
void VisitCXXConstructExpr(const CXXConstructExpr *E);
void VisitExprWithCleanups(ExprWithCleanups *E);
void VisitCXXScalarValueInitExpr(CXXScalarValueInitExpr *E);
void VisitCXXTypeidExpr(CXXTypeidExpr *E) { EmitAggLoadOfLValue(E); }
void VisitVAArgExpr(VAArgExpr *E);
2010-02-04 01:33:16 +08:00
void EmitInitializationToLValue(Expr *E, LValue Address, QualType T);
void EmitNullInitializationToLValue(LValue Address, QualType T);
// case Expr::ChooseExprClass:
void VisitCXXThrowExpr(const CXXThrowExpr *E) { CGF.EmitCXXThrowExpr(E); }
};
} // end anonymous namespace.
//===----------------------------------------------------------------------===//
// Utilities
//===----------------------------------------------------------------------===//
/// EmitAggLoadOfLValue - Given an expression with aggregate type that
/// represents a value lvalue, this method emits the address of the lvalue,
/// then loads the result into DestPtr.
void AggExprEmitter::EmitAggLoadOfLValue(const Expr *E) {
LValue LV = CGF.EmitLValue(E);
EmitFinalDestCopy(E, LV);
}
/// \brief True if the given aggregate type requires special GC API calls.
bool AggExprEmitter::TypeRequiresGCollection(QualType T) {
// Only record types have members that might require garbage collection.
const RecordType *RecordTy = T->getAs<RecordType>();
if (!RecordTy) return false;
// Don't mess with non-trivial C++ types.
RecordDecl *Record = RecordTy->getDecl();
if (isa<CXXRecordDecl>(Record) &&
(!cast<CXXRecordDecl>(Record)->hasTrivialCopyConstructor() ||
!cast<CXXRecordDecl>(Record)->hasTrivialDestructor()))
return false;
// Check whether the type has an object member.
return Record->hasObjectMember();
}
/// \brief Perform the final move to DestPtr if RequiresGCollection is set.
///
/// The idea is that you do something like this:
/// RValue Result = EmitSomething(..., getReturnValueSlot());
/// EmitGCMove(E, Result);
/// If GC doesn't interfere, this will cause the result to be emitted
/// directly into the return value slot. If GC does interfere, a final
/// move will be performed.
void AggExprEmitter::EmitGCMove(const Expr *E, RValue Src) {
if (Dest.requiresGCollection()) {
std::pair<uint64_t, unsigned> TypeInfo =
CGF.getContext().getTypeInfo(E->getType());
unsigned long size = TypeInfo.first/8;
const llvm::Type *SizeTy = CGF.ConvertType(CGF.getContext().getSizeType());
llvm::Value *SizeVal = llvm::ConstantInt::get(SizeTy, size);
CGF.CGM.getObjCRuntime().EmitGCMemmoveCollectable(CGF, Dest.getAddr(),
Src.getAggregateAddr(),
SizeVal);
}
}
/// EmitFinalDestCopy - Perform the final copy to DestPtr, if desired.
void AggExprEmitter::EmitFinalDestCopy(const Expr *E, RValue Src, bool Ignore) {
assert(Src.isAggregate() && "value must be aggregate value!");
// If Dest is ignored, then we're evaluating an aggregate expression
// in a context (like an expression statement) that doesn't care
// about the result. C says that an lvalue-to-rvalue conversion is
// performed in these cases; C++ says that it is not. In either
// case, we don't actually need to do anything unless the value is
// volatile.
if (Dest.isIgnored()) {
if (!Src.isVolatileQualified() ||
CGF.CGM.getLangOptions().CPlusPlus ||
(IgnoreResult && Ignore))
return;
// If the source is volatile, we must read from it; to do that, we need
// some place to put it.
Dest = CGF.CreateAggTemp(E->getType(), "agg.tmp");
}
if (Dest.requiresGCollection()) {
std::pair<uint64_t, unsigned> TypeInfo =
CGF.getContext().getTypeInfo(E->getType());
unsigned long size = TypeInfo.first/8;
const llvm::Type *SizeTy = CGF.ConvertType(CGF.getContext().getSizeType());
llvm::Value *SizeVal = llvm::ConstantInt::get(SizeTy, size);
CGF.CGM.getObjCRuntime().EmitGCMemmoveCollectable(CGF,
Dest.getAddr(),
Src.getAggregateAddr(),
SizeVal);
return;
}
// If the result of the assignment is used, copy the LHS there also.
// FIXME: Pass VolatileDest as well. I think we also need to merge volatile
// from the source as well, as we can't eliminate it if either operand
// is volatile, unless copy has volatile for both source and destination..
CGF.EmitAggregateCopy(Dest.getAddr(), Src.getAggregateAddr(), E->getType(),
Dest.isVolatile()|Src.isVolatileQualified());
}
/// EmitFinalDestCopy - Perform the final copy to DestPtr, if desired.
void AggExprEmitter::EmitFinalDestCopy(const Expr *E, LValue Src, bool Ignore) {
assert(Src.isSimple() && "Can't have aggregate bitfield, vector, etc");
EmitFinalDestCopy(E, RValue::getAggregate(Src.getAddress(),
Src.isVolatileQualified()),
Ignore);
}
//===----------------------------------------------------------------------===//
// Visitor Methods
//===----------------------------------------------------------------------===//
void AggExprEmitter::VisitCastExpr(CastExpr *E) {
if (Dest.isIgnored() && E->getCastKind() != CK_Dynamic) {
Visit(E->getSubExpr());
return;
}
switch (E->getCastKind()) {
case CK_Dynamic: {
assert(isa<CXXDynamicCastExpr>(E) && "CK_Dynamic without a dynamic_cast?");
LValue LV = CGF.EmitCheckedLValue(E->getSubExpr());
// FIXME: Do we also need to handle property references here?
if (LV.isSimple())
CGF.EmitDynamicCast(LV.getAddress(), cast<CXXDynamicCastExpr>(E));
else
CGF.CGM.ErrorUnsupported(E, "non-simple lvalue dynamic_cast");
if (!Dest.isIgnored())
CGF.CGM.ErrorUnsupported(E, "lvalue dynamic_cast with a destination");
break;
}
case CK_ToUnion: {
// GCC union extension
QualType Ty = E->getSubExpr()->getType();
QualType PtrTy = CGF.getContext().getPointerType(Ty);
llvm::Value *CastPtr = Builder.CreateBitCast(Dest.getAddr(),
CGF.ConvertType(PtrTy));
EmitInitializationToLValue(E->getSubExpr(), CGF.MakeAddrLValue(CastPtr, Ty),
Ty);
break;
}
case CK_DerivedToBase:
case CK_BaseToDerived:
case CK_UncheckedDerivedToBase: {
assert(0 && "cannot perform hierarchy conversion in EmitAggExpr: "
"should have been unpacked before we got here");
break;
}
case CK_GetObjCProperty: {
LValue LV = CGF.EmitLValue(E->getSubExpr());
assert(LV.isPropertyRef());
RValue RV = CGF.EmitLoadOfPropertyRefLValue(LV, getReturnValueSlot());
EmitGCMove(E, RV);
break;
}
case CK_LValueToRValue: // hope for downstream optimization
case CK_NoOp:
case CK_UserDefinedConversion:
case CK_ConstructorConversion:
assert(CGF.getContext().hasSameUnqualifiedType(E->getSubExpr()->getType(),
E->getType()) &&
"Implicit cast types must be compatible");
Visit(E->getSubExpr());
break;
case CK_LValueBitCast:
llvm_unreachable("should not be emitting lvalue bitcast as rvalue");
break;
case CK_Dependent:
case CK_BitCast:
case CK_ArrayToPointerDecay:
case CK_FunctionToPointerDecay:
case CK_NullToPointer:
case CK_NullToMemberPointer:
case CK_BaseToDerivedMemberPointer:
case CK_DerivedToBaseMemberPointer:
case CK_MemberPointerToBoolean:
case CK_IntegralToPointer:
case CK_PointerToIntegral:
case CK_PointerToBoolean:
case CK_ToVoid:
case CK_VectorSplat:
case CK_IntegralCast:
case CK_IntegralToBoolean:
case CK_IntegralToFloating:
case CK_FloatingToIntegral:
case CK_FloatingToBoolean:
case CK_FloatingCast:
case CK_AnyPointerToObjCPointerCast:
case CK_AnyPointerToBlockPointerCast:
case CK_ObjCObjectLValueCast:
case CK_FloatingRealToComplex:
case CK_FloatingComplexToReal:
case CK_FloatingComplexToBoolean:
case CK_FloatingComplexCast:
case CK_FloatingComplexToIntegralComplex:
case CK_IntegralRealToComplex:
case CK_IntegralComplexToReal:
case CK_IntegralComplexToBoolean:
case CK_IntegralComplexCast:
case CK_IntegralComplexToFloatingComplex:
llvm_unreachable("cast kind invalid for aggregate types");
}
}
void AggExprEmitter::VisitCallExpr(const CallExpr *E) {
if (E->getCallReturnType()->isReferenceType()) {
EmitAggLoadOfLValue(E);
return;
}
RValue RV = CGF.EmitCallExpr(E, getReturnValueSlot());
EmitGCMove(E, RV);
}
void AggExprEmitter::VisitObjCMessageExpr(ObjCMessageExpr *E) {
RValue RV = CGF.EmitObjCMessageExpr(E, getReturnValueSlot());
EmitGCMove(E, RV);
}
void AggExprEmitter::VisitObjCPropertyRefExpr(ObjCPropertyRefExpr *E) {
llvm_unreachable("direct property access not surrounded by "
"lvalue-to-rvalue cast");
}
void AggExprEmitter::VisitBinComma(const BinaryOperator *E) {
CGF.EmitIgnoredExpr(E->getLHS());
Visit(E->getRHS());
}
void AggExprEmitter::VisitStmtExpr(const StmtExpr *E) {
CGF.EmitCompoundStmt(*E->getSubStmt(), true, Dest);
CGF.EnsureInsertPoint();
}
void AggExprEmitter::VisitBinaryOperator(const BinaryOperator *E) {
if (E->getOpcode() == BO_PtrMemD || E->getOpcode() == BO_PtrMemI)
VisitPointerToDataMemberBinaryOperator(E);
else
CGF.ErrorUnsupported(E, "aggregate binary expression");
}
void AggExprEmitter::VisitPointerToDataMemberBinaryOperator(
const BinaryOperator *E) {
LValue LV = CGF.EmitPointerToDataMemberBinaryExpr(E);
EmitFinalDestCopy(E, LV);
}
void AggExprEmitter::VisitBinAssign(const BinaryOperator *E) {
// For an assignment to work, the value on the right has
// to be compatible with the value on the left.
assert(CGF.getContext().hasSameUnqualifiedType(E->getLHS()->getType(),
E->getRHS()->getType())
&& "Invalid assignment");
// FIXME: __block variables need the RHS evaluated first!
LValue LHS = CGF.EmitLValue(E->getLHS());
// We have to special case property setters, otherwise we must have
// a simple lvalue (no aggregates inside vectors, bitfields).
if (LHS.isPropertyRef()) {
AggValueSlot Slot = EnsureSlot(E->getRHS()->getType());
CGF.EmitAggExpr(E->getRHS(), Slot);
CGF.EmitStoreThroughPropertyRefLValue(Slot.asRValue(), LHS);
} else {
bool GCollection = false;
if (CGF.getContext().getLangOptions().getGCMode())
GCollection = TypeRequiresGCollection(E->getLHS()->getType());
// Codegen the RHS so that it stores directly into the LHS.
AggValueSlot LHSSlot = AggValueSlot::forLValue(LHS, true,
GCollection);
CGF.EmitAggExpr(E->getRHS(), LHSSlot, false);
EmitFinalDestCopy(E, LHS, true);
}
}
void AggExprEmitter::VisitConditionalOperator(const ConditionalOperator *E) {
if (!E->getLHS()) {
CGF.ErrorUnsupported(E, "conditional operator with missing LHS");
return;
}
llvm::BasicBlock *LHSBlock = CGF.createBasicBlock("cond.true");
llvm::BasicBlock *RHSBlock = CGF.createBasicBlock("cond.false");
llvm::BasicBlock *ContBlock = CGF.createBasicBlock("cond.end");
CGF.EmitBranchOnBoolExpr(E->getCond(), LHSBlock, RHSBlock);
CGF.BeginConditionalBranch();
CGF.EmitBlock(LHSBlock);
// Save whether the destination's lifetime is externally managed.
bool DestLifetimeManaged = Dest.isLifetimeExternallyManaged();
Visit(E->getLHS());
CGF.EndConditionalBranch();
CGF.EmitBranch(ContBlock);
CGF.BeginConditionalBranch();
CGF.EmitBlock(RHSBlock);
// If the result of an agg expression is unused, then the emission
// of the LHS might need to create a destination slot. That's fine
// with us, and we can safely emit the RHS into the same slot, but
// we shouldn't claim that its lifetime is externally managed.
Dest.setLifetimeExternallyManaged(DestLifetimeManaged);
Visit(E->getRHS());
CGF.EndConditionalBranch();
CGF.EmitBranch(ContBlock);
CGF.EmitBlock(ContBlock);
}
void AggExprEmitter::VisitChooseExpr(const ChooseExpr *CE) {
Visit(CE->getChosenSubExpr(CGF.getContext()));
}
void AggExprEmitter::VisitVAArgExpr(VAArgExpr *VE) {
llvm::Value *ArgValue = CGF.EmitVAListRef(VE->getSubExpr());
llvm::Value *ArgPtr = CGF.EmitVAArg(ArgValue, VE->getType());
if (!ArgPtr) {
CGF.ErrorUnsupported(VE, "aggregate va_arg expression");
return;
}
EmitFinalDestCopy(VE, CGF.MakeAddrLValue(ArgPtr, VE->getType()));
}
void AggExprEmitter::VisitCXXBindTemporaryExpr(CXXBindTemporaryExpr *E) {
// Ensure that we have a slot, but if we already do, remember
// whether its lifetime was externally managed.
bool WasManaged = Dest.isLifetimeExternallyManaged();
Dest = EnsureSlot(E->getType());
Dest.setLifetimeExternallyManaged();
Visit(E->getSubExpr());
// Set up the temporary's destructor if its lifetime wasn't already
// being managed.
if (!WasManaged)
CGF.EmitCXXTemporary(E->getTemporary(), Dest.getAddr());
}
void
AggExprEmitter::VisitCXXConstructExpr(const CXXConstructExpr *E) {
AggValueSlot Slot = EnsureSlot(E->getType());
CGF.EmitCXXConstructExpr(E, Slot);
}
void AggExprEmitter::VisitExprWithCleanups(ExprWithCleanups *E) {
CGF.EmitExprWithCleanups(E, Dest);
}
void AggExprEmitter::VisitCXXScalarValueInitExpr(CXXScalarValueInitExpr *E) {
QualType T = E->getType();
AggValueSlot Slot = EnsureSlot(T);
EmitNullInitializationToLValue(CGF.MakeAddrLValue(Slot.getAddr(), T), T);
}
void AggExprEmitter::VisitImplicitValueInitExpr(ImplicitValueInitExpr *E) {
QualType T = E->getType();
AggValueSlot Slot = EnsureSlot(T);
EmitNullInitializationToLValue(CGF.MakeAddrLValue(Slot.getAddr(), T), T);
}
Improve codegen for initializer lists to use memset more aggressively when an initializer is variable (I handled the constant case in a previous patch). This has three pieces: 1. Enhance AggValueSlot to have a 'isZeroed' bit to tell CGExprAgg that the memory being stored into has previously been memset to zero. 2. Teach CGExprAgg to not emit stores of zero to isZeroed memory. 3. Teach CodeGenFunction::EmitAggExpr to scan initializers to determine whether they are profitable to emit a memset + inividual stores vs stores for everything. The heuristic used is that a global has to be more than 16 bytes and has to be 3/4 zero to be candidate for this xform. The two testcases are illustrative of the scenarios this catches. We now codegen test9 into: call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 400, i32 4, i1 false) %.array = getelementptr inbounds [100 x i32]* %Arr, i32 0, i32 0 %tmp = load i32* %X.addr, align 4 store i32 %tmp, i32* %.array and test10 into: call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 392, i32 8, i1 false) %tmp = getelementptr inbounds %struct.b* %S, i32 0, i32 0 %tmp1 = getelementptr inbounds %struct.a* %tmp, i32 0, i32 0 %tmp2 = load i32* %X.addr, align 4 store i32 %tmp2, i32* %tmp1, align 4 %tmp5 = getelementptr inbounds %struct.b* %S, i32 0, i32 3 %tmp10 = getelementptr inbounds %struct.a* %tmp5, i32 0, i32 4 %tmp11 = load i32* %X.addr, align 4 store i32 %tmp11, i32* %tmp10, align 4 Previously we produced 99 stores of zero for test9 and also tons for test10. This xforms should substantially speed up -O0 builds when it kicks in as well as reducing code size and optimizer heartburn on insane cases. This resolves PR279. llvm-svn: 120692
2010-12-02 15:07:26 +08:00
/// isSimpleZero - If emitting this value will obviously just cause a store of
/// zero to memory, return true. This can return false if uncertain, so it just
/// handles simple cases.
static bool isSimpleZero(const Expr *E, CodeGenFunction &CGF) {
// (0)
if (const ParenExpr *PE = dyn_cast<ParenExpr>(E))
return isSimpleZero(PE->getSubExpr(), CGF);
// 0
if (const IntegerLiteral *IL = dyn_cast<IntegerLiteral>(E))
return IL->getValue() == 0;
// +0.0
if (const FloatingLiteral *FL = dyn_cast<FloatingLiteral>(E))
return FL->getValue().isPosZero();
// int()
if ((isa<ImplicitValueInitExpr>(E) || isa<CXXScalarValueInitExpr>(E)) &&
CGF.getTypes().isZeroInitializable(E->getType()))
return true;
// (int*)0 - Null pointer expressions.
if (const CastExpr *ICE = dyn_cast<CastExpr>(E))
return ICE->getCastKind() == CK_NullToPointer;
// '\0'
if (const CharacterLiteral *CL = dyn_cast<CharacterLiteral>(E))
return CL->getValue() == 0;
// Otherwise, hard case: conservatively return false.
return false;
}
2010-02-04 01:33:16 +08:00
void
AggExprEmitter::EmitInitializationToLValue(Expr* E, LValue LV, QualType T) {
// FIXME: Ignore result?
// FIXME: Are initializers affected by volatile?
Improve codegen for initializer lists to use memset more aggressively when an initializer is variable (I handled the constant case in a previous patch). This has three pieces: 1. Enhance AggValueSlot to have a 'isZeroed' bit to tell CGExprAgg that the memory being stored into has previously been memset to zero. 2. Teach CGExprAgg to not emit stores of zero to isZeroed memory. 3. Teach CodeGenFunction::EmitAggExpr to scan initializers to determine whether they are profitable to emit a memset + inividual stores vs stores for everything. The heuristic used is that a global has to be more than 16 bytes and has to be 3/4 zero to be candidate for this xform. The two testcases are illustrative of the scenarios this catches. We now codegen test9 into: call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 400, i32 4, i1 false) %.array = getelementptr inbounds [100 x i32]* %Arr, i32 0, i32 0 %tmp = load i32* %X.addr, align 4 store i32 %tmp, i32* %.array and test10 into: call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 392, i32 8, i1 false) %tmp = getelementptr inbounds %struct.b* %S, i32 0, i32 0 %tmp1 = getelementptr inbounds %struct.a* %tmp, i32 0, i32 0 %tmp2 = load i32* %X.addr, align 4 store i32 %tmp2, i32* %tmp1, align 4 %tmp5 = getelementptr inbounds %struct.b* %S, i32 0, i32 3 %tmp10 = getelementptr inbounds %struct.a* %tmp5, i32 0, i32 4 %tmp11 = load i32* %X.addr, align 4 store i32 %tmp11, i32* %tmp10, align 4 Previously we produced 99 stores of zero for test9 and also tons for test10. This xforms should substantially speed up -O0 builds when it kicks in as well as reducing code size and optimizer heartburn on insane cases. This resolves PR279. llvm-svn: 120692
2010-12-02 15:07:26 +08:00
if (Dest.isZeroed() && isSimpleZero(E, CGF)) {
// Storing "i32 0" to a zero'd memory location is a noop.
} else if (isa<ImplicitValueInitExpr>(E)) {
2010-02-04 01:33:16 +08:00
EmitNullInitializationToLValue(LV, T);
} else if (T->isReferenceType()) {
RValue RV = CGF.EmitReferenceBindingToExpr(E, /*InitializedDecl=*/0);
CGF.EmitStoreThroughLValue(RV, LV, T);
2010-02-04 01:33:16 +08:00
} else if (T->isAnyComplexType()) {
CGF.EmitComplexExprIntoAddr(E, LV.getAddress(), false);
2010-02-04 01:33:16 +08:00
} else if (CGF.hasAggregateLLVMType(T)) {
Improve codegen for initializer lists to use memset more aggressively when an initializer is variable (I handled the constant case in a previous patch). This has three pieces: 1. Enhance AggValueSlot to have a 'isZeroed' bit to tell CGExprAgg that the memory being stored into has previously been memset to zero. 2. Teach CGExprAgg to not emit stores of zero to isZeroed memory. 3. Teach CodeGenFunction::EmitAggExpr to scan initializers to determine whether they are profitable to emit a memset + inividual stores vs stores for everything. The heuristic used is that a global has to be more than 16 bytes and has to be 3/4 zero to be candidate for this xform. The two testcases are illustrative of the scenarios this catches. We now codegen test9 into: call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 400, i32 4, i1 false) %.array = getelementptr inbounds [100 x i32]* %Arr, i32 0, i32 0 %tmp = load i32* %X.addr, align 4 store i32 %tmp, i32* %.array and test10 into: call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 392, i32 8, i1 false) %tmp = getelementptr inbounds %struct.b* %S, i32 0, i32 0 %tmp1 = getelementptr inbounds %struct.a* %tmp, i32 0, i32 0 %tmp2 = load i32* %X.addr, align 4 store i32 %tmp2, i32* %tmp1, align 4 %tmp5 = getelementptr inbounds %struct.b* %S, i32 0, i32 3 %tmp10 = getelementptr inbounds %struct.a* %tmp5, i32 0, i32 4 %tmp11 = load i32* %X.addr, align 4 store i32 %tmp11, i32* %tmp10, align 4 Previously we produced 99 stores of zero for test9 and also tons for test10. This xforms should substantially speed up -O0 builds when it kicks in as well as reducing code size and optimizer heartburn on insane cases. This resolves PR279. llvm-svn: 120692
2010-12-02 15:07:26 +08:00
CGF.EmitAggExpr(E, AggValueSlot::forAddr(LV.getAddress(), false, true,
false, Dest.isZeroed()));
} else {
CGF.EmitStoreThroughLValue(RValue::get(CGF.EmitScalarExpr(E)), LV, T);
}
}
void AggExprEmitter::EmitNullInitializationToLValue(LValue LV, QualType T) {
Improve codegen for initializer lists to use memset more aggressively when an initializer is variable (I handled the constant case in a previous patch). This has three pieces: 1. Enhance AggValueSlot to have a 'isZeroed' bit to tell CGExprAgg that the memory being stored into has previously been memset to zero. 2. Teach CGExprAgg to not emit stores of zero to isZeroed memory. 3. Teach CodeGenFunction::EmitAggExpr to scan initializers to determine whether they are profitable to emit a memset + inividual stores vs stores for everything. The heuristic used is that a global has to be more than 16 bytes and has to be 3/4 zero to be candidate for this xform. The two testcases are illustrative of the scenarios this catches. We now codegen test9 into: call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 400, i32 4, i1 false) %.array = getelementptr inbounds [100 x i32]* %Arr, i32 0, i32 0 %tmp = load i32* %X.addr, align 4 store i32 %tmp, i32* %.array and test10 into: call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 392, i32 8, i1 false) %tmp = getelementptr inbounds %struct.b* %S, i32 0, i32 0 %tmp1 = getelementptr inbounds %struct.a* %tmp, i32 0, i32 0 %tmp2 = load i32* %X.addr, align 4 store i32 %tmp2, i32* %tmp1, align 4 %tmp5 = getelementptr inbounds %struct.b* %S, i32 0, i32 3 %tmp10 = getelementptr inbounds %struct.a* %tmp5, i32 0, i32 4 %tmp11 = load i32* %X.addr, align 4 store i32 %tmp11, i32* %tmp10, align 4 Previously we produced 99 stores of zero for test9 and also tons for test10. This xforms should substantially speed up -O0 builds when it kicks in as well as reducing code size and optimizer heartburn on insane cases. This resolves PR279. llvm-svn: 120692
2010-12-02 15:07:26 +08:00
// If the destination slot is already zeroed out before the aggregate is
// copied into it, we don't have to emit any zeros here.
if (Dest.isZeroed() && CGF.getTypes().isZeroInitializable(T))
return;
if (!CGF.hasAggregateLLVMType(T)) {
// For non-aggregates, we can store zero
llvm::Value *Null = llvm::Constant::getNullValue(CGF.ConvertType(T));
CGF.EmitStoreThroughLValue(RValue::get(Null), LV, T);
} else {
// There's a potential optimization opportunity in combining
// memsets; that would be easy for arrays, but relatively
// difficult for structures with the current code.
CGF.EmitNullInitialization(LV.getAddress(), T);
}
}
void AggExprEmitter::VisitInitListExpr(InitListExpr *E) {
#if 0
// FIXME: Assess perf here? Figure out what cases are worth optimizing here
// (Length of globals? Chunks of zeroed-out space?).
//
2009-05-16 15:57:57 +08:00
// If we can, prefer a copy from a global; this is a lot less code for long
// globals, and it's easier for the current optimizers to analyze.
if (llvm::Constant* C = CGF.CGM.EmitConstantExpr(E, E->getType(), &CGF)) {
llvm::GlobalVariable* GV =
new llvm::GlobalVariable(CGF.CGM.getModule(), C->getType(), true,
llvm::GlobalValue::InternalLinkage, C, "");
EmitFinalDestCopy(E, CGF.MakeAddrLValue(GV, E->getType()));
return;
}
#endif
if (E->hadArrayRangeDesignator())
CGF.ErrorUnsupported(E, "GNU array range designator extension");
llvm::Value *DestPtr = Dest.getAddr();
// Handle initialization of an array.
if (E->getType()->isArrayType()) {
const llvm::PointerType *APType =
cast<llvm::PointerType>(DestPtr->getType());
const llvm::ArrayType *AType =
cast<llvm::ArrayType>(APType->getElementType());
uint64_t NumInitElements = E->getNumInits();
if (E->getNumInits() > 0) {
QualType T1 = E->getType();
QualType T2 = E->getInit(0)->getType();
if (CGF.getContext().hasSameUnqualifiedType(T1, T2)) {
EmitAggLoadOfLValue(E->getInit(0));
return;
}
}
uint64_t NumArrayElements = AType->getNumElements();
QualType ElementType = CGF.getContext().getCanonicalType(E->getType());
ElementType = CGF.getContext().getAsArrayType(ElementType)->getElementType();
// FIXME: were we intentionally ignoring address spaces and GC attributes?
for (uint64_t i = 0; i != NumArrayElements; ++i) {
Improve codegen for initializer lists to use memset more aggressively when an initializer is variable (I handled the constant case in a previous patch). This has three pieces: 1. Enhance AggValueSlot to have a 'isZeroed' bit to tell CGExprAgg that the memory being stored into has previously been memset to zero. 2. Teach CGExprAgg to not emit stores of zero to isZeroed memory. 3. Teach CodeGenFunction::EmitAggExpr to scan initializers to determine whether they are profitable to emit a memset + inividual stores vs stores for everything. The heuristic used is that a global has to be more than 16 bytes and has to be 3/4 zero to be candidate for this xform. The two testcases are illustrative of the scenarios this catches. We now codegen test9 into: call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 400, i32 4, i1 false) %.array = getelementptr inbounds [100 x i32]* %Arr, i32 0, i32 0 %tmp = load i32* %X.addr, align 4 store i32 %tmp, i32* %.array and test10 into: call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 392, i32 8, i1 false) %tmp = getelementptr inbounds %struct.b* %S, i32 0, i32 0 %tmp1 = getelementptr inbounds %struct.a* %tmp, i32 0, i32 0 %tmp2 = load i32* %X.addr, align 4 store i32 %tmp2, i32* %tmp1, align 4 %tmp5 = getelementptr inbounds %struct.b* %S, i32 0, i32 3 %tmp10 = getelementptr inbounds %struct.a* %tmp5, i32 0, i32 4 %tmp11 = load i32* %X.addr, align 4 store i32 %tmp11, i32* %tmp10, align 4 Previously we produced 99 stores of zero for test9 and also tons for test10. This xforms should substantially speed up -O0 builds when it kicks in as well as reducing code size and optimizer heartburn on insane cases. This resolves PR279. llvm-svn: 120692
2010-12-02 15:07:26 +08:00
// If we're done emitting initializers and the destination is known-zeroed
// then we're done.
if (i == NumInitElements &&
Dest.isZeroed() &&
CGF.getTypes().isZeroInitializable(ElementType))
break;
llvm::Value *NextVal = Builder.CreateStructGEP(DestPtr, i, ".array");
LValue LV = CGF.MakeAddrLValue(NextVal, ElementType);
Improve codegen for initializer lists to use memset more aggressively when an initializer is variable (I handled the constant case in a previous patch). This has three pieces: 1. Enhance AggValueSlot to have a 'isZeroed' bit to tell CGExprAgg that the memory being stored into has previously been memset to zero. 2. Teach CGExprAgg to not emit stores of zero to isZeroed memory. 3. Teach CodeGenFunction::EmitAggExpr to scan initializers to determine whether they are profitable to emit a memset + inividual stores vs stores for everything. The heuristic used is that a global has to be more than 16 bytes and has to be 3/4 zero to be candidate for this xform. The two testcases are illustrative of the scenarios this catches. We now codegen test9 into: call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 400, i32 4, i1 false) %.array = getelementptr inbounds [100 x i32]* %Arr, i32 0, i32 0 %tmp = load i32* %X.addr, align 4 store i32 %tmp, i32* %.array and test10 into: call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 392, i32 8, i1 false) %tmp = getelementptr inbounds %struct.b* %S, i32 0, i32 0 %tmp1 = getelementptr inbounds %struct.a* %tmp, i32 0, i32 0 %tmp2 = load i32* %X.addr, align 4 store i32 %tmp2, i32* %tmp1, align 4 %tmp5 = getelementptr inbounds %struct.b* %S, i32 0, i32 3 %tmp10 = getelementptr inbounds %struct.a* %tmp5, i32 0, i32 4 %tmp11 = load i32* %X.addr, align 4 store i32 %tmp11, i32* %tmp10, align 4 Previously we produced 99 stores of zero for test9 and also tons for test10. This xforms should substantially speed up -O0 builds when it kicks in as well as reducing code size and optimizer heartburn on insane cases. This resolves PR279. llvm-svn: 120692
2010-12-02 15:07:26 +08:00
if (i < NumInitElements)
EmitInitializationToLValue(E->getInit(i), LV, ElementType);
else
EmitNullInitializationToLValue(LV, ElementType);
Improve codegen for initializer lists to use memset more aggressively when an initializer is variable (I handled the constant case in a previous patch). This has three pieces: 1. Enhance AggValueSlot to have a 'isZeroed' bit to tell CGExprAgg that the memory being stored into has previously been memset to zero. 2. Teach CGExprAgg to not emit stores of zero to isZeroed memory. 3. Teach CodeGenFunction::EmitAggExpr to scan initializers to determine whether they are profitable to emit a memset + inividual stores vs stores for everything. The heuristic used is that a global has to be more than 16 bytes and has to be 3/4 zero to be candidate for this xform. The two testcases are illustrative of the scenarios this catches. We now codegen test9 into: call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 400, i32 4, i1 false) %.array = getelementptr inbounds [100 x i32]* %Arr, i32 0, i32 0 %tmp = load i32* %X.addr, align 4 store i32 %tmp, i32* %.array and test10 into: call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 392, i32 8, i1 false) %tmp = getelementptr inbounds %struct.b* %S, i32 0, i32 0 %tmp1 = getelementptr inbounds %struct.a* %tmp, i32 0, i32 0 %tmp2 = load i32* %X.addr, align 4 store i32 %tmp2, i32* %tmp1, align 4 %tmp5 = getelementptr inbounds %struct.b* %S, i32 0, i32 3 %tmp10 = getelementptr inbounds %struct.a* %tmp5, i32 0, i32 4 %tmp11 = load i32* %X.addr, align 4 store i32 %tmp11, i32* %tmp10, align 4 Previously we produced 99 stores of zero for test9 and also tons for test10. This xforms should substantially speed up -O0 builds when it kicks in as well as reducing code size and optimizer heartburn on insane cases. This resolves PR279. llvm-svn: 120692
2010-12-02 15:07:26 +08:00
// If the GEP didn't get used because of a dead zero init or something
// else, clean it up for -O0 builds and general tidiness.
if (llvm::GetElementPtrInst *GEP =
dyn_cast<llvm::GetElementPtrInst>(NextVal))
if (GEP->use_empty())
GEP->eraseFromParent();
}
return;
}
assert(E->getType()->isRecordType() && "Only support structs/unions here!");
// Do struct initialization; this code just sets each individual member
// to the approprate value. This makes bitfield support automatic;
// the disadvantage is that the generated code is more difficult for
// the optimizer, especially with bitfields.
unsigned NumInitElements = E->getNumInits();
RecordDecl *SD = E->getType()->getAs<RecordType>()->getDecl();
if (E->getType()->isUnionType()) {
// Only initialize one field of a union. The field itself is
// specified by the initializer list.
if (!E->getInitializedFieldInUnion()) {
// Empty union; we have nothing to do.
#ifndef NDEBUG
// Make sure that it's really an empty and not a failure of
// semantic analysis.
for (RecordDecl::field_iterator Field = SD->field_begin(),
FieldEnd = SD->field_end();
Field != FieldEnd; ++Field)
assert(Field->isUnnamedBitfield() && "Only unnamed bitfields allowed");
#endif
return;
}
// FIXME: volatility
FieldDecl *Field = E->getInitializedFieldInUnion();
Improve codegen for initializer lists to use memset more aggressively when an initializer is variable (I handled the constant case in a previous patch). This has three pieces: 1. Enhance AggValueSlot to have a 'isZeroed' bit to tell CGExprAgg that the memory being stored into has previously been memset to zero. 2. Teach CGExprAgg to not emit stores of zero to isZeroed memory. 3. Teach CodeGenFunction::EmitAggExpr to scan initializers to determine whether they are profitable to emit a memset + inividual stores vs stores for everything. The heuristic used is that a global has to be more than 16 bytes and has to be 3/4 zero to be candidate for this xform. The two testcases are illustrative of the scenarios this catches. We now codegen test9 into: call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 400, i32 4, i1 false) %.array = getelementptr inbounds [100 x i32]* %Arr, i32 0, i32 0 %tmp = load i32* %X.addr, align 4 store i32 %tmp, i32* %.array and test10 into: call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 392, i32 8, i1 false) %tmp = getelementptr inbounds %struct.b* %S, i32 0, i32 0 %tmp1 = getelementptr inbounds %struct.a* %tmp, i32 0, i32 0 %tmp2 = load i32* %X.addr, align 4 store i32 %tmp2, i32* %tmp1, align 4 %tmp5 = getelementptr inbounds %struct.b* %S, i32 0, i32 3 %tmp10 = getelementptr inbounds %struct.a* %tmp5, i32 0, i32 4 %tmp11 = load i32* %X.addr, align 4 store i32 %tmp11, i32* %tmp10, align 4 Previously we produced 99 stores of zero for test9 and also tons for test10. This xforms should substantially speed up -O0 builds when it kicks in as well as reducing code size and optimizer heartburn on insane cases. This resolves PR279. llvm-svn: 120692
2010-12-02 15:07:26 +08:00
LValue FieldLoc = CGF.EmitLValueForFieldInitialization(DestPtr, Field, 0);
if (NumInitElements) {
// Store the initializer into the field
2010-02-04 01:33:16 +08:00
EmitInitializationToLValue(E->getInit(0), FieldLoc, Field->getType());
} else {
Improve codegen for initializer lists to use memset more aggressively when an initializer is variable (I handled the constant case in a previous patch). This has three pieces: 1. Enhance AggValueSlot to have a 'isZeroed' bit to tell CGExprAgg that the memory being stored into has previously been memset to zero. 2. Teach CGExprAgg to not emit stores of zero to isZeroed memory. 3. Teach CodeGenFunction::EmitAggExpr to scan initializers to determine whether they are profitable to emit a memset + inividual stores vs stores for everything. The heuristic used is that a global has to be more than 16 bytes and has to be 3/4 zero to be candidate for this xform. The two testcases are illustrative of the scenarios this catches. We now codegen test9 into: call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 400, i32 4, i1 false) %.array = getelementptr inbounds [100 x i32]* %Arr, i32 0, i32 0 %tmp = load i32* %X.addr, align 4 store i32 %tmp, i32* %.array and test10 into: call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 392, i32 8, i1 false) %tmp = getelementptr inbounds %struct.b* %S, i32 0, i32 0 %tmp1 = getelementptr inbounds %struct.a* %tmp, i32 0, i32 0 %tmp2 = load i32* %X.addr, align 4 store i32 %tmp2, i32* %tmp1, align 4 %tmp5 = getelementptr inbounds %struct.b* %S, i32 0, i32 3 %tmp10 = getelementptr inbounds %struct.a* %tmp5, i32 0, i32 4 %tmp11 = load i32* %X.addr, align 4 store i32 %tmp11, i32* %tmp10, align 4 Previously we produced 99 stores of zero for test9 and also tons for test10. This xforms should substantially speed up -O0 builds when it kicks in as well as reducing code size and optimizer heartburn on insane cases. This resolves PR279. llvm-svn: 120692
2010-12-02 15:07:26 +08:00
// Default-initialize to null.
EmitNullInitializationToLValue(FieldLoc, Field->getType());
}
return;
}
// Here we iterate over the fields; this makes it simpler to both
// default-initialize fields and skip over unnamed fields.
unsigned CurInitVal = 0;
for (RecordDecl::field_iterator Field = SD->field_begin(),
FieldEnd = SD->field_end();
Field != FieldEnd; ++Field) {
// We're done once we hit the flexible array member
if (Field->getType()->isIncompleteArrayType())
break;
if (Field->isUnnamedBitfield())
continue;
Improve codegen for initializer lists to use memset more aggressively when an initializer is variable (I handled the constant case in a previous patch). This has three pieces: 1. Enhance AggValueSlot to have a 'isZeroed' bit to tell CGExprAgg that the memory being stored into has previously been memset to zero. 2. Teach CGExprAgg to not emit stores of zero to isZeroed memory. 3. Teach CodeGenFunction::EmitAggExpr to scan initializers to determine whether they are profitable to emit a memset + inividual stores vs stores for everything. The heuristic used is that a global has to be more than 16 bytes and has to be 3/4 zero to be candidate for this xform. The two testcases are illustrative of the scenarios this catches. We now codegen test9 into: call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 400, i32 4, i1 false) %.array = getelementptr inbounds [100 x i32]* %Arr, i32 0, i32 0 %tmp = load i32* %X.addr, align 4 store i32 %tmp, i32* %.array and test10 into: call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 392, i32 8, i1 false) %tmp = getelementptr inbounds %struct.b* %S, i32 0, i32 0 %tmp1 = getelementptr inbounds %struct.a* %tmp, i32 0, i32 0 %tmp2 = load i32* %X.addr, align 4 store i32 %tmp2, i32* %tmp1, align 4 %tmp5 = getelementptr inbounds %struct.b* %S, i32 0, i32 3 %tmp10 = getelementptr inbounds %struct.a* %tmp5, i32 0, i32 4 %tmp11 = load i32* %X.addr, align 4 store i32 %tmp11, i32* %tmp10, align 4 Previously we produced 99 stores of zero for test9 and also tons for test10. This xforms should substantially speed up -O0 builds when it kicks in as well as reducing code size and optimizer heartburn on insane cases. This resolves PR279. llvm-svn: 120692
2010-12-02 15:07:26 +08:00
// Don't emit GEP before a noop store of zero.
if (CurInitVal == NumInitElements && Dest.isZeroed() &&
CGF.getTypes().isZeroInitializable(E->getType()))
break;
// FIXME: volatility
LValue FieldLoc = CGF.EmitLValueForFieldInitialization(DestPtr, *Field, 0);
// We never generate write-barries for initialized fields.
FieldLoc.setNonGC(true);
Improve codegen for initializer lists to use memset more aggressively when an initializer is variable (I handled the constant case in a previous patch). This has three pieces: 1. Enhance AggValueSlot to have a 'isZeroed' bit to tell CGExprAgg that the memory being stored into has previously been memset to zero. 2. Teach CGExprAgg to not emit stores of zero to isZeroed memory. 3. Teach CodeGenFunction::EmitAggExpr to scan initializers to determine whether they are profitable to emit a memset + inividual stores vs stores for everything. The heuristic used is that a global has to be more than 16 bytes and has to be 3/4 zero to be candidate for this xform. The two testcases are illustrative of the scenarios this catches. We now codegen test9 into: call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 400, i32 4, i1 false) %.array = getelementptr inbounds [100 x i32]* %Arr, i32 0, i32 0 %tmp = load i32* %X.addr, align 4 store i32 %tmp, i32* %.array and test10 into: call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 392, i32 8, i1 false) %tmp = getelementptr inbounds %struct.b* %S, i32 0, i32 0 %tmp1 = getelementptr inbounds %struct.a* %tmp, i32 0, i32 0 %tmp2 = load i32* %X.addr, align 4 store i32 %tmp2, i32* %tmp1, align 4 %tmp5 = getelementptr inbounds %struct.b* %S, i32 0, i32 3 %tmp10 = getelementptr inbounds %struct.a* %tmp5, i32 0, i32 4 %tmp11 = load i32* %X.addr, align 4 store i32 %tmp11, i32* %tmp10, align 4 Previously we produced 99 stores of zero for test9 and also tons for test10. This xforms should substantially speed up -O0 builds when it kicks in as well as reducing code size and optimizer heartburn on insane cases. This resolves PR279. llvm-svn: 120692
2010-12-02 15:07:26 +08:00
if (CurInitVal < NumInitElements) {
// Store the initializer into the field.
EmitInitializationToLValue(E->getInit(CurInitVal++), FieldLoc,
2010-02-04 01:33:16 +08:00
Field->getType());
} else {
// We're out of initalizers; default-initialize to null
EmitNullInitializationToLValue(FieldLoc, Field->getType());
}
Improve codegen for initializer lists to use memset more aggressively when an initializer is variable (I handled the constant case in a previous patch). This has three pieces: 1. Enhance AggValueSlot to have a 'isZeroed' bit to tell CGExprAgg that the memory being stored into has previously been memset to zero. 2. Teach CGExprAgg to not emit stores of zero to isZeroed memory. 3. Teach CodeGenFunction::EmitAggExpr to scan initializers to determine whether they are profitable to emit a memset + inividual stores vs stores for everything. The heuristic used is that a global has to be more than 16 bytes and has to be 3/4 zero to be candidate for this xform. The two testcases are illustrative of the scenarios this catches. We now codegen test9 into: call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 400, i32 4, i1 false) %.array = getelementptr inbounds [100 x i32]* %Arr, i32 0, i32 0 %tmp = load i32* %X.addr, align 4 store i32 %tmp, i32* %.array and test10 into: call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 392, i32 8, i1 false) %tmp = getelementptr inbounds %struct.b* %S, i32 0, i32 0 %tmp1 = getelementptr inbounds %struct.a* %tmp, i32 0, i32 0 %tmp2 = load i32* %X.addr, align 4 store i32 %tmp2, i32* %tmp1, align 4 %tmp5 = getelementptr inbounds %struct.b* %S, i32 0, i32 3 %tmp10 = getelementptr inbounds %struct.a* %tmp5, i32 0, i32 4 %tmp11 = load i32* %X.addr, align 4 store i32 %tmp11, i32* %tmp10, align 4 Previously we produced 99 stores of zero for test9 and also tons for test10. This xforms should substantially speed up -O0 builds when it kicks in as well as reducing code size and optimizer heartburn on insane cases. This resolves PR279. llvm-svn: 120692
2010-12-02 15:07:26 +08:00
// If the GEP didn't get used because of a dead zero init or something
// else, clean it up for -O0 builds and general tidiness.
if (FieldLoc.isSimple())
if (llvm::GetElementPtrInst *GEP =
dyn_cast<llvm::GetElementPtrInst>(FieldLoc.getAddress()))
if (GEP->use_empty())
GEP->eraseFromParent();
}
}
//===----------------------------------------------------------------------===//
// Entry Points into this File
//===----------------------------------------------------------------------===//
Improve codegen for initializer lists to use memset more aggressively when an initializer is variable (I handled the constant case in a previous patch). This has three pieces: 1. Enhance AggValueSlot to have a 'isZeroed' bit to tell CGExprAgg that the memory being stored into has previously been memset to zero. 2. Teach CGExprAgg to not emit stores of zero to isZeroed memory. 3. Teach CodeGenFunction::EmitAggExpr to scan initializers to determine whether they are profitable to emit a memset + inividual stores vs stores for everything. The heuristic used is that a global has to be more than 16 bytes and has to be 3/4 zero to be candidate for this xform. The two testcases are illustrative of the scenarios this catches. We now codegen test9 into: call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 400, i32 4, i1 false) %.array = getelementptr inbounds [100 x i32]* %Arr, i32 0, i32 0 %tmp = load i32* %X.addr, align 4 store i32 %tmp, i32* %.array and test10 into: call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 392, i32 8, i1 false) %tmp = getelementptr inbounds %struct.b* %S, i32 0, i32 0 %tmp1 = getelementptr inbounds %struct.a* %tmp, i32 0, i32 0 %tmp2 = load i32* %X.addr, align 4 store i32 %tmp2, i32* %tmp1, align 4 %tmp5 = getelementptr inbounds %struct.b* %S, i32 0, i32 3 %tmp10 = getelementptr inbounds %struct.a* %tmp5, i32 0, i32 4 %tmp11 = load i32* %X.addr, align 4 store i32 %tmp11, i32* %tmp10, align 4 Previously we produced 99 stores of zero for test9 and also tons for test10. This xforms should substantially speed up -O0 builds when it kicks in as well as reducing code size and optimizer heartburn on insane cases. This resolves PR279. llvm-svn: 120692
2010-12-02 15:07:26 +08:00
/// GetNumNonZeroBytesInInit - Get an approximate count of the number of
/// non-zero bytes that will be stored when outputting the initializer for the
/// specified initializer expression.
static uint64_t GetNumNonZeroBytesInInit(const Expr *E, CodeGenFunction &CGF) {
if (const ParenExpr *PE = dyn_cast<ParenExpr>(E))
return GetNumNonZeroBytesInInit(PE->getSubExpr(), CGF);
// 0 and 0.0 won't require any non-zero stores!
if (isSimpleZero(E, CGF)) return 0;
// If this is an initlist expr, sum up the size of sizes of the (present)
// elements. If this is something weird, assume the whole thing is non-zero.
const InitListExpr *ILE = dyn_cast<InitListExpr>(E);
if (ILE == 0 || !CGF.getTypes().isZeroInitializable(ILE->getType()))
return CGF.getContext().getTypeSize(E->getType())/8;
// InitListExprs for structs have to be handled carefully. If there are
// reference members, we need to consider the size of the reference, not the
// referencee. InitListExprs for unions and arrays can't have references.
if (const RecordType *RT = E->getType()->getAs<RecordType>()) {
if (!RT->isUnionType()) {
RecordDecl *SD = E->getType()->getAs<RecordType>()->getDecl();
uint64_t NumNonZeroBytes = 0;
unsigned ILEElement = 0;
for (RecordDecl::field_iterator Field = SD->field_begin(),
FieldEnd = SD->field_end(); Field != FieldEnd; ++Field) {
// We're done once we hit the flexible array member or run out of
// InitListExpr elements.
if (Field->getType()->isIncompleteArrayType() ||
ILEElement == ILE->getNumInits())
break;
if (Field->isUnnamedBitfield())
continue;
const Expr *E = ILE->getInit(ILEElement++);
// Reference values are always non-null and have the width of a pointer.
if (Field->getType()->isReferenceType())
NumNonZeroBytes += CGF.getContext().Target.getPointerWidth(0);
else
NumNonZeroBytes += GetNumNonZeroBytesInInit(E, CGF);
}
return NumNonZeroBytes;
}
}
Improve codegen for initializer lists to use memset more aggressively when an initializer is variable (I handled the constant case in a previous patch). This has three pieces: 1. Enhance AggValueSlot to have a 'isZeroed' bit to tell CGExprAgg that the memory being stored into has previously been memset to zero. 2. Teach CGExprAgg to not emit stores of zero to isZeroed memory. 3. Teach CodeGenFunction::EmitAggExpr to scan initializers to determine whether they are profitable to emit a memset + inividual stores vs stores for everything. The heuristic used is that a global has to be more than 16 bytes and has to be 3/4 zero to be candidate for this xform. The two testcases are illustrative of the scenarios this catches. We now codegen test9 into: call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 400, i32 4, i1 false) %.array = getelementptr inbounds [100 x i32]* %Arr, i32 0, i32 0 %tmp = load i32* %X.addr, align 4 store i32 %tmp, i32* %.array and test10 into: call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 392, i32 8, i1 false) %tmp = getelementptr inbounds %struct.b* %S, i32 0, i32 0 %tmp1 = getelementptr inbounds %struct.a* %tmp, i32 0, i32 0 %tmp2 = load i32* %X.addr, align 4 store i32 %tmp2, i32* %tmp1, align 4 %tmp5 = getelementptr inbounds %struct.b* %S, i32 0, i32 3 %tmp10 = getelementptr inbounds %struct.a* %tmp5, i32 0, i32 4 %tmp11 = load i32* %X.addr, align 4 store i32 %tmp11, i32* %tmp10, align 4 Previously we produced 99 stores of zero for test9 and also tons for test10. This xforms should substantially speed up -O0 builds when it kicks in as well as reducing code size and optimizer heartburn on insane cases. This resolves PR279. llvm-svn: 120692
2010-12-02 15:07:26 +08:00
uint64_t NumNonZeroBytes = 0;
for (unsigned i = 0, e = ILE->getNumInits(); i != e; ++i)
NumNonZeroBytes += GetNumNonZeroBytesInInit(ILE->getInit(i), CGF);
return NumNonZeroBytes;
}
/// CheckAggExprForMemSetUse - If the initializer is large and has a lot of
/// zeros in it, emit a memset and avoid storing the individual zeros.
///
static void CheckAggExprForMemSetUse(AggValueSlot &Slot, const Expr *E,
CodeGenFunction &CGF) {
// If the slot is already known to be zeroed, nothing to do. Don't mess with
// volatile stores.
if (Slot.isZeroed() || Slot.isVolatile() || Slot.getAddr() == 0) return;
// If the type is 16-bytes or smaller, prefer individual stores over memset.
std::pair<uint64_t, unsigned> TypeInfo =
CGF.getContext().getTypeInfo(E->getType());
if (TypeInfo.first/8 <= 16)
return;
// Check to see if over 3/4 of the initializer are known to be zero. If so,
// we prefer to emit memset + individual stores for the rest.
uint64_t NumNonZeroBytes = GetNumNonZeroBytesInInit(E, CGF);
if (NumNonZeroBytes*4 > TypeInfo.first/8)
return;
// Okay, it seems like a good idea to use an initial memset, emit the call.
llvm::Constant *SizeVal = CGF.Builder.getInt64(TypeInfo.first/8);
unsigned Align = TypeInfo.second/8;
Improve codegen for initializer lists to use memset more aggressively when an initializer is variable (I handled the constant case in a previous patch). This has three pieces: 1. Enhance AggValueSlot to have a 'isZeroed' bit to tell CGExprAgg that the memory being stored into has previously been memset to zero. 2. Teach CGExprAgg to not emit stores of zero to isZeroed memory. 3. Teach CodeGenFunction::EmitAggExpr to scan initializers to determine whether they are profitable to emit a memset + inividual stores vs stores for everything. The heuristic used is that a global has to be more than 16 bytes and has to be 3/4 zero to be candidate for this xform. The two testcases are illustrative of the scenarios this catches. We now codegen test9 into: call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 400, i32 4, i1 false) %.array = getelementptr inbounds [100 x i32]* %Arr, i32 0, i32 0 %tmp = load i32* %X.addr, align 4 store i32 %tmp, i32* %.array and test10 into: call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 392, i32 8, i1 false) %tmp = getelementptr inbounds %struct.b* %S, i32 0, i32 0 %tmp1 = getelementptr inbounds %struct.a* %tmp, i32 0, i32 0 %tmp2 = load i32* %X.addr, align 4 store i32 %tmp2, i32* %tmp1, align 4 %tmp5 = getelementptr inbounds %struct.b* %S, i32 0, i32 3 %tmp10 = getelementptr inbounds %struct.a* %tmp5, i32 0, i32 4 %tmp11 = load i32* %X.addr, align 4 store i32 %tmp11, i32* %tmp10, align 4 Previously we produced 99 stores of zero for test9 and also tons for test10. This xforms should substantially speed up -O0 builds when it kicks in as well as reducing code size and optimizer heartburn on insane cases. This resolves PR279. llvm-svn: 120692
2010-12-02 15:07:26 +08:00
llvm::Value *Loc = Slot.getAddr();
const llvm::Type *BP = llvm::Type::getInt8PtrTy(CGF.getLLVMContext());
Loc = CGF.Builder.CreateBitCast(Loc, BP);
CGF.Builder.CreateMemSet(Loc, CGF.Builder.getInt8(0), SizeVal, Align, false);
Improve codegen for initializer lists to use memset more aggressively when an initializer is variable (I handled the constant case in a previous patch). This has three pieces: 1. Enhance AggValueSlot to have a 'isZeroed' bit to tell CGExprAgg that the memory being stored into has previously been memset to zero. 2. Teach CGExprAgg to not emit stores of zero to isZeroed memory. 3. Teach CodeGenFunction::EmitAggExpr to scan initializers to determine whether they are profitable to emit a memset + inividual stores vs stores for everything. The heuristic used is that a global has to be more than 16 bytes and has to be 3/4 zero to be candidate for this xform. The two testcases are illustrative of the scenarios this catches. We now codegen test9 into: call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 400, i32 4, i1 false) %.array = getelementptr inbounds [100 x i32]* %Arr, i32 0, i32 0 %tmp = load i32* %X.addr, align 4 store i32 %tmp, i32* %.array and test10 into: call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 392, i32 8, i1 false) %tmp = getelementptr inbounds %struct.b* %S, i32 0, i32 0 %tmp1 = getelementptr inbounds %struct.a* %tmp, i32 0, i32 0 %tmp2 = load i32* %X.addr, align 4 store i32 %tmp2, i32* %tmp1, align 4 %tmp5 = getelementptr inbounds %struct.b* %S, i32 0, i32 3 %tmp10 = getelementptr inbounds %struct.a* %tmp5, i32 0, i32 4 %tmp11 = load i32* %X.addr, align 4 store i32 %tmp11, i32* %tmp10, align 4 Previously we produced 99 stores of zero for test9 and also tons for test10. This xforms should substantially speed up -O0 builds when it kicks in as well as reducing code size and optimizer heartburn on insane cases. This resolves PR279. llvm-svn: 120692
2010-12-02 15:07:26 +08:00
// Tell the AggExprEmitter that the slot is known zero.
Slot.setZeroed();
}
/// EmitAggExpr - Emit the computation of the specified expression of aggregate
/// type. The result is computed into DestPtr. Note that if DestPtr is null,
/// the value of the aggregate expression is not needed. If VolatileDest is
/// true, DestPtr cannot be 0.
///
/// \param IsInitializer - true if this evaluation is initializing an
/// object whose lifetime is already being managed.
//
// FIXME: Take Qualifiers object.
void CodeGenFunction::EmitAggExpr(const Expr *E, AggValueSlot Slot,
bool IgnoreResult) {
assert(E && hasAggregateLLVMType(E->getType()) &&
"Invalid aggregate expression to emit");
Improve codegen for initializer lists to use memset more aggressively when an initializer is variable (I handled the constant case in a previous patch). This has three pieces: 1. Enhance AggValueSlot to have a 'isZeroed' bit to tell CGExprAgg that the memory being stored into has previously been memset to zero. 2. Teach CGExprAgg to not emit stores of zero to isZeroed memory. 3. Teach CodeGenFunction::EmitAggExpr to scan initializers to determine whether they are profitable to emit a memset + inividual stores vs stores for everything. The heuristic used is that a global has to be more than 16 bytes and has to be 3/4 zero to be candidate for this xform. The two testcases are illustrative of the scenarios this catches. We now codegen test9 into: call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 400, i32 4, i1 false) %.array = getelementptr inbounds [100 x i32]* %Arr, i32 0, i32 0 %tmp = load i32* %X.addr, align 4 store i32 %tmp, i32* %.array and test10 into: call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 392, i32 8, i1 false) %tmp = getelementptr inbounds %struct.b* %S, i32 0, i32 0 %tmp1 = getelementptr inbounds %struct.a* %tmp, i32 0, i32 0 %tmp2 = load i32* %X.addr, align 4 store i32 %tmp2, i32* %tmp1, align 4 %tmp5 = getelementptr inbounds %struct.b* %S, i32 0, i32 3 %tmp10 = getelementptr inbounds %struct.a* %tmp5, i32 0, i32 4 %tmp11 = load i32* %X.addr, align 4 store i32 %tmp11, i32* %tmp10, align 4 Previously we produced 99 stores of zero for test9 and also tons for test10. This xforms should substantially speed up -O0 builds when it kicks in as well as reducing code size and optimizer heartburn on insane cases. This resolves PR279. llvm-svn: 120692
2010-12-02 15:07:26 +08:00
assert((Slot.getAddr() != 0 || Slot.isIgnored()) &&
"slot has bits but no address");
Improve codegen for initializer lists to use memset more aggressively when an initializer is variable (I handled the constant case in a previous patch). This has three pieces: 1. Enhance AggValueSlot to have a 'isZeroed' bit to tell CGExprAgg that the memory being stored into has previously been memset to zero. 2. Teach CGExprAgg to not emit stores of zero to isZeroed memory. 3. Teach CodeGenFunction::EmitAggExpr to scan initializers to determine whether they are profitable to emit a memset + inividual stores vs stores for everything. The heuristic used is that a global has to be more than 16 bytes and has to be 3/4 zero to be candidate for this xform. The two testcases are illustrative of the scenarios this catches. We now codegen test9 into: call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 400, i32 4, i1 false) %.array = getelementptr inbounds [100 x i32]* %Arr, i32 0, i32 0 %tmp = load i32* %X.addr, align 4 store i32 %tmp, i32* %.array and test10 into: call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 392, i32 8, i1 false) %tmp = getelementptr inbounds %struct.b* %S, i32 0, i32 0 %tmp1 = getelementptr inbounds %struct.a* %tmp, i32 0, i32 0 %tmp2 = load i32* %X.addr, align 4 store i32 %tmp2, i32* %tmp1, align 4 %tmp5 = getelementptr inbounds %struct.b* %S, i32 0, i32 3 %tmp10 = getelementptr inbounds %struct.a* %tmp5, i32 0, i32 4 %tmp11 = load i32* %X.addr, align 4 store i32 %tmp11, i32* %tmp10, align 4 Previously we produced 99 stores of zero for test9 and also tons for test10. This xforms should substantially speed up -O0 builds when it kicks in as well as reducing code size and optimizer heartburn on insane cases. This resolves PR279. llvm-svn: 120692
2010-12-02 15:07:26 +08:00
// Optimize the slot if possible.
CheckAggExprForMemSetUse(Slot, E, *this);
AggExprEmitter(*this, Slot, IgnoreResult).Visit(const_cast<Expr*>(E));
}
LValue CodeGenFunction::EmitAggExprToLValue(const Expr *E) {
assert(hasAggregateLLVMType(E->getType()) && "Invalid argument!");
llvm::Value *Temp = CreateMemTemp(E->getType());
LValue LV = MakeAddrLValue(Temp, E->getType());
Improve codegen for initializer lists to use memset more aggressively when an initializer is variable (I handled the constant case in a previous patch). This has three pieces: 1. Enhance AggValueSlot to have a 'isZeroed' bit to tell CGExprAgg that the memory being stored into has previously been memset to zero. 2. Teach CGExprAgg to not emit stores of zero to isZeroed memory. 3. Teach CodeGenFunction::EmitAggExpr to scan initializers to determine whether they are profitable to emit a memset + inividual stores vs stores for everything. The heuristic used is that a global has to be more than 16 bytes and has to be 3/4 zero to be candidate for this xform. The two testcases are illustrative of the scenarios this catches. We now codegen test9 into: call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 400, i32 4, i1 false) %.array = getelementptr inbounds [100 x i32]* %Arr, i32 0, i32 0 %tmp = load i32* %X.addr, align 4 store i32 %tmp, i32* %.array and test10 into: call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 392, i32 8, i1 false) %tmp = getelementptr inbounds %struct.b* %S, i32 0, i32 0 %tmp1 = getelementptr inbounds %struct.a* %tmp, i32 0, i32 0 %tmp2 = load i32* %X.addr, align 4 store i32 %tmp2, i32* %tmp1, align 4 %tmp5 = getelementptr inbounds %struct.b* %S, i32 0, i32 3 %tmp10 = getelementptr inbounds %struct.a* %tmp5, i32 0, i32 4 %tmp11 = load i32* %X.addr, align 4 store i32 %tmp11, i32* %tmp10, align 4 Previously we produced 99 stores of zero for test9 and also tons for test10. This xforms should substantially speed up -O0 builds when it kicks in as well as reducing code size and optimizer heartburn on insane cases. This resolves PR279. llvm-svn: 120692
2010-12-02 15:07:26 +08:00
EmitAggExpr(E, AggValueSlot::forAddr(Temp, LV.isVolatileQualified(), false));
return LV;
}
void CodeGenFunction::EmitAggregateCopy(llvm::Value *DestPtr,
llvm::Value *SrcPtr, QualType Ty,
bool isVolatile) {
assert(!Ty->isAnyComplexType() && "Shouldn't happen for complex");
if (getContext().getLangOptions().CPlusPlus) {
if (const RecordType *RT = Ty->getAs<RecordType>()) {
CXXRecordDecl *Record = cast<CXXRecordDecl>(RT->getDecl());
assert((Record->hasTrivialCopyConstructor() ||
Record->hasTrivialCopyAssignment()) &&
"Trying to aggregate-copy a type without a trivial copy "
"constructor or assignment operator");
2010-05-20 23:48:29 +08:00
// Ignore empty classes in C++.
if (Record->isEmpty())
return;
}
}
// Aggregate assignment turns into llvm.memcpy. This is almost valid per
// C99 6.5.16.1p3, which states "If the value being stored in an object is
// read from another object that overlaps in anyway the storage of the first
// object, then the overlap shall be exact and the two objects shall have
// qualified or unqualified versions of a compatible type."
//
// memcpy is not defined if the source and destination pointers are exactly
// equal, but other compilers do this optimization, and almost every memcpy
// implementation handles this case safely. If there is a libc that does not
// safely handle this, we can add a target hook.
// Get size and alignment info for this aggregate.
std::pair<uint64_t, unsigned> TypeInfo = getContext().getTypeInfo(Ty);
// FIXME: Handle variable sized types.
// FIXME: If we have a volatile struct, the optimizer can remove what might
// appear to be `extra' memory ops:
//
// volatile struct { int i; } a, b;
//
// int main() {
// a = b;
// a = b;
// }
//
// we need to use a different call here. We use isVolatile to indicate when
// either the source or the destination is volatile.
const llvm::PointerType *DPT = cast<llvm::PointerType>(DestPtr->getType());
const llvm::Type *DBP =
llvm::Type::getInt8PtrTy(VMContext, DPT->getAddressSpace());
DestPtr = Builder.CreateBitCast(DestPtr, DBP, "tmp");
const llvm::PointerType *SPT = cast<llvm::PointerType>(SrcPtr->getType());
const llvm::Type *SBP =
llvm::Type::getInt8PtrTy(VMContext, SPT->getAddressSpace());
SrcPtr = Builder.CreateBitCast(SrcPtr, SBP, "tmp");
if (const RecordType *RecordTy = Ty->getAs<RecordType>()) {
RecordDecl *Record = RecordTy->getDecl();
if (Record->hasObjectMember()) {
unsigned long size = TypeInfo.first/8;
const llvm::Type *SizeTy = ConvertType(getContext().getSizeType());
llvm::Value *SizeVal = llvm::ConstantInt::get(SizeTy, size);
CGM.getObjCRuntime().EmitGCMemmoveCollectable(*this, DestPtr, SrcPtr,
SizeVal);
return;
}
} else if (getContext().getAsArrayType(Ty)) {
QualType BaseType = getContext().getBaseElementType(Ty);
if (const RecordType *RecordTy = BaseType->getAs<RecordType>()) {
if (RecordTy->getDecl()->hasObjectMember()) {
unsigned long size = TypeInfo.first/8;
const llvm::Type *SizeTy = ConvertType(getContext().getSizeType());
llvm::Value *SizeVal = llvm::ConstantInt::get(SizeTy, size);
CGM.getObjCRuntime().EmitGCMemmoveCollectable(*this, DestPtr, SrcPtr,
SizeVal);
return;
}
}
}
Builder.CreateMemCpy(DestPtr, SrcPtr,
llvm::ConstantInt::get(IntPtrTy, TypeInfo.first/8),
TypeInfo.second/8, isVolatile);
}