diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 5392c9ff6cfe..eb1c7399fe05 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -12772,6 +12772,76 @@ Examples %res = call i4 @llvm.usub.sat.i4(i4 2, i4 6) ; %res = 0 +Fixed Point Arithmetic Intrinsics +--------------------------------- + +A fixed point number represents a real data type for a number that has a fixed +number of digits after a radix point (equivalent to the decimal point '.'). +The number of digits after the radix point is referred as the ``scale``. These +are useful for representing fractional values to a specific precision. The +following intrinsics perform fixed point arithmetic operations on 2 operands +of the same scale, specified as the third argument. + + +'``llvm.smul.fix.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax +""""""" + +This is an overloaded intrinsic. You can use ``llvm.smul.fix`` +on any integer bit width or vectors of integers. + +:: + + declare i16 @llvm.smul.fix.i16(i16 %a, i16 %b, i32 %scale) + declare i32 @llvm.smul.fix.i32(i32 %a, i32 %b, i32 %scale) + declare i64 @llvm.smul.fix.i64(i64 %a, i64 %b, i32 %scale) + declare <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> %a, <4 x i32> %b, i32 %scale) + +Overview +""""""""" + +The '``llvm.smul.fix``' family of intrinsic functions perform signed +fixed point multiplication on 2 arguments of the same scale. + +Arguments +"""""""""" + +The arguments (%a and %b) and the result may be of integer types of any bit +width, but they must have the same bit width. ``%a`` and ``%b`` are the two +values that will undergo signed fixed point multiplication. The argument +``%scale`` represents the scale of both operands, and must be a constant +integer. + +Semantics: +"""""""""" + +This operation performs fixed point multiplication on the 2 arguments of a +specified scale. The result will also be returned in the same scale specified +in the third argument. + +If the result value cannot be precisely represented in the given scale, the +value is rounded up or down to the closest representable value. The rounding +direction is unspecified. + +It is undefined behavior if the source value does not fit within the range of +the fixed point type. + + +Examples +""""""""" + +.. code-block:: llvm + + %res = call i4 @llvm.smul.fix.i4(i4 3, i4 2, i32 0) ; %res = 6 (2 x 3 = 6) + %res = call i4 @llvm.smul.fix.i4(i4 3, i4 2, i32 1) ; %res = 3 (1.5 x 1 = 1.5) + %res = call i4 @llvm.smul.fix.i4(i4 3, i4 -2, i32 1) ; %res = -3 (1.5 x -1 = -1.5) + + ; The result in the following could be rounded up to -2 or down to -2.5 + %res = call i4 @llvm.smul.fix.i4(i4 3, i4 -3, i32 1) ; %res = -5 (or -4) (1.5 x -1.5 = -2.25) + + Specialised Arithmetic Intrinsics --------------------------------- diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h index 544f574f80a3..9c918ae1104f 100644 --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -272,6 +272,13 @@ namespace ISD { /// resulting value is this minimum value. SSUBSAT, USUBSAT, + /// RESULT = SMULFIX(LHS, RHS, SCALE) - Perform fixed point multiplication on + /// 2 integers with the same width and scale. SCALE represents the scale of + /// both operands as fixed point numbers. This SCALE parameter must be a + /// constant integer. A scale of zero is effectively performing + /// multiplication on 2 integers. + SMULFIX, + /// Simple binary floating point operators. FADD, FSUB, FMUL, FDIV, FREM, diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index bfd8f58bd79e..03791e13e9b9 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -805,6 +805,38 @@ public: return OpActions[(unsigned)VT.getSimpleVT().SimpleTy][Op]; } + /// Custom method defined by each target to indicate if an operation which + /// may require a scale is supported natively by the target. + /// If not, the operation is illegal. + virtual bool isSupportedFixedPointOperation(unsigned Op, EVT VT, + unsigned Scale) const { + return false; + } + + /// Some fixed point operations may be natively supported by the target but + /// only for specific scales. This method allows for checking + /// if the width is supported by the target for a given operation that may + /// depend on scale. + LegalizeAction getFixedPointOperationAction(unsigned Op, EVT VT, + unsigned Scale) const { + auto Action = getOperationAction(Op, VT); + if (Action != Legal) + return Action; + + // This operation is supported in this type but may only work on specific + // scales. + bool Supported; + switch (Op) { + default: + llvm_unreachable("Unexpected fixed point operation."); + case ISD::SMULFIX: + Supported = isSupportedFixedPointOperation(Op, VT, Scale); + break; + } + + return Supported ? Action : Expand; + } + LegalizeAction getStrictFPOperationAction(unsigned Op, EVT VT) const { unsigned EqOpc; switch (Op) { @@ -3775,10 +3807,15 @@ public: SDValue Index) const; /// Method for building the DAG expansion of ISD::[US][ADD|SUB]SAT. This - /// method accepts integers or vectors of integers as its arguments. + /// method accepts integers as its arguments. SDValue getExpandedSaturationAdditionSubtraction(SDNode *Node, SelectionDAG &DAG) const; + /// Method for building the DAG expansion of ISD::SMULFIX. This method accepts + /// integers as its arguments. + SDValue getExpandedFixedPointMultiplication(SDNode *Node, + SelectionDAG &DAG) const; + //===--------------------------------------------------------------------===// // Instruction Emitting Hooks // diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 4ed46e30a3b5..f503d3ebbdfe 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -811,7 +811,7 @@ def int_umul_with_overflow : Intrinsic<[llvm_anyint_ty, llvm_i1_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]>; -//===------------------------- Fixed Point Intrinsics ---------------------===// +//===------------------------- Saturation Arithmetic Intrinsics ---------------------===// // def int_sadd_sat : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], @@ -826,6 +826,12 @@ def int_usub_sat : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]>; +//===------------------------- Fixed Point Arithmetic Intrinsics ---------------------===// +// +def int_smul_fix : Intrinsic<[llvm_anyint_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty], + [IntrNoMem, IntrSpeculatable, Commutative]>; + //===------------------------- Memory Use Markers -------------------------===// // def int_lifetime_start : Intrinsic<[], diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td index 9f94f72a7e26..eb5a14bd21b8 100644 --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -125,6 +125,9 @@ def SDTIntSatNoShOp : SDTypeProfile<1, 2, [ // ssat with no shift def SDTIntBinHiLoOp : SDTypeProfile<2, 2, [ // mulhi, mullo, sdivrem, udivrem SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>,SDTCisInt<0> ]>; +def SDTIntScaledBinOp : SDTypeProfile<1, 3, [ // smulfix + SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>, SDTCisInt<3> +]>; def SDTFPBinOp : SDTypeProfile<1, 2, [ // fadd, fmul, etc. SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisFP<0> @@ -382,6 +385,7 @@ def saddsat : SDNode<"ISD::SADDSAT" , SDTIntBinOp, [SDNPCommutative]>; def uaddsat : SDNode<"ISD::UADDSAT" , SDTIntBinOp, [SDNPCommutative]>; def ssubsat : SDNode<"ISD::SSUBSAT" , SDTIntBinOp>; def usubsat : SDNode<"ISD::USUBSAT" , SDTIntBinOp>; +def smulfix : SDNode<"ISD::SMULFIX" , SDTIntScaledBinOp, [SDNPCommutative]>; def sext_inreg : SDNode<"ISD::SIGN_EXTEND_INREG", SDTExtInreg>; def sext_invec : SDNode<"ISD::SIGN_EXTEND_VECTOR_INREG", SDTExtInvec>; diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 2f4a5451a611..f1c6d63982e5 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -1128,6 +1128,12 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0)); break; } + case ISD::SMULFIX: { + unsigned Scale = Node->getConstantOperandVal(2); + Action = TLI.getFixedPointOperationAction(Node->getOpcode(), + Node->getValueType(0), Scale); + break; + } case ISD::MSCATTER: Action = TLI.getOperationAction(Node->getOpcode(), cast(Node)->getValue().getValueType()); @@ -3276,6 +3282,10 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { Results.push_back(TLI.getExpandedSaturationAdditionSubtraction(Node, DAG)); break; } + case ISD::SMULFIX: { + Results.push_back(TLI.getExpandedFixedPointMultiplication(Node, DAG)); + break; + } case ISD::SADDO: case ISD::SSUBO: { SDValue LHS = Node->getOperand(0); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 265f67b76597..96d1c3d75e4e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -147,6 +147,7 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { case ISD::UADDSAT: case ISD::SSUBSAT: case ISD::USUBSAT: Res = PromoteIntRes_ADDSUBSAT(N); break; + case ISD::SMULFIX: Res = PromoteIntRes_SMULFIX(N); break; case ISD::ATOMIC_LOAD: Res = PromoteIntRes_Atomic0(cast(N)); break; @@ -625,6 +626,16 @@ SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBSAT(SDNode *N) { return DAG.getNode(ShiftOp, dl, PromotedType, Result, ShiftAmount); } +SDValue DAGTypeLegalizer::PromoteIntRes_SMULFIX(SDNode *N) { + // Can just promote the operands then continue with operation. + SDLoc dl(N); + SDValue Op1Promoted = SExtPromotedInteger(N->getOperand(0)); + SDValue Op2Promoted = SExtPromotedInteger(N->getOperand(1)); + EVT PromotedType = Op1Promoted.getValueType(); + return DAG.getNode(N->getOpcode(), dl, PromotedType, Op1Promoted, Op2Promoted, + N->getOperand(2)); +} + SDValue DAGTypeLegalizer::PromoteIntRes_SADDSUBO(SDNode *N, unsigned ResNo) { if (ResNo == 1) return PromoteIntRes_Overflow(N); @@ -1056,6 +1067,8 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { case ISD::RETURNADDR: Res = PromoteIntOp_FRAMERETURNADDR(N); break; case ISD::PREFETCH: Res = PromoteIntOp_PREFETCH(N, OpNo); break; + + case ISD::SMULFIX: Res = PromoteIntOp_SMULFIX(N); break; } // If the result is null, the sub-method took care of registering results etc. @@ -1415,6 +1428,12 @@ SDValue DAGTypeLegalizer::PromoteIntOp_ADDSUBCARRY(SDNode *N, unsigned OpNo) { return SDValue(DAG.UpdateNodeOperands(N, LHS, RHS, Carry), 0); } +SDValue DAGTypeLegalizer::PromoteIntOp_SMULFIX(SDNode *N) { + SDValue Op2 = ZExtPromotedInteger(N->getOperand(2)); + return SDValue( + DAG.UpdateNodeOperands(N, N->getOperand(0), N->getOperand(1), Op2), 0); +} + SDValue DAGTypeLegalizer::PromoteIntOp_FRAMERETURNADDR(SDNode *N) { // Promote the RETURNADDR/FRAMEADDR argument to a supported integer width. SDValue Op = ZExtPromotedInteger(N->getOperand(0)); @@ -1571,6 +1590,7 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) { case ISD::UADDSAT: case ISD::SSUBSAT: case ISD::USUBSAT: ExpandIntRes_ADDSUBSAT(N, Lo, Hi); break; + case ISD::SMULFIX: ExpandIntRes_SMULFIX(N, Lo, Hi); break; } // If Lo/Hi is null, the sub-method took care of registering results etc. @@ -2539,6 +2559,95 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUBSAT(SDNode *N, SDValue &Lo, SplitInteger(Result, Lo, Hi); } +void DAGTypeLegalizer::ExpandIntRes_SMULFIX(SDNode *N, SDValue &Lo, + SDValue &Hi) { + SDLoc dl(N); + EVT VT = N->getValueType(0); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + uint64_t Scale = N->getConstantOperandVal(2); + if (!Scale) { + SDValue Result = DAG.getNode(ISD::MUL, dl, VT, LHS, RHS); + SplitInteger(Result, Lo, Hi); + return; + } + + EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); + SDValue LL, LH, RL, RH; + GetExpandedInteger(LHS, LL, LH); + GetExpandedInteger(RHS, RL, RH); + SmallVector Result; + + if (!TLI.expandMUL_LOHI(ISD::SMUL_LOHI, VT, dl, LHS, RHS, Result, NVT, DAG, + TargetLowering::MulExpansionKind::OnlyLegalOrCustom, + LL, LH, RL, RH)) { + report_fatal_error("Unable to expand SMUL_FIX using SMUL_LOHI."); + return; + } + + unsigned VTSize = VT.getScalarSizeInBits(); + unsigned NVTSize = NVT.getScalarSizeInBits(); + EVT ShiftTy = TLI.getShiftAmountTy(NVT, DAG.getDataLayout()); + + // Shift whole amount by scale. + SDValue ResultLL = Result[0]; + SDValue ResultLH = Result[1]; + SDValue ResultHL = Result[2]; + SDValue ResultHH = Result[3]; + + // After getting the multplication result in 4 parts, we need to perform a + // shift right by the amount of the scale to get the result in that scale. + // Let's say we multiply 2 64 bit numbers. The resulting value can be held in + // 128 bits that are cut into 4 32-bit parts: + // + // HH HL LH LL + // |---32---|---32---|---32---|---32---| + // 128 96 64 32 0 + // + // |------VTSize-----| + // + // |NVTSize-| + // + // The resulting Lo and Hi will only need to be one of these 32-bit parts + // after shifting. + if (Scale < NVTSize) { + // If the scale is less than the size of the VT we expand to, the Hi and + // Lo of the result will be in the first 2 parts of the result after + // shifting right. This only requires shifting by the scale as far as the + // third part in the result (ResultHL). + SDValue SRLAmnt = DAG.getConstant(Scale, dl, ShiftTy); + SDValue SHLAmnt = DAG.getConstant(NVTSize - Scale, dl, ShiftTy); + Lo = DAG.getNode(ISD::SRL, dl, NVT, ResultLL, SRLAmnt); + Lo = DAG.getNode(ISD::OR, dl, NVT, Lo, + DAG.getNode(ISD::SHL, dl, NVT, ResultLH, SHLAmnt)); + Hi = DAG.getNode(ISD::SRL, dl, NVT, ResultLH, SRLAmnt); + Hi = DAG.getNode(ISD::OR, dl, NVT, Hi, + DAG.getNode(ISD::SHL, dl, NVT, ResultHL, SHLAmnt)); + } else if (Scale == NVTSize) { + // If the scales are equal, Lo and Hi are ResultLH and Result HL, + // respectively. Avoid shifting to prevent undefined behavior. + Lo = ResultLH; + Hi = ResultHL; + } else if (Scale < VTSize) { + // If the scale is instead less than the old VT size, but greater than or + // equal to the expanded VT size, the first part of the result (ResultLL) is + // no longer a part of Lo because it would be scaled out anyway. Instead we + // can start shifting right from the fourth part (ResultHH) to the second + // part (ResultLH), and Result LH will be the new Lo. + SDValue SRLAmnt = DAG.getConstant(Scale - NVTSize, dl, ShiftTy); + SDValue SHLAmnt = DAG.getConstant(VTSize - Scale, dl, ShiftTy); + Lo = DAG.getNode(ISD::SRL, dl, NVT, ResultLH, SRLAmnt); + Lo = DAG.getNode(ISD::OR, dl, NVT, Lo, + DAG.getNode(ISD::SHL, dl, NVT, ResultHL, SHLAmnt)); + Hi = DAG.getNode(ISD::SRL, dl, NVT, ResultHL, SRLAmnt); + Hi = DAG.getNode(ISD::OR, dl, NVT, Hi, + DAG.getNode(ISD::SHL, dl, NVT, ResultHH, SHLAmnt)); + } else { + llvm_unreachable( + "Expected the scale to be less than the width of the operands"); + } +} + void DAGTypeLegalizer::ExpandIntRes_SADDSUBO(SDNode *Node, SDValue &Lo, SDValue &Hi) { SDValue LHS = Node->getOperand(0); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 38a23729a1d6..032000f6cb79 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -345,6 +345,7 @@ private: SDValue PromoteIntRes_VAARG(SDNode *N); SDValue PromoteIntRes_XMULO(SDNode *N, unsigned ResNo); SDValue PromoteIntRes_ADDSUBSAT(SDNode *N); + SDValue PromoteIntRes_SMULFIX(SDNode *N); SDValue PromoteIntRes_FLT_ROUNDS(SDNode *N); // Integer Operand Promotion. @@ -378,6 +379,7 @@ private: SDValue PromoteIntOp_ADDSUBCARRY(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_FRAMERETURNADDR(SDNode *N); SDValue PromoteIntOp_PREFETCH(SDNode *N, unsigned OpNo); + SDValue PromoteIntOp_SMULFIX(SDNode *N); void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code); @@ -433,6 +435,7 @@ private: void ExpandIntRes_UADDSUBO (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_XMULO (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_ADDSUBSAT (SDNode *N, SDValue &Lo, SDValue &Hi); + void ExpandIntRes_SMULFIX (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_ATOMIC_LOAD (SDNode *N, SDValue &Lo, SDValue &Hi); @@ -688,6 +691,8 @@ private: SDValue ScalarizeVecRes_UNDEF(SDNode *N); SDValue ScalarizeVecRes_VECTOR_SHUFFLE(SDNode *N); + SDValue ScalarizeVecRes_SMULFIX(SDNode *N); + // Vector Operand Scalarization: <1 x ty> -> ty. bool ScalarizeVectorOperand(SDNode *N, unsigned OpNo); SDValue ScalarizeVecOp_BITCAST(SDNode *N); @@ -723,6 +728,8 @@ private: void SplitVecRes_ExtVecInRegOp(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_StrictFPOp(SDNode *N, SDValue &Lo, SDValue &Hi); + void SplitVecRes_SMULFIX(SDNode *N, SDValue &Lo, SDValue &Hi); + void SplitVecRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_CONCAT_VECTORS(SDNode *N, SDValue &Lo, SDValue &Hi); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 4229e577a3d4..b8066c1c53b9 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -414,6 +414,12 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::USUBSAT: Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0)); break; + case ISD::SMULFIX: { + unsigned Scale = Node->getConstantOperandVal(2); + Action = TLI.getFixedPointOperationAction(Node->getOpcode(), + Node->getValueType(0), Scale); + break; + } case ISD::FP_ROUND_INREG: Action = TLI.getOperationAction(Node->getOpcode(), cast(Node->getOperand(1))->getVT()); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 2809fcaff4f5..a40618b0ed23 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -172,6 +172,9 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { case ISD::STRICT_FTRUNC: R = ScalarizeVecRes_StrictFPOp(N); break; + case ISD::SMULFIX: + R = ScalarizeVecRes_SMULFIX(N); + break; } // If R is null, the sub-method took care of registering the result. @@ -194,6 +197,14 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_TernaryOp(SDNode *N) { Op0.getValueType(), Op0, Op1, Op2); } +SDValue DAGTypeLegalizer::ScalarizeVecRes_SMULFIX(SDNode *N) { + SDValue Op0 = GetScalarizedVector(N->getOperand(0)); + SDValue Op1 = GetScalarizedVector(N->getOperand(1)); + SDValue Op2 = N->getOperand(2); + return DAG.getNode(N->getOpcode(), SDLoc(N), Op0.getValueType(), Op0, Op1, + Op2); +} + SDValue DAGTypeLegalizer::ScalarizeVecRes_StrictFPOp(SDNode *N) { EVT VT = N->getValueType(0).getVectorElementType(); unsigned NumOpers = N->getNumOperands(); @@ -848,6 +859,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::STRICT_FTRUNC: SplitVecRes_StrictFPOp(N, Lo, Hi); break; + case ISD::SMULFIX: + SplitVecRes_SMULFIX(N, Lo, Hi); + break; } // If Lo/Hi is null, the sub-method took care of registering results etc. @@ -885,6 +899,20 @@ void DAGTypeLegalizer::SplitVecRes_TernaryOp(SDNode *N, SDValue &Lo, Op0Hi, Op1Hi, Op2Hi); } +void DAGTypeLegalizer::SplitVecRes_SMULFIX(SDNode *N, SDValue &Lo, + SDValue &Hi) { + SDValue LHSLo, LHSHi; + GetSplitVector(N->getOperand(0), LHSLo, LHSHi); + SDValue RHSLo, RHSHi; + GetSplitVector(N->getOperand(1), RHSLo, RHSHi); + SDLoc dl(N); + SDValue Op2 = N->getOperand(2); + + unsigned Opcode = N->getOpcode(); + Lo = DAG.getNode(Opcode, dl, LHSLo.getValueType(), LHSLo, RHSLo, Op2); + Hi = DAG.getNode(Opcode, dl, LHSHi.getValueType(), LHSHi, RHSHi, Op2); +} + void DAGTypeLegalizer::SplitVecRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) { // We know the result is a vector. The input may be either a vector or a diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 0de149a43684..cf06c1f0b4b4 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -5832,6 +5832,14 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { setValue(&I, DAG.getNode(ISD::USUBSAT, sdl, Op1.getValueType(), Op1, Op2)); return nullptr; } + case Intrinsic::smul_fix: { + SDValue Op1 = getValue(I.getArgOperand(0)); + SDValue Op2 = getValue(I.getArgOperand(1)); + SDValue Op3 = getValue(I.getArgOperand(2)); + setValue(&I, + DAG.getNode(ISD::SMULFIX, sdl, Op1.getValueType(), Op1, Op2, Op3)); + return nullptr; + } case Intrinsic::stacksave: { SDValue Op = getRoot(); Res = DAG.getNode( diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index bf81d0d267a1..43df2abb674b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -297,6 +297,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::UADDSAT: return "uaddsat"; case ISD::SSUBSAT: return "ssubsat"; case ISD::USUBSAT: return "usubsat"; + case ISD::SMULFIX: return "smulfix"; // Conversion operators. case ISD::SIGN_EXTEND: return "sign_extend"; diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 6d0d392b8f42..fa73684399b6 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -4089,8 +4089,17 @@ bool TargetLowering::expandMUL_LOHI(unsigned Opcode, EVT VT, SDLoc dl, if (!MakeMUL_LOHI(LH, RL, Lo, Hi, false)) return false; - Next = DAG.getNode(ISD::ADDC, dl, DAG.getVTList(VT, MVT::Glue), Next, - Merge(Lo, Hi)); + SDValue Zero = DAG.getConstant(0, dl, HiLoVT); + EVT BoolType = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); + + bool UseGlue = (isOperationLegalOrCustom(ISD::ADDC, VT) && + isOperationLegalOrCustom(ISD::ADDE, VT)); + if (UseGlue) + Next = DAG.getNode(ISD::ADDC, dl, DAG.getVTList(VT, MVT::Glue), Next, + Merge(Lo, Hi)); + else + Next = DAG.getNode(ISD::ADDCARRY, dl, DAG.getVTList(VT, BoolType), Next, + Merge(Lo, Hi), DAG.getConstant(0, dl, BoolType)); SDValue Carry = Next.getValue(1); Result.push_back(DAG.getNode(ISD::TRUNCATE, dl, HiLoVT, Next)); @@ -4099,9 +4108,13 @@ bool TargetLowering::expandMUL_LOHI(unsigned Opcode, EVT VT, SDLoc dl, if (!MakeMUL_LOHI(LH, RH, Lo, Hi, Opcode == ISD::SMUL_LOHI)) return false; - SDValue Zero = DAG.getConstant(0, dl, HiLoVT); - Hi = DAG.getNode(ISD::ADDE, dl, DAG.getVTList(HiLoVT, MVT::Glue), Hi, Zero, - Carry); + if (UseGlue) + Hi = DAG.getNode(ISD::ADDE, dl, DAG.getVTList(HiLoVT, MVT::Glue), Hi, Zero, + Carry); + else + Hi = DAG.getNode(ISD::ADDCARRY, dl, DAG.getVTList(HiLoVT, BoolType), Hi, + Zero, Carry); + Next = DAG.getNode(ISD::ADD, dl, VT, Next, Merge(Lo, Hi)); if (Opcode == ISD::SMUL_LOHI) { @@ -5198,3 +5211,55 @@ SDValue TargetLowering::getExpandedSaturationAdditionSubtraction( return DAG.getSelect(dl, ResultType, Overflow, Result, SumDiff); } } + +SDValue +TargetLowering::getExpandedFixedPointMultiplication(SDNode *Node, + SelectionDAG &DAG) const { + assert(Node->getOpcode() == ISD::SMULFIX && "Expected opcode to be SMULFIX."); + assert(Node->getNumOperands() == 3 && + "Expected signed fixed point multiplication to have 3 operands."); + + SDLoc dl(Node); + SDValue LHS = Node->getOperand(0); + SDValue RHS = Node->getOperand(1); + assert(LHS.getValueType().isScalarInteger() && + "Expected operands to be integers. Vector of int arguments should " + "already be unrolled."); + assert(RHS.getValueType().isScalarInteger() && + "Expected operands to be integers. Vector of int arguments should " + "already be unrolled."); + assert(LHS.getValueType() == RHS.getValueType() && + "Expected both operands to be the same type"); + + unsigned Scale = Node->getConstantOperandVal(2); + EVT VT = LHS.getValueType(); + assert(Scale < VT.getScalarSizeInBits() && + "Expected scale to be less than the number of bits."); + + if (!Scale) + return DAG.getNode(ISD::MUL, dl, VT, LHS, RHS); + + // Get the upper and lower bits of the result. + SDValue Lo, Hi; + if (isOperationLegalOrCustom(ISD::SMUL_LOHI, VT)) { + SDValue Result = + DAG.getNode(ISD::SMUL_LOHI, dl, DAG.getVTList(VT, VT), LHS, RHS); + Lo = Result.getValue(0); + Hi = Result.getValue(1); + } else if (isOperationLegalOrCustom(ISD::MULHS, VT)) { + Lo = DAG.getNode(ISD::MUL, dl, VT, LHS, RHS); + Hi = DAG.getNode(ISD::MULHS, dl, VT, LHS, RHS); + } else { + report_fatal_error("Unable to expand signed fixed point multiplication."); + } + + // The result will need to be shifted right by the scale since both operands + // are scaled. The result is given to us in 2 halves, so we only want part of + // both in the result. + EVT ShiftTy = getShiftAmountTy(VT, DAG.getDataLayout()); + Lo = DAG.getNode(ISD::SRL, dl, VT, Lo, DAG.getConstant(Scale, dl, ShiftTy)); + Hi = DAG.getNode( + ISD::SHL, dl, VT, Hi, + DAG.getConstant(VT.getScalarSizeInBits() - Scale, dl, ShiftTy)); + return DAG.getNode(ISD::OR, dl, VT, Lo, Hi); +} diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index b6eeda4811ea..e86190375642 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -616,6 +616,7 @@ void TargetLoweringBase::initActions() { setOperationAction(ISD::UADDSAT, VT, Expand); setOperationAction(ISD::SSUBSAT, VT, Expand); setOperationAction(ISD::USUBSAT, VT, Expand); + setOperationAction(ISD::SMULFIX, VT, Expand); // Overflow operations default to expand setOperationAction(ISD::SADDO, VT, Expand); diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 40f73c9f1a4e..a31f7cddf6e7 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -4541,6 +4541,24 @@ void Verifier::visitIntrinsicCallSite(Intrinsic::ID ID, CallSite CS) { "of ints"); break; } + case Intrinsic::smul_fix: { + Value *Op1 = CS.getArgOperand(0); + Value *Op2 = CS.getArgOperand(1); + Assert(Op1->getType()->isIntOrIntVectorTy(), + "first operand of smul_fix must be an int type or vector " + "of ints"); + Assert(Op2->getType()->isIntOrIntVectorTy(), + "second operand of smul_fix must be an int type or vector " + "of ints"); + + auto *Op3 = dyn_cast(CS.getArgOperand(2)); + Assert(Op3, "third argument of smul_fix must be a constant integer"); + Assert(Op3->getType()->getBitWidth() <= 32, + "third argument of smul_fix must fit within 32 bits"); + Assert(Op3->getZExtValue() < Op1->getType()->getScalarSizeInBits(), + "the scale of smul_fix must be less than the width of the operands"); + break; + } }; } diff --git a/llvm/test/CodeGen/X86/smul_fix.ll b/llvm/test/CodeGen/X86/smul_fix.ll new file mode 100644 index 000000000000..2e69a2666f9a --- /dev/null +++ b/llvm/test/CodeGen/X86/smul_fix.ll @@ -0,0 +1,458 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s --check-prefix=X64 +; RUN: llc < %s -mtriple=i686 -mattr=cmov | FileCheck %s --check-prefix=X86 + +declare i4 @llvm.smul.fix.i4 (i4, i4, i32) +declare i32 @llvm.smul.fix.i32 (i32, i32, i32) +declare i64 @llvm.smul.fix.i64 (i64, i64, i32) +declare <4 x i32> @llvm.smul.fix.v4i32(<4 x i32>, <4 x i32>, i32) + +define i32 @func(i32 %x, i32 %y) nounwind { +; X64-LABEL: func: +; X64: # %bb.0: +; X64-NEXT: movslq %esi, %rax +; X64-NEXT: movslq %edi, %rcx +; X64-NEXT: imulq %rax, %rcx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: shrq $32, %rax +; X64-NEXT: shldl $30, %ecx, %eax +; X64-NEXT: # kill: def $eax killed $eax killed $rax +; X64-NEXT: retq +; +; X86-LABEL: func: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: imull {{[0-9]+}}(%esp) +; X86-NEXT: shrdl $2, %edx, %eax +; X86-NEXT: retl + %tmp = call i32 @llvm.smul.fix.i32(i32 %x, i32 %y, i32 2); + ret i32 %tmp; +} + +define i64 @func2(i64 %x, i64 %y) { +; X64-LABEL: func2: +; X64: # %bb.0: +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: imulq %rsi +; X64-NEXT: shrdq $2, %rdx, %rax +; X64-NEXT: retq +; +; X86-LABEL: func2: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: pushl %ebx +; X86-NEXT: .cfi_def_cfa_offset 12 +; X86-NEXT: pushl %edi +; X86-NEXT: .cfi_def_cfa_offset 16 +; X86-NEXT: pushl %esi +; X86-NEXT: .cfi_def_cfa_offset 20 +; X86-NEXT: .cfi_offset %esi, -20 +; X86-NEXT: .cfi_offset %edi, -16 +; X86-NEXT: .cfi_offset %ebx, -12 +; X86-NEXT: .cfi_offset %ebp, -8 +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: addl %edi, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: adcl %esi, %edx +; X86-NEXT: movl %edi, %esi +; X86-NEXT: imull {{[0-9]+}}(%esp), %esi +; X86-NEXT: addl %edx, %esi +; X86-NEXT: movl %esi, %ebp +; X86-NEXT: subl %ecx, %ebp +; X86-NEXT: testl %edi, %edi +; X86-NEXT: cmovnsl %esi, %ebp +; X86-NEXT: movl %ebp, %edx +; X86-NEXT: subl {{[0-9]+}}(%esp), %edx +; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X86-NEXT: cmovnsl %ebp, %edx +; X86-NEXT: shldl $30, %eax, %edx +; X86-NEXT: shldl $30, %ebx, %eax +; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 16 +; X86-NEXT: popl %edi +; X86-NEXT: .cfi_def_cfa_offset 12 +; X86-NEXT: popl %ebx +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: popl %ebp +; X86-NEXT: .cfi_def_cfa_offset 4 +; X86-NEXT: retl + %tmp = call i64 @llvm.smul.fix.i64(i64 %x, i64 %y, i32 2); + ret i64 %tmp; +} + +define i4 @func3(i4 %x, i4 %y) nounwind { +; X64-LABEL: func3: +; X64: # %bb.0: +; X64-NEXT: shlb $4, %dil +; X64-NEXT: sarb $4, %dil +; X64-NEXT: shlb $4, %sil +; X64-NEXT: sarb $4, %sil +; X64-NEXT: movsbl %sil, %ecx +; X64-NEXT: movsbl %dil, %eax +; X64-NEXT: imull %ecx, %eax +; X64-NEXT: movl %eax, %ecx +; X64-NEXT: shrb $2, %cl +; X64-NEXT: shrl $8, %eax +; X64-NEXT: shlb $6, %al +; X64-NEXT: orb %cl, %al +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq +; +; X86-LABEL: func3: +; X86: # %bb.0: +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: shlb $4, %al +; X86-NEXT: sarb $4, %al +; X86-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-NEXT: shlb $4, %cl +; X86-NEXT: sarb $4, %cl +; X86-NEXT: movsbl %cl, %ecx +; X86-NEXT: movsbl %al, %eax +; X86-NEXT: imull %ecx, %eax +; X86-NEXT: shlb $6, %ah +; X86-NEXT: shrb $2, %al +; X86-NEXT: orb %ah, %al +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl + %tmp = call i4 @llvm.smul.fix.i4(i4 %x, i4 %y, i32 2); + ret i4 %tmp; +} + +define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { +; X64-LABEL: vec: +; X64: # %bb.0: +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3] +; X64-NEXT: movd %xmm2, %eax +; X64-NEXT: cltq +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3] +; X64-NEXT: movd %xmm2, %ecx +; X64-NEXT: movslq %ecx, %rcx +; X64-NEXT: imulq %rax, %rcx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: shrq $32, %rax +; X64-NEXT: shldl $30, %ecx, %eax +; X64-NEXT: movd %eax, %xmm2 +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; X64-NEXT: movd %xmm3, %eax +; X64-NEXT: cltq +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; X64-NEXT: movd %xmm3, %ecx +; X64-NEXT: movslq %ecx, %rcx +; X64-NEXT: imulq %rax, %rcx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: shrq $32, %rax +; X64-NEXT: shldl $30, %ecx, %eax +; X64-NEXT: movd %eax, %xmm3 +; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; X64-NEXT: movd %xmm1, %eax +; X64-NEXT: cltq +; X64-NEXT: movd %xmm0, %ecx +; X64-NEXT: movslq %ecx, %rcx +; X64-NEXT: imulq %rax, %rcx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: shrq $32, %rax +; X64-NEXT: shldl $30, %ecx, %eax +; X64-NEXT: movd %eax, %xmm2 +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; X64-NEXT: movd %xmm1, %eax +; X64-NEXT: cltq +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X64-NEXT: movd %xmm0, %ecx +; X64-NEXT: movslq %ecx, %rcx +; X64-NEXT: imulq %rax, %rcx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: shrq $32, %rax +; X64-NEXT: shldl $30, %ecx, %eax +; X64-NEXT: movd %eax, %xmm0 +; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; X64-NEXT: movdqa %xmm2, %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: vec: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: imull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: shldl $30, %eax, %ebp +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: imull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: shldl $30, %eax, %ebx +; X86-NEXT: movl %edi, %eax +; X86-NEXT: imull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %edi +; X86-NEXT: shldl $30, %eax, %edi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: imull {{[0-9]+}}(%esp) +; X86-NEXT: shldl $30, %eax, %edx +; X86-NEXT: movl %edx, 12(%ecx) +; X86-NEXT: movl %edi, 8(%ecx) +; X86-NEXT: movl %ebx, 4(%ecx) +; X86-NEXT: movl %ebp, (%ecx) +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 + %tmp = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> %x, <4 x i32> %y, i32 2); + ret <4 x i32> %tmp; +} + +; These result in regular integer multiplication +define i32 @func4(i32 %x, i32 %y) nounwind { +; X64-LABEL: func4: +; X64: # %bb.0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: imull %esi, %eax +; X64-NEXT: retq +; +; X86-LABEL: func4: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: imull {{[0-9]+}}(%esp), %eax +; X86-NEXT: retl + %tmp = call i32 @llvm.smul.fix.i32(i32 %x, i32 %y, i32 0); + ret i32 %tmp; +} + +define i64 @func5(i64 %x, i64 %y) { +; X64-LABEL: func5: +; X64: # %bb.0: +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: imulq %rsi, %rax +; X64-NEXT: retq +; +; X86-LABEL: func5: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %esi, -8 +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %esi +; X86-NEXT: imull {{[0-9]+}}(%esp), %ecx +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: imull {{[0-9]+}}(%esp), %esi +; X86-NEXT: addl %esi, %edx +; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 +; X86-NEXT: retl + %tmp = call i64 @llvm.smul.fix.i64(i64 %x, i64 %y, i32 0); + ret i64 %tmp; +} + +define i4 @func6(i4 %x, i4 %y) nounwind { +; X64-LABEL: func6: +; X64: # %bb.0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: shlb $4, %al +; X64-NEXT: sarb $4, %al +; X64-NEXT: shlb $4, %sil +; X64-NEXT: sarb $4, %sil +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: mulb %sil +; X64-NEXT: retq +; +; X86-LABEL: func6: +; X86: # %bb.0: +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: shlb $4, %al +; X86-NEXT: sarb $4, %al +; X86-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-NEXT: shlb $4, %cl +; X86-NEXT: sarb $4, %cl +; X86-NEXT: mulb %cl +; X86-NEXT: retl + %tmp = call i4 @llvm.smul.fix.i4(i4 %x, i4 %y, i32 0); + ret i4 %tmp; +} + +define <4 x i32> @vec2(<4 x i32> %x, <4 x i32> %y) nounwind { +; X64-LABEL: vec2: +; X64: # %bb.0: +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3] +; X64-NEXT: movd %xmm2, %eax +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3] +; X64-NEXT: movd %xmm2, %ecx +; X64-NEXT: imull %eax, %ecx +; X64-NEXT: movd %ecx, %xmm2 +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; X64-NEXT: movd %xmm3, %eax +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; X64-NEXT: movd %xmm3, %ecx +; X64-NEXT: imull %eax, %ecx +; X64-NEXT: movd %ecx, %xmm3 +; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; X64-NEXT: movd %xmm1, %eax +; X64-NEXT: movd %xmm0, %ecx +; X64-NEXT: imull %eax, %ecx +; X64-NEXT: movd %ecx, %xmm2 +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; X64-NEXT: movd %xmm1, %eax +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X64-NEXT: movd %xmm0, %ecx +; X64-NEXT: imull %eax, %ecx +; X64-NEXT: movd %ecx, %xmm0 +; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; X64-NEXT: movdqa %xmm2, %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: vec2: +; X86: # %bb.0: +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: imull {{[0-9]+}}(%esp), %edi +; X86-NEXT: imull {{[0-9]+}}(%esp), %esi +; X86-NEXT: imull {{[0-9]+}}(%esp), %edx +; X86-NEXT: imull {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl $4 + %tmp = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> %x, <4 x i32> %y, i32 0); + ret <4 x i32> %tmp; +} + +define i64 @func7(i64 %x, i64 %y) nounwind { +; X64-LABEL: func7: +; X64: # %bb.0: +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: imulq %rsi +; X64-NEXT: shrdq $32, %rdx, %rax +; X64-NEXT: retq +; +; X86-LABEL: func7: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %ebp +; X86-NEXT: addl %edx, %ebx +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %ebp +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: adcl %edi, %edx +; X86-NEXT: movl %esi, %edi +; X86-NEXT: imull {{[0-9]+}}(%esp), %edi +; X86-NEXT: addl %edx, %edi +; X86-NEXT: movl %edi, %ebx +; X86-NEXT: subl %ebp, %ebx +; X86-NEXT: testl %esi, %esi +; X86-NEXT: cmovnsl %edi, %ebx +; X86-NEXT: movl %ebx, %edx +; X86-NEXT: subl %ecx, %edx +; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X86-NEXT: cmovnsl %ebx, %edx +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl + %tmp = call i64 @llvm.smul.fix.i64(i64 %x, i64 %y, i32 32); + ret i64 %tmp; +} + +define i64 @func8(i64 %x, i64 %y) nounwind { +; X64-LABEL: func8: +; X64: # %bb.0: +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: imulq %rsi +; X64-NEXT: shrdq $63, %rdx, %rax +; X64-NEXT: retq +; +; X86-LABEL: func8: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: addl %ebx, %ebp +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: imull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: adcl %edi, %edx +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ebx, %esi +; X86-NEXT: sbbl $0, %esi +; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X86-NEXT: cmovnsl %ebx, %esi +; X86-NEXT: cmovnsl %edx, %ecx +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: subl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %esi, %edx +; X86-NEXT: sbbl $0, %edx +; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X86-NEXT: cmovnsl %esi, %edx +; X86-NEXT: cmovnsl %ecx, %edi +; X86-NEXT: shldl $1, %edi, %edx +; X86-NEXT: shrdl $31, %edi, %eax +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl + %tmp = call i64 @llvm.smul.fix.i64(i64 %x, i64 %y, i32 63); + ret i64 %tmp; +}