diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 1a8b598c074f..7cb7f2750ffb 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -131,6 +131,13 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( for (auto T : {MVT::v16i8, MVT::v8i16}) setOperationAction(Op, T, Legal); + // Custom lower BUILD_VECTORs to minimize number of replace_lanes + for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v4f32}) + setOperationAction(ISD::BUILD_VECTOR, T, Custom); + if (Subtarget->hasUnimplementedSIMD128()) + for (auto T : {MVT::v2i64, MVT::v2f64}) + setOperationAction(ISD::BUILD_VECTOR, T, Custom); + // We have custom shuffle lowering to expose the shuffle mask for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v4f32}) setOperationAction(ISD::VECTOR_SHUFFLE, T, Custom); @@ -886,6 +893,8 @@ SDValue WebAssemblyTargetLowering::LowerOperation(SDValue Op, return LowerINTRINSIC_VOID(Op, DAG); case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG); + case ISD::BUILD_VECTOR: + return LowerBUILD_VECTOR(Op, DAG); case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); case ISD::SHL: @@ -1103,6 +1112,107 @@ WebAssemblyTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, return SDValue(); } +SDValue WebAssemblyTargetLowering::LowerBUILD_VECTOR(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + const EVT VecT = Op.getValueType(); + const EVT LaneT = Op.getOperand(0).getValueType(); + const size_t Lanes = Op.getNumOperands(); + auto IsConstant = [](const SDValue &V) { + return V.getOpcode() == ISD::Constant || V.getOpcode() == ISD::ConstantFP; + }; + + // Find the most common operand, which is approximately the best to splat + using Entry = std::pair; + SmallVector ValueCounts; + size_t NumConst = 0, NumDynamic = 0; + for (const SDValue &Lane : Op->op_values()) { + if (Lane.isUndef()) { + 
continue; + } else if (IsConstant(Lane)) { + NumConst++; + } else { + NumDynamic++; + } + auto CountIt = std::find_if(ValueCounts.begin(), ValueCounts.end(), + [&Lane](Entry A) { return A.first == Lane; }); + if (CountIt == ValueCounts.end()) { + ValueCounts.emplace_back(Lane, 1); + } else { + CountIt->second++; + } + } + auto CommonIt = + std::max_element(ValueCounts.begin(), ValueCounts.end(), + [](Entry A, Entry B) { return A.second < B.second; }); + assert(CommonIt != ValueCounts.end() && "Unexpected all-undef build_vector"); + SDValue SplatValue = CommonIt->first; + size_t NumCommon = CommonIt->second; + + // If v128.const is available, consider using it instead of a splat + if (Subtarget->hasUnimplementedSIMD128()) { + // {i32,i64,f32,f64}.const opcode, and value + const size_t ConstBytes = 1 + std::max(size_t(4), 16 / Lanes); + // SIMD prefix and opcode + const size_t SplatBytes = 2; + const size_t SplatConstBytes = SplatBytes + ConstBytes; + // SIMD prefix, opcode, and lane index + const size_t ReplaceBytes = 3; + const size_t ReplaceConstBytes = ReplaceBytes + ConstBytes; + // SIMD prefix, v128.const opcode, and 128-bit value + const size_t VecConstBytes = 18; + // Initial v128.const and a replace_lane for each non-const operand + const size_t ConstInitBytes = VecConstBytes + NumDynamic * ReplaceBytes; + // Initial splat and all necessary replace_lanes + const size_t SplatInitBytes = + IsConstant(SplatValue) + // Initial constant splat + ? 
(SplatConstBytes + + // Constant replace_lanes + (NumConst - NumCommon) * ReplaceConstBytes + + // Dynamic replace_lanes + (NumDynamic * ReplaceBytes)) + // Initial dynamic splat + : (SplatBytes + + // Constant replace_lanes + (NumConst * ReplaceConstBytes) + + // Dynamic replace_lanes + (NumDynamic - NumCommon) * ReplaceBytes); + if (ConstInitBytes < SplatInitBytes) { + // Create build_vector that will lower to initial v128.const + SmallVector ConstLanes; + for (const SDValue &Lane : Op->op_values()) { + if (IsConstant(Lane)) { + ConstLanes.push_back(Lane); + } else if (LaneT.isFloatingPoint()) { + ConstLanes.push_back(DAG.getConstantFP(0, DL, LaneT)); + } else { + ConstLanes.push_back(DAG.getConstant(0, DL, LaneT)); + } + } + SDValue Result = DAG.getBuildVector(VecT, DL, ConstLanes); + // Add replace_lane instructions for non-const lanes + for (size_t I = 0; I < Lanes; ++I) { + const SDValue &Lane = Op->getOperand(I); + if (!Lane.isUndef() && !IsConstant(Lane)) + Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecT, Result, Lane, + DAG.getConstant(I, DL, MVT::i32)); + } + return Result; + } + } + // Use a splat for the initial vector + SDValue Result = DAG.getSplatBuildVector(VecT, DL, SplatValue); + // Add replace_lane instructions for other values + for (size_t I = 0; I < Lanes; ++I) { + const SDValue &Lane = Op->getOperand(I); + if (Lane != SplatValue) + Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecT, Result, Lane, + DAG.getConstant(I, DL, MVT::i32)); + } + return Result; +} + SDValue WebAssemblyTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h index 33f384b44f19..d4b6dcacafc5 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h @@ -99,6 +99,7 @@ private: SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue 
LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerAccessVectorElement(SDValue Op, SelectionDAG &DAG) const; SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index 2476d8d99cfb..b7ecd49c7937 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -359,118 +359,6 @@ def : Pat<(vector_insert (v4f32 V128:$vec), F32:$x, undef), def : Pat<(vector_insert (v2f64 V128:$vec), F64:$x, undef), (REPLACE_LANE_v2f64 V128:$vec, 0, F64:$x)>; -// Arbitrary other BUILD_VECTOR patterns -def : Pat<(v16i8 (build_vector - (i32 I32:$x0), (i32 I32:$x1), (i32 I32:$x2), (i32 I32:$x3), - (i32 I32:$x4), (i32 I32:$x5), (i32 I32:$x6), (i32 I32:$x7), - (i32 I32:$x8), (i32 I32:$x9), (i32 I32:$x10), (i32 I32:$x11), - (i32 I32:$x12), (i32 I32:$x13), (i32 I32:$x14), (i32 I32:$x15) - )), - (v16i8 (REPLACE_LANE_v16i8 - (v16i8 (REPLACE_LANE_v16i8 - (v16i8 (REPLACE_LANE_v16i8 - (v16i8 (REPLACE_LANE_v16i8 - (v16i8 (REPLACE_LANE_v16i8 - (v16i8 (REPLACE_LANE_v16i8 - (v16i8 (REPLACE_LANE_v16i8 - (v16i8 (REPLACE_LANE_v16i8 - (v16i8 (REPLACE_LANE_v16i8 - (v16i8 (REPLACE_LANE_v16i8 - (v16i8 (REPLACE_LANE_v16i8 - (v16i8 (REPLACE_LANE_v16i8 - (v16i8 (REPLACE_LANE_v16i8 - (v16i8 (REPLACE_LANE_v16i8 - (v16i8 (REPLACE_LANE_v16i8 - (v16i8 (SPLAT_v16i8 (i32 I32:$x0))), - 1, I32:$x1 - )), - 2, I32:$x2 - )), - 3, I32:$x3 - )), - 4, I32:$x4 - )), - 5, I32:$x5 - )), - 6, I32:$x6 - )), - 7, I32:$x7 - )), - 8, I32:$x8 - )), - 9, I32:$x9 - )), - 10, I32:$x10 - )), - 11, I32:$x11 - )), - 12, I32:$x12 - )), - 13, I32:$x13 - )), - 14, I32:$x14 - )), - 15, I32:$x15 - ))>; -def : Pat<(v8i16 (build_vector - (i32 
I32:$x0), (i32 I32:$x1), (i32 I32:$x2), (i32 I32:$x3), - (i32 I32:$x4), (i32 I32:$x5), (i32 I32:$x6), (i32 I32:$x7) - )), - (v8i16 (REPLACE_LANE_v8i16 - (v8i16 (REPLACE_LANE_v8i16 - (v8i16 (REPLACE_LANE_v8i16 - (v8i16 (REPLACE_LANE_v8i16 - (v8i16 (REPLACE_LANE_v8i16 - (v8i16 (REPLACE_LANE_v8i16 - (v8i16 (REPLACE_LANE_v8i16 - (v8i16 (SPLAT_v8i16 (i32 I32:$x0))), - 1, I32:$x1 - )), - 2, I32:$x2 - )), - 3, I32:$x3 - )), - 4, I32:$x4 - )), - 5, I32:$x5 - )), - 6, I32:$x6 - )), - 7, I32:$x7 - ))>; -def : Pat<(v4i32 (build_vector - (i32 I32:$x0), (i32 I32:$x1), (i32 I32:$x2), (i32 I32:$x3) - )), - (v4i32 (REPLACE_LANE_v4i32 - (v4i32 (REPLACE_LANE_v4i32 - (v4i32 (REPLACE_LANE_v4i32 - (v4i32 (SPLAT_v4i32 (i32 I32:$x0))), - 1, I32:$x1 - )), - 2, I32:$x2 - )), - 3, I32:$x3 - ))>; -def : Pat<(v2i64 (build_vector (i64 I64:$x0), (i64 I64:$x1))), - (v2i64 (REPLACE_LANE_v2i64 - (v2i64 (SPLAT_v2i64 (i64 I64:$x0))), 1, I64:$x1))>; -def : Pat<(v4f32 (build_vector - (f32 F32:$x0), (f32 F32:$x1), (f32 F32:$x2), (f32 F32:$x3) - )), - (v4f32 (REPLACE_LANE_v4f32 - (v4f32 (REPLACE_LANE_v4f32 - (v4f32 (REPLACE_LANE_v4f32 - (v4f32 (SPLAT_v4f32 (f32 F32:$x0))), - 1, F32:$x1 - )), - 2, F32:$x2 - )), - 3, F32:$x3 - ))>; -def : Pat<(v2f64 (build_vector (f64 F64:$x0), (f64 F64:$x1))), - (v2f64 (REPLACE_LANE_v2f64 - (v2f64 (SPLAT_v2f64 (f64 F64:$x0))), 1, F64:$x1))>; - //===----------------------------------------------------------------------===// // Comparisons //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll b/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll new file mode 100644 index 000000000000..ab08ef4be7db --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll @@ -0,0 +1,127 @@ +; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+unimplemented-simd128 | FileCheck %s + +; Test 
; that the logic to choose between v128.const vector
; initialization and splat vector initialization and to optimize the
; choice of splat value works correctly.

target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"

; All constant lanes are equal, so a constant splat plus a single
; replace_lane for the dynamic lane is expected.
; NOTE(review): the function name says i8x16 but the vector type is
; <8 x i16>; the name is kept as-is.
; CHECK-LABEL: same_const_one_replaced_i8x16:
; CHECK-NEXT:  .functype       same_const_one_replaced_i8x16 (i32) -> (v128)
; CHECK-NEXT:  i32.const       $push[[L0:[0-9]+]]=, 42
; CHECK-NEXT:  i16x8.splat     $push[[L1:[0-9]+]]=, $pop[[L0]]
; CHECK-NEXT:  i16x8.replace_lane      $push[[L2:[0-9]+]]=, $pop[[L1]], 5, $0
; CHECK-NEXT:  return          $pop[[L2]]
define <8 x i16> @same_const_one_replaced_i8x16(i16 %x) {
  %v = insertelement
    <8 x i16> <i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42>,
    i16 %x,
    i32 5
  ret <8 x i16> %v
}

; All constant lanes differ, so v128.const plus a single replace_lane is
; expected.
; NOTE(review): lane 5 of the constant is overwritten by %x, so its value
; (6 here) cannot be recovered from the expected output -- confirm.
; CHECK-LABEL: different_const_one_replaced_i8x16:
; CHECK-NEXT:  .functype       different_const_one_replaced_i8x16 (i32) -> (v128)
; CHECK-NEXT:  v128.const      $push[[L0:[0-9]+]]=, 1, 2, 3, 4, 5, 0, 7, 8
; CHECK-NEXT:  i16x8.replace_lane      $push[[L1:[0-9]+]]=, $pop[[L0]], 5, $0
; CHECK-NEXT:  return          $pop[[L1]]
define <8 x i16> @different_const_one_replaced_i8x16(i16 %x) {
  %v = insertelement
    <8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>,
    i16 %x,
    i32 5
  ret <8 x i16> %v
}

; CHECK-LABEL: same_const_one_replaced_f32x4:
; CHECK-NEXT:  .functype       same_const_one_replaced_f32x4 (f32) -> (v128)
; CHECK-NEXT:  f32.const       $push[[L0:[0-9]+]]=, 0x1.5p5
; CHECK-NEXT:  f32x4.splat     $push[[L1:[0-9]+]]=, $pop[[L0]]
; CHECK-NEXT:  f32x4.replace_lane      $push[[L2:[0-9]+]]=, $pop[[L1]], 2, $0
; CHECK-NEXT:  return          $pop[[L2]]
define <4 x float> @same_const_one_replaced_f32x4(float %x) {
  %v = insertelement
    <4 x float> <float 42., float 42., float 42., float 42.>,
    float %x,
    i32 2
  ret <4 x float> %v
}

; NOTE(review): lane 2 of the constant is overwritten by %x, so its value
; (3.0 here) cannot be recovered from the expected output -- confirm.
; CHECK-LABEL: different_const_one_replaced_f32x4:
; CHECK-NEXT:  .functype       different_const_one_replaced_f32x4 (f32) -> (v128)
; CHECK-NEXT:  v128.const      $push[[L0:[0-9]+]]=, 0x1p0, 0x1p1, 0x0p0, 0x1p2
; CHECK-NEXT:  f32x4.replace_lane      $push[[L1:[0-9]+]]=, $pop[[L0]], 2, $0
; CHECK-NEXT:  return          $pop[[L1]]
define <4 x float> @different_const_one_replaced_f32x4(float %x) {
  %v = insertelement
    <4 x float> <float 1., float 2., float 3., float 4.>,
    float %x,
    i32 2
  ret <4 x float> %v
}

; NOTE(review): the expected output only fixes lane 3 (= 1) and the splat
; value (= 3); whether lane 0 was undef or 3 in the original is an
; assumption -- confirm.
; CHECK-LABEL: splat_common_const_i32x4:
; CHECK-NEXT:  .functype       splat_common_const_i32x4 () -> (v128)
; CHECK-NEXT:  i32.const       $push[[L0:[0-9]+]]=, 3
; CHECK-NEXT:  i32x4.splat     $push[[L1:[0-9]+]]=, $pop[[L0]]
; CHECK-NEXT:  i32.const       $push[[L2:[0-9]+]]=, 1
; CHECK-NEXT:  i32x4.replace_lane      $push[[L3:[0-9]+]]=, $pop[[L1]], 3, $pop[[L2]]
; CHECK-NEXT:  return          $pop[[L3]]
define <4 x i32> @splat_common_const_i32x4() {
  ret <4 x i32> <i32 undef, i32 3, i32 3, i32 1>
}

; %c occupies four lanes, %b three and %a one, so %c is splatted and the
; other four lanes are patched.
; CHECK-LABEL: splat_common_arg_i16x8:
; CHECK-NEXT:  .functype       splat_common_arg_i16x8 (i32, i32, i32) -> (v128)
; CHECK-NEXT:  i16x8.splat     $push[[L0:[0-9]+]]=, $2
; CHECK-NEXT:  i16x8.replace_lane      $push[[L1:[0-9]+]]=, $pop[[L0]], 0, $1
; CHECK-NEXT:  i16x8.replace_lane      $push[[L2:[0-9]+]]=, $pop[[L1]], 2, $0
; CHECK-NEXT:  i16x8.replace_lane      $push[[L3:[0-9]+]]=, $pop[[L2]], 4, $1
; CHECK-NEXT:  i16x8.replace_lane      $push[[L4:[0-9]+]]=, $pop[[L3]], 7, $1
; CHECK-NEXT:  return          $pop[[L4]]
define <8 x i16> @splat_common_arg_i16x8(i16 %a, i16 %b, i16 %c) {
  %v0 = insertelement <8 x i16> undef, i16 %b, i32 0
  %v1 = insertelement <8 x i16> %v0, i16 %c, i32 1
  %v2 = insertelement <8 x i16> %v1, i16 %a, i32 2
  %v3 = insertelement <8 x i16> %v2, i16 %c, i32 3
  %v4 = insertelement <8 x i16> %v3, i16 %b, i32 4
  %v5 = insertelement <8 x i16> %v4, i16 %c, i32 5
  %v6 = insertelement <8 x i16> %v5, i16 %c, i32 6
  %v7 = insertelement <8 x i16> %v6, i16 %b, i32 7
  ret <8 x i16> %v7
}

; A single defined constant lane in an otherwise undef vector should become
; a plain splat with no replace_lane.
; CHECK-LABEL: undef_const_insert_f32x4:
; CHECK-NEXT:  .functype       undef_const_insert_f32x4 () -> (v128)
; CHECK-NEXT:  f32.const       $push[[L0:[0-9]+]]=, 0x1.5p5
; CHECK-NEXT:  f32x4.splat     $push[[L1:[0-9]+]]=, $pop[[L0]]
; CHECK-NEXT:  return          $pop[[L1]]
define <4 x float> @undef_const_insert_f32x4() {
  %v = insertelement <4 x float> undef, float 42., i32 1
  ret <4 x float> %v
}

; A single defined dynamic lane in an otherwise undef vector should become
; a plain splat with no replace_lane.
; CHECK-LABEL: undef_arg_insert_i32x4:
; CHECK-NEXT:  .functype       undef_arg_insert_i32x4 (i32) -> (v128)
; CHECK-NEXT:  i32x4.splat     $push[[L0:[0-9]+]]=, $0
; CHECK-NEXT:  return          $pop[[L0]]
define <4 x i32> @undef_arg_insert_i32x4(i32 %x) {
  %v = insertelement <4 x i32> undef, i32 %x, i32 3
  ret <4 x i32> %v
}

; CHECK-LABEL: all_undef_i8x16:
; CHECK-NEXT:  .functype       all_undef_i8x16 () -> (v128)
; CHECK-NEXT:  return          $0
define <16 x i8> @all_undef_i8x16() {
  %v = insertelement <16 x i8> undef, i8 undef, i32 4
  ret <16 x i8> %v
}

; CHECK-LABEL: all_undef_f64x2:
; CHECK-NEXT:  .functype       all_undef_f64x2 () -> (v128)
; CHECK-NEXT:  return          $0
define <2 x double> @all_undef_f64x2() {
  ret <2 x double> undef
}