[WebAssembly] Optimize BUILD_VECTOR lowering for size

Summary:
Implements custom lowering logic that finds the optimal value for the
initial splat of the vector and either uses it or uses v128.const if
it is available and if it would produce smaller code. This logic
replaces large TableGen ISEL patterns that would lower all non-splat
BUILD_VECTORs into a splat followed by a fixed number of replace_lane
instructions. This CL fixes PR39685.

Reviewers: aheejin

Subscribers: dschuff, sbc100, jgravelle-google, sunfish, llvm-commits

Differential Revision: https://reviews.llvm.org/D56633

llvm-svn: 352592
This commit is contained in:
Thomas Lively 2019-01-30 02:23:29 +00:00
parent ccefbbd0f0
commit 079816efb7
4 changed files with 238 additions and 112 deletions

View File

@ -131,6 +131,13 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
for (auto T : {MVT::v16i8, MVT::v8i16})
setOperationAction(Op, T, Legal);
// Custom lower BUILD_VECTORs to minimize number of replace_lanes
for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v4f32})
setOperationAction(ISD::BUILD_VECTOR, T, Custom);
if (Subtarget->hasUnimplementedSIMD128())
for (auto T : {MVT::v2i64, MVT::v2f64})
setOperationAction(ISD::BUILD_VECTOR, T, Custom);
// We have custom shuffle lowering to expose the shuffle mask
for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v4f32})
setOperationAction(ISD::VECTOR_SHUFFLE, T, Custom);
@ -886,6 +893,8 @@ SDValue WebAssemblyTargetLowering::LowerOperation(SDValue Op,
return LowerINTRINSIC_VOID(Op, DAG);
case ISD::SIGN_EXTEND_INREG:
return LowerSIGN_EXTEND_INREG(Op, DAG);
case ISD::BUILD_VECTOR:
return LowerBUILD_VECTOR(Op, DAG);
case ISD::VECTOR_SHUFFLE:
return LowerVECTOR_SHUFFLE(Op, DAG);
case ISD::SHL:
@ -1103,6 +1112,107 @@ WebAssemblyTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
return SDValue();
}
/// Custom lowering for BUILD_VECTOR.
///
/// Builds the vector from a cheap initial value followed by one
/// INSERT_VECTOR_ELT (replace_lane) per lane that still differs. Two initial
/// values are considered:
///   * a splat of the most frequently occurring non-undef operand, and
///   * when Subtarget->hasUnimplementedSIMD128() (v128.const is available), a
///     constant vector holding every constant lane, with zero placeholders in
///     the remaining lanes.
/// The constant vector is chosen only when its estimated encoded size is
/// strictly smaller than that of the splat sequence.
///
/// Undef lanes never need an instruction: they are ignored when counting
/// operands and are skipped when emitting the lane replacements.
///
/// \param Op  The BUILD_VECTOR node to lower.
/// \param DAG The DAG in which the replacement nodes are created.
/// \returns The value that replaces \p Op.
SDValue WebAssemblyTargetLowering::LowerBUILD_VECTOR(SDValue Op,
                                                     SelectionDAG &DAG) const {
  SDLoc DL(Op);
  const EVT VecT = Op.getValueType();
  const EVT LaneT = Op.getOperand(0).getValueType();
  const size_t Lanes = Op.getNumOperands();
  auto IsConstant = [](const SDValue &V) {
    return V.getOpcode() == ISD::Constant || V.getOpcode() == ISD::ConstantFP;
  };

  // Find the most common operand, which is approximately the best to splat.
  // A linear scan over a small vector is used; there is one entry per
  // distinct non-undef operand.
  using Entry = std::pair<SDValue, size_t>;
  SmallVector<Entry, 16> ValueCounts;
  size_t NumConst = 0, NumDynamic = 0;
  for (const SDValue &Lane : Op->op_values()) {
    if (Lane.isUndef()) {
      // Undef lanes may take any value, so they do not vote for a splat.
      continue;
    } else if (IsConstant(Lane)) {
      NumConst++;
    } else {
      NumDynamic++;
    }
    auto CountIt =
        std::find_if(ValueCounts.begin(), ValueCounts.end(),
                     [&Lane](const Entry &A) { return A.first == Lane; });
    if (CountIt == ValueCounts.end()) {
      ValueCounts.emplace_back(Lane, 1);
    } else {
      CountIt->second++;
    }
  }
  auto CommonIt = std::max_element(
      ValueCounts.begin(), ValueCounts.end(),
      [](const Entry &A, const Entry &B) { return A.second < B.second; });
  assert(CommonIt != ValueCounts.end() && "Unexpected all-undef build_vector");
  SDValue SplatValue = CommonIt->first;
  size_t NumCommon = CommonIt->second;

  // If v128.const is available, consider using it instead of a splat
  if (Subtarget->hasUnimplementedSIMD128()) {
    // {i32,i64,f32,f64}.const opcode, and value
    const size_t ConstBytes = 1 + std::max(size_t(4), 16 / Lanes);
    // SIMD prefix and opcode
    const size_t SplatBytes = 2;
    const size_t SplatConstBytes = SplatBytes + ConstBytes;
    // SIMD prefix, opcode, and lane index
    const size_t ReplaceBytes = 3;
    const size_t ReplaceConstBytes = ReplaceBytes + ConstBytes;
    // SIMD prefix, v128.const opcode, and 128-bit value
    const size_t VecConstBytes = 18;
    // Initial v128.const and a replace_lane for each non-const operand
    const size_t ConstInitBytes = VecConstBytes + NumDynamic * ReplaceBytes;
    // Initial splat and all necessary replace_lanes
    const size_t SplatInitBytes =
        IsConstant(SplatValue)
            // Initial constant splat
            ? (SplatConstBytes +
               // Constant replace_lanes
               (NumConst - NumCommon) * ReplaceConstBytes +
               // Dynamic replace_lanes
               (NumDynamic * ReplaceBytes))
            // Initial dynamic splat
            : (SplatBytes +
               // Constant replace_lanes
               (NumConst * ReplaceConstBytes) +
               // Dynamic replace_lanes
               (NumDynamic - NumCommon) * ReplaceBytes);
    if (ConstInitBytes < SplatInitBytes) {
      // Create build_vector that will lower to initial v128.const. Lanes that
      // are not constants (dynamic or undef) get a zero placeholder so that
      // every operand of the new build_vector is a constant.
      SmallVector<SDValue, 16> ConstLanes;
      for (const SDValue &Lane : Op->op_values()) {
        if (IsConstant(Lane)) {
          ConstLanes.push_back(Lane);
        } else if (LaneT.isFloatingPoint()) {
          ConstLanes.push_back(DAG.getConstantFP(0, DL, LaneT));
        } else {
          ConstLanes.push_back(DAG.getConstant(0, DL, LaneT));
        }
      }
      SDValue Result = DAG.getBuildVector(VecT, DL, ConstLanes);
      // Add replace_lane instructions for non-const lanes
      for (size_t I = 0; I < Lanes; ++I) {
        const SDValue &Lane = Op->getOperand(I);
        if (!Lane.isUndef() && !IsConstant(Lane))
          Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecT, Result, Lane,
                               DAG.getConstant(I, DL, MVT::i32));
      }
      return Result;
    }
  }

  // Use a splat for the initial vector
  SDValue Result = DAG.getSplatBuildVector(VecT, DL, SplatValue);
  // Add replace_lane instructions for other values. Undef lanes are skipped,
  // matching the v128.const path above: whatever the splat left in such a
  // lane is acceptable, so inserting an undef value would only create a
  // useless node.
  for (size_t I = 0; I < Lanes; ++I) {
    const SDValue &Lane = Op->getOperand(I);
    if (!Lane.isUndef() && Lane != SplatValue)
      Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecT, Result, Lane,
                           DAG.getConstant(I, DL, MVT::i32));
  }
  return Result;
}
SDValue
WebAssemblyTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
SelectionDAG &DAG) const {

View File

@ -99,6 +99,7 @@ private:
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
// Custom lowering for ISD::BUILD_VECTOR nodes; reached from LowerOperation.
SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerAccessVectorElement(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const;

View File

@ -359,118 +359,6 @@ def : Pat<(vector_insert (v4f32 V128:$vec), F32:$x, undef),
def : Pat<(vector_insert (v2f64 V128:$vec), F64:$x, undef),
(REPLACE_LANE_v2f64 V128:$vec, 0, F64:$x)>;
// Arbitrary other BUILD_VECTOR patterns
// v16i8: splat operand $x0 into every lane, then overwrite lanes 1-15 with
// $x1-$x15 through fifteen nested REPLACE_LANE_v16i8 nodes (lane 1 is the
// innermost, lane 15 the outermost).
def : Pat<(v16i8 (build_vector
(i32 I32:$x0), (i32 I32:$x1), (i32 I32:$x2), (i32 I32:$x3),
(i32 I32:$x4), (i32 I32:$x5), (i32 I32:$x6), (i32 I32:$x7),
(i32 I32:$x8), (i32 I32:$x9), (i32 I32:$x10), (i32 I32:$x11),
(i32 I32:$x12), (i32 I32:$x13), (i32 I32:$x14), (i32 I32:$x15)
)),
(v16i8 (REPLACE_LANE_v16i8
(v16i8 (REPLACE_LANE_v16i8
(v16i8 (REPLACE_LANE_v16i8
(v16i8 (REPLACE_LANE_v16i8
(v16i8 (REPLACE_LANE_v16i8
(v16i8 (REPLACE_LANE_v16i8
(v16i8 (REPLACE_LANE_v16i8
(v16i8 (REPLACE_LANE_v16i8
(v16i8 (REPLACE_LANE_v16i8
(v16i8 (REPLACE_LANE_v16i8
(v16i8 (REPLACE_LANE_v16i8
(v16i8 (REPLACE_LANE_v16i8
(v16i8 (REPLACE_LANE_v16i8
(v16i8 (REPLACE_LANE_v16i8
(v16i8 (REPLACE_LANE_v16i8
(v16i8 (SPLAT_v16i8 (i32 I32:$x0))),
1, I32:$x1
)),
2, I32:$x2
)),
3, I32:$x3
)),
4, I32:$x4
)),
5, I32:$x5
)),
6, I32:$x6
)),
7, I32:$x7
)),
8, I32:$x8
)),
9, I32:$x9
)),
10, I32:$x10
)),
11, I32:$x11
)),
12, I32:$x12
)),
13, I32:$x13
)),
14, I32:$x14
)),
15, I32:$x15
))>;
// v8i16: splat operand $x0 into every lane, then overwrite lanes 1-7 with
// $x1-$x7 through seven nested REPLACE_LANE_v8i16 nodes (lane 1 innermost).
def : Pat<(v8i16 (build_vector
(i32 I32:$x0), (i32 I32:$x1), (i32 I32:$x2), (i32 I32:$x3),
(i32 I32:$x4), (i32 I32:$x5), (i32 I32:$x6), (i32 I32:$x7)
)),
(v8i16 (REPLACE_LANE_v8i16
(v8i16 (REPLACE_LANE_v8i16
(v8i16 (REPLACE_LANE_v8i16
(v8i16 (REPLACE_LANE_v8i16
(v8i16 (REPLACE_LANE_v8i16
(v8i16 (REPLACE_LANE_v8i16
(v8i16 (REPLACE_LANE_v8i16
(v8i16 (SPLAT_v8i16 (i32 I32:$x0))),
1, I32:$x1
)),
2, I32:$x2
)),
3, I32:$x3
)),
4, I32:$x4
)),
5, I32:$x5
)),
6, I32:$x6
)),
7, I32:$x7
))>;
// v4i32: splat operand $x0 into every lane, then overwrite lanes 1-3 with
// $x1-$x3 through three nested REPLACE_LANE_v4i32 nodes (lane 1 innermost).
def : Pat<(v4i32 (build_vector
(i32 I32:$x0), (i32 I32:$x1), (i32 I32:$x2), (i32 I32:$x3)
)),
(v4i32 (REPLACE_LANE_v4i32
(v4i32 (REPLACE_LANE_v4i32
(v4i32 (REPLACE_LANE_v4i32
(v4i32 (SPLAT_v4i32 (i32 I32:$x0))),
1, I32:$x1
)),
2, I32:$x2
)),
3, I32:$x3
))>;
// v2i64: splat operand $x0 into both lanes, then overwrite lane 1 with $x1.
def : Pat<(v2i64 (build_vector (i64 I64:$x0), (i64 I64:$x1))),
(v2i64 (REPLACE_LANE_v2i64
(v2i64 (SPLAT_v2i64 (i64 I64:$x0))), 1, I64:$x1))>;
// v4f32: splat operand $x0 into every lane, then overwrite lanes 1-3 with
// $x1-$x3 through three nested REPLACE_LANE_v4f32 nodes (lane 1 innermost).
def : Pat<(v4f32 (build_vector
(f32 F32:$x0), (f32 F32:$x1), (f32 F32:$x2), (f32 F32:$x3)
)),
(v4f32 (REPLACE_LANE_v4f32
(v4f32 (REPLACE_LANE_v4f32
(v4f32 (REPLACE_LANE_v4f32
(v4f32 (SPLAT_v4f32 (f32 F32:$x0))),
1, F32:$x1
)),
2, F32:$x2
)),
3, F32:$x3
))>;
// v2f64: splat operand $x0 into both lanes, then overwrite lane 1 with $x1.
def : Pat<(v2f64 (build_vector (f64 F64:$x0), (f64 F64:$x1))),
(v2f64 (REPLACE_LANE_v2f64
(v2f64 (SPLAT_v2f64 (f64 F64:$x0))), 1, F64:$x1))>;
//===----------------------------------------------------------------------===//
// Comparisons
//===----------------------------------------------------------------------===//

View File

@ -0,0 +1,127 @@
; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+unimplemented-simd128 | FileCheck %s
; Test that the logic to choose between v128.const vector
; initialization and splat vector initialization and to optimize the
; choice of splat value works correctly.
target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
; Every lane holds the constant 42 except lane 5, which is overwritten by the
; argument. Expected output: splat the shared constant, then a single
; replace_lane for lane 5.
; NOTE(review): the name says i8x16, but the vector built here is <8 x i16>.
; CHECK-LABEL: same_const_one_replaced_i8x16:
; CHECK-NEXT: .functype same_const_one_replaced_i8x16 (i32) -> (v128)
; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 42
; CHECK-NEXT: i16x8.splat $push[[L1:[0-9]+]]=, $pop[[L0]]
; CHECK-NEXT: i16x8.replace_lane $push[[L2:[0-9]+]]=, $pop[[L1]], 5, $0
; CHECK-NEXT: return $pop[[L2]]
define <8 x i16> @same_const_one_replaced_i8x16(i16 %x) {
%v = insertelement
<8 x i16> <i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42>,
i16 %x,
i32 5
ret <8 x i16> %v
}
; Eight distinct constants with lane 5 overwritten by the argument. Expected
; output: one v128.const carrying the constants (0 in the overwritten lane),
; then a single replace_lane for lane 5.
; NOTE(review): the name says i8x16, but the vector built here is <8 x i16>.
; CHECK-LABEL: different_const_one_replaced_i8x16:
; CHECK-NEXT: .functype different_const_one_replaced_i8x16 (i32) -> (v128)
; CHECK-NEXT: v128.const $push[[L0:[0-9]+]]=, 1, 2, 3, 4, 5, 0, 7, 8
; CHECK-NEXT: i16x8.replace_lane $push[[L1:[0-9]+]]=, $pop[[L0]], 5, $0
; CHECK-NEXT: return $pop[[L1]]
define <8 x i16> @different_const_one_replaced_i8x16(i16 %x) {
%v = insertelement
<8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>,
i16 %x,
i32 5
ret <8 x i16> %v
}
; Floating-point variant: every lane holds 42.0 except lane 2, which is
; overwritten by the argument. Expected output: splat the shared constant,
; then a single replace_lane for lane 2.
; CHECK-LABEL: same_const_one_replaced_f32x4:
; CHECK-NEXT: .functype same_const_one_replaced_f32x4 (f32) -> (v128)
; CHECK-NEXT: f32.const $push[[L0:[0-9]+]]=, 0x1.5p5
; CHECK-NEXT: f32x4.splat $push[[L1:[0-9]+]]=, $pop[[L0]]
; CHECK-NEXT: f32x4.replace_lane $push[[L2:[0-9]+]]=, $pop[[L1]], 2, $0
; CHECK-NEXT: return $pop[[L2]]
define <4 x float> @same_const_one_replaced_f32x4(float %x) {
%v = insertelement
<4 x float> <float 42., float 42., float 42., float 42.>,
float %x,
i32 2
ret <4 x float> %v
}
; Four distinct float constants with lane 2 overwritten by the argument.
; Expected output: one v128.const carrying the constants (zero in the
; overwritten lane), then a single replace_lane for lane 2.
; CHECK-LABEL: different_const_one_replaced_f32x4:
; CHECK-NEXT: .functype different_const_one_replaced_f32x4 (f32) -> (v128)
; CHECK-NEXT: v128.const $push[[L0:[0-9]+]]=, 0x1p0, 0x1p1, 0x0p0, 0x1p2
; CHECK-NEXT: f32x4.replace_lane $push[[L1:[0-9]+]]=, $pop[[L0]], 2, $0
; CHECK-NEXT: return $pop[[L1]]
define <4 x float> @different_const_one_replaced_f32x4(float %x) {
%v = insertelement
<4 x float> <float 1., float 2., float 3., float 4.>,
float %x,
i32 2
ret <4 x float> %v
}
; All-constant vector <undef, 3, 3, 1>. Expected output: splat the most common
; value (3) and replace only lane 3 with the constant 1; no instruction is
; expected for the undef lane 0.
; CHECK-LABEL: splat_common_const_i32x4:
; CHECK-NEXT: .functype splat_common_const_i32x4 () -> (v128)
; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 3
; CHECK-NEXT: i32x4.splat $push[[L1:[0-9]+]]=, $pop[[L0]]
; CHECK-NEXT: i32.const $push[[L2:[0-9]+]]=, 1
; CHECK-NEXT: i32x4.replace_lane $push[[L3:[0-9]+]]=, $pop[[L1]], 3, $pop[[L2]]
; CHECK-NEXT: return $pop[[L3]]
define <4 x i32> @splat_common_const_i32x4() {
ret <4 x i32> <i32 undef, i32 3, i32 3, i32 1>
}
; Vector built only from arguments: %c fills four lanes (1, 3, 5, 6), %b three
; lanes (0, 4, 7) and %a one lane (2). Expected output: splat the most common
; argument (%c, i.e. $2) and emit replace_lanes only for the %b and %a lanes.
; CHECK-LABEL: splat_common_arg_i16x8:
; CHECK-NEXT: .functype splat_common_arg_i16x8 (i32, i32, i32) -> (v128)
; CHECK-NEXT: i16x8.splat $push[[L0:[0-9]+]]=, $2
; CHECK-NEXT: i16x8.replace_lane $push[[L1:[0-9]+]]=, $pop[[L0]], 0, $1
; CHECK-NEXT: i16x8.replace_lane $push[[L2:[0-9]+]]=, $pop[[L1]], 2, $0
; CHECK-NEXT: i16x8.replace_lane $push[[L3:[0-9]+]]=, $pop[[L2]], 4, $1
; CHECK-NEXT: i16x8.replace_lane $push[[L4:[0-9]+]]=, $pop[[L3]], 7, $1
; CHECK-NEXT: return $pop[[L4]]
define <8 x i16> @splat_common_arg_i16x8(i16 %a, i16 %b, i16 %c) {
%v0 = insertelement <8 x i16> undef, i16 %b, i32 0
%v1 = insertelement <8 x i16> %v0, i16 %c, i32 1
%v2 = insertelement <8 x i16> %v1, i16 %a, i32 2
%v3 = insertelement <8 x i16> %v2, i16 %c, i32 3
%v4 = insertelement <8 x i16> %v3, i16 %b, i32 4
%v5 = insertelement <8 x i16> %v4, i16 %c, i32 5
%v6 = insertelement <8 x i16> %v5, i16 %c, i32 6
%v7 = insertelement <8 x i16> %v6, i16 %b, i32 7
ret <8 x i16> %v7
}
; A single constant inserted into an otherwise undef vector. Expected output:
; just a splat of that constant, with no replace_lane.
; CHECK-LABEL: undef_const_insert_f32x4:
; CHECK-NEXT: .functype undef_const_insert_f32x4 () -> (v128)
; CHECK-NEXT: f32.const $push[[L0:[0-9]+]]=, 0x1.5p5
; CHECK-NEXT: f32x4.splat $push[[L1:[0-9]+]]=, $pop[[L0]]
; CHECK-NEXT: return $pop[[L1]]
define <4 x float> @undef_const_insert_f32x4() {
%v = insertelement <4 x float> undef, float 42., i32 1
ret <4 x float> %v
}
; A single argument inserted into an otherwise undef vector. Expected output:
; just a splat of the argument, with no replace_lane.
; CHECK-LABEL: undef_arg_insert_i32x4:
; CHECK-NEXT: .functype undef_arg_insert_i32x4 (i32) -> (v128)
; CHECK-NEXT: i32x4.splat $push[[L0:[0-9]+]]=, $0
; CHECK-NEXT: return $pop[[L0]]
define <4 x i32> @undef_arg_insert_i32x4(i32 %x) {
%v = insertelement <4 x i32> undef, i32 %x, i32 3
ret <4 x i32> %v
}
; An undef element inserted into an undef vector. Expected output: no splat,
; constant, or replace_lane instruction, only the return.
; CHECK-LABEL: all_undef_i8x16:
; CHECK-NEXT: .functype all_undef_i8x16 () -> (v128)
; CHECK-NEXT: return $0
define <16 x i8> @all_undef_i8x16() {
%v = insertelement <16 x i8> undef, i8 undef, i32 4
ret <16 x i8> %v
}
; A literal undef <2 x double> vector. Expected output: no vector-building
; instruction, only the return.
; CHECK-LABEL: all_undef_f64x2:
; CHECK-NEXT: .functype all_undef_f64x2 () -> (v128)
; CHECK-NEXT: return $0
define <2 x double> @all_undef_f64x2() {
ret <2 x double> undef
}