[WebAssembly] Restore builtins and intrinsics for pmin/pmax

Partially reverts 85157c0079, which had removed these builtins and intrinsics
in favor of normal codegen patterns. However, those patterns can end up split
across multiple basic blocks, and because DAG ISel selects one basic block at
a time, it is then unable to select them to the pmin/pmax instructions. To
make sure the SIMD intrinsics generate the correct instructions in these
cases, reintroduce the clang builtins and corresponding LLVM intrinsics, while
keeping the normal pattern matching.

Differential Revision: https://reviews.llvm.org/D108387
This commit is contained in:
Thomas Lively 2021-08-20 09:21:31 -07:00
parent 24ea94ad0c
commit 88962cea46
8 changed files with 123 additions and 22 deletions

View File

@ -129,8 +129,12 @@ TARGET_BUILTIN(__builtin_wasm_abs_f64x2, "V2dV2d", "nc", "simd128")
TARGET_BUILTIN(__builtin_wasm_min_f32x4, "V4fV4fV4f", "nc", "simd128")
TARGET_BUILTIN(__builtin_wasm_max_f32x4, "V4fV4fV4f", "nc", "simd128")
TARGET_BUILTIN(__builtin_wasm_pmin_f32x4, "V4fV4fV4f", "nc", "simd128")
TARGET_BUILTIN(__builtin_wasm_pmax_f32x4, "V4fV4fV4f", "nc", "simd128")
TARGET_BUILTIN(__builtin_wasm_min_f64x2, "V2dV2dV2d", "nc", "simd128")
TARGET_BUILTIN(__builtin_wasm_max_f64x2, "V2dV2dV2d", "nc", "simd128")
TARGET_BUILTIN(__builtin_wasm_pmin_f64x2, "V2dV2dV2d", "nc", "simd128")
TARGET_BUILTIN(__builtin_wasm_pmax_f64x2, "V2dV2dV2d", "nc", "simd128")
TARGET_BUILTIN(__builtin_wasm_ceil_f32x4, "V4fV4f", "nc", "simd128")
TARGET_BUILTIN(__builtin_wasm_floor_f32x4, "V4fV4f", "nc", "simd128")

View File

@ -17822,6 +17822,22 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID,
CGM.getIntrinsic(Intrinsic::maximum, ConvertType(E->getType()));
return Builder.CreateCall(Callee, {LHS, RHS});
}
case WebAssembly::BI__builtin_wasm_pmin_f32x4:
case WebAssembly::BI__builtin_wasm_pmin_f64x2: {
  // Both pseudo-minimum builtins map onto llvm.wasm.pmin, overloaded on the
  // builtin's own result type; operands are emitted left to right.
  Value *Operands[] = {EmitScalarExpr(E->getArg(0)),
                       EmitScalarExpr(E->getArg(1))};
  Function *PMin =
      CGM.getIntrinsic(Intrinsic::wasm_pmin, ConvertType(E->getType()));
  return Builder.CreateCall(PMin, Operands);
}
case WebAssembly::BI__builtin_wasm_pmax_f32x4:
case WebAssembly::BI__builtin_wasm_pmax_f64x2: {
  // Both pseudo-maximum builtins map onto llvm.wasm.pmax, overloaded on the
  // builtin's own result type; operands are emitted left to right.
  Value *Operands[] = {EmitScalarExpr(E->getArg(0)),
                       EmitScalarExpr(E->getArg(1))};
  Function *PMax =
      CGM.getIntrinsic(Intrinsic::wasm_pmax, ConvertType(E->getType()));
  return Builder.CreateCall(PMax, Operands);
}
case WebAssembly::BI__builtin_wasm_ceil_f32x4:
case WebAssembly::BI__builtin_wasm_floor_f32x4:
case WebAssembly::BI__builtin_wasm_trunc_f32x4:

View File

@ -1297,14 +1297,12 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f32x4_max(v128_t __a,
// Lane-wise pseudo-minimum of two f32x4 vectors: the mask/select form below
// yields __b in lanes where __b < __a and __a in all other lanes.
// NOTE(review): this listing appears to carry both the pre-change mask/select
// lines and the builtin call that replaces them; confirm which the file keeps.
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f32x4_pmin(v128_t __a,
v128_t __b) {
__i32x4 __mask = (__i32x4)((__f32x4)__b < (__f32x4)__a);
return (v128_t)((((__i32x4)__b) & __mask) | (((__i32x4)__a) & ~__mask));
return (v128_t)__builtin_wasm_pmin_f32x4((__f32x4)__a, (__f32x4)__b);
}
// Lane-wise pseudo-maximum of two f32x4 vectors: the mask/select form below
// yields __b in lanes where __a < __b and __a in all other lanes.
// NOTE(review): this listing appears to carry both the pre-change mask/select
// lines and the builtin call that replaces them; confirm which the file keeps.
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f32x4_pmax(v128_t __a,
v128_t __b) {
__i32x4 __mask = (__i32x4)((__f32x4)__a < (__f32x4)__b);
return (v128_t)((((__i32x4)__b) & __mask) | (((__i32x4)__a) & ~__mask));
return (v128_t)__builtin_wasm_pmax_f32x4((__f32x4)__a, (__f32x4)__b);
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f64x2_abs(v128_t __a) {
@ -1367,14 +1365,12 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f64x2_max(v128_t __a,
// Lane-wise pseudo-minimum of two f64x2 vectors: the mask/select form below
// yields __b in lanes where __b < __a and __a in all other lanes.
// NOTE(review): this listing appears to carry both the pre-change mask/select
// lines and the builtin call that replaces them; confirm which the file keeps.
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f64x2_pmin(v128_t __a,
v128_t __b) {
__i64x2 __mask = (__i64x2)((__f64x2)__b < (__f64x2)__a);
return (v128_t)((((__i64x2)__b) & __mask) | (((__i64x2)__a) & ~__mask));
return (v128_t)__builtin_wasm_pmin_f64x2((__f64x2)__a, (__f64x2)__b);
}
// Lane-wise pseudo-maximum of two f64x2 vectors: the mask/select form below
// yields __b in lanes where __a < __b and __a in all other lanes.
// NOTE(review): this listing appears to carry both the pre-change mask/select
// lines and the builtin call that replaces them; confirm which the file keeps.
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f64x2_pmax(v128_t __a,
v128_t __b) {
__i64x2 __mask = (__i64x2)((__f64x2)__a < (__f64x2)__b);
return (v128_t)((((__i64x2)__b) & __mask) | (((__i64x2)__a) & ~__mask));
return (v128_t)__builtin_wasm_pmax_f64x2((__f64x2)__a, (__f64x2)__b);
}
static __inline__ v128_t __DEFAULT_FN_ATTRS

View File

@ -506,6 +506,20 @@ f32x4 max_f32x4(f32x4 x, f32x4 y) {
// WEBASSEMBLY-NEXT: ret
}
// The f32x4 pmin builtin must be emitted as a direct call to the
// llvm.wasm.pmin.v4f32 intrinsic, with x and y passed in source order.
f32x4 pmin_f32x4(f32x4 x, f32x4 y) {
return __builtin_wasm_pmin_f32x4(x, y);
// WEBASSEMBLY: call <4 x float> @llvm.wasm.pmin.v4f32(
// WEBASSEMBLY-SAME: <4 x float> %x, <4 x float> %y)
// WEBASSEMBLY-NEXT: ret
}
// The f32x4 pmax builtin must be emitted as a direct call to the
// llvm.wasm.pmax.v4f32 intrinsic, with x and y passed in source order.
f32x4 pmax_f32x4(f32x4 x, f32x4 y) {
return __builtin_wasm_pmax_f32x4(x, y);
// WEBASSEMBLY: call <4 x float> @llvm.wasm.pmax.v4f32(
// WEBASSEMBLY-SAME: <4 x float> %x, <4 x float> %y)
// WEBASSEMBLY-NEXT: ret
}
f64x2 min_f64x2(f64x2 x, f64x2 y) {
return __builtin_wasm_min_f64x2(x, y);
// WEBASSEMBLY: call <2 x double> @llvm.minimum.v2f64(
@ -520,6 +534,20 @@ f64x2 max_f64x2(f64x2 x, f64x2 y) {
// WEBASSEMBLY-NEXT: ret
}
// The f64x2 pmin builtin must be emitted as a direct call to the
// llvm.wasm.pmin.v2f64 intrinsic, with x and y passed in source order.
f64x2 pmin_f64x2(f64x2 x, f64x2 y) {
return __builtin_wasm_pmin_f64x2(x, y);
// WEBASSEMBLY: call <2 x double> @llvm.wasm.pmin.v2f64(
// WEBASSEMBLY-SAME: <2 x double> %x, <2 x double> %y)
// WEBASSEMBLY-NEXT: ret
}
// The f64x2 pmax builtin must be emitted as a direct call to the
// llvm.wasm.pmax.v2f64 intrinsic, with x and y passed in source order.
f64x2 pmax_f64x2(f64x2 x, f64x2 y) {
return __builtin_wasm_pmax_f64x2(x, y);
// WEBASSEMBLY: call <2 x double> @llvm.wasm.pmax.v2f64(
// WEBASSEMBLY-SAME: <2 x double> %x, <2 x double> %y)
// WEBASSEMBLY-NEXT: ret
}
f32x4 ceil_f32x4(f32x4 x) {
return __builtin_wasm_ceil_f32x4(x);
// WEBASSEMBLY: call <4 x float> @llvm.ceil.v4f32(<4 x float> %x)

View File

@ -2424,11 +2424,11 @@ v128_t test_f32x4_max(v128_t a, v128_t b) {
// CHECK-LABEL: @test_f32x4_pmin(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B:%.*]] to <4 x float>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float>
// CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt <4 x float> [[TMP0]], [[TMP1]]
// CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[CMP_I]], <4 x i32> [[B]], <4 x i32> [[A]]
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <4 x float>
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.wasm.pmin.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) #[[ATTR6]]
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
v128_t test_f32x4_pmin(v128_t a, v128_t b) {
return wasm_f32x4_pmin(a, b);
@ -2438,9 +2438,9 @@ v128_t test_f32x4_pmin(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <4 x float>
// CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt <4 x float> [[TMP0]], [[TMP1]]
// CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[CMP_I]], <4 x i32> [[B]], <4 x i32> [[A]]
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.wasm.pmax.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) #[[ATTR6]]
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
v128_t test_f32x4_pmax(v128_t a, v128_t b) {
return wasm_f32x4_pmax(a, b);
@ -2597,10 +2597,9 @@ v128_t test_f64x2_max(v128_t a, v128_t b) {
// CHECK-LABEL: @test_f64x2_pmin(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B:%.*]] to <2 x double>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double>
// CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt <2 x double> [[TMP0]], [[TMP1]]
// CHECK-NEXT: [[TMP2:%.*]] = select <2 x i1> [[CMP_I]], <2 x double> [[TMP0]], <2 x double> [[TMP1]]
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <2 x double>
// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.wasm.pmin.v2f64(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) #[[ATTR6]]
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@ -2612,8 +2611,7 @@ v128_t test_f64x2_pmin(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <2 x double>
// CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt <2 x double> [[TMP0]], [[TMP1]]
// CHECK-NEXT: [[TMP2:%.*]] = select <2 x i1> [[CMP_I]], <2 x double> [[TMP1]], <2 x double> [[TMP0]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.wasm.pmax.v2f64(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) #[[ATTR6]]
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//

View File

@ -164,6 +164,15 @@ def int_wasm_q15mulr_sat_signed :
[llvm_v8i16_ty, llvm_v8i16_ty],
[IntrNoMem, IntrSpeculatable]>;
// Pseudo-minimum intrinsic. Overloaded on any vector type; both operands have
// the same type as the result. Marked as not accessing memory and as
// speculatable.
def int_wasm_pmin :
Intrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>, LLVMMatchType<0>],
[IntrNoMem, IntrSpeculatable]>;
// Pseudo-maximum intrinsic. Overloaded on any vector type; both operands have
// the same type as the result. Marked as not accessing memory and as
// speculatable.
def int_wasm_pmax :
Intrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>, LLVMMatchType<0>],
[IntrNoMem, IntrSpeculatable]>;
def int_wasm_extadd_pairwise_signed :
Intrinsic<[llvm_anyvector_ty],
[LLVMSubdivide2VectorType<0>],

View File

@ -1175,6 +1175,16 @@ def : Pat<(vec.int_vt (vselect
(pmax $lhs, $rhs)>;
}
// Also select the pmin/pmax LLVM intrinsics directly to the PMIN/PMAX
// instructions, for both v4f32 and v2f64, keeping the operand order.
def : Pat<(v4f32 (int_wasm_pmin (v4f32 V128:$lhs), (v4f32 V128:$rhs))),
(PMIN_F32x4 V128:$lhs, V128:$rhs)>;
def : Pat<(v4f32 (int_wasm_pmax (v4f32 V128:$lhs), (v4f32 V128:$rhs))),
(PMAX_F32x4 V128:$lhs, V128:$rhs)>;
def : Pat<(v2f64 (int_wasm_pmin (v2f64 V128:$lhs), (v2f64 V128:$rhs))),
(PMIN_F64x2 V128:$lhs, V128:$rhs)>;
def : Pat<(v2f64 (int_wasm_pmax (v2f64 V128:$lhs), (v2f64 V128:$rhs))),
(PMAX_F64x2 V128:$lhs, V128:$rhs)>;
//===----------------------------------------------------------------------===//
// Conversions
//===----------------------------------------------------------------------===//

View File

@ -540,6 +540,26 @@ define <4 x float> @bitselect_v4f32(<4 x float> %v1, <4 x float> %v2, <4 x float
ret <4 x float> %a
}
; A direct call to the pmin intrinsic on <4 x float> is expected to select a
; single f32x4.pmin with the operands in their original order.
; CHECK-LABEL: pmin_v4f32:
; CHECK-NEXT: .functype pmin_v4f32 (v128, v128) -> (v128){{$}}
; CHECK-NEXT: f32x4.pmin $push[[R:[0-9]+]]=, $0, $1{{$}}
; CHECK-NEXT: return $pop[[R]]{{$}}
declare <4 x float> @llvm.wasm.pmin.v4f32(<4 x float>, <4 x float>)
define <4 x float> @pmin_v4f32(<4 x float> %a, <4 x float> %b) {
%v = call <4 x float> @llvm.wasm.pmin.v4f32(<4 x float> %a, <4 x float> %b)
ret <4 x float> %v
}
; A direct call to the pmax intrinsic on <4 x float> is expected to select a
; single f32x4.pmax with the operands in their original order.
; CHECK-LABEL: pmax_v4f32:
; CHECK-NEXT: .functype pmax_v4f32 (v128, v128) -> (v128){{$}}
; CHECK-NEXT: f32x4.pmax $push[[R:[0-9]+]]=, $0, $1{{$}}
; CHECK-NEXT: return $pop[[R]]{{$}}
declare <4 x float> @llvm.wasm.pmax.v4f32(<4 x float>, <4 x float>)
define <4 x float> @pmax_v4f32(<4 x float> %a, <4 x float> %b) {
%v = call <4 x float> @llvm.wasm.pmax.v4f32(<4 x float> %a, <4 x float> %b)
ret <4 x float> %v
}
; CHECK-LABEL: ceil_v4f32:
; CHECK-NEXT: .functype ceil_v4f32 (v128) -> (v128){{$}}
; CHECK-NEXT: f32x4.ceil $push[[R:[0-9]+]]=, $0{{$}}
@ -595,6 +615,26 @@ define <2 x double> @bitselect_v2f64(<2 x double> %v1, <2 x double> %v2, <2 x do
ret <2 x double> %a
}
; A direct call to the pmin intrinsic on <2 x double> is expected to select a
; single f64x2.pmin with the operands in their original order.
; CHECK-LABEL: pmin_v2f64:
; CHECK-NEXT: .functype pmin_v2f64 (v128, v128) -> (v128){{$}}
; CHECK-NEXT: f64x2.pmin $push[[R:[0-9]+]]=, $0, $1{{$}}
; CHECK-NEXT: return $pop[[R]]{{$}}
declare <2 x double> @llvm.wasm.pmin.v2f64(<2 x double>, <2 x double>)
define <2 x double> @pmin_v2f64(<2 x double> %a, <2 x double> %b) {
%v = call <2 x double> @llvm.wasm.pmin.v2f64(<2 x double> %a, <2 x double> %b)
ret <2 x double> %v
}
; A direct call to the pmax intrinsic on <2 x double> is expected to select a
; single f64x2.pmax with the operands in their original order.
; CHECK-LABEL: pmax_v2f64:
; CHECK-NEXT: .functype pmax_v2f64 (v128, v128) -> (v128){{$}}
; CHECK-NEXT: f64x2.pmax $push[[R:[0-9]+]]=, $0, $1{{$}}
; CHECK-NEXT: return $pop[[R]]{{$}}
declare <2 x double> @llvm.wasm.pmax.v2f64(<2 x double>, <2 x double>)
define <2 x double> @pmax_v2f64(<2 x double> %a, <2 x double> %b) {
%v = call <2 x double> @llvm.wasm.pmax.v2f64(<2 x double> %a, <2 x double> %b)
ret <2 x double> %v
}
; CHECK-LABEL: ceil_v2f64:
; CHECK-NEXT: .functype ceil_v2f64 (v128) -> (v128){{$}}
; CHECK-NEXT: f64x2.ceil $push[[R:[0-9]+]]=, $0{{$}}