forked from OSchip/llvm-project
[X86] Replace avx2 broadcast intrinsics with native IR.
Since r245605, the clang headers don't use these anymore. r245165 updated some of the tests already; update the others, add an autoupgrade, remove the intrinsics, and clean up the definitions. Differential Revision: http://reviews.llvm.org/D10555 llvm-svn: 245606
This commit is contained in:
parent
5e354cb547
commit
1a498705e4
|
@ -2167,39 +2167,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
|
|||
|
||||
// Vector load with broadcast
|
||||
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
|
||||
def int_x86_avx2_vbroadcast_ss_ps :
|
||||
GCCBuiltin<"__builtin_ia32_vbroadcastss_ps">,
|
||||
Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
|
||||
def int_x86_avx2_vbroadcast_sd_pd_256 :
|
||||
GCCBuiltin<"__builtin_ia32_vbroadcastsd_pd256">,
|
||||
Intrinsic<[llvm_v4f64_ty], [llvm_v2f64_ty], [IntrNoMem]>;
|
||||
def int_x86_avx2_vbroadcast_ss_ps_256 :
|
||||
GCCBuiltin<"__builtin_ia32_vbroadcastss_ps256">,
|
||||
Intrinsic<[llvm_v8f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
|
||||
def int_x86_avx2_pbroadcastb_128 :
|
||||
GCCBuiltin<"__builtin_ia32_pbroadcastb128">,
|
||||
Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty], [IntrNoMem]>;
|
||||
def int_x86_avx2_pbroadcastb_256 :
|
||||
GCCBuiltin<"__builtin_ia32_pbroadcastb256">,
|
||||
Intrinsic<[llvm_v32i8_ty], [llvm_v16i8_ty], [IntrNoMem]>;
|
||||
def int_x86_avx2_pbroadcastw_128 :
|
||||
GCCBuiltin<"__builtin_ia32_pbroadcastw128">,
|
||||
Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty], [IntrNoMem]>;
|
||||
def int_x86_avx2_pbroadcastw_256 :
|
||||
GCCBuiltin<"__builtin_ia32_pbroadcastw256">,
|
||||
Intrinsic<[llvm_v16i16_ty], [llvm_v8i16_ty], [IntrNoMem]>;
|
||||
def int_x86_avx2_pbroadcastd_128 :
|
||||
GCCBuiltin<"__builtin_ia32_pbroadcastd128">,
|
||||
Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty], [IntrNoMem]>;
|
||||
def int_x86_avx2_pbroadcastd_256 :
|
||||
GCCBuiltin<"__builtin_ia32_pbroadcastd256">,
|
||||
Intrinsic<[llvm_v8i32_ty], [llvm_v4i32_ty], [IntrNoMem]>;
|
||||
def int_x86_avx2_pbroadcastq_128 :
|
||||
GCCBuiltin<"__builtin_ia32_pbroadcastq128">,
|
||||
Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty], [IntrNoMem]>;
|
||||
def int_x86_avx2_pbroadcastq_256 :
|
||||
GCCBuiltin<"__builtin_ia32_pbroadcastq256">,
|
||||
Intrinsic<[llvm_v4i64_ty], [llvm_v2i64_ty], [IntrNoMem]>;
|
||||
def int_x86_avx512_mask_pbroadcast_d_gpr_512 :
|
||||
GCCBuiltin<"__builtin_ia32_pbroadcastd512_gpr_mask">,
|
||||
Intrinsic<[llvm_v16i32_ty], [llvm_i32_ty, llvm_v16i32_ty,
|
||||
|
|
|
@ -129,6 +129,8 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
|
|||
Name.startswith("x86.sse2.pcmpgt.") ||
|
||||
Name.startswith("x86.avx2.pcmpeq.") ||
|
||||
Name.startswith("x86.avx2.pcmpgt.") ||
|
||||
Name.startswith("x86.avx2.vbroadcast") ||
|
||||
Name.startswith("x86.avx2.pbroadcast") ||
|
||||
Name.startswith("x86.avx.vpermil.") ||
|
||||
Name == "x86.avx.vinsertf128.pd.256" ||
|
||||
Name == "x86.avx.vinsertf128.ps.256" ||
|
||||
|
@ -447,6 +449,14 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
|
|||
const int Idxs[4] = { 0, 1, 0, 1 };
|
||||
Rep = Builder.CreateShuffleVector(Load, UndefValue::get(Load->getType()),
|
||||
Idxs);
|
||||
} else if (Name.startswith("llvm.x86.avx2.pbroadcast") ||
|
||||
Name.startswith("llvm.x86.avx2.vbroadcast")) {
|
||||
// Replace vp?broadcasts with a vector shuffle.
|
||||
Value *Op = CI->getArgOperand(0);
|
||||
unsigned NumElts = CI->getType()->getVectorNumElements();
|
||||
Type *MaskTy = VectorType::get(Type::getInt32Ty(C), NumElts);
|
||||
Rep = Builder.CreateShuffleVector(Op, UndefValue::get(Op->getType()),
|
||||
Constant::getNullValue(MaskTy));
|
||||
} else if (Name == "llvm.x86.sse2.psll.dq") {
|
||||
// 128-bit shift left specified in bits.
|
||||
unsigned Shift = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
|
||||
|
|
|
@ -7823,13 +7823,7 @@ def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
|
|||
// VBROADCAST - Load from memory and broadcast to all elements of the
|
||||
// destination operand
|
||||
//
|
||||
class avx_broadcast<bits<8> opc, string OpcodeStr, RegisterClass RC,
|
||||
X86MemOperand x86memop, Intrinsic Int, SchedWrite Sched> :
|
||||
AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
|
||||
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
|
||||
[(set RC:$dst, (Int addr:$src))]>, Sched<[Sched]>, VEX;
|
||||
|
||||
class avx_broadcast_no_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
|
||||
class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC,
|
||||
X86MemOperand x86memop, ValueType VT,
|
||||
PatFrag ld_frag, SchedWrite Sched> :
|
||||
AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
|
||||
|
@ -7840,38 +7834,33 @@ class avx_broadcast_no_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
|
|||
}
|
||||
|
||||
// AVX2 adds register forms
|
||||
class avx2_broadcast_reg<bits<8> opc, string OpcodeStr, RegisterClass RC,
|
||||
Intrinsic Int, SchedWrite Sched> :
|
||||
class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC,
|
||||
ValueType ResVT, ValueType OpVT, SchedWrite Sched> :
|
||||
AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
|
||||
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
|
||||
[(set RC:$dst, (Int VR128:$src))]>, Sched<[Sched]>, VEX;
|
||||
[(set RC:$dst, (ResVT (X86VBroadcast (OpVT VR128:$src))))]>,
|
||||
Sched<[Sched]>, VEX;
|
||||
|
||||
let ExeDomain = SSEPackedSingle in {
|
||||
def VBROADCASTSSrm : avx_broadcast_no_int<0x18, "vbroadcastss", VR128,
|
||||
def VBROADCASTSSrm : avx_broadcast_rm<0x18, "vbroadcastss", VR128,
|
||||
f32mem, v4f32, loadf32, WriteLoad>;
|
||||
def VBROADCASTSSYrm : avx_broadcast_no_int<0x18, "vbroadcastss", VR256,
|
||||
def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256,
|
||||
f32mem, v8f32, loadf32,
|
||||
WriteFShuffleLd>, VEX_L;
|
||||
}
|
||||
let ExeDomain = SSEPackedDouble in
|
||||
def VBROADCASTSDYrm : avx_broadcast_no_int<0x19, "vbroadcastsd", VR256, f64mem,
|
||||
def VBROADCASTSDYrm : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem,
|
||||
v4f64, loadf64, WriteFShuffleLd>, VEX_L;
|
||||
def VBROADCASTF128 : avx_broadcast<0x1A, "vbroadcastf128", VR256, f128mem,
|
||||
int_x86_avx_vbroadcastf128_pd_256,
|
||||
WriteFShuffleLd>, VEX_L;
|
||||
|
||||
let ExeDomain = SSEPackedSingle in {
|
||||
def VBROADCASTSSrr : avx2_broadcast_reg<0x18, "vbroadcastss", VR128,
|
||||
int_x86_avx2_vbroadcast_ss_ps,
|
||||
WriteFShuffle>;
|
||||
def VBROADCASTSSYrr : avx2_broadcast_reg<0x18, "vbroadcastss", VR256,
|
||||
int_x86_avx2_vbroadcast_ss_ps_256,
|
||||
WriteFShuffle256>, VEX_L;
|
||||
def VBROADCASTSSrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR128,
|
||||
v4f32, v4f32, WriteFShuffle>;
|
||||
def VBROADCASTSSYrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR256,
|
||||
v8f32, v4f32, WriteFShuffle256>, VEX_L;
|
||||
}
|
||||
let ExeDomain = SSEPackedDouble in
|
||||
def VBROADCASTSDYrr : avx2_broadcast_reg<0x19, "vbroadcastsd", VR256,
|
||||
int_x86_avx2_vbroadcast_sd_pd_256,
|
||||
WriteFShuffle256>, VEX_L;
|
||||
def VBROADCASTSDYrr : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256,
|
||||
v4f64, v2f64, WriteFShuffle256>, VEX_L;
|
||||
|
||||
let mayLoad = 1, Predicates = [HasAVX2] in
|
||||
def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst),
|
||||
|
@ -7879,6 +7868,13 @@ def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst),
|
|||
"vbroadcasti128\t{$src, $dst|$dst, $src}", []>,
|
||||
Sched<[WriteLoad]>, VEX, VEX_L;
|
||||
|
||||
def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst),
|
||||
(ins f128mem:$src),
|
||||
"vbroadcastf128\t{$src, $dst|$dst, $src}",
|
||||
[(set VR256:$dst,
|
||||
(int_x86_avx_vbroadcastf128_pd_256 addr:$src))]>,
|
||||
Sched<[WriteFShuffleLd]>, VEX, VEX_L;
|
||||
|
||||
let Predicates = [HasAVX] in
|
||||
def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src),
|
||||
(VBROADCASTF128 addr:$src)>;
|
||||
|
@ -8317,83 +8313,31 @@ defm VPBLENDDY : AVX2_binop_rmi<0x02, "vpblendd", X86Blendi, v8i32,
|
|||
//
|
||||
multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
|
||||
X86MemOperand x86memop, PatFrag ld_frag,
|
||||
Intrinsic Int128, Intrinsic Int256> {
|
||||
ValueType OpVT128, ValueType OpVT256> {
|
||||
def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
|
||||
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
|
||||
[(set VR128:$dst, (Int128 VR128:$src))]>,
|
||||
[(set VR128:$dst, (OpVT128 (X86VBroadcast (OpVT128 VR128:$src))))]>,
|
||||
Sched<[WriteShuffle]>, VEX;
|
||||
def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
|
||||
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
|
||||
[(set VR128:$dst,
|
||||
(Int128 (scalar_to_vector (ld_frag addr:$src))))]>,
|
||||
[(set VR128:$dst, (OpVT128 (X86VBroadcast (ld_frag addr:$src))))]>,
|
||||
Sched<[WriteLoad]>, VEX;
|
||||
def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
|
||||
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
|
||||
[(set VR256:$dst, (Int256 VR128:$src))]>,
|
||||
[(set VR256:$dst, (OpVT256 (X86VBroadcast (OpVT128 VR128:$src))))]>,
|
||||
Sched<[WriteShuffle256]>, VEX, VEX_L;
|
||||
def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src),
|
||||
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
|
||||
[(set VR256:$dst,
|
||||
(Int256 (scalar_to_vector (ld_frag addr:$src))))]>,
|
||||
[(set VR256:$dst, (OpVT256 (X86VBroadcast (ld_frag addr:$src))))]>,
|
||||
Sched<[WriteLoad]>, VEX, VEX_L;
|
||||
}
|
||||
|
||||
defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8,
|
||||
int_x86_avx2_pbroadcastb_128,
|
||||
int_x86_avx2_pbroadcastb_256>;
|
||||
defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, loadi16,
|
||||
int_x86_avx2_pbroadcastw_128,
|
||||
int_x86_avx2_pbroadcastw_256>;
|
||||
defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32,
|
||||
int_x86_avx2_pbroadcastd_128,
|
||||
int_x86_avx2_pbroadcastd_256>;
|
||||
defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64,
|
||||
int_x86_avx2_pbroadcastq_128,
|
||||
int_x86_avx2_pbroadcastq_256>;
|
||||
defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8, v16i8, v32i8>;
|
||||
defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, loadi16, v8i16, v16i16>;
|
||||
defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32, v4i32, v8i32>;
|
||||
defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64, v2i64, v4i64>;
|
||||
|
||||
let Predicates = [HasAVX2] in {
|
||||
def : Pat<(v16i8 (X86VBroadcast (loadi8 addr:$src))),
|
||||
(VPBROADCASTBrm addr:$src)>;
|
||||
def : Pat<(v32i8 (X86VBroadcast (loadi8 addr:$src))),
|
||||
(VPBROADCASTBYrm addr:$src)>;
|
||||
def : Pat<(v8i16 (X86VBroadcast (loadi16 addr:$src))),
|
||||
(VPBROADCASTWrm addr:$src)>;
|
||||
def : Pat<(v16i16 (X86VBroadcast (loadi16 addr:$src))),
|
||||
(VPBROADCASTWYrm addr:$src)>;
|
||||
def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
|
||||
(VPBROADCASTDrm addr:$src)>;
|
||||
def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
|
||||
(VPBROADCASTDYrm addr:$src)>;
|
||||
def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))),
|
||||
(VPBROADCASTQrm addr:$src)>;
|
||||
def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
|
||||
(VPBROADCASTQYrm addr:$src)>;
|
||||
|
||||
def : Pat<(v16i8 (X86VBroadcast (v16i8 VR128:$src))),
|
||||
(VPBROADCASTBrr VR128:$src)>;
|
||||
def : Pat<(v32i8 (X86VBroadcast (v16i8 VR128:$src))),
|
||||
(VPBROADCASTBYrr VR128:$src)>;
|
||||
def : Pat<(v8i16 (X86VBroadcast (v8i16 VR128:$src))),
|
||||
(VPBROADCASTWrr VR128:$src)>;
|
||||
def : Pat<(v16i16 (X86VBroadcast (v8i16 VR128:$src))),
|
||||
(VPBROADCASTWYrr VR128:$src)>;
|
||||
def : Pat<(v4i32 (X86VBroadcast (v4i32 VR128:$src))),
|
||||
(VPBROADCASTDrr VR128:$src)>;
|
||||
def : Pat<(v8i32 (X86VBroadcast (v4i32 VR128:$src))),
|
||||
(VPBROADCASTDYrr VR128:$src)>;
|
||||
def : Pat<(v2i64 (X86VBroadcast (v2i64 VR128:$src))),
|
||||
(VPBROADCASTQrr VR128:$src)>;
|
||||
def : Pat<(v4i64 (X86VBroadcast (v2i64 VR128:$src))),
|
||||
(VPBROADCASTQYrr VR128:$src)>;
|
||||
def : Pat<(v4f32 (X86VBroadcast (v4f32 VR128:$src))),
|
||||
(VBROADCASTSSrr VR128:$src)>;
|
||||
def : Pat<(v8f32 (X86VBroadcast (v4f32 VR128:$src))),
|
||||
(VBROADCASTSSYrr VR128:$src)>;
|
||||
def : Pat<(v2f64 (X86VBroadcast (v2f64 VR128:$src))),
|
||||
(VPBROADCASTQrr VR128:$src)>;
|
||||
def : Pat<(v4f64 (X86VBroadcast (v2f64 VR128:$src))),
|
||||
(VBROADCASTSDYrr VR128:$src)>;
|
||||
|
||||
// Provide aliases for broadcast from the same register class that
|
||||
// automatically does the extract.
|
||||
def : Pat<(v32i8 (X86VBroadcast (v32i8 VR256:$src))),
|
||||
|
|
|
@ -83,3 +83,123 @@ define <4 x i64> @test_x86_avx2_vinserti128(<4 x i64> %a0, <2 x i64> %a1) {
|
|||
}
|
||||
declare <4 x i64> @llvm.x86.avx2.vinserti128(<4 x i64>, <2 x i64>, i8) nounwind readnone
|
||||
|
||||
|
||||
define <4 x double> @test_x86_avx2_vbroadcast_sd_pd_256(<2 x double> %a0) {
|
||||
; CHECK-LABEL: test_x86_avx2_vbroadcast_sd_pd_256:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0
|
||||
; CHECK-NEXT: retl
|
||||
%res = call <4 x double> @llvm.x86.avx2.vbroadcast.sd.pd.256(<2 x double> %a0)
|
||||
ret <4 x double> %res
|
||||
}
|
||||
declare <4 x double> @llvm.x86.avx2.vbroadcast.sd.pd.256(<2 x double>) nounwind readonly
|
||||
|
||||
|
||||
define <4 x float> @test_x86_avx2_vbroadcast_ss_ps(<4 x float> %a0) {
|
||||
; CHECK-LABEL: test_x86_avx2_vbroadcast_ss_ps:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: vbroadcastss %xmm0, %xmm0
|
||||
; CHECK-NEXT: retl
|
||||
%res = call <4 x float> @llvm.x86.avx2.vbroadcast.ss.ps(<4 x float> %a0)
|
||||
ret <4 x float> %res
|
||||
}
|
||||
declare <4 x float> @llvm.x86.avx2.vbroadcast.ss.ps(<4 x float>) nounwind readonly
|
||||
|
||||
|
||||
define <8 x float> @test_x86_avx2_vbroadcast_ss_ps_256(<4 x float> %a0) {
|
||||
; CHECK-LABEL: test_x86_avx2_vbroadcast_ss_ps_256:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: vbroadcastss %xmm0, %ymm0
|
||||
; CHECK-NEXT: retl
|
||||
%res = call <8 x float> @llvm.x86.avx2.vbroadcast.ss.ps.256(<4 x float> %a0)
|
||||
ret <8 x float> %res
|
||||
}
|
||||
declare <8 x float> @llvm.x86.avx2.vbroadcast.ss.ps.256(<4 x float>) nounwind readonly
|
||||
|
||||
|
||||
define <16 x i8> @test_x86_avx2_pbroadcastb_128(<16 x i8> %a0) {
|
||||
; CHECK-LABEL: test_x86_avx2_pbroadcastb_128:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: vpbroadcastb %xmm0, %xmm0
|
||||
; CHECK-NEXT: retl
|
||||
%res = call <16 x i8> @llvm.x86.avx2.pbroadcastb.128(<16 x i8> %a0)
|
||||
ret <16 x i8> %res
|
||||
}
|
||||
declare <16 x i8> @llvm.x86.avx2.pbroadcastb.128(<16 x i8>) nounwind readonly
|
||||
|
||||
|
||||
define <32 x i8> @test_x86_avx2_pbroadcastb_256(<16 x i8> %a0) {
|
||||
; CHECK-LABEL: test_x86_avx2_pbroadcastb_256:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: vpbroadcastb %xmm0, %ymm0
|
||||
; CHECK-NEXT: retl
|
||||
%res = call <32 x i8> @llvm.x86.avx2.pbroadcastb.256(<16 x i8> %a0)
|
||||
ret <32 x i8> %res
|
||||
}
|
||||
declare <32 x i8> @llvm.x86.avx2.pbroadcastb.256(<16 x i8>) nounwind readonly
|
||||
|
||||
|
||||
define <8 x i16> @test_x86_avx2_pbroadcastw_128(<8 x i16> %a0) {
|
||||
; CHECK-LABEL: test_x86_avx2_pbroadcastw_128:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: vpbroadcastw %xmm0, %xmm0
|
||||
; CHECK-NEXT: retl
|
||||
%res = call <8 x i16> @llvm.x86.avx2.pbroadcastw.128(<8 x i16> %a0)
|
||||
ret <8 x i16> %res
|
||||
}
|
||||
declare <8 x i16> @llvm.x86.avx2.pbroadcastw.128(<8 x i16>) nounwind readonly
|
||||
|
||||
|
||||
define <16 x i16> @test_x86_avx2_pbroadcastw_256(<8 x i16> %a0) {
|
||||
; CHECK-LABEL: test_x86_avx2_pbroadcastw_256:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0
|
||||
; CHECK-NEXT: retl
|
||||
%res = call <16 x i16> @llvm.x86.avx2.pbroadcastw.256(<8 x i16> %a0)
|
||||
ret <16 x i16> %res
|
||||
}
|
||||
declare <16 x i16> @llvm.x86.avx2.pbroadcastw.256(<8 x i16>) nounwind readonly
|
||||
|
||||
|
||||
define <4 x i32> @test_x86_avx2_pbroadcastd_128(<4 x i32> %a0) {
|
||||
; CHECK-LABEL: test_x86_avx2_pbroadcastd_128:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: vbroadcastss %xmm0, %xmm0
|
||||
; CHECK-NEXT: retl
|
||||
%res = call <4 x i32> @llvm.x86.avx2.pbroadcastd.128(<4 x i32> %a0)
|
||||
ret <4 x i32> %res
|
||||
}
|
||||
declare <4 x i32> @llvm.x86.avx2.pbroadcastd.128(<4 x i32>) nounwind readonly
|
||||
|
||||
|
||||
define <8 x i32> @test_x86_avx2_pbroadcastd_256(<4 x i32> %a0) {
|
||||
; CHECK-LABEL: test_x86_avx2_pbroadcastd_256:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: vbroadcastss %xmm0, %ymm0
|
||||
; CHECK-NEXT: retl
|
||||
%res = call <8 x i32> @llvm.x86.avx2.pbroadcastd.256(<4 x i32> %a0)
|
||||
ret <8 x i32> %res
|
||||
}
|
||||
declare <8 x i32> @llvm.x86.avx2.pbroadcastd.256(<4 x i32>) nounwind readonly
|
||||
|
||||
|
||||
define <2 x i64> @test_x86_avx2_pbroadcastq_128(<2 x i64> %a0) {
|
||||
; CHECK-LABEL: test_x86_avx2_pbroadcastq_128:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0
|
||||
; CHECK-NEXT: retl
|
||||
%res = call <2 x i64> @llvm.x86.avx2.pbroadcastq.128(<2 x i64> %a0)
|
||||
ret <2 x i64> %res
|
||||
}
|
||||
declare <2 x i64> @llvm.x86.avx2.pbroadcastq.128(<2 x i64>) nounwind readonly
|
||||
|
||||
|
||||
define <4 x i64> @test_x86_avx2_pbroadcastq_256(<2 x i64> %a0) {
|
||||
; CHECK-LABEL: test_x86_avx2_pbroadcastq_256:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0
|
||||
; CHECK-NEXT: retl
|
||||
%res = call <4 x i64> @llvm.x86.avx2.pbroadcastq.256(<2 x i64> %a0)
|
||||
ret <4 x i64> %res
|
||||
}
|
||||
declare <4 x i64> @llvm.x86.avx2.pbroadcastq.256(<2 x i64>) nounwind readonly
|
||||
|
|
|
@ -641,30 +641,6 @@ define <4 x i64> @test_x86_avx2_pmul.dq(<8 x i32> %a0, <8 x i32> %a1) {
|
|||
declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone
|
||||
|
||||
|
||||
define <4 x double> @test_x86_avx2_vbroadcast_sd_pd_256(<2 x double> %a0) {
|
||||
; CHECK: vbroadcastsd
|
||||
%res = call <4 x double> @llvm.x86.avx2.vbroadcast.sd.pd.256(<2 x double> %a0) ; <<4 x double>> [#uses=1]
|
||||
ret <4 x double> %res
|
||||
}
|
||||
declare <4 x double> @llvm.x86.avx2.vbroadcast.sd.pd.256(<2 x double>) nounwind readonly
|
||||
|
||||
|
||||
define <4 x float> @test_x86_avx2_vbroadcast_ss_ps(<4 x float> %a0) {
|
||||
; CHECK: vbroadcastss
|
||||
%res = call <4 x float> @llvm.x86.avx2.vbroadcast.ss.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1]
|
||||
ret <4 x float> %res
|
||||
}
|
||||
declare <4 x float> @llvm.x86.avx2.vbroadcast.ss.ps(<4 x float>) nounwind readonly
|
||||
|
||||
|
||||
define <8 x float> @test_x86_avx2_vbroadcast_ss_ps_256(<4 x float> %a0) {
|
||||
; CHECK: vbroadcastss
|
||||
%res = call <8 x float> @llvm.x86.avx2.vbroadcast.ss.ps.256(<4 x float> %a0) ; <<8 x float>> [#uses=1]
|
||||
ret <8 x float> %res
|
||||
}
|
||||
declare <8 x float> @llvm.x86.avx2.vbroadcast.ss.ps.256(<4 x float>) nounwind readonly
|
||||
|
||||
|
||||
define <4 x i32> @test_x86_avx2_pblendd_128(<4 x i32> %a0, <4 x i32> %a1) {
|
||||
; CHECK: vpblendd
|
||||
%res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a1, i8 7) ; <<4 x i32>> [#uses=1]
|
||||
|
@ -681,70 +657,6 @@ define <8 x i32> @test_x86_avx2_pblendd_256(<8 x i32> %a0, <8 x i32> %a1) {
|
|||
declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i8) nounwind readnone
|
||||
|
||||
|
||||
define <16 x i8> @test_x86_avx2_pbroadcastb_128(<16 x i8> %a0) {
|
||||
; CHECK: vpbroadcastb
|
||||
%res = call <16 x i8> @llvm.x86.avx2.pbroadcastb.128(<16 x i8> %a0) ; <<16 x i8>> [#uses=1]
|
||||
ret <16 x i8> %res
|
||||
}
|
||||
declare <16 x i8> @llvm.x86.avx2.pbroadcastb.128(<16 x i8>) nounwind readonly
|
||||
|
||||
|
||||
define <32 x i8> @test_x86_avx2_pbroadcastb_256(<16 x i8> %a0) {
|
||||
; CHECK: vpbroadcastb
|
||||
%res = call <32 x i8> @llvm.x86.avx2.pbroadcastb.256(<16 x i8> %a0) ; <<32 x i8>> [#uses=1]
|
||||
ret <32 x i8> %res
|
||||
}
|
||||
declare <32 x i8> @llvm.x86.avx2.pbroadcastb.256(<16 x i8>) nounwind readonly
|
||||
|
||||
|
||||
define <8 x i16> @test_x86_avx2_pbroadcastw_128(<8 x i16> %a0) {
|
||||
; CHECK: vpbroadcastw
|
||||
%res = call <8 x i16> @llvm.x86.avx2.pbroadcastw.128(<8 x i16> %a0) ; <<8 x i16>> [#uses=1]
|
||||
ret <8 x i16> %res
|
||||
}
|
||||
declare <8 x i16> @llvm.x86.avx2.pbroadcastw.128(<8 x i16>) nounwind readonly
|
||||
|
||||
|
||||
define <16 x i16> @test_x86_avx2_pbroadcastw_256(<8 x i16> %a0) {
|
||||
; CHECK: vpbroadcastw
|
||||
%res = call <16 x i16> @llvm.x86.avx2.pbroadcastw.256(<8 x i16> %a0) ; <<16 x i16>> [#uses=1]
|
||||
ret <16 x i16> %res
|
||||
}
|
||||
declare <16 x i16> @llvm.x86.avx2.pbroadcastw.256(<8 x i16>) nounwind readonly
|
||||
|
||||
|
||||
define <4 x i32> @test_x86_avx2_pbroadcastd_128(<4 x i32> %a0) {
|
||||
; CHECK: vbroadcastss
|
||||
%res = call <4 x i32> @llvm.x86.avx2.pbroadcastd.128(<4 x i32> %a0) ; <<4 x i32>> [#uses=1]
|
||||
ret <4 x i32> %res
|
||||
}
|
||||
declare <4 x i32> @llvm.x86.avx2.pbroadcastd.128(<4 x i32>) nounwind readonly
|
||||
|
||||
|
||||
define <8 x i32> @test_x86_avx2_pbroadcastd_256(<4 x i32> %a0) {
|
||||
; CHECK: vbroadcastss {{[^,]+}}, %ymm{{[0-9]+}}
|
||||
%res = call <8 x i32> @llvm.x86.avx2.pbroadcastd.256(<4 x i32> %a0) ; <<8 x i32>> [#uses=1]
|
||||
ret <8 x i32> %res
|
||||
}
|
||||
declare <8 x i32> @llvm.x86.avx2.pbroadcastd.256(<4 x i32>) nounwind readonly
|
||||
|
||||
|
||||
define <2 x i64> @test_x86_avx2_pbroadcastq_128(<2 x i64> %a0) {
|
||||
; CHECK: vpbroadcastq
|
||||
%res = call <2 x i64> @llvm.x86.avx2.pbroadcastq.128(<2 x i64> %a0) ; <<2 x i64>> [#uses=1]
|
||||
ret <2 x i64> %res
|
||||
}
|
||||
declare <2 x i64> @llvm.x86.avx2.pbroadcastq.128(<2 x i64>) nounwind readonly
|
||||
|
||||
|
||||
define <4 x i64> @test_x86_avx2_pbroadcastq_256(<2 x i64> %a0) {
|
||||
; CHECK: vbroadcastsd {{[^,]+}}, %ymm{{[0-9]+}}
|
||||
%res = call <4 x i64> @llvm.x86.avx2.pbroadcastq.256(<2 x i64> %a0) ; <<4 x i64>> [#uses=1]
|
||||
ret <4 x i64> %res
|
||||
}
|
||||
declare <4 x i64> @llvm.x86.avx2.pbroadcastq.256(<2 x i64>) nounwind readonly
|
||||
|
||||
|
||||
define <8 x i32> @test_x86_avx2_permd(<8 x i32> %a0, <8 x i32> %a1) {
|
||||
; Check that the arguments are swapped between the intrinsic definition
|
||||
; and its lowering. Indeed, the offsets are the first source in
|
||||
|
|
|
@ -12,7 +12,7 @@ define <4 x double> @stack_fold_broadcastsd_ymm(<2 x double> %a0) {
|
|||
;CHECK-LABEL: stack_fold_broadcastsd_ymm
|
||||
;CHECK: vbroadcastsd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
|
||||
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
|
||||
%2 = call <4 x double> @llvm.x86.avx2.vbroadcast.sd.pd.256(<2 x double> %a0)
|
||||
%2 = shufflevector <2 x double> %a0, <2 x double> undef, <4 x i32> zeroinitializer
|
||||
; fadd forces execution domain
|
||||
%3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
|
||||
ret <4 x double> %3
|
||||
|
@ -23,7 +23,7 @@ define <4 x float> @stack_fold_broadcastss(<4 x float> %a0) {
|
|||
;CHECK-LABEL: stack_fold_broadcastss
|
||||
;CHECK: vbroadcastss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
|
||||
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
|
||||
%2 = call <4 x float> @llvm.x86.avx2.vbroadcast.ss.ps(<4 x float> %a0)
|
||||
%2 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer
|
||||
; fadd forces execution domain
|
||||
%3 = fadd <4 x float> %2, <float 0x0, float 0x0, float 0x0, float 0x0>
|
||||
ret <4 x float> %3
|
||||
|
@ -34,7 +34,7 @@ define <8 x float> @stack_fold_broadcastss_ymm(<4 x float> %a0) {
|
|||
;CHECK-LABEL: stack_fold_broadcastss_ymm
|
||||
;CHECK: vbroadcastss {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
|
||||
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
|
||||
%2 = call <8 x float> @llvm.x86.avx2.vbroadcast.ss.ps.256(<4 x float> %a0)
|
||||
%2 = shufflevector <4 x float> %a0, <4 x float> undef, <8 x i32> zeroinitializer
|
||||
; fadd forces execution domain
|
||||
%3 = fadd <8 x float> %2, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
|
||||
ret <8 x float> %3
|
||||
|
|
Loading…
Reference in New Issue