From 07cdffc324b8fd336b04e8a7f83d7c3ce1ba7815 Mon Sep 17 00:00:00 2001 From: Andrea Di Biagio Date: Wed, 25 Jun 2014 17:41:58 +0000 Subject: [PATCH] [X86] Always prefer to lower a VECTOR_SHUFFLE into a BLENDI instead of SHUFP (or VPERM2X128). This patch teaches method 'LowerVECTOR_SHUFFLE' to give higher precedence to the check for 'isBlendMask'; the idea is that, when possible, we should firstly check if a shuffle performs a blend, and in case, try to lower it into a BLENDI instead of selecting a SHUFP or (worse) a VPERM2X128. In general: - AVX VBLENDPS/D always have better latency and throughput than VPERM2F128; - BLENDPS/D instructions tend to always have better 'reciprocal throughput' than the equivalent SHUFPS/D; - Both BLENDPS/D and SHUFPS/D are often decoded into the same number of m-ops; however, a m-op obtained from a BLENDPS/D can be scheduled to more than one execution port. This patch: - Moves the check for 'isBlendMask' immediately before the check for 'isSHUFPMask' within method 'LowerVECTOR_SHUFFLE'; - Updates existing tests for sse/avx shuffle/blend instructions to verify that we select (v)blendps/d when possible (instead of (v)shufps/d or vperm2f128). llvm-svn: 211720 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 10 +++++----- llvm/lib/Target/X86/X86InstrSSE.td | 4 ++-- llvm/test/CodeGen/X86/avx-blend.ll | 2 +- llvm/test/CodeGen/X86/avx-shuffle.ll | 2 +- llvm/test/CodeGen/X86/avx-vperm2f128.ll | 2 +- llvm/test/CodeGen/X86/avx-vshufp.ll | 10 +++++----- llvm/test/CodeGen/X86/combine-or.ll | 4 ++-- 7 files changed, 17 insertions(+), 17 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 989e6f1ee81d..cde413f4551c 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -8337,6 +8337,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { getShufflePSHUFLWImmediate(SVOp), DAG); + unsigned MaskValue; + if (isBlendMask(M, VT, Subtarget->hasSSE41(), Subtarget->hasInt256(), + &MaskValue)) + return LowerVECTOR_SHUFFLEtoBlend(SVOp, MaskValue, Subtarget, DAG); + if (isSHUFPMask(M, VT)) return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2, getShuffleSHUFImmediate(SVOp), DAG); @@ -8374,11 +8379,6 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1, V2, getShuffleVPERM2X128Immediate(SVOp), DAG); - unsigned MaskValue; - if (isBlendMask(M, VT, Subtarget->hasSSE41(), Subtarget->hasInt256(), - &MaskValue)) - return LowerVECTOR_SHUFFLEtoBlend(SVOp, MaskValue, Subtarget, DAG); - if (Subtarget->hasSSE41() && isINSERTPSMask(M, VT)) return getINSERTPS(SVOp, dl, DAG); diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index 988a05989754..e6ca519b7d5d 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -5374,8 +5374,8 @@ let Predicates = [HasAVX] in { // - the 1st and 3rd element from the first input vector (the 'fsub' node); // - the 2nd and 4th element from the second input vector (the 'fadd' node). - def : Pat<(v4f64 (X86Shufp (v4f64 (fsub VR256:$lhs, VR256:$rhs)), - (v4f64 (fadd VR256:$lhs, VR256:$rhs)), (i8 10))), + def : Pat<(v4f64 (X86Blendi (v4f64 (fsub VR256:$lhs, VR256:$rhs)), + (v4f64 (fadd VR256:$lhs, VR256:$rhs)), (i32 10))), (VADDSUBPDYrr VR256:$lhs, VR256:$rhs)>; def : Pat<(v4f64 (X86Blendi (v4f64 (fsub VR256:$lhs, VR256:$rhs)), (v4f64 (fadd VR256:$lhs, VR256:$rhs)), (i32 10))), diff --git a/llvm/test/CodeGen/X86/avx-blend.ll b/llvm/test/CodeGen/X86/avx-blend.ll index 43cdf7edf70a..d2a22d709474 100644 --- a/llvm/test/CodeGen/X86/avx-blend.ll +++ b/llvm/test/CodeGen/X86/avx-blend.ll @@ -110,7 +110,7 @@ define <8 x i64> @vsel_i648(<8 x i64> %v1, <8 x i64> %v2) { ;CHECK-LABEL: vsel_double4: ;CHECK-NOT: vinsertf128 -;CHECK: vshufpd $10 +;CHECK: vblendpd $10 ;CHECK-NEXT: ret define <4 x double> @vsel_double4(<4 x double> %v1, <4 x double> %v2) { %vsel = select <4 x i1> , <4 x double> %v1, <4 x double> %v2 diff --git a/llvm/test/CodeGen/X86/avx-shuffle.ll b/llvm/test/CodeGen/X86/avx-shuffle.ll index f3f7e554a33b..4a996d79815c 100644 --- a/llvm/test/CodeGen/X86/avx-shuffle.ll +++ b/llvm/test/CodeGen/X86/avx-shuffle.ll @@ -25,7 +25,7 @@ define <4 x i64> @test3(<4 x i64> %a, <4 x i64> %b) nounwind { %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %c ; CHECK-LABEL: test3: -; CHECK: vperm2f128 +; CHECK: vblendpd ; CHECK: ret } diff --git a/llvm/test/CodeGen/X86/avx-vperm2f128.ll b/llvm/test/CodeGen/X86/avx-vperm2f128.ll index caa21e5bacfe..c20775bacad2 100644 --- a/llvm/test/CodeGen/X86/avx-vperm2f128.ll +++ b/llvm/test/CodeGen/X86/avx-vperm2f128.ll @@ -9,7 +9,7 @@ entry: } ; CHECK: _B -; CHECK: vperm2f128 $48 +; CHECK: vblendps $240 define <8 x float> @B(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { entry: %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> diff --git a/llvm/test/CodeGen/X86/avx-vshufp.ll b/llvm/test/CodeGen/X86/avx-vshufp.ll index 45883b717380..ad3dbc1ed893 100644 --- a/llvm/test/CodeGen/X86/avx-vshufp.ll +++ b/llvm/test/CodeGen/X86/avx-vshufp.ll @@ -32,14 +32,14 @@ entry: ret <8 x i32> %shuffle } -; CHECK: vshufpd $10, %ymm +; CHECK: vblendpd $10, %ymm define <4 x double> @B(<4 x double> %a, <4 x double> %b) nounwind uwtable readnone ssp { entry: %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } -; CHECK: vshufpd $10, (%{{.*}}), %ymm +; CHECK: vblendpd $10, (%{{.*}}), %ymm define <4 x double> @B2(<4 x double>* %a, <4 x double>* %b) nounwind uwtable readnone ssp { entry: %a2 = load <4 x double>* %a @@ -48,14 +48,14 @@ entry: ret <4 x double> %shuffle } -; CHECK: vshufpd $10, %ymm +; CHECK: vblendpd $10, %ymm define <4 x i64> @B3(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp { entry: %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } -; CHECK: vshufpd $10, (%{{.*}}), %ymm +; CHECK: vblendpd $10, (%{{.*}}), %ymm define <4 x i64> @B4(<4 x i64>* %a, <4 x i64>* %b) nounwind uwtable readnone ssp { entry: %a2 = load <4 x i64>* %a @@ -71,7 +71,7 @@ entry: ret <8 x float> %shuffle } -; CHECK: vshufpd $2, %ymm +; CHECK: vblendpd $2, %ymm define <4 x double> @D(<4 x double> %a, <4 x double> %b) nounwind uwtable readnone ssp { entry: %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> diff --git a/llvm/test/CodeGen/X86/combine-or.ll b/llvm/test/CodeGen/X86/combine-or.ll index 572aded5e9a3..ff807b98717d 100644 --- a/llvm/test/CodeGen/X86/combine-or.ll +++ b/llvm/test/CodeGen/X86/combine-or.ll @@ -74,7 +74,7 @@ define <4 x i32> @test6(<4 x i32> %a, <4 x i32> %b) { } ; CHECK-LABEL: test6 ; CHECK-NOT: xorps -; CHECK: shufps +; CHECK: blendps $12 ; CHECK-NEXT: ret @@ -86,7 +86,7 @@ define <4 x i32> @test7(<4 x i32> %a, <4 x i32> %b) { } ; CHECK-LABEL: test7 ; CHECK-NOT: xorps -; CHECK: shufps +; CHECK: blendps $12 ; CHECK-NEXT: ret