forked from OSchip/llvm-project
[InstCombine] Improve Vector Demanded Bits Through Bitcasts
Currently SimplifyDemandedVectorElts can only peek through bitcasts if the vectors have the same number of elements. This patch fixes and enables some existing (disabled) code to support bitcasting to vectors with more/fewer elements. It currently only accepts cases where the vectors alias cleanly (i.e. the number of elements in one vector is an exact multiple of the number of elements in the other). This was added to improve the demanded vector elements support for SSE vector shifts, which require the __m128i (<2 x i64>) argument type to be bitcast to the vector type for the builtin shift. I've added extra tests for various additional bitcasts. Differential Revision: http://reviews.llvm.org/D12935 llvm-svn: 248784
This commit is contained in:
parent
868e1c08d9
commit
43f5e0848e
|
@ -1082,6 +1082,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
|
||||||
if (!VTy) break;
|
if (!VTy) break;
|
||||||
unsigned InVWidth = VTy->getNumElements();
|
unsigned InVWidth = VTy->getNumElements();
|
||||||
APInt InputDemandedElts(InVWidth, 0);
|
APInt InputDemandedElts(InVWidth, 0);
|
||||||
|
UndefElts2 = APInt(InVWidth, 0);
|
||||||
unsigned Ratio;
|
unsigned Ratio;
|
||||||
|
|
||||||
if (VWidth == InVWidth) {
|
if (VWidth == InVWidth) {
|
||||||
|
@ -1089,29 +1090,25 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
|
||||||
// elements as are demanded of us.
|
// elements as are demanded of us.
|
||||||
Ratio = 1;
|
Ratio = 1;
|
||||||
InputDemandedElts = DemandedElts;
|
InputDemandedElts = DemandedElts;
|
||||||
} else if (VWidth > InVWidth) {
|
} else if ((VWidth % InVWidth) == 0) {
|
||||||
// Untested so far.
|
// If the number of elements in the output is a multiple of the number of
|
||||||
break;
|
// elements in the input then an input element is live if any of the
|
||||||
|
// corresponding output elements are live.
|
||||||
// If there are more elements in the result than there are in the source,
|
Ratio = VWidth / InVWidth;
|
||||||
// then an input element is live if any of the corresponding output
|
for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx)
|
||||||
// elements are live.
|
|
||||||
Ratio = VWidth/InVWidth;
|
|
||||||
for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx) {
|
|
||||||
if (DemandedElts[OutIdx])
|
if (DemandedElts[OutIdx])
|
||||||
InputDemandedElts.setBit(OutIdx/Ratio);
|
InputDemandedElts.setBit(OutIdx / Ratio);
|
||||||
}
|
} else if ((InVWidth % VWidth) == 0) {
|
||||||
} else {
|
// If the number of elements in the input is a multiple of the number of
|
||||||
// Untested so far.
|
// elements in the output then an input element is live if the
|
||||||
break;
|
// corresponding output element is live.
|
||||||
|
Ratio = InVWidth / VWidth;
|
||||||
// If there are more elements in the source than there are in the result,
|
|
||||||
// then an input element is live if the corresponding output element is
|
|
||||||
// live.
|
|
||||||
Ratio = InVWidth/VWidth;
|
|
||||||
for (unsigned InIdx = 0; InIdx != InVWidth; ++InIdx)
|
for (unsigned InIdx = 0; InIdx != InVWidth; ++InIdx)
|
||||||
if (DemandedElts[InIdx/Ratio])
|
if (DemandedElts[InIdx / Ratio])
|
||||||
InputDemandedElts.setBit(InIdx);
|
InputDemandedElts.setBit(InIdx);
|
||||||
|
} else {
|
||||||
|
// Unsupported so far.
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
// div/rem demand all inputs, because they don't want divide by zero.
|
// div/rem demand all inputs, because they don't want divide by zero.
|
||||||
|
@ -1122,24 +1119,26 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
|
||||||
MadeChange = true;
|
MadeChange = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
UndefElts = UndefElts2;
|
if (VWidth == InVWidth) {
|
||||||
if (VWidth > InVWidth) {
|
UndefElts = UndefElts2;
|
||||||
llvm_unreachable("Unimp");
|
} else if ((VWidth % InVWidth) == 0) {
|
||||||
// If there are more elements in the result than there are in the source,
|
// If the number of elements in the output is a multiple of the number of
|
||||||
// then an output element is undef if the corresponding input element is
|
// elements in the input then an output element is undef if the
|
||||||
// undef.
|
// corresponding input element is undef.
|
||||||
for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx)
|
for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx)
|
||||||
if (UndefElts2[OutIdx/Ratio])
|
if (UndefElts2[OutIdx / Ratio])
|
||||||
UndefElts.setBit(OutIdx);
|
UndefElts.setBit(OutIdx);
|
||||||
} else if (VWidth < InVWidth) {
|
} else if ((InVWidth % VWidth) == 0) {
|
||||||
|
// If the number of elements in the input is a multiple of the number of
|
||||||
|
// elements in the output then an output element is undef if all of the
|
||||||
|
// corresponding input elements are undef.
|
||||||
|
for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx) {
|
||||||
|
APInt SubUndef = UndefElts2.lshr(OutIdx * Ratio).zextOrTrunc(Ratio);
|
||||||
|
if (SubUndef.countPopulation() == Ratio)
|
||||||
|
UndefElts.setBit(OutIdx);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
llvm_unreachable("Unimp");
|
llvm_unreachable("Unimp");
|
||||||
// If there are more elements in the source than there are in the result,
|
|
||||||
// then a result element is undef if all of the corresponding input
|
|
||||||
// elements are undef.
|
|
||||||
UndefElts = ~0ULL >> (64-VWidth); // Start out all undef.
|
|
||||||
for (unsigned InIdx = 0; InIdx != InVWidth; ++InIdx)
|
|
||||||
if (!UndefElts2[InIdx]) // Not undef?
|
|
||||||
UndefElts.clearBit(InIdx/Ratio); // Clear undef bit.
|
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
|
@ -838,6 +838,17 @@ define <8 x i16> @sse2_psra_w_var(<8 x i16> %v, <8 x i16> %a) {
|
||||||
ret <8 x i16> %2
|
ret <8 x i16> %2
|
||||||
}
|
}
|
||||||
|
|
||||||
|
define <8 x i16> @sse2_psra_w_var_bc(<8 x i16> %v, <2 x i64> %a) {
|
||||||
|
; CHECK-LABEL: @sse2_psra_w_var_bc
|
||||||
|
; CHECK-NEXT: %1 = bitcast <2 x i64> %a to <8 x i16>
|
||||||
|
; CHECK-NEXT: %2 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> %1)
|
||||||
|
; CHECK-NEXT: ret <8 x i16> %2
|
||||||
|
%1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
|
||||||
|
%2 = bitcast <2 x i64> %1 to <8 x i16>
|
||||||
|
%3 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> %2)
|
||||||
|
ret <8 x i16> %3
|
||||||
|
}
|
||||||
|
|
||||||
define <4 x i32> @sse2_psra_d_var(<4 x i32> %v, <4 x i32> %a) {
|
define <4 x i32> @sse2_psra_d_var(<4 x i32> %v, <4 x i32> %a) {
|
||||||
; CHECK-LABEL: @sse2_psra_d_var
|
; CHECK-LABEL: @sse2_psra_d_var
|
||||||
; CHECK-NEXT: %1 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> %a)
|
; CHECK-NEXT: %1 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> %a)
|
||||||
|
@ -847,6 +858,17 @@ define <4 x i32> @sse2_psra_d_var(<4 x i32> %v, <4 x i32> %a) {
|
||||||
ret <4 x i32> %2
|
ret <4 x i32> %2
|
||||||
}
|
}
|
||||||
|
|
||||||
|
define <4 x i32> @sse2_psra_d_var_bc(<4 x i32> %v, <8 x i16> %a) {
|
||||||
|
; CHECK-LABEL: @sse2_psra_d_var_bc
|
||||||
|
; CHECK-NEXT: %1 = bitcast <8 x i16> %a to <4 x i32>
|
||||||
|
; CHECK-NEXT: %2 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> %1)
|
||||||
|
; CHECK-NEXT: ret <4 x i32> %2
|
||||||
|
%1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
|
||||||
|
%2 = bitcast <8 x i16> %1 to <4 x i32>
|
||||||
|
%3 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> %2)
|
||||||
|
ret <4 x i32> %3
|
||||||
|
}
|
||||||
|
|
||||||
define <16 x i16> @avx2_psra_w_var(<16 x i16> %v, <8 x i16> %a) {
|
define <16 x i16> @avx2_psra_w_var(<16 x i16> %v, <8 x i16> %a) {
|
||||||
; CHECK-LABEL: @avx2_psra_w_var
|
; CHECK-LABEL: @avx2_psra_w_var
|
||||||
; CHECK-NEXT: %1 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %v, <8 x i16> %a)
|
; CHECK-NEXT: %1 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %v, <8 x i16> %a)
|
||||||
|
@ -901,6 +923,17 @@ define <16 x i16> @avx2_psrl_w_var(<16 x i16> %v, <8 x i16> %a) {
|
||||||
ret <16 x i16> %2
|
ret <16 x i16> %2
|
||||||
}
|
}
|
||||||
|
|
||||||
|
define <16 x i16> @avx2_psrl_w_var_bc(<16 x i16> %v, <16 x i8> %a) {
|
||||||
|
; CHECK-LABEL: @avx2_psrl_w_var_bc
|
||||||
|
; CHECK-NEXT: %1 = bitcast <16 x i8> %a to <8 x i16>
|
||||||
|
; CHECK-NEXT: %2 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %v, <8 x i16> %1)
|
||||||
|
; CHECK-NEXT: ret <16 x i16> %2
|
||||||
|
%1 = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||||
|
%2 = bitcast <16 x i8> %1 to <8 x i16>
|
||||||
|
%3 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %v, <8 x i16> %2)
|
||||||
|
ret <16 x i16> %3
|
||||||
|
}
|
||||||
|
|
||||||
define <8 x i32> @avx2_psrl_d_var(<8 x i32> %v, <4 x i32> %a) {
|
define <8 x i32> @avx2_psrl_d_var(<8 x i32> %v, <4 x i32> %a) {
|
||||||
; CHECK-LABEL: @avx2_psrl_d_var
|
; CHECK-LABEL: @avx2_psrl_d_var
|
||||||
; CHECK-NEXT: %1 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %v, <4 x i32> %a)
|
; CHECK-NEXT: %1 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %v, <4 x i32> %a)
|
||||||
|
@ -910,6 +943,17 @@ define <8 x i32> @avx2_psrl_d_var(<8 x i32> %v, <4 x i32> %a) {
|
||||||
ret <8 x i32> %2
|
ret <8 x i32> %2
|
||||||
}
|
}
|
||||||
|
|
||||||
|
define <8 x i32> @avx2_psrl_d_var_bc(<8 x i32> %v, <2 x i64> %a) {
|
||||||
|
; CHECK-LABEL: @avx2_psrl_d_var_bc
|
||||||
|
; CHECK-NEXT: %1 = bitcast <2 x i64> %a to <4 x i32>
|
||||||
|
; CHECK-NEXT: %2 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %v, <4 x i32> %1)
|
||||||
|
; CHECK-NEXT: ret <8 x i32> %2
|
||||||
|
%1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
|
||||||
|
%2 = bitcast <2 x i64> %1 to <4 x i32>
|
||||||
|
%3 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %v, <4 x i32> %2)
|
||||||
|
ret <8 x i32> %3
|
||||||
|
}
|
||||||
|
|
||||||
define <4 x i64> @avx2_psrl_q_var(<4 x i64> %v, <2 x i64> %a) {
|
define <4 x i64> @avx2_psrl_q_var(<4 x i64> %v, <2 x i64> %a) {
|
||||||
; CHECK-LABEL: @avx2_psrl_q_var
|
; CHECK-LABEL: @avx2_psrl_q_var
|
||||||
; CHECK-NEXT: %1 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %v, <2 x i64> %a)
|
; CHECK-NEXT: %1 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %v, <2 x i64> %a)
|
||||||
|
|
Loading…
Reference in New Issue