[X86][SSE] combineExtractWithShuffle - extract(bitcast(broadcast(x))) --> x

Removes some unnecessary GPR<-->FPU (vector register) traffic by extracting directly from the broadcast's scalar source.
This commit is contained in:
Simon Pilgrim 2020-01-22 17:23:56 +00:00
parent 58991ba773
commit 5340434c94
2 changed files with 15 additions and 12 deletions

View File

@@ -37102,11 +37102,24 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
SDValue SrcBC = peekThroughBitcasts(Src);
// Handle extract(broadcast(scalar_value)), it doesn't matter what index is.
// Handle extract(bitcast(broadcast(scalar_value))).
if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
SDValue SrcOp = SrcBC.getOperand(0);
if (SrcOp.getValueSizeInBits() == VT.getSizeInBits())
return DAG.getBitcast(VT, SrcOp);
EVT SrcOpVT = SrcOp.getValueType();
if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
(SrcOpVT.getSizeInBits() % SrcSVT.getSizeInBits()) == 0) {
unsigned Scale = SrcOpVT.getSizeInBits() / SrcSVT.getSizeInBits();
unsigned Offset = IdxC.urem(Scale) * SrcSVT.getSizeInBits();
// TODO support non-zero offsets.
if (Offset == 0) {
SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
return SrcOp;
}
}
}
// If we're extracting a single element from a broadcast load and there are
@@ -37126,7 +37139,7 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
}
}
// Handle extract(scalar_to_vector(scalar_value)) for integers.
// Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
// TODO: Move to DAGCombine?
if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
SrcBC.getValueType().isInteger() &&

View File

@@ -77,9 +77,7 @@ define i2 @bitcast_v4i32_to_v2i2(<4 x i32> %a0) nounwind {
; AVX512-NEXT: movzbl %cl, %eax
; AVX512-NEXT: shrl $2, %eax
; AVX512-NEXT: andl $3, %eax
; AVX512-NEXT: vpbroadcastq %rax, %xmm0
; AVX512-NEXT: andl $3, %ecx
; AVX512-NEXT: vpextrb $8, %xmm0, %eax
; AVX512-NEXT: addb %cl, %al
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: retq
@@ -124,9 +122,7 @@ define i4 @bitcast_v8i16_to_v2i4(<8 x i16> %a0) nounwind {
; AVX512-NEXT: kmovd %k0, %ecx
; AVX512-NEXT: movzbl %cl, %eax
; AVX512-NEXT: shrl $4, %eax
; AVX512-NEXT: vpbroadcastq %rax, %xmm0
; AVX512-NEXT: andl $15, %ecx
; AVX512-NEXT: vpextrb $8, %xmm0, %eax
; AVX512-NEXT: addb %cl, %al
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: retq
@@ -214,9 +210,7 @@ define i2 @bitcast_v4i64_to_v2i2(<4 x i64> %a0) nounwind {
; AVX512-NEXT: movzbl %cl, %eax
; AVX512-NEXT: shrl $2, %eax
; AVX512-NEXT: andl $3, %eax
; AVX512-NEXT: vpbroadcastq %rax, %xmm0
; AVX512-NEXT: andl $3, %ecx
; AVX512-NEXT: vpextrb $8, %xmm0, %eax
; AVX512-NEXT: addb %cl, %al
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: vzeroupper
@@ -264,9 +258,7 @@ define i4 @bitcast_v8i32_to_v2i4(<8 x i32> %a0) nounwind {
; AVX512-NEXT: kmovd %k0, %ecx
; AVX512-NEXT: movzbl %cl, %eax
; AVX512-NEXT: shrl $4, %eax
; AVX512-NEXT: vpbroadcastq %rax, %xmm0
; AVX512-NEXT: andl $15, %ecx
; AVX512-NEXT: vpextrb $8, %xmm0, %eax
; AVX512-NEXT: addb %cl, %al
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: vzeroupper
@@ -451,9 +443,7 @@ define i4 @bitcast_v8i64_to_v2i4(<8 x i64> %a0) nounwind {
; AVX512-NEXT: kmovd %k0, %ecx
; AVX512-NEXT: movzbl %cl, %eax
; AVX512-NEXT: shrl $4, %eax
; AVX512-NEXT: vpbroadcastq %rax, %xmm0
; AVX512-NEXT: andl $15, %ecx
; AVX512-NEXT: vpextrb $8, %xmm0, %eax
; AVX512-NEXT: addb %cl, %al
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: vzeroupper