forked from OSchip/llvm-project
[SLP] further limit bailout for load combine candidate (PR47450)
The test example based on PR47450 shows that we can match non-byte-sized shifts, but those won't ever be bswap opportunities. This isn't a full fix (we'd still match if the shifts were by 8 bits, for example), but it should be enough until there's evidence that we need to do more (this is a borderline case for vectorization in the first place).
This commit is contained in:
parent
54680591e8
commit
40f12ef621
|
@ -3694,11 +3694,13 @@ static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
|
|||
TargetTransformInfo *TTI) {
|
||||
// Look past the root to find a source value. Arbitrarily follow the
|
||||
// path through operand 0 of any 'or'. Also, peek through optional
|
||||
// shift-left-by-constant.
|
||||
// shift-left-by-multiple-of-8-bits.
|
||||
Value *ZextLoad = Root;
|
||||
const APInt *ShAmtC;
|
||||
while (!isa<ConstantExpr>(ZextLoad) &&
|
||||
(match(ZextLoad, m_Or(m_Value(), m_Value())) ||
|
||||
match(ZextLoad, m_Shl(m_Value(), m_Constant()))))
|
||||
(match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
|
||||
ShAmtC->urem(8) == 0)))
|
||||
ZextLoad = cast<BinaryOperator>(ZextLoad)->getOperand(0);
|
||||
|
||||
// Check if the input is an extended load of the required or/shift expression.
|
||||
|
|
|
@ -545,10 +545,11 @@ define void @PR47450(i16* nocapture readonly %p) {
|
|||
; CHECK-NEXT: [[X:%.*]] = load i16, i16* [[P:%.*]], align 2
|
||||
; CHECK-NEXT: [[Z:%.*]] = zext i16 [[X]] to i32
|
||||
; CHECK-NEXT: [[S:%.*]] = shl nuw nsw i32 [[Z]], 1
|
||||
; CHECK-NEXT: store i32 [[S]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @output, i64 0, i64 0), align 16
|
||||
; CHECK-NEXT: store i32 [[S]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @output, i64 0, i64 1), align 4
|
||||
; CHECK-NEXT: store i32 [[S]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @output, i64 0, i64 2), align 8
|
||||
; CHECK-NEXT: store i32 [[S]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @output, i64 0, i64 3), align 4
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i32 0
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[S]], i32 1
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[S]], i32 2
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[S]], i32 3
|
||||
; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast ([8 x i32]* @output to <4 x i32>*), align 16
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
%x = load i16, i16* %p, align 2
|
||||
|
|
Loading…
Reference in New Issue