[SLP] add tests for reduction ordering; NFC

2020-09-16 10:59:30 -04:00 · 2020-09-16 10:59:30 -04:00 · b011611e37
parent 50f4c7c785
commit b011611e37
1 changed files with 147 additions and 0 deletions
--- a/llvm/test/Transforms/SLPVectorizer/X86/compare-reduce.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/compare-reduce.ll
@ -74,3 +74,150 @@ for.end:                                          ; preds = %for.inc

 declare i32 @printf(i8* nocapture, ...)

+; PR41312 - the order of the reduction ops should not prevent forming a reduction.
+; The 'wrong' member of the reduction incurs a greater cost if grouped with the
+; other candidates in the reduction because it does not have a matching
+; predicate and/or constant operand.
+
+define float @merge_anyof_v4f32_wrong_first(<4 x float> %x) { ; mismatched compare feeds the first 'or' of the chain
+; CHECK-LABEL: @merge_anyof_v4f32_wrong_first(
+; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x float> [[X:%.*]], i32 0
+; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x float> [[X]], i32 1
+; CHECK-NEXT:    [[X2:%.*]] = extractelement <4 x float> [[X]], i32 2
+; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x float> [[X]], i32 3
+; CHECK-NEXT:    [[CMP3WRONG:%.*]] = fcmp olt float [[X3]], 4.200000e+01
+; CHECK-NEXT:    [[CMP0:%.*]] = fcmp ogt float [[X0]], 1.000000e+00
+; CHECK-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[X1]], 1.000000e+00
+; CHECK-NEXT:    [[CMP2:%.*]] = fcmp ogt float [[X2]], 1.000000e+00
+; CHECK-NEXT:    [[CMP3:%.*]] = fcmp ogt float [[X3]], 1.000000e+00
+; CHECK-NEXT:    [[OR03:%.*]] = or i1 [[CMP0]], [[CMP3WRONG]]
+; CHECK-NEXT:    [[OR031:%.*]] = or i1 [[OR03]], [[CMP1]]
+; CHECK-NEXT:    [[OR0312:%.*]] = or i1 [[OR031]], [[CMP2]]
+; CHECK-NEXT:    [[OR03123:%.*]] = or i1 [[OR0312]], [[CMP3]]
+; CHECK-NEXT:    [[R:%.*]] = select i1 [[OR03123]], float -1.000000e+00, float 1.000000e+00
+; CHECK-NEXT:    ret float [[R]]
+;
+  %x0 = extractelement <4 x float> %x, i32 0
+  %x1 = extractelement <4 x float> %x, i32 1
+  %x2 = extractelement <4 x float> %x, i32 2
+  %x3 = extractelement <4 x float> %x, i32 3
+  %cmp3wrong = fcmp olt float %x3, 42.0 ; odd one out: 'olt' against 42.0, the rest use 'ogt' against 1.0
+  %cmp0 = fcmp ogt float %x0, 1.0
+  %cmp1 = fcmp ogt float %x1, 1.0
+  %cmp2 = fcmp ogt float %x2, 1.0
+  %cmp3 = fcmp ogt float %x3, 1.0
+  %or03 = or i1 %cmp0, %cmp3wrong ; the mismatched compare enters at the very first 'or'
+  %or031 = or i1 %or03, %cmp1
+  %or0312 = or i1 %or031, %cmp2
+  %or03123 = or i1 %or0312, %cmp3 ; the CHECK lines above expect this whole chain to stay scalar
+  %r = select i1 %or03123, float -1.0, float 1.0
+  ret float %r
+}
+
+define float @merge_anyof_v4f32_wrong_last(<4 x float> %x) { ; mismatched compare feeds the last 'or' of the chain
+; CHECK-LABEL: @merge_anyof_v4f32_wrong_last(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3
+; CHECK-NEXT:    [[CMP3WRONG:%.*]] = fcmp olt float [[TMP1]], 4.200000e+01
+; CHECK-NEXT:    [[TMP2:%.*]] = fcmp ogt <4 x float> [[X]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; CHECK-NEXT:    [[TMP3:%.*]] = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = or i1 [[TMP3]], [[CMP3WRONG]]
+; CHECK-NEXT:    [[R:%.*]] = select i1 [[TMP4]], float -1.000000e+00, float 1.000000e+00
+; CHECK-NEXT:    ret float [[R]]
+;
+  %x0 = extractelement <4 x float> %x, i32 0
+  %x1 = extractelement <4 x float> %x, i32 1
+  %x2 = extractelement <4 x float> %x, i32 2
+  %x3 = extractelement <4 x float> %x, i32 3
+  %cmp3wrong = fcmp olt float %x3, 42.0 ; odd one out: 'olt' against 42.0, the rest use 'ogt' against 1.0
+  %cmp0 = fcmp ogt float %x0, 1.0
+  %cmp1 = fcmp ogt float %x1, 1.0
+  %cmp2 = fcmp ogt float %x2, 1.0
+  %cmp3 = fcmp ogt float %x3, 1.0
+  %or03 = or i1 %cmp0, %cmp3 ; the four matching 'ogt' compares are or'ed together first
+  %or031 = or i1 %or03, %cmp1
+  %or0312 = or i1 %or031, %cmp2 ; the CHECK lines expect these to become a vector 'fcmp ogt' plus reduce.or
+  %or03123 = or i1 %or0312, %cmp3wrong ; the mismatched compare joins last and is expected to stay a scalar 'or'
+  %r = select i1 %or03123, float -1.0, float 1.0
+  ret float %r
+}
+
+define i32 @merge_anyof_v4i32_wrong_middle(<4 x i32> %x) { ; mismatched compare feeds an 'or' in the middle of the chain
+; CHECK-LABEL: @merge_anyof_v4i32_wrong_middle(
+; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
+; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1
+; CHECK-NEXT:    [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
+; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3
+; CHECK-NEXT:    [[CMP3WRONG:%.*]] = icmp slt i32 [[X3]], 42
+; CHECK-NEXT:    [[CMP0:%.*]] = icmp sgt i32 [[X0]], 1
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[X1]], 1
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sgt i32 [[X2]], 1
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp sgt i32 [[X3]], 1
+; CHECK-NEXT:    [[OR03:%.*]] = or i1 [[CMP0]], [[CMP3]]
+; CHECK-NEXT:    [[OR033:%.*]] = or i1 [[OR03]], [[CMP3WRONG]]
+; CHECK-NEXT:    [[OR0332:%.*]] = or i1 [[OR033]], [[CMP2]]
+; CHECK-NEXT:    [[OR03321:%.*]] = or i1 [[OR0332]], [[CMP1]]
+; CHECK-NEXT:    [[R:%.*]] = select i1 [[OR03321]], i32 -1, i32 1
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %x0 = extractelement <4 x i32> %x, i32 0
+  %x1 = extractelement <4 x i32> %x, i32 1
+  %x2 = extractelement <4 x i32> %x, i32 2
+  %x3 = extractelement <4 x i32> %x, i32 3
+  %cmp3wrong = icmp slt i32 %x3, 42 ; odd one out: 'slt' against 42, the rest use 'sgt' against 1
+  %cmp0 = icmp sgt i32 %x0, 1
+  %cmp1 = icmp sgt i32 %x1, 1
+  %cmp2 = icmp sgt i32 %x2, 1
+  %cmp3 = icmp sgt i32 %x3, 1
+  %or03 = or i1 %cmp0, %cmp3
+  %or033 = or i1 %or03, %cmp3wrong ; the mismatched compare lands between matching ones
+  %or0332 = or i1 %or033, %cmp2
+  %or03321 = or i1 %or0332, %cmp1 ; the CHECK lines above expect this whole chain to stay scalar
+  %r = select i1 %or03321, i32 -1, i32 1
+  ret i32 %r
+}
+
+define i32 @merge_anyof_v4i32_wrong_middle_better_rdx(<4 x i32> %x, <4 x i32> %y) { ; like wrong_middle, but both compare operands are variables
+; CHECK-LABEL: @merge_anyof_v4i32_wrong_middle_better_rdx(
+; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
+; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1
+; CHECK-NEXT:    [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
+; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3
+; CHECK-NEXT:    [[Y0:%.*]] = extractelement <4 x i32> [[Y:%.*]], i32 0
+; CHECK-NEXT:    [[Y1:%.*]] = extractelement <4 x i32> [[Y]], i32 1
+; CHECK-NEXT:    [[Y2:%.*]] = extractelement <4 x i32> [[Y]], i32 2
+; CHECK-NEXT:    [[Y3:%.*]] = extractelement <4 x i32> [[Y]], i32 3
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[X1]], [[Y1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[X0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[X3]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[Y3]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[X2]], i32 3
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> undef, i32 [[Y0]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[Y3]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[X3]], i32 2
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[Y2]], i32 3
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp sgt <4 x i32> [[TMP4]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> [[TMP9]])
+; CHECK-NEXT:    [[TMP11:%.*]] = or i1 [[TMP10]], [[CMP1]]
+; CHECK-NEXT:    [[R:%.*]] = select i1 [[TMP11]], i32 -1, i32 1
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %x0 = extractelement <4 x i32> %x, i32 0
+  %x1 = extractelement <4 x i32> %x, i32 1
+  %x2 = extractelement <4 x i32> %x, i32 2
+  %x3 = extractelement <4 x i32> %x, i32 3
+  %y0 = extractelement <4 x i32> %y, i32 0
+  %y1 = extractelement <4 x i32> %y, i32 1
+  %y2 = extractelement <4 x i32> %y, i32 2
+  %y3 = extractelement <4 x i32> %y, i32 3
+  %cmp3wrong = icmp slt i32 %x3, %y3 ; odd one out: 'slt' where the rest use 'sgt'; the CHECK lines expect it as lane 2 (y3 sgt x3)
+  %cmp0 = icmp sgt i32 %x0, %y0
+  %cmp1 = icmp sgt i32 %x1, %y1 ; expected to remain scalar and be or'ed with the reduce.or result
+  %cmp2 = icmp sgt i32 %x2, %y2
+  %cmp3 = icmp sgt i32 %x3, %y3
+  %or03 = or i1 %cmp0, %cmp3
+  %or033 = or i1 %or03, %cmp3wrong ; the mismatched compare lands between matching ones
+  %or0332 = or i1 %or033, %cmp2 ; expected 4-wide 'icmp sgt' covers cmp0, cmp3, swapped cmp3wrong, cmp2
+  %or03321 = or i1 %or0332, %cmp1
+  %r = select i1 %or03321, i32 -1, i32 1
+  ret i32 %r
+}