[X86] Add broadcast load unfolding support for VPTESTMD/Q and VPTESTNMD/Q.

llvm-svn: 373138
This commit is contained in:
Craig Topper 2019-09-28 01:56:36 +00:00
parent 82a707e941
commit 8b5ad3d16e
2 changed files with 162 additions and 0 deletions

View File

@ -5374,6 +5374,18 @@ static const X86MemoryFoldTableEntry BroadcastFoldTable2[] = {
{ X86::VPORQZ128rr, X86::VPORQZ128rmb, TB_BCAST_Q },
{ X86::VPORQZ256rr, X86::VPORQZ256rmb, TB_BCAST_Q },
{ X86::VPORQZrr, X86::VPORQZrmb, TB_BCAST_Q },
{ X86::VPTESTMDZ128rr, X86::VPTESTMDZ128rmb, TB_BCAST_D },
{ X86::VPTESTMDZ256rr, X86::VPTESTMDZ256rmb, TB_BCAST_D },
{ X86::VPTESTMDZrr, X86::VPTESTMDZrmb, TB_BCAST_D },
{ X86::VPTESTMQZ128rr, X86::VPTESTMQZ128rmb, TB_BCAST_Q },
{ X86::VPTESTMQZ256rr, X86::VPTESTMQZ256rmb, TB_BCAST_Q },
{ X86::VPTESTMQZrr, X86::VPTESTMQZrmb, TB_BCAST_Q },
{ X86::VPTESTNMDZ128rr,X86::VPTESTNMDZ128rmb,TB_BCAST_D },
{ X86::VPTESTNMDZ256rr,X86::VPTESTNMDZ256rmb,TB_BCAST_D },
{ X86::VPTESTNMDZrr, X86::VPTESTNMDZrmb, TB_BCAST_D },
{ X86::VPTESTNMQZ128rr,X86::VPTESTNMQZ128rmb,TB_BCAST_Q },
{ X86::VPTESTNMQZ256rr,X86::VPTESTNMQZ256rmb,TB_BCAST_Q },
{ X86::VPTESTNMQZrr, X86::VPTESTNMQZrmb, TB_BCAST_Q },
{ X86::VPXORDZ128rr, X86::VPXORDZ128rmb, TB_BCAST_D },
{ X86::VPXORDZ256rr, X86::VPXORDZ256rmb, TB_BCAST_D },
{ X86::VPXORDZrr, X86::VPXORDZrmb, TB_BCAST_D },

View File

@ -4483,3 +4483,153 @@ define void @bcast_unfold_cmp_v8f32_refold(float* nocapture %0) {
12: ; preds = %2
ret void
}
; Broadcast-unfold test for VPTESTMD on 128-bit vectors.
; The loop walks 1024 i32 elements four at a time. In each <4 x i32> chunk,
; lanes where (value & 2) != 0 are replaced with 3; the chunk is then stored
; back to the same address.
; The expected assembly materializes the [2,2,2,2] splat once with vpbroadcastd
; ahead of the loop and uses the register-register form of vptestmd inside it.
define void @bcast_unfold_ptestm_v4i32(i32* %arg) {
; CHECK-LABEL: bcast_unfold_ptestm_v4i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB127_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %xmm1
; CHECK-NEXT: vptestmd %xmm0, %xmm1, %k1
; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 {%k1}
; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB127_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: retq
bb:
br label %bb1
bb1: ; preds = %bb1, %bb
; %tmp is the element index: it starts at 0 and advances by 4 per iteration.
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
%tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
%tmp3 = bitcast i32* %tmp2 to <4 x i32>*
; The load is only 4-byte aligned, hence the unaligned vmovdqu expected above.
%tmp4 = load <4 x i32>, <4 x i32>* %tmp3, align 4
; Lane-wise bit test against the splat constant 2 (and, then compare ne 0).
%tmp4b = and <4 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2>
%tmp5 = icmp ne <4 x i32> %tmp4b, zeroinitializer
; Lanes that passed the test become 3; the others keep the loaded value.
%tmp6 = select <4 x i1> %tmp5, <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> %tmp4
; Same address as %tmp3: the result overwrites the chunk that was just loaded.
%tmp7 = bitcast i32* %tmp2 to <4 x i32>*
store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
%tmp8 = add i64 %tmp, 4
; Leave the loop once the index reaches 1024 elements.
%tmp9 = icmp eq i64 %tmp8, 1024
br i1 %tmp9, label %bb10, label %bb1
bb10: ; preds = %bb1
ret void
}
; Broadcast-unfold test for VPTESTNMD on 128-bit vectors.
; The loop walks 1024 i32 elements four at a time. In each <4 x i32> chunk,
; lanes where (value & 2) == 0 are replaced with 3; the chunk is then stored
; back to the same address.
; The expected assembly materializes the [2,2,2,2] splat once with vpbroadcastd
; ahead of the loop and uses the register-register form of vptestnmd inside it.
define void @bcast_unfold_ptestnm_v4i32(i32* %arg) {
; CHECK-LABEL: bcast_unfold_ptestnm_v4i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB128_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %xmm1
; CHECK-NEXT: vptestnmd %xmm0, %xmm1, %k1
; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 {%k1}
; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB128_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: retq
bb:
br label %bb1
bb1: ; preds = %bb1, %bb
; %tmp is the element index: it starts at 0 and advances by 4 per iteration.
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
%tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
%tmp3 = bitcast i32* %tmp2 to <4 x i32>*
; The load is only 4-byte aligned, hence the unaligned vmovdqu expected above.
%tmp4 = load <4 x i32>, <4 x i32>* %tmp3, align 4
; Lane-wise inverted bit test against the splat constant 2 (and, then compare eq 0).
%tmp4b = and <4 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2>
%tmp5 = icmp eq <4 x i32> %tmp4b, zeroinitializer
; Lanes whose masked value is zero become 3; the others keep the loaded value.
%tmp6 = select <4 x i1> %tmp5, <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> %tmp4
; Same address as %tmp3: the result overwrites the chunk that was just loaded.
%tmp7 = bitcast i32* %tmp2 to <4 x i32>*
store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
%tmp8 = add i64 %tmp, 4
; Leave the loop once the index reaches 1024 elements.
%tmp9 = icmp eq i64 %tmp8, 1024
br i1 %tmp9, label %bb10, label %bb1
bb10: ; preds = %bb1
ret void
}
; Broadcast-unfold test for VPTESTMQ on 256-bit vectors.
; The loop walks 1024 i64 elements four at a time. In each <4 x i64> chunk,
; lanes where (value & 2) != 0 are replaced with 3; the chunk is then stored
; back to the same address.
; The expected assembly materializes the [2,2,2,2] splat once with vpbroadcastq
; ahead of the loop and uses the register-register form of vptestmq inside it.
define void @bcast_unfold_ptestm_v4i64(i64* %arg) {
; CHECK-LABEL: bcast_unfold_ptestm_v4i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB129_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm1
; CHECK-NEXT: vptestmq %ymm0, %ymm1, %k1
; CHECK-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 {%k1}
; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB129_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
br label %bb1
bb1: ; preds = %bb1, %bb
; %tmp is the element index: it starts at 0 and advances by 4 per iteration.
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
%tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
%tmp3 = bitcast i64* %tmp2 to <4 x i64>*
; The load is only 8-byte aligned, hence the unaligned vmovdqu expected above.
%tmp4 = load <4 x i64>, <4 x i64>* %tmp3, align 8
; Lane-wise bit test against the splat constant 2 (and, then compare ne 0).
%tmp4b = and <4 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2>
%tmp5 = icmp ne <4 x i64> %tmp4b, zeroinitializer
; Lanes that passed the test become 3; the others keep the loaded value.
%tmp6 = select <4 x i1> %tmp5, <4 x i64> <i64 3, i64 3, i64 3, i64 3>, <4 x i64> %tmp4
; Same address as %tmp3: the result overwrites the chunk that was just loaded.
%tmp7 = bitcast i64* %tmp2 to <4 x i64>*
store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 8
%tmp8 = add i64 %tmp, 4
; Leave the loop once the index reaches 1024 elements.
%tmp9 = icmp eq i64 %tmp8, 1024
br i1 %tmp9, label %bb10, label %bb1
bb10: ; preds = %bb1
ret void
}
; Broadcast-unfold test for VPTESTNMQ on 256-bit vectors.
; The loop walks 1024 i64 elements four at a time. In each <4 x i64> chunk,
; lanes where (value & 2) == 0 are replaced with 3; the chunk is then stored
; back to the same address.
; The expected assembly materializes the [2,2,2,2] splat once with vpbroadcastq
; ahead of the loop and uses the register-register form of vptestnmq inside it.
define void @bcast_unfold_ptestnm_v4i64(i64* %arg) {
; CHECK-LABEL: bcast_unfold_ptestnm_v4i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB130_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm1
; CHECK-NEXT: vptestnmq %ymm0, %ymm1, %k1
; CHECK-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 {%k1}
; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB130_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
br label %bb1
bb1: ; preds = %bb1, %bb
; %tmp is the element index: it starts at 0 and advances by 4 per iteration.
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
%tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
%tmp3 = bitcast i64* %tmp2 to <4 x i64>*
; The load is only 8-byte aligned, hence the unaligned vmovdqu expected above.
%tmp4 = load <4 x i64>, <4 x i64>* %tmp3, align 8
; Lane-wise inverted bit test against the splat constant 2 (and, then compare eq 0).
%tmp4b = and <4 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2>
%tmp5 = icmp eq <4 x i64> %tmp4b, zeroinitializer
; Lanes whose masked value is zero become 3; the others keep the loaded value.
%tmp6 = select <4 x i1> %tmp5, <4 x i64> <i64 3, i64 3, i64 3, i64 3>, <4 x i64> %tmp4
; Same address as %tmp3: the result overwrites the chunk that was just loaded.
%tmp7 = bitcast i64* %tmp2 to <4 x i64>*
store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 8
%tmp8 = add i64 %tmp, 4
; Leave the loop once the index reaches 1024 elements.
%tmp9 = icmp eq i64 %tmp8, 1024
br i1 %tmp9, label %bb10, label %bb1
bb10: ; preds = %bb1
ret void
}