llvm-project/llvm/test/CodeGen/X86/shrink_vmul.ll

2591 lines
117 KiB
LLVM
Raw Normal View History

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX1
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
@c = external global i32*, align 8
; %val1 = load <2 x i8>
; %op1 = zext<2 x i32> %val1
; %val2 = load <2 x i8>
; %op2 = zext<2 x i32> %val2
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; X86-SSE-LABEL: mul_2xi8:
; X86-SSE: # %bb.0: # %entry
; X86-SSE-NEXT: pushl %esi
; X86-SSE-NEXT: .cfi_def_cfa_offset 8
; X86-SSE-NEXT: .cfi_offset %esi, -8
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT: movl c, %esi
; X86-SSE-NEXT: movzwl (%edx,%ecx), %edx
; X86-SSE-NEXT: movd %edx, %xmm0
; X86-SSE-NEXT: movzwl (%eax,%ecx), %eax
; X86-SSE-NEXT: movd %eax, %xmm1
; X86-SSE-NEXT: pxor %xmm2, %xmm2
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; X86-SSE-NEXT: pmullw %xmm0, %xmm1
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4)
; X86-SSE-NEXT: popl %esi
Correct dwarf unwind information in function epilogue This patch aims to provide correct dwarf unwind information in function epilogue for X86. It consists of two parts. The first part inserts CFI instructions that set appropriate cfa offset and cfa register in emitEpilogue() in X86FrameLowering. This part is X86 specific. The second part is platform independent and ensures that: * CFI instructions do not affect code generation (they are not counted as instructions when tail duplicating or tail merging) * Unwind information remains correct when a function is modified by different passes. This is done in a late pass by analyzing information about cfa offset and cfa register in BBs and inserting additional CFI directives where necessary. Added CFIInstrInserter pass: * analyzes each basic block to determine cfa offset and register are valid at its entry and exit * verifies that outgoing cfa offset and register of predecessor blocks match incoming values of their successors * inserts additional CFI directives at basic block beginning to correct the rule for calculating CFA Having CFI instructions in function epilogue can cause incorrect CFA calculation rule for some basic blocks. This can happen if, due to basic block reordering, or the existence of multiple epilogue blocks, some of the blocks have wrong cfa offset and register values set by the epilogue block above them. CFIInstrInserter is currently run only on X86, but can be used by any target that implements support for adding CFI instructions in epilogue. Patch by Violeta Vukobrat. Differential Revision: https://reviews.llvm.org/D42848 llvm-svn: 330706
2018-04-24 18:32:08 +08:00
; X86-SSE-NEXT: .cfi_def_cfa_offset 4
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: mul_2xi8:
; X86-AVX: # %bb.0: # %entry
; X86-AVX-NEXT: pushl %esi
; X86-AVX-NEXT: .cfi_def_cfa_offset 8
; X86-AVX-NEXT: .cfi_offset %esi, -8
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT: movl c, %esi
; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X86-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
; X86-AVX-NEXT: popl %esi
Correct dwarf unwind information in function epilogue This patch aims to provide correct dwarf unwind information in function epilogue for X86. It consists of two parts. The first part inserts CFI instructions that set appropriate cfa offset and cfa register in emitEpilogue() in X86FrameLowering. This part is X86 specific. The second part is platform independent and ensures that: * CFI instructions do not affect code generation (they are not counted as instructions when tail duplicating or tail merging) * Unwind information remains correct when a function is modified by different passes. This is done in a late pass by analyzing information about cfa offset and cfa register in BBs and inserting additional CFI directives where necessary. Added CFIInstrInserter pass: * analyzes each basic block to determine cfa offset and register are valid at its entry and exit * verifies that outgoing cfa offset and register of predecessor blocks match incoming values of their successors * inserts additional CFI directives at basic block beginning to correct the rule for calculating CFA Having CFI instructions in function epilogue can cause incorrect CFA calculation rule for some basic blocks. This can happen if, due to basic block reordering, or the existence of multiple epilogue blocks, some of the blocks have wrong cfa offset and register values set by the epilogue block above them. CFIInstrInserter is currently run only on X86, but can be used by any target that implements support for adding CFI instructions in epilogue. Patch by Violeta Vukobrat. Differential Revision: https://reviews.llvm.org/D42848 llvm-svn: 330706
2018-04-24 18:32:08 +08:00
; X86-AVX-NEXT: .cfi_def_cfa_offset 4
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: mul_2xi8:
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
; X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx
; X64-SSE-NEXT: movd %ecx, %xmm0
; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx
; X64-SSE-NEXT: movd %ecx, %xmm1
; X64-SSE-NEXT: pxor %xmm2, %xmm2
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; X64-SSE-NEXT: pmullw %xmm0, %xmm1
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_2xi8:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X64-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
; X64-AVX-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
%tmp7 = bitcast i8* %tmp6 to <2 x i8>*
%wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
%tmp8 = zext <2 x i8> %wide.load to <2 x i32>
%tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
%tmp11 = bitcast i8* %tmp10 to <2 x i8>*
%wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
%tmp12 = zext <2 x i8> %wide.load17 to <2 x i32>
%tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
%tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
%tmp15 = bitcast i32* %tmp14 to <2 x i32>*
store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
ret void
}
; %val1 = load <4 x i8>
; %op1 = zext<4 x i32> %val1
; %val2 = load <4 x i8>
; %op2 = zext<4 x i32> %val2
; %rst = mul <4 x i32> %op1, %op2
;
define void @mul_4xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; X86-SSE-LABEL: mul_4xi8:
; X86-SSE: # %bb.0: # %entry
; X86-SSE-NEXT: pushl %esi
; X86-SSE-NEXT: .cfi_def_cfa_offset 8
; X86-SSE-NEXT: .cfi_offset %esi, -8
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT: movl c, %esi
; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT: pxor %xmm1, %xmm1
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-SSE-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; X86-SSE-NEXT: pmaddwd %xmm0, %xmm2
; X86-SSE-NEXT: movdqu %xmm2, (%esi,%ecx,4)
; X86-SSE-NEXT: popl %esi
Correct dwarf unwind information in function epilogue This patch aims to provide correct dwarf unwind information in function epilogue for X86. It consists of two parts. The first part inserts CFI instructions that set appropriate cfa offset and cfa register in emitEpilogue() in X86FrameLowering. This part is X86 specific. The second part is platform independent and ensures that: * CFI instructions do not affect code generation (they are not counted as instructions when tail duplicating or tail merging) * Unwind information remains correct when a function is modified by different passes. This is done in a late pass by analyzing information about cfa offset and cfa register in BBs and inserting additional CFI directives where necessary. Added CFIInstrInserter pass: * analyzes each basic block to determine cfa offset and register are valid at its entry and exit * verifies that outgoing cfa offset and register of predecessor blocks match incoming values of their successors * inserts additional CFI directives at basic block beginning to correct the rule for calculating CFA Having CFI instructions in function epilogue can cause incorrect CFA calculation rule for some basic blocks. This can happen if, due to basic block reordering, or the existence of multiple epilogue blocks, some of the blocks have wrong cfa offset and register values set by the epilogue block above them. CFIInstrInserter is currently run only on X86, but can be used by any target that implements support for adding CFI instructions in epilogue. Patch by Violeta Vukobrat. Differential Revision: https://reviews.llvm.org/D42848 llvm-svn: 330706
2018-04-24 18:32:08 +08:00
; X86-SSE-NEXT: .cfi_def_cfa_offset 4
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: mul_4xi8:
; X86-AVX: # %bb.0: # %entry
; X86-AVX-NEXT: pushl %esi
; X86-AVX-NEXT: .cfi_def_cfa_offset 8
; X86-AVX-NEXT: .cfi_offset %esi, -8
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT: movl c, %esi
; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
; X86-AVX-NEXT: vmovdqu %xmm0, (%esi,%ecx,4)
; X86-AVX-NEXT: popl %esi
Correct dwarf unwind information in function epilogue This patch aims to provide correct dwarf unwind information in function epilogue for X86. It consists of two parts. The first part inserts CFI instructions that set appropriate cfa offset and cfa register in emitEpilogue() in X86FrameLowering. This part is X86 specific. The second part is platform independent and ensures that: * CFI instructions do not affect code generation (they are not counted as instructions when tail duplicating or tail merging) * Unwind information remains correct when a function is modified by different passes. This is done in a late pass by analyzing information about cfa offset and cfa register in BBs and inserting additional CFI directives where necessary. Added CFIInstrInserter pass: * analyzes each basic block to determine cfa offset and register are valid at its entry and exit * verifies that outgoing cfa offset and register of predecessor blocks match incoming values of their successors * inserts additional CFI directives at basic block beginning to correct the rule for calculating CFA Having CFI instructions in function epilogue can cause incorrect CFA calculation rule for some basic blocks. This can happen if, due to basic block reordering, or the existence of multiple epilogue blocks, some of the blocks have wrong cfa offset and register values set by the epilogue block above them. CFIInstrInserter is currently run only on X86, but can be used by any target that implements support for adding CFI instructions in epilogue. Patch by Violeta Vukobrat. Differential Revision: https://reviews.llvm.org/D42848 llvm-svn: 330706
2018-04-24 18:32:08 +08:00
; X86-AVX-NEXT: .cfi_def_cfa_offset 4
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: mul_4xi8:
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-SSE-NEXT: pxor %xmm1, %xmm1
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-SSE-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; X64-SSE-NEXT: pmaddwd %xmm0, %xmm2
; X64-SSE-NEXT: movdqu %xmm2, (%rax,%rdx,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_4xi8:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT: vmovdqu %xmm0, (%rax,%rdx,4)
; X64-AVX-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
%tmp7 = bitcast i8* %tmp6 to <4 x i8>*
%wide.load = load <4 x i8>, <4 x i8>* %tmp7, align 1
%tmp8 = zext <4 x i8> %wide.load to <4 x i32>
%tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
%tmp11 = bitcast i8* %tmp10 to <4 x i8>*
%wide.load17 = load <4 x i8>, <4 x i8>* %tmp11, align 1
%tmp12 = zext <4 x i8> %wide.load17 to <4 x i32>
%tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8
%tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
%tmp15 = bitcast i32* %tmp14 to <4 x i32>*
store <4 x i32> %tmp13, <4 x i32>* %tmp15, align 4
ret void
}
; %val1 = load <8 x i8>
; %op1 = zext<8 x i32> %val1
; %val2 = load <8 x i8>
; %op2 = zext<8 x i32> %val2
; %rst = mul <8 x i32> %op1, %op2
;
define void @mul_8xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; X86-SSE-LABEL: mul_8xi8:
; X86-SSE: # %bb.0: # %entry
; X86-SSE-NEXT: pushl %esi
; X86-SSE-NEXT: .cfi_def_cfa_offset 8
; X86-SSE-NEXT: .cfi_offset %esi, -8
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT: movl c, %esi
; X86-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X86-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; X86-SSE-NEXT: pxor %xmm2, %xmm2
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; X86-SSE-NEXT: pmullw %xmm0, %xmm1
; X86-SSE-NEXT: movdqa %xmm1, %xmm0
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; X86-SSE-NEXT: movdqu %xmm1, 16(%esi,%ecx,4)
; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4)
; X86-SSE-NEXT: popl %esi
Correct dwarf unwind information in function epilogue This patch aims to provide correct dwarf unwind information in function epilogue for X86. It consists of two parts. The first part inserts CFI instructions that set appropriate cfa offset and cfa register in emitEpilogue() in X86FrameLowering. This part is X86 specific. The second part is platform independent and ensures that: * CFI instructions do not affect code generation (they are not counted as instructions when tail duplicating or tail merging) * Unwind information remains correct when a function is modified by different passes. This is done in a late pass by analyzing information about cfa offset and cfa register in BBs and inserting additional CFI directives where necessary. Added CFIInstrInserter pass: * analyzes each basic block to determine cfa offset and register are valid at its entry and exit * verifies that outgoing cfa offset and register of predecessor blocks match incoming values of their successors * inserts additional CFI directives at basic block beginning to correct the rule for calculating CFA Having CFI instructions in function epilogue can cause incorrect CFA calculation rule for some basic blocks. This can happen if, due to basic block reordering, or the existence of multiple epilogue blocks, some of the blocks have wrong cfa offset and register values set by the epilogue block above them. CFIInstrInserter is currently run only on X86, but can be used by any target that implements support for adding CFI instructions in epilogue. Patch by Violeta Vukobrat. Differential Revision: https://reviews.llvm.org/D42848 llvm-svn: 330706
2018-04-24 18:32:08 +08:00
; X86-SSE-NEXT: .cfi_def_cfa_offset 4
; X86-SSE-NEXT: retl
;
; X86-AVX1-LABEL: mul_8xi8:
; X86-AVX1: # %bb.0: # %entry
; X86-AVX1-NEXT: pushl %esi
; X86-AVX1-NEXT: .cfi_def_cfa_offset 8
; X86-AVX1-NEXT: .cfi_offset %esi, -8
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX1-NEXT: movl c, %esi
; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-AVX1-NEXT: vpmaddwd %xmm0, %xmm2, %xmm0
; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-AVX1-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1
; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-AVX1-NEXT: vmovups %ymm0, (%esi,%ecx,4)
; X86-AVX1-NEXT: popl %esi
Correct dwarf unwind information in function epilogue This patch aims to provide correct dwarf unwind information in function epilogue for X86. It consists of two parts. The first part inserts CFI instructions that set appropriate cfa offset and cfa register in emitEpilogue() in X86FrameLowering. This part is X86 specific. The second part is platform independent and ensures that: * CFI instructions do not affect code generation (they are not counted as instructions when tail duplicating or tail merging) * Unwind information remains correct when a function is modified by different passes. This is done in a late pass by analyzing information about cfa offset and cfa register in BBs and inserting additional CFI directives where necessary. Added CFIInstrInserter pass: * analyzes each basic block to determine cfa offset and register are valid at its entry and exit * verifies that outgoing cfa offset and register of predecessor blocks match incoming values of their successors * inserts additional CFI directives at basic block beginning to correct the rule for calculating CFA Having CFI instructions in function epilogue can cause incorrect CFA calculation rule for some basic blocks. This can happen if, due to basic block reordering, or the existence of multiple epilogue blocks, some of the blocks have wrong cfa offset and register values set by the epilogue block above them. CFIInstrInserter is currently run only on X86, but can be used by any target that implements support for adding CFI instructions in epilogue. Patch by Violeta Vukobrat. Differential Revision: https://reviews.llvm.org/D42848 llvm-svn: 330706
2018-04-24 18:32:08 +08:00
; X86-AVX1-NEXT: .cfi_def_cfa_offset 4
; X86-AVX1-NEXT: vzeroupper
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: mul_8xi8:
; X86-AVX2: # %bb.0: # %entry
; X86-AVX2-NEXT: pushl %esi
; X86-AVX2-NEXT: .cfi_def_cfa_offset 8
; X86-AVX2-NEXT: .cfi_offset %esi, -8
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX2-NEXT: movl c, %esi
; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; X86-AVX2-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0
; X86-AVX2-NEXT: vmovdqu %ymm0, (%esi,%ecx,4)
; X86-AVX2-NEXT: popl %esi
Correct dwarf unwind information in function epilogue This patch aims to provide correct dwarf unwind information in function epilogue for X86. It consists of two parts. The first part inserts CFI instructions that set appropriate cfa offset and cfa register in emitEpilogue() in X86FrameLowering. This part is X86 specific. The second part is platform independent and ensures that: * CFI instructions do not affect code generation (they are not counted as instructions when tail duplicating or tail merging) * Unwind information remains correct when a function is modified by different passes. This is done in a late pass by analyzing information about cfa offset and cfa register in BBs and inserting additional CFI directives where necessary. Added CFIInstrInserter pass: * analyzes each basic block to determine cfa offset and register are valid at its entry and exit * verifies that outgoing cfa offset and register of predecessor blocks match incoming values of their successors * inserts additional CFI directives at basic block beginning to correct the rule for calculating CFA Having CFI instructions in function epilogue can cause incorrect CFA calculation rule for some basic blocks. This can happen if, due to basic block reordering, or the existence of multiple epilogue blocks, some of the blocks have wrong cfa offset and register values set by the epilogue block above them. CFIInstrInserter is currently run only on X86, but can be used by any target that implements support for adding CFI instructions in epilogue. Patch by Violeta Vukobrat. Differential Revision: https://reviews.llvm.org/D42848 llvm-svn: 330706
2018-04-24 18:32:08 +08:00
; X86-AVX2-NEXT: .cfi_def_cfa_offset 4
; X86-AVX2-NEXT: vzeroupper
; X86-AVX2-NEXT: retl
;
; X64-SSE-LABEL: mul_8xi8:
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
; X64-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X64-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; X64-SSE-NEXT: pxor %xmm2, %xmm2
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; X64-SSE-NEXT: pmullw %xmm0, %xmm1
; X64-SSE-NEXT: movdqa %xmm1, %xmm0
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; X64-SSE-NEXT: movdqu %xmm1, 16(%rax,%rdx,4)
; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4)
; X64-SSE-NEXT: retq
;
; X64-AVX1-LABEL: mul_8xi8:
; X64-AVX1: # %bb.0: # %entry
; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-AVX1-NEXT: vpmaddwd %xmm0, %xmm2, %xmm0
; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-AVX1-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1
; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-AVX1-NEXT: vmovups %ymm0, (%rax,%rdx,4)
; X64-AVX1-NEXT: vzeroupper
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: mul_8xi8:
; X64-AVX2: # %bb.0: # %entry
; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; X64-AVX2-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: vmovdqu %ymm0, (%rax,%rdx,4)
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
%tmp7 = bitcast i8* %tmp6 to <8 x i8>*
%wide.load = load <8 x i8>, <8 x i8>* %tmp7, align 1
%tmp8 = zext <8 x i8> %wide.load to <8 x i32>
%tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
%tmp11 = bitcast i8* %tmp10 to <8 x i8>*
%wide.load17 = load <8 x i8>, <8 x i8>* %tmp11, align 1
%tmp12 = zext <8 x i8> %wide.load17 to <8 x i32>
%tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8
%tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
%tmp15 = bitcast i32* %tmp14 to <8 x i32>*
store <8 x i32> %tmp13, <8 x i32>* %tmp15, align 4
ret void
}
; %val1 = load <16 x i8>
; %op1 = zext<16 x i32> %val1
; %val2 = load <16 x i8>
; %op2 = zext<16 x i32> %val2
; %rst = mul <16 x i32> %op1, %op2
;
define void @mul_16xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; X86-SSE-LABEL: mul_16xi8:
; X86-SSE: # %bb.0: # %entry
; X86-SSE-NEXT: pushl %esi
; X86-SSE-NEXT: .cfi_def_cfa_offset 8
; X86-SSE-NEXT: .cfi_offset %esi, -8
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT: movl c, %esi
; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0
; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm1
; X86-SSE-NEXT: pxor %xmm2, %xmm2
; X86-SSE-NEXT: movdqa %xmm0, %xmm3
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; X86-SSE-NEXT: movdqa %xmm1, %xmm4
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; X86-SSE-NEXT: pmullw %xmm3, %xmm4
; X86-SSE-NEXT: movdqa %xmm4, %xmm3
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
; X86-SSE-NEXT: pmullw %xmm0, %xmm1
; X86-SSE-NEXT: movdqa %xmm1, %xmm0
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; X86-SSE-NEXT: movdqu %xmm1, 48(%esi,%ecx,4)
; X86-SSE-NEXT: movdqu %xmm0, 32(%esi,%ecx,4)
; X86-SSE-NEXT: movdqu %xmm4, 16(%esi,%ecx,4)
; X86-SSE-NEXT: movdqu %xmm3, (%esi,%ecx,4)
; X86-SSE-NEXT: popl %esi
Correct dwarf unwind information in function epilogue This patch aims to provide correct dwarf unwind information in function epilogue for X86. It consists of two parts. The first part inserts CFI instructions that set appropriate cfa offset and cfa register in emitEpilogue() in X86FrameLowering. This part is X86 specific. The second part is platform independent and ensures that: * CFI instructions do not affect code generation (they are not counted as instructions when tail duplicating or tail merging) * Unwind information remains correct when a function is modified by different passes. This is done in a late pass by analyzing information about cfa offset and cfa register in BBs and inserting additional CFI directives where necessary. Added CFIInstrInserter pass: * analyzes each basic block to determine cfa offset and register are valid at its entry and exit * verifies that outgoing cfa offset and register of predecessor blocks match incoming values of their successors * inserts additional CFI directives at basic block beginning to correct the rule for calculating CFA Having CFI instructions in function epilogue can cause incorrect CFA calculation rule for some basic blocks. This can happen if, due to basic block reordering, or the existence of multiple epilogue blocks, some of the blocks have wrong cfa offset and register values set by the epilogue block above them. CFIInstrInserter is currently run only on X86, but can be used by any target that implements support for adding CFI instructions in epilogue. Patch by Violeta Vukobrat. Differential Revision: https://reviews.llvm.org/D42848 llvm-svn: 330706
2018-04-24 18:32:08 +08:00
; X86-SSE-NEXT: .cfi_def_cfa_offset 4
; X86-SSE-NEXT: retl
;
; X86-AVX1-LABEL: mul_16xi8:
; X86-AVX1: # %bb.0: # %entry
; X86-AVX1-NEXT: pushl %esi
; X86-AVX1-NEXT: .cfi_def_cfa_offset 8
; X86-AVX1-NEXT: .cfi_offset %esi, -8
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX1-NEXT: movl c, %esi
; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-AVX1-NEXT: vpmaddwd %xmm0, %xmm4, %xmm0
; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-AVX1-NEXT: vpmaddwd %xmm1, %xmm4, %xmm1
; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2
; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3
; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-AVX1-NEXT: vmovups %ymm0, 32(%esi,%ecx,4)
; X86-AVX1-NEXT: vmovups %ymm2, (%esi,%ecx,4)
; X86-AVX1-NEXT: popl %esi
Correct dwarf unwind information in function epilogue This patch aims to provide correct dwarf unwind information in function epilogue for X86. It consists of two parts. The first part inserts CFI instructions that set appropriate cfa offset and cfa register in emitEpilogue() in X86FrameLowering. This part is X86 specific. The second part is platform independent and ensures that: * CFI instructions do not affect code generation (they are not counted as instructions when tail duplicating or tail merging) * Unwind information remains correct when a function is modified by different passes. This is done in a late pass by analyzing information about cfa offset and cfa register in BBs and inserting additional CFI directives where necessary. Added CFIInstrInserter pass: * analyzes each basic block to determine cfa offset and register are valid at its entry and exit * verifies that outgoing cfa offset and register of predecessor blocks match incoming values of their successors * inserts additional CFI directives at basic block beginning to correct the rule for calculating CFA Having CFI instructions in function epilogue can cause incorrect CFA calculation rule for some basic blocks. This can happen if, due to basic block reordering, or the existence of multiple epilogue blocks, some of the blocks have wrong cfa offset and register values set by the epilogue block above them. CFIInstrInserter is currently run only on X86, but can be used by any target that implements support for adding CFI instructions in epilogue. Patch by Violeta Vukobrat. Differential Revision: https://reviews.llvm.org/D42848 llvm-svn: 330706
2018-04-24 18:32:08 +08:00
; X86-AVX1-NEXT: .cfi_def_cfa_offset 4
; X86-AVX1-NEXT: vzeroupper
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: mul_16xi8:
; X86-AVX2: # %bb.0: # %entry
; X86-AVX2-NEXT: pushl %esi
; X86-AVX2-NEXT: .cfi_def_cfa_offset 8
; X86-AVX2-NEXT: .cfi_offset %esi, -8
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX2-NEXT: movl c, %esi
; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; X86-AVX2-NEXT: vpmaddwd %ymm0, %ymm2, %ymm0
; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; X86-AVX2-NEXT: vpmaddwd %ymm1, %ymm2, %ymm1
; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%esi,%ecx,4)
; X86-AVX2-NEXT: vmovdqu %ymm1, (%esi,%ecx,4)
; X86-AVX2-NEXT: popl %esi
Correct dwarf unwind information in function epilogue This patch aims to provide correct dwarf unwind information in function epilogue for X86. It consists of two parts. The first part inserts CFI instructions that set appropriate cfa offset and cfa register in emitEpilogue() in X86FrameLowering. This part is X86 specific. The second part is platform independent and ensures that: * CFI instructions do not affect code generation (they are not counted as instructions when tail duplicating or tail merging) * Unwind information remains correct when a function is modified by different passes. This is done in a late pass by analyzing information about cfa offset and cfa register in BBs and inserting additional CFI directives where necessary. Added CFIInstrInserter pass: * analyzes each basic block to determine cfa offset and register are valid at its entry and exit * verifies that outgoing cfa offset and register of predecessor blocks match incoming values of their successors * inserts additional CFI directives at basic block beginning to correct the rule for calculating CFA Having CFI instructions in function epilogue can cause incorrect CFA calculation rule for some basic blocks. This can happen if, due to basic block reordering, or the existence of multiple epilogue blocks, some of the blocks have wrong cfa offset and register values set by the epilogue block above them. CFIInstrInserter is currently run only on X86, but can be used by any target that implements support for adding CFI instructions in epilogue. Patch by Violeta Vukobrat. Differential Revision: https://reviews.llvm.org/D42848 llvm-svn: 330706
2018-04-24 18:32:08 +08:00
; X86-AVX2-NEXT: .cfi_def_cfa_offset 4
; X86-AVX2-NEXT: vzeroupper
; X86-AVX2-NEXT: retl
;
; X64-SSE-LABEL: mul_16xi8:
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0
; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm1
; X64-SSE-NEXT: pxor %xmm2, %xmm2
; X64-SSE-NEXT: movdqa %xmm0, %xmm3
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; X64-SSE-NEXT: movdqa %xmm1, %xmm4
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; X64-SSE-NEXT: pmullw %xmm3, %xmm4
; X64-SSE-NEXT: movdqa %xmm4, %xmm3
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; X64-SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; X64-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
; X64-SSE-NEXT: pmullw %xmm0, %xmm1
; X64-SSE-NEXT: movdqa %xmm1, %xmm0
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; X64-SSE-NEXT: movdqu %xmm1, 48(%rax,%rdx,4)
; X64-SSE-NEXT: movdqu %xmm0, 32(%rax,%rdx,4)
; X64-SSE-NEXT: movdqu %xmm4, 16(%rax,%rdx,4)
; X64-SSE-NEXT: movdqu %xmm3, (%rax,%rdx,4)
; X64-SSE-NEXT: retq
;
; X64-AVX1-LABEL: mul_16xi8:
; X64-AVX1: # %bb.0: # %entry
; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-AVX1-NEXT: vpmaddwd %xmm0, %xmm4, %xmm0
; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-AVX1-NEXT: vpmaddwd %xmm1, %xmm4, %xmm1
; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2
; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3
; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-AVX1-NEXT: vmovups %ymm0, 32(%rax,%rdx,4)
; X64-AVX1-NEXT: vmovups %ymm2, (%rax,%rdx,4)
; X64-AVX1-NEXT: vzeroupper
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: mul_16xi8:
; X64-AVX2: # %bb.0: # %entry
; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; X64-AVX2-NEXT: vpmaddwd %ymm0, %ymm2, %ymm0
; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; X64-AVX2-NEXT: vpmaddwd %ymm1, %ymm2, %ymm1
; X64-AVX2-NEXT: vmovdqu %ymm0, 32(%rax,%rdx,4)
; X64-AVX2-NEXT: vmovdqu %ymm1, (%rax,%rdx,4)
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
%tmp7 = bitcast i8* %tmp6 to <16 x i8>*
%wide.load = load <16 x i8>, <16 x i8>* %tmp7, align 1
%tmp8 = zext <16 x i8> %wide.load to <16 x i32>
%tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
%tmp11 = bitcast i8* %tmp10 to <16 x i8>*
%wide.load17 = load <16 x i8>, <16 x i8>* %tmp11, align 1
%tmp12 = zext <16 x i8> %wide.load17 to <16 x i32>
%tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
%tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
%tmp15 = bitcast i32* %tmp14 to <16 x i32>*
store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
ret void
}
; %val1 = load <2 x i16>
; %op1 = zext<2 x i32> %val1
; %val2 = load <2 x i16>
; %op2 = zext<2 x i32> %val2
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; X86-SSE-LABEL: mul_2xi16:
; X86-SSE: # %bb.0: # %entry
; X86-SSE-NEXT: pushl %esi
; X86-SSE-NEXT: .cfi_def_cfa_offset 8
; X86-SSE-NEXT: .cfi_offset %esi, -8
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT: movl c, %esi
; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT: movdqa %xmm1, %xmm2
; X86-SSE-NEXT: pmulhuw %xmm0, %xmm2
; X86-SSE-NEXT: pmullw %xmm0, %xmm1
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4)
; X86-SSE-NEXT: popl %esi
Correct dwarf unwind information in function epilogue This patch aims to provide correct dwarf unwind information in function epilogue for X86. It consists of two parts. The first part inserts CFI instructions that set appropriate cfa offset and cfa register in emitEpilogue() in X86FrameLowering. This part is X86 specific. The second part is platform independent and ensures that: * CFI instructions do not affect code generation (they are not counted as instructions when tail duplicating or tail merging) * Unwind information remains correct when a function is modified by different passes. This is done in a late pass by analyzing information about cfa offset and cfa register in BBs and inserting additional CFI directives where necessary. Added CFIInstrInserter pass: * analyzes each basic block to determine cfa offset and register are valid at its entry and exit * verifies that outgoing cfa offset and register of predecessor blocks match incoming values of their successors * inserts additional CFI directives at basic block beginning to correct the rule for calculating CFA Having CFI instructions in function epilogue can cause incorrect CFA calculation rule for some basic blocks. This can happen if, due to basic block reordering, or the existence of multiple epilogue blocks, some of the blocks have wrong cfa offset and register values set by the epilogue block above them. CFIInstrInserter is currently run only on X86, but can be used by any target that implements support for adding CFI instructions in epilogue. Patch by Violeta Vukobrat. Differential Revision: https://reviews.llvm.org/D42848 llvm-svn: 330706
2018-04-24 18:32:08 +08:00
; X86-SSE-NEXT: .cfi_def_cfa_offset 4
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: mul_2xi16:
; X86-AVX: # %bb.0: # %entry
; X86-AVX-NEXT: pushl %esi
; X86-AVX-NEXT: .cfi_def_cfa_offset 8
; X86-AVX-NEXT: .cfi_offset %esi, -8
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT: movl c, %esi
; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
; X86-AVX-NEXT: popl %esi
Correct dwarf unwind information in function epilogue This patch aims to provide correct dwarf unwind information in function epilogue for X86. It consists of two parts. The first part inserts CFI instructions that set appropriate cfa offset and cfa register in emitEpilogue() in X86FrameLowering. This part is X86 specific. The second part is platform independent and ensures that: * CFI instructions do not affect code generation (they are not counted as instructions when tail duplicating or tail merging) * Unwind information remains correct when a function is modified by different passes. This is done in a late pass by analyzing information about cfa offset and cfa register in BBs and inserting additional CFI directives where necessary. Added CFIInstrInserter pass: * analyzes each basic block to determine cfa offset and register are valid at its entry and exit * verifies that outgoing cfa offset and register of predecessor blocks match incoming values of their successors * inserts additional CFI directives at basic block beginning to correct the rule for calculating CFA Having CFI instructions in function epilogue can cause incorrect CFA calculation rule for some basic blocks. This can happen if, due to basic block reordering, or the existence of multiple epilogue blocks, some of the blocks have wrong cfa offset and register values set by the epilogue block above them. CFIInstrInserter is currently run only on X86, but can be used by any target that implements support for adding CFI instructions in epilogue. Patch by Violeta Vukobrat. Differential Revision: https://reviews.llvm.org/D42848 llvm-svn: 330706
2018-04-24 18:32:08 +08:00
; X86-AVX-NEXT: .cfi_def_cfa_offset 4
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: mul_2xi16:
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-SSE-NEXT: movdqa %xmm1, %xmm2
; X64-SSE-NEXT: pmulhuw %xmm0, %xmm2
; X64-SSE-NEXT: pmullw %xmm0, %xmm1
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_2xi16:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; X64-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
; X64-AVX-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
%tmp7 = bitcast i8* %tmp6 to <2 x i16>*
%wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
%tmp8 = zext <2 x i16> %wide.load to <2 x i32>
%tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
%tmp11 = bitcast i8* %tmp10 to <2 x i16>*
%wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
%tmp12 = zext <2 x i16> %wide.load17 to <2 x i32>
%tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
%tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
%tmp15 = bitcast i32* %tmp14 to <2 x i32>*
store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
ret void
}
; %val1 = load <4 x i16>
; %op1 = zext<4 x i32> %val1
; %val2 = load <4 x i16>
; %op2 = zext<4 x i32> %val2
; %rst = mul <4 x i32> %op1, %op2
;
define void @mul_4xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; X86-SSE-LABEL: mul_4xi16:
; X86-SSE: # %bb.0: # %entry
; X86-SSE-NEXT: pushl %esi
; X86-SSE-NEXT: .cfi_def_cfa_offset 8
; X86-SSE-NEXT: .cfi_offset %esi, -8
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT: movl c, %esi
; X86-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X86-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; X86-SSE-NEXT: movdqa %xmm1, %xmm2
; X86-SSE-NEXT: pmulhuw %xmm0, %xmm2
; X86-SSE-NEXT: pmullw %xmm0, %xmm1
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-SSE-NEXT: movdqu %xmm1, (%esi,%ecx,4)
; X86-SSE-NEXT: popl %esi
Correct dwarf unwind information in function epilogue This patch aims to provide correct dwarf unwind information in function epilogue for X86. It consists of two parts. The first part inserts CFI instructions that set appropriate cfa offset and cfa register in emitEpilogue() in X86FrameLowering. This part is X86 specific. The second part is platform independent and ensures that: * CFI instructions do not affect code generation (they are not counted as instructions when tail duplicating or tail merging) * Unwind information remains correct when a function is modified by different passes. This is done in a late pass by analyzing information about cfa offset and cfa register in BBs and inserting additional CFI directives where necessary. Added CFIInstrInserter pass: * analyzes each basic block to determine cfa offset and register are valid at its entry and exit * verifies that outgoing cfa offset and register of predecessor blocks match incoming values of their successors * inserts additional CFI directives at basic block beginning to correct the rule for calculating CFA Having CFI instructions in function epilogue can cause incorrect CFA calculation rule for some basic blocks. This can happen if, due to basic block reordering, or the existence of multiple epilogue blocks, some of the blocks have wrong cfa offset and register values set by the epilogue block above them. CFIInstrInserter is currently run only on X86, but can be used by any target that implements support for adding CFI instructions in epilogue. Patch by Violeta Vukobrat. Differential Revision: https://reviews.llvm.org/D42848 llvm-svn: 330706
2018-04-24 18:32:08 +08:00
; X86-SSE-NEXT: .cfi_def_cfa_offset 4
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: mul_4xi16:
; X86-AVX: # %bb.0: # %entry
; X86-AVX-NEXT: pushl %esi
; X86-AVX-NEXT: .cfi_def_cfa_offset 8
; X86-AVX-NEXT: .cfi_offset %esi, -8
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT: movl c, %esi
; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X86-AVX-NEXT: vmovdqu %xmm0, (%esi,%ecx,4)
; X86-AVX-NEXT: popl %esi
Correct dwarf unwind information in function epilogue This patch aims to provide correct dwarf unwind information in function epilogue for X86. It consists of two parts. The first part inserts CFI instructions that set appropriate cfa offset and cfa register in emitEpilogue() in X86FrameLowering. This part is X86 specific. The second part is platform independent and ensures that: * CFI instructions do not affect code generation (they are not counted as instructions when tail duplicating or tail merging) * Unwind information remains correct when a function is modified by different passes. This is done in a late pass by analyzing information about cfa offset and cfa register in BBs and inserting additional CFI directives where necessary. Added CFIInstrInserter pass: * analyzes each basic block to determine cfa offset and register are valid at its entry and exit * verifies that outgoing cfa offset and register of predecessor blocks match incoming values of their successors * inserts additional CFI directives at basic block beginning to correct the rule for calculating CFA Having CFI instructions in function epilogue can cause incorrect CFA calculation rule for some basic blocks. This can happen if, due to basic block reordering, or the existence of multiple epilogue blocks, some of the blocks have wrong cfa offset and register values set by the epilogue block above them. CFIInstrInserter is currently run only on X86, but can be used by any target that implements support for adding CFI instructions in epilogue. Patch by Violeta Vukobrat. Differential Revision: https://reviews.llvm.org/D42848 llvm-svn: 330706
2018-04-24 18:32:08 +08:00
; X86-AVX-NEXT: .cfi_def_cfa_offset 4
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: mul_4xi16:
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
; X64-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X64-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; X64-SSE-NEXT: movdqa %xmm1, %xmm2
; X64-SSE-NEXT: pmulhuw %xmm0, %xmm2
; X64-SSE-NEXT: pmullw %xmm0, %xmm1
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X64-SSE-NEXT: movdqu %xmm1, (%rax,%rdx,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_4xi16:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT: vmovdqu %xmm0, (%rax,%rdx,4)
; X64-AVX-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
%tmp7 = bitcast i8* %tmp6 to <4 x i16>*
%wide.load = load <4 x i16>, <4 x i16>* %tmp7, align 1
%tmp8 = zext <4 x i16> %wide.load to <4 x i32>
%tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
%tmp11 = bitcast i8* %tmp10 to <4 x i16>*
%wide.load17 = load <4 x i16>, <4 x i16>* %tmp11, align 1
%tmp12 = zext <4 x i16> %wide.load17 to <4 x i32>
%tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8
%tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
%tmp15 = bitcast i32* %tmp14 to <4 x i32>*
store <4 x i32> %tmp13, <4 x i32>* %tmp15, align 4
ret void
}
; %val1 = load <8 x i16>
; %op1 = zext<8 x i32> %val1
; %val2 = load <8 x i16>
; %op2 = zext<8 x i32> %val2
; %rst = mul <8 x i32> %op1, %op2
;
define void @mul_8xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; X86-SSE-LABEL: mul_8xi16:
; X86-SSE: # %bb.0: # %entry
; X86-SSE-NEXT: pushl %esi
; X86-SSE-NEXT: .cfi_def_cfa_offset 8
; X86-SSE-NEXT: .cfi_offset %esi, -8
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT: movl c, %esi
; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0
; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm1
; X86-SSE-NEXT: movdqa %xmm1, %xmm2
; X86-SSE-NEXT: pmulhuw %xmm0, %xmm2
; X86-SSE-NEXT: pmullw %xmm0, %xmm1
; X86-SSE-NEXT: movdqa %xmm1, %xmm0
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; X86-SSE-NEXT: movdqu %xmm1, 16(%esi,%ecx,4)
; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4)
; X86-SSE-NEXT: popl %esi
Correct dwarf unwind information in function epilogue This patch aims to provide correct dwarf unwind information in function epilogue for X86. It consists of two parts. The first part inserts CFI instructions that set appropriate cfa offset and cfa register in emitEpilogue() in X86FrameLowering. This part is X86 specific. The second part is platform independent and ensures that: * CFI instructions do not affect code generation (they are not counted as instructions when tail duplicating or tail merging) * Unwind information remains correct when a function is modified by different passes. This is done in a late pass by analyzing information about cfa offset and cfa register in BBs and inserting additional CFI directives where necessary. Added CFIInstrInserter pass: * analyzes each basic block to determine cfa offset and register are valid at its entry and exit * verifies that outgoing cfa offset and register of predecessor blocks match incoming values of their successors * inserts additional CFI directives at basic block beginning to correct the rule for calculating CFA Having CFI instructions in function epilogue can cause incorrect CFA calculation rule for some basic blocks. This can happen if, due to basic block reordering, or the existence of multiple epilogue blocks, some of the blocks have wrong cfa offset and register values set by the epilogue block above them. CFIInstrInserter is currently run only on X86, but can be used by any target that implements support for adding CFI instructions in epilogue. Patch by Violeta Vukobrat. Differential Revision: https://reviews.llvm.org/D42848 llvm-svn: 330706
2018-04-24 18:32:08 +08:00
; X86-SSE-NEXT: .cfi_def_cfa_offset 4
; X86-SSE-NEXT: retl
;
; X86-AVX1-LABEL: mul_8xi16:
; X86-AVX1: # %bb.0: # %entry
; X86-AVX1-NEXT: pushl %esi
; X86-AVX1-NEXT: .cfi_def_cfa_offset 8
; X86-AVX1-NEXT: .cfi_offset %esi, -8
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX1-NEXT: movl c, %esi
; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX1-NEXT: vpmulld %xmm0, %xmm2, %xmm0
; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1
; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-AVX1-NEXT: vmovups %ymm0, (%esi,%ecx,4)
; X86-AVX1-NEXT: popl %esi
Correct dwarf unwind information in function epilogue This patch aims to provide correct dwarf unwind information in function epilogue for X86. It consists of two parts. The first part inserts CFI instructions that set appropriate cfa offset and cfa register in emitEpilogue() in X86FrameLowering. This part is X86 specific. The second part is platform independent and ensures that: * CFI instructions do not affect code generation (they are not counted as instructions when tail duplicating or tail merging) * Unwind information remains correct when a function is modified by different passes. This is done in a late pass by analyzing information about cfa offset and cfa register in BBs and inserting additional CFI directives where necessary. Added CFIInstrInserter pass: * analyzes each basic block to determine cfa offset and register are valid at its entry and exit * verifies that outgoing cfa offset and register of predecessor blocks match incoming values of their successors * inserts additional CFI directives at basic block beginning to correct the rule for calculating CFA Having CFI instructions in function epilogue can cause incorrect CFA calculation rule for some basic blocks. This can happen if, due to basic block reordering, or the existence of multiple epilogue blocks, some of the blocks have wrong cfa offset and register values set by the epilogue block above them. CFIInstrInserter is currently run only on X86, but can be used by any target that implements support for adding CFI instructions in epilogue. Patch by Violeta Vukobrat. Differential Revision: https://reviews.llvm.org/D42848 llvm-svn: 330706
2018-04-24 18:32:08 +08:00
; X86-AVX1-NEXT: .cfi_def_cfa_offset 4
; X86-AVX1-NEXT: vzeroupper
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: mul_8xi16:
; X86-AVX2: # %bb.0: # %entry
; X86-AVX2-NEXT: pushl %esi
; X86-AVX2-NEXT: .cfi_def_cfa_offset 8
; X86-AVX2-NEXT: .cfi_offset %esi, -8
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX2-NEXT: movl c, %esi
; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; X86-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0
; X86-AVX2-NEXT: vmovdqu %ymm0, (%esi,%ecx,4)
; X86-AVX2-NEXT: popl %esi
Correct dwarf unwind information in function epilogue This patch aims to provide correct dwarf unwind information in function epilogue for X86. It consists of two parts. The first part inserts CFI instructions that set appropriate cfa offset and cfa register in emitEpilogue() in X86FrameLowering. This part is X86 specific. The second part is platform independent and ensures that: * CFI instructions do not affect code generation (they are not counted as instructions when tail duplicating or tail merging) * Unwind information remains correct when a function is modified by different passes. This is done in a late pass by analyzing information about cfa offset and cfa register in BBs and inserting additional CFI directives where necessary. Added CFIInstrInserter pass: * analyzes each basic block to determine cfa offset and register are valid at its entry and exit * verifies that outgoing cfa offset and register of predecessor blocks match incoming values of their successors * inserts additional CFI directives at basic block beginning to correct the rule for calculating CFA Having CFI instructions in function epilogue can cause incorrect CFA calculation rule for some basic blocks. This can happen if, due to basic block reordering, or the existence of multiple epilogue blocks, some of the blocks have wrong cfa offset and register values set by the epilogue block above them. CFIInstrInserter is currently run only on X86, but can be used by any target that implements support for adding CFI instructions in epilogue. Patch by Violeta Vukobrat. Differential Revision: https://reviews.llvm.org/D42848 llvm-svn: 330706
2018-04-24 18:32:08 +08:00
; X86-AVX2-NEXT: .cfi_def_cfa_offset 4
; X86-AVX2-NEXT: vzeroupper
; X86-AVX2-NEXT: retl
;
; X64-SSE-LABEL: mul_8xi16:
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0
; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm1
; X64-SSE-NEXT: movdqa %xmm1, %xmm2
; X64-SSE-NEXT: pmulhuw %xmm0, %xmm2
; X64-SSE-NEXT: pmullw %xmm0, %xmm1
; X64-SSE-NEXT: movdqa %xmm1, %xmm0
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; X64-SSE-NEXT: movdqu %xmm1, 16(%rax,%rdx,4)
; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4)
; X64-SSE-NEXT: retq
;
; X64-AVX1-LABEL: mul_8xi16:
; X64-AVX1: # %bb.0: # %entry
; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX1-NEXT: vpmulld %xmm0, %xmm2, %xmm0
; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1
; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-AVX1-NEXT: vmovups %ymm0, (%rax,%rdx,4)
; X64-AVX1-NEXT: vzeroupper
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: mul_8xi16:
; X64-AVX2: # %bb.0: # %entry
; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; X64-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: vmovdqu %ymm0, (%rax,%rdx,4)
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
%tmp7 = bitcast i8* %tmp6 to <8 x i16>*
%wide.load = load <8 x i16>, <8 x i16>* %tmp7, align 1
%tmp8 = zext <8 x i16> %wide.load to <8 x i32>
%tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
%tmp11 = bitcast i8* %tmp10 to <8 x i16>*
%wide.load17 = load <8 x i16>, <8 x i16>* %tmp11, align 1
%tmp12 = zext <8 x i16> %wide.load17 to <8 x i32>
%tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8
%tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
%tmp15 = bitcast i32* %tmp14 to <8 x i32>*
store <8 x i32> %tmp13, <8 x i32>* %tmp15, align 4
ret void
}
; %val1 = load <16 x i16>
; %op1 = zext<16 x i32> %val1
; %val2 = load <16 x i16>
; %op2 = zext<16 x i32> %val2
; %rst = mul <16 x i32> %op1, %op2
;
define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; X86-SSE-LABEL: mul_16xi16:
; X86-SSE: # %bb.0: # %entry
; X86-SSE-NEXT: pushl %esi
; X86-SSE-NEXT: .cfi_def_cfa_offset 8
; X86-SSE-NEXT: .cfi_offset %esi, -8
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT: movl c, %esi
; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0
; X86-SSE-NEXT: movdqu 16(%edx,%ecx), %xmm1
; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm2
; X86-SSE-NEXT: movdqu 16(%eax,%ecx), %xmm3
; X86-SSE-NEXT: movdqa %xmm2, %xmm4
; X86-SSE-NEXT: pmulhuw %xmm0, %xmm4
; X86-SSE-NEXT: pmullw %xmm0, %xmm2
; X86-SSE-NEXT: movdqa %xmm2, %xmm0
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
; X86-SSE-NEXT: movdqa %xmm3, %xmm4
; X86-SSE-NEXT: pmulhuw %xmm1, %xmm4
; X86-SSE-NEXT: pmullw %xmm1, %xmm3
; X86-SSE-NEXT: movdqa %xmm3, %xmm1
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; X86-SSE-NEXT: movdqu %xmm3, 48(%esi,%ecx,4)
; X86-SSE-NEXT: movdqu %xmm1, 32(%esi,%ecx,4)
; X86-SSE-NEXT: movdqu %xmm2, 16(%esi,%ecx,4)
; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4)
; X86-SSE-NEXT: popl %esi
Correct dwarf unwind information in function epilogue This patch aims to provide correct dwarf unwind information in function epilogue for X86. It consists of two parts. The first part inserts CFI instructions that set appropriate cfa offset and cfa register in emitEpilogue() in X86FrameLowering. This part is X86 specific. The second part is platform independent and ensures that: * CFI instructions do not affect code generation (they are not counted as instructions when tail duplicating or tail merging) * Unwind information remains correct when a function is modified by different passes. This is done in a late pass by analyzing information about cfa offset and cfa register in BBs and inserting additional CFI directives where necessary. Added CFIInstrInserter pass: * analyzes each basic block to determine cfa offset and register are valid at its entry and exit * verifies that outgoing cfa offset and register of predecessor blocks match incoming values of their successors * inserts additional CFI directives at basic block beginning to correct the rule for calculating CFA Having CFI instructions in function epilogue can cause incorrect CFA calculation rule for some basic blocks. This can happen if, due to basic block reordering, or the existence of multiple epilogue blocks, some of the blocks have wrong cfa offset and register values set by the epilogue block above them. CFIInstrInserter is currently run only on X86, but can be used by any target that implements support for adding CFI instructions in epilogue. Patch by Violeta Vukobrat. Differential Revision: https://reviews.llvm.org/D42848 llvm-svn: 330706
2018-04-24 18:32:08 +08:00
; X86-SSE-NEXT: .cfi_def_cfa_offset 4
; X86-SSE-NEXT: retl
;
; X86-AVX1-LABEL: mul_16xi16:
; X86-AVX1: # %bb.0: # %entry
; X86-AVX1-NEXT: pushl %esi
; X86-AVX1-NEXT: .cfi_def_cfa_offset 8
; X86-AVX1-NEXT: .cfi_offset %esi, -8
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX1-NEXT: movl c, %esi
; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0
; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1
; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2
; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3
; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-AVX1-NEXT: vmovups %ymm0, 32(%esi,%ecx,4)
; X86-AVX1-NEXT: vmovups %ymm2, (%esi,%ecx,4)
; X86-AVX1-NEXT: popl %esi
Correct dwarf unwind information in function epilogue This patch aims to provide correct dwarf unwind information in function epilogue for X86. It consists of two parts. The first part inserts CFI instructions that set appropriate cfa offset and cfa register in emitEpilogue() in X86FrameLowering. This part is X86 specific. The second part is platform independent and ensures that: * CFI instructions do not affect code generation (they are not counted as instructions when tail duplicating or tail merging) * Unwind information remains correct when a function is modified by different passes. This is done in a late pass by analyzing information about cfa offset and cfa register in BBs and inserting additional CFI directives where necessary. Added CFIInstrInserter pass: * analyzes each basic block to determine cfa offset and register are valid at its entry and exit * verifies that outgoing cfa offset and register of predecessor blocks match incoming values of their successors * inserts additional CFI directives at basic block beginning to correct the rule for calculating CFA Having CFI instructions in function epilogue can cause incorrect CFA calculation rule for some basic blocks. This can happen if, due to basic block reordering, or the existence of multiple epilogue blocks, some of the blocks have wrong cfa offset and register values set by the epilogue block above them. CFIInstrInserter is currently run only on X86, but can be used by any target that implements support for adding CFI instructions in epilogue. Patch by Violeta Vukobrat. Differential Revision: https://reviews.llvm.org/D42848 llvm-svn: 330706
2018-04-24 18:32:08 +08:00
; X86-AVX1-NEXT: .cfi_def_cfa_offset 4
; X86-AVX1-NEXT: vzeroupper
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: mul_16xi16:
; X86-AVX2: # %bb.0: # %entry
; X86-AVX2-NEXT: pushl %esi
; X86-AVX2-NEXT: .cfi_def_cfa_offset 8
; X86-AVX2-NEXT: .cfi_offset %esi, -8
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX2-NEXT: movl c, %esi
; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; X86-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0
; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; X86-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1
; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%esi,%ecx,4)
; X86-AVX2-NEXT: vmovdqu %ymm1, (%esi,%ecx,4)
; X86-AVX2-NEXT: popl %esi
Correct dwarf unwind information in function epilogue This patch aims to provide correct dwarf unwind information in function epilogue for X86. It consists of two parts. The first part inserts CFI instructions that set appropriate cfa offset and cfa register in emitEpilogue() in X86FrameLowering. This part is X86 specific. The second part is platform independent and ensures that: * CFI instructions do not affect code generation (they are not counted as instructions when tail duplicating or tail merging) * Unwind information remains correct when a function is modified by different passes. This is done in a late pass by analyzing information about cfa offset and cfa register in BBs and inserting additional CFI directives where necessary. Added CFIInstrInserter pass: * analyzes each basic block to determine cfa offset and register are valid at its entry and exit * verifies that outgoing cfa offset and register of predecessor blocks match incoming values of their successors * inserts additional CFI directives at basic block beginning to correct the rule for calculating CFA Having CFI instructions in function epilogue can cause incorrect CFA calculation rule for some basic blocks. This can happen if, due to basic block reordering, or the existence of multiple epilogue blocks, some of the blocks have wrong cfa offset and register values set by the epilogue block above them. CFIInstrInserter is currently run only on X86, but can be used by any target that implements support for adding CFI instructions in epilogue. Patch by Violeta Vukobrat. Differential Revision: https://reviews.llvm.org/D42848 llvm-svn: 330706
2018-04-24 18:32:08 +08:00
; X86-AVX2-NEXT: .cfi_def_cfa_offset 4
; X86-AVX2-NEXT: vzeroupper
; X86-AVX2-NEXT: retl
;
; X64-SSE-LABEL: mul_16xi16:
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0
; X64-SSE-NEXT: movdqu 16(%rdi,%rdx), %xmm1
; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm2
; X64-SSE-NEXT: movdqu 16(%rsi,%rdx), %xmm3
; X64-SSE-NEXT: movdqa %xmm2, %xmm4
; X64-SSE-NEXT: pmulhuw %xmm0, %xmm4
; X64-SSE-NEXT: pmullw %xmm0, %xmm2
; X64-SSE-NEXT: movdqa %xmm2, %xmm0
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
; X64-SSE-NEXT: movdqa %xmm3, %xmm4
; X64-SSE-NEXT: pmulhuw %xmm1, %xmm4
; X64-SSE-NEXT: pmullw %xmm1, %xmm3
; X64-SSE-NEXT: movdqa %xmm3, %xmm1
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; X64-SSE-NEXT: movdqu %xmm3, 48(%rax,%rdx,4)
; X64-SSE-NEXT: movdqu %xmm1, 32(%rax,%rdx,4)
; X64-SSE-NEXT: movdqu %xmm2, 16(%rax,%rdx,4)
; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4)
; X64-SSE-NEXT: retq
;
; X64-AVX1-LABEL: mul_16xi16:
; X64-AVX1: # %bb.0: # %entry
; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0
; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1
; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2
; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3
; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-AVX1-NEXT: vmovups %ymm0, 32(%rax,%rdx,4)
; X64-AVX1-NEXT: vmovups %ymm2, (%rax,%rdx,4)
; X64-AVX1-NEXT: vzeroupper
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: mul_16xi16:
; X64-AVX2: # %bb.0: # %entry
; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; X64-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0
; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; X64-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1
; X64-AVX2-NEXT: vmovdqu %ymm0, 32(%rax,%rdx,4)
; X64-AVX2-NEXT: vmovdqu %ymm1, (%rax,%rdx,4)
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
%tmp7 = bitcast i8* %tmp6 to <16 x i16>*
%wide.load = load <16 x i16>, <16 x i16>* %tmp7, align 1
%tmp8 = zext <16 x i16> %wide.load to <16 x i32>
%tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
%tmp11 = bitcast i8* %tmp10 to <16 x i16>*
%wide.load17 = load <16 x i16>, <16 x i16>* %tmp11, align 1
%tmp12 = zext <16 x i16> %wide.load17 to <16 x i32>
%tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
%tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
%tmp15 = bitcast i32* %tmp14 to <16 x i32>*
store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
ret void
}
; %val1 = load <2 x i8>
; %op1 = sext<2 x i32> %val1
; %val2 = load <2 x i8>
; %op2 = sext<2 x i32> %val2
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; X86-SSE-LABEL: mul_2xi8_sext:
; X86-SSE: # %bb.0: # %entry
; X86-SSE-NEXT: pushl %esi
; X86-SSE-NEXT: .cfi_def_cfa_offset 8
; X86-SSE-NEXT: .cfi_offset %esi, -8
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT: movl c, %esi
; X86-SSE-NEXT: movzwl (%edx,%ecx), %edx
; X86-SSE-NEXT: movd %edx, %xmm0
; X86-SSE-NEXT: movzwl (%eax,%ecx), %eax
; X86-SSE-NEXT: movd %eax, %xmm1
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE-NEXT: psraw $8, %xmm0
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE-NEXT: psraw $8, %xmm1
; X86-SSE-NEXT: pmullw %xmm0, %xmm1
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-SSE-NEXT: psrad $16, %xmm0
; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4)
; X86-SSE-NEXT: popl %esi
Correct dwarf unwind information in function epilogue This patch aims to provide correct dwarf unwind information in function epilogue for X86. It consists of two parts. The first part inserts CFI instructions that set appropriate cfa offset and cfa register in emitEpilogue() in X86FrameLowering. This part is X86 specific. The second part is platform independent and ensures that: * CFI instructions do not affect code generation (they are not counted as instructions when tail duplicating or tail merging) * Unwind information remains correct when a function is modified by different passes. This is done in a late pass by analyzing information about cfa offset and cfa register in BBs and inserting additional CFI directives where necessary. Added CFIInstrInserter pass: * analyzes each basic block to determine cfa offset and register are valid at its entry and exit * verifies that outgoing cfa offset and register of predecessor blocks match incoming values of their successors * inserts additional CFI directives at basic block beginning to correct the rule for calculating CFA Having CFI instructions in function epilogue can cause incorrect CFA calculation rule for some basic blocks. This can happen if, due to basic block reordering, or the existence of multiple epilogue blocks, some of the blocks have wrong cfa offset and register values set by the epilogue block above them. CFIInstrInserter is currently run only on X86, but can be used by any target that implements support for adding CFI instructions in epilogue. Patch by Violeta Vukobrat. Differential Revision: https://reviews.llvm.org/D42848 llvm-svn: 330706
2018-04-24 18:32:08 +08:00
; X86-SSE-NEXT: .cfi_def_cfa_offset 4
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: mul_2xi8_sext:
; X86-AVX: # %bb.0: # %entry
; X86-AVX-NEXT: pushl %esi
; X86-AVX-NEXT: .cfi_def_cfa_offset 8
; X86-AVX-NEXT: .cfi_offset %esi, -8
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT: movl c, %esi
; X86-AVX-NEXT: vpmovsxbq (%edx,%ecx), %xmm0
; X86-AVX-NEXT: vpmovsxbq (%eax,%ecx), %xmm1
; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
; X86-AVX-NEXT: popl %esi
Correct dwarf unwind information in function epilogue This patch aims to provide correct dwarf unwind information in function epilogue for X86. It consists of two parts. The first part inserts CFI instructions that set appropriate cfa offset and cfa register in emitEpilogue() in X86FrameLowering. This part is X86 specific. The second part is platform independent and ensures that: * CFI instructions do not affect code generation (they are not counted as instructions when tail duplicating or tail merging) * Unwind information remains correct when a function is modified by different passes. This is done in a late pass by analyzing information about cfa offset and cfa register in BBs and inserting additional CFI directives where necessary. Added CFIInstrInserter pass: * analyzes each basic block to determine cfa offset and register are valid at its entry and exit * verifies that outgoing cfa offset and register of predecessor blocks match incoming values of their successors * inserts additional CFI directives at basic block beginning to correct the rule for calculating CFA Having CFI instructions in function epilogue can cause incorrect CFA calculation rule for some basic blocks. This can happen if, due to basic block reordering, or the existence of multiple epilogue blocks, some of the blocks have wrong cfa offset and register values set by the epilogue block above them. CFIInstrInserter is currently run only on X86, but can be used by any target that implements support for adding CFI instructions in epilogue. Patch by Violeta Vukobrat. Differential Revision: https://reviews.llvm.org/D42848 llvm-svn: 330706
2018-04-24 18:32:08 +08:00
; X86-AVX-NEXT: .cfi_def_cfa_offset 4
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: mul_2xi8_sext:
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
; X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx
; X64-SSE-NEXT: movd %ecx, %xmm0
; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx
; X64-SSE-NEXT: movd %ecx, %xmm1
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-SSE-NEXT: psraw $8, %xmm0
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-SSE-NEXT: psraw $8, %xmm1
; X64-SSE-NEXT: pmullw %xmm0, %xmm1
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-SSE-NEXT: psrad $16, %xmm0
; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_2xi8_sext:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX-NEXT: vpmovsxbq (%rdi,%rdx), %xmm0
; X64-AVX-NEXT: vpmovsxbq (%rsi,%rdx), %xmm1
; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
; X64-AVX-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
%tmp7 = bitcast i8* %tmp6 to <2 x i8>*
%wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
%tmp8 = sext <2 x i8> %wide.load to <2 x i32>
%tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
%tmp11 = bitcast i8* %tmp10 to <2 x i8>*
%wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
%tmp12 = sext <2 x i8> %wide.load17 to <2 x i32>
%tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
%tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
%tmp15 = bitcast i32* %tmp14 to <2 x i32>*
store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
ret void
}
; %val1 = load <2 x i8>
; %op1 = sext<2 x i32> %val1
; %val2 = load <2 x i8>
; %op2 = zext<2 x i32> %val2
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; X86-SSE-LABEL: mul_2xi8_sext_zext:
; X86-SSE: # %bb.0: # %entry
; X86-SSE-NEXT: pushl %esi
; X86-SSE-NEXT: .cfi_def_cfa_offset 8
; X86-SSE-NEXT: .cfi_offset %esi, -8
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT: movl c, %esi
; X86-SSE-NEXT: movzwl (%edx,%ecx), %edx
; X86-SSE-NEXT: movd %edx, %xmm0
; X86-SSE-NEXT: movzwl (%eax,%ecx), %eax
; X86-SSE-NEXT: movd %eax, %xmm1
; X86-SSE-NEXT: pxor %xmm2, %xmm2
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE-NEXT: psraw $8, %xmm0
; X86-SSE-NEXT: movdqa %xmm1, %xmm2
; X86-SSE-NEXT: pmulhw %xmm0, %xmm2
; X86-SSE-NEXT: pmullw %xmm1, %xmm0
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4)
; X86-SSE-NEXT: popl %esi
Correct dwarf unwind information in function epilogue This patch aims to provide correct dwarf unwind information in function epilogue for X86. It consists of two parts. The first part inserts CFI instructions that set appropriate cfa offset and cfa register in emitEpilogue() in X86FrameLowering. This part is X86 specific. The second part is platform independent and ensures that: * CFI instructions do not affect code generation (they are not counted as instructions when tail duplicating or tail merging) * Unwind information remains correct when a function is modified by different passes. This is done in a late pass by analyzing information about cfa offset and cfa register in BBs and inserting additional CFI directives where necessary. Added CFIInstrInserter pass: * analyzes each basic block to determine cfa offset and register are valid at its entry and exit * verifies that outgoing cfa offset and register of predecessor blocks match incoming values of their successors * inserts additional CFI directives at basic block beginning to correct the rule for calculating CFA Having CFI instructions in function epilogue can cause incorrect CFA calculation rule for some basic blocks. This can happen if, due to basic block reordering, or the existence of multiple epilogue blocks, some of the blocks have wrong cfa offset and register values set by the epilogue block above them. CFIInstrInserter is currently run only on X86, but can be used by any target that implements support for adding CFI instructions in epilogue. Patch by Violeta Vukobrat. Differential Revision: https://reviews.llvm.org/D42848 llvm-svn: 330706
2018-04-24 18:32:08 +08:00
; X86-SSE-NEXT: .cfi_def_cfa_offset 4
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: mul_2xi8_sext_zext:
; X86-AVX: # %bb.0: # %entry
; X86-AVX-NEXT: pushl %esi
; X86-AVX-NEXT: .cfi_def_cfa_offset 8
; X86-AVX-NEXT: .cfi_offset %esi, -8
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT: movl c, %esi
; X86-AVX-NEXT: vpmovsxbq (%edx,%ecx), %xmm0
; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
; X86-AVX-NEXT: popl %esi
Correct dwarf unwind information in function epilogue This patch aims to provide correct dwarf unwind information in function epilogue for X86. It consists of two parts. The first part inserts CFI instructions that set appropriate cfa offset and cfa register in emitEpilogue() in X86FrameLowering. This part is X86 specific. The second part is platform independent and ensures that: * CFI instructions do not affect code generation (they are not counted as instructions when tail duplicating or tail merging) * Unwind information remains correct when a function is modified by different passes. This is done in a late pass by analyzing information about cfa offset and cfa register in BBs and inserting additional CFI directives where necessary. Added CFIInstrInserter pass: * analyzes each basic block to determine cfa offset and register are valid at its entry and exit * verifies that outgoing cfa offset and register of predecessor blocks match incoming values of their successors * inserts additional CFI directives at basic block beginning to correct the rule for calculating CFA Having CFI instructions in function epilogue can cause incorrect CFA calculation rule for some basic blocks. This can happen if, due to basic block reordering, or the existence of multiple epilogue blocks, some of the blocks have wrong cfa offset and register values set by the epilogue block above them. CFIInstrInserter is currently run only on X86, but can be used by any target that implements support for adding CFI instructions in epilogue. Patch by Violeta Vukobrat. Differential Revision: https://reviews.llvm.org/D42848 llvm-svn: 330706
2018-04-24 18:32:08 +08:00
; X86-AVX-NEXT: .cfi_def_cfa_offset 4
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: mul_2xi8_sext_zext:
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
; X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx
; X64-SSE-NEXT: movd %ecx, %xmm0
; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx
; X64-SSE-NEXT: movd %ecx, %xmm1
; X64-SSE-NEXT: pxor %xmm2, %xmm2
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-SSE-NEXT: psraw $8, %xmm0
; X64-SSE-NEXT: movdqa %xmm1, %xmm2
; X64-SSE-NEXT: pmulhw %xmm0, %xmm2
; X64-SSE-NEXT: pmullw %xmm1, %xmm0
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_2xi8_sext_zext:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX-NEXT: vpmovsxbq (%rdi,%rdx), %xmm0
; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
; X64-AVX-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
%tmp7 = bitcast i8* %tmp6 to <2 x i8>*
%wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
%tmp8 = sext <2 x i8> %wide.load to <2 x i32>
%tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
%tmp11 = bitcast i8* %tmp10 to <2 x i8>*
%wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
%tmp12 = zext <2 x i8> %wide.load17 to <2 x i32>
%tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
%tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
%tmp15 = bitcast i32* %tmp14 to <2 x i32>*
store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
ret void
}
; %val1 = load <2 x i16>
; %op1 = sext<2 x i32> %val1
; %val2 = load <2 x i16>
; %op2 = sext<2 x i32> %val2
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; X86-SSE-LABEL: mul_2xi16_sext:
; X86-SSE: # %bb.0: # %entry
; X86-SSE-NEXT: pushl %esi
; X86-SSE-NEXT: .cfi_def_cfa_offset 8
; X86-SSE-NEXT: .cfi_offset %esi, -8
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT: movl c, %esi
; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT: movdqa %xmm1, %xmm2
; X86-SSE-NEXT: pmulhw %xmm0, %xmm2
; X86-SSE-NEXT: pmullw %xmm0, %xmm1
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4)
; X86-SSE-NEXT: popl %esi
Correct dwarf unwind information in function epilogue This patch aims to provide correct dwarf unwind information in function epilogue for X86. It consists of two parts. The first part inserts CFI instructions that set appropriate cfa offset and cfa register in emitEpilogue() in X86FrameLowering. This part is X86 specific. The second part is platform independent and ensures that: * CFI instructions do not affect code generation (they are not counted as instructions when tail duplicating or tail merging) * Unwind information remains correct when a function is modified by different passes. This is done in a late pass by analyzing information about cfa offset and cfa register in BBs and inserting additional CFI directives where necessary. Added CFIInstrInserter pass: * analyzes each basic block to determine cfa offset and register are valid at its entry and exit * verifies that outgoing cfa offset and register of predecessor blocks match incoming values of their successors * inserts additional CFI directives at basic block beginning to correct the rule for calculating CFA Having CFI instructions in function epilogue can cause incorrect CFA calculation rule for some basic blocks. This can happen if, due to basic block reordering, or the existence of multiple epilogue blocks, some of the blocks have wrong cfa offset and register values set by the epilogue block above them. CFIInstrInserter is currently run only on X86, but can be used by any target that implements support for adding CFI instructions in epilogue. Patch by Violeta Vukobrat. Differential Revision: https://reviews.llvm.org/D42848 llvm-svn: 330706
2018-04-24 18:32:08 +08:00
; X86-SSE-NEXT: .cfi_def_cfa_offset 4
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: mul_2xi16_sext:
; X86-AVX: # %bb.0: # %entry
; X86-AVX-NEXT: pushl %esi
; X86-AVX-NEXT: .cfi_def_cfa_offset 8
; X86-AVX-NEXT: .cfi_offset %esi, -8
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT: movl c, %esi
; X86-AVX-NEXT: vpmovsxwq (%edx,%ecx), %xmm0
; X86-AVX-NEXT: vpmovsxwq (%eax,%ecx), %xmm1
; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
; X86-AVX-NEXT: popl %esi
Correct dwarf unwind information in function epilogue This patch aims to provide correct dwarf unwind information in function epilogue for X86. It consists of two parts. The first part inserts CFI instructions that set appropriate cfa offset and cfa register in emitEpilogue() in X86FrameLowering. This part is X86 specific. The second part is platform independent and ensures that: * CFI instructions do not affect code generation (they are not counted as instructions when tail duplicating or tail merging) * Unwind information remains correct when a function is modified by different passes. This is done in a late pass by analyzing information about cfa offset and cfa register in BBs and inserting additional CFI directives where necessary. Added CFIInstrInserter pass: * analyzes each basic block to determine cfa offset and register are valid at its entry and exit * verifies that outgoing cfa offset and register of predecessor blocks match incoming values of their successors * inserts additional CFI directives at basic block beginning to correct the rule for calculating CFA Having CFI instructions in function epilogue can cause incorrect CFA calculation rule for some basic blocks. This can happen if, due to basic block reordering, or the existence of multiple epilogue blocks, some of the blocks have wrong cfa offset and register values set by the epilogue block above them. CFIInstrInserter is currently run only on X86, but can be used by any target that implements support for adding CFI instructions in epilogue. Patch by Violeta Vukobrat. Differential Revision: https://reviews.llvm.org/D42848 llvm-svn: 330706
2018-04-24 18:32:08 +08:00
; X86-AVX-NEXT: .cfi_def_cfa_offset 4
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: mul_2xi16_sext:
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-SSE-NEXT: movdqa %xmm1, %xmm2
; X64-SSE-NEXT: pmulhw %xmm0, %xmm2
; X64-SSE-NEXT: pmullw %xmm0, %xmm1
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_2xi16_sext:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX-NEXT: vpmovsxwq (%rdi,%rdx), %xmm0
; X64-AVX-NEXT: vpmovsxwq (%rsi,%rdx), %xmm1
; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
; X64-AVX-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
%tmp7 = bitcast i8* %tmp6 to <2 x i16>*
%wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
%tmp8 = sext <2 x i16> %wide.load to <2 x i32>
%tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
%tmp11 = bitcast i8* %tmp10 to <2 x i16>*
%wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
%tmp12 = sext <2 x i16> %wide.load17 to <2 x i32>
%tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
%tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
%tmp15 = bitcast i32* %tmp14 to <2 x i32>*
store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
ret void
}
; %val1 = load <2 x i16>
; %op1 = sext<2 x i32> %val1
; %val2 = load <2 x i16>
; %op2 = zext<2 x i32> %val2
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; X86-SSE-LABEL: mul_2xi16_sext_zext:
; X86-SSE: # %bb.0: # %entry
; X86-SSE-NEXT: pushl %esi
; X86-SSE-NEXT: .cfi_def_cfa_offset 8
; X86-SSE-NEXT: .cfi_offset %esi, -8
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT: movl c, %esi
; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
; X86-SSE-NEXT: psrad $16, %xmm0
; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT: pxor %xmm2, %xmm2
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
; X86-SSE-NEXT: movdqa %xmm1, %xmm3
; X86-SSE-NEXT: pmuludq %xmm0, %xmm3
; X86-SSE-NEXT: pmuludq %xmm2, %xmm1
; X86-SSE-NEXT: pmuludq %xmm0, %xmm2
; X86-SSE-NEXT: paddq %xmm1, %xmm2
; X86-SSE-NEXT: psllq $32, %xmm2
; X86-SSE-NEXT: paddq %xmm3, %xmm2
; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4)
; X86-SSE-NEXT: popl %esi
Correct dwarf unwind information in function epilogue This patch aims to provide correct dwarf unwind information in function epilogue for X86. It consists of two parts. The first part inserts CFI instructions that set appropriate cfa offset and cfa register in emitEpilogue() in X86FrameLowering. This part is X86 specific. The second part is platform independent and ensures that: * CFI instructions do not affect code generation (they are not counted as instructions when tail duplicating or tail merging) * Unwind information remains correct when a function is modified by different passes. This is done in a late pass by analyzing information about cfa offset and cfa register in BBs and inserting additional CFI directives where necessary. Added CFIInstrInserter pass: * analyzes each basic block to determine cfa offset and register are valid at its entry and exit * verifies that outgoing cfa offset and register of predecessor blocks match incoming values of their successors * inserts additional CFI directives at basic block beginning to correct the rule for calculating CFA Having CFI instructions in function epilogue can cause incorrect CFA calculation rule for some basic blocks. This can happen if, due to basic block reordering, or the existence of multiple epilogue blocks, some of the blocks have wrong cfa offset and register values set by the epilogue block above them. CFIInstrInserter is currently run only on X86, but can be used by any target that implements support for adding CFI instructions in epilogue. Patch by Violeta Vukobrat. Differential Revision: https://reviews.llvm.org/D42848 llvm-svn: 330706
2018-04-24 18:32:08 +08:00
; X86-SSE-NEXT: .cfi_def_cfa_offset 4
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: mul_2xi16_sext_zext:
; X86-AVX: # %bb.0: # %entry
; X86-AVX-NEXT: pushl %esi
; X86-AVX-NEXT: .cfi_def_cfa_offset 8
; X86-AVX-NEXT: .cfi_offset %esi, -8
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT: movl c, %esi
; X86-AVX-NEXT: vpmovsxwq (%edx,%ecx), %xmm0
; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; X86-AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
; X86-AVX-NEXT: popl %esi
Correct dwarf unwind information in function epilogue This patch aims to provide correct dwarf unwind information in function epilogue for X86. It consists of two parts. The first part inserts CFI instructions that set appropriate cfa offset and cfa register in emitEpilogue() in X86FrameLowering. This part is X86 specific. The second part is platform independent and ensures that: * CFI instructions do not affect code generation (they are not counted as instructions when tail duplicating or tail merging) * Unwind information remains correct when a function is modified by different passes. This is done in a late pass by analyzing information about cfa offset and cfa register in BBs and inserting additional CFI directives where necessary. Added CFIInstrInserter pass: * analyzes each basic block to determine cfa offset and register are valid at its entry and exit * verifies that outgoing cfa offset and register of predecessor blocks match incoming values of their successors * inserts additional CFI directives at basic block beginning to correct the rule for calculating CFA Having CFI instructions in function epilogue can cause incorrect CFA calculation rule for some basic blocks. This can happen if, due to basic block reordering, or the existence of multiple epilogue blocks, some of the blocks have wrong cfa offset and register values set by the epilogue block above them. CFIInstrInserter is currently run only on X86, but can be used by any target that implements support for adding CFI instructions in epilogue. Patch by Violeta Vukobrat. Differential Revision: https://reviews.llvm.org/D42848 llvm-svn: 330706
2018-04-24 18:32:08 +08:00
; X86-AVX-NEXT: .cfi_def_cfa_offset 4
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: mul_2xi16_sext_zext:
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
; X64-SSE-NEXT: psrad $16, %xmm0
; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-SSE-NEXT: pxor %xmm2, %xmm2
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
; X64-SSE-NEXT: movdqa %xmm1, %xmm3
; X64-SSE-NEXT: pmuludq %xmm0, %xmm3
; X64-SSE-NEXT: pmuludq %xmm2, %xmm1
; X64-SSE-NEXT: pmuludq %xmm0, %xmm2
; X64-SSE-NEXT: paddq %xmm1, %xmm2
; X64-SSE-NEXT: psllq $32, %xmm2
; X64-SSE-NEXT: paddq %xmm3, %xmm2
; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_2xi16_sext_zext:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX-NEXT: vpmovsxwq (%rdi,%rdx), %xmm0
; X64-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; X64-AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
; X64-AVX-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
%tmp7 = bitcast i8* %tmp6 to <2 x i16>*
%wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
%tmp8 = sext <2 x i16> %wide.load to <2 x i32>
%tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
%tmp11 = bitcast i8* %tmp10 to <2 x i16>*
%wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
%tmp12 = zext <2 x i16> %wide.load17 to <2 x i32>
%tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
%tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
%tmp15 = bitcast i32* %tmp14 to <2 x i32>*
store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
ret void
}
; %val1 = load <16 x i16>
; %op1 = sext<16 x i32> %val1
; %val2 = load <16 x i16>
; %op2 = sext<16 x i32> %val2
; %rst = mul <16 x i32> %op1, %op2
;
define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; X86-SSE-LABEL: mul_16xi16_sext:
; X86-SSE: # %bb.0: # %entry
; X86-SSE-NEXT: pushl %esi
; X86-SSE-NEXT: .cfi_def_cfa_offset 8
; X86-SSE-NEXT: .cfi_offset %esi, -8
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT: movl c, %esi
; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0
; X86-SSE-NEXT: movdqu 16(%edx,%ecx), %xmm1
; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm2
; X86-SSE-NEXT: movdqu 16(%eax,%ecx), %xmm3
; X86-SSE-NEXT: movdqa %xmm2, %xmm4
; X86-SSE-NEXT: pmulhw %xmm0, %xmm4
; X86-SSE-NEXT: pmullw %xmm0, %xmm2
; X86-SSE-NEXT: movdqa %xmm2, %xmm0
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
; X86-SSE-NEXT: movdqa %xmm3, %xmm4
; X86-SSE-NEXT: pmulhw %xmm1, %xmm4
; X86-SSE-NEXT: pmullw %xmm1, %xmm3
; X86-SSE-NEXT: movdqa %xmm3, %xmm1
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; X86-SSE-NEXT: movdqu %xmm3, 48(%esi,%ecx,4)
; X86-SSE-NEXT: movdqu %xmm1, 32(%esi,%ecx,4)
; X86-SSE-NEXT: movdqu %xmm2, 16(%esi,%ecx,4)
; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4)
; X86-SSE-NEXT: popl %esi
Correct dwarf unwind information in function epilogue This patch aims to provide correct dwarf unwind information in function epilogue for X86. It consists of two parts. The first part inserts CFI instructions that set appropriate cfa offset and cfa register in emitEpilogue() in X86FrameLowering. This part is X86 specific. The second part is platform independent and ensures that: * CFI instructions do not affect code generation (they are not counted as instructions when tail duplicating or tail merging) * Unwind information remains correct when a function is modified by different passes. This is done in a late pass by analyzing information about cfa offset and cfa register in BBs and inserting additional CFI directives where necessary. Added CFIInstrInserter pass: * analyzes each basic block to determine cfa offset and register are valid at its entry and exit * verifies that outgoing cfa offset and register of predecessor blocks match incoming values of their successors * inserts additional CFI directives at basic block beginning to correct the rule for calculating CFA Having CFI instructions in function epilogue can cause incorrect CFA calculation rule for some basic blocks. This can happen if, due to basic block reordering, or the existence of multiple epilogue blocks, some of the blocks have wrong cfa offset and register values set by the epilogue block above them. CFIInstrInserter is currently run only on X86, but can be used by any target that implements support for adding CFI instructions in epilogue. Patch by Violeta Vukobrat. Differential Revision: https://reviews.llvm.org/D42848 llvm-svn: 330706
2018-04-24 18:32:08 +08:00
; X86-SSE-NEXT: .cfi_def_cfa_offset 4
; X86-SSE-NEXT: retl
;
; X86-AVX1-LABEL: mul_16xi16_sext:
; X86-AVX1: # %bb.0: # %entry
; X86-AVX1-NEXT: pushl %esi
; X86-AVX1-NEXT: .cfi_def_cfa_offset 8
; X86-AVX1-NEXT: .cfi_offset %esi, -8
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX1-NEXT: movl c, %esi
; X86-AVX1-NEXT: vpmovsxwd 16(%edx,%ecx), %xmm0
; X86-AVX1-NEXT: vpmovsxwd 24(%edx,%ecx), %xmm1
; X86-AVX1-NEXT: vpmovsxwd (%edx,%ecx), %xmm2
; X86-AVX1-NEXT: vpmovsxwd 8(%edx,%ecx), %xmm3
; X86-AVX1-NEXT: vpmovsxwd 16(%eax,%ecx), %xmm4
; X86-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0
; X86-AVX1-NEXT: vpmovsxwd 24(%eax,%ecx), %xmm4
; X86-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1
; X86-AVX1-NEXT: vpmovsxwd (%eax,%ecx), %xmm4
; X86-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2
; X86-AVX1-NEXT: vpmovsxwd 8(%eax,%ecx), %xmm4
; X86-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3
; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-AVX1-NEXT: vmovups %ymm0, 32(%esi,%ecx,4)
; X86-AVX1-NEXT: vmovups %ymm2, (%esi,%ecx,4)
; X86-AVX1-NEXT: popl %esi
Correct dwarf unwind information in function epilogue This patch aims to provide correct dwarf unwind information in function epilogue for X86. It consists of two parts. The first part inserts CFI instructions that set appropriate cfa offset and cfa register in emitEpilogue() in X86FrameLowering. This part is X86 specific. The second part is platform independent and ensures that: * CFI instructions do not affect code generation (they are not counted as instructions when tail duplicating or tail merging) * Unwind information remains correct when a function is modified by different passes. This is done in a late pass by analyzing information about cfa offset and cfa register in BBs and inserting additional CFI directives where necessary. Added CFIInstrInserter pass: * analyzes each basic block to determine cfa offset and register are valid at its entry and exit * verifies that outgoing cfa offset and register of predecessor blocks match incoming values of their successors * inserts additional CFI directives at basic block beginning to correct the rule for calculating CFA Having CFI instructions in function epilogue can cause incorrect CFA calculation rule for some basic blocks. This can happen if, due to basic block reordering, or the existence of multiple epilogue blocks, some of the blocks have wrong cfa offset and register values set by the epilogue block above them. CFIInstrInserter is currently run only on X86, but can be used by any target that implements support for adding CFI instructions in epilogue. Patch by Violeta Vukobrat. Differential Revision: https://reviews.llvm.org/D42848 llvm-svn: 330706
2018-04-24 18:32:08 +08:00
; X86-AVX1-NEXT: .cfi_def_cfa_offset 4
; X86-AVX1-NEXT: vzeroupper
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: mul_16xi16_sext:
; X86-AVX2: # %bb.0: # %entry
; X86-AVX2-NEXT: pushl %esi
; X86-AVX2-NEXT: .cfi_def_cfa_offset 8
; X86-AVX2-NEXT: .cfi_offset %esi, -8
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX2-NEXT: movl c, %esi
; X86-AVX2-NEXT: vpmovsxwd 16(%edx,%ecx), %ymm0
; X86-AVX2-NEXT: vpmovsxwd (%edx,%ecx), %ymm1
; X86-AVX2-NEXT: vpmovsxwd 16(%eax,%ecx), %ymm2
; X86-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0
; X86-AVX2-NEXT: vpmovsxwd (%eax,%ecx), %ymm2
; X86-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1
; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%esi,%ecx,4)
; X86-AVX2-NEXT: vmovdqu %ymm1, (%esi,%ecx,4)
; X86-AVX2-NEXT: popl %esi
Correct dwarf unwind information in function epilogue This patch aims to provide correct dwarf unwind information in function epilogue for X86. It consists of two parts. The first part inserts CFI instructions that set appropriate cfa offset and cfa register in emitEpilogue() in X86FrameLowering. This part is X86 specific. The second part is platform independent and ensures that: * CFI instructions do not affect code generation (they are not counted as instructions when tail duplicating or tail merging) * Unwind information remains correct when a function is modified by different passes. This is done in a late pass by analyzing information about cfa offset and cfa register in BBs and inserting additional CFI directives where necessary. Added CFIInstrInserter pass: * analyzes each basic block to determine cfa offset and register are valid at its entry and exit * verifies that outgoing cfa offset and register of predecessor blocks match incoming values of their successors * inserts additional CFI directives at basic block beginning to correct the rule for calculating CFA Having CFI instructions in function epilogue can cause incorrect CFA calculation rule for some basic blocks. This can happen if, due to basic block reordering, or the existence of multiple epilogue blocks, some of the blocks have wrong cfa offset and register values set by the epilogue block above them. CFIInstrInserter is currently run only on X86, but can be used by any target that implements support for adding CFI instructions in epilogue. Patch by Violeta Vukobrat. Differential Revision: https://reviews.llvm.org/D42848 llvm-svn: 330706
2018-04-24 18:32:08 +08:00
; X86-AVX2-NEXT: .cfi_def_cfa_offset 4
; X86-AVX2-NEXT: vzeroupper
; X86-AVX2-NEXT: retl
;
; X64-SSE-LABEL: mul_16xi16_sext:
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0
; X64-SSE-NEXT: movdqu 16(%rdi,%rdx), %xmm1
; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm2
; X64-SSE-NEXT: movdqu 16(%rsi,%rdx), %xmm3
; X64-SSE-NEXT: movdqa %xmm2, %xmm4
; X64-SSE-NEXT: pmulhw %xmm0, %xmm4
; X64-SSE-NEXT: pmullw %xmm0, %xmm2
; X64-SSE-NEXT: movdqa %xmm2, %xmm0
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
; X64-SSE-NEXT: movdqa %xmm3, %xmm4
; X64-SSE-NEXT: pmulhw %xmm1, %xmm4
; X64-SSE-NEXT: pmullw %xmm1, %xmm3
; X64-SSE-NEXT: movdqa %xmm3, %xmm1
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; X64-SSE-NEXT: movdqu %xmm3, 48(%rax,%rdx,4)
; X64-SSE-NEXT: movdqu %xmm1, 32(%rax,%rdx,4)
; X64-SSE-NEXT: movdqu %xmm2, 16(%rax,%rdx,4)
; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4)
; X64-SSE-NEXT: retq
;
; X64-AVX1-LABEL: mul_16xi16_sext:
; X64-AVX1: # %bb.0: # %entry
; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX1-NEXT: vpmovsxwd 16(%rdi,%rdx), %xmm0
; X64-AVX1-NEXT: vpmovsxwd 24(%rdi,%rdx), %xmm1
; X64-AVX1-NEXT: vpmovsxwd (%rdi,%rdx), %xmm2
; X64-AVX1-NEXT: vpmovsxwd 8(%rdi,%rdx), %xmm3
; X64-AVX1-NEXT: vpmovsxwd 16(%rsi,%rdx), %xmm4
; X64-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0
; X64-AVX1-NEXT: vpmovsxwd 24(%rsi,%rdx), %xmm4
; X64-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1
; X64-AVX1-NEXT: vpmovsxwd (%rsi,%rdx), %xmm4
; X64-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2
; X64-AVX1-NEXT: vpmovsxwd 8(%rsi,%rdx), %xmm4
; X64-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3
; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-AVX1-NEXT: vmovups %ymm0, 32(%rax,%rdx,4)
; X64-AVX1-NEXT: vmovups %ymm2, (%rax,%rdx,4)
; X64-AVX1-NEXT: vzeroupper
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: mul_16xi16_sext:
; X64-AVX2: # %bb.0: # %entry
; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX2-NEXT: vpmovsxwd 16(%rdi,%rdx), %ymm0
; X64-AVX2-NEXT: vpmovsxwd (%rdi,%rdx), %ymm1
; X64-AVX2-NEXT: vpmovsxwd 16(%rsi,%rdx), %ymm2
; X64-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0
; X64-AVX2-NEXT: vpmovsxwd (%rsi,%rdx), %ymm2
; X64-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1
; X64-AVX2-NEXT: vmovdqu %ymm0, 32(%rax,%rdx,4)
; X64-AVX2-NEXT: vmovdqu %ymm1, (%rax,%rdx,4)
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
%tmp7 = bitcast i8* %tmp6 to <16 x i16>*
%wide.load = load <16 x i16>, <16 x i16>* %tmp7, align 1
%tmp8 = sext <16 x i16> %wide.load to <16 x i32>
%tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
%tmp11 = bitcast i8* %tmp10 to <16 x i16>*
%wide.load17 = load <16 x i16>, <16 x i16>* %tmp11, align 1
%tmp12 = sext <16 x i16> %wide.load17 to <16 x i32>
%tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
%tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
%tmp15 = bitcast i32* %tmp14 to <16 x i32>*
store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
ret void
}
; %val = load <2 x i8>
; %op1 = zext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 255)
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_varconst1(i8* nocapture readonly %a, i64 %index) {
; X86-SSE-LABEL: mul_2xi8_varconst1:
; X86-SSE: # %bb.0: # %entry
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl c, %edx
; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx
; X86-SSE-NEXT: movd %ecx, %xmm0
; X86-SSE-NEXT: pxor %xmm1, %xmm1
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X86-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: mul_2xi8_varconst1:
; X86-AVX: # %bb.0: # %entry
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl c, %edx
; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X86-AVX-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: mul_2xi8_varconst1:
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx
; X64-SSE-NEXT: movd %ecx, %xmm0
; X64-SSE-NEXT: pxor %xmm1, %xmm1
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X64-SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_2xi8_varconst1:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X64-AVX-NEXT: movl $255, %ecx
; X64-AVX-NEXT: vmovq %rcx, %xmm1
; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
; X64-AVX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
%tmp7 = bitcast i8* %tmp6 to <2 x i8>*
%wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
%tmp8 = zext <2 x i8> %wide.load to <2 x i32>
%tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 255>
%tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
%tmp15 = bitcast i32* %tmp14 to <2 x i32>*
store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
ret void
}
; %val = load <2 x i8>
; %op1 = sext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-128 ~ 127)
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_varconst2(i8* nocapture readonly %a, i64 %index) {
; X86-SSE-LABEL: mul_2xi8_varconst2:
; X86-SSE: # %bb.0: # %entry
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl c, %edx
; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx
; X86-SSE-NEXT: movd %ecx, %xmm0
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE-NEXT: psraw $8, %xmm0
; X86-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; X86-SSE-NEXT: psrad $16, %xmm0
; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: mul_2xi8_varconst2:
; X86-AVX: # %bb.0: # %entry
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl c, %edx
; X86-AVX-NEXT: vpmovsxbq (%ecx,%eax), %xmm0
; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: mul_2xi8_varconst2:
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx
; X64-SSE-NEXT: movd %ecx, %xmm0
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-SSE-NEXT: psraw $8, %xmm0
; X64-SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; X64-SSE-NEXT: psrad $16, %xmm0
; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_2xi8_varconst2:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX-NEXT: vpmovsxbq (%rdi,%rsi), %xmm0
; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
%tmp7 = bitcast i8* %tmp6 to <2 x i8>*
%wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
%tmp8 = sext <2 x i8> %wide.load to <2 x i32>
%tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -128, i32 127>
%tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
%tmp15 = bitcast i32* %tmp14 to <2 x i32>*
store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
ret void
}
; %val = load <2 x i8>
; %op1 = zext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 256)
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_varconst3(i8* nocapture readonly %a, i64 %index) {
; X86-SSE-LABEL: mul_2xi8_varconst3:
; X86-SSE: # %bb.0: # %entry
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl c, %edx
; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx
; X86-SSE-NEXT: movd %ecx, %xmm0
; X86-SSE-NEXT: pxor %xmm1, %xmm1
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u>
; X86-SSE-NEXT: movdqa %xmm0, %xmm2
; X86-SSE-NEXT: pmulhw %xmm1, %xmm2
; X86-SSE-NEXT: pmullw %xmm1, %xmm0
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: mul_2xi8_varconst3:
; X86-AVX: # %bb.0: # %entry
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl c, %edx
; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X86-AVX-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: mul_2xi8_varconst3:
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx
; X64-SSE-NEXT: movd %ecx, %xmm0
; X64-SSE-NEXT: pxor %xmm1, %xmm1
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u>
; X64-SSE-NEXT: movdqa %xmm0, %xmm2
; X64-SSE-NEXT: pmulhw %xmm1, %xmm2
; X64-SSE-NEXT: pmullw %xmm1, %xmm0
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_2xi8_varconst3:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X64-AVX-NEXT: movl $256, %ecx # imm = 0x100
; X64-AVX-NEXT: vmovq %rcx, %xmm1
; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
; X64-AVX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
%tmp7 = bitcast i8* %tmp6 to <2 x i8>*
%wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
%tmp8 = zext <2 x i8> %wide.load to <2 x i32>
%tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 256>
%tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
%tmp15 = bitcast i32* %tmp14 to <2 x i32>*
store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
ret void
}
; %val = load <2 x i8>
; %op1 = zext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-1 ~ 255)
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_varconst4(i8* nocapture readonly %a, i64 %index) {
; X86-SSE-LABEL: mul_2xi8_varconst4:
; X86-SSE: # %bb.0: # %entry
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl c, %edx
; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx
; X86-SSE-NEXT: movd %ecx, %xmm0
; X86-SSE-NEXT: pxor %xmm1, %xmm1
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u>
; X86-SSE-NEXT: movdqa %xmm0, %xmm2
; X86-SSE-NEXT: pmulhw %xmm1, %xmm2
; X86-SSE-NEXT: pmullw %xmm1, %xmm0
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: mul_2xi8_varconst4:
; X86-AVX: # %bb.0: # %entry
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl c, %edx
; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: mul_2xi8_varconst4:
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx
; X64-SSE-NEXT: movd %ecx, %xmm0
; X64-SSE-NEXT: pxor %xmm1, %xmm1
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u>
; X64-SSE-NEXT: movdqa %xmm0, %xmm2
; X64-SSE-NEXT: pmulhw %xmm1, %xmm2
; X64-SSE-NEXT: pmullw %xmm1, %xmm0
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_2xi8_varconst4:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
%tmp7 = bitcast i8* %tmp6 to <2 x i8>*
%wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
%tmp8 = zext <2 x i8> %wide.load to <2 x i32>
%tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -1, i32 255>
%tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
%tmp15 = bitcast i32* %tmp14 to <2 x i32>*
store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
ret void
}
; %val = load <2 x i8>
; %op1 = sext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-129 ~ 127)
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_varconst5(i8* nocapture readonly %a, i64 %index) {
; X86-SSE-LABEL: mul_2xi8_varconst5:
; X86-SSE: # %bb.0: # %entry
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl c, %edx
; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx
; X86-SSE-NEXT: movd %ecx, %xmm0
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE-NEXT: psraw $8, %xmm0
; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u>
; X86-SSE-NEXT: movdqa %xmm0, %xmm2
; X86-SSE-NEXT: pmulhw %xmm1, %xmm2
; X86-SSE-NEXT: pmullw %xmm1, %xmm0
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: mul_2xi8_varconst5:
; X86-AVX: # %bb.0: # %entry
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl c, %edx
; X86-AVX-NEXT: vpmovsxbq (%ecx,%eax), %xmm0
; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: mul_2xi8_varconst5:
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx
; X64-SSE-NEXT: movd %ecx, %xmm0
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-SSE-NEXT: psraw $8, %xmm0
; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u>
; X64-SSE-NEXT: movdqa %xmm0, %xmm2
; X64-SSE-NEXT: pmulhw %xmm1, %xmm2
; X64-SSE-NEXT: pmullw %xmm1, %xmm0
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_2xi8_varconst5:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX-NEXT: vpmovsxbq (%rdi,%rsi), %xmm0
; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
%tmp7 = bitcast i8* %tmp6 to <2 x i8>*
%wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
%tmp8 = sext <2 x i8> %wide.load to <2 x i32>
%tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -129, i32 127>
%tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
%tmp15 = bitcast i32* %tmp14 to <2 x i32>*
store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
ret void
}
; %val = load <2 x i8>
; %op1 = sext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-128 ~ 128)
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_varconst6(i8* nocapture readonly %a, i64 %index) {
; X86-SSE-LABEL: mul_2xi8_varconst6:
; X86-SSE: # %bb.0: # %entry
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl c, %edx
; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx
; X86-SSE-NEXT: movd %ecx, %xmm0
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE-NEXT: psraw $8, %xmm0
; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u>
; X86-SSE-NEXT: movdqa %xmm0, %xmm2
; X86-SSE-NEXT: pmulhw %xmm1, %xmm2
; X86-SSE-NEXT: pmullw %xmm1, %xmm0
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: mul_2xi8_varconst6:
; X86-AVX: # %bb.0: # %entry
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl c, %edx
; X86-AVX-NEXT: vpmovsxbq (%ecx,%eax), %xmm0
; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: mul_2xi8_varconst6:
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx
; X64-SSE-NEXT: movd %ecx, %xmm0
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-SSE-NEXT: psraw $8, %xmm0
; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u>
; X64-SSE-NEXT: movdqa %xmm0, %xmm2
; X64-SSE-NEXT: pmulhw %xmm1, %xmm2
; X64-SSE-NEXT: pmullw %xmm1, %xmm0
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_2xi8_varconst6:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX-NEXT: vpmovsxbq (%rdi,%rsi), %xmm0
; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
%tmp7 = bitcast i8* %tmp6 to <2 x i8>*
%wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
%tmp8 = sext <2 x i8> %wide.load to <2 x i32>
%tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -128, i32 128>
%tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
%tmp15 = bitcast i32* %tmp14 to <2 x i32>*
store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
ret void
}
; %val = load <2 x i16>
; %op1 = zext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 65535)
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi16_varconst1(i8* nocapture readonly %a, i64 %index) {
; X86-SSE-LABEL: mul_2xi16_varconst1:
; X86-SSE: # %bb.0: # %entry
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl c, %edx
; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u>
; X86-SSE-NEXT: movdqa %xmm0, %xmm2
; X86-SSE-NEXT: pmulhuw %xmm1, %xmm2
; X86-SSE-NEXT: pmullw %xmm1, %xmm0
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: mul_2xi16_varconst1:
; X86-AVX: # %bb.0: # %entry
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl c, %edx
; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; X86-AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: mul_2xi16_varconst1:
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u>
; X64-SSE-NEXT: movdqa %xmm0, %xmm2
; X64-SSE-NEXT: pmulhuw %xmm1, %xmm2
; X64-SSE-NEXT: pmullw %xmm1, %xmm0
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_2xi16_varconst1:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; X64-AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; X64-AVX-NEXT: movl $65535, %ecx # imm = 0xFFFF
; X64-AVX-NEXT: vmovq %rcx, %xmm1
; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
; X64-AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
%tmp7 = bitcast i8* %tmp6 to <2 x i16>*
%wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
%tmp8 = zext <2 x i16> %wide.load to <2 x i32>
%tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 65535>
%tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
%tmp15 = bitcast i32* %tmp14 to <2 x i32>*
store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
ret void
}
; %val = load <2 x i16>
; %op1 = sext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-32768 ~ 32767)
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi16_varconst2(i8* nocapture readonly %a, i64 %index) {
; X86-SSE-LABEL: mul_2xi16_varconst2:
; X86-SSE: # %bb.0: # %entry
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl c, %edx
; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u>
; X86-SSE-NEXT: movdqa %xmm0, %xmm2
; X86-SSE-NEXT: pmulhw %xmm1, %xmm2
; X86-SSE-NEXT: pmullw %xmm1, %xmm0
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: mul_2xi16_varconst2:
; X86-AVX: # %bb.0: # %entry
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl c, %edx
; X86-AVX-NEXT: vpmovsxwq (%ecx,%eax), %xmm0
; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: mul_2xi16_varconst2:
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u>
; X64-SSE-NEXT: movdqa %xmm0, %xmm2
; X64-SSE-NEXT: pmulhw %xmm1, %xmm2
; X64-SSE-NEXT: pmullw %xmm1, %xmm0
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_2xi16_varconst2:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX-NEXT: vpmovsxwq (%rdi,%rsi), %xmm0
; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
%tmp7 = bitcast i8* %tmp6 to <2 x i16>*
%wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
%tmp8 = sext <2 x i16> %wide.load to <2 x i32>
%tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -32768, i32 32767>
%tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
%tmp15 = bitcast i32* %tmp14 to <2 x i32>*
store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
ret void
}
; %val = load <2 x i16>
; %op1 = zext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 65536)
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi16_varconst3(i8* nocapture readonly %a, i64 %index) {
; X86-SSE-LABEL: mul_2xi16_varconst3:
; X86-SSE: # %bb.0: # %entry
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl c, %edx
; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT: pxor %xmm1, %xmm1
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,0,65536,0]
; X86-SSE-NEXT: pmuludq %xmm2, %xmm0
; X86-SSE-NEXT: pmuludq %xmm2, %xmm1
; X86-SSE-NEXT: psllq $32, %xmm1
; X86-SSE-NEXT: paddq %xmm0, %xmm1
; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: mul_2xi16_varconst3:
; X86-AVX: # %bb.0: # %entry
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl c, %edx
; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; X86-AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: mul_2xi16_varconst3:
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-SSE-NEXT: pxor %xmm1, %xmm1
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; X64-SSE-NEXT: movl $65536, %ecx # imm = 0x10000
; X64-SSE-NEXT: movq %rcx, %xmm2
; X64-SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
; X64-SSE-NEXT: pmuludq %xmm2, %xmm0
; X64-SSE-NEXT: pmuludq %xmm1, %xmm2
; X64-SSE-NEXT: psllq $32, %xmm2
; X64-SSE-NEXT: paddq %xmm0, %xmm2
; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_2xi16_varconst3:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; X64-AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; X64-AVX-NEXT: movl $65536, %ecx # imm = 0x10000
; X64-AVX-NEXT: vmovq %rcx, %xmm1
; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
; X64-AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
%tmp7 = bitcast i8* %tmp6 to <2 x i16>*
%wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
%tmp8 = zext <2 x i16> %wide.load to <2 x i32>
%tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 65536>
%tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
%tmp15 = bitcast i32* %tmp14 to <2 x i32>*
store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
ret void
}
; %val = load <2 x i16>
; %op1 = sext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 32768)
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi16_varconst4(i8* nocapture readonly %a, i64 %index) {
; X86-SSE-LABEL: mul_2xi16_varconst4:
; X86-SSE: # %bb.0: # %entry
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl c, %edx
; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
; X86-SSE-NEXT: psrad $16, %xmm0
; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,0,32768,0]
; X86-SSE-NEXT: pmuludq %xmm1, %xmm0
; X86-SSE-NEXT: pxor %xmm2, %xmm2
; X86-SSE-NEXT: pmuludq %xmm1, %xmm2
; X86-SSE-NEXT: psllq $32, %xmm2
; X86-SSE-NEXT: paddq %xmm0, %xmm2
; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: mul_2xi16_varconst4:
; X86-AVX: # %bb.0: # %entry
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl c, %edx
; X86-AVX-NEXT: vpmovsxwq (%ecx,%eax), %xmm0
; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: mul_2xi16_varconst4:
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
; X64-SSE-NEXT: psrad $16, %xmm0
; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; X64-SSE-NEXT: movl $32768, %ecx # imm = 0x8000
; X64-SSE-NEXT: movq %rcx, %xmm1
; X64-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
; X64-SSE-NEXT: pmuludq %xmm1, %xmm0
; X64-SSE-NEXT: pxor %xmm2, %xmm2
; X64-SSE-NEXT: pmuludq %xmm1, %xmm2
; X64-SSE-NEXT: psllq $32, %xmm2
; X64-SSE-NEXT: paddq %xmm0, %xmm2
; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_2xi16_varconst4:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX-NEXT: vpmovsxwq (%rdi,%rsi), %xmm0
; X64-AVX-NEXT: movl $32768, %ecx # imm = 0x8000
; X64-AVX-NEXT: vmovq %rcx, %xmm1
; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
; X64-AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
%tmp7 = bitcast i8* %tmp6 to <2 x i16>*
%wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
%tmp8 = sext <2 x i16> %wide.load to <2 x i32>
%tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 32768>
%tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
%tmp15 = bitcast i32* %tmp14 to <2 x i32>*
store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
ret void
}
;
; Illegal Types
;
define void @PR34947() {
; X86-SSE-LABEL: PR34947:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movdqa (%eax), %xmm0
; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; X86-SSE-NEXT: movd %xmm1, %ecx
; X86-SSE-NEXT: xorl %eax, %eax
; X86-SSE-NEXT: xorl %edx, %edx
; X86-SSE-NEXT: divl %ecx
; X86-SSE-NEXT: movd %edx, %xmm1
; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; X86-SSE-NEXT: movd %xmm2, %ecx
; X86-SSE-NEXT: xorl %eax, %eax
; X86-SSE-NEXT: xorl %edx, %edx
; X86-SSE-NEXT: divl %ecx
; X86-SSE-NEXT: movd %edx, %xmm2
; X86-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X86-SSE-NEXT: movd %xmm0, %ecx
; X86-SSE-NEXT: xorl %eax, %eax
; X86-SSE-NEXT: xorl %edx, %edx
; X86-SSE-NEXT: divl %ecx
; X86-SSE-NEXT: movd %edx, %xmm1
; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X86-SSE-NEXT: movd %xmm0, %ecx
; X86-SSE-NEXT: xorl %eax, %eax
; X86-SSE-NEXT: xorl %edx, %edx
; X86-SSE-NEXT: divl %ecx
; X86-SSE-NEXT: movd %edx, %xmm0
; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; X86-SSE-NEXT: xorl %eax, %eax
; X86-SSE-NEXT: xorl %edx, %edx
; X86-SSE-NEXT: divl (%eax)
; X86-SSE-NEXT: movd %edx, %xmm0
; X86-SSE-NEXT: pmaddwd {{\.LCPI.*}}, %xmm1
; X86-SSE-NEXT: movl $8199, %eax # imm = 0x2007
; X86-SSE-NEXT: movd %eax, %xmm2
; X86-SSE-NEXT: pmuludq %xmm0, %xmm2
; X86-SSE-NEXT: movd %xmm2, (%eax)
; X86-SSE-NEXT: movdqa %xmm1, (%eax)
; X86-SSE-NEXT: retl
;
; X86-AVX1-LABEL: PR34947:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: pushl %ebp
; X86-AVX1-NEXT: .cfi_def_cfa_offset 8
; X86-AVX1-NEXT: pushl %ebx
; X86-AVX1-NEXT: .cfi_def_cfa_offset 12
; X86-AVX1-NEXT: pushl %edi
; X86-AVX1-NEXT: .cfi_def_cfa_offset 16
; X86-AVX1-NEXT: pushl %esi
; X86-AVX1-NEXT: .cfi_def_cfa_offset 20
; X86-AVX1-NEXT: subl $16, %esp
; X86-AVX1-NEXT: .cfi_def_cfa_offset 36
; X86-AVX1-NEXT: .cfi_offset %esi, -20
; X86-AVX1-NEXT: .cfi_offset %edi, -16
; X86-AVX1-NEXT: .cfi_offset %ebx, -12
; X86-AVX1-NEXT: .cfi_offset %ebp, -8
; X86-AVX1-NEXT: vmovdqa (%eax), %ymm0
; X86-AVX1-NEXT: xorl %eax, %eax
; X86-AVX1-NEXT: xorl %edx, %edx
; X86-AVX1-NEXT: divl (%eax)
; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
; X86-AVX1-NEXT: vpextrd $3, %xmm0, %ecx
; X86-AVX1-NEXT: xorl %eax, %eax
; X86-AVX1-NEXT: xorl %edx, %edx
; X86-AVX1-NEXT: divl %ecx
; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
; X86-AVX1-NEXT: vpextrd $2, %xmm0, %ecx
; X86-AVX1-NEXT: xorl %eax, %eax
; X86-AVX1-NEXT: xorl %edx, %edx
; X86-AVX1-NEXT: divl %ecx
; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
; X86-AVX1-NEXT: vpextrd $1, %xmm0, %ecx
; X86-AVX1-NEXT: xorl %eax, %eax
; X86-AVX1-NEXT: xorl %edx, %edx
; X86-AVX1-NEXT: divl %ecx
; X86-AVX1-NEXT: movl %edx, (%esp) # 4-byte Spill
; X86-AVX1-NEXT: vmovd %xmm0, %ecx
; X86-AVX1-NEXT: xorl %eax, %eax
; X86-AVX1-NEXT: xorl %edx, %edx
; X86-AVX1-NEXT: divl %ecx
; X86-AVX1-NEXT: movl %edx, %ebp
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; X86-AVX1-NEXT: xorl %eax, %eax
; X86-AVX1-NEXT: xorl %edx, %edx
; X86-AVX1-NEXT: vpextrd $3, %xmm0, %ecx
; X86-AVX1-NEXT: divl %ecx
; X86-AVX1-NEXT: movl %edx, %ecx
; X86-AVX1-NEXT: xorl %eax, %eax
; X86-AVX1-NEXT: xorl %edx, %edx
; X86-AVX1-NEXT: vpextrd $2, %xmm0, %esi
; X86-AVX1-NEXT: divl %esi
; X86-AVX1-NEXT: movl %edx, %esi
; X86-AVX1-NEXT: xorl %eax, %eax
; X86-AVX1-NEXT: xorl %edx, %edx
; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edi
; X86-AVX1-NEXT: divl %edi
; X86-AVX1-NEXT: movl %edx, %edi
; X86-AVX1-NEXT: xorl %eax, %eax
; X86-AVX1-NEXT: xorl %edx, %edx
; X86-AVX1-NEXT: vmovd %xmm0, %ebx
; X86-AVX1-NEXT: divl %ebx
; X86-AVX1-NEXT: vmovd %edx, %xmm0
; X86-AVX1-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0
; X86-AVX1-NEXT: vpinsrd $2, %esi, %xmm0, %xmm0
; X86-AVX1-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0
; X86-AVX1-NEXT: vmovd %ebp, %xmm1
; X86-AVX1-NEXT: vpinsrd $1, (%esp), %xmm1, %xmm1 # 4-byte Folded Reload
; X86-AVX1-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 # 4-byte Folded Reload
; X86-AVX1-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 # 4-byte Folded Reload
; X86-AVX1-NEXT: vmovd {{[0-9]+}}(%esp), %xmm2 # 4-byte Folded Reload
; X86-AVX1-NEXT: # xmm2 = mem[0],zero,zero,zero
; X86-AVX1-NEXT: movl $8199, %eax # imm = 0x2007
; X86-AVX1-NEXT: vmovd %eax, %xmm3
; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [8199,8199,8199,8199]
; X86-AVX1-NEXT: vpmaddwd %xmm4, %xmm0, %xmm0
; X86-AVX1-NEXT: vpmaddwd %xmm4, %xmm1, %xmm1
; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-AVX1-NEXT: vpmulld %xmm3, %xmm2, %xmm1
; X86-AVX1-NEXT: vmovd %xmm1, (%eax)
; X86-AVX1-NEXT: vmovaps %ymm0, (%eax)
; X86-AVX1-NEXT: addl $16, %esp
Correct dwarf unwind information in function epilogue This patch aims to provide correct dwarf unwind information in function epilogue for X86. It consists of two parts. The first part inserts CFI instructions that set appropriate cfa offset and cfa register in emitEpilogue() in X86FrameLowering. This part is X86 specific. The second part is platform independent and ensures that: * CFI instructions do not affect code generation (they are not counted as instructions when tail duplicating or tail merging) * Unwind information remains correct when a function is modified by different passes. This is done in a late pass by analyzing information about cfa offset and cfa register in BBs and inserting additional CFI directives where necessary. Added CFIInstrInserter pass: * analyzes each basic block to determine cfa offset and register are valid at its entry and exit * verifies that outgoing cfa offset and register of predecessor blocks match incoming values of their successors * inserts additional CFI directives at basic block beginning to correct the rule for calculating CFA Having CFI instructions in function epilogue can cause incorrect CFA calculation rule for some basic blocks. This can happen if, due to basic block reordering, or the existence of multiple epilogue blocks, some of the blocks have wrong cfa offset and register values set by the epilogue block above them. CFIInstrInserter is currently run only on X86, but can be used by any target that implements support for adding CFI instructions in epilogue. Patch by Violeta Vukobrat. Differential Revision: https://reviews.llvm.org/D42848 llvm-svn: 330706
2018-04-24 18:32:08 +08:00
; X86-AVX1-NEXT: .cfi_def_cfa_offset 20
; X86-AVX1-NEXT: popl %esi
Correct dwarf unwind information in function epilogue This patch aims to provide correct dwarf unwind information in function epilogue for X86. It consists of two parts. The first part inserts CFI instructions that set appropriate cfa offset and cfa register in emitEpilogue() in X86FrameLowering. This part is X86 specific. The second part is platform independent and ensures that: * CFI instructions do not affect code generation (they are not counted as instructions when tail duplicating or tail merging) * Unwind information remains correct when a function is modified by different passes. This is done in a late pass by analyzing information about cfa offset and cfa register in BBs and inserting additional CFI directives where necessary. Added CFIInstrInserter pass: * analyzes each basic block to determine cfa offset and register are valid at its entry and exit * verifies that outgoing cfa offset and register of predecessor blocks match incoming values of their successors * inserts additional CFI directives at basic block beginning to correct the rule for calculating CFA Having CFI instructions in function epilogue can cause incorrect CFA calculation rule for some basic blocks. This can happen if, due to basic block reordering, or the existence of multiple epilogue blocks, some of the blocks have wrong cfa offset and register values set by the epilogue block above them. CFIInstrInserter is currently run only on X86, but can be used by any target that implements support for adding CFI instructions in epilogue. Patch by Violeta Vukobrat. Differential Revision: https://reviews.llvm.org/D42848 llvm-svn: 330706
2018-04-24 18:32:08 +08:00
; X86-AVX1-NEXT: .cfi_def_cfa_offset 16
; X86-AVX1-NEXT: popl %edi
Correct dwarf unwind information in function epilogue This patch aims to provide correct dwarf unwind information in function epilogue for X86. It consists of two parts. The first part inserts CFI instructions that set appropriate cfa offset and cfa register in emitEpilogue() in X86FrameLowering. This part is X86 specific. The second part is platform independent and ensures that: * CFI instructions do not affect code generation (they are not counted as instructions when tail duplicating or tail merging) * Unwind information remains correct when a function is modified by different passes. This is done in a late pass by analyzing information about cfa offset and cfa register in BBs and inserting additional CFI directives where necessary. Added CFIInstrInserter pass: * analyzes each basic block to determine cfa offset and register are valid at its entry and exit * verifies that outgoing cfa offset and register of predecessor blocks match incoming values of their successors * inserts additional CFI directives at basic block beginning to correct the rule for calculating CFA Having CFI instructions in function epilogue can cause incorrect CFA calculation rule for some basic blocks. This can happen if, due to basic block reordering, or the existence of multiple epilogue blocks, some of the blocks have wrong cfa offset and register values set by the epilogue block above them. CFIInstrInserter is currently run only on X86, but can be used by any target that implements support for adding CFI instructions in epilogue. Patch by Violeta Vukobrat. Differential Revision: https://reviews.llvm.org/D42848 llvm-svn: 330706
2018-04-24 18:32:08 +08:00
; X86-AVX1-NEXT: .cfi_def_cfa_offset 12
; X86-AVX1-NEXT: popl %ebx
Correct dwarf unwind information in function epilogue This patch aims to provide correct dwarf unwind information in function epilogue for X86. It consists of two parts. The first part inserts CFI instructions that set appropriate cfa offset and cfa register in emitEpilogue() in X86FrameLowering. This part is X86 specific. The second part is platform independent and ensures that: * CFI instructions do not affect code generation (they are not counted as instructions when tail duplicating or tail merging) * Unwind information remains correct when a function is modified by different passes. This is done in a late pass by analyzing information about cfa offset and cfa register in BBs and inserting additional CFI directives where necessary. Added CFIInstrInserter pass: * analyzes each basic block to determine cfa offset and register are valid at its entry and exit * verifies that outgoing cfa offset and register of predecessor blocks match incoming values of their successors * inserts additional CFI directives at basic block beginning to correct the rule for calculating CFA Having CFI instructions in function epilogue can cause incorrect CFA calculation rule for some basic blocks. This can happen if, due to basic block reordering, or the existence of multiple epilogue blocks, some of the blocks have wrong cfa offset and register values set by the epilogue block above them. CFIInstrInserter is currently run only on X86, but can be used by any target that implements support for adding CFI instructions in epilogue. Patch by Violeta Vukobrat. Differential Revision: https://reviews.llvm.org/D42848 llvm-svn: 330706
2018-04-24 18:32:08 +08:00
; X86-AVX1-NEXT: .cfi_def_cfa_offset 8
; X86-AVX1-NEXT: popl %ebp
Correct dwarf unwind information in function epilogue This patch aims to provide correct dwarf unwind information in function epilogue for X86. It consists of two parts. The first part inserts CFI instructions that set appropriate cfa offset and cfa register in emitEpilogue() in X86FrameLowering. This part is X86 specific. The second part is platform independent and ensures that: * CFI instructions do not affect code generation (they are not counted as instructions when tail duplicating or tail merging) * Unwind information remains correct when a function is modified by different passes. This is done in a late pass by analyzing information about cfa offset and cfa register in BBs and inserting additional CFI directives where necessary. Added CFIInstrInserter pass: * analyzes each basic block to determine cfa offset and register are valid at its entry and exit * verifies that outgoing cfa offset and register of predecessor blocks match incoming values of their successors * inserts additional CFI directives at basic block beginning to correct the rule for calculating CFA Having CFI instructions in function epilogue can cause incorrect CFA calculation rule for some basic blocks. This can happen if, due to basic block reordering, or the existence of multiple epilogue blocks, some of the blocks have wrong cfa offset and register values set by the epilogue block above them. CFIInstrInserter is currently run only on X86, but can be used by any target that implements support for adding CFI instructions in epilogue. Patch by Violeta Vukobrat. Differential Revision: https://reviews.llvm.org/D42848 llvm-svn: 330706
2018-04-24 18:32:08 +08:00
; X86-AVX1-NEXT: .cfi_def_cfa_offset 4
; X86-AVX1-NEXT: vzeroupper
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: PR34947:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: pushl %esi
; X86-AVX2-NEXT: .cfi_def_cfa_offset 8
; X86-AVX2-NEXT: .cfi_offset %esi, -8
; X86-AVX2-NEXT: vmovdqa (%eax), %ymm0
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-AVX2-NEXT: vpextrd $1, %xmm1, %ecx
; X86-AVX2-NEXT: xorl %eax, %eax
; X86-AVX2-NEXT: xorl %edx, %edx
; X86-AVX2-NEXT: divl %ecx
; X86-AVX2-NEXT: movl %edx, %ecx
; X86-AVX2-NEXT: vmovd %xmm1, %esi
; X86-AVX2-NEXT: xorl %eax, %eax
; X86-AVX2-NEXT: xorl %edx, %edx
; X86-AVX2-NEXT: divl %esi
; X86-AVX2-NEXT: vmovd %edx, %xmm2
; X86-AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2
; X86-AVX2-NEXT: vpextrd $2, %xmm1, %ecx
; X86-AVX2-NEXT: xorl %eax, %eax
; X86-AVX2-NEXT: xorl %edx, %edx
; X86-AVX2-NEXT: divl %ecx
; X86-AVX2-NEXT: vpinsrd $2, %edx, %xmm2, %xmm2
; X86-AVX2-NEXT: vpextrd $3, %xmm1, %ecx
; X86-AVX2-NEXT: xorl %eax, %eax
; X86-AVX2-NEXT: xorl %edx, %edx
; X86-AVX2-NEXT: divl %ecx
; X86-AVX2-NEXT: vpinsrd $3, %edx, %xmm2, %xmm1
; X86-AVX2-NEXT: vpextrd $1, %xmm0, %ecx
; X86-AVX2-NEXT: xorl %eax, %eax
; X86-AVX2-NEXT: xorl %edx, %edx
; X86-AVX2-NEXT: divl %ecx
; X86-AVX2-NEXT: movl %edx, %ecx
; X86-AVX2-NEXT: vmovd %xmm0, %esi
; X86-AVX2-NEXT: xorl %eax, %eax
; X86-AVX2-NEXT: xorl %edx, %edx
; X86-AVX2-NEXT: divl %esi
; X86-AVX2-NEXT: vmovd %edx, %xmm2
; X86-AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2
; X86-AVX2-NEXT: vpextrd $2, %xmm0, %ecx
; X86-AVX2-NEXT: xorl %eax, %eax
; X86-AVX2-NEXT: xorl %edx, %edx
; X86-AVX2-NEXT: divl %ecx
; X86-AVX2-NEXT: vpinsrd $2, %edx, %xmm2, %xmm2
; X86-AVX2-NEXT: vpextrd $3, %xmm0, %ecx
; X86-AVX2-NEXT: xorl %eax, %eax
; X86-AVX2-NEXT: xorl %edx, %edx
; X86-AVX2-NEXT: divl %ecx
; X86-AVX2-NEXT: vpinsrd $3, %edx, %xmm2, %xmm0
; X86-AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; X86-AVX2-NEXT: xorl %eax, %eax
; X86-AVX2-NEXT: xorl %edx, %edx
; X86-AVX2-NEXT: divl (%eax)
; X86-AVX2-NEXT: vmovd %edx, %xmm1
; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [8199,8199,8199,8199,8199,8199,8199,8199]
; X86-AVX2-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
; X86-AVX2-NEXT: movl $8199, %eax # imm = 0x2007
; X86-AVX2-NEXT: vmovd %eax, %xmm2
; X86-AVX2-NEXT: vpmulld %ymm2, %ymm1, %ymm1
; X86-AVX2-NEXT: vmovd %xmm1, (%eax)
; X86-AVX2-NEXT: vmovdqa %ymm0, (%eax)
; X86-AVX2-NEXT: popl %esi
Correct dwarf unwind information in function epilogue This patch aims to provide correct dwarf unwind information in function epilogue for X86. It consists of two parts. The first part inserts CFI instructions that set appropriate cfa offset and cfa register in emitEpilogue() in X86FrameLowering. This part is X86 specific. The second part is platform independent and ensures that: * CFI instructions do not affect code generation (they are not counted as instructions when tail duplicating or tail merging) * Unwind information remains correct when a function is modified by different passes. This is done in a late pass by analyzing information about cfa offset and cfa register in BBs and inserting additional CFI directives where necessary. Added CFIInstrInserter pass: * analyzes each basic block to determine cfa offset and register are valid at its entry and exit * verifies that outgoing cfa offset and register of predecessor blocks match incoming values of their successors * inserts additional CFI directives at basic block beginning to correct the rule for calculating CFA Having CFI instructions in function epilogue can cause incorrect CFA calculation rule for some basic blocks. This can happen if, due to basic block reordering, or the existence of multiple epilogue blocks, some of the blocks have wrong cfa offset and register values set by the epilogue block above them. CFIInstrInserter is currently run only on X86, but can be used by any target that implements support for adding CFI instructions in epilogue. Patch by Violeta Vukobrat. Differential Revision: https://reviews.llvm.org/D42848 llvm-svn: 330706
2018-04-24 18:32:08 +08:00
; X86-AVX2-NEXT: .cfi_def_cfa_offset 4
; X86-AVX2-NEXT: vzeroupper
; X86-AVX2-NEXT: retl
;
; X64-SSE-LABEL: PR34947:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movdqa (%rax), %xmm0
; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; X64-SSE-NEXT: movd %xmm1, %ecx
; X64-SSE-NEXT: xorl %eax, %eax
; X64-SSE-NEXT: xorl %edx, %edx
; X64-SSE-NEXT: divl %ecx
; X64-SSE-NEXT: movd %edx, %xmm1
; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; X64-SSE-NEXT: movd %xmm2, %ecx
; X64-SSE-NEXT: xorl %eax, %eax
; X64-SSE-NEXT: xorl %edx, %edx
; X64-SSE-NEXT: divl %ecx
; X64-SSE-NEXT: movd %edx, %xmm2
; X64-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X64-SSE-NEXT: movd %xmm0, %ecx
; X64-SSE-NEXT: xorl %eax, %eax
; X64-SSE-NEXT: xorl %edx, %edx
; X64-SSE-NEXT: divl %ecx
; X64-SSE-NEXT: movd %edx, %xmm1
; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X64-SSE-NEXT: movd %xmm0, %ecx
; X64-SSE-NEXT: xorl %eax, %eax
; X64-SSE-NEXT: xorl %edx, %edx
; X64-SSE-NEXT: divl %ecx
; X64-SSE-NEXT: movd %edx, %xmm0
; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; X64-SSE-NEXT: xorl %eax, %eax
; X64-SSE-NEXT: xorl %edx, %edx
; X64-SSE-NEXT: divl (%rax)
; X64-SSE-NEXT: movd %edx, %xmm0
; X64-SSE-NEXT: pmaddwd {{.*}}(%rip), %xmm1
; X64-SSE-NEXT: movl $8199, %eax # imm = 0x2007
; X64-SSE-NEXT: movd %eax, %xmm2
; X64-SSE-NEXT: pmuludq %xmm0, %xmm2
; X64-SSE-NEXT: movd %xmm2, (%rax)
; X64-SSE-NEXT: movdqa %xmm1, (%rax)
; X64-SSE-NEXT: retq
;
; X64-AVX1-LABEL: PR34947:
; X64-AVX1: # %bb.0:
; X64-AVX1-NEXT: pushq %rbp
; X64-AVX1-NEXT: .cfi_def_cfa_offset 16
; X64-AVX1-NEXT: pushq %rbx
; X64-AVX1-NEXT: .cfi_def_cfa_offset 24
; X64-AVX1-NEXT: .cfi_offset %rbx, -24
; X64-AVX1-NEXT: .cfi_offset %rbp, -16
; X64-AVX1-NEXT: vmovdqa (%rax), %ymm0
; X64-AVX1-NEXT: xorl %eax, %eax
; X64-AVX1-NEXT: xorl %edx, %edx
; X64-AVX1-NEXT: divl (%rax)
; X64-AVX1-NEXT: movl %edx, %r8d
; X64-AVX1-NEXT: vpextrd $3, %xmm0, %ecx
; X64-AVX1-NEXT: xorl %eax, %eax
; X64-AVX1-NEXT: xorl %edx, %edx
; X64-AVX1-NEXT: divl %ecx
; X64-AVX1-NEXT: movl %edx, %r9d
; X64-AVX1-NEXT: vpextrd $2, %xmm0, %ecx
; X64-AVX1-NEXT: xorl %eax, %eax
; X64-AVX1-NEXT: xorl %edx, %edx
; X64-AVX1-NEXT: divl %ecx
; X64-AVX1-NEXT: movl %edx, %r10d
; X64-AVX1-NEXT: vpextrd $1, %xmm0, %ecx
; X64-AVX1-NEXT: xorl %eax, %eax
; X64-AVX1-NEXT: xorl %edx, %edx
; X64-AVX1-NEXT: divl %ecx
; X64-AVX1-NEXT: movl %edx, %r11d
; X64-AVX1-NEXT: vmovd %xmm0, %ecx
; X64-AVX1-NEXT: xorl %eax, %eax
; X64-AVX1-NEXT: xorl %edx, %edx
; X64-AVX1-NEXT: divl %ecx
; X64-AVX1-NEXT: movl %edx, %esi
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; X64-AVX1-NEXT: vpextrd $3, %xmm0, %ecx
; X64-AVX1-NEXT: xorl %eax, %eax
; X64-AVX1-NEXT: xorl %edx, %edx
; X64-AVX1-NEXT: divl %ecx
; X64-AVX1-NEXT: movl %edx, %edi
; X64-AVX1-NEXT: vpextrd $2, %xmm0, %ecx
; X64-AVX1-NEXT: xorl %eax, %eax
; X64-AVX1-NEXT: xorl %edx, %edx
; X64-AVX1-NEXT: divl %ecx
; X64-AVX1-NEXT: movl %edx, %ecx
; X64-AVX1-NEXT: vpextrd $1, %xmm0, %ebx
; X64-AVX1-NEXT: xorl %eax, %eax
; X64-AVX1-NEXT: xorl %edx, %edx
; X64-AVX1-NEXT: divl %ebx
; X64-AVX1-NEXT: movl %edx, %ebx
; X64-AVX1-NEXT: vmovd %xmm0, %ebp
; X64-AVX1-NEXT: xorl %eax, %eax
; X64-AVX1-NEXT: xorl %edx, %edx
; X64-AVX1-NEXT: divl %ebp
; X64-AVX1-NEXT: vmovd %edx, %xmm0
; X64-AVX1-NEXT: vpinsrd $1, %ebx, %xmm0, %xmm0
; X64-AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
; X64-AVX1-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0
; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8199,8199,8199,8199]
; X64-AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vmovd %esi, %xmm2
; X64-AVX1-NEXT: vpinsrd $1, %r11d, %xmm2, %xmm2
; X64-AVX1-NEXT: vpinsrd $2, %r10d, %xmm2, %xmm2
; X64-AVX1-NEXT: vpinsrd $3, %r9d, %xmm2, %xmm2
; X64-AVX1-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1
; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-AVX1-NEXT: vmovd %r8d, %xmm1
; X64-AVX1-NEXT: movl $8199, %eax # imm = 0x2007
; X64-AVX1-NEXT: vmovd %eax, %xmm2
; X64-AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1
; X64-AVX1-NEXT: vmovd %xmm1, (%rax)
; X64-AVX1-NEXT: vmovaps %ymm0, (%rax)
; X64-AVX1-NEXT: popq %rbx
Correct dwarf unwind information in function epilogue This patch aims to provide correct dwarf unwind information in function epilogue for X86. It consists of two parts. The first part inserts CFI instructions that set appropriate cfa offset and cfa register in emitEpilogue() in X86FrameLowering. This part is X86 specific. The second part is platform independent and ensures that: * CFI instructions do not affect code generation (they are not counted as instructions when tail duplicating or tail merging) * Unwind information remains correct when a function is modified by different passes. This is done in a late pass by analyzing information about cfa offset and cfa register in BBs and inserting additional CFI directives where necessary. Added CFIInstrInserter pass: * analyzes each basic block to determine cfa offset and register are valid at its entry and exit * verifies that outgoing cfa offset and register of predecessor blocks match incoming values of their successors * inserts additional CFI directives at basic block beginning to correct the rule for calculating CFA Having CFI instructions in function epilogue can cause incorrect CFA calculation rule for some basic blocks. This can happen if, due to basic block reordering, or the existence of multiple epilogue blocks, some of the blocks have wrong cfa offset and register values set by the epilogue block above them. CFIInstrInserter is currently run only on X86, but can be used by any target that implements support for adding CFI instructions in epilogue. Patch by Violeta Vukobrat. Differential Revision: https://reviews.llvm.org/D42848 llvm-svn: 330706
2018-04-24 18:32:08 +08:00
; X64-AVX1-NEXT: .cfi_def_cfa_offset 16
; X64-AVX1-NEXT: popq %rbp
Correct dwarf unwind information in function epilogue This patch aims to provide correct dwarf unwind information in function epilogue for X86. It consists of two parts. The first part inserts CFI instructions that set appropriate cfa offset and cfa register in emitEpilogue() in X86FrameLowering. This part is X86 specific. The second part is platform independent and ensures that: * CFI instructions do not affect code generation (they are not counted as instructions when tail duplicating or tail merging) * Unwind information remains correct when a function is modified by different passes. This is done in a late pass by analyzing information about cfa offset and cfa register in BBs and inserting additional CFI directives where necessary. Added CFIInstrInserter pass: * analyzes each basic block to determine cfa offset and register are valid at its entry and exit * verifies that outgoing cfa offset and register of predecessor blocks match incoming values of their successors * inserts additional CFI directives at basic block beginning to correct the rule for calculating CFA Having CFI instructions in function epilogue can cause incorrect CFA calculation rule for some basic blocks. This can happen if, due to basic block reordering, or the existence of multiple epilogue blocks, some of the blocks have wrong cfa offset and register values set by the epilogue block above them. CFIInstrInserter is currently run only on X86, but can be used by any target that implements support for adding CFI instructions in epilogue. Patch by Violeta Vukobrat. Differential Revision: https://reviews.llvm.org/D42848 llvm-svn: 330706
2018-04-24 18:32:08 +08:00
; X64-AVX1-NEXT: .cfi_def_cfa_offset 8
; X64-AVX1-NEXT: vzeroupper
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: PR34947:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vmovdqa (%rax), %ymm0
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT: vpextrd $1, %xmm1, %ecx
; X64-AVX2-NEXT: xorl %eax, %eax
; X64-AVX2-NEXT: xorl %edx, %edx
; X64-AVX2-NEXT: divl %ecx
; X64-AVX2-NEXT: movl %edx, %ecx
; X64-AVX2-NEXT: vmovd %xmm1, %esi
; X64-AVX2-NEXT: xorl %eax, %eax
; X64-AVX2-NEXT: xorl %edx, %edx
; X64-AVX2-NEXT: divl %esi
; X64-AVX2-NEXT: vmovd %edx, %xmm2
; X64-AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2
; X64-AVX2-NEXT: vpextrd $2, %xmm1, %ecx
; X64-AVX2-NEXT: xorl %eax, %eax
; X64-AVX2-NEXT: xorl %edx, %edx
; X64-AVX2-NEXT: divl %ecx
; X64-AVX2-NEXT: vpinsrd $2, %edx, %xmm2, %xmm2
; X64-AVX2-NEXT: vpextrd $3, %xmm1, %ecx
; X64-AVX2-NEXT: xorl %eax, %eax
; X64-AVX2-NEXT: xorl %edx, %edx
; X64-AVX2-NEXT: divl %ecx
; X64-AVX2-NEXT: vpinsrd $3, %edx, %xmm2, %xmm1
; X64-AVX2-NEXT: vpextrd $1, %xmm0, %ecx
; X64-AVX2-NEXT: xorl %eax, %eax
; X64-AVX2-NEXT: xorl %edx, %edx
; X64-AVX2-NEXT: divl %ecx
; X64-AVX2-NEXT: movl %edx, %ecx
; X64-AVX2-NEXT: vmovd %xmm0, %esi
; X64-AVX2-NEXT: xorl %eax, %eax
; X64-AVX2-NEXT: xorl %edx, %edx
; X64-AVX2-NEXT: divl %esi
; X64-AVX2-NEXT: vmovd %edx, %xmm2
; X64-AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2
; X64-AVX2-NEXT: vpextrd $2, %xmm0, %ecx
; X64-AVX2-NEXT: xorl %eax, %eax
; X64-AVX2-NEXT: xorl %edx, %edx
; X64-AVX2-NEXT: divl %ecx
; X64-AVX2-NEXT: vpinsrd $2, %edx, %xmm2, %xmm2
; X64-AVX2-NEXT: vpextrd $3, %xmm0, %ecx
; X64-AVX2-NEXT: xorl %eax, %eax
; X64-AVX2-NEXT: xorl %edx, %edx
; X64-AVX2-NEXT: divl %ecx
; X64-AVX2-NEXT: vpinsrd $3, %edx, %xmm2, %xmm0
; X64-AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; X64-AVX2-NEXT: xorl %eax, %eax
; X64-AVX2-NEXT: xorl %edx, %edx
; X64-AVX2-NEXT: divl (%rax)
; X64-AVX2-NEXT: vmovd %edx, %xmm1
; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [8199,8199,8199,8199,8199,8199,8199,8199]
; X64-AVX2-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
; X64-AVX2-NEXT: movl $8199, %eax # imm = 0x2007
; X64-AVX2-NEXT: vmovd %eax, %xmm2
; X64-AVX2-NEXT: vpmulld %ymm2, %ymm1, %ymm1
; X64-AVX2-NEXT: vmovd %xmm1, (%rax)
; X64-AVX2-NEXT: vmovdqa %ymm0, (%rax)
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
%tmp = load <9 x i32>, <9 x i32>* undef, align 64
%rem = urem <9 x i32> zeroinitializer, %tmp
%mul = mul <9 x i32> <i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199>, %rem
store <9 x i32> %mul, <9 x i32>* undef, align 64
ret void
}