[X86][MS] Fix the alignment mismatch of vector variable arguments on Win32

The alignment of vector variable arguments on the callee side is 4, which
matches MSVC. But the caller aligns them to the size of the vector
argument. This results in runtime failures. This patch fixes the problem by
reducing the caller-side alignment to 4 bytes for variable arguments on Win32.

Fixed vector arguments are passed by pointer on Win32. So they don't have
the problem.

I couldn't find documentation on MSDN for this calling convention, so I ran
several experiments here: https://godbolt.org/z/n1zn1Gx1z

Reviewed By: rnk

Differential Revision: https://reviews.llvm.org/D108887
This commit is contained in:
Wang, Pengfei 2021-09-08 08:22:46 +08:00
parent 68b9d8ed7a
commit 9d7d34c769
3 changed files with 61 additions and 1 deletions

View File

@ -23,6 +23,13 @@ class CCIfNotSubtarget<string F, CCAction A>
"(State.getMachineFunction().getSubtarget()).", F),
A>;
/// CCIfIsVarArgOnWin - Apply action A only when the call being analyzed is
/// variadic and the subtarget's target triple reports a Windows MSVC
/// environment. The predicate itself does not test for a 32-bit target.
class CCIfIsVarArgOnWin<CCAction A>
  : CCIf<"State.isVarArg() && State.getMachineFunction().getSubtarget()."
         "getTargetTriple().isWindowsMSVCEnvironment()",
         A>;
// Register classes for RegCall
class RC_X86_RegCall {
list<Register> GPR_8 = [];
@ -771,6 +778,22 @@ def CC_X86_32_Vector_Common : CallingConv<[
CCAssignToStack<64, 64>>
]>;
/// CC_X86_Win32_Vector - Stack assignment rules for vector values under the
/// X86 Win32 calling conventions. Every vector keeps a stack slot of its full
/// size, but the slot is only 4-byte aligned.
def CC_X86_Win32_Vector : CallingConv<[
  // 128-bit SSE vectors: 16-byte stack slot, 4-byte alignment.
  CCIfType<
    [v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64],
    CCAssignToStack<16, 4>>,

  // 256-bit AVX vectors: 32-byte stack slot, 4-byte alignment.
  CCIfType<
    [v32i8, v16i16, v8i32, v4i64, v16f16, v8f32, v4f64],
    CCAssignToStack<32, 4>>,

  // 512-bit AVX-512 vectors: 64-byte stack slot, 4-byte alignment.
  CCIfType<
    [v64i8, v32i16, v16i32, v8i64, v32f16, v16f32, v8f64],
    CCAssignToStack<64, 4>>
]>;
// CC_X86_32_Vector_Standard - The first 3 vector arguments are passed in
// vector registers
def CC_X86_32_Vector_Standard : CallingConv<[
@ -787,6 +810,7 @@ def CC_X86_32_Vector_Standard : CallingConv<[
CCIfNotVarArg<CCIfType<[v64i8, v32i16, v16i32, v8i64, v32f16, v16f32, v8f64],
CCAssignToReg<[ZMM0, ZMM1, ZMM2]>>>,
CCIfIsVarArgOnWin<CCDelegateTo<CC_X86_Win32_Vector>>,
CCDelegateTo<CC_X86_32_Vector_Common>
]>;

View File

@ -0,0 +1,36 @@
; RUN: llc -mcpu=generic -mtriple=i686-pc-windows-msvc -mattr=+sse < %s | FileCheck %s --check-prefix=MSVC
; RUN: llc -mcpu=generic -mtriple=i686-pc-mingw32 -mattr=+sse < %s | FileCheck %s --check-prefix=MINGW
;
; Passes a <4 x float> through the variadic part of a call to @testm128 and
; checks the caller-side stack layout produced for each triple:
;   * windows-msvc run: 20 bytes of outgoing stack, the vector stored with an
;     unaligned move at offset 4, directly after the i32 argument.
;   * mingw32 run: the stack is realigned to 16 bytes and the vector stored
;     with an aligned move at offset 16.
@a = external dso_local global <4 x float>, align 16
define dso_local void @testPastArguments() nounwind {
; MSVC-LABEL: testPastArguments:
; MSVC: # %bb.0: # %entry
; MSVC-NEXT: subl $20, %esp
; MSVC-NEXT: movaps _a, %xmm0
; MSVC-NEXT: movups %xmm0, 4(%esp)
; MSVC-NEXT: movl $1, (%esp)
; MSVC-NEXT: calll _testm128
; MSVC-NEXT: addl $20, %esp
; MSVC-NEXT: retl
;
; MINGW-LABEL: testPastArguments:
; MINGW: # %bb.0: # %entry
; MINGW-NEXT: pushl %ebp
; MINGW-NEXT: movl %esp, %ebp
; MINGW-NEXT: andl $-16, %esp
; MINGW-NEXT: subl $48, %esp
; MINGW-NEXT: movaps _a, %xmm0
; MINGW-NEXT: movaps %xmm0, 16(%esp)
; MINGW-NEXT: movl $1, (%esp)
; MINGW-NEXT: calll _testm128
; MINGW-NEXT: movl %ebp, %esp
; MINGW-NEXT: popl %ebp
; MINGW-NEXT: retl
entry:
; Load the 16-byte-aligned global vector and pass it as a variadic argument.
%0 = load <4 x float>, <4 x float>* @a, align 16
%call = tail call i32 (i32, ...) @testm128(i32 1, <4 x float> inreg %0)
ret void
}
; Variadic callee: one fixed i32 parameter followed by variable arguments.
declare i32 @testm128(i32, ...) nounwind

View File

@ -20,7 +20,7 @@ declare void @bar(<16 x float> %a, i32 %b)
; Check that proper alignment of spilled vector does not affect vargs
; CHECK-LABEL: vargs_not_affected
; CHECK: movl 28(%ebp), %eax
; CHECK: movl 28(%esp), %eax
define i32 @vargs_not_affected(<4 x float> %v, i8* %f, ...) {
entry:
%ap = alloca i8*, align 4