2017-07-12 02:04:56 +08:00
|
|
|
// z_Linux_asm.S: - microtasking routines specifically
|
2013-09-27 18:38:44 +08:00
|
|
|
// written for Intel platforms running Linux* OS
|
|
|
|
|
|
|
|
//
|
|
|
|
////===----------------------------------------------------------------------===//
|
|
|
|
////
|
2019-01-19 18:56:40 +08:00
|
|
|
//// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
|
|
//// See https://llvm.org/LICENSE.txt for license information.
|
|
|
|
//// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
2013-09-27 18:38:44 +08:00
|
|
|
////
|
|
|
|
////===----------------------------------------------------------------------===//
|
|
|
|
//
|
|
|
|
|
|
|
|
// -----------------------------------------------------------------------
|
|
|
|
// macros
|
|
|
|
// -----------------------------------------------------------------------
|
|
|
|
|
2015-08-29 02:42:10 +08:00
|
|
|
#include "kmp_config.h"
|
2015-05-30 00:13:56 +08:00
|
|
|
|
2013-09-27 18:38:44 +08:00
|
|
|
#if KMP_ARCH_X86 || KMP_ARCH_X86_64

// -----------------------------------------------------------------------
// pause_op: spin-wait hint used by the routines below.
// -----------------------------------------------------------------------
# if KMP_MIC
// the 'delay r16/r32/r64' should be used instead of the 'pause'.
// The delay operation has the effect of removing the current thread from
// the round-robin HT mechanism, and therefore speeds up the issue rate of
// the other threads on the same core.
//
// A value of 0 works fine for <= 2 threads per core, but causes the EPCC
// barrier time to increase greatly for 3 or more threads per core.
//
// A value of 100 works pretty well for up to 4 threads per core, but isn't
// quite as fast as 0 for 2 threads per core.
//
// We need to check what happens for oversubscription / > 4 threads per core.
// It is possible that we need to pass the delay value in as a parameter
// that the caller determines based on the total # threads / # cores.
//
//.macro pause_op
//      mov    $100, %rax
//      delay  %rax
//.endm
//
// NOTE(review): the macro above is commented out, so pause_op is left
// undefined in KMP_MIC builds.
# else
# define pause_op   .byte 0xf3,0x90
# endif // KMP_MIC

// -----------------------------------------------------------------------
// Symbol naming, alignment, procedure framing and CFI helpers.
//
//   KMP_PREFIX_UNDERSCORE(x)  exported symbol name for x
//   KMP_LABEL(x)              name of a local label
//   ALIGN n                   alignment request (argument is an exponent in
//                             the Linux variant: .align 1<<(n))
//   PROC name                 align, export and open the routine "name"
//   DEBUG_INFO name           close the routine "name"
//   KMP_CFI_*                 thin wrappers over the .cfi_* directives;
//                             defined with empty bodies on OS X*
// -----------------------------------------------------------------------
# if KMP_OS_DARWIN
# define KMP_PREFIX_UNDERSCORE(x) _##x  // extra underscore for OS X* symbols
# define KMP_LABEL(x) L_##x             // form the name of label

.macro KMP_CFI_DEF_OFFSET
.endmacro

.macro KMP_CFI_OFFSET
.endmacro

.macro KMP_CFI_REGISTER
.endmacro

.macro KMP_CFI_DEF
.endmacro

.macro ALIGN
        .align $0
.endmacro

.macro DEBUG_INFO
/* Not sure what .size does in icc, not sure if we need to do something
   similar for OS X*.
*/
.endmacro

.macro PROC
        ALIGN  4
        .globl KMP_PREFIX_UNDERSCORE($0)
KMP_PREFIX_UNDERSCORE($0):
.endmacro

# else // KMP_OS_DARWIN
# define KMP_PREFIX_UNDERSCORE(x) x //no extra underscore for Linux* OS symbols
// Format labels so that they don't override function names in gdb's backtraces
// MIC assembler doesn't accept .L syntax, the L works fine there (as well as
// on OS X*)
# if KMP_MIC
# define KMP_LABEL(x) L_##x          // local label
# else
# define KMP_LABEL(x) .L_##x         // local label hidden from backtraces
# endif // KMP_MIC

.macro ALIGN size
        .align 1<<(\size)
.endm

.macro DEBUG_INFO proc
        .cfi_endproc
// Not sure why we need .type and .size for the functions
        .align 16
        .type  \proc,@function
        .size  \proc,.-\proc
.endm

.macro PROC proc
        ALIGN  4
        .globl KMP_PREFIX_UNDERSCORE(\proc)
KMP_PREFIX_UNDERSCORE(\proc):
        .cfi_startproc
.endm

.macro KMP_CFI_DEF_OFFSET sz
        .cfi_def_cfa_offset     \sz
.endm

.macro KMP_CFI_OFFSET reg, sz
        .cfi_offset     \reg,\sz
.endm

.macro KMP_CFI_REGISTER reg
        .cfi_def_cfa_register   \reg
.endm

.macro KMP_CFI_DEF reg, sz
        .cfi_def_cfa    \reg,\sz
.endm

# endif // KMP_OS_DARWIN
#endif // KMP_ARCH_X86 || KMP_ARCH_X86_64
|
2013-09-27 18:38:44 +08:00
|
|
|
|
2017-04-17 19:58:20 +08:00
|
|
|
#if (KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64

// -----------------------------------------------------------------------
// AArch64 flavour of the helper macros:
//   KMP_PREFIX_UNDERSCORE(x)  exported symbol name for x
//   KMP_LABEL(x)              name of a local label
//   ALIGN n                   alignment request
//   PROC name                 align, export and open the routine "name"
//   DEBUG_INFO name           close the routine "name"
// -----------------------------------------------------------------------
# if KMP_OS_DARWIN
# define KMP_PREFIX_UNDERSCORE(x) _##x  // extra underscore for OS X* symbols
# define KMP_LABEL(x) L_##x             // form the name of label

.macro ALIGN
        .align $0
.endmacro

.macro DEBUG_INFO
/* Not sure what .size does in icc, not sure if we need to do something
   similar for OS X*.
*/
.endmacro

.macro PROC
        ALIGN  4
        .globl KMP_PREFIX_UNDERSCORE($0)
KMP_PREFIX_UNDERSCORE($0):
.endmacro

# else // KMP_OS_DARWIN
# define KMP_PREFIX_UNDERSCORE(x) x  // no extra underscore for Linux* OS symbols
// Format labels so that they don't override function names in gdb's backtraces
# define KMP_LABEL(x) .L_##x         // local label hidden from backtraces

.macro ALIGN size
        .align 1<<(\size)
.endm

.macro DEBUG_INFO proc
        .cfi_endproc
// Not sure why we need .type and .size for the functions
        ALIGN 2
        .type  \proc,@function
        .size  \proc,.-\proc
.endm

.macro PROC proc
        ALIGN 2
        .globl KMP_PREFIX_UNDERSCORE(\proc)
KMP_PREFIX_UNDERSCORE(\proc):
        .cfi_startproc
.endm

# endif // KMP_OS_DARWIN
#endif // (KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64
|
2013-09-27 18:38:44 +08:00
|
|
|
|
|
|
|
// -----------------------------------------------------------------------
|
|
|
|
// data
|
|
|
|
// -----------------------------------------------------------------------
|
|
|
|
|
|
|
|
#ifdef KMP_GOMP_COMPAT

// Support for unnamed common blocks.
//
// Because the symbol ".gomp_critical_user_" contains a ".", we have to
// put this stuff in assembly.
//
// Each variant below reserves a 32-byte common block named
// .gomp_critical_user_ and defines the pointer-sized global
// __kmp_unnamed_critical_addr holding that block's address.

# if KMP_ARCH_X86
#  if KMP_OS_DARWIN
        .data
        .comm .gomp_critical_user_,32
        .data
        .globl ___kmp_unnamed_critical_addr
___kmp_unnamed_critical_addr:
        .long .gomp_critical_user_
#  else /* Linux* OS */
        .data
        .comm .gomp_critical_user_,32,8
        .data
        ALIGN 4
        .global __kmp_unnamed_critical_addr
__kmp_unnamed_critical_addr:
        .4byte .gomp_critical_user_
        .type __kmp_unnamed_critical_addr,@object
        .size __kmp_unnamed_critical_addr,4
#  endif /* KMP_OS_DARWIN */
# endif /* KMP_ARCH_X86 */

# if KMP_ARCH_X86_64
#  if KMP_OS_DARWIN
        .data
        .comm .gomp_critical_user_,32
        .data
        .globl ___kmp_unnamed_critical_addr
___kmp_unnamed_critical_addr:
        .quad .gomp_critical_user_
#  else /* Linux* OS */
        .data
        .comm .gomp_critical_user_,32,8
        .data
// NOTE(review): the Linux ALIGN macro expands to .align 1<<(size), so this
// requests 1<<8; confirm that is intended rather than 8-byte alignment.
        ALIGN 8
        .global __kmp_unnamed_critical_addr
__kmp_unnamed_critical_addr:
        .8byte .gomp_critical_user_
        .type __kmp_unnamed_critical_addr,@object
        .size __kmp_unnamed_critical_addr,8
#  endif /* KMP_OS_DARWIN */
# endif /* KMP_ARCH_X86_64 */

#endif /* KMP_GOMP_COMPAT */
|
|
|
|
|
|
|
|
|
2014-08-07 18:12:54 +08:00
|
|
|
#if KMP_ARCH_X86 && !KMP_ARCH_PPC64
|
2013-09-27 18:38:44 +08:00
|
|
|
|
|
|
|
// -----------------------------------------------------------------------
|
|
|
|
// microtasking routines specifically written for IA-32 architecture
|
|
|
|
// running Linux* OS
|
|
|
|
// -----------------------------------------------------------------------
|
|
|
|
|
|
|
|
.ident "Intel Corporation"
|
|
|
|
.data
|
|
|
|
ALIGN 4
|
|
|
|
// void
// __kmp_x86_pause( void );
//
// Issues the spin-wait hint selected by pause_op and returns.
// Takes no arguments and produces no return value.
//
// The .text directive here also switches the IA-32 routines that follow
// out of the .data section selected above.

        .text
        PROC  __kmp_x86_pause

        pause_op
        ret

        DEBUG_INFO __kmp_x86_pause
|
|
|
|
|
|
|
|
# if !KMP_ASM_INTRINS
|
|
|
|
|
|
|
|
//------------------------------------------------------------------------
// FUNCTION __kmp_test_then_add32
//
// kmp_int32
// __kmp_test_then_add32( volatile kmp_int32 *p, kmp_int32 d );
//
// parameters (at entry):
//      p:      4(%esp)
//      d:      8(%esp)
//
// return:      %eax    (contents of *p before the addition)
        PROC  __kmp_test_then_add32

        movl      8(%esp), %eax         // "d"
        movl      4(%esp), %ecx         // "p"

        lock
        xaddl     %eax, (%ecx)          // *p += d, %eax = previous *p
        ret

        DEBUG_INFO __kmp_test_then_add32
|
|
|
|
|
|
|
|
//------------------------------------------------------------------------
// FUNCTION __kmp_xchg_fixed8
//
// kmp_int8
// __kmp_xchg_fixed8( volatile kmp_int8 *p, kmp_int8 d );
//
// Atomically stores d into *p and hands back the byte it replaced.
//
// parameters (at entry):
//      p:      4(%esp)
//      d:      8(%esp)
//
// return:      %al
        PROC  __kmp_xchg_fixed8

        movb      8(%esp), %al          // "d"
        movl      4(%esp), %ecx         // "p"

        lock
        xchgb     %al, (%ecx)           // swap; %al = previous *p
        ret

        DEBUG_INFO __kmp_xchg_fixed8
|
|
|
|
|
|
|
|
|
|
|
|
//------------------------------------------------------------------------
// FUNCTION __kmp_xchg_fixed16
//
// kmp_int16
// __kmp_xchg_fixed16( volatile kmp_int16 *p, kmp_int16 d );
//
// Atomically stores d into *p and hands back the word it replaced.
//
// parameters (at entry):
//      p:      4(%esp)
//      d:      8(%esp)
//
// return:      %ax
        PROC  __kmp_xchg_fixed16

        movw      8(%esp), %ax          // "d"
        movl      4(%esp), %ecx         // "p"

        lock
        xchgw     %ax, (%ecx)           // swap; %ax = previous *p
        ret

        DEBUG_INFO __kmp_xchg_fixed16
|
|
|
|
|
|
|
|
|
|
|
|
//------------------------------------------------------------------------
// FUNCTION __kmp_xchg_fixed32
//
// kmp_int32
// __kmp_xchg_fixed32( volatile kmp_int32 *p, kmp_int32 d );
//
// Atomically stores d into *p and hands back the value it replaced.
//
// parameters (at entry):
//      p:      4(%esp)
//      d:      8(%esp)
//
// return:      %eax
        PROC  __kmp_xchg_fixed32

        movl      8(%esp), %eax         // "d"
        movl      4(%esp), %ecx         // "p"

        lock
        xchgl     %eax, (%ecx)          // swap; %eax = previous *p
        ret

        DEBUG_INFO __kmp_xchg_fixed32
|
|
|
|
|
|
|
|
|
|
|
|
// kmp_int8
// __kmp_compare_and_store8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
//
// If *p == cv, atomically stores sv into *p.
//
// parameters (at entry):
//      p:      4(%esp)
//      cv:     8(%esp)
//      sv:     12(%esp)
//
// return:      %eax = 1 if the store happened, 0 otherwise
        PROC  __kmp_compare_and_store8

        movb      12(%esp), %dl         // "sv"
        movb      8(%esp), %al          // "cv"
        movl      4(%esp), %ecx         // "p"

        lock
        cmpxchgb  %dl, (%ecx)
        sete      %al           // if %al == (%ecx) set %al = 1 else set %al = 0
        movzbl    %al, %eax     // zero-extend the 0/1 result to 32 bits
        ret

        DEBUG_INFO __kmp_compare_and_store8
|
|
|
|
|
|
|
|
// kmp_int16
// __kmp_compare_and_store16(volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv);
//
// If *p == cv, atomically stores sv into *p.
//
// parameters (at entry):
//      p:      4(%esp)
//      cv:     8(%esp)
//      sv:     12(%esp)
//
// return:      %eax = 1 if the store happened, 0 otherwise
        PROC  __kmp_compare_and_store16

        movw      12(%esp), %dx         // "sv"
        movw      8(%esp), %ax          // "cv"
        movl      4(%esp), %ecx         // "p"

        lock
        cmpxchgw  %dx, (%ecx)
        sete      %al           // if %ax == (%ecx) set %al = 1 else set %al = 0
        movzbl    %al, %eax     // zero-extend the 0/1 result to 32 bits
        ret

        DEBUG_INFO __kmp_compare_and_store16
|
|
|
|
|
|
|
|
// kmp_int32
// __kmp_compare_and_store32(volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv);
//
// If *p == cv, atomically stores sv into *p.
//
// parameters (at entry):
//      p:      4(%esp)
//      cv:     8(%esp)
//      sv:     12(%esp)
//
// return:      %eax = 1 if the store happened, 0 otherwise
        PROC  __kmp_compare_and_store32

        movl      12(%esp), %edx        // "sv"
        movl      8(%esp), %eax         // "cv"
        movl      4(%esp), %ecx         // "p"

        lock
        cmpxchgl  %edx, (%ecx)
        sete      %al           // if %eax == (%ecx) set %al = 1 else set %al = 0
        movzbl    %al, %eax     // zero-extend the 0/1 result to 32 bits
        ret

        DEBUG_INFO __kmp_compare_and_store32
|
|
|
|
|
|
|
|
// kmp_int32
// __kmp_compare_and_store64(volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 s );
//
// If *p == cv, atomically stores the 64-bit value sv into *p.
//
// parameters (relative to %ebp once the frame is set up):
//      p:      8(%ebp)
//      cv:     12(%ebp) low word, 16(%ebp) high word
//      sv:     20(%ebp) low word, 24(%ebp) high word
//
// return:      %eax = 1 if the store happened, 0 otherwise
//
// %ebx and %edi are saved on entry and restored before returning.
        PROC  __kmp_compare_and_store64

        pushl     %ebp
        movl      %esp, %ebp
        pushl     %ebx
        pushl     %edi

        movl      8(%ebp), %edi         // "p"
        movl      12(%ebp), %eax        // "cv" low order word
        movl      16(%ebp), %edx        // "cv" high order word
        movl      20(%ebp), %ebx        // "sv" low order word
        movl      24(%ebp), %ecx        // "sv" high order word

        lock
        cmpxchg8b (%edi)
        sete      %al      // if %edx:eax == (%edi) set %al = 1 else set %al = 0
        movzbl    %al, %eax             // zero-extend the 0/1 result

        popl      %edi
        popl      %ebx
        leave                           // movl %ebp,%esp ; popl %ebp
        ret

        DEBUG_INFO __kmp_compare_and_store64
|
|
|
|
|
|
|
|
// kmp_int8
// __kmp_compare_and_store_ret8(volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv);
//
// If *p == cv, atomically stores sv into *p.
//
// parameters (at entry):
//      p:      4(%esp)
//      cv:     8(%esp)
//      sv:     12(%esp)
//
// return:      %al as left by cmpxchgb (the value found in *p)
        PROC  __kmp_compare_and_store_ret8

        movb      12(%esp), %dl         // "sv"
        movb      8(%esp), %al          // "cv"
        movl      4(%esp), %ecx         // "p"

        lock
        cmpxchgb  %dl, (%ecx)
        ret

        DEBUG_INFO __kmp_compare_and_store_ret8
|
|
|
|
|
|
|
|
// kmp_int16
// __kmp_compare_and_store_ret16(volatile kmp_int16 *p, kmp_int16 cv,
//                               kmp_int16 sv);
//
// If *p == cv, atomically stores sv into *p.
//
// parameters (at entry):
//      p:      4(%esp)
//      cv:     8(%esp)
//      sv:     12(%esp)
//
// return:      %ax as left by cmpxchgw (the value found in *p)
        PROC  __kmp_compare_and_store_ret16

        movw      12(%esp), %dx         // "sv"
        movw      8(%esp), %ax          // "cv"
        movl      4(%esp), %ecx         // "p"

        lock
        cmpxchgw  %dx, (%ecx)
        ret

        DEBUG_INFO __kmp_compare_and_store_ret16
|
|
|
|
|
|
|
|
// kmp_int32
// __kmp_compare_and_store_ret32(volatile kmp_int32 *p, kmp_int32 cv,
//                               kmp_int32 sv);
//
// If *p == cv, atomically stores sv into *p.
//
// parameters (at entry):
//      p:      4(%esp)
//      cv:     8(%esp)
//      sv:     12(%esp)
//
// return:      %eax as left by cmpxchgl (the value found in *p)
        PROC  __kmp_compare_and_store_ret32

        movl      12(%esp), %edx        // "sv"
        movl      8(%esp), %eax         // "cv"
        movl      4(%esp), %ecx         // "p"

        lock
        cmpxchgl  %edx, (%ecx)
        ret

        DEBUG_INFO __kmp_compare_and_store_ret32
|
|
|
|
|
|
|
|
// kmp_int64
// __kmp_compare_and_store_ret64(volatile kmp_int64 *p, kmp_int64 cv,
//                               kmp_int64 sv);
//
// If *p == cv, atomically stores the 64-bit value sv into *p.
//
// parameters (relative to %ebp once the frame is set up):
//      p:      8(%ebp)
//      cv:     12(%ebp) low word, 16(%ebp) high word
//      sv:     20(%ebp) low word, 24(%ebp) high word
//
// return:      %edx:%eax as left by cmpxchg8b (the value found in *p)
//
// %ebx and %edi are saved on entry and restored before returning.
        PROC  __kmp_compare_and_store_ret64

        pushl     %ebp
        movl      %esp, %ebp
        pushl     %ebx
        pushl     %edi

        movl      8(%ebp), %edi         // "p"
        movl      12(%ebp), %eax        // "cv" low order word
        movl      16(%ebp), %edx        // "cv" high order word
        movl      20(%ebp), %ebx        // "sv" low order word
        movl      24(%ebp), %ecx        // "sv" high order word

        lock
        cmpxchg8b (%edi)

        popl      %edi
        popl      %ebx
        leave                           // movl %ebp,%esp ; popl %ebp
        ret

        DEBUG_INFO __kmp_compare_and_store_ret64
|
|
|
|
|
|
|
|
|
|
|
|
//------------------------------------------------------------------------
// FUNCTION __kmp_xchg_real32
//
// kmp_real32
// __kmp_xchg_real32( volatile kmp_real32 *addr, kmp_real32 data );
//
// Atomically stores data into *addr and returns the value it replaced.
//
// parameters:
//      addr:   4(%esp) at entry  ==  8(%ebp) after the prologue
//      data:   8(%esp) at entry  == 12(%ebp) after the prologue
//
// return:      st(0) holds the previous *addr; %eax holds its raw bits
//
// Once "pushl %ebp / movl %esp,%ebp" has run, 4(%ebp) is the return
// address, so the arguments must be read from 8(%ebp) and 12(%ebp).
// The returned value is the one produced by the locked exchange itself,
// and exactly one value is left on the x87 stack.
        PROC  __kmp_xchg_real32

        pushl   %ebp
        movl    %esp, %ebp
        subl    $4, %esp                // -4(%ebp): scratch slot for the result
        pushl   %esi                    // %esi is restored before returning

        movl    8(%ebp), %esi           // "addr"
        movl    12(%ebp), %eax          // "data" (raw 32-bit pattern)

        lock
        xchgl   %eax, (%esi)            // %eax = previous *addr

        movl    %eax, -4(%ebp)          // spill so the x87 unit can load it
        flds    -4(%ebp)                // return old value in st(0)

        popl    %esi
        movl    %ebp, %esp
        popl    %ebp
        ret

        DEBUG_INFO __kmp_xchg_real32
|
|
|
|
|
|
|
|
# endif /* !KMP_ASM_INTRINS */
|
|
|
|
|
|
|
|
//------------------------------------------------------------------------
// int
// __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...),
//                         int gtid, int tid,
//                         int argc, void *p_argv[]
// #if OMPT_SUPPORT
//                         ,
//                         void **exit_frame_ptr
// #endif
//                       ) {
// #if OMPT_SUPPORT
//   *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
// #endif
//
//   (*pkfn)( & gtid, & tid, argv[0], ... );
//   return 1;
// }
//
// Argument slots once the frame pointer is established:
//      pkfn:            8(%ebp)
//      gtid:           12(%ebp)
//      tid:            16(%ebp)
//      argc:           20(%ebp)
//      p_argv:         24(%ebp)
//      exit_frame_ptr: 28(%ebp)        (OMPT_SUPPORT only)
//
// Frame-local slots:
//      argv:           -4(%ebp)
//      temp:           -8(%ebp)        (allocated, not referenced below)
//      saved %ebx:     -12(%ebp)

// -- Begin __kmp_invoke_microtask
// mark_begin;
        PROC  __kmp_invoke_microtask

        pushl   %ebp
        KMP_CFI_DEF_OFFSET 8
        KMP_CFI_OFFSET ebp,-8
        movl    %esp,%ebp               // establish the base pointer for this routine.
        KMP_CFI_REGISTER ebp
        subl    $8,%esp                 // allocate space for two local variables.
                                        // These variables are:
                                        //      argv: -4(%ebp)
                                        //      temp: -8(%ebp)
        pushl   %ebx                    // save %ebx to use during this routine

#if OMPT_SUPPORT
        movl    28(%ebp),%ebx           // get exit_frame address
        movl    %ebp,(%ebx)             // save exit_frame
#endif

        // Pre-adjust %esp so that, after (argc+2) words have been pushed,
        // it sits on a 128-byte boundary at the call.
        movl    20(%ebp),%ebx           // argc
        addl    $2,%ebx                 // argc + 2: gtid and tid are always passed
        shll    $2,%ebx                 // bytes to be pushed: (argc+2)*4
        movl    %esp,%eax
        subl    %ebx,%eax               // where %esp would end up without adjustment
        movl    %eax,%ebx               // keep a copy
        andl    $0xFFFFFF80,%eax        // mask off 7 bits
        subl    %eax,%ebx               // distance down to the 128-byte boundary
        subl    %ebx,%esp               // apply it now

        movl    24(%ebp),%eax           // copy from p_argv[]
        movl    %eax,-4(%ebp)           // into the local variable *argv.

        movl    20(%ebp),%ebx           // argc is 20(%ebp)
        shll    $2,%ebx                 // byte offset just past the last argument

        // Push p_argv[argc-1] down to p_argv[0].
KMP_LABEL(invoke_2):
        testl   %ebx,%ebx
        jg      KMP_LABEL(invoke_4)     // arguments remain
        jmp     KMP_LABEL(invoke_3)     // all pushed
        ALIGN 2
KMP_LABEL(invoke_4):
        movl    -4(%ebp),%eax
        subl    $4,%ebx                 // step back one argument
        addl    %ebx,%eax               // index into argv.
        movl    (%eax),%edx
        pushl   %edx

        jmp     KMP_LABEL(invoke_2)
        ALIGN 2
KMP_LABEL(invoke_3):
        leal    16(%ebp),%eax           // push & tid
        pushl   %eax

        leal    12(%ebp),%eax           // push & gtid
        pushl   %eax

        movl    8(%ebp),%ebx
        call    *%ebx                   // call (*pkfn)();

        movl    $1,%eax                 // return 1;

        movl    -12(%ebp),%ebx          // restore %ebx
        leave
        KMP_CFI_DEF esp,4
        ret

        DEBUG_INFO __kmp_invoke_microtask
// -- End  __kmp_invoke_microtask
|
|
|
|
|
|
|
|
|
|
|
|
// kmp_uint64
// __kmp_hardware_timestamp(void)
//
// Reads the time-stamp counter with rdtsc.
//
// return:      %edx:%eax (high:low halves as written by rdtsc)
        PROC  __kmp_hardware_timestamp

        rdtsc
        ret

        DEBUG_INFO __kmp_hardware_timestamp
// -- End  __kmp_hardware_timestamp
|
|
|
|
|
|
|
|
#endif /* KMP_ARCH_X86 */
|
|
|
|
|
|
|
|
|
|
|
|
#if KMP_ARCH_X86_64
|
|
|
|
|
|
|
|
// -----------------------------------------------------------------------
|
|
|
|
// microtasking routines specifically written for IA-32 architecture and
|
|
|
|
// Intel(R) 64 running Linux* OS
|
|
|
|
// -----------------------------------------------------------------------
|
|
|
|
|
|
|
|
// -- Machine type P
|
|
|
|
// mark_description "Intel Corporation";
|
|
|
|
.ident "Intel Corporation"
|
2017-07-12 02:04:56 +08:00
|
|
|
// -- .file "z_Linux_asm.S"
|
2013-09-27 18:38:44 +08:00
|
|
|
.data
|
|
|
|
ALIGN 4
|
|
|
|
|
2017-05-13 02:01:32 +08:00
|
|
|
// To keep our code out of the .data section, a .text directive is added to
|
|
|
|
// every routine definition for x86_64.
|
2013-09-27 18:38:44 +08:00
|
|
|
//------------------------------------------------------------------------
|
|
|
|
# if !KMP_ASM_INTRINS
|
|
|
|
|
|
|
|
//------------------------------------------------------------------------
// FUNCTION __kmp_test_then_add32
//
// kmp_int32
// __kmp_test_then_add32( volatile kmp_int32 *p, kmp_int32 d );
//
// Atomically adds d to *p.
//
// parameters:
//      p:      %rdi
//      d:      %esi
//
// return:      %eax    (contents of *p before the addition)
        .text
        PROC  __kmp_test_then_add32

        movl      %esi, %eax            // "d"
        lock
        xaddl     %eax, (%rdi)          // *p += d, %eax = previous *p
        ret

        DEBUG_INFO __kmp_test_then_add32
|
|
|
|
|
|
|
|
|
|
|
|
//------------------------------------------------------------------------
// FUNCTION __kmp_test_then_add64
//
// kmp_int64
// __kmp_test_then_add64( volatile kmp_int64 *p, kmp_int64 d );
//
// Atomically adds d to *p.
//
// parameters:
//      p:      %rdi
//      d:      %rsi
//
// return:      %rax    (contents of *p before the addition)
        .text
        PROC  __kmp_test_then_add64

        movq      %rsi, %rax            // "d"
        lock
        xaddq     %rax, (%rdi)          // *p += d, %rax = previous *p
        ret

        DEBUG_INFO __kmp_test_then_add64
|
|
|
|
|
|
|
|
|
|
|
|
//------------------------------------------------------------------------
// FUNCTION __kmp_xchg_fixed8
//
// kmp_int8
// __kmp_xchg_fixed8( volatile kmp_int8 *p, kmp_int8 d );
//
// Atomically stores d into *p and hands back the byte it replaced.
//
// parameters:
//      p:      %rdi
//      d:      %sil
//
// return:      %al
        .text
        PROC  __kmp_xchg_fixed8

        movb      %sil, %al             // "d"

        lock
        xchgb     %al, (%rdi)           // swap; %al = previous *p
        ret

        DEBUG_INFO __kmp_xchg_fixed8
|
|
|
|
|
|
|
|
|
|
|
|
//------------------------------------------------------------------------
// FUNCTION __kmp_xchg_fixed16
//
// kmp_int16
// __kmp_xchg_fixed16( volatile kmp_int16 *p, kmp_int16 d );
//
// Atomically stores d into *p and hands back the word it replaced.
//
// parameters:
//      p:      %rdi
//      d:      %si
//
// return:      %ax
        .text
        PROC  __kmp_xchg_fixed16

        movw      %si, %ax              // "d"

        lock
        xchgw     %ax, (%rdi)           // swap; %ax = previous *p
        ret

        DEBUG_INFO __kmp_xchg_fixed16
|
|
|
|
|
|
|
|
|
|
|
|
//------------------------------------------------------------------------
// FUNCTION __kmp_xchg_fixed32
//
// kmp_int32
// __kmp_xchg_fixed32( volatile kmp_int32 *p, kmp_int32 d );
//
// Atomically stores d into *p and hands back the value it replaced.
//
// parameters:
//      p:      %rdi
//      d:      %esi
//
// return:      %eax
        .text
        PROC  __kmp_xchg_fixed32

        movl      %esi, %eax            // "d"

        lock
        xchgl     %eax, (%rdi)          // swap; %eax = previous *p
        ret

        DEBUG_INFO __kmp_xchg_fixed32
|
|
|
|
|
|
|
|
|
|
|
|
//------------------------------------------------------------------------
// FUNCTION __kmp_xchg_fixed64
//
// kmp_int64
// __kmp_xchg_fixed64( volatile kmp_int64 *p, kmp_int64 d );
//
// Atomically stores d into *p and hands back the value it replaced.
//
// parameters:
//      p:      %rdi
//      d:      %rsi
//
// return:      %rax
        .text
        PROC  __kmp_xchg_fixed64

        movq      %rsi, %rax            // "d"

        lock
        xchgq     %rax, (%rdi)          // swap; %rax = previous *p
        ret

        DEBUG_INFO __kmp_xchg_fixed64
|
|
|
|
|
|
|
|
|
|
|
|
//------------------------------------------------------------------------
// FUNCTION __kmp_compare_and_store8
//
// kmp_int8
// __kmp_compare_and_store8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
//
// If *p == cv, atomically stores sv into *p.
//
// parameters:
//      p:      %rdi
//      cv:     %sil
//      sv:     %dl
//
// return:      %eax = 1 if the store happened, 0 otherwise
        .text
        PROC  __kmp_compare_and_store8

        movb      %sil, %al             // "cv"
        lock
        cmpxchgb  %dl, (%rdi)
        sete      %al           // if %al == (%rdi) set %al = 1 else set %al = 0
        movzbl    %al, %eax     // zero-extend the 0/1 result for the return value
        ret

        DEBUG_INFO __kmp_compare_and_store8
|
|
|
|
|
|
|
|
|
|
|
|
//------------------------------------------------------------------------
// FUNCTION __kmp_compare_and_store16
//
// kmp_int16
// __kmp_compare_and_store16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
//
// If *p == cv, atomically stores sv into *p.
//
// parameters:
//      p:      %rdi
//      cv:     %si
//      sv:     %dx
//
// return:      %eax = 1 if the store happened, 0 otherwise
        .text
        PROC  __kmp_compare_and_store16

        movw      %si, %ax              // "cv"
        lock
        cmpxchgw  %dx, (%rdi)
        sete      %al           // if %ax == (%rdi) set %al = 1 else set %al = 0
        movzbl    %al, %eax     // zero-extend the 0/1 result for the return value
        ret

        DEBUG_INFO __kmp_compare_and_store16
|
|
|
|
|
|
|
|
|
|
|
|
//------------------------------------------------------------------------
// FUNCTION __kmp_compare_and_store32
//
// kmp_int32
// __kmp_compare_and_store32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
//
// If *p == cv, atomically stores sv into *p.
//
// parameters:
//      p:      %rdi
//      cv:     %esi
//      sv:     %edx
//
// return:      %eax = 1 if the store happened, 0 otherwise
        .text
        PROC  __kmp_compare_and_store32

        movl      %esi, %eax            // "cv"
        lock
        cmpxchgl  %edx, (%rdi)
        sete      %al           // if %eax == (%rdi) set %al = 1 else set %al = 0
        movzbl    %al, %eax     // zero-extend the 0/1 result for the return value
        ret

        DEBUG_INFO __kmp_compare_and_store32
|
|
|
|
|
|
|
|
|
|
|
|
//------------------------------------------------------------------------
// FUNCTION __kmp_compare_and_store64
//
// kmp_int32
// __kmp_compare_and_store64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
//
// If *p == cv, atomically stores sv into *p.
//
// parameters:
//      p:      %rdi
//      cv:     %rsi
//      sv:     %rdx
//
// return:      %eax = 1 if the store happened, 0 otherwise
        .text
        PROC  __kmp_compare_and_store64

        movq      %rsi, %rax            // "cv"
        lock
        cmpxchgq  %rdx, (%rdi)
        sete      %al           // if %rax == (%rdi) set %al = 1 else set %al = 0
        movzbl    %al, %eax     // zero-extend the 0/1 result for the return value
        ret

        DEBUG_INFO __kmp_compare_and_store64
|
|
|
|
|
|
|
|
//------------------------------------------------------------------------
// FUNCTION __kmp_compare_and_store_ret8
//
// kmp_int8
// __kmp_compare_and_store_ret8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
//
// If *p == cv, atomically stores sv into *p.
//
// parameters:
//      p:      %rdi
//      cv:     %sil
//      sv:     %dl
//
// return:      %al as left by cmpxchgb (the value found in *p)
        .text
        PROC  __kmp_compare_and_store_ret8

        movb      %sil, %al             // "cv"
        lock
        cmpxchgb  %dl, (%rdi)
        ret

        DEBUG_INFO __kmp_compare_and_store_ret8
|
|
|
|
|
|
|
|
|
|
|
|
//------------------------------------------------------------------------
// FUNCTION __kmp_compare_and_store_ret16
//
// kmp_int16
// __kmp_compare_and_store_ret16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
//
// If *p == cv, atomically stores sv into *p.
//
// parameters:
//      p:      %rdi
//      cv:     %si
//      sv:     %dx
//
// return:      %ax as left by cmpxchgw (the value found in *p)
        .text
        PROC  __kmp_compare_and_store_ret16

        movw      %si, %ax              // "cv"
        lock
        cmpxchgw  %dx, (%rdi)
        ret

        DEBUG_INFO __kmp_compare_and_store_ret16
|
|
|
|
|
|
|
|
|
|
|
|
//------------------------------------------------------------------------
// FUNCTION __kmp_compare_and_store_ret32
//
// kmp_int32
// __kmp_compare_and_store_ret32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
//
// If *p == cv, atomically stores sv into *p.
//
// parameters:
//      p:      %rdi
//      cv:     %esi
//      sv:     %edx
//
// return:      %eax as left by cmpxchgl (the value found in *p)
        .text
        PROC  __kmp_compare_and_store_ret32

        movl      %esi, %eax            // "cv"
        lock
        cmpxchgl  %edx, (%rdi)
        ret

        DEBUG_INFO __kmp_compare_and_store_ret32
|
|
|
|
|
|
|
|
|
|
|
|
//------------------------------------------------------------------------
// FUNCTION __kmp_compare_and_store_ret64
//
// kmp_int64
// __kmp_compare_and_store_ret64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
//
// If *p == cv, atomically stores sv into *p.
//
// parameters:
//      p:      %rdi
//      cv:     %rsi
//      sv:     %rdx
//
// return:      %rax as left by cmpxchgq (the value found in *p)
        .text
        PROC  __kmp_compare_and_store_ret64

        movq      %rsi, %rax            // "cv"
        lock
        cmpxchgq  %rdx, (%rdi)
        ret

        DEBUG_INFO __kmp_compare_and_store_ret64
|
|
|
|
|
|
|
|
# endif /* !KMP_ASM_INTRINS */
|
|
|
|
|
|
|
|
|
2015-08-21 03:46:14 +08:00
|
|
|
# if !KMP_MIC
|
2013-09-27 18:38:44 +08:00
|
|
|
|
|
|
|
# if !KMP_ASM_INTRINS
|
|
|
|
|
|
|
|
//------------------------------------------------------------------------
// FUNCTION __kmp_xchg_real32
//
// kmp_real32
// __kmp_xchg_real32( volatile kmp_real32 *addr, kmp_real32 data );
//
// Atomically stores data into *addr and returns the value it replaced.
// The exchange is done on the raw 32-bit pattern in %eax.
//
// parameters:
//      addr:   %rdi
//      data:   %xmm0 (lower 4 bytes)
//
// return:      %xmm0 (lower 4 bytes)
        .text
        PROC  __kmp_xchg_real32

        movd      %xmm0, %eax           // raw bits of "data"

        lock
        xchgl     %eax, (%rdi)          // swap; %eax = previous *addr

        movd      %eax, %xmm0           // previous value into the return register
        ret

        DEBUG_INFO __kmp_xchg_real32
|
|
|
|
|
|
|
|
|
|
|
|
//------------------------------------------------------------------------
// FUNCTION __kmp_xchg_real64
//
// kmp_real64
// __kmp_xchg_real64( volatile kmp_real64 *addr, kmp_real64 data );
//
// Atomically stores data into *addr and returns the value it replaced.
// The exchange is done on the raw 64-bit pattern in %rax.
//
// parameters:
//      addr:   %rdi
//      data:   %xmm0 (lower 8 bytes)
//
// return:      %xmm0 (lower 8 bytes)
        .text
        PROC  __kmp_xchg_real64

        movd      %xmm0, %rax           // raw bits of "data"

        lock
        xchgq     %rax, (%rdi)          // swap; %rax = previous *addr

        movd      %rax, %xmm0           // previous value into the return register
        ret

        DEBUG_INFO __kmp_xchg_real64
|
|
|
|
|
|
|
|
|
2015-08-21 03:46:14 +08:00
|
|
|
# endif /* !KMP_ASM_INTRINS */
|
2013-09-27 18:38:44 +08:00
|
|
|
|
|
|
|
# endif /* !KMP_MIC */
|
|
|
|
|
|
|
|
//------------------------------------------------------------------------
|
|
|
|
// int
|
I apologise in advance for the size of this check-in. At Intel we do
understand that this is not friendly, and are working to change our
internal code-development to make it easier to make development
features available more frequently and in finer (more functional)
chunks. Unfortunately we haven't got that in place yet, and unpicking
this into multiple separate check-ins would be non-trivial, so please
bear with me on this one. We should be better in the future.
Apologies over, what do we have here?
GCC 4.9 compatibility
--------------------
* We have implemented the new entrypoints used by code compiled by GCC
4.9 to implement the same functionality in gcc 4.8. Therefore code
compiled with gcc 4.9 that used to work will continue to do so.
However, there are some other new entrypoints (associated with task
cancellation) which are not implemented. Therefore user code compiled
by gcc 4.9 that uses these new features will not link against the LLVM
runtime. (It remains unclear how to handle those entrypoints, since
the GCC interface has potentially unpleasant performance implications
for join barriers even when cancellation is not used)
--- new parallel entry points ---
new entry points that aren't OpenMP 4.0 related
These are implemented fully :-
GOMP_parallel_loop_dynamic()
GOMP_parallel_loop_guided()
GOMP_parallel_loop_runtime()
GOMP_parallel_loop_static()
GOMP_parallel_sections()
GOMP_parallel()
--- cancellation entry points ---
Currently, these only give a runtime error if OMP_CANCELLATION is true
because our plain barriers don't check for cancellation while waiting
GOMP_barrier_cancel()
GOMP_cancel()
GOMP_cancellation_point()
GOMP_loop_end_cancel()
GOMP_sections_end_cancel()
--- taskgroup entry points ---
These are implemented fully.
GOMP_taskgroup_start()
GOMP_taskgroup_end()
--- target entry points ---
These are empty (as they are in libgomp)
GOMP_target()
GOMP_target_data()
GOMP_target_end_data()
GOMP_target_update()
GOMP_teams()
Improvements in Barriers and Fork/Join
--------------------------------------
* Barrier and fork/join code is now in its own file (which makes it
easier to understand and modify).
* Wait/release code is now templated and in its own file; suspend/resume code is also templated
* There's a new, hierarchical, barrier, which exploits the
cache-hierarchy of the Intel(r) Xeon Phi(tm) coprocessor to improve
fork/join and barrier performance.
***BEWARE*** the new source files have *not* been added to the legacy
CMake build system. If you want to use that, fixes will be required.
Statistics Collection Code
--------------------------
* New code has been added to collect application statistics (if this
is enabled at library compile time; by default it is not). The
statistics code itself is generally useful, the lightweight timing
code uses the X86 rdtsc instruction, so will require changes for other
architectures.
The intent of this code is not for users to tune their codes but
rather
1) For timing code-paths inside the runtime
2) For gathering general properties of OpenMP codes to focus attention
on which OpenMP features are most used.
Nested Hot Teams
----------------
* The runtime now maintains more state to reduce the overhead of
creating and destroying inner parallel teams. This improves the
performance of code that repeatedly uses nested parallelism with the
same resource allocation. Set the new KMP_HOT_TEAMS_MAX_LEVEL
envirable to a depth to enable this (and, of course, OMP_NESTED=true
to enable nested parallelism at all).
Improved Intel(r) VTune(Tm) Amplifier support
---------------------------------------------
* The runtime provides additional information to Vtune via the
itt_notify interface to allow it to display better OpenMP specific
analyses of load-imbalance.
Support for OpenMP Composite Statements
---------------------------------------
* Implement new entrypoints required by some of the OpenMP 4.1
composite statements.
Improved ifdefs
---------------
* More separation of concepts ("Does this platform do X?") from
platforms ("Are we compiling for platform Y?"), which should simplify
future porting.
ScaleMP* contribution
---------------------
Stack padding to improve the performance in their environment where
cross-node coherency is managed at the page level.
Redesign of wait and release code
---------------------------------
The code is simplified and performance improved.
Bug Fixes
---------
*Fixes for Windows multiple processor groups.
*Fix Fortran module build on Linux: offload attribute added.
*Fix entry names for distribute-parallel-loop construct to be consistent with the compiler codegen.
*Fix an inconsistent error message for KMP_PLACE_THREADS environment variable.
llvm-svn: 219214
2014-10-08 00:25:50 +08:00
|
|
|
// __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...),
|
2019-07-23 02:46:02 +08:00
|
|
|
// int gtid, int tid,
|
|
|
|
// int argc, void *p_argv[]
|
|
|
|
// #if OMPT_SUPPORT
|
|
|
|
// ,
|
|
|
|
// void **exit_frame_ptr
|
|
|
|
// #endif
|
|
|
|
// ) {
|
|
|
|
// #if OMPT_SUPPORT
|
|
|
|
// *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
|
|
|
|
// #endif
|
|
|
|
//
|
|
|
|
// (*pkfn)( & gtid, & tid, argv[0], ... );
|
|
|
|
// return 1;
|
2013-09-27 18:38:44 +08:00
|
|
|
// }
|
|
|
|
//
|
2017-05-13 02:01:32 +08:00
|
|
|
// note: at call to pkfn must have %rsp 128-byte aligned for compiler
|
2013-09-27 18:38:44 +08:00
|
|
|
//
|
|
|
|
// parameters:
|
|
|
|
// %rdi: pkfn
|
|
|
|
// %esi: gtid
|
|
|
|
// %edx: tid
|
|
|
|
// %ecx: argc
|
|
|
|
// %r8: p_argv
|
2015-04-30 00:42:24 +08:00
|
|
|
// %r9: &exit_frame
|
2013-09-27 18:38:44 +08:00
|
|
|
//
|
|
|
|
// locals:
|
|
|
|
// __gtid: gtid parm pushed on stack so can pass &gtid to pkfn
|
|
|
|
// __tid: tid parm pushed on stack so can pass &tid to pkfn
|
|
|
|
//
|
|
|
|
// reg temps:
|
|
|
|
// %rax: used all over the place
|
|
|
|
// %rdx: used in stack pointer alignment calculation
|
|
|
|
// %r11: used to traverse p_argv array
|
|
|
|
// %rsi: used as temporary for stack parameters
|
|
|
|
// used as temporary for number of pkfn parms to push
|
|
|
|
// %rbx: used to hold pkfn address, and zero constant, callee-save
|
|
|
|
//
|
|
|
|
// return: %eax (always 1/TRUE)
|
|
|
|
// Frame layout once the prologue has run (offsets from %rbp):
//      0   caller %rbp
//     -8   caller %rbx
//    -16   gtid  (__gtid)
//    -24   tid   (__tid)
// Below these slots come alignment padding and then the copies of
// p_argv[4] .. p_argv[argc-1] that pkfn receives on the stack.
__gtid = -16
__tid = -24

// -- Begin  __kmp_invoke_microtask
// mark_begin;
        .text
        PROC  __kmp_invoke_microtask

        // ---- prologue --------------------------------------------------
        // KMP_CFI_* are unwind-annotation macros defined elsewhere in this file.
        pushq   %rbp                    // keep the caller frame pointer
        KMP_CFI_DEF_OFFSET 16
        KMP_CFI_OFFSET rbp,-16
        movq    %rsp,%rbp               // %rbp anchors this frame from here on
        KMP_CFI_REGISTER rbp

#if OMPT_SUPPORT
        movq    %rbp, (%r9)             // *exit_frame_ptr = frame address
#endif

        pushq   %rbx                    // callee-saved; reused below for pkfn
        pushq   %rsi                    // gtid -> __gtid(%rbp) so &gtid can be passed
        pushq   %rdx                    // tid  -> __tid(%rbp)  so &tid can be passed

        // ---- n = max(0, argc-4): number of p_argv entries that go on the stack
        movq    %rcx, %rax              // rax = argc
        movq    $0, %rbx                // zero source for the clamp below
        subq    $4, %rax                // four p_argv entries travel in registers
#if KMP_MIC
        // branch-based clamp for targets built without cmov
        js      KMP_LABEL(kmp_0)        // negative -> clamp
        jmp     KMP_LABEL(kmp_0_exit)   // non-negative -> keep
KMP_LABEL(kmp_0):
        movq    %rbx, %rax              // rax = 0
KMP_LABEL(kmp_0_exit):
#else
        cmovsq  %rbx, %rax              // rax = max(0, argc-4)
#endif // KMP_MIC

        movq    %rax, %rsi              // rsi = n, needed again for the push loop
        shlq    $3, %rax                // rax = n*8 bytes of stack arguments

        // ---- pre-adjust %rsp so that it is 128-byte aligned AFTER the n pushes
        movq    %rsp, %rdx              //
        subq    %rax, %rdx              // rdx = where %rsp would end up unaligned
        movq    %rdx, %rax              // working copy

        andq    $0xFFFFFFFFFFFFFF80, %rax  // round that address down to 128 bytes
        subq    %rax, %rdx              // rdx = low 7 bits = padding required
        subq    %rdx, %rsp              // insert the padding now

        // ---- copy p_argv[argc-1] .. p_argv[4] onto the stack, last one first
        movq    %rcx, %rax              // rax = argc (tested again further down)
        cmpq    $0, %rsi
        je      KMP_LABEL(kmp_invoke_pass_parms)  // nothing to push
        shlq    $3, %rcx                // rcx = argc*8
        movq    %r8, %rdx               // rdx = p_argv
        addq    %rcx, %rdx              // rdx = &p_argv[argc]

        movq    %rsi, %rcx              // rcx = n, the loop counter

KMP_LABEL(kmp_invoke_push_parms):
        // one iteration pushes one stack parameter of pkfn
        subq    $8, %rdx                // step back to the previous entry
        movq    (%rdx), %rsi            // rsi = that entry
        pushq   %rsi                    // reverse order keeps p_argv[4] lowest
        subl    $1, %ecx

// C69570: "X86_64_RELOC_BRANCH not supported" error at linking on mac_32e
//              if the name of the label that is an operand of this jecxz starts with a dot (".");
//         Apple's linker does not support 1-byte length relocation;
//         Resolution: replace all .labelX entries with L_labelX.

        jecxz   KMP_LABEL(kmp_invoke_pass_parms)  // counter exhausted: only register parms left
        jmp     KMP_LABEL(kmp_invoke_push_parms)
        ALIGN 3
KMP_LABEL(kmp_invoke_pass_parms):       // load the register parameters of pkfn.
                                        // The order matters: several registers are
                                        // both an input here and an output to pkfn.
        movq    %rdi, %rbx              // rbx = pkfn (rdi is about to be reused)
        leaq    __gtid(%rbp), %rdi      // 1st parm: &gtid
        leaq    __tid(%rbp), %rsi       // 2nd parm: &tid

        movq    %r8, %r11               // r11 = p_argv (r8 is about to be reused)

#if KMP_MIC
        // branch-based variant: load p_argv[k] only when argc > k
        cmpq    $4, %rax                // argc >= 4?
        jns     KMP_LABEL(kmp_4)        // yes -> load
        jmp     KMP_LABEL(kmp_4_exit)   // no  -> skip
KMP_LABEL(kmp_4):
        movq    24(%r11), %r9           // 6th parm: p_argv[3]
KMP_LABEL(kmp_4_exit):

        cmpq    $3, %rax                // argc >= 3?
        jns     KMP_LABEL(kmp_3)        // yes -> load
        jmp     KMP_LABEL(kmp_3_exit)   // no  -> skip
KMP_LABEL(kmp_3):
        movq    16(%r11), %r8           // 5th parm: p_argv[2]
KMP_LABEL(kmp_3_exit):

        cmpq    $2, %rax                // argc >= 2?
        jns     KMP_LABEL(kmp_2)        // yes -> load
        jmp     KMP_LABEL(kmp_2_exit)   // no  -> skip
KMP_LABEL(kmp_2):
        movq    8(%r11), %rcx           // 4th parm: p_argv[1]
KMP_LABEL(kmp_2_exit):

        cmpq    $1, %rax                // argc >= 1?
        jns     KMP_LABEL(kmp_1)        // yes -> load
        jmp     KMP_LABEL(kmp_1_exit)   // no  -> skip
KMP_LABEL(kmp_1):
        movq    (%r11), %rdx            // 3rd parm: p_argv[0]
KMP_LABEL(kmp_1_exit):
#else
        // NOTE(review): cmov with a memory source reads the memory operand even
        // when the condition is false, so these four slots of p_argv are touched
        // for every argc -- confirm callers always provide readable storage.
        cmpq    $4, %rax                // argc >= 4?
        cmovnsq 24(%r11), %r9           // 6th parm: p_argv[3]

        cmpq    $3, %rax                // argc >= 3?
        cmovnsq 16(%r11), %r8           // 5th parm: p_argv[2]

        cmpq    $2, %rax                // argc >= 2?
        cmovnsq 8(%r11), %rcx           // 4th parm: p_argv[1]

        cmpq    $1, %rax                // argc >= 1?
        cmovnsq (%r11), %rdx            // 3rd parm: p_argv[0]
#endif // KMP_MIC

        call    *%rbx                   // (*pkfn)( &gtid, &tid, p_argv[0], ... )
        movq    $1, %rax                // return value is always 1

        // ---- epilogue ----------------------------------------------------
        movq    -8(%rbp), %rbx          // reload via %rbp: %rsp moved by a variable amount
        movq    %rbp, %rsp              // drop padding, stack parms and locals
        popq    %rbp                    // back to the caller frame pointer
        KMP_CFI_DEF rsp,8
        ret

        DEBUG_INFO __kmp_invoke_microtask
// -- End  __kmp_invoke_microtask
|
|
|
|
|
|
|
|
// kmp_uint64
// __kmp_hardware_timestamp(void)
//
// Reads the time-stamp counter and returns it as one 64-bit value.
//
// out:
//      %rax:   (high half << 32) | low half
// scratch:
//      %rdx:   receives the high half from rdtsc
        .text
        PROC  __kmp_hardware_timestamp
        rdtsc                           // edx:eax <- time-stamp counter
        shlq    $32, %rdx               // move the high half into bits 63..32
        orq     %rdx, %rax              // merge both halves into rax
        ret

        DEBUG_INFO __kmp_hardware_timestamp
// -- End  __kmp_hardware_timestamp
|
|
|
|
|
|
|
|
//------------------------------------------------------------------------
// FUNCTION __kmp_bsr32
//
// int
// __kmp_bsr32( int );
//
// Bit-scan-reverse of the 32-bit argument: yields the index of its most
// significant set bit.
//
// in:
//      %edi:   value to scan
// out:
//      %eax:   bit index found by bsr
//
// NOTE(review): bsr leaves its destination undefined for a zero source;
// no guard is present here, so callers presumably never pass 0 -- confirm.
        .text
        PROC  __kmp_bsr32

        bsr     %edi,%eax
        ret

        DEBUG_INFO __kmp_bsr32
|
|
|
|
|
|
|
|
// -----------------------------------------------------------------------
|
|
|
|
#endif /* KMP_ARCH_X86_64 */
|
2013-12-24 01:28:57 +08:00
|
|
|
|
2016-05-13 16:26:42 +08:00
|
|
|
// '
|
2017-04-17 19:58:20 +08:00
|
|
|
#if (KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64
|
2016-05-13 16:26:42 +08:00
|
|
|
|
|
|
|
//------------------------------------------------------------------------
// int
// __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...),
//                         int gtid, int tid,
//                         int argc, void *p_argv[]
// #if OMPT_SUPPORT
//                         ,
//                         void **exit_frame_ptr
// #endif
//                       ) {
// #if OMPT_SUPPORT
//   *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
// #endif
//
//   (*pkfn)( & gtid, & tid, argv[0], ... );
//
// // FIXME: This is done at call-site and can be removed here.
// #if OMPT_SUPPORT
//   *exit_frame_ptr = 0;
// #endif
//
//   return 1;
// }
//
// AArch64 flavour of the microtask trampoline.
//
// incoming registers:
//      x0:     pkfn
//      w1:     gtid
//      w2:     tid
//      w3:     argc
//      x4:     p_argv
//      x5:     &exit_frame
//
// frame-resident locals (addressed downwards from x29):
//      __gtid: copy of gtid, so that &gtid can be handed to pkfn
//      __tid:  copy of tid, so that &tid can be handed to pkfn
//
// temporaries:
//      x8:     pkfn address for the indirect call
//      w9:     stack size in 16-byte units, then count of parms still to place
//      x10:    cursor into p_argv
//      x11:    cursor into the outgoing stack-argument area
//      x12:    carries one stack parameter at a time
//      x19:    keeps exit_frame_ptr across the call (callee-saved)
//
// result: w0 (always 1/TRUE)
//

__gtid = 4
__tid = 8

// -- Begin  __kmp_invoke_microtask
// mark_begin;
        .text
        PROC __kmp_invoke_microtask

        // ---- prologue: save fp/lr (and x19/x20 when OMPT needs x19)
        stp     x29, x30, [sp, #-16]!
# if OMPT_SUPPORT
        stp     x19, x20, [sp, #-16]!
# endif
        mov     x29, sp

        // ---- reserve 16 * (1 + argc/2) bytes: room for gtid/tid just below
        //      x29 plus the outgoing stack arguments, in 16-byte units
        orr     w9, wzr, #1             // w9 = 1
        add     w9, w9, w3, lsr #1      // w9 = 1 + argc/2
        sub     sp, sp, w9, uxtw #4     // sp -= w9 * 16
        mov     x11, sp                 // stack arguments are written upwards from sp

        // ---- move the inputs out of the argument registers
        mov     x8, x0                  // x8 = pkfn
        str     w1, [x29, #-__gtid]     // spill gtid
        str     w2, [x29, #-__tid]      // spill tid
        mov     w9, w3                  // w9 = argc
        mov     x10, x4                 // x10 = p_argv
# if OMPT_SUPPORT
        mov     x19, x5                 // x19 = exit_frame_ptr
        str     x29, [x19]              // *exit_frame_ptr = frame address
# endif

        sub     x0, x29, #__gtid        // 1st parm: &gtid
        sub     x1, x29, #__tid         // 2nd parm: &tid

        // ---- p_argv[0..5] go to x2..x7; leave as soon as argc is exhausted
        cbz     w9, KMP_LABEL(kmp_1)
        ldr     x2, [x10]               // p_argv[0]

        sub     w9, w9, #1
        cbz     w9, KMP_LABEL(kmp_1)
        ldr     x3, [x10, #8]!          // p_argv[1] (pre-indexed: x10 advances)

        sub     w9, w9, #1
        cbz     w9, KMP_LABEL(kmp_1)
        ldr     x4, [x10, #8]!          // p_argv[2]

        sub     w9, w9, #1
        cbz     w9, KMP_LABEL(kmp_1)
        ldr     x5, [x10, #8]!          // p_argv[3]

        sub     w9, w9, #1
        cbz     w9, KMP_LABEL(kmp_1)
        ldr     x6, [x10, #8]!          // p_argv[4]

        sub     w9, w9, #1
        cbz     w9, KMP_LABEL(kmp_1)
        ldr     x7, [x10, #8]!          // p_argv[5]

        // ---- every further entry is copied to the outgoing stack area
KMP_LABEL(kmp_0):
        sub     w9, w9, #1
        cbz     w9, KMP_LABEL(kmp_1)
        ldr     x12, [x10, #8]!         // next p_argv entry
        str     x12, [x11], #8          // store it, then advance the stack cursor
        b       KMP_LABEL(kmp_0)
KMP_LABEL(kmp_1):
        blr     x8                      // (*pkfn)( &gtid, &tid, p_argv[0], ... )
        orr     w0, wzr, #1             // return value is always 1
        mov     sp, x29                 // release the variable-size area
# if OMPT_SUPPORT
        str     xzr, [x19]              // *exit_frame_ptr = 0
        ldp     x19, x20, [sp], #16
# endif
        ldp     x29, x30, [sp], #16
        ret

        DEBUG_INFO __kmp_invoke_microtask
// -- End  __kmp_invoke_microtask
|
|
|
|
|
2017-04-17 19:58:20 +08:00
|
|
|
#endif /* (KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64 */
|
2016-05-13 16:26:42 +08:00
|
|
|
|
2016-05-26 12:48:14 +08:00
|
|
|
#if KMP_ARCH_PPC64
|
|
|
|
|
|
|
|
//------------------------------------------------------------------------
// int
// __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...),
//                         int gtid, int tid,
//                         int argc, void *p_argv[]
// #if OMPT_SUPPORT
//                         ,
//                         void **exit_frame_ptr
// #endif
//                       ) {
// #if OMPT_SUPPORT
//   *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
// #endif
//
//   (*pkfn)( & gtid, & tid, argv[0], ... );
//
// // FIXME: This is done at call-site and can be removed here.
// #if OMPT_SUPPORT
//   *exit_frame_ptr = 0;
// #endif
//
//   return 1;
// }
//
// PPC64 flavour of the microtask trampoline (ELFv1 and ELFv2).
//
// incoming registers:
//      r3:     pkfn
//      r4:     gtid
//      r5:     tid
//      r6:     argc
//      r7:     p_argv
//      r8:     &exit_frame
//
// result: r3 (always 1/TRUE)
//
        .text
# if KMP_ARCH_PPC64_LE
        .abiversion 2
# endif
        .globl  __kmp_invoke_microtask

# if KMP_ARCH_PPC64_LE
        .p2align        4
# else
        .p2align        2
# endif

        .type   __kmp_invoke_microtask,@function

# if KMP_ARCH_PPC64_LE
        // ELFv2: global entry point derives the TOC pointer from r12, the
        // local entry point follows immediately after.
__kmp_invoke_microtask:
.Lfunc_begin0:
.Lfunc_gep0:
        addis 2, 12, .TOC.-.Lfunc_gep0@ha
        addi 2, 2, .TOC.-.Lfunc_gep0@l
.Lfunc_lep0:
        .localentry     __kmp_invoke_microtask, .Lfunc_lep0-.Lfunc_gep0
# else
        // ELFv1: the public symbol names a function descriptor in .opd;
        // the code itself starts at .Lfunc_begin0.
        .section        .opd,"aw",@progbits
__kmp_invoke_microtask:
        .p2align        3
        .quad   .Lfunc_begin0
        .quad   .TOC.@tocbase
        .quad   0
        .text
.Lfunc_begin0:
# endif

// -- Begin __kmp_invoke_microtask
// mark_begin;

// Frame size: the fixed ABI header (48 bytes for ELFv1, 32 bytes for ELFv2),
// 8*(2 + argc) bytes of parameter area for the microtask, 8 bytes shared by
// the r4 and r5 spills, 8 bytes for r31, and 8 more bytes for r30, which
// keeps a copy of r8 when OMP-T support is compiled in.

        .cfi_startproc
        mflr 0
        std 31, -8(1)                   // save r31 below the entry r1
        std 0, 16(1)                    // save lr in the caller frame

// r31 normally mirrors r1 after the frame exists. Here the frame size is
// computed at run time, so r31 is set to the entry value of r1 instead; it
// addresses the save slots and restores r1 on the way out.
        mr 31, 1
        .cfi_def_cfa_register r31
        .cfi_offset r31, -8
        .cfi_offset lr, 16

// r12 = -(fixed part + 8*argc)
# if KMP_ARCH_PPC64_LE
        li 12, 72
# else
        li 12, 88
# endif
        sldi 0, 6, 3                    // r0 = argc * 8
        add 12, 0, 12
        neg 12, 12

// Keep the frame aligned: 16 bytes in general, 32 bytes under the BG/Q CNK.
# if KMP_OS_CNK
        li 0, -32
# else
        li 0, -16
# endif
        and 12, 0, 12                   // masking a negative size rounds the frame up

// Create the frame: store the old r1 at the new r1 and update r1.
        stdux 1, 1, 12

# if OMPT_SUPPORT
        .cfi_offset r30, -16
        std 30, -16(31)                 // save r30
        std 1, 0(8)                     // *exit_frame_ptr = new r1
        mr 30, 8                        // r30 = exit_frame_ptr across the call
# endif

// gtid and tid are passed by reference, so they need memory homes.
        stw 4, -20(31)
        stw 5, -24(31)

        mr 12, 6                        // r12 = argc
        mr 4, 7                         // r4 = p_argv

// p_argv[0..5] go to r5..r10; leave as soon as argc is exhausted.
        cmpwi 0, 12, 1
        blt 0, .Lcall

        ld 5, 0(4)

        cmpwi 0, 12, 2
        blt 0, .Lcall

        ld 6, 8(4)

        cmpwi 0, 12, 3
        blt 0, .Lcall

        ld 7, 16(4)

        cmpwi 0, 12, 4
        blt 0, .Lcall

        ld 8, 24(4)

        cmpwi 0, 12, 5
        blt 0, .Lcall

        ld 9, 32(4)

        cmpwi 0, 12, 6
        blt 0, .Lcall

        ld 10, 40(4)

        cmpwi 0, 12, 7
        blt 0, .Lcall

// More than six microtask parameters: copy the rest to the stack.
        addi 12, 12, -6
        mtctr 12                        // ctr = argc - 6

// Both cursors start 8 bytes short of their first slot because the loop
// uses update-form (pre-increment) loads and stores. The first stack slot
// for the microtask is 48 + 8*8 == 112 bytes above r1 for ELFv1 and
// 32 + 8*8 == 96 bytes above r1 for ELFv2.
        addi 4, 4, 40
# if KMP_ARCH_PPC64_LE
        addi 12, 1, 88
# else
        addi 12, 1, 104
# endif

.Lnext:
        ldu 0, 8(4)                     // next p_argv entry
        stdu 0, 8(12)                   // into the next parameter slot
        bdnz .Lnext

.Lcall:
# if KMP_ARCH_PPC64_LE
        std 2, 24(1)                    // save the TOC pointer
        mr 12, 3                        // r12 = pkfn entry address
# else
        std 2, 40(1)                    // save the TOC pointer
// ELFv1: pkfn points at a function descriptor; fetch its three words
// (entry address, TOC pointer, and the third word into r11).
        ld 12, 0(3)
        ld 2, 8(3)
        ld 11, 16(3)
# endif

        addi 3, 31, -20                 // 1st parm: &gtid
        addi 4, 31, -24                 // 2nd parm: &tid

        mtctr 12
        bctrl                           // (*pkfn)( &gtid, &tid, p_argv[0], ... )
# if KMP_ARCH_PPC64_LE
        ld 2, 24(1)                     // restore the TOC pointer
# else
        ld 2, 40(1)                     // restore the TOC pointer
# endif

# if OMPT_SUPPORT
        li 3, 0
        std 3, 0(30)                    // *exit_frame_ptr = 0
# endif

        li 3, 1                         // return value is always 1

# if OMPT_SUPPORT
        ld 30, -16(31)                  // restore r30
# endif

        mr 1, 31                        // drop the frame
        ld 0, 16(1)
        ld 31, -8(1)
        mtlr 0
        blr

        // NOTE(review): 12 bytes of zero data trail the code; presumably
        // emitted by the compiler this routine was derived from -- confirm.
        .long   0
        .quad   0
.Lfunc_end0:
        .size   __kmp_invoke_microtask, .Lfunc_end0-.Lfunc_begin0
        .cfi_endproc

// -- End  __kmp_invoke_microtask
|
|
|
|
|
|
|
|
#endif /* KMP_ARCH_PPC64 */
|
|
|
|
|
2016-12-08 17:22:24 +08:00
|
|
|
#if KMP_ARCH_ARM || KMP_ARCH_MIPS
// 32-bit targets: reserve the common symbol .gomp_critical_user_ (32 bytes)
// and publish its address through the 4-byte global
// __kmp_unnamed_critical_addr.
        .data
        .comm   .gomp_critical_user_,32,8
        .data
        .align  4
        .global __kmp_unnamed_critical_addr
__kmp_unnamed_critical_addr:
        .4byte  .gomp_critical_user_
        .size   __kmp_unnamed_critical_addr,4
#endif /* KMP_ARCH_ARM || KMP_ARCH_MIPS */
|
|
|
|
|
2016-12-08 17:22:24 +08:00
|
|
|
#if KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64
// 64-bit targets: reserve the common symbol .gomp_critical_user_ (32 bytes)
// and publish its address through the 8-byte global
// __kmp_unnamed_critical_addr.
        .data
        .comm   .gomp_critical_user_,32,8
        .data
        .align  8
        .global __kmp_unnamed_critical_addr
__kmp_unnamed_critical_addr:
        .8byte  .gomp_critical_user_
        .size   __kmp_unnamed_critical_addr,8
#endif /* KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64 */
|
2013-12-24 01:28:57 +08:00
|
|
|
|
2015-08-21 03:46:14 +08:00
|
|
|
#if KMP_OS_LINUX
|
I apologise in advance for the size of this check-in. At Intel we do
understand that this is not friendly, and are working to change our
internal code-development to make it easier to make development
features available more frequently and in finer (more functional)
chunks. Unfortunately we haven't got that in place yet, and unpicking
this into multiple separate check-ins would be non-trivial, so please
bear with me on this one. We should be better in the future.
Apologies over, what do we have here?
GCC 4.9 compatibility
--------------------
* We have implemented the new entrypoints used by code compiled by GCC
4.9 to implement the same functionality in gcc 4.8. Therefore code
compiled with gcc 4.9 that used to work will continue to do so.
However, there are some other new entrypoints (associated with task
cancellation) which are not implemented. Therefore user code compiled
by gcc 4.9 that uses these new features will not link against the LLVM
runtime. (It remains unclear how to handle those entrypoints, since
the GCC interface has potentially unpleasant performance implications
for join barriers even when cancellation is not used)
--- new parallel entry points ---
new entry points that aren't OpenMP 4.0 related
These are implemented fully :-
GOMP_parallel_loop_dynamic()
GOMP_parallel_loop_guided()
GOMP_parallel_loop_runtime()
GOMP_parallel_loop_static()
GOMP_parallel_sections()
GOMP_parallel()
--- cancellation entry points ---
Currently, these only give a runtime error if OMP_CANCELLATION is true
because our plain barriers don't check for cancellation while waiting
GOMP_barrier_cancel()
GOMP_cancel()
GOMP_cancellation_point()
GOMP_loop_end_cancel()
GOMP_sections_end_cancel()
--- taskgroup entry points ---
These are implemented fully.
GOMP_taskgroup_start()
GOMP_taskgroup_end()
--- target entry points ---
These are empty (as they are in libgomp)
GOMP_target()
GOMP_target_data()
GOMP_target_end_data()
GOMP_target_update()
GOMP_teams()
Improvements in Barriers and Fork/Join
--------------------------------------
* Barrier and fork/join code is now in its own file (which makes it
easier to understand and modify).
* Wait/release code is now templated and in its own file; suspend/resume code is also templated
* There's a new, hierarchical, barrier, which exploits the
cache-hierarchy of the Intel(r) Xeon Phi(tm) coprocessor to improve
fork/join and barrier performance.
***BEWARE*** the new source files have *not* been added to the legacy
CMake build system. If you want to use that, fixes will be required.
Statistics Collection Code
--------------------------
* New code has been added to collect application statistics (if this
is enabled at library compile time; by default it is not). The
statistics code itself is generally useful, the lightweight timing
code uses the X86 rdtsc instruction, so will require changes for other
architectures.
The intent of this code is not for users to tune their codes but
rather
1) For timing code-paths inside the runtime
2) For gathering general properties of OpenMP codes to focus attention
on which OpenMP features are most used.
Nested Hot Teams
----------------
* The runtime now maintains more state to reduce the overhead of
creating and destroying inner parallel teams. This improves the
performance of code that repeatedly uses nested parallelism with the
same resource allocation. Set the new KMP_HOT_TEAMS_MAX_LEVEL
envirable to a depth to enable this (and, of course, OMP_NESTED=true
to enable nested parallelism at all).
Improved Intel(r) VTune(Tm) Amplifier support
---------------------------------------------
* The runtime provides additional information to Vtune via the
itt_notify interface to allow it to display better OpenMP specific
analyses of load-imbalance.
Support for OpenMP Composite Statements
---------------------------------------
* Implement new entrypoints required by some of the OpenMP 4.1
composite statements.
Improved ifdefs
---------------
* More separation of concepts ("Does this platform do X?") from
platforms ("Are we compiling for platform Y?"), which should simplify
future porting.
ScaleMP* contribution
---------------------
Stack padding to improve the performance in their environment where
cross-node coherency is managed at the page level.
Redesign of wait and release code
---------------------------------
The code is simplified and performance improved.
Bug Fixes
---------
*Fixes for Windows multiple processor groups.
*Fix Fortran module build on Linux: offload attribute added.
*Fix entry names for distribute-parallel-loop construct to be consistent with the compiler codegen.
*Fix an inconsistent error message for KMP_PLACE_THREADS environment variable.
llvm-svn: 219214
2014-10-08 00:25:50 +08:00
|
|
|
# if KMP_ARCH_ARM
|
|
|
|
.section .note.GNU-stack,"",%progbits
|
|
|
|
# else
|
2013-12-24 01:28:57 +08:00
|
|
|
.section .note.GNU-stack,"",@progbits
|
I apologise in advance for the size of this check-in. At Intel we do
understand that this is not friendly, and are working to change our
internal code-development to make it easier to make development
features available more frequently and in finer (more functional)
chunks. Unfortunately we haven't got that in place yet, and unpicking
this into multiple separate check-ins would be non-trivial, so please
bear with me on this one. We should be better in the future.
Apologies over, what do we have here?
GCC 4.9 compatibility
--------------------
* We have implemented the new entrypoints used by code compiled by GCC
4.9 to implement the same functionality in gcc 4.8. Therefore code
compiled with gcc 4.9 that used to work will continue to do so.
However, there are some other new entrypoints (associated with task
cancellation) which are not implemented. Therefore user code compiled
by gcc 4.9 that uses these new features will not link against the LLVM
runtime. (It remains unclear how to handle those entrypoints, since
the GCC interface has potentially unpleasant performance implications
for join barriers even when cancellation is not used)
--- new parallel entry points ---
new entry points that aren't OpenMP 4.0 related
These are implemented fully :-
GOMP_parallel_loop_dynamic()
GOMP_parallel_loop_guided()
GOMP_parallel_loop_runtime()
GOMP_parallel_loop_static()
GOMP_parallel_sections()
GOMP_parallel()
--- cancellation entry points ---
Currently, these only give a runtime error if OMP_CANCELLATION is true
because our plain barriers don't check for cancellation while waiting
GOMP_barrier_cancel()
GOMP_cancel()
GOMP_cancellation_point()
GOMP_loop_end_cancel()
GOMP_sections_end_cancel()
--- taskgroup entry points ---
These are implemented fully.
GOMP_taskgroup_start()
GOMP_taskgroup_end()
--- target entry points ---
These are empty (as they are in libgomp)
GOMP_target()
GOMP_target_data()
GOMP_target_end_data()
GOMP_target_update()
GOMP_teams()
Improvements in Barriers and Fork/Join
--------------------------------------
* Barrier and fork/join code is now in its own file (which makes it
easier to understand and modify).
* Wait/release code is now templated and in its own file; suspend/resume code is also templated
* There's a new, hierarchical, barrier, which exploits the
cache-hierarchy of the Intel(r) Xeon Phi(tm) coprocessor to improve
fork/join and barrier performance.
***BEWARE*** the new source files have *not* been added to the legacy
CMake build system. If you want to use that, fixes will be required.
Statistics Collection Code
--------------------------
* New code has been added to collect application statistics (if this
is enabled at library compile time; by default it is not). The
statistics code itself is generally useful, the lightweight timing
code uses the X86 rdtsc instruction, so will require changes for other
architectures.
The intent of this code is not for users to tune their codes but
rather
1) For timing code-paths inside the runtime
2) For gathering general properties of OpenMP codes to focus attention
on which OpenMP features are most used.
Nested Hot Teams
----------------
* The runtime now maintains more state to reduce the overhead of
creating and destroying inner parallel teams. This improves the
performance of code that repeatedly uses nested parallelism with the
same resource allocation. Set the new KMP_HOT_TEAMS_MAX_LEVEL
envirable to a depth to enable this (and, of course, OMP_NESTED=true
to enable nested parallelism at all).
Improved Intel(r) VTune(Tm) Amplifier support
---------------------------------------------
* The runtime provides additional information to Vtune via the
itt_notify interface to allow it to display better OpenMP specific
analyses of load-imbalance.
Support for OpenMP Composite Statements
---------------------------------------
* Implement new entrypoints required by some of the OpenMP 4.1
composite statements.
Improved ifdefs
---------------
* More separation of concepts ("Does this platform do X?") from
platforms ("Are we compiling for platform Y?"), which should simplify
future porting.
ScaleMP* contribution
---------------------
Stack padding to improve the performance in their environment where
cross-node coherency is managed at the page level.
Redesign of wait and release code
---------------------------------
The code is simplified and performance improved.
Bug Fixes
---------
*Fixes for Windows multiple processor groups.
*Fix Fortran module build on Linux: offload attribute added.
*Fix entry names for distribute-parallel-loop construct to be consistent with the compiler codegen.
*Fix an inconsistent error message for KMP_PLACE_THREADS environment variable.
llvm-svn: 219214
2014-10-08 00:25:50 +08:00
|
|
|
# endif
|
2013-12-24 01:28:57 +08:00
|
|
|
#endif
|