[feat] [assistant] [I3T96T] add new Dataset operator CMUARCTICDataset
This commit is contained in:
parent
c5bc9af49f
commit
b077aa1cab
|
@ -24,6 +24,9 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
|
|||
-Wno-return-std-move -Wno-unused-private-field -Wno-unused-lambda-capture -Wno-sign-compare \
|
||||
-Wno-overloaded-virtual -Wno-unneeded-internal-declaration -Wno-unused-variable -Wno-pessimizing-move \
|
||||
-Wno-inconsistent-missing-override -DHALF_ENABLE_CPP11_USER_LITERALS=0 -D_FORTIFY_SOURCE=2")
|
||||
elseif(ENABLE_SYM_FILE)
|
||||
set(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -O2 -g -ggdb -Wl,--allow-shlib-undefined \
|
||||
-DHALF_ENABLE_CPP11_USER_LITERALS=0 -D_FORTIFY_SOURCE=2")
|
||||
else()
|
||||
set(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -O2 -Wl,--allow-shlib-undefined \
|
||||
-DHALF_ENABLE_CPP11_USER_LITERALS=0 -D_FORTIFY_SOURCE=2")
|
||||
|
|
12
build.sh
12
build.sh
|
@ -27,7 +27,7 @@ usage()
|
|||
echo " [-P on|off] [-z [on|off]] [-M on|off] [-V 10.1|11.1|310|910] [-I arm64|arm32|x86_64] [-K] \\"
|
||||
echo " [-B on|off] [-E] [-l on|off] [-n full|lite|off] [-H on|off] \\"
|
||||
echo " [-A on|off] [-S on|off] [-k on|off] [-W sse|neon|avx|avx512|off] \\"
|
||||
echo " [-L Tensor-RT path] \\"
|
||||
echo " [-L Tensor-RT path] [-y on|off] \\"
|
||||
echo ""
|
||||
echo "Options:"
|
||||
echo " -d Debug mode"
|
||||
|
@ -64,6 +64,7 @@ usage()
|
|||
echo " -W Enable x86_64 SSE or AVX instruction set, use [sse|neon|avx|avx512|off], default off for lite and avx for CPU"
|
||||
echo " -H Enable hidden"
|
||||
echo " -L Link and specify Tensor-RT library path, default disable Tensor-RT lib linking"
|
||||
echo " -y Compile the symbol table switch and save the symbol table to the directory output"
|
||||
}
|
||||
|
||||
# check value of input is 'on' or 'off'
|
||||
|
@ -122,8 +123,9 @@ checkopts()
|
|||
TENSORRT_HOME=""
|
||||
USER_ENABLE_DUMP_IR=false
|
||||
USER_ENABLE_DEBUGGER=false
|
||||
ENABLE_SYM_FILE="off"
|
||||
# Process the options
|
||||
while getopts 'drvj:c:t:hb:s:a:g:p:ie:m:l:I:RP:D:zM:V:K:B:En:A:S:k:W:H:L:' opt
|
||||
while getopts 'drvj:c:t:hb:s:a:g:p:ie:m:l:I:RP:D:zM:V:K:B:En:A:S:k:W:H:L:y' opt
|
||||
do
|
||||
CASE_SENSIVE_ARG=${OPTARG}
|
||||
OPTARG=$(echo ${OPTARG} | tr '[A-Z]' '[a-z]')
|
||||
|
@ -140,6 +142,9 @@ checkopts()
|
|||
exit 1
|
||||
fi
|
||||
;;
|
||||
y)
|
||||
ENABLE_SYM_FILE="on"
|
||||
;;
|
||||
r)
|
||||
DEBUG_MODE="off"
|
||||
;;
|
||||
|
@ -442,6 +447,9 @@ build_mindspore()
|
|||
if [[ -n "$TRAIN_MODE" ]]; then
|
||||
CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_${TRAIN_MODE}=ON"
|
||||
fi
|
||||
if [[ "X$ENABLE_SYM_FILE" = "Xon" ]]; then
|
||||
CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_SYM_FILE=ON"
|
||||
fi
|
||||
if [[ "X$ENABLE_ASAN" = "Xon" ]]; then
|
||||
CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_ASAN=ON"
|
||||
fi
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
if(MSVC)
|
||||
set(flatbuffers_CXXFLAGS "${CMAKE_CXX_FLAGS}")
|
||||
set(flatbuffers_CFLAGS "${CMAKE_CXX_FLAGS}")
|
||||
set(flatbuffers_CFLAGS "${CMAKE_C_FLAGS}")
|
||||
set(flatbuffers_LDFLAGS "${CMAKE_SHARED_LINKER_FLAGS}")
|
||||
else()
|
||||
set(flatbuffers_CXXFLAGS "-D_FORTIFY_SOURCE=2 -O2")
|
||||
set(flatbuffers_CFLAGS "-D_FORTIFY_SOURCE=2 -O2")
|
||||
set(flatbuffers_CXXFLAGS "-fPIC -fPIE -D_FORTIFY_SOURCE=2 -O2 -fstack-protector-strong")
|
||||
set(flatbuffers_CFLAGS "-fPIC -fPIE -D_FORTIFY_SOURCE=2 -O2 -fstack-protector-strong")
|
||||
endif()
|
||||
|
||||
if(WIN32)
|
||||
|
|
|
@ -89,7 +89,6 @@ if(ENABLE_MINDDATA)
|
|||
include(${CMAKE_SOURCE_DIR}/cmake/external_libs/tinyxml2.cmake)
|
||||
include(${CMAKE_SOURCE_DIR}/cmake/external_libs/cppjieba.cmake)
|
||||
include(${CMAKE_SOURCE_DIR}/cmake/external_libs/sentencepiece.cmake)
|
||||
include(${CMAKE_SOURCE_DIR}/cmake/external_libs/ffmpeg.cmake)
|
||||
endif()
|
||||
|
||||
if(ENABLE_MINDDATA)
|
||||
|
|
|
@ -25,6 +25,7 @@ option(ENABLE_ACL "enable acl" OFF)
|
|||
option(ENABLE_GLIBCXX "enable_glibcxx" OFF)
|
||||
option(MODE_ASCEND_ALL "supports all ascend platform" OFF)
|
||||
option(MODE_ASCEND_ACL "supports ascend acl mode only" OFF)
|
||||
option(ENABLE_SYM_FILE "enable sym file" OFF)
|
||||
|
||||
if(NOT ENABLE_D AND NOT ENABLE_TESTCASES AND NOT ENABLE_ACL AND NOT ENABLE_GE)
|
||||
set(ENABLE_GLIBCXX ON)
|
||||
|
|
|
@ -12,6 +12,8 @@ set(CPACK_TEMPORARY_PACKAGE_FILE_NAME ${BUILD_PATH}/package/mindspore)
|
|||
set(CPACK_TEMPORARY_INSTALL_DIRECTORY ${BUILD_PATH}/package/mindspore)
|
||||
set(CPACK_PACK_ROOT_DIR ${BUILD_PATH}/package/)
|
||||
set(CPACK_CMAKE_SOURCE_DIR ${CMAKE_SOURCE_DIR})
|
||||
set(CPACK_ENABLE_SYM_FILE ${ENABLE_SYM_FILE})
|
||||
set(CPACK_CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE})
|
||||
if(ENABLE_GE)
|
||||
set(CPACK_MS_BACKEND "ge")
|
||||
set(CPACK_MS_TARGET "ascend or cpu")
|
||||
|
@ -125,17 +127,6 @@ if(ENABLE_MINDDATA)
|
|||
DESTINATION ${INSTALL_LIB_DIR} RENAME libicudata.so.67 COMPONENT mindspore)
|
||||
install(FILES ${icu4c_LIBPATH}/libicui18n.so.67.1
|
||||
DESTINATION ${INSTALL_LIB_DIR} RENAME libicui18n.so.67 COMPONENT mindspore)
|
||||
|
||||
install(FILES ${ffmpeg_LIBPATH}/libavcodec.so.58.91.100
|
||||
DESTINATION ${INSTALL_LIB_DIR} RENAME libavcodec.so.58 COMPONENT mindspore)
|
||||
install(FILES ${ffmpeg_LIBPATH}/libavformat.so.58.45.100
|
||||
DESTINATION ${INSTALL_LIB_DIR} RENAME libavformat.so.58 COMPONENT mindspore)
|
||||
install(FILES ${ffmpeg_LIBPATH}/libavutil.so.56.51.100
|
||||
DESTINATION ${INSTALL_LIB_DIR} RENAME libavutil.so.56 COMPONENT mindspore)
|
||||
install(FILES ${ffmpeg_LIBPATH}/libswresample.so.3.7.100
|
||||
DESTINATION ${INSTALL_LIB_DIR} RENAME libswresample.so.3 COMPONENT mindspore)
|
||||
install(FILES ${ffmpeg_LIBPATH}/libswscale.so.5.7.100
|
||||
DESTINATION ${INSTALL_LIB_DIR} RENAME libswscale.so.5 COMPONENT mindspore)
|
||||
endif()
|
||||
|
||||
if(ENABLE_CPU)
|
||||
|
|
|
@ -77,6 +77,48 @@ set(ENV{BACKEND_TARGET} ${CPACK_MS_TARGET})
|
|||
set(ENV{MS_PACKAGE_NAME} ${CPACK_MS_PACKAGE_NAME})
|
||||
set(ENV{COMMIT_ID} ${GIT_COMMIT_ID})
|
||||
|
||||
file(GLOB DEBUG_SYM
|
||||
${MS_PACK_ROOT_DIR}/mindspore/*.so
|
||||
${MS_PACK_ROOT_DIR}/mindspore/lib/*.so
|
||||
)
|
||||
|
||||
file(GLOB DEBUG_STRIP_SYM
|
||||
${MS_PACK_ROOT_DIR}/mindspore/*.so
|
||||
${MS_PACK_ROOT_DIR}/mindspore/lib/*.so*
|
||||
)
|
||||
|
||||
set(CMAKE_OBJCOPY $ENV{CROSS_COMPILE}objcopy)
|
||||
set(CMAKE_STRIP $ENV{CROSS_COMPILE}strip)
|
||||
|
||||
if(CPACK_ENABLE_SYM_FILE)
|
||||
foreach(schema ${DEBUG_SYM})
|
||||
execute_process(
|
||||
COMMAND ${CMAKE_OBJCOPY} "--only-keep-debug" ${schema} ${schema}.sym
|
||||
WORKING_DIRECTORY ${MS_PACK_ROOT_DIR}
|
||||
)
|
||||
endforeach()
|
||||
endif()
|
||||
|
||||
if("${CPACK_CMAKE_BUILD_TYPE}" STREQUAL "Release")
|
||||
foreach(schema ${DEBUG_STRIP_SYM})
|
||||
execute_process(
|
||||
COMMAND ${CMAKE_STRIP} ${schema}
|
||||
WORKING_DIRECTORY ${MS_PACK_ROOT_DIR}
|
||||
)
|
||||
endforeach()
|
||||
endif()
|
||||
|
||||
file(GLOB DEBUG_SYM_FILE
|
||||
${MS_PACK_ROOT_DIR}/mindspore/*.sym
|
||||
${MS_PACK_ROOT_DIR}/mindspore/lib/*.sym
|
||||
)
|
||||
|
||||
if(CPACK_ENABLE_SYM_FILE)
|
||||
file(MAKE_DIRECTORY ${MS_ROOT_DIR}/debug_info)
|
||||
file(COPY ${DEBUG_SYM_FILE} DESTINATION ${MS_ROOT_DIR}/debug_info/)
|
||||
file(REMOVE_RECURSE ${DEBUG_SYM_FILE})
|
||||
endif()
|
||||
|
||||
execute_process(
|
||||
COMMAND ${PYTHON} ${MS_ROOT_DIR}/setup.py "bdist_wheel"
|
||||
WORKING_DIRECTORY ${MS_PACK_ROOT_DIR}
|
||||
|
@ -104,3 +146,16 @@ file(COPY ${MS_PACK_ROOT_DIR}/${NEW_FILE_NAME} DESTINATION ${MS_ROOT_DIR}/output
|
|||
|
||||
file(SHA256 ${MS_ROOT_DIR}/output/${NEW_FILE_NAME} SHA256_VAR)
|
||||
file(WRITE ${MS_ROOT_DIR}/output/${NEW_FILE_NAME}.sha256 ${SHA256_VAR} " " ${NEW_FILE_NAME})
|
||||
set(CMAKE_TAR $ENV{CROSS_COMPILE}tar)
|
||||
if(CPACK_ENABLE_SYM_FILE)
|
||||
file(MAKE_DIRECTORY ${MS_ROOT_DIR}/output/${PACKAGE_NAME}-${VERSION}-${PY_TAGS}-${PLATFORM_TAG})
|
||||
file(COPY ${MS_ROOT_DIR}/debug_info/ DESTINATION
|
||||
${MS_ROOT_DIR}/output/${PACKAGE_NAME}-${VERSION}-${PY_TAGS}-${PLATFORM_TAG}/)
|
||||
execute_process(COMMAND
|
||||
${CMAKE_COMMAND} -E ${CMAKE_TAR} cfv
|
||||
${MS_ROOT_DIR}/output/${PACKAGE_NAME}-${VERSION}-${PY_TAGS}-${PLATFORM_TAG}.zip
|
||||
${MS_ROOT_DIR}/output/${PACKAGE_NAME}-${VERSION}-${PY_TAGS}-${PLATFORM_TAG}/ --format=zip
|
||||
WORKING_DIRECTORY ${MS_ROOT_DIR})
|
||||
file(REMOVE_RECURSE ${MS_ROOT_DIR}/debug_info)
|
||||
file(REMOVE_RECURSE ${MS_ROOT_DIR}/output/${PACKAGE_NAME}-${VERSION}-${PY_TAGS}-${PLATFORM_TAG})
|
||||
endif()
|
||||
|
|
|
@ -91,18 +91,6 @@ if(ENABLE_MINDDATA)
|
|||
DESTINATION ${INSTALL_LIB_DIR}
|
||||
COMPONENT mindspore
|
||||
)
|
||||
file(GLOB_RECURSE FFMPEG_LIB_LIST
|
||||
${ffmpeg_LIBPATH}/libavcodec*
|
||||
${ffmpeg_LIBPATH}/libavformat*
|
||||
${ffmpeg_LIBPATH}/libavutil*
|
||||
${ffmpeg_LIBPATH}/libswresample*
|
||||
${ffmpeg_LIBPATH}/libswscale*
|
||||
)
|
||||
install(
|
||||
FILES ${FFMPEG_LIB_LIST}
|
||||
DESTINATION ${INSTALL_LIB_DIR}
|
||||
COMPONENT mindspore
|
||||
)
|
||||
endif()
|
||||
|
||||
# CPU mode
|
||||
|
|
|
@ -42,7 +42,6 @@ set(opencv_LIBPATH ${opencv_LIBPATH}/../bin/)
|
|||
set(jpeg_turbo_LIBPATH ${jpeg_turbo_LIBPATH}/../bin/)
|
||||
set(sqlite_LIBPATH ${sqlite_LIBPATH}/../bin/)
|
||||
set(tinyxml2_LIBPATH ${tinyxml2_LIBPATH}/../bin/)
|
||||
set(ffmpeg_LIBPATH ${ffmpeg_LIBPATH}/../bin/)
|
||||
|
||||
message("offline debugger does not support windows system temporarily")
|
||||
|
||||
|
@ -98,18 +97,6 @@ if(ENABLE_MINDDATA)
|
|||
DESTINATION ${INSTALL_LIB_DIR}
|
||||
COMPONENT mindspore
|
||||
)
|
||||
file(GLOB_RECURSE FFMPEG_LIB_LIST
|
||||
${ffmpeg_LIBPATH}/libavcodec*
|
||||
${ffmpeg_LIBPATH}/libavformat*
|
||||
${ffmpeg_LIBPATH}/libavutil*
|
||||
${ffmpeg_LIBPATH}/libswresample*
|
||||
${ffmpeg_LIBPATH}/libswscale*
|
||||
)
|
||||
install(
|
||||
FILES ${FFMPEG_LIB_LIST}
|
||||
DESTINATION ${INSTALL_LIB_DIR}
|
||||
COMPONENT mindspore
|
||||
)
|
||||
endif()
|
||||
|
||||
if(ENABLE_CPU)
|
||||
|
|
|
@ -1,2 +1,4 @@
|
|||
approvers:
|
||||
- zhoufeng54
|
||||
reviewers:
|
||||
- HW_KK
|
||||
- HW_KK
|
|
@ -58,8 +58,11 @@ RUN apt install -y libffi-dev libssl-dev zlib1g-dev libbz2-dev libncurses5-dev \
|
|||
&& make install -j4 \
|
||||
&& rm -f /usr/local/bin/python \
|
||||
&& rm -f /usr/local/bin/pip \
|
||||
&& rm -f /usr/local/lib/libpython3.7m.so.1.0 \
|
||||
&& ln -s ${PYTHON_ROOT_PATH}/bin/python3.7 /usr/local/bin/python \
|
||||
&& ln -s ${PYTHON_ROOT_PATH}/bin/pip3.7 /usr/local/bin/pip \
|
||||
&& ln -s ${PYTHON_ROOT_PATH}/lib/libpython3.7m.so.1.0 /usr/local/lib/libpython3.7m.so.1.0 \
|
||||
&& ldconfig \
|
||||
&& rm -rf /tmp/cpython-3.7.5 \
|
||||
&& rm -f /tmp/v3.7.5.tar.gz
|
||||
|
||||
|
|
|
@ -51,13 +51,16 @@ RUN apt install -y libffi-dev libssl-dev zlib1g-dev libbz2-dev libncurses5-dev \
|
|||
&& tar -xvf v3.7.5.tar.gz \
|
||||
&& cd /tmp/cpython-3.7.5 \
|
||||
&& mkdir -p ${PYTHON_ROOT_PATH} \
|
||||
&& ./configure --prefix=${PYTHON_ROOT_PATH} \
|
||||
&& ./configure --prefix=${PYTHON_ROOT_PATH} --enable-shared \
|
||||
&& make -j4 \
|
||||
&& make install -j4 \
|
||||
&& rm -f /usr/local/bin/python \
|
||||
&& rm -f /usr/local/bin/pip \
|
||||
&& rm -f /usr/local/lib/libpython3.7m.so.1.0 \
|
||||
&& ln -s ${PYTHON_ROOT_PATH}/bin/python3.7 /usr/local/bin/python \
|
||||
&& ln -s ${PYTHON_ROOT_PATH}/bin/pip3.7 /usr/local/bin/pip \
|
||||
&& ln -s ${PYTHON_ROOT_PATH}/lib/libpython3.7m.so.1.0 /usr/local/lib/libpython3.7m.so.1.0 \
|
||||
&& ldconfig \
|
||||
&& rm -rf /tmp/cpython-3.7.5 \
|
||||
&& rm -f /tmp/v3.7.5.tar.gz
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04
|
||||
FROM nvidia/cuda:11.1-cudnn8-devel-ubuntu18.04
|
||||
|
||||
MAINTAINER leonwanghui <leon.wanghui@huawei.com>
|
||||
|
||||
|
@ -43,7 +43,7 @@ RUN DEBIAN_FRONTEND=noninteractive apt install -y \
|
|||
libnuma-dev
|
||||
|
||||
# Configure cuDNN (v7.6.5)
|
||||
RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7.6.5 /usr/local/cuda/lib64/libcudnn.so
|
||||
RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.8.0.5 /usr/local/cuda/lib64/libcudnn.so
|
||||
|
||||
# Set bash
|
||||
RUN echo "dash dash/sh boolean false" | debconf-set-selections
|
||||
|
@ -62,8 +62,11 @@ RUN apt install -y libffi-dev libssl-dev zlib1g-dev libbz2-dev libncurses5-dev \
|
|||
&& make install -j4 \
|
||||
&& rm -f /usr/local/bin/python \
|
||||
&& rm -f /usr/local/bin/pip \
|
||||
&& rm -f /usr/local/lib/libpython3.7m.so.1.0 \
|
||||
&& ln -s ${PYTHON_ROOT_PATH}/bin/python3.7 /usr/local/bin/python \
|
||||
&& ln -s ${PYTHON_ROOT_PATH}/bin/pip3.7 /usr/local/bin/pip \
|
||||
&& ln -s ${PYTHON_ROOT_PATH}/lib/libpython3.7m.so.1.0 /usr/local/lib/libpython3.7m.so.1.0 \
|
||||
&& ldconfig \
|
||||
&& rm -rf /tmp/cpython-3.7.5 \
|
||||
&& rm -f /tmp/v3.7.5.tar.gz
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04
|
||||
FROM nvidia/cuda:11.1-cudnn8-devel-ubuntu18.04
|
||||
|
||||
MAINTAINER leonwanghui <leon.wanghui@huawei.com>
|
||||
|
||||
|
@ -53,13 +53,16 @@ RUN apt install -y libffi-dev libssl-dev zlib1g-dev libbz2-dev libncurses5-dev \
|
|||
&& tar -xvf v3.7.5.tar.gz \
|
||||
&& cd /tmp/cpython-3.7.5 \
|
||||
&& mkdir -p ${PYTHON_ROOT_PATH} \
|
||||
&& ./configure --prefix=${PYTHON_ROOT_PATH} \
|
||||
&& ./configure --prefix=${PYTHON_ROOT_PATH} --enable-shared \
|
||||
&& make -j4 \
|
||||
&& make install -j4 \
|
||||
&& rm -f /usr/local/bin/python \
|
||||
&& rm -f /usr/local/bin/pip \
|
||||
&& rm -f /usr/local/lib/libpython3.7m.so.1.0 \
|
||||
&& ln -s ${PYTHON_ROOT_PATH}/bin/python3.7 /usr/local/bin/python \
|
||||
&& ln -s ${PYTHON_ROOT_PATH}/bin/pip3.7 /usr/local/bin/pip \
|
||||
&& ln -s ${PYTHON_ROOT_PATH}/lib/libpython3.7m.so.1.0 /usr/local/lib/libpython3.7m.so.1.0 \
|
||||
&& ldconfig \
|
||||
&& rm -rf /tmp/cpython-3.7.5 \
|
||||
&& rm -f /tmp/v3.7.5.tar.gz
|
||||
|
||||
|
|
|
@ -38,12 +38,19 @@ class Allocator;
|
|||
class Delegate;
|
||||
class DeviceInfoContext;
|
||||
|
||||
/// \brief Context is used to store environment variables during execution.
|
||||
class MS_API Context {
|
||||
public:
|
||||
Context();
|
||||
~Context() = default;
|
||||
|
||||
/// \brief Set the number of threads at runtime. This option is only valid for MindSpore Lite.
|
||||
///
|
||||
/// \param[in] thread_num the number of threads at runtime.
|
||||
void SetThreadNum(int32_t thread_num);
|
||||
/// \brief Get the current thread number setting.
|
||||
///
|
||||
/// \return The current thread number setting.
|
||||
int32_t GetThreadNum() const;
|
||||
|
||||
/// \brief Set the thread affinity to CPU cores.
|
||||
|
@ -60,6 +67,10 @@ class MS_API Context {
|
|||
void SetDelegate(const std::shared_ptr<Delegate> &delegate);
|
||||
std::shared_ptr<Delegate> GetDelegate() const;
|
||||
|
||||
/// \brief Get a mutable reference of DeviceInfoContext vector in this context. Only MindSpore Lite supports
|
||||
/// heterogeneous scenarios with multiple members in the vector.
|
||||
///
|
||||
/// \return Mutable reference of DeviceInfoContext vector in this context.
|
||||
std::vector<std::shared_ptr<DeviceInfoContext>> &MutableDeviceInfo();
|
||||
|
||||
private:
|
||||
|
@ -67,14 +78,24 @@ class MS_API Context {
|
|||
std::shared_ptr<Data> data_;
|
||||
};
|
||||
|
||||
/// \brief DeviceInfoContext defines different device contexts.
|
||||
class MS_API DeviceInfoContext : public std::enable_shared_from_this<DeviceInfoContext> {
|
||||
public:
|
||||
struct Data;
|
||||
|
||||
DeviceInfoContext();
|
||||
virtual ~DeviceInfoContext() = default;
|
||||
|
||||
/// \brief Get the type of this DeviceInfoContext.
|
||||
///
|
||||
/// \return Type of this DeviceInfoContext.
|
||||
virtual enum DeviceType GetDeviceType() const = 0;
|
||||
|
||||
/// \brief A similar function to RTTI is provided when the -fno-rtti compilation option is turned on, which converts
|
||||
/// DeviceInfoContext to a shared pointer of type T, and returns nullptr if the conversion fails.
|
||||
///
|
||||
/// \param T Type
|
||||
/// \return A pointer of type T after conversion. If the conversion fails, it will be nullptr.
|
||||
template <class T>
|
||||
std::shared_ptr<T> Cast() {
|
||||
static_assert(std::is_base_of<DeviceInfoContext, T>::value, "Wrong cast type.");
|
||||
|
@ -98,27 +119,60 @@ class MS_API DeviceInfoContext : public std::enable_shared_from_this<DeviceInfoC
|
|||
std::shared_ptr<Data> data_;
|
||||
};
|
||||
|
||||
/// \brief Derived from DeviceInfoContext, The configuration of the model running on the CPU. This option is only valid
|
||||
/// for MindSpore Lite.
|
||||
class MS_API CPUDeviceInfo : public DeviceInfoContext {
|
||||
public:
|
||||
/// \brief Get the type of this DeviceInfoContext.
|
||||
///
|
||||
/// \return Type of this DeviceInfoContext.
|
||||
enum DeviceType GetDeviceType() const override { return DeviceType::kCPU; };
|
||||
|
||||
/// \brief Set enables to perform the float16 inference
|
||||
///
|
||||
/// \param[in] is_fp16 Enable float16 inference or not.
|
||||
void SetEnableFP16(bool is_fp16);
|
||||
/// \brief Get enables to perform the float16 inference
|
||||
///
|
||||
/// \return Whether enable float16 inference.
|
||||
bool GetEnableFP16() const;
|
||||
};
|
||||
|
||||
/// \brief Derived from DeviceInfoContext, The configuration of the model running on the NPU. This option is only valid
|
||||
/// for MindSpore Lite.
|
||||
class MS_API KirinNPUDeviceInfo : public DeviceInfoContext {
|
||||
public:
|
||||
/// \brief Get the type of this DeviceInfoContext.
|
||||
///
|
||||
/// \return Type of this DeviceInfoContext.
|
||||
enum DeviceType GetDeviceType() const override { return DeviceType::kKirinNPU; };
|
||||
|
||||
/// \brief Set the NPU frequency.
|
||||
///
|
||||
/// \param[in] frequency Can be set to 1 (low power consumption), 2 (balanced), 3 (high performance), 4 (extreme
|
||||
/// performance), default as 3.
|
||||
void SetFrequency(int frequency);
|
||||
/// \brief Get the NPU frequency.
|
||||
///
|
||||
/// \return NPU frequency
|
||||
int GetFrequency() const;
|
||||
};
|
||||
|
||||
/// \brief Derived from DeviceInfoContext, The configuration of the model running on the GPU.
|
||||
class MS_API GPUDeviceInfo : public DeviceInfoContext {
|
||||
public:
|
||||
/// \brief Get the type of this DeviceInfoContext.
|
||||
///
|
||||
/// \return Type of this DeviceInfoContext.
|
||||
enum DeviceType GetDeviceType() const override { return DeviceType::kGPU; };
|
||||
|
||||
/// \brief Set device id.
|
||||
///
|
||||
/// \param[in] device_id The device id.
|
||||
void SetDeviceID(uint32_t device_id);
|
||||
/// \brief Get the device id.
|
||||
///
|
||||
/// \return The device id.
|
||||
uint32_t GetDeviceID() const;
|
||||
|
||||
void SetGpuTrtInferMode(bool gpu_trt_infer_mode);
|
||||
|
@ -127,8 +181,15 @@ class MS_API GPUDeviceInfo : public DeviceInfoContext {
|
|||
inline void SetPrecisionMode(const std::string &precison_mode);
|
||||
inline std::string GetPrecisionMode() const;
|
||||
|
||||
/// \brief Set enables to perform the float16 inference
|
||||
///
|
||||
/// \param[in] is_fp16 Enable float16 inference or not.
|
||||
void SetEnableFP16(bool is_fp16);
|
||||
/// \brief Get enables to perform the float16 inference
|
||||
///
|
||||
/// \return Whether enable float16 inference.
|
||||
bool GetEnableFP16() const;
|
||||
|
||||
private:
|
||||
void SetPrecisionMode(const std::vector<char> &precision_mode);
|
||||
std::vector<char> GetPrecisionModeChar() const;
|
||||
|
@ -139,52 +200,113 @@ void GPUDeviceInfo::SetPrecisionMode(const std::string &precision_mode) {
|
|||
}
|
||||
std::string GPUDeviceInfo::GetPrecisionMode() const { return CharToString(GetPrecisionModeChar()); }
|
||||
|
||||
/// \brief Derived from DeviceInfoContext, The configuration of the model running on the Ascend910. This option is
|
||||
/// invalid for MindSpore Lite.
|
||||
class MS_API Ascend910DeviceInfo : public DeviceInfoContext {
|
||||
public:
|
||||
/// \brief Get the type of this DeviceInfoContext.
|
||||
///
|
||||
/// \return Type of this DeviceInfoContext.
|
||||
enum DeviceType GetDeviceType() const override { return DeviceType::kAscend910; };
|
||||
|
||||
/// \brief Set device id.
|
||||
///
|
||||
/// \param[in] device_id The device id.
|
||||
void SetDeviceID(uint32_t device_id);
|
||||
/// \brief Get the device id.
|
||||
///
|
||||
/// \return The device id.
|
||||
uint32_t GetDeviceID() const;
|
||||
};
|
||||
|
||||
/// \brief Derived from DeviceInfoContext, The configuration of the model running on the Ascend310. This option is
|
||||
/// invalid for MindSpore Lite.
|
||||
class MS_API Ascend310DeviceInfo : public DeviceInfoContext {
|
||||
public:
|
||||
/// \brief Get the type of this DeviceInfoContext.
|
||||
///
|
||||
/// \return Type of this DeviceInfoContext.
|
||||
enum DeviceType GetDeviceType() const override { return DeviceType::kAscend310; };
|
||||
|
||||
/// \brief Set device id.
|
||||
///
|
||||
/// \param[in] device_id The device id.
|
||||
void SetDeviceID(uint32_t device_id);
|
||||
/// \brief Get the device id.
|
||||
///
|
||||
/// \return The device id.
|
||||
uint32_t GetDeviceID() const;
|
||||
|
||||
inline void SetDumpConfigPath(const std::string &cfg_path);
|
||||
inline std::string GetDumpConfigPath() const;
|
||||
|
||||
// aipp config file
|
||||
/// \brief Set AIPP configuration file path.
|
||||
///
|
||||
/// \param[in] cfg_path AIPP configuration file path.
|
||||
inline void SetInsertOpConfigPath(const std::string &cfg_path);
|
||||
/// \brief Get AIPP configuration file path.
|
||||
///
|
||||
/// \return AIPP configuration file path.
|
||||
inline std::string GetInsertOpConfigPath() const;
|
||||
|
||||
// nchw or nhwc
|
||||
/// \brief Set format of model inputs.
|
||||
///
|
||||
/// \param[in] format Optional "NCHW", "NHWC", etc.
|
||||
inline void SetInputFormat(const std::string &format);
|
||||
/// \brief Get format of model inputs.
|
||||
///
|
||||
/// \return The format of model inputs.
|
||||
inline std::string GetInputFormat() const;
|
||||
|
||||
// Mandatory while dynamic batch: e.g. "input_op_name1: 1,2,3,4;input_op_name2: 4,3,2,1"
|
||||
/// \brief Set shape of model inputs.
|
||||
///
|
||||
/// \param[in] shape e.g. "input_op_name1: 1,2,3,4;input_op_name2: 4,3,2,1".
|
||||
inline void SetInputShape(const std::string &shape);
|
||||
/// \brief Get shape of model inputs.
|
||||
///
|
||||
/// \return The shape of model inputs.
|
||||
inline std::string GetInputShape() const;
|
||||
|
||||
/// \brief Set shape of model inputs.
|
||||
///
|
||||
/// \param[in] shape e.g. {{1, {1,2,3,4}}, {2, {4,3,2,1}}} means the first input shape 1,2,3,4 and the second input
|
||||
/// shape 4,3,2,1.
|
||||
void SetInputShapeMap(const std::map<int, std::vector<int>> &shape);
|
||||
/// \brief Get shape of model inputs.
|
||||
///
|
||||
/// \return The shape of model inputs.
|
||||
std::map<int, std::vector<int>> GetInputShapeMap() const;
|
||||
|
||||
void SetDynamicBatchSize(const std::vector<size_t> &dynamic_batch_size);
|
||||
inline std::string GetDynamicBatchSize() const;
|
||||
|
||||
// FP32, UINT8 or FP16, default as FP32
|
||||
/// \brief Set type of model outputs.
|
||||
///
|
||||
/// \param[in] output_type FP32, UINT8 or FP16, default as FP32.
|
||||
void SetOutputType(enum DataType output_type);
|
||||
/// \brief Get type of model outputs.
|
||||
///
|
||||
/// \return The set type of model outputs.
|
||||
enum DataType GetOutputType() const;
|
||||
|
||||
// "force_fp16", "allow_fp32_to_fp16", "must_keep_origin_dtype" or "allow_mix_precision", default as "force_fp16"
|
||||
/// \brief Set precision mode of model.
|
||||
///
|
||||
/// \param[in] precision_mode Optional "force_fp16", "allow_fp32_to_fp16", "must_keep_origin_dtype" and
|
||||
/// "allow_mix_precision", "force_fp16" is set as default
|
||||
inline void SetPrecisionMode(const std::string &precision_mode);
|
||||
/// \brief Get precision mode of model.
|
||||
///
|
||||
/// \return The set type of model outputs
|
||||
inline std::string GetPrecisionMode() const;
|
||||
|
||||
// Optional "high_performance" and "high_precision", "high_performance" is set as default
|
||||
/// \brief Set op select implementation mode.
|
||||
///
|
||||
/// \param[in] op_select_impl_mode Optional "high_performance" and "high_precision", "high_performance" is set as
|
||||
/// default.
|
||||
inline void SetOpSelectImplMode(const std::string &op_select_impl_mode);
|
||||
/// \brief Get op select implementation mode.
|
||||
///
|
||||
/// \return The set op select implementation mode.
|
||||
inline std::string GetOpSelectImplMode() const;
|
||||
|
||||
inline void SetFusionSwitchConfigPath(const std::string &cfg_path);
|
||||
|
|
|
@ -37,32 +37,75 @@ class Metrics;
|
|||
namespace dataset {
|
||||
class Dataset;
|
||||
} // namespace dataset
|
||||
|
||||
/// \brief The Model class is used to define a MindSpore model, facilitating computational graph management.
|
||||
class MS_API Model {
|
||||
public:
|
||||
Model();
|
||||
~Model();
|
||||
Model(const Model &) = delete;
|
||||
void operator=(const Model &) = delete;
|
||||
|
||||
/// \brief Builds a model so that it can run on a device.
|
||||
///
|
||||
/// \param[in] graph GraphCell is a derivative of Cell. Cell is not available currently. GraphCell can be constructed
|
||||
/// from Graph, for example, model.Build(GraphCell(graph), context).
|
||||
/// \param[in] model_context A context used to store options during execution.
|
||||
/// \param[in] train_cfg A config used by training.
|
||||
///
|
||||
/// \return Status.
|
||||
Status Build(GraphCell graph, const std::shared_ptr<Context> &model_context = nullptr,
|
||||
const std::shared_ptr<TrainCfg> &train_cfg = nullptr);
|
||||
|
||||
/// \brief Resizes the shapes of inputs.
|
||||
///
|
||||
/// \param[in] inputs A vector that includes all input tensors in order.
|
||||
/// \param[in] dims Defines the new shapes of inputs, should be consistent with inputs.
|
||||
///
|
||||
/// \return Status.
|
||||
Status Resize(const std::vector<MSTensor> &inputs, const std::vector<std::vector<int64_t>> &dims);
|
||||
|
||||
/// \brief Inference model.
|
||||
///
|
||||
/// \param[in] inputs A vector where model inputs are arranged in sequence.
|
||||
/// \param[out] outputs Which is a pointer to a vector. The model outputs are filled in the container in sequence.
|
||||
/// \param[in] before CallBack before predict.
|
||||
/// \param[in] after CallBack after predict.
|
||||
///
|
||||
/// \return Status.
|
||||
Status Predict(const std::vector<MSTensor> &inputs, std::vector<MSTensor> *outputs,
|
||||
const MSKernelCallBack &before = nullptr, const MSKernelCallBack &after = nullptr);
|
||||
|
||||
/// \brief Obtains all input tensors of the model.
|
||||
///
|
||||
/// \return The vector that includes all input tensors.
|
||||
std::vector<MSTensor> GetInputs();
|
||||
/// \brief Obtains the input tensor of the model by name.
|
||||
///
|
||||
/// \return The input tensor with the given name, if the name is not found, an invalid tensor is returned.
|
||||
inline MSTensor GetInputByTensorName(const std::string &tensor_name);
|
||||
|
||||
Status InitMetrics(std::vector<Metrics *> metrics);
|
||||
std::vector<Metrics *> GetMetrics();
|
||||
|
||||
/// \brief Obtains all output tensors of the model.
|
||||
///
|
||||
/// \return The vector that includes all output tensors.
|
||||
std::vector<MSTensor> GetOutputs();
|
||||
/// \brief Obtains names of all output tensors of the model.
|
||||
///
|
||||
/// \return A vector that includes names of all output tensors.
|
||||
inline std::vector<std::string> GetOutputTensorNames();
|
||||
/// \brief Obtains the output tensor of the model by name.
|
||||
///
|
||||
/// \return The output tensor with the given name, if the name is not found, an invalid tensor is returned.
|
||||
inline MSTensor GetOutputByTensorName(const std::string &tensor_name);
|
||||
inline std::vector<MSTensor> GetOutputsByNodeName(const std::string &tensor_name);
|
||||
|
||||
/// \brief Inference model.
|
||||
///
|
||||
/// \param[in] device_type Device type,options are kGPU, kAscend910, etc.
|
||||
/// \param[in] model_type The type of model file, options are ModelType::kMindIR, ModelType::kOM.
|
||||
///
|
||||
/// \return Is supported or not.
|
||||
static bool CheckModelSupport(enum DeviceType device_type, ModelType model_type);
|
||||
|
||||
Status SetTrainMode(bool train);
|
||||
|
|
|
@ -27,13 +27,43 @@
|
|||
#include "include/api/dual_abi_helper.h"
|
||||
|
||||
namespace mindspore {
|
||||
|
||||
/// \brief The Serialization class is used to summarize methods for reading and writing model files.
|
||||
class MS_API Serialization {
|
||||
public:
|
||||
/// \brief Loads a model file from memory buffer.
|
||||
///
|
||||
/// \param[in] model_data A buffer filled by model file.
|
||||
/// \param[in] data_size The size of the buffer.
|
||||
/// \param[in] model_type The Type of model file, options are ModelType::kMindIR, ModelType::kOM.
|
||||
/// \param[out] graph The output parameter, an object saves graph data.
|
||||
/// \param[in] dec_key The decryption key, key length is 16, 24, or 32.
|
||||
/// \param[in] dec_mode The decryption mode, optional options are AES-GCM, AES-CBC.
|
||||
///
|
||||
/// \return Status.
|
||||
inline static Status Load(const void *model_data, size_t data_size, ModelType model_type, Graph *graph,
|
||||
const Key &dec_key = {}, const std::string &dec_mode = kDecModeAesGcm);
|
||||
|
||||
/// \brief Loads a model file from path, is not supported on MindSpore Lite.
|
||||
///
|
||||
/// \param[in] file The path of model file.
|
||||
/// \param[in] model_type The Type of model file, options are ModelType::kMindIR, ModelType::kOM.
|
||||
/// \param[out] graph The output parameter, an object saves graph data.
|
||||
/// \param[in] dec_key The decryption key, key length is 16, 24, or 32.
|
||||
/// \param[in] dec_mode The decryption mode, optional options are AES-GCM, AES-CBC.
|
||||
///
|
||||
/// \return Status.
|
||||
inline static Status Load(const std::string &file, ModelType model_type, Graph *graph, const Key &dec_key = {},
|
||||
const std::string &dec_mode = kDecModeAesGcm);
|
||||
|
||||
/// \brief Load multiple models from multiple files, MindSpore Lite does not provide this feature.
|
||||
///
|
||||
/// \param[in] files The path of model files.
|
||||
/// \param[in] model_type The Type of model file, options are ModelType::kMindIR, ModelType::kOM.
|
||||
/// \param[out] graph The output parameter, an object saves graph data.
|
||||
/// \param[in] dec_key The decryption key, key length is 16, 24, or 32.
|
||||
/// \param[in] dec_mode The decryption mode, optional options are AES-GCM, AES-CBC.
|
||||
///
|
||||
/// \return Status.
|
||||
inline static Status Load(const std::vector<std::string> &files, ModelType model_type, std::vector<Graph> *graphs,
|
||||
const Key &dec_key = {}, const std::string &dec_mode = kDecModeAesGcm);
|
||||
static Status SetParameters(const std::map<std::string, Buffer> ¶meters, Model *model);
|
||||
|
|
|
@ -25,11 +25,17 @@
|
|||
#include "include/api/dual_abi_helper.h"
|
||||
#include "include/api/format.h"
|
||||
|
||||
#ifndef MS_API
|
||||
#ifdef _WIN32
|
||||
#ifdef BUILDING_DLL
|
||||
#define MS_API __declspec(dllexport)
|
||||
#else
|
||||
#define MS_API __declspec(dllimport)
|
||||
#endif
|
||||
#else
|
||||
#define MS_API __attribute__((visibility("default")))
|
||||
#endif
|
||||
#endif
|
||||
|
||||
namespace mindspore {
|
||||
enum ModelType : uint32_t {
|
||||
|
@ -64,18 +70,64 @@ struct QuantParam {
|
|||
};
|
||||
|
||||
class Allocator;
|
||||
/// \brief The MSTensor class defines a tensor in MindSpore.
|
||||
class MS_API MSTensor {
|
||||
public:
|
||||
class Impl;
|
||||
|
||||
/// \brief Creates a MSTensor object, whose data need to be copied before accessed by Model, must be used in pairs
|
||||
/// with DestroyTensorPtr.
|
||||
///
|
||||
/// \param[in] name The name of the MSTensor.
|
||||
/// \param[in] type The data type of the MSTensor.
|
||||
/// \param[in] shape The shape of the MSTensor.
|
||||
/// \param[in] data The data pointer that points to allocated memory.
|
||||
/// \param[in] data_len The length of the memory, in bytes.
|
||||
///
|
||||
/// \return A pointer of MSTensor.
|
||||
static inline MSTensor *CreateTensor(const std::string &name, DataType type, const std::vector<int64_t> &shape,
|
||||
const void *data, size_t data_len) noexcept;
|
||||
/// \brief Creates a MSTensor object, whose data can be directly accessed by Model, must be used in pairs with
|
||||
/// DestroyTensorPtr.
|
||||
///
|
||||
/// \param[in] name The name of the MSTensor.
|
||||
/// \param[in] type The data type of the MSTensor.
|
||||
/// \param[in] shape The shape of the MSTensor.
|
||||
/// \param[in] data The data pointer that points to allocated memory.
|
||||
/// \param[in] data_len The length of the memory, in bytes.
|
||||
///
|
||||
/// \return A pointer of MSTensor.
|
||||
static inline MSTensor *CreateRefTensor(const std::string &name, DataType type, const std::vector<int64_t> &shape,
|
||||
const void *data, size_t data_len) noexcept;
|
||||
/// \brief Creates a MSTensor object, whose device data can be directly accessed by Model, must be used in pairs with
|
||||
/// DestroyTensorPtr.
|
||||
///
|
||||
/// \param[in] name The name of the MSTensor.
|
||||
/// \param[in] type The data type of the MSTensor.
|
||||
/// \param[in] shape The shape of the MSTensor.
|
||||
/// \param[in] data The data pointer that points to device memory.
|
||||
/// \param[in] data_len The length of the memory, in bytes.
|
||||
///
|
||||
/// \return A pointer of MSTensor.
|
||||
static inline MSTensor *CreateDevTensor(const std::string &name, DataType type, const std::vector<int64_t> &shape,
|
||||
const void *data, size_t data_len) noexcept;
|
||||
/// \brief Create a string type MSTensor object whose data can be accessed by Model only after being copied, must be
|
||||
/// used in pair with DestroyTensorPtr.
|
||||
///
|
||||
/// \param[in] name The name of the MSTensor.
|
||||
/// \param[in] str A vector container containing several strings.
|
||||
///
|
||||
/// \return A pointer of MSTensor.
|
||||
static inline MSTensor *StringsToTensor(const std::string &name, const std::vector<std::string> &str);
|
||||
/// \brief Parse the string type MSTensor object into strings.
|
||||
///
|
||||
/// \param[in] tensor A MSTensor object.
|
||||
///
|
||||
/// \return A vector container containing several strings.
|
||||
static inline std::vector<std::string> TensorToStrings(const MSTensor &tensor);
|
||||
/// \brief Destroy an object created by Clone, StringsToTensor, CreateRefTensor, CreateDevTensor or CreateTensor. Do
|
||||
/// not use it to destroy MSTensor from other sources.
|
||||
///
|
||||
/// \param[in] tensor A MSTensor object.
|
||||
static void DestroyTensorPtr(MSTensor *tensor) noexcept;
|
||||
|
||||
MSTensor();
|
||||
|
@ -85,19 +137,51 @@ class MS_API MSTensor {
|
|||
explicit MSTensor(std::nullptr_t);
|
||||
~MSTensor();
|
||||
|
||||
/// \brief Obtains the name of the MSTensor.
|
||||
///
|
||||
/// \return The name of the MSTensor.
|
||||
inline std::string Name() const;
|
||||
/// \brief Obtains the data type of the MSTensor.
|
||||
///
|
||||
/// \return The data type of the MSTensor.
|
||||
enum DataType DataType() const;
|
||||
/// \brief Obtains the shape of the MSTensor.
|
||||
///
|
||||
/// \return The shape of the MSTensor.
|
||||
const std::vector<int64_t> &Shape() const;
|
||||
/// \brief Obtains the number of elements of the MSTensor.
|
||||
///
|
||||
/// \return The number of elements of the MSTensor.
|
||||
int64_t ElementNum() const;
|
||||
|
||||
/// \brief Obtains a shared pointer to the copy of data of the MSTensor. The data can be read on host.
|
||||
///
|
||||
/// \return A shared pointer to the copy of data of the MSTensor.
|
||||
std::shared_ptr<const void> Data() const;
|
||||
/// \brief Obtains the pointer to the data of the MSTensor. If the MSTensor is a device tensor, the data cannot be
|
||||
/// accessed directly on host.
|
||||
///
|
||||
/// \return A pointer to the data of the MSTensor.
|
||||
void *MutableData();
|
||||
/// \brief Obtains the length of the data of the MSTensor, in bytes.
|
||||
///
|
||||
/// \return The length of the data of the MSTensor, in bytes.
|
||||
size_t DataSize() const;
|
||||
|
||||
/// \brief Gets the boolean value that indicates whether the memory of MSTensor is on device.
|
||||
///
|
||||
/// \return The boolean value that indicates whether the memory of MSTensor is on device.
|
||||
bool IsDevice() const;
|
||||
|
||||
/// \brief Gets a deep copy of the MSTensor, must be used in pair with DestroyTensorPtr.
|
||||
///
|
||||
/// \return A pointer points to a deep copy of the MSTensor.
|
||||
MSTensor *Clone() const;
|
||||
/// \brief Gets the boolean value that indicates whether the MSTensor is valid.
|
||||
///
|
||||
/// \return The boolean value that indicates whether the MSTensor is valid.
|
||||
bool operator==(std::nullptr_t) const;
|
||||
/// \brief Gets the boolean value that indicates whether the MSTensor is valid.
|
||||
///
|
||||
/// \return The boolean value that indicates whether the MSTensor is valid.
|
||||
bool operator!=(std::nullptr_t) const;
|
||||
bool operator==(const MSTensor &tensor) const;
|
||||
|
||||
|
|
|
@ -23,6 +23,7 @@ from itertools import repeat, zip_longest
|
|||
from collections import deque
|
||||
from collections.abc import Iterable
|
||||
import numpy as np
|
||||
from mindspore import context
|
||||
from mindspore import log as logger
|
||||
from mindspore.common import dtype as mstype
|
||||
from mindspore._c_expression import Tensor as Tensor_
|
||||
|
@ -846,6 +847,10 @@ class Validator:
|
|||
"""Returns an empty Tensor."""
|
||||
return Tensor_(dtype, shape)
|
||||
|
||||
@staticmethod
|
||||
def check_type_support(dtype, device, supported_dtypes):
|
||||
return dtype in supported_dtypes or not context.get_context('device_target') == device
|
||||
|
||||
|
||||
def check_input_format(input_param):
|
||||
"""Judge input format."""
|
||||
|
|
|
@ -21,7 +21,7 @@ from . import model
|
|||
|
||||
|
||||
def estimate_ops(json_str: str):
|
||||
"""Call costmodel to estimate ops."""
|
||||
"""Call cost model to estimate ops."""
|
||||
try:
|
||||
json_obj = json.loads(json_str)
|
||||
graph_descs = json_obj["graph_desc"]
|
||||
|
@ -38,7 +38,7 @@ def estimate_ops(json_str: str):
|
|||
|
||||
|
||||
def estimate_calulation_amount(json_str: str):
|
||||
"""Call costmodel to estimate calculation amount of op."""
|
||||
"""Call cost model to estimate calculation amount of op."""
|
||||
try:
|
||||
graph_desc = json.loads(json_str)
|
||||
comp = model.load_composite(graph_desc)
|
||||
|
|
|
@ -24,7 +24,7 @@ from . import utils
|
|||
|
||||
|
||||
def split_with_json(json_str, flags_str):
|
||||
"""Call costmodel to split GraphKernel"""
|
||||
"""Call cost model to split GraphKernel"""
|
||||
try:
|
||||
graph_desc = json.loads(json_str)
|
||||
flags = json.loads(flags_str)
|
||||
|
|
|
@ -50,11 +50,6 @@ def _compile_akg_task_gpu(json_strs, attrs):
|
|||
if not res:
|
||||
raise ValueError("Compile error, args: {}! build attrs: {}".format(json_str, attrs))
|
||||
|
||||
pid_path = os.path.realpath("./cuda_meta_" + str(os.getpid()))
|
||||
if os.path.exists(pid_path):
|
||||
copy_json(pid_path, os.path.realpath("./cuda_meta_" + str(os.getppid())))
|
||||
shutil.rmtree(pid_path)
|
||||
|
||||
|
||||
def _compile_akg_task_ascend(json_strs, attrs):
|
||||
"""
|
||||
|
|
|
@ -159,12 +159,17 @@ def resolve_symbol(namespace, symbol):
|
|||
if getattr(resolve_, "__hash__") is None:
|
||||
return resolve_
|
||||
|
||||
# Raise NotImplementedError when parsing the numpy methods, but not the numpy constant.
|
||||
if namespace.name == "numpy" and isinstance(resolve_, (types.FunctionType, types.MethodType, types.ModuleType)):
|
||||
raise NotImplementedError(
|
||||
f"MindSpore does not support to use the numpy methods in the function construct with the graph mode.")
|
||||
|
||||
# If need trope the obj
|
||||
if resolve_ in convert_object_map:
|
||||
resolve_ = convert_object_map.get(resolve_)
|
||||
logger.debug("convert resolve = %r", resolve_)
|
||||
if resolve_ == NO_IMPLEMENT:
|
||||
raise NotImplementedError(f"Not support for `{symbol}`")
|
||||
raise NotImplementedError(f"Not support for `{symbol}`.")
|
||||
except Exception as e:
|
||||
if isinstance(e, NotImplementedError):
|
||||
raise e
|
||||
|
|
|
@ -1312,7 +1312,8 @@ def sum(x, axis=None, dtype=None, keepdims=False, initial=None): # pylint: disab
|
|||
>>> print(input_x.sum(axis=1))
|
||||
[10. 35.]
|
||||
"""
|
||||
dtype = x.dtype if dtype is None else dtype
|
||||
input_x = x.astype(mstype.int32) if x.dtype == mstype.bool_ else x
|
||||
dtype = input_x.dtype if dtype is None else dtype
|
||||
if not isinstance(keepdims, int):
|
||||
const_utils.raise_type_error("integer argument expected")
|
||||
if initial is not None and not isinstance(initial, (int, float, bool)):
|
||||
|
@ -1322,14 +1323,14 @@ def sum(x, axis=None, dtype=None, keepdims=False, initial=None): # pylint: disab
|
|||
else:
|
||||
axis = check_and_canonicalize_axes(axis, x.ndim)
|
||||
|
||||
if x.dtype == mstype.bool_:
|
||||
x = x.astype("int32")
|
||||
if not check_type_support(input_x.dtype, 'GPU', (mstype.float64, mstype.float32, mstype.float16)):
|
||||
input_x = input_x.astype(mstype.float32)
|
||||
if 0 in x.shape:
|
||||
x = const_utils.make_tensor([0], x.dtype)
|
||||
if keepdims:
|
||||
res = _reduce_sum_keepdims(x, axis)
|
||||
res = _reduce_sum_keepdims(input_x, axis)
|
||||
else:
|
||||
res = _reduce_sum_default(x, axis)
|
||||
res = _reduce_sum_default(input_x, axis)
|
||||
if initial is not None:
|
||||
res += initial
|
||||
return res.astype(dtype)
|
||||
|
@ -1648,6 +1649,7 @@ get_log2_size = constexpr(validator.get_log2_size)
|
|||
check_axis_type = constexpr(validator.check_axis_type)
|
||||
check_and_canonicalize_axes = constexpr(validator.check_and_canonicalize_axes)
|
||||
empty_compile = constexpr(validator.empty_compile)
|
||||
check_type_support = constexpr(validator.check_type_support)
|
||||
|
||||
|
||||
def tensor_bool(x):
|
||||
|
|
|
@ -325,7 +325,7 @@ endif()
|
|||
set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE)
|
||||
set_property(SOURCE "pipeline/jit/init.cc" PROPERTY
|
||||
COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_PIPELINE)
|
||||
pybind11_add_module(_c_expression "pipeline/jit/init.cc")
|
||||
pybind11_add_module(_c_expression NO_EXTRAS "pipeline/jit/init.cc")
|
||||
|
||||
MESSAGE(STATUS "operation system is ${CMAKE_SYSTEM}")
|
||||
if(CMAKE_SYSTEM_NAME MATCHES "Linux")
|
||||
|
|
|
@ -35,6 +35,7 @@ if(ENABLE_CPU)
|
|||
"cpu/fl/*.cc"
|
||||
"cpu/ps/*.cc"
|
||||
"cpu/quantum/*.cc"
|
||||
"cpu/pyfunc/*.cc"
|
||||
)
|
||||
|
||||
if(NOT ENABLE_MPI)
|
||||
|
|
|
@ -16,6 +16,11 @@
|
|||
|
||||
#include "backend/kernel_compiler/akg/akg_kernel_build.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <errno.h>
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
|
@ -23,6 +28,7 @@
|
|||
#include <unordered_set>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
#include "nlohmann/json.hpp"
|
||||
#include "ir/dtype.h"
|
||||
#include "ir/func_graph.h"
|
||||
|
@ -34,9 +40,320 @@
|
|||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
|
||||
#define INIT_SET_FROM_2D_ARRAY(set_var, list_idx) \
|
||||
std::set<size_t> set_var(kernel_lists_[list_idx], kernel_lists_[list_idx] + kernel_lists_[list_idx][kMaxKernelNum_]);
|
||||
|
||||
#define LIST_BEGIN(list_idx) kernel_lists_[list_idx]
|
||||
#define LIST_END(list_idx) (kernel_lists_[list_idx] + kernel_lists_[list_idx][kMaxKernelNum_])
|
||||
#define RESET_LIST_SIZE(list_idx, val) kernel_lists_[list_idx][kMaxKernelNum_] = val
|
||||
|
||||
#define INCREASE_LIST_SIZE(list_idx, val) kernel_lists_[list_idx][kMaxKernelNum_] += val
|
||||
|
||||
constexpr int32_t PROCESS_NUM = 16;
|
||||
constexpr int32_t TIME_OUT = 300;
|
||||
|
||||
bool AkgKernelPool::LockMng::TryLock() {
|
||||
// Try to lock 100 times. Return errno if lock unsuccessfully
|
||||
uint32_t trial = 100;
|
||||
|
||||
int32_t ret = -1;
|
||||
while (trial > 0) {
|
||||
ret = lockf(fd_, F_TLOCK, 0);
|
||||
if (ret == 0 || (errno != EACCES && errno != EAGAIN)) {
|
||||
break;
|
||||
}
|
||||
|
||||
trial--;
|
||||
usleep(5000);
|
||||
}
|
||||
|
||||
if (ret == -1) {
|
||||
MS_LOG(ERROR) << "Failed to acquire the lock, errno:" << strerror(errno) << ".";
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void AkgKernelPool::LockMng::Unlock() {
|
||||
auto ret = lockf(fd_, F_ULOCK, 0);
|
||||
if (ret == -1) {
|
||||
MS_LOG(ERROR) << "Failed to release the lock, errno:" << strerror(errno);
|
||||
}
|
||||
}
|
||||
|
||||
std::string AkgKernelPool::GetCurrentPath() {
|
||||
char cwd[PATH_MAX];
|
||||
char *ret = getcwd(cwd, sizeof(cwd));
|
||||
if (ret == nullptr) {
|
||||
MS_LOG(ERROR) << "Get current work directory failed, errno:" << strerror(errno);
|
||||
return "";
|
||||
}
|
||||
|
||||
char abspath[PATH_MAX];
|
||||
char *res = realpath(cwd, abspath);
|
||||
if (res == nullptr) {
|
||||
MS_LOG(ERROR) << "Change to realpath failed, errno:" << strerror(errno);
|
||||
return "";
|
||||
}
|
||||
|
||||
return std::string(abspath);
|
||||
}
|
||||
|
||||
void *AkgKernelPool::CreateSharedMem(const std::string &path) {
|
||||
is_creator_ = false;
|
||||
|
||||
auto hash_id = std::hash<std::string>()(path);
|
||||
auto key_id = static_cast<key_t>(hash_id);
|
||||
auto mem_size = sizeof(size_t) * kListNum_ * (kMaxKernelNum_ + 1) + 512;
|
||||
|
||||
{
|
||||
LockMng lock(fd_);
|
||||
if (!lock.locked_) {
|
||||
MS_LOG(ERROR) << "Failed to acquire lock.";
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// check if the shared memory exists or not.
|
||||
// remove shared memory if exists and the nattach is 0
|
||||
struct shmid_ds buf;
|
||||
auto id = shmget(key_id, mem_size, 0);
|
||||
if (id != -1) {
|
||||
auto ret = shmctl(id, IPC_STAT, &buf);
|
||||
if (ret == -1) {
|
||||
MS_LOG(ERROR) << "Failed to get the info of shared memory, errno:" << strerror(errno);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
if (buf.shm_nattch == 0) {
|
||||
ret = shmctl(id, IPC_RMID, nullptr);
|
||||
if (ret < 0) {
|
||||
MS_LOG(EXCEPTION) << "Realse shared_mem failed, errno:" << strerror(errno);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
LockMng lock(fd_);
|
||||
if (!lock.locked_) {
|
||||
MS_LOG(ERROR) << "Failed to acquire lock.";
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
shm_id_ = shmget(key_id, mem_size, IPC_CREAT | IPC_EXCL | 0600);
|
||||
if (shm_id_ == -1) {
|
||||
if (errno == EEXIST) {
|
||||
shm_id_ = shmget(key_id, mem_size, 0);
|
||||
}
|
||||
|
||||
if (shm_id_ == -1) {
|
||||
MS_LOG(ERROR) << "Create shared_mem failed, error no:" << strerror(errno);
|
||||
return nullptr;
|
||||
}
|
||||
} else {
|
||||
is_creator_ = true;
|
||||
}
|
||||
|
||||
auto local_addr = shmat(shm_id_, nullptr, 0);
|
||||
if (local_addr == reinterpret_cast<void *>(-1)) {
|
||||
MS_LOG(ERROR) << "Attach to shared_mem failed, error no:" << strerror(errno);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
if (is_creator_) {
|
||||
(void)memset(local_addr, 0, mem_size);
|
||||
}
|
||||
|
||||
return local_addr;
|
||||
}
|
||||
|
||||
int32_t AkgKernelPool::Init(const std::vector<JsonNodePair> &build_args) {
|
||||
auto cp = GetCurrentPath();
|
||||
if (cp.empty()) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
fd_ = open(kKeyName_, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
|
||||
if (fd_ == -1) {
|
||||
MS_LOG(ERROR) << "open file <" << kKeyName_ << "> failed, errno:" << strerror(errno);
|
||||
return -1;
|
||||
}
|
||||
|
||||
auto addr = CreateSharedMem(cp);
|
||||
if (addr == nullptr) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
InitKernelLists(addr);
|
||||
|
||||
auto ret = AddKernels(build_args);
|
||||
if (ret != 0) {
|
||||
MS_LOG(ERROR) << "AkgKernelPool AddKernels failed.";
|
||||
return false;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
AkgKernelPool::~AkgKernelPool() {
|
||||
// Detach shared memory
|
||||
auto ret = shmdt(reinterpret_cast<void *>(kernel_lists_[0]));
|
||||
if (ret < 0) {
|
||||
MS_LOG(EXCEPTION) << "Shared_mem detach failed, errno:" << strerror(errno);
|
||||
}
|
||||
|
||||
// Realse shared_memroy
|
||||
if (is_creator_) {
|
||||
ret = shmctl(shm_id_, IPC_RMID, nullptr);
|
||||
if (ret < 0) {
|
||||
MS_LOG(EXCEPTION) << "Realse shared_mem failed, errno:" << strerror(errno);
|
||||
}
|
||||
}
|
||||
|
||||
// Close key file
|
||||
if (fd_ != -1) {
|
||||
(void)close(fd_);
|
||||
}
|
||||
}
|
||||
|
||||
int32_t AkgKernelPool::AddKernels(const std::vector<JsonNodePair> &build_args) {
|
||||
LockMng lock(fd_);
|
||||
if (!lock.locked_) {
|
||||
MS_LOG(ERROR) << "Failed to acquire lock.";
|
||||
return -1;
|
||||
}
|
||||
|
||||
INIT_SET_FROM_2D_ARRAY(todo_list, kToDoIdx_);
|
||||
INIT_SET_FROM_2D_ARRAY(doing_list, kDoingIdx_);
|
||||
INIT_SET_FROM_2D_ARRAY(done_list, kDoneIdx_);
|
||||
|
||||
for (const auto &[json_generator, anf_node] : build_args) {
|
||||
MS_EXCEPTION_IF_NULL(anf_node);
|
||||
auto kernel_name = json_generator.kernel_name();
|
||||
|
||||
auto hash_id = std::hash<std::string>()(kernel_name);
|
||||
if (self_kernel_ids_.count(hash_id) != 0) {
|
||||
MS_LOG(ERROR) << "Duplicated hash_id in list.";
|
||||
return -1;
|
||||
}
|
||||
|
||||
self_kernel_ids_.emplace(hash_id);
|
||||
}
|
||||
|
||||
std::set<size_t> diff_from_todo;
|
||||
std::set<size_t> diff_from_doing;
|
||||
std::set<size_t> diff_from_done;
|
||||
|
||||
// add the unique kernel only once, so need to check if it exists in todo_list, doing_list, or done_list
|
||||
std::set_difference(self_kernel_ids_.begin(), self_kernel_ids_.end(), todo_list.begin(), todo_list.end(),
|
||||
std::inserter(diff_from_todo, diff_from_todo.begin()));
|
||||
std::set_difference(diff_from_todo.begin(), diff_from_todo.end(), doing_list.begin(), doing_list.end(),
|
||||
std::inserter(diff_from_doing, diff_from_doing.begin()));
|
||||
std::set_difference(diff_from_doing.begin(), diff_from_doing.end(), done_list.begin(), done_list.end(),
|
||||
std::inserter(diff_from_done, diff_from_done.begin()));
|
||||
|
||||
auto new_kernel_size = diff_from_done.size();
|
||||
if (new_kernel_size + todo_list.size() > static_cast<size_t>(kMaxKernelNum_)) {
|
||||
MS_LOG(ERROR) << "The size of kernels is " << new_kernel_size << ", while the left space of the pool is "
|
||||
<< kMaxKernelNum_ - todo_list.size();
|
||||
return -1;
|
||||
}
|
||||
|
||||
std::copy(diff_from_done.begin(), diff_from_done.end(), LIST_END(kToDoIdx_));
|
||||
INCREASE_LIST_SIZE(kToDoIdx_, new_kernel_size);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int32_t AkgKernelPool::FetchKernels(std::set<size_t> *out) {
|
||||
LockMng lock(fd_);
|
||||
if (!lock.locked_) {
|
||||
MS_LOG(ERROR) << "Failed to acquire lock.";
|
||||
return -1;
|
||||
}
|
||||
|
||||
std::set<size_t> left_in_todo_list;
|
||||
|
||||
// filter out kernels which belongs to other processes
|
||||
auto FilterBySelfList = [&left_in_todo_list, &out, this](size_t id) {
|
||||
if (this->self_kernel_ids_.count(id) != 0) {
|
||||
out->emplace(id);
|
||||
} else {
|
||||
left_in_todo_list.emplace(id);
|
||||
}
|
||||
};
|
||||
|
||||
std::for_each(LIST_BEGIN(kToDoIdx_), LIST_END(kToDoIdx_), FilterBySelfList);
|
||||
|
||||
std::copy(out->begin(), out->end(), LIST_END(kDoingIdx_));
|
||||
INCREASE_LIST_SIZE(kDoingIdx_, out->size());
|
||||
|
||||
std::copy(left_in_todo_list.begin(), left_in_todo_list.end(), LIST_BEGIN(kToDoIdx_));
|
||||
RESET_LIST_SIZE(kToDoIdx_, left_in_todo_list.size());
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int32_t AkgKernelPool::UpdateAndWait(const std::set<size_t> &ids) {
|
||||
if (!ids.empty()) {
|
||||
LockMng lock(fd_);
|
||||
if (!lock.locked_) {
|
||||
MS_LOG(ERROR) << "Failed to acquire lock.";
|
||||
return -1;
|
||||
}
|
||||
|
||||
// update the state of finished kernels to `done`
|
||||
std::copy(ids.begin(), ids.end(), LIST_END(kDoneIdx_));
|
||||
INCREASE_LIST_SIZE(kDoneIdx_, ids.size());
|
||||
|
||||
// delete the finished kernels from doing_list
|
||||
std::vector<size_t> left_in_doing_list;
|
||||
INIT_SET_FROM_2D_ARRAY(doing_list, kDoingIdx_);
|
||||
std::set_difference(doing_list.begin(), doing_list.end(), ids.begin(), ids.end(),
|
||||
std::inserter(left_in_doing_list, left_in_doing_list.begin()));
|
||||
|
||||
std::copy(left_in_doing_list.begin(), left_in_doing_list.end(), LIST_BEGIN(kDoingIdx_));
|
||||
RESET_LIST_SIZE(kDoingIdx_, left_in_doing_list.size());
|
||||
}
|
||||
|
||||
auto ret = Wait();
|
||||
if (ret != 0) {
|
||||
MS_LOG(ERROR) << "AkgKernelPool Wait failed.";
|
||||
return -1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int32_t AkgKernelPool::Wait() {
|
||||
// wait until all the kernels which belong to this process finish compiling
|
||||
uint32_t trials = 1000;
|
||||
|
||||
while (trials > 0) {
|
||||
{
|
||||
LockMng lock(fd_);
|
||||
if (!lock.locked_) {
|
||||
MS_LOG(ERROR) << "Failed to acquire lock.";
|
||||
return -1;
|
||||
}
|
||||
|
||||
INIT_SET_FROM_2D_ARRAY(done_list, kDoneIdx_);
|
||||
|
||||
if (std::all_of(self_kernel_ids_.begin(), self_kernel_ids_.end(),
|
||||
[&done_list](size_t id) { return done_list.count(id) != 0; })) {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
usleep(1000000);
|
||||
trials--;
|
||||
}
|
||||
|
||||
MS_LOG(ERROR) << "Time out while wait kernel compiling";
|
||||
return -1;
|
||||
}
|
||||
|
||||
std::vector<std::string> AkgKernelBuilder::GetNotCachedKernelJsons(const std::vector<JsonNodePair> &build_args) {
|
||||
// Remove cached nodes, gether unique nodes, and collect repeated nodes which need postprecess.
|
||||
std::vector<std::string> jsons;
|
||||
|
@ -66,6 +383,31 @@ std::vector<std::string> AkgKernelBuilder::GetNotCachedKernelJsons(const std::ve
|
|||
return jsons;
|
||||
}
|
||||
|
||||
std::vector<JsonNodePair> AkgKernelBuilder::GetNotCachedKernels(const std::vector<JsonNodePair> &build_args) {
|
||||
std::unordered_set<std::string> kernel_name_set;
|
||||
std::vector<JsonNodePair> new_build_args;
|
||||
for (const auto &[json_generator, anf_node] : build_args) {
|
||||
MS_EXCEPTION_IF_NULL(anf_node);
|
||||
auto kernel_name = json_generator.kernel_name();
|
||||
|
||||
auto cached_kernel_pack = AkgSearchCache(kernel_name);
|
||||
if (cached_kernel_pack != nullptr) {
|
||||
MS_LOG(DEBUG) << "Use cached kernel, kernel_name[" << kernel_name << "], fullname_with_scope["
|
||||
<< anf_node->fullname_with_scope() << "].";
|
||||
AkgSetKernelMod(cached_kernel_pack, json_generator, anf_node);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (kernel_name_set.count(kernel_name) != 0) {
|
||||
repeat_nodes_.push_back({json_generator, anf_node});
|
||||
continue;
|
||||
}
|
||||
kernel_name_set.insert(kernel_name);
|
||||
new_build_args.push_back({json_generator, anf_node});
|
||||
}
|
||||
return new_build_args;
|
||||
}
|
||||
|
||||
bool AkgKernelBuilder::InsertToCache(const std::vector<JsonNodePair> &build_args) {
|
||||
for (const auto &[json_generator, anf_node] : build_args) {
|
||||
auto kernel_name = json_generator.kernel_name();
|
||||
|
@ -97,32 +439,77 @@ bool AkgKernelBuilder::HandleRepeatNodes() {
|
|||
return true;
|
||||
}
|
||||
|
||||
std::vector<std::string> AkgKernelBuilder::GetKernelJsonsByHashId(const std::vector<JsonNodePair> &build_args,
|
||||
std::set<size_t> fetched_ids) {
|
||||
std::vector<std::string> jsons;
|
||||
for (const auto &[json_generator, anf_node] : build_args) {
|
||||
MS_EXCEPTION_IF_NULL(anf_node);
|
||||
auto kernel_name = json_generator.kernel_name();
|
||||
|
||||
auto hash_id = std::hash<std::string>()(kernel_name);
|
||||
|
||||
if (fetched_ids.count(hash_id) == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
auto kernel_json = json_generator.kernel_json_str();
|
||||
AkgSaveJsonInfo(kernel_name, kernel_json);
|
||||
jsons.push_back(kernel_json);
|
||||
}
|
||||
return jsons;
|
||||
}
|
||||
|
||||
bool AkgKernelBuilder::AkgOpParallelBuild(const std::vector<JsonNodePair> &build_args) {
|
||||
repeat_nodes_.clear();
|
||||
auto jsons = GetNotCachedKernelJsons(build_args);
|
||||
if (jsons.empty()) {
|
||||
auto new_build_args = GetNotCachedKernels(build_args);
|
||||
if (new_build_args.empty()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
auto client = GetClient();
|
||||
MS_EXCEPTION_IF_NULL(client);
|
||||
if (!client->AkgStart(PROCESS_NUM, TIME_OUT)) {
|
||||
MS_LOG(ERROR) << "Akg start failed.";
|
||||
AkgKernelPool kp;
|
||||
auto ret = kp.Init(new_build_args);
|
||||
if (ret != 0) {
|
||||
MS_LOG(ERROR) << "AkgKernelPool init failed.";
|
||||
return false;
|
||||
}
|
||||
auto attrs = CollectBuildAttrs();
|
||||
if (!attrs.empty() && !client->AkgSendAttr(attrs)) {
|
||||
MS_LOG(ERROR) << "Akg send attr failed.";
|
||||
|
||||
std::set<size_t> fetched_ids;
|
||||
ret = kp.FetchKernels(&fetched_ids);
|
||||
if (ret != 0) {
|
||||
MS_LOG(ERROR) << "AkgKernelPool FetchKernels failed.";
|
||||
return false;
|
||||
}
|
||||
if (!client->AkgSendData(jsons)) {
|
||||
MS_LOG(ERROR) << "Akg send data failed.";
|
||||
return false;
|
||||
}
|
||||
if (!client->AkgWait()) {
|
||||
MS_LOG(ERROR) << "Akg compile failed.";
|
||||
|
||||
if (!fetched_ids.empty()) {
|
||||
auto jsons = GetKernelJsonsByHashId(new_build_args, fetched_ids);
|
||||
|
||||
auto client = GetClient();
|
||||
MS_EXCEPTION_IF_NULL(client);
|
||||
if (!client->AkgStart(PROCESS_NUM, TIME_OUT)) {
|
||||
MS_LOG(ERROR) << "Akg start failed.";
|
||||
return false;
|
||||
}
|
||||
auto attrs = CollectBuildAttrs();
|
||||
if (!attrs.empty() && !client->AkgSendAttr(attrs)) {
|
||||
MS_LOG(ERROR) << "Akg send attr failed.";
|
||||
return false;
|
||||
}
|
||||
if (!client->AkgSendData(jsons)) {
|
||||
MS_LOG(ERROR) << "Akg send data failed.";
|
||||
return false;
|
||||
}
|
||||
if (!client->AkgWait()) {
|
||||
MS_LOG(ERROR) << "Akg compile failed.";
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
ret = kp.UpdateAndWait(fetched_ids);
|
||||
if (ret != 0) {
|
||||
MS_LOG(ERROR) << "AkgKernelPool UpdateAndWait failed.";
|
||||
return false;
|
||||
}
|
||||
|
||||
// All unique done here, cache them and set kernel.
|
||||
if (!InsertToCache(build_args)) {
|
||||
MS_LOG(ERROR) << "Insert cache failed.";
|
||||
|
|
|
@ -17,10 +17,13 @@
|
|||
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_AKG_KERNEL_BUILD_H_
|
||||
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_AKG_KERNEL_BUILD_H_
|
||||
|
||||
#include <sys/shm.h>
|
||||
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include "ir/anf.h"
|
||||
#include "backend/kernel_compiler/kernel.h"
|
||||
#include "backend/session/kernel_build_client.h"
|
||||
|
@ -45,12 +48,84 @@ class AkgKernelBuilder {
|
|||
|
||||
private:
|
||||
std::vector<std::string> GetNotCachedKernelJsons(const std::vector<JsonNodePair> &build_args);
|
||||
std::vector<JsonNodePair> GetNotCachedKernels(const std::vector<JsonNodePair> &build_args);
|
||||
std::vector<std::string> GetKernelJsonsByHashId(const std::vector<JsonNodePair> &build_args,
|
||||
std::set<size_t> fetched_ids);
|
||||
bool InsertToCache(const std::vector<JsonNodePair> &build_args);
|
||||
bool HandleRepeatNodes();
|
||||
bool AkgOpParallelBuild(const std::vector<JsonNodePair> &build_args);
|
||||
std::vector<JsonNodePair> repeat_nodes_;
|
||||
std::string CollectBuildAttrs();
|
||||
};
|
||||
|
||||
class AkgKernelPool {
|
||||
public:
|
||||
class LockMng {
|
||||
public:
|
||||
explicit LockMng(int32_t fd) {
|
||||
fd_ = fd;
|
||||
locked_ = TryLock();
|
||||
}
|
||||
|
||||
virtual ~LockMng() {
|
||||
if (locked_) {
|
||||
Unlock();
|
||||
}
|
||||
}
|
||||
|
||||
bool locked_{false};
|
||||
|
||||
private:
|
||||
bool TryLock();
|
||||
void Unlock();
|
||||
|
||||
int32_t fd_{-1};
|
||||
};
|
||||
|
||||
public:
|
||||
AkgKernelPool() = default;
|
||||
virtual ~AkgKernelPool();
|
||||
|
||||
int32_t Init(const std::vector<JsonNodePair> &build_args);
|
||||
int32_t FetchKernels(std::set<size_t> *out);
|
||||
int32_t UpdateAndWait(const std::set<size_t> &ids);
|
||||
|
||||
constexpr inline static size_t kMaxKernelNum_{1000};
|
||||
constexpr inline static key_t kSharedMemKey_{0x57565845};
|
||||
|
||||
// allocate memory for todo_list, doing_list, done_list
|
||||
constexpr inline static size_t kListNum_{3};
|
||||
|
||||
constexpr inline static auto kKeyName_ = "./akg_build_tmp.key";
|
||||
|
||||
constexpr inline static int32_t kToDoIdx_ = 0;
|
||||
constexpr inline static int32_t kDoingIdx_ = 1;
|
||||
constexpr inline static int32_t kDoneIdx_ = 2;
|
||||
|
||||
private:
|
||||
void *CreateSharedMem(const std::string &path);
|
||||
std::string GetCurrentPath();
|
||||
|
||||
inline void InitKernelLists(void *addr) {
|
||||
kernel_lists_[kToDoIdx_] = reinterpret_cast<size_t *>(addr);
|
||||
kernel_lists_[kDoingIdx_] = kernel_lists_[kToDoIdx_] + kMaxKernelNum_ + 1;
|
||||
kernel_lists_[kDoneIdx_] = kernel_lists_[kDoingIdx_] + kMaxKernelNum_ + 1;
|
||||
}
|
||||
|
||||
int32_t AddKernels(const std::vector<JsonNodePair> &kernel_jsons);
|
||||
int32_t Wait();
|
||||
|
||||
int32_t shm_id_{-1};
|
||||
bool is_creator_{false};
|
||||
int32_t fd_{-1};
|
||||
|
||||
// includes 3 lists: todo_list, doing_list, done_list.
|
||||
// each list has kMaxKernelNum_ + 1 elements and, the count of elements in each list
|
||||
// is stored in kernel_lists_[xx][kMaxKernelNum_]
|
||||
size_t *kernel_lists_[kListNum_]{nullptr, nullptr, nullptr};
|
||||
|
||||
std::set<size_t> self_kernel_ids_;
|
||||
};
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
||||
|
|
|
@ -44,8 +44,10 @@ KernelPackPtr AkgAscendKernelBuilder::AkgInsertCache(const std::string &kernel_n
|
|||
void AkgAscendKernelBuilder::AkgSetKernelMod(const KernelPackPtr &kernel_pack,
|
||||
const AkgKernelJsonGenerator &json_generator, const AnfNodePtr &anf_node) {
|
||||
auto kernel_mod_ptr = std::make_shared<AkgKernelMod>(kernel_pack);
|
||||
auto kernel_json_info = kernel_pack->kernel_json_info();
|
||||
kernel_mod_ptr->SetInputSizeList(json_generator.input_size_list());
|
||||
kernel_mod_ptr->SetOutputSizeList(json_generator.output_size_list());
|
||||
kernel_mod_ptr->SetWorkspaceSizeList(kernel_json_info.workspaces);
|
||||
AnfAlgo::SetKernelMod(kernel_mod_ptr, anf_node.get());
|
||||
}
|
||||
|
||||
|
|
|
@ -49,7 +49,7 @@ const std::vector<size_t> &AkgKernelMod::GetOutputSizeList() const { return outp
|
|||
|
||||
const std::vector<size_t> &AkgKernelMod::GetWorkspaceSizeList() const { return workspace_size_list_; }
|
||||
|
||||
bool AkgKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
|
||||
bool AkgKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs, void *stream_ptr) {
|
||||
if (stream_ptr == nullptr) {
|
||||
MS_LOG(ERROR) << "stream_ptr should not be nullptr.";
|
||||
|
@ -74,6 +74,10 @@ bool AkgKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::vect
|
|||
[](const AddressPtr &input) -> void * { return input->addr; });
|
||||
(void)std::transform(std::begin(outputs), std::end(outputs), std::back_inserter(runtime_args),
|
||||
[](const AddressPtr &output) -> void * { return output->addr; });
|
||||
if (!workspace.empty()) {
|
||||
(void)std::transform(std::begin(workspace), std::end(workspace), std::back_inserter(runtime_args),
|
||||
[](const AddressPtr &addr) -> void * { return addr->addr; });
|
||||
}
|
||||
|
||||
rtL2Ctrl_t *l2ctrl = nullptr;
|
||||
auto stream = static_cast<rtStream_t *>(stream_ptr);
|
||||
|
@ -86,7 +90,8 @@ bool AkgKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::vect
|
|||
return true;
|
||||
}
|
||||
|
||||
std::vector<TaskInfoPtr> AkgKernelMod::GenTask(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
|
||||
std::vector<TaskInfoPtr> AkgKernelMod::GenTask(const std::vector<AddressPtr> &inputs,
|
||||
const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs, uint32_t stream_id) {
|
||||
if (kernel_pack_ == nullptr) {
|
||||
MS_LOG(EXCEPTION) << "kernel pack should not be nullptr.";
|
||||
|
@ -107,6 +112,10 @@ std::vector<TaskInfoPtr> AkgKernelMod::GenTask(const std::vector<AddressPtr> &in
|
|||
[](const AddressPtr &input) -> void * { return input->addr; });
|
||||
(void)std::transform(std::begin(outputs), std::end(outputs), std::back_inserter(output_data_addrs),
|
||||
[](const AddressPtr &output) -> void * { return output->addr; });
|
||||
if (!workspace.empty()) {
|
||||
(void)std::transform(std::begin(workspace), std::end(workspace), std::back_inserter(workspace_addrs),
|
||||
[](const AddressPtr &workspace) -> void * { return workspace->addr; });
|
||||
}
|
||||
|
||||
uint32_t block_dim = DEFAULT_BLOCK_DIM; // default blockdim equal to 1.
|
||||
auto func_stub = KernelManager::GenFuncStub(*kernel_pack_, false, &block_dim);
|
||||
|
|
|
@ -39,8 +39,10 @@ KernelPackPtr AkgGpuKernelBuilder::AkgInsertCache(const std::string &kernel_name
|
|||
void AkgGpuKernelBuilder::AkgSetKernelMod(const KernelPackPtr &kernel_pack,
|
||||
const AkgKernelJsonGenerator &json_generator, const AnfNodePtr &anf_node) {
|
||||
auto kernel_mod_ptr = std::make_shared<GpuKernelMod>(kernel_pack);
|
||||
auto kernel_json_info = kernel_pack->kernel_json_info();
|
||||
kernel_mod_ptr->SetInputSizeList(json_generator.input_size_list());
|
||||
kernel_mod_ptr->SetOutputSizeList(json_generator.output_size_list());
|
||||
kernel_mod_ptr->SetWorkspaceSizeList(kernel_json_info.workspaces);
|
||||
AnfAlgo::SetKernelMod(kernel_mod_ptr, anf_node.get());
|
||||
}
|
||||
|
||||
|
|
|
@ -92,13 +92,15 @@ void GpuKernelMod::SetInputSizeList(const std::vector<size_t> &size_list) { inpu
|
|||
|
||||
void GpuKernelMod::SetOutputSizeList(const std::vector<size_t> &size_list) { output_size_list_ = size_list; }
|
||||
|
||||
void GpuKernelMod::SetWorkspaceSizeList(const std::vector<size_t> &size_list) { workspace_size_list_ = size_list; }
|
||||
|
||||
const std::vector<size_t> &GpuKernelMod::GetInputSizeList() const { return input_size_list_; }
|
||||
|
||||
const std::vector<size_t> &GpuKernelMod::GetOutputSizeList() const { return output_size_list_; }
|
||||
|
||||
const std::vector<size_t> &GpuKernelMod::GetWorkspaceSizeList() const { return workspace_size_list_; }
|
||||
|
||||
bool GpuKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
|
||||
bool GpuKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs, void *stream_ptr) {
|
||||
if (stream_ptr == 0) {
|
||||
MS_LOG(ERROR) << "stream_ptr should not be nullptr.";
|
||||
|
@ -122,6 +124,10 @@ bool GpuKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::vect
|
|||
[](const AddressPtr &input) -> void * { return reinterpret_cast<void *>(&(input->addr)); });
|
||||
(void)std::transform(std::begin(outputs), std::end(outputs), std::back_inserter(runtimeargs),
|
||||
[](const AddressPtr &output) -> void * { return reinterpret_cast<void *>(&(output->addr)); });
|
||||
if (!workspace.empty()) {
|
||||
(void)std::transform(std::begin(workspace), std::end(workspace), std::back_inserter(runtimeargs),
|
||||
[](const AddressPtr &addr) -> void * { return addr->addr; });
|
||||
}
|
||||
result = cuLaunchKernel(kernel_addr, thread_info[0], thread_info[1], thread_info[2], thread_info[3], thread_info[4],
|
||||
thread_info[5], 0, reinterpret_cast<CUstream>(stream_ptr),
|
||||
reinterpret_cast<void **>(&runtimeargs[0]), 0);
|
||||
|
|
|
@ -60,6 +60,7 @@ class GpuKernelMod : public KernelMod {
|
|||
|
||||
void SetInputSizeList(const std::vector<size_t> &size_list);
|
||||
void SetOutputSizeList(const std::vector<size_t> &size_list);
|
||||
void SetWorkspaceSizeList(const std::vector<size_t> &size_list);
|
||||
const std::vector<size_t> &GetInputSizeList() const override;
|
||||
const std::vector<size_t> &GetOutputSizeList() const override;
|
||||
const std::vector<size_t> &GetWorkspaceSizeList() const override;
|
||||
|
|
|
@ -141,14 +141,8 @@ FusionType GetFusionTypeByName(const std::string &name) {
|
|||
return iter->first;
|
||||
}
|
||||
|
||||
void KernelMeta::Initialize(int pid) {
|
||||
if (pid == -1) {
|
||||
kernel_meta_path_ = std::string(kGpuKernelMeta) + "_" + std::to_string(getpid()) + "/";
|
||||
} else {
|
||||
kernel_meta_path_ = std::string(kGpuKernelMeta) + "_" + std::to_string(pid) + "/";
|
||||
}
|
||||
// remove old kernel cache
|
||||
RemoveKernelCache();
|
||||
void KernelMeta::Initialize() {
|
||||
kernel_meta_path_ = std::string(kGpuKernelMeta) + "/";
|
||||
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
auto ret = mkdir(kernel_meta_path_.c_str());
|
||||
|
@ -161,21 +155,6 @@ void KernelMeta::Initialize(int pid) {
|
|||
initialized_ = true;
|
||||
}
|
||||
|
||||
void KernelMeta::RemoveKernelCache() {
|
||||
DIR *dir = opendir(kernel_meta_path_.c_str());
|
||||
if (dir == nullptr) {
|
||||
return;
|
||||
}
|
||||
struct dirent *entry;
|
||||
while ((entry = readdir(dir)) != nullptr) {
|
||||
std::string kernel_file = entry->d_name;
|
||||
std::string kernel_file_realpath = kernel_meta_path_ + kernel_file;
|
||||
(void)remove(kernel_file_realpath.c_str());
|
||||
}
|
||||
(void)closedir(dir);
|
||||
(void)rmdir(kernel_meta_path_.c_str());
|
||||
}
|
||||
|
||||
std::string KernelMeta::Search(const std::string &kernel_name) const {
|
||||
if (!initialized_) {
|
||||
return "";
|
||||
|
@ -227,7 +206,7 @@ KernelPackPtr SearchCache(const std::string &kernel_name, const std::string &pro
|
|||
KernelPackPtr kernel_pack = std::make_shared<KernelPack>();
|
||||
// just a tmp solution.
|
||||
if (!kernel_pack->ReadFromJsonFile(kernel_json, processor)) {
|
||||
MS_LOG(DEBUG) << "Read cache json and bin file failed[" << kernel_json << "].";
|
||||
MS_LOG(ERROR) << "Read cache json and bin file failed[" << kernel_json << "].";
|
||||
return nullptr;
|
||||
} else {
|
||||
return kernel_pack;
|
||||
|
@ -250,7 +229,7 @@ KernelPackPtr InsertCache(const std::string &kernel_name, const std::string &pro
|
|||
(void)kernel_json.append(kernel_name).append(kJsonSuffix);
|
||||
KernelPackPtr kernel_pack = std::make_shared<KernelPack>();
|
||||
if (!kernel_pack->ReadFromJsonFile(kernel_json, processor)) {
|
||||
MS_LOG(DEBUG) << "Read json and bin file failed[" << kernel_json << "].";
|
||||
MS_LOG(ERROR) << "Read json and bin file failed[" << kernel_json << "].";
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
|
@ -714,6 +693,9 @@ void GetFuncGraphOutputNodes(const FuncGraphPtr &func_graph, std::vector<AnfNode
|
|||
for (size_t input_idx = 1; input_idx < cnode->inputs().size(); ++input_idx) {
|
||||
auto input_node = cnode->input(input_idx);
|
||||
MS_EXCEPTION_IF_NULL(input_node);
|
||||
if (input_node->isa<CNode>() && AnfAlgo::GetInputTensorNum(input_node) == 0) {
|
||||
continue;
|
||||
}
|
||||
output_list->push_back(AnfAlgo::VisitKernel(input_node, 0).first);
|
||||
}
|
||||
} else {
|
||||
|
|
|
@ -55,8 +55,7 @@ using KernelMetaPtr = std::shared_ptr<KernelMetaInfo>;
|
|||
class KernelMeta {
|
||||
public:
|
||||
KernelMeta() = default;
|
||||
void Initialize(int pid);
|
||||
void RemoveKernelCache();
|
||||
void Initialize();
|
||||
std::string Search(const std::string &kernel_name) const;
|
||||
bool Insert(const std::string &kernel_name, const std::string &kernel_json);
|
||||
std::string kernel_meta_path() const { return kernel_meta_path_; }
|
||||
|
|
|
@ -26,46 +26,26 @@ namespace mindspore {
|
|||
namespace kernel {
|
||||
constexpr size_t kSizeFloat16 = sizeof(float16);
|
||||
constexpr size_t kSizeFloat32 = sizeof(float);
|
||||
constexpr size_t kScalarIndex = 0;
|
||||
constexpr size_t kAdamWeightDecayInputSize = 9;
|
||||
constexpr size_t kAdamWeightDecayOutputSize = 3;
|
||||
|
||||
void AdamWeightDecayCPUKernel::ParallelForAdam(const CTask &task, size_t count) {
|
||||
auto max_thread_num = common::ThreadPool::GetInstance().GetSyncRunThreadNum();
|
||||
const float block_size = 128.0;
|
||||
const float align_size = 16.0;
|
||||
size_t thread_num = count < block_size * max_thread_num ? std::ceil(count / block_size) : max_thread_num;
|
||||
std::vector<common::Task> tasks;
|
||||
size_t start = 0;
|
||||
size_t once_compute_size = align_size * std::ceil(count / (align_size * thread_num));
|
||||
while (start < count) {
|
||||
size_t end = (start + once_compute_size) > count ? count : (start + once_compute_size);
|
||||
auto block = [&, start, end]() {
|
||||
task(start, end);
|
||||
return common::SUCCESS;
|
||||
};
|
||||
tasks.emplace_back(block);
|
||||
start += once_compute_size;
|
||||
}
|
||||
common::ThreadPool::GetInstance().SyncRun(tasks);
|
||||
}
|
||||
|
||||
template <typename T, typename S>
|
||||
void AdamWeightDecayCPUKernel::LaunchFusedAdam(const std::vector<AddressPtr> &inputs,
|
||||
const std::vector<AddressPtr> &outputs) {
|
||||
auto var = reinterpret_cast<T *>(inputs[0]->addr);
|
||||
auto m = reinterpret_cast<T *>(inputs[1]->addr);
|
||||
auto v = reinterpret_cast<T *>(inputs[2]->addr);
|
||||
auto lr = reinterpret_cast<T *>(inputs[3]->addr)[0];
|
||||
auto beta1 = reinterpret_cast<T *>(inputs[4]->addr)[0];
|
||||
auto beta2 = reinterpret_cast<T *>(inputs[5]->addr)[0];
|
||||
auto epsilon = reinterpret_cast<T *>(inputs[6]->addr)[0];
|
||||
auto decay = reinterpret_cast<T *>(inputs[7]->addr)[0];
|
||||
auto gradient16 = reinterpret_cast<S *>(inputs[8]->addr);
|
||||
void AdamWeightDecayCPUKernel::LaunchFusedAdam(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &) {
|
||||
auto var = reinterpret_cast<T *>(inputs[VAR]->addr);
|
||||
auto m = reinterpret_cast<T *>(inputs[M]->addr);
|
||||
auto v = reinterpret_cast<T *>(inputs[V]->addr);
|
||||
auto lr = reinterpret_cast<T *>(inputs[LR]->addr)[kScalarIndex];
|
||||
auto beta1 = reinterpret_cast<T *>(inputs[BETA1]->addr)[kScalarIndex];
|
||||
auto beta2 = reinterpret_cast<T *>(inputs[BETA2]->addr)[kScalarIndex];
|
||||
auto epsilon = reinterpret_cast<T *>(inputs[EPSILON]->addr)[kScalarIndex];
|
||||
auto decay = reinterpret_cast<T *>(inputs[DECAY]->addr)[kScalarIndex];
|
||||
auto gradient16 = reinterpret_cast<S *>(inputs[GRAD]->addr);
|
||||
const auto beta1_minus = 1 - beta1;
|
||||
const auto beta2_minus = 1 - beta2;
|
||||
|
||||
// multithreading
|
||||
size_t lens = inputs[0]->size > 0 ? static_cast<size_t>(inputs[0]->size / sizeof(float)) : 1;
|
||||
size_t lens = inputs[VAR]->size > 0 ? static_cast<size_t>(inputs[VAR]->size / sizeof(float)) : 1;
|
||||
std::function<void(size_t, size_t)> task;
|
||||
|
||||
task = [&](size_t start, size_t end) {
|
||||
|
@ -81,28 +61,27 @@ void AdamWeightDecayCPUKernel::LaunchFusedAdam(const std::vector<AddressPtr> &in
|
|||
var[i] -= lr * update;
|
||||
}
|
||||
};
|
||||
ParallelForAdam(task, lens);
|
||||
CPUKernelUtils::ParallelFor(task, lens);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void AdamWeightDecayCPUKernel::LaunchAdamWeightDecay(const std::vector<AddressPtr> &inputs,
|
||||
const std::vector<AddressPtr> &outputs) {
|
||||
auto var = reinterpret_cast<T *>(inputs[0]->addr);
|
||||
auto m = reinterpret_cast<T *>(inputs[1]->addr);
|
||||
auto v = reinterpret_cast<T *>(inputs[2]->addr);
|
||||
auto lr = reinterpret_cast<T *>(inputs[3]->addr)[0];
|
||||
auto beta1 = reinterpret_cast<T *>(inputs[4]->addr)[0];
|
||||
auto beta2 = reinterpret_cast<T *>(inputs[5]->addr)[0];
|
||||
auto epsilon = reinterpret_cast<T *>(inputs[6]->addr)[0];
|
||||
auto decay = reinterpret_cast<T *>(inputs[7]->addr)[0];
|
||||
auto gradient = reinterpret_cast<T *>(inputs[8]->addr);
|
||||
const std::vector<AddressPtr> &) {
|
||||
auto var = reinterpret_cast<T *>(inputs[VAR]->addr);
|
||||
auto m = reinterpret_cast<T *>(inputs[M]->addr);
|
||||
auto v = reinterpret_cast<T *>(inputs[V]->addr);
|
||||
auto lr = reinterpret_cast<T *>(inputs[LR]->addr)[kScalarIndex];
|
||||
auto beta1 = reinterpret_cast<T *>(inputs[BETA1]->addr)[kScalarIndex];
|
||||
auto beta2 = reinterpret_cast<T *>(inputs[BETA2]->addr)[kScalarIndex];
|
||||
auto epsilon = reinterpret_cast<T *>(inputs[EPSILON]->addr)[kScalarIndex];
|
||||
auto decay = reinterpret_cast<T *>(inputs[DECAY]->addr)[kScalarIndex];
|
||||
auto gradient = reinterpret_cast<T *>(inputs[GRAD]->addr);
|
||||
const auto beta1_minus = 1 - beta1;
|
||||
const auto beta2_minus = 1 - beta2;
|
||||
|
||||
// multithreading
|
||||
size_t lens = inputs[0]->size > 0 ? static_cast<size_t>(inputs[0]->size / sizeof(float)) : 1;
|
||||
size_t lens = inputs[VAR]->size > 0 ? static_cast<size_t>(inputs[VAR]->size / sizeof(float)) : 1;
|
||||
std::function<void(size_t, size_t)> task;
|
||||
|
||||
task = [&](size_t start, size_t end) {
|
||||
size_t i = AdamWeightDecayFp32(var, m, v, lr, beta1, beta2, epsilon, decay, gradient, start, end);
|
||||
// remaining
|
||||
|
@ -114,14 +93,14 @@ void AdamWeightDecayCPUKernel::LaunchAdamWeightDecay(const std::vector<AddressPt
|
|||
var[i] -= lr * update;
|
||||
}
|
||||
};
|
||||
ParallelForAdam(task, lens);
|
||||
CPUKernelUtils::ParallelFor(task, lens);
|
||||
}
|
||||
|
||||
void AdamWeightDecayCPUKernel::InitKernel(const CNodePtr &kernel_node) {
|
||||
MS_EXCEPTION_IF_NULL(kernel_node);
|
||||
std::vector<size_t> var_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
|
||||
dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
|
||||
gradient_dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 8);
|
||||
std::vector<size_t> var_shape = AnfAlgo::GetInputDeviceShape(kernel_node, VAR);
|
||||
dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, VAR);
|
||||
gradient_dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, GRAD);
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != kAdamWeightDecayInputSize) {
|
||||
MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but AdamWeightDecay needs 9 inputs.";
|
||||
|
@ -155,12 +134,12 @@ void AdamWeightDecayCPUKernel::CheckParam(const std::vector<kernel::AddressPtr>
|
|||
}
|
||||
size_t elem1_size = elem_num_ * kSizeFloat32;
|
||||
size_t elem2_size = gradient_dtype_ == kNumberTypeFloat16 ? elem_num_ * kSizeFloat16 : elem1_size;
|
||||
if (inputs[0]->size != elem1_size || inputs[1]->size != elem1_size || inputs[2]->size != elem1_size ||
|
||||
inputs[8]->size != elem2_size) {
|
||||
if (inputs[VAR]->size != elem1_size || inputs[M]->size != elem1_size || inputs[V]->size != elem1_size ||
|
||||
inputs[GRAD]->size != elem2_size) {
|
||||
MS_LOG(EXCEPTION) << "Error input data size!";
|
||||
}
|
||||
if (inputs[3]->size != kSizeFloat32 || inputs[4]->size != kSizeFloat32 || inputs[5]->size != kSizeFloat32 ||
|
||||
inputs[6]->size != kSizeFloat32 || inputs[7]->size != kSizeFloat32) {
|
||||
if (inputs[LR]->size != kSizeFloat32 || inputs[BETA1]->size != kSizeFloat32 || inputs[BETA2]->size != kSizeFloat32 ||
|
||||
inputs[EPSILON]->size != kSizeFloat32 || inputs[DECAY]->size != kSizeFloat32) {
|
||||
MS_LOG(EXCEPTION) << "The attribute beta, lr, epsilon and weight decay must be float!";
|
||||
}
|
||||
}
|
||||
|
|
|
@ -32,7 +32,6 @@ class AdamWeightDecayCPUKernel : public CPUKernel {
|
|||
const std::vector<AddressPtr> &outputs) override;
|
||||
|
||||
private:
|
||||
void ParallelForAdam(const CTask &task, size_t count);
|
||||
void CheckParam(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
|
||||
template <typename T, typename S>
|
||||
void LaunchFusedAdam(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
|
||||
|
@ -41,6 +40,7 @@ class AdamWeightDecayCPUKernel : public CPUKernel {
|
|||
size_t elem_num_{0};
|
||||
TypeId dtype_{kTypeUnknown};
|
||||
TypeId gradient_dtype_{kTypeUnknown};
|
||||
enum input_list_ { VAR, M, V, LR, BETA1, BETA2, EPSILON, DECAY, GRAD };
|
||||
};
|
||||
|
||||
MS_REG_CPU_KERNEL(AdamWeightDecay,
|
||||
|
|
|
@ -76,27 +76,10 @@ void ApplyAdagradCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs,
|
|||
|
||||
// multithreading
|
||||
size_t length = inputs[0]->size / sizeof(T);
|
||||
size_t max_thread_num = std::thread::hardware_concurrency();
|
||||
size_t use_thread_num = length < 128 * max_thread_num ? std::ceil(length / 128.0) : max_thread_num;
|
||||
std::vector<std::thread> threads;
|
||||
threads.reserve(use_thread_num);
|
||||
size_t start = 0;
|
||||
const size_t batch_size = (length + use_thread_num - 1) / use_thread_num;
|
||||
|
||||
if (batch_size == 0) {
|
||||
MS_LOG(EXCEPTION) << "Error occur in launch kernel";
|
||||
return;
|
||||
}
|
||||
while (start < length) {
|
||||
size_t end = (start + batch_size) > length ? length : (start + batch_size);
|
||||
threads.emplace_back(
|
||||
std::thread(&ApplyAdagradCPUKernel::LaunchApplyAdagrad<T *>, this, var, accum, lr, gradient, start, end));
|
||||
start += batch_size;
|
||||
}
|
||||
|
||||
for (auto &it : threads) {
|
||||
it.join();
|
||||
}
|
||||
auto task = [this, &var, &accum, lr, gradient](size_t start, size_t end) {
|
||||
LaunchApplyAdagrad(var, accum, lr, gradient, start, end);
|
||||
};
|
||||
CPUKernelUtils::ParallelForAutoSearch(task, length, ¶llel_search_info_);
|
||||
|
||||
// Copy result to output tensor
|
||||
auto output_var = reinterpret_cast<T *>(outputs[0]->addr);
|
||||
|
|
|
@ -13,10 +13,12 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "backend/kernel_compiler/cpu/arithmetic_logic_cpu_kernel.h"
|
||||
#include <cmath>
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include "backend/kernel_compiler/cpu/arithmetic_logic_cpu_kernel.h"
|
||||
#include <functional>
|
||||
#include "runtime/device/cpu/cpu_device_address.h"
|
||||
|
||||
namespace mindspore {
|
||||
|
@ -29,7 +31,9 @@ void ArithmeticLogicCPUKernel<T>::Less(const T *input1, const T *input2, bool *o
|
|||
auto iter = base_iter;
|
||||
iter.SetPos(start);
|
||||
for (size_t i = start; i < end; i++) {
|
||||
out[i] = input1[iter.GetInputPosA()] < input2[iter.GetInputPosB()];
|
||||
auto x = input1[iter.GetInputPosA()];
|
||||
auto y = input2[iter.GetInputPosB()];
|
||||
out[i] = std::less<T>()(x, y);
|
||||
iter.GenNextPos();
|
||||
}
|
||||
};
|
||||
|
@ -37,7 +41,9 @@ void ArithmeticLogicCPUKernel<T>::Less(const T *input1, const T *input2, bool *o
|
|||
} else {
|
||||
base_iter.SetPos(0);
|
||||
for (size_t i = 0; i < output_size_; i++) {
|
||||
out[i] = input1[base_iter.GetInputPosA()] < input2[base_iter.GetInputPosB()];
|
||||
auto x = input1[base_iter.GetInputPosA()];
|
||||
auto y = input2[base_iter.GetInputPosB()];
|
||||
out[i] = std::less<T>()(x, y);
|
||||
base_iter.GenNextPos();
|
||||
}
|
||||
}
|
||||
|
@ -50,7 +56,9 @@ void ArithmeticLogicCPUKernel<T>::Equal(const T *input1, const T *input2, bool *
|
|||
auto iter = base_iter;
|
||||
iter.SetPos(start);
|
||||
for (size_t i = start; i < end; i++) {
|
||||
out[i] = input1[iter.GetInputPosA()] == input2[iter.GetInputPosB()];
|
||||
auto x = input1[iter.GetInputPosA()];
|
||||
auto y = input2[iter.GetInputPosB()];
|
||||
out[i] = std::equal_to<T>()(x, y);
|
||||
iter.GenNextPos();
|
||||
}
|
||||
};
|
||||
|
@ -64,7 +72,9 @@ void ArithmeticLogicCPUKernel<T>::NotEqual(const T *input1, const T *input2, boo
|
|||
auto iter = base_iter;
|
||||
iter.SetPos(start);
|
||||
for (size_t i = start; i < end; i++) {
|
||||
out[i] = input1[iter.GetInputPosA()] != input2[iter.GetInputPosB()];
|
||||
auto x = input1[iter.GetInputPosA()];
|
||||
auto y = input2[iter.GetInputPosB()];
|
||||
out[i] = std::not_equal_to<T>()(x, y);
|
||||
iter.GenNextPos();
|
||||
}
|
||||
};
|
||||
|
@ -106,7 +116,9 @@ void ArithmeticLogicCPUKernel<T>::Greater(const T *input1, const T *input2, bool
|
|||
auto iter = base_iter;
|
||||
iter.SetPos(start);
|
||||
for (size_t i = start; i < end; i++) {
|
||||
out[i] = input1[iter.GetInputPosA()] > input2[iter.GetInputPosB()];
|
||||
auto x = input1[iter.GetInputPosA()];
|
||||
auto y = input2[iter.GetInputPosB()];
|
||||
out[i] = std::greater<T>()(x, y);
|
||||
iter.GenNextPos();
|
||||
}
|
||||
};
|
||||
|
@ -120,7 +132,9 @@ void ArithmeticLogicCPUKernel<T>::GreaterEqual(const T *input1, const T *input2,
|
|||
auto iter = base_iter;
|
||||
iter.SetPos(start);
|
||||
for (size_t i = start; i < end; i++) {
|
||||
out[i] = input1[iter.GetInputPosA()] >= input2[iter.GetInputPosB()];
|
||||
auto x = input1[iter.GetInputPosA()];
|
||||
auto y = input2[iter.GetInputPosB()];
|
||||
out[i] = std::greater_equal<T>()(x, y);
|
||||
iter.GenNextPos();
|
||||
}
|
||||
};
|
||||
|
@ -134,7 +148,9 @@ void ArithmeticLogicCPUKernel<T>::LessEqual(const T *input1, const T *input2, bo
|
|||
auto iter = base_iter;
|
||||
iter.SetPos(start);
|
||||
for (size_t i = start; i < end; i++) {
|
||||
out[i] = input1[iter.GetInputPosA()] <= input2[iter.GetInputPosB()];
|
||||
auto x = input1[iter.GetInputPosA()];
|
||||
auto y = input2[iter.GetInputPosB()];
|
||||
out[i] = std::less_equal<T>()(x, y);
|
||||
iter.GenNextPos();
|
||||
}
|
||||
};
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/**
|
||||
* Copyright 2019 Huawei Technologies Co., Ltd
|
||||
* Copyright 2019-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
|
|
|
@ -21,6 +21,7 @@
|
|||
#include <string>
|
||||
|
||||
#include "runtime/device/kernel_info.h"
|
||||
#include "runtime/device/cpu/kernel_select_cpu.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
|
@ -111,6 +112,11 @@ std::pair<bool, size_t> CPUKernelFactory::CPUKernelAttrCheck(const std::string &
|
|||
MS_LOG(INFO) << "Not registered CPU kernel: op[" << kernel_name << "]!";
|
||||
return std::make_pair(false, 0);
|
||||
}
|
||||
|
||||
if (device::cpu::IsDynamicParamKernel(kernel_name)) {
|
||||
return std::make_pair(true, 0);
|
||||
}
|
||||
|
||||
auto kernel_attrs = GetSupportedKernelAttrList(kernel_name);
|
||||
if (kernel_attrs[0].GetInputSize() == 0 && kernel_attrs[0].GetOutputSize() == 0) {
|
||||
auto op_info_ptr = mindspore::kernel::OpLib::FindOp(kernel_name, kernel::OpImplyType::kCPU);
|
||||
|
|
|
@ -43,9 +43,9 @@ void DropoutGradCpuBwdKernel::InitKernel(const CNodePtr &kernel_node) {
|
|||
bool DropoutGradCpuBwdKernel::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
|
||||
const std::vector<AddressPtr> &outputs) {
|
||||
if (dtype_ == kNumberTypeFloat16) {
|
||||
DropoutBackwardKernel<float16>(inputs, outputs, num_count_, keep_prob_);
|
||||
DropoutBackwardKernel<float16>(inputs, outputs, keep_prob_);
|
||||
} else if (dtype_ == kNumberTypeFloat32) {
|
||||
DropoutBackwardKernel<float>(inputs, outputs, num_count_, keep_prob_);
|
||||
DropoutBackwardKernel<float>(inputs, outputs, keep_prob_);
|
||||
} else {
|
||||
MS_LOG(ERROR) << "Input data type: " << dtype_ << " is not supported for DropoutGrad kernel for CPU.";
|
||||
}
|
||||
|
@ -55,8 +55,7 @@ bool DropoutGradCpuBwdKernel::Launch(const std::vector<AddressPtr> &inputs, cons
|
|||
|
||||
template <typename T>
|
||||
void DropoutGradCpuBwdKernel::DropoutBackwardKernel(const std::vector<AddressPtr> &inputs,
|
||||
const std::vector<AddressPtr> &outputs, size_t num_count,
|
||||
float keep_prob) {
|
||||
const std::vector<AddressPtr> &outputs, float keep_prob) {
|
||||
auto *output = reinterpret_cast<T *>(outputs[0]->addr);
|
||||
const auto *input = reinterpret_cast<T *>(inputs[0]->addr);
|
||||
const auto *mask = reinterpret_cast<T *>(inputs[1]->addr);
|
||||
|
@ -70,7 +69,7 @@ void DropoutGradCpuBwdKernel::DropoutBackwardKernel(const std::vector<AddressPtr
|
|||
input_tmp[i] = static_cast<float>(input[i]);
|
||||
mask_tmp[i] = static_cast<float>(mask[i]);
|
||||
}
|
||||
DropoutGrad(input_tmp, mask_tmp, output_tmp, num_count_, scale);
|
||||
DropoutGrad(input_tmp, mask_tmp, output_tmp, SizeToInt(num_count_), scale);
|
||||
for (size_t i = 0; i < num_count_; ++i) {
|
||||
output[i] = static_cast<float16>(output_tmp[i]);
|
||||
}
|
||||
|
@ -78,7 +77,7 @@ void DropoutGradCpuBwdKernel::DropoutBackwardKernel(const std::vector<AddressPtr
|
|||
delete[] output_tmp;
|
||||
delete[] mask_tmp;
|
||||
} else if constexpr (std::is_same_v<T, float>) {
|
||||
DropoutGrad(input, mask, output, num_count_, scale);
|
||||
DropoutGrad(input, mask, output, SizeToInt(num_count_), scale);
|
||||
}
|
||||
}
|
||||
} // namespace kernel
|
||||
|
|
|
@ -40,7 +40,7 @@ class DropoutGradCpuBwdKernel : public CPUKernel {
|
|||
TypeId dtype_{kTypeUnknown};
|
||||
template <typename T>
|
||||
void DropoutBackwardKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs,
|
||||
size_t num_count, float keep_prob);
|
||||
float keep_prob);
|
||||
};
|
||||
|
||||
MS_REG_CPU_KERNEL(DropoutGrad, KernelAttr(), DropoutGradCpuBwdKernel);
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
|
@ -13,8 +13,10 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include <map>
|
||||
|
||||
#include "backend/kernel_compiler/cpu/eltwise_grad_cpu_kernel.h"
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include "common/thread_pool.h"
|
||||
#include "runtime/device/cpu/cpu_device_address.h"
|
||||
#include "nnacl/fp32_grad/activation_grad.h"
|
||||
|
@ -25,50 +27,50 @@ namespace mindspore {
|
|||
namespace kernel {
|
||||
template <typename T>
|
||||
void EltWiseGradCPUKernel<T>::ReluGrad(const T *input1, const T *input2, T *out, size_t start, size_t end) const {
|
||||
if constexpr (std::is_same_v<T, float>) {
|
||||
int ret = ::ReluGrad(input1 + start, input2 + start, end - start, out + start);
|
||||
if (ret == NNACL_ERR) {
|
||||
MS_LOG(EXCEPTION) << "ReLUGrad failed.";
|
||||
}
|
||||
} else {
|
||||
if constexpr (!std::is_same<T, float>::value) {
|
||||
MS_LOG(EXCEPTION) << "ReLUGrad only support float";
|
||||
}
|
||||
|
||||
int ret = ::ReluGrad(input1 + start, input2 + start, end - start, out + start);
|
||||
if (ret == NNACL_ERR) {
|
||||
MS_LOG(EXCEPTION) << "ReLUGrad execute failed.";
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void EltWiseGradCPUKernel<T>::ReLU6Grad(const T *input1, const T *input2, T *out, size_t start, size_t end) const {
|
||||
if constexpr (std::is_same_v<T, float>) {
|
||||
int ret = ::Relu6Grad(input1 + start, input2 + start, end - start, out + start);
|
||||
if (ret == NNACL_ERR) {
|
||||
MS_LOG(EXCEPTION) << "ReLU6Grad failed.";
|
||||
}
|
||||
} else {
|
||||
if constexpr (!std::is_same<T, float>::value) {
|
||||
MS_LOG(EXCEPTION) << "ReLU6Grad only support float";
|
||||
}
|
||||
|
||||
int ret = ::Relu6Grad(input1 + start, input2 + start, end - start, out + start);
|
||||
if (ret == NNACL_ERR) {
|
||||
MS_LOG(EXCEPTION) << "ReLU6Grad execute failed.";
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void EltWiseGradCPUKernel<T>::AbsGrad(const T *input1, const T *input2, T *out, size_t start, size_t end) const {
|
||||
if constexpr (std::is_same_v<T, float>) {
|
||||
int ret = ::ElementAbsGrad(input1 + start, input2 + start, out + start, end - start);
|
||||
if (ret == NNACL_ERR) {
|
||||
MS_LOG(EXCEPTION) << "AbsGrad failed.";
|
||||
}
|
||||
} else {
|
||||
if constexpr (!std::is_same<T, float>::value) {
|
||||
MS_LOG(EXCEPTION) << "AbsGrad only support float";
|
||||
}
|
||||
|
||||
int ret = ::ElementAbsGrad(input1 + start, input2 + start, out + start, end - start);
|
||||
if (ret == NNACL_ERR) {
|
||||
MS_LOG(EXCEPTION) << "AbsGrad execute failed.";
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void EltWiseGradCPUKernel<T>::SigmoidGrad(const T *input1, const T *input2, T *out, size_t start, size_t end) const {
|
||||
if constexpr (std::is_same_v<T, float>) {
|
||||
int ret = ::SigmoidGrad(input2 + start, input1 + start, end - start, out + start);
|
||||
if (ret == NNACL_ERR) {
|
||||
MS_LOG(EXCEPTION) << "SigmoidGrad failed.";
|
||||
}
|
||||
} else {
|
||||
if constexpr (!std::is_same<T, float>::value) {
|
||||
MS_LOG(EXCEPTION) << "SigmoidGrad only support float";
|
||||
}
|
||||
|
||||
int ret = ::SigmoidGrad(input2 + start, input1 + start, end - start, out + start);
|
||||
if (ret == NNACL_ERR) {
|
||||
MS_LOG(EXCEPTION) << "SigmoidGrad execute failed.";
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
|
@ -80,14 +82,14 @@ void EltWiseGradCPUKernel<T>::SqrtGrad(const T *input1, const T *input2, T *out,
|
|||
|
||||
template <typename T>
|
||||
void EltWiseGradCPUKernel<T>::TanhGrad(const T *input1, const T *input2, T *out, size_t start, size_t end) const {
|
||||
if constexpr (std::is_same_v<T, float>) {
|
||||
int ret = ::TanhGrad(input2 + start, input1 + start, end - start, out + start);
|
||||
if (ret == NNACL_ERR) {
|
||||
MS_LOG(EXCEPTION) << "TanhGrad failed.";
|
||||
}
|
||||
} else {
|
||||
if constexpr (!std::is_same<T, float>::value) {
|
||||
MS_LOG(EXCEPTION) << "TanhGrad only support float";
|
||||
}
|
||||
|
||||
int ret = ::TanhGrad(input2 + start, input1 + start, end - start, out + start);
|
||||
if (ret == NNACL_ERR) {
|
||||
MS_LOG(EXCEPTION) << "TanhGrad execute failed.";
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
|
@ -207,6 +209,18 @@ void EltWiseGradCPUKernel<T>::AcoshGrad(const T *input1, const T *input2, T *out
|
|||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void EltWiseGradCPUKernel<T>::SoftplusGrad(const T *input1, const T *input2, T *out, size_t start, size_t end) const {
|
||||
if constexpr (!std::is_same<T, float>::value) {
|
||||
MS_LOG(EXCEPTION) << "SoftplusGrad only support float";
|
||||
}
|
||||
|
||||
int ret = ::SoftplusGrad(input1 + start, input2 + start, end - start, out + start);
|
||||
if (ret == NNACL_ERR) {
|
||||
MS_LOG(EXCEPTION) << "SoftplusGrad execute failed.";
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void EltWiseGradCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
|
||||
MS_EXCEPTION_IF_NULL(kernel_node);
|
||||
|
@ -219,12 +233,19 @@ bool EltWiseGradCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inpu
|
|||
const std::vector<kernel::AddressPtr> &outputs) {
|
||||
static const std::map<std::string,
|
||||
std::function<void(EltWiseGradCPUKernel *, const T *, const T *, T *, size_t, size_t)>>
|
||||
elt_map{{"ReluGrad", &EltWiseGradCPUKernel<T>::ReluGrad}, {"ReLU6Grad", &EltWiseGradCPUKernel<T>::ReLU6Grad},
|
||||
{"SigmoidGrad", &EltWiseGradCPUKernel<T>::SigmoidGrad}, {"AbsGrad", &EltWiseGradCPUKernel<T>::AbsGrad},
|
||||
{"TanhGrad", &EltWiseGradCPUKernel<T>::TanhGrad}, {"SqrtGrad", &EltWiseGradCPUKernel<T>::SqrtGrad},
|
||||
{"GeLUGrad", &EltWiseGradCPUKernel<T>::GeluGrad}, {"AsinGrad", &EltWiseGradCPUKernel<T>::AsinGrad},
|
||||
{"ACosGrad", &EltWiseGradCPUKernel<T>::ACosGrad}, {"AtanGrad", &EltWiseGradCPUKernel<T>::AtanGrad},
|
||||
{"AsinhGrad", &EltWiseGradCPUKernel<T>::AsinhGrad}, {"AcoshGrad", &EltWiseGradCPUKernel<T>::AcoshGrad}};
|
||||
elt_map{{prim::kPrimReluGrad->name(), &EltWiseGradCPUKernel<T>::ReluGrad},
|
||||
{prim::kPrimRelu6Grad->name(), &EltWiseGradCPUKernel<T>::ReLU6Grad},
|
||||
{prim::kPrimSigmoidGrad->name(), &EltWiseGradCPUKernel<T>::SigmoidGrad},
|
||||
{prim::kPrimAbsGrad->name(), &EltWiseGradCPUKernel<T>::AbsGrad},
|
||||
{prim::kPrimTanhGrad->name(), &EltWiseGradCPUKernel<T>::TanhGrad},
|
||||
{prim::kPrimSqrtGrad->name(), &EltWiseGradCPUKernel<T>::SqrtGrad},
|
||||
{prim::kPrimGeLUGrad->name(), &EltWiseGradCPUKernel<T>::GeluGrad},
|
||||
{prim::kPrimAsinGrad->name(), &EltWiseGradCPUKernel<T>::AsinGrad},
|
||||
{prim::kPrimACosGrad->name(), &EltWiseGradCPUKernel<T>::ACosGrad},
|
||||
{prim::kPrimAtanGrad->name(), &EltWiseGradCPUKernel<T>::AtanGrad},
|
||||
{prim::kPrimAsinhGrad->name(), &EltWiseGradCPUKernel<T>::AsinhGrad},
|
||||
{prim::kPrimAcoshGrad->name(), &EltWiseGradCPUKernel<T>::AcoshGrad},
|
||||
{prim::kPrimSoftplusGrad->name(), &EltWiseGradCPUKernel<T>::SoftplusGrad}};
|
||||
if (inputs.size() < 2 || outputs.size() != 1) {
|
||||
MS_LOG(ERROR) << kernel_name_ << " requires at least 2 inputs and 1 output, but got " << inputs.size()
|
||||
<< " inputs and " << outputs.size() << " output.";
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
|
@ -48,6 +48,7 @@ class EltWiseGradCPUKernel : public CPUKernel {
|
|||
void AtanGrad(const T *input1, const T *input2, T *out, size_t start, size_t end) const;
|
||||
void AsinhGrad(const T *input1, const T *input2, T *out, size_t start, size_t end) const;
|
||||
void AcoshGrad(const T *input1, const T *input2, T *out, size_t start, size_t end) const;
|
||||
void SoftplusGrad(const T *input1, const T *input2, T *out, size_t start, size_t end) const;
|
||||
|
||||
std::string kernel_name_ = "";
|
||||
};
|
||||
|
@ -103,6 +104,10 @@ MS_REG_CPU_KERNEL_T(
|
|||
AcoshGrad,
|
||||
KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
|
||||
EltWiseGradCPUKernel, float);
|
||||
MS_REG_CPU_KERNEL_T(
|
||||
SoftplusGrad,
|
||||
KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
|
||||
EltWiseGradCPUKernel, float);
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
||||
|
|
|
@ -13,39 +13,47 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "backend/kernel_compiler/cpu/mkldnn/eltwise_cpu_kernel.h"
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h"
|
||||
#include "runtime/device/cpu/cpu_device_address.h"
|
||||
#include "utils/ms_utils.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
namespace {
|
||||
struct DescParam {
|
||||
dnnl::algorithm algorithm;
|
||||
float alpha = 0.f;
|
||||
float beta = 0.f;
|
||||
};
|
||||
} // namespace
|
||||
|
||||
dnnl::eltwise_forward::desc EltWiseCPUKernel::GetForwardEltwiseDesc(const CNodePtr &kernel_node,
|
||||
const dnnl::memory::desc src_desc) {
|
||||
static const std::unordered_map<std::string, DescParam> eltWiseOpDescMap{
|
||||
{prim::kPrimRelu->name(), DescParam{dnnl::algorithm::eltwise_relu}},
|
||||
{prim::kPrimRelu6->name(), DescParam{dnnl::algorithm::eltwise_clip, 0.f, 6.f}},
|
||||
{prim::kPrimAbs->name(), DescParam{dnnl::algorithm::eltwise_abs}},
|
||||
{prim::kPrimExp->name(), DescParam{dnnl::algorithm::eltwise_exp}},
|
||||
{prim::kPrimLog->name(), DescParam{dnnl::algorithm::eltwise_log}},
|
||||
{prim::kPrimSigmoid->name(), DescParam{dnnl::algorithm::eltwise_logistic}},
|
||||
{prim::kPrimSqrt->name(), DescParam{dnnl::algorithm::eltwise_sqrt}},
|
||||
{prim::kPrimSquare->name(), DescParam{dnnl::algorithm::eltwise_square}},
|
||||
{prim::kPrimTanh->name(), DescParam{dnnl::algorithm::eltwise_tanh}},
|
||||
{prim::kPrimElu->name(), DescParam{dnnl::algorithm::eltwise_elu, 1.f, 0.f}},
|
||||
{prim::kPrimSoftplus->name(), DescParam{dnnl::algorithm::eltwise_soft_relu}},
|
||||
};
|
||||
|
||||
std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
|
||||
if (kernel_name == "ReLU") {
|
||||
return dnnl::eltwise_forward::desc(DnnlForward, dnnl::algorithm::eltwise_relu, src_desc, 0.0);
|
||||
} else if (kernel_name == "ReLU6") {
|
||||
return dnnl::eltwise_forward::desc(DnnlForward, dnnl::algorithm::eltwise_clip, src_desc, 0.0, 6.0);
|
||||
} else if (kernel_name == "Abs") {
|
||||
return dnnl::eltwise_forward::desc(DnnlForward, dnnl::algorithm::eltwise_abs, src_desc);
|
||||
} else if (kernel_name == "Exp") {
|
||||
return dnnl::eltwise_forward::desc(DnnlForward, dnnl::algorithm::eltwise_exp, src_desc);
|
||||
} else if (kernel_name == "Log") {
|
||||
return dnnl::eltwise_forward::desc(DnnlForward, dnnl::algorithm::eltwise_log, src_desc);
|
||||
} else if (kernel_name == "Sigmoid") {
|
||||
return dnnl::eltwise_forward::desc(DnnlForward, dnnl::algorithm::eltwise_logistic, src_desc);
|
||||
} else if (kernel_name == "Sqrt") {
|
||||
return dnnl::eltwise_forward::desc(DnnlForward, dnnl::algorithm::eltwise_sqrt, src_desc);
|
||||
} else if (kernel_name == "Square") {
|
||||
return dnnl::eltwise_forward::desc(DnnlForward, dnnl::algorithm::eltwise_square, src_desc);
|
||||
} else if (kernel_name == "Tanh") {
|
||||
return dnnl::eltwise_forward::desc(DnnlForward, dnnl::algorithm::eltwise_tanh, src_desc);
|
||||
} else if (kernel_name == "Elu") {
|
||||
return dnnl::eltwise_forward::desc(DnnlForward, dnnl::algorithm::eltwise_elu, src_desc, 1.0);
|
||||
} else {
|
||||
MS_LOG(EXCEPTION) << "Eltwise operators don't support " << kernel_name;
|
||||
const auto desc_pair = eltWiseOpDescMap.find(kernel_name);
|
||||
if (desc_pair == eltWiseOpDescMap.end()) {
|
||||
MS_LOG(EXCEPTION) << "EltWiseCPUKernel does not support " << kernel_name;
|
||||
}
|
||||
return dnnl::eltwise_forward::desc(DnnlForward, desc_pair->second.algorithm, src_desc, desc_pair->second.alpha,
|
||||
desc_pair->second.beta);
|
||||
}
|
||||
|
||||
void EltWiseCPUKernel::InitKernel(const CNodePtr &kernel_node) {
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/**
|
||||
* Copyright 2019 Huawei Technologies Co., Ltd
|
||||
* Copyright 2019-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
|
@ -56,6 +56,8 @@ MS_REG_CPU_KERNEL(Square, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutpu
|
|||
EltWiseCPUKernel);
|
||||
MS_REG_CPU_KERNEL(Tanh, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
|
||||
EltWiseCPUKernel);
|
||||
MS_REG_CPU_KERNEL(Softplus, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
|
||||
EltWiseCPUKernel);
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
||||
|
|
|
@ -36,6 +36,24 @@ file(GLOB KERNEL_SRC
|
|||
${NNACL_DIR}/fp32_grad/*.c
|
||||
)
|
||||
|
||||
if(MSLITE_STRING_KERNEL)
|
||||
file(GLOB KERNEL_SRC_INFER_STRING
|
||||
${NNACL_DIR}/infer/string/*.c
|
||||
)
|
||||
set(KERNEL_SRC
|
||||
${KERNEL_SRC}
|
||||
${KERNEL_SRC_INFER_STRING}
|
||||
)
|
||||
endif()
|
||||
if(MSLITE_CONTROL_TENSORLIST)
|
||||
file(GLOB KERNEL_SRC_INFER_CONTROL_TENSORLIST
|
||||
${NNACL_DIR}/infer/control/*.c
|
||||
)
|
||||
set(KERNEL_SRC
|
||||
${KERNEL_SRC}
|
||||
${KERNEL_SRC_INFER_CONTROL_TENSORLIST}
|
||||
)
|
||||
endif()
|
||||
if(PLATFORM_ARM64)
|
||||
file(GLOB ASSEMBLY_SRC ${NNACL_DIR}/assembly/arm64/*.S)
|
||||
set_property(SOURCE ${ASSEMBLY_SRC} PROPERTY LANGUAGE C)
|
||||
|
|
|
@ -5,7 +5,8 @@
|
|||
|
||||
//void MatmulInt8DpNeon64(const int8_t *a, const int8_t *b, int8_t *dst, int row8, int col8, int deep4,
|
||||
// const int *a_sums, const int *bias, int act_min, int act_max, int out_zp,
|
||||
// int *multiplier, int *left_shift, int *right_shift, int row, int col, int stride, int peroc);
|
||||
// const int *multiplier, const int *left_shift, const int *right_shift, int row,
|
||||
// int col, int stride, int peroc);
|
||||
|
||||
// x0: a(left matrix ptr)
|
||||
// x1: b(right matrix ptr)
|
||||
|
|
|
@ -4,8 +4,9 @@
|
|||
.align 5
|
||||
|
||||
//void MatmulInt8DpOpt(const int8_t *a, const int8_t *b, int8_t *dst, int row, int col, int deep4, const int *a_sums,
|
||||
// const int *bias, int act_min, int act_max, int out_zp, int32_t *multiplier, int32_t *left_shift,
|
||||
// int32_t *right_shift, size_t stride, size_t filter_peroc, int32_t *filter_zp)
|
||||
// const int *bias, int act_min, int act_max, int out_zp, const int32_t *multiplier,
|
||||
// const int32_t *left_shift, const int32_t *right_shift, size_t stride, size_t filter_peroc,
|
||||
// const int32_t *filter_zp)
|
||||
|
||||
// x0: a(left matrix ptr)
|
||||
// x1: b(right matrix ptr)
|
||||
|
|
|
@ -23,19 +23,19 @@ void BatchToSpaceNoCropForNHWC(const void *input, void *output, const int *in_sh
|
|||
int in_h = in_shape[1];
|
||||
int in_w = in_shape[2];
|
||||
int in_c = in_shape[3];
|
||||
size_t stride_h = block_w * out_n;
|
||||
size_t output_offset = 0;
|
||||
size_t copy_size = in_c * data_size;
|
||||
size_t in_stride_h = in_w * in_c;
|
||||
size_t in_stride_n = in_stride_h * in_h;
|
||||
int stride_h = block_w * out_n;
|
||||
int output_offset = 0;
|
||||
int copy_size = in_c * data_size;
|
||||
int in_stride_h = in_w * in_c;
|
||||
int in_stride_n = in_stride_h * in_h;
|
||||
for (int n = 0; n < out_n; ++n) {
|
||||
for (int h = 0; h < in_h; ++h) {
|
||||
size_t h_offset = h * in_stride_h;
|
||||
int h_offset = h * in_stride_h;
|
||||
for (int bh = 0; bh < block_h; ++bh) {
|
||||
for (int w = 0; w < in_w; ++w) {
|
||||
size_t w_offset = w * in_c;
|
||||
int w_offset = w * in_c;
|
||||
for (int bw = 0; bw < block_w; ++bw) {
|
||||
size_t in_offset = in_stride_n * (bh * stride_h + bw * out_n + n) + w_offset + h_offset;
|
||||
int in_offset = in_stride_n * (bh * stride_h + bw * out_n + n) + w_offset + h_offset;
|
||||
memcpy((int8_t *)output + output_offset, (int8_t *)input + in_offset * data_size, copy_size);
|
||||
output_offset += copy_size;
|
||||
}
|
||||
|
@ -49,6 +49,9 @@ void BatchToSpaceForNHWC(const void *input, void *output, const int *in_shape, i
|
|||
const int *crops, int data_size) {
|
||||
int block_h = block[0];
|
||||
int block_w = block[1];
|
||||
if (block_h == 0 || block_w == 0) {
|
||||
return;
|
||||
}
|
||||
int in_h = in_shape[1];
|
||||
int in_w = in_shape[2];
|
||||
int in_c = in_shape[3];
|
||||
|
@ -61,27 +64,27 @@ void BatchToSpaceForNHWC(const void *input, void *output, const int *in_shape, i
|
|||
int w_end = MSMIN((in_w * block_w - crops[3]) / block_w + 1, in_w);
|
||||
int w_valid_end = in_w * block_w - crops[3] - 1;
|
||||
|
||||
size_t stride_h = block_w * out_n;
|
||||
size_t output_offset = 0;
|
||||
size_t copy_size = in_c * data_size;
|
||||
size_t in_stride_h = in_w * in_c;
|
||||
size_t in_stride_n = in_stride_h * in_h;
|
||||
int stride_h = block_w * out_n;
|
||||
int output_offset = 0;
|
||||
int copy_size = in_c * data_size;
|
||||
int in_stride_h = in_w * in_c;
|
||||
int in_stride_n = in_stride_h * in_h;
|
||||
for (int n = 0; n < out_n; ++n) {
|
||||
for (int h = h_start; h < h_end; ++h) {
|
||||
size_t h_offset = h * in_stride_h;
|
||||
int h_offset = h * in_stride_h;
|
||||
for (int bh = 0; bh < block_h; ++bh) {
|
||||
size_t h_index = h * block_h + bh;
|
||||
int h_index = h * block_h + bh;
|
||||
if (h_index < h_valid_begin || h_index > h_valid_end) {
|
||||
continue;
|
||||
}
|
||||
for (int w = w_start; w < w_end; ++w) {
|
||||
size_t w_offset = w * in_c;
|
||||
int w_offset = w * in_c;
|
||||
for (int bw = 0; bw < block_w; ++bw) {
|
||||
size_t w_index = w * block_w + bw;
|
||||
int w_index = w * block_w + bw;
|
||||
if (w_index < w_valid_begin || w_index > w_valid_end) {
|
||||
continue;
|
||||
}
|
||||
size_t in_offset = in_stride_n * (bh * stride_h + bw * out_n + n) + w_offset + h_offset;
|
||||
int in_offset = in_stride_n * (bh * stride_h + bw * out_n + n) + w_offset + h_offset;
|
||||
memcpy((int8_t *)output + output_offset, (int8_t *)input + in_offset * data_size, copy_size);
|
||||
output_offset += copy_size;
|
||||
}
|
||||
|
|
|
@ -62,7 +62,7 @@ void pad_input_shape(int *input_shape, int input_shape_len, int output_shape_len
|
|||
shape_info->input_shape_size_ = dim_max + 1; \
|
||||
\
|
||||
size_t before_dim_elements_num = accumulate(input_shape, 0, dim_max - 1); \
|
||||
size_t after_dim_elements_num = input_shape[dim_max]; \
|
||||
size_t after_dim_elements_num = (size_t)(input_shape[dim_max]); \
|
||||
size_t dim_broadcast_rate = (size_t)(output_shape[dim_max] / input_shape[dim_max]); \
|
||||
for (size_t i = 0; i < before_dim_elements_num; ++i) { \
|
||||
const type *in_ptr = input + i * after_dim_elements_num; \
|
||||
|
|
|
@ -24,15 +24,18 @@ void Concat(void **input, int input_num, int axis, int **inputs_output_shape, si
|
|||
}
|
||||
|
||||
int after_axis_size = data_size;
|
||||
for (size_t i = axis + 1; i < shape_size; ++i) {
|
||||
for (size_t i = (size_t)(axis) + 1; i < shape_size; ++i) {
|
||||
after_axis_size *= inputs_output_shape[0][i];
|
||||
}
|
||||
int axis_offset = 0;
|
||||
uint8_t *dst_base = (output);
|
||||
size_t output_stride = after_axis_size * inputs_output_shape[input_num][axis];
|
||||
int output_stride = after_axis_size * inputs_output_shape[input_num][axis];
|
||||
for (int i = 0; i < input_num; ++i) {
|
||||
const uint8_t *src_base = (input[i]);
|
||||
size_t input_stride = after_axis_size * inputs_output_shape[i][axis];
|
||||
if (inputs_output_shape[i] == NULL) {
|
||||
continue;
|
||||
}
|
||||
int input_stride = after_axis_size * inputs_output_shape[i][axis];
|
||||
int offset = UP_DIV(input_stride, thread_num);
|
||||
int count = input_stride - offset * task_id;
|
||||
if (count <= 0) {
|
||||
|
|
|
@ -22,17 +22,17 @@ void DepthToSpaceForNHWC(const void *input, void *output, const int *in_shape, c
|
|||
int32_t in_shape_dim1 = in_shape[1];
|
||||
size_t copy_size = block_size * param->out_stride_dim2_ * param->data_type_size_;
|
||||
for (int i = 0; i < in_shape[0]; ++i) {
|
||||
size_t in_offset_n = i * param->in_stride_dim0_;
|
||||
size_t out_offset_n = i * param->out_stride_dim0_;
|
||||
int in_offset_n = i * param->in_stride_dim0_;
|
||||
int out_offset_n = i * param->out_stride_dim0_;
|
||||
for (int j = 0; j < in_shape_dim1; ++j) {
|
||||
size_t in_offset_h = in_offset_n + j * param->in_stride_dim1_;
|
||||
size_t out_offset_h = out_offset_n + j * block_size * param->out_stride_dim1_;
|
||||
int in_offset_h = in_offset_n + j * param->in_stride_dim1_;
|
||||
int out_offset_h = out_offset_n + j * block_size * param->out_stride_dim1_;
|
||||
for (int k = 0; k < in_shape_dim2; ++k) {
|
||||
size_t in_offset_w = in_offset_h + k * param->in_stride_dim2_;
|
||||
size_t out_offset_w = out_offset_h + k * block_size * param->out_stride_dim2_;
|
||||
int in_offset_w = in_offset_h + k * param->in_stride_dim2_;
|
||||
int out_offset_w = out_offset_h + k * block_size * param->out_stride_dim2_;
|
||||
for (int l = 0; l < block_size; ++l) {
|
||||
size_t out_offset = (out_offset_w + l * param->out_stride_dim1_) * param->data_type_size_;
|
||||
size_t in_offset = (in_offset_w + l * block_size * param->out_stride_dim2_) * param->data_type_size_;
|
||||
int out_offset = (out_offset_w + l * param->out_stride_dim1_) * param->data_type_size_;
|
||||
int in_offset = (in_offset_w + l * block_size * param->out_stride_dim2_) * param->data_type_size_;
|
||||
memcpy((int8_t *)output + out_offset, (int8_t *)input + in_offset, copy_size);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -118,7 +118,9 @@ int B(const float *poly_array, float *matrix_b, int in_unit) {
|
|||
float matrix_t[MAX_LEN]; // n * in_unit
|
||||
|
||||
T(poly_array, matrix_t, n);
|
||||
LT(poly_array, matrix_lt, n);
|
||||
if (LT(poly_array, matrix_lt, n) != NNACL_OK) {
|
||||
return NNACL_ERR;
|
||||
}
|
||||
MatrixTranspose(matrix_lt, matrix_l, n, n);
|
||||
MatrixMultiply(matrix_l, matrix_t, matrix_b, n, n, in_unit);
|
||||
matrix_b[in_unit * in_unit - 1] = 1;
|
||||
|
|
|
@ -47,43 +47,43 @@ void DoSlice(const void *input, void *output, SliceParameter *param, int thread_
|
|||
int8_t *int8_in = (int8_t *)input;
|
||||
int8_t *int8_out = (int8_t *)output;
|
||||
|
||||
size_t out_stride[8];
|
||||
int out_stride[8];
|
||||
out_stride[7] = 1;
|
||||
for (int i = 6; i >= 0; --i) {
|
||||
out_stride[i] = out_stride[i + 1] * param->size_[i + 1];
|
||||
}
|
||||
|
||||
size_t count_per_thread = UP_DIV(param->size_[5], param->op_parameter_.thread_num_);
|
||||
size_t thread_begin = thread_id * count_per_thread;
|
||||
size_t thread_end = MSMIN(param->size_[5], thread_begin + count_per_thread);
|
||||
size_t copy_size = param->size_[7] * data_size;
|
||||
size_t in_stride[8];
|
||||
int count_per_thread = UP_DIV(param->size_[5], param->op_parameter_.thread_num_);
|
||||
int thread_begin = thread_id * count_per_thread;
|
||||
int thread_end = MSMIN(param->size_[5], thread_begin + count_per_thread);
|
||||
int copy_size = param->size_[7] * data_size;
|
||||
int in_stride[8];
|
||||
in_stride[7] = 1;
|
||||
for (int i = 6; i >= 0; --i) {
|
||||
in_stride[i] = param->shape_[i + 1] * in_stride[i + 1];
|
||||
}
|
||||
|
||||
for (int ii = 0; ii < param->size_[0]; ++ii) {
|
||||
size_t out_offset0 = ii * out_stride[0];
|
||||
size_t in_offset0 = (ii + param->begin_[0]) * in_stride[0] + param->begin_[7];
|
||||
int out_offset0 = ii * out_stride[0];
|
||||
int in_offset0 = (ii + param->begin_[0]) * in_stride[0] + param->begin_[7];
|
||||
for (int jj = 0; jj < param->size_[1]; ++jj) {
|
||||
size_t out_offset1 = jj * out_stride[1] + out_offset0;
|
||||
size_t in_offset1 = (jj + param->begin_[1]) * in_stride[1] + in_offset0;
|
||||
int out_offset1 = jj * out_stride[1] + out_offset0;
|
||||
int in_offset1 = (jj + param->begin_[1]) * in_stride[1] + in_offset0;
|
||||
for (int kk = 0; kk < param->size_[2]; ++kk) {
|
||||
size_t out_offset2 = kk * out_stride[2] + out_offset1;
|
||||
size_t in_offset2 = (kk + param->begin_[2]) * in_stride[2] + in_offset1;
|
||||
int out_offset2 = kk * out_stride[2] + out_offset1;
|
||||
int in_offset2 = (kk + param->begin_[2]) * in_stride[2] + in_offset1;
|
||||
for (int ll = 0; ll < param->size_[3]; ++ll) {
|
||||
size_t out_offset3 = ll * out_stride[3] + out_offset2;
|
||||
size_t in_offset3 = (ll + param->begin_[3]) * in_stride[3] + in_offset2;
|
||||
int out_offset3 = ll * out_stride[3] + out_offset2;
|
||||
int in_offset3 = (ll + param->begin_[3]) * in_stride[3] + in_offset2;
|
||||
for (int i = 0; i < param->size_[4]; ++i) {
|
||||
size_t out_offset4 = i * out_stride[4] + out_offset3;
|
||||
size_t in_offset4 = (i + param->begin_[4]) * in_stride[4] + in_offset3;
|
||||
for (size_t j = thread_begin; j < thread_end; ++j) {
|
||||
size_t out_offset5 = j * out_stride[5] + out_offset4;
|
||||
size_t in_offset5 = (j + param->begin_[5]) * in_stride[5] + in_offset4;
|
||||
int out_offset4 = i * out_stride[4] + out_offset3;
|
||||
int in_offset4 = (i + param->begin_[4]) * in_stride[4] + in_offset3;
|
||||
for (int j = thread_begin; j < thread_end; ++j) {
|
||||
int out_offset5 = j * out_stride[5] + out_offset4;
|
||||
int in_offset5 = (j + param->begin_[5]) * in_stride[5] + in_offset4;
|
||||
for (int k = 0; k < param->size_[6]; ++k) {
|
||||
size_t out_offset6 = k * out_stride[6] + out_offset5;
|
||||
size_t in_offset6 = (k + param->begin_[6]) * in_stride[6] + in_offset5;
|
||||
int out_offset6 = k * out_stride[6] + out_offset5;
|
||||
int in_offset6 = (k + param->begin_[6]) * in_stride[6] + in_offset5;
|
||||
memcpy(int8_out + out_offset6 * data_size, int8_in + in_offset6 * data_size, copy_size);
|
||||
}
|
||||
}
|
||||
|
@ -105,8 +105,8 @@ void DoSliceNoParallel(const void *input, void *output, SliceParameter *param, i
|
|||
int8_t *int8_in = (int8_t *)input;
|
||||
int8_t *int8_out = (int8_t *)output;
|
||||
|
||||
size_t copy_size = param->size_[7] * data_size;
|
||||
size_t in_stride[8];
|
||||
int copy_size = param->size_[7] * data_size;
|
||||
int in_stride[8];
|
||||
in_stride[7] = 1;
|
||||
for (int i = 6; i >= 0; --i) {
|
||||
in_stride[i] = param->shape_[i + 1] * in_stride[i + 1];
|
||||
|
@ -115,9 +115,9 @@ void DoSliceNoParallel(const void *input, void *output, SliceParameter *param, i
|
|||
for (int i = 0; i < DIMENSION_8D; ++i) {
|
||||
axis_copy_flag[i] = WhetherCopyByAxis(param->begin_, param->end_, param->shape_, i);
|
||||
}
|
||||
size_t out_offset = 0;
|
||||
int out_offset = 0;
|
||||
for (int32_t dim0 = param->begin_[0]; dim0 < param->end_[0]; ++dim0) {
|
||||
size_t in_offset0 = dim0 * in_stride[0] + param->begin_[7];
|
||||
int in_offset0 = dim0 * in_stride[0] + param->begin_[7];
|
||||
#define FAST_COPY_IF_NEED(rank) \
|
||||
if (axis_copy_flag[rank]) { \
|
||||
int left_block_num = param->end_[rank] - dim##rank; \
|
||||
|
@ -128,24 +128,24 @@ void DoSliceNoParallel(const void *input, void *output, SliceParameter *param, i
|
|||
continue; \
|
||||
}
|
||||
FAST_COPY_IF_NEED(0);
|
||||
for (size_t dim1 = param->begin_[1]; dim1 < param->end_[1]; ++dim1) {
|
||||
size_t in_offset1 = dim1 * in_stride[1] + in_offset0;
|
||||
for (int dim1 = param->begin_[1]; dim1 < param->end_[1]; ++dim1) {
|
||||
int in_offset1 = dim1 * in_stride[1] + in_offset0;
|
||||
FAST_COPY_IF_NEED(1);
|
||||
for (int32_t dim2 = param->begin_[2]; dim2 < param->end_[2]; ++dim2) {
|
||||
size_t in_offset2 = in_offset1 + dim2 * in_stride[2];
|
||||
int in_offset2 = in_offset1 + dim2 * in_stride[2];
|
||||
FAST_COPY_IF_NEED(2);
|
||||
for (int32_t dim3 = param->begin_[3]; dim3 < param->end_[3]; ++dim3) {
|
||||
size_t in_offset3 = in_offset2 + dim3 * in_stride[3];
|
||||
int in_offset3 = in_offset2 + dim3 * in_stride[3];
|
||||
FAST_COPY_IF_NEED(3);
|
||||
for (int32_t dim4 = param->begin_[4]; dim4 < param->end_[4]; ++dim4) {
|
||||
size_t in_offset4 = in_offset3 + dim4 * in_stride[4];
|
||||
int in_offset4 = in_offset3 + dim4 * in_stride[4];
|
||||
FAST_COPY_IF_NEED(4);
|
||||
for (int32_t dim5 = param->begin_[5]; dim5 < param->end_[5]; ++dim5) {
|
||||
size_t in_offset5 = in_offset4 + dim5 * in_stride[5];
|
||||
int in_offset5 = in_offset4 + dim5 * in_stride[5];
|
||||
FAST_COPY_IF_NEED(5);
|
||||
#undef FAST_COPY_IF_NEED
|
||||
for (int32_t dim6 = param->begin_[6]; dim6 < param->end_[6]; ++dim6) {
|
||||
size_t in_offset6 = in_offset5 + dim6 * in_stride[6];
|
||||
int in_offset6 = in_offset5 + dim6 * in_stride[6];
|
||||
memcpy(int8_out + out_offset * data_size, int8_in + in_offset6 * data_size, copy_size);
|
||||
out_offset += param->size_[7];
|
||||
}
|
||||
|
|
|
@ -21,10 +21,6 @@
|
|||
|
||||
int DoSplit(void *in_data, void **out_data, const int *input_shape, int offset, int num_unit,
|
||||
SplitParameter *split_param, int data_size) {
|
||||
if (in_data == NULL || out_data == NULL) {
|
||||
return NNACL_ERR;
|
||||
}
|
||||
|
||||
int8_t *int8_in = (int8_t *)in_data;
|
||||
|
||||
int num_split = split_param->num_split_;
|
||||
|
|
|
@ -26,15 +26,15 @@ void DoCopyData(const uint8_t *input_data, uint8_t *output_data, size_t size, si
|
|||
}
|
||||
|
||||
int DoTileOneDimension(uint8_t *input_data, uint8_t *output_data, size_t dim, const TileParameter *parameter) {
|
||||
size_t src_dim_size = parameter->in_shape_[dim];
|
||||
int src_dim_size = parameter->in_shape_[dim];
|
||||
if (dim == parameter->in_dim_ - 1) {
|
||||
DoCopyData(input_data, output_data, src_dim_size, parameter->data_size_, parameter->multiples_[dim]);
|
||||
return 0;
|
||||
}
|
||||
for (size_t i = 0; i < src_dim_size; ++i) {
|
||||
for (size_t j = 0; j < parameter->multiples_[dim]; ++j) {
|
||||
size_t in_pos = parameter->in_strides_[dim] * i;
|
||||
size_t out_pos = parameter->out_strides_[dim] * (i + j * src_dim_size);
|
||||
for (int i = 0; i < src_dim_size; ++i) {
|
||||
for (int j = 0; j < parameter->multiples_[dim]; ++j) {
|
||||
int in_pos = parameter->in_strides_[dim] * i;
|
||||
int out_pos = parameter->out_strides_[dim] * (i + j * src_dim_size);
|
||||
DoTileOneDimension(input_data + in_pos * parameter->data_size_, output_data + out_pos * parameter->data_size_,
|
||||
dim + 1, parameter);
|
||||
}
|
||||
|
|
|
@ -18,20 +18,20 @@
|
|||
#define MINDSPORE_NNACL_BASE_TILE_H_
|
||||
|
||||
#include "nnacl/op_base.h"
|
||||
|
||||
#define MAX_TILE_DIM_SIZE 8
|
||||
typedef struct TileParameter {
|
||||
// primitive parameter
|
||||
OpParameter op_parameter_;
|
||||
int multiples_[8];
|
||||
int dims_[8];
|
||||
int multiples_[MAX_TILE_DIM_SIZE];
|
||||
int dims_[MAX_TILE_DIM_SIZE];
|
||||
size_t dims_size_;
|
||||
size_t multiples_size_;
|
||||
|
||||
// shape correlative
|
||||
int in_shape_[8];
|
||||
int out_shape_[8];
|
||||
int in_strides_[8];
|
||||
int out_strides_[8];
|
||||
int in_shape_[MAX_TILE_DIM_SIZE];
|
||||
int out_shape_[MAX_TILE_DIM_SIZE];
|
||||
int in_strides_[MAX_TILE_DIM_SIZE];
|
||||
int out_strides_[MAX_TILE_DIM_SIZE];
|
||||
|
||||
// other parameter
|
||||
int in_dim_;
|
||||
|
|
|
@ -184,7 +184,7 @@
|
|||
for (int i = dims - 1; i > 0; --i) { \
|
||||
*(size + i - 1) = *(size + i) * output_shape[i]; \
|
||||
} \
|
||||
for (size_t idx = 0; idx < (*size) * output_shape[0]; ++idx) { \
|
||||
for (int idx = 0; idx < (*size) * output_shape[0]; ++idx) { \
|
||||
int pos = idx; \
|
||||
int output_idx = 0; \
|
||||
int input_idx = 0; \
|
||||
|
@ -215,7 +215,7 @@
|
|||
return; \
|
||||
} \
|
||||
count = MSMIN(offset_size, count); \
|
||||
for (size_t idx = task_offset; idx < task_offset + count; ++idx) { \
|
||||
for (int idx = task_offset; idx < task_offset + count; ++idx) { \
|
||||
int pos = idx; \
|
||||
int output_idx = 0; \
|
||||
int input_idx = 0; \
|
||||
|
|
|
@ -16,15 +16,19 @@
|
|||
|
||||
#include "nnacl/common_func.h"
|
||||
|
||||
int offset(const int *shape, const int dim0, const int dim1, const int dim2, const int dim3) {
|
||||
int Offset(const int *shape, const int dim0, const int dim1, const int dim2, const int dim3) {
|
||||
return ((dim0 * shape[1] + dim1) * shape[2] + dim2) * shape[3] + dim3;
|
||||
}
|
||||
|
||||
int offsetComm(const int *shape, const int dim0, const int dim1, const int dim2) {
|
||||
int OffsetComm(const int *shape, const int dim0, const int dim1, const int dim2) {
|
||||
return ((dim0 * shape[1] + dim1) * shape[2] + dim2) * shape[3];
|
||||
}
|
||||
|
||||
int offset4d(const int *shape, const int *dims) { return offset(shape, dims[0], dims[1], dims[2], dims[3]); }
|
||||
int Offset4d(const int *shape, const int *dims) { return Offset(shape, dims[0], dims[1], dims[2], dims[3]); }
|
||||
|
||||
int Offset6d(const int *shape, const int *dims) {
|
||||
return ((OffsetComm(shape, dims[0], dims[1], dims[2]) + dims[3]) * shape[4] + dims[4]) * shape[5];
|
||||
}
|
||||
|
||||
int8_t MinInt8(int8_t a, int8_t b) { return b ^ ((a ^ b) & -(a < b)); }
|
||||
|
||||
|
|
|
@ -36,9 +36,10 @@ void ReluFp32C8(float *data, float *dst, int ele_num);
|
|||
void Relu6Fp32C8(float *data, float *dst, int ele_num);
|
||||
#endif
|
||||
#endif
|
||||
int offset(const int *shape, const int dim0, const int dim1, const int dim2, const int dim3);
|
||||
int offsetComm(const int *shape, const int dim0, const int dim1, const int dim2);
|
||||
int offset4d(const int *shape, const int *dims);
|
||||
int Offset(const int *shape, const int dim0, const int dim1, const int dim2, const int dim3);
|
||||
int OffsetComm(const int *shape, const int dim0, const int dim1, const int dim2);
|
||||
int Offset4d(const int *shape, const int *dims);
|
||||
int Offset6d(const int *shape, const int *dims);
|
||||
|
||||
static inline bool isAddOverflow(int32_t x, int32_t y) {
|
||||
int32_t sum = x + y;
|
||||
|
|
|
@ -19,16 +19,22 @@
|
|||
|
||||
void PadFp16(const float16_t *input_data, float16_t *output_data, const int *input_shape, const int *output_shape,
|
||||
const int *paddings, const int tid, const int thread_num) {
|
||||
int in[4], out[4];
|
||||
int in[DEFAULT_PAD_NDIMS], out[DEFAULT_PAD_NDIMS];
|
||||
for (in[0] = 0; in[0] < input_shape[0]; in[0]++) {
|
||||
out[0] = in[0] + paddings[0];
|
||||
for (in[1] = tid; in[1] < input_shape[1]; in[1] += thread_num) {
|
||||
out[1] = in[1] + paddings[2];
|
||||
for (in[2] = 0; in[2] < input_shape[2]; in[2]++) {
|
||||
out[2] = in[2] + paddings[4];
|
||||
float16_t *dst = output_data + offset(output_shape, out[0], out[1], out[2], paddings[6]);
|
||||
const float16_t *src = input_data + offset(input_shape, in[0], in[1], in[2], 0);
|
||||
memcpy(dst, src, input_shape[3] * sizeof(float16_t));
|
||||
for (in[3] = 0; in[3] < input_shape[3]; in[3]++) {
|
||||
out[3] = in[3] + paddings[6];
|
||||
for (in[4] = 0; in[4] < input_shape[4]; in[4]++) {
|
||||
out[4] = in[4] + paddings[8];
|
||||
float16_t *dst = output_data + Offset6d(output_shape, out) + paddings[10];
|
||||
const float16_t *src = input_data + Offset6d(input_shape, in);
|
||||
memcpy(dst, src, input_shape[5] * sizeof(float16_t));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -152,17 +152,15 @@ int AdamDeltaFp32(float *delta, float *m, float *v, float lr, float beta1, float
|
|||
return NNACL_OK;
|
||||
}
|
||||
|
||||
int AdamWeightDecayFp32(float *var, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay,
|
||||
const float *gradient, size_t start, size_t end) {
|
||||
size_t AdamWeightDecayFp32(float *var, float *m, float *v, float lr, float beta1, float beta2, float epsilon,
|
||||
float decay, const float *gradient, size_t start, size_t end) {
|
||||
size_t c1 = start;
|
||||
#ifdef ENABLE_AVX512
|
||||
const float beta1_minus = 1 - beta1;
|
||||
const float beta2_minus = 1 - beta2;
|
||||
struct AVX_Data beta1_r, beta2_r, beta1_minus_r, beta2_minus_r, lr_neg_r, epsilon_r, decay_r;
|
||||
beta1_r.data = _mm512_set1_ps(beta1);
|
||||
beta2_r.data = _mm512_set1_ps(beta2);
|
||||
beta1_minus_r.data = _mm512_set1_ps(beta1_minus);
|
||||
beta2_minus_r.data = _mm512_set1_ps(beta2_minus);
|
||||
beta1_minus_r.data = _mm512_set1_ps(1.0f - beta1);
|
||||
beta2_minus_r.data = _mm512_set1_ps(1.0f - beta2);
|
||||
lr_neg_r.data = _mm512_set1_ps(-lr);
|
||||
epsilon_r.data = _mm512_set1_ps(epsilon);
|
||||
decay_r.data = _mm512_set1_ps(decay);
|
||||
|
@ -260,17 +258,15 @@ int AdamWeightDecayFp32(float *var, float *m, float *v, float lr, float beta1, f
|
|||
return c1;
|
||||
}
|
||||
|
||||
int FusedAdamFp32(float *var, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay,
|
||||
const int16_t *gradient16, size_t start, size_t end) {
|
||||
size_t FusedAdamFp32(float *var, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay,
|
||||
const int16_t *gradient16, size_t start, size_t end) {
|
||||
size_t c1 = start;
|
||||
#ifdef ENABLE_AVX512
|
||||
const float beta1_minus = 1 - beta1;
|
||||
const float beta2_minus = 1 - beta2;
|
||||
struct AVX_Data beta1_r, beta2_r, beta1_minus_r, beta2_minus_r, lr_neg_r, epsilon_r, decay_r;
|
||||
beta1_r.data = _mm512_set1_ps(beta1);
|
||||
beta2_r.data = _mm512_set1_ps(beta2);
|
||||
beta1_minus_r.data = _mm512_set1_ps(beta1_minus);
|
||||
beta2_minus_r.data = _mm512_set1_ps(beta2_minus);
|
||||
beta1_minus_r.data = _mm512_set1_ps(1.0f - beta1);
|
||||
beta2_minus_r.data = _mm512_set1_ps(1.0f - beta2);
|
||||
lr_neg_r.data = _mm512_set1_ps(-lr);
|
||||
epsilon_r.data = _mm512_set1_ps(epsilon);
|
||||
decay_r.data = _mm512_set1_ps(decay);
|
||||
|
|
|
@ -71,10 +71,10 @@ int AdamFp32(float *var, float *m, float *v, float lr, float beta1, float beta2,
|
|||
size_t start, size_t end, bool use_nesterov);
|
||||
int AdamDeltaFp32(float *delta, float *m, float *v, float lr, float beta1, float beta2, float epsilon,
|
||||
const float *gradient, size_t start, size_t end, bool use_nesterov);
|
||||
int AdamWeightDecayFp32(float *var, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay,
|
||||
const float *gradient, size_t start, size_t end);
|
||||
int FusedAdamFp32(float *var, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay,
|
||||
const int16_t *gradient16, size_t start, size_t end);
|
||||
size_t AdamWeightDecayFp32(float *var, float *m, float *v, float lr, float beta1, float beta2, float epsilon,
|
||||
float decay, const float *gradient, size_t start, size_t end);
|
||||
size_t FusedAdamFp32(float *var, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay,
|
||||
const int16_t *gradient16, size_t start, size_t end);
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -49,8 +49,8 @@ void ArgMaxTopK1(const float *input, void *output, float *output_value, const Ar
|
|||
float *outputfp32 = (float *)output;
|
||||
int *outputint = (int *)output;
|
||||
for (int i = 0; i < pre_axis_count; ++i) {
|
||||
size_t output_offset = i * after_axis_count;
|
||||
size_t input_offset = output_offset * axis_count;
|
||||
int output_offset = i * after_axis_count;
|
||||
int input_offset = output_offset * axis_count;
|
||||
for (int j = 0; j < after_axis_count; ++j) {
|
||||
float value = -FLT_MAX;
|
||||
int index = 0;
|
||||
|
@ -79,8 +79,8 @@ void ArgMinTopK1(const float *input, void *output, float *output_value, const Ar
|
|||
float *outputfp32 = (float *)output;
|
||||
int *outputint = (int *)output;
|
||||
for (int i = 0; i < pre_axis_count; ++i) {
|
||||
size_t output_offset = i * after_axis_count;
|
||||
size_t input_offset = output_offset * axis_count;
|
||||
int output_offset = i * after_axis_count;
|
||||
int input_offset = output_offset * axis_count;
|
||||
for (int j = 0; j < after_axis_count; ++j) {
|
||||
float value = FLT_MAX;
|
||||
int index = 0;
|
||||
|
@ -109,13 +109,13 @@ void ArgMinMaxDim0(const float *input, void *output, float *output_value, const
|
|||
int *outputint = (int *)output;
|
||||
for (int32_t i = 0; i < param->in_strides_[0]; ++i) {
|
||||
for (int j = 0; j < in_shape[0]; ++j) {
|
||||
size_t offset = param->in_strides_[0] * j + i;
|
||||
int offset = param->in_strides_[0] * j + i;
|
||||
param->arg_elements_[j].index_ = j;
|
||||
param->arg_elements_[j].data_.f_data_ = input[offset];
|
||||
}
|
||||
qsort(param->arg_elements_, in_shape[0], sizeof(ArgElement), *compare_func);
|
||||
for (int j = 0; j < param->topk_; ++j) {
|
||||
size_t out_offset = j * param->out_strides_[0] + i;
|
||||
int out_offset = j * param->out_strides_[0] + i;
|
||||
if (param->out_value_) {
|
||||
outputfp32[out_offset] = param->arg_elements_[j].data_.f_data_;
|
||||
} else {
|
||||
|
@ -135,17 +135,17 @@ void ArgMinMaxDim1(const float *input, void *output, float *output_value, const
|
|||
int *outputint = (int *)output;
|
||||
int in_shape1 = in_shape[1];
|
||||
for (int i = 0; i < in_shape[0]; ++i) {
|
||||
size_t in_dim0_offset = i * param->in_strides_[0];
|
||||
size_t out_dim0_offset = i * param->out_strides_[0];
|
||||
int in_dim0_offset = i * param->in_strides_[0];
|
||||
int out_dim0_offset = i * param->out_strides_[0];
|
||||
for (int j = 0; j < param->in_strides_[1]; ++j) {
|
||||
for (int k = 0; k < in_shape1; ++k) {
|
||||
size_t offset = param->in_strides_[1] * k + in_dim0_offset + j;
|
||||
int offset = param->in_strides_[1] * k + in_dim0_offset + j;
|
||||
param->arg_elements_[k].index_ = k;
|
||||
param->arg_elements_[k].data_.f_data_ = input[offset];
|
||||
}
|
||||
qsort(param->arg_elements_, in_shape1, sizeof(ArgElement), *compare_func);
|
||||
for (int k = 0; k < param->topk_; ++k) {
|
||||
size_t out_offset = out_dim0_offset + j + k * param->out_strides_[1];
|
||||
int out_offset = out_dim0_offset + j + k * param->out_strides_[1];
|
||||
if (param->out_value_) {
|
||||
outputfp32[out_offset] = param->arg_elements_[k].data_.f_data_;
|
||||
} else {
|
||||
|
@ -167,20 +167,20 @@ void ArgMinMaxDim2(const float *input, void *output, float *output_value, const
|
|||
float *outputfp32 = (float *)output;
|
||||
int *outputint = (int *)output;
|
||||
for (int i = 0; i < in_shape[0]; ++i) {
|
||||
size_t in_dim0_offset = i * param->in_strides_[0];
|
||||
size_t out_dim0_offset = i * param->out_strides_[0];
|
||||
int in_dim0_offset = i * param->in_strides_[0];
|
||||
int out_dim0_offset = i * param->out_strides_[0];
|
||||
for (int j = 0; j < in_shape1; ++j) {
|
||||
size_t in_dim1_offset = j * param->in_strides_[1] + in_dim0_offset;
|
||||
size_t out_dim1_offset = j * param->out_strides_[1] + out_dim0_offset;
|
||||
int in_dim1_offset = j * param->in_strides_[1] + in_dim0_offset;
|
||||
int out_dim1_offset = j * param->out_strides_[1] + out_dim0_offset;
|
||||
for (int k = 0; k < param->in_strides_[2]; ++k) {
|
||||
for (int l = 0; l < in_shape2; ++l) {
|
||||
size_t offset = param->in_strides_[2] * l + k + in_dim1_offset;
|
||||
int offset = param->in_strides_[2] * l + k + in_dim1_offset;
|
||||
param->arg_elements_[l].index_ = l;
|
||||
param->arg_elements_[l].data_.f_data_ = input[offset];
|
||||
}
|
||||
qsort(param->arg_elements_, in_shape2, sizeof(ArgElement), *compare_func);
|
||||
for (int l = 0; l < param->topk_; ++l) {
|
||||
size_t out_offset = out_dim1_offset + k + l * param->out_strides_[2];
|
||||
int out_offset = out_dim1_offset + k + l * param->out_strides_[2];
|
||||
if (param->out_value_) {
|
||||
outputfp32[out_offset] = param->arg_elements_[l].data_.f_data_;
|
||||
} else {
|
||||
|
@ -203,26 +203,26 @@ void ArgMinMaxDim3(const float *input, void *output, float *output_value, const
|
|||
float *outputfp32 = (float *)output;
|
||||
int *outputint = (int *)output;
|
||||
for (int i = 0; i < in_shape[0]; ++i) {
|
||||
size_t in_dim0_offset = i * param->in_strides_[0];
|
||||
size_t out_dim0_offset = i * param->out_strides_[0];
|
||||
int in_dim0_offset = i * param->in_strides_[0];
|
||||
int out_dim0_offset = i * param->out_strides_[0];
|
||||
for (int j = 0; j < in_shape1; ++j) {
|
||||
size_t in_dim1_offset = j * param->in_strides_[1] + in_dim0_offset;
|
||||
size_t out_dim1_offset = j * param->out_strides_[1] + out_dim0_offset;
|
||||
int in_dim1_offset = j * param->in_strides_[1] + in_dim0_offset;
|
||||
int out_dim1_offset = j * param->out_strides_[1] + out_dim0_offset;
|
||||
for (int k = 0; k < in_shape2; ++k) {
|
||||
size_t in_dim2_offset = k * param->in_strides_[2] + in_dim1_offset;
|
||||
size_t out_dim2_offset = k * param->out_strides_[2] + out_dim1_offset;
|
||||
int in_dim2_offset = k * param->in_strides_[2] + in_dim1_offset;
|
||||
int out_dim2_offset = k * param->out_strides_[2] + out_dim1_offset;
|
||||
for (int l = 0; l < in_shape3; ++l) {
|
||||
size_t offset = l + in_dim2_offset;
|
||||
int offset = l + in_dim2_offset;
|
||||
param->arg_elements_[l].index_ = l;
|
||||
param->arg_elements_[l].data_.f_data_ = input[offset];
|
||||
}
|
||||
qsort(param->arg_elements_, in_shape3, sizeof(ArgElement), *compare_func);
|
||||
for (int l = 0; l < param->topk_; ++l) {
|
||||
size_t out_offset = out_dim2_offset + l;
|
||||
int out_offset = out_dim2_offset + l;
|
||||
if (param->out_value_) {
|
||||
outputfp32[out_offset] = param->arg_elements_[l].data_.f_data_;
|
||||
} else {
|
||||
outputint[out_offset] = param->arg_elements_[l].index_;
|
||||
outputint[out_offset] = (int)(param->arg_elements_[l].index_);
|
||||
}
|
||||
if (output_value != NULL) {
|
||||
output_value[out_offset] = param->arg_elements_[l].data_.f_data_;
|
||||
|
|
|
@ -21,10 +21,10 @@ void PostConvFuncComm(const float *src_ptr_, float *out_ptr, const float *bias_p
|
|||
if (size == 0) {
|
||||
return;
|
||||
}
|
||||
for (int oc = 0; oc < output_channel; oc++) {
|
||||
for (size_t oc = 0; oc < output_channel; oc++) {
|
||||
int oc_div = oc / size;
|
||||
int oc_mod = oc % size;
|
||||
for (int hw = 0; hw < plane_size; hw++) {
|
||||
for (int hw = 0; hw < (int)plane_size; hw++) {
|
||||
int src_index = oc_div * size * plane_stride + hw * size + oc_mod;
|
||||
int dst_index = hw * oc_stride + oc;
|
||||
float value = src_ptr_[src_index];
|
||||
|
|
|
@ -52,7 +52,8 @@ int ConvDw(float *output_data, const float *input_data, const float *weight_data
|
|||
int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->input_h_ - ih_origin, conv_param->dilation_h_));
|
||||
|
||||
for (int ow = 0; ow < conv_param->output_w_; ow++) {
|
||||
memcpy(dst_data + ow * conv_param->output_channel_, bias_data, conv_param->output_channel_ * sizeof(float));
|
||||
memcpy(dst_data + ow * conv_param->output_channel_, bias_data,
|
||||
conv_param->output_channel_ * (int)(sizeof(float)));
|
||||
}
|
||||
for (int kh = start_kh; kh < end_kh; kh++) {
|
||||
int ih = ih_origin + conv_param->dilation_w_ * kh;
|
||||
|
@ -764,10 +765,10 @@ void ConvDwFp32IndirectRow(float *output, float **input, const float *weights, c
|
|||
int output_width, int input_stride, bool relu, bool relu6, int kernel) {
|
||||
do {
|
||||
float **in = input;
|
||||
size_t c = channels;
|
||||
size_t c = (size_t)channels;
|
||||
const float *w = weights;
|
||||
float *out = output;
|
||||
memcpy(out, bias, channels * sizeof(float));
|
||||
memcpy(out, bias, channels * (int)sizeof(float));
|
||||
for (; c >= C4NUM; c -= C4NUM) {
|
||||
for (int i = 0; i < C4NUM; i++) {
|
||||
for (int k = 0; k < kernel; k++) {
|
||||
|
|
|
@ -61,7 +61,7 @@ void DeConvPostFp32C8(const float *src, float *tmp, const float *bias, float *ds
|
|||
for (int c = 0; c < oc8; c += 8) {
|
||||
float *dst_ptr = tmp + c * output_plane;
|
||||
const float *src_ptr = src + c * in_plane_round * kernel_plane;
|
||||
memset(dst_ptr, 0, output_plane * C8NUM * sizeof(float));
|
||||
memset(dst_ptr, 0, output_plane * C8NUM * (int)sizeof(float));
|
||||
|
||||
for (int ih = 0; ih < conv_param->input_h_; ih++) {
|
||||
for (int iw = 0; iw < conv_param->input_w_; iw++) {
|
||||
|
|
|
@ -43,7 +43,7 @@ int CopyData(float *input_data, const int *ids, float *output_data, int num,
|
|||
parameter->is_regulated_[ids[num]] = true;
|
||||
}
|
||||
|
||||
memcpy(out_data, in_data, sizeof(float) * parameter->layer_size_);
|
||||
memcpy(out_data, in_data, sizeof(float) * (size_t)(parameter->layer_size_));
|
||||
return NNACL_OK;
|
||||
}
|
||||
|
||||
|
@ -52,7 +52,7 @@ int EmbeddingLookup(float *input_data, const int *ids, float *output_data, const
|
|||
if (parameter->op_parameter_.thread_num_ == 0) {
|
||||
return NNACL_PARAM_INVALID;
|
||||
}
|
||||
for (size_t i = task_id; i < parameter->ids_size_; i += parameter->op_parameter_.thread_num_) {
|
||||
for (int i = task_id; i < parameter->ids_size_; i += parameter->op_parameter_.thread_num_) {
|
||||
int ret = CopyData(input_data, ids, output_data, i, parameter);
|
||||
if (ret != NNACL_OK) {
|
||||
return ret;
|
||||
|
|
|
@ -21,7 +21,7 @@
|
|||
int GatherNd(const float *input, float *output, const int *in_offset, int area, int count) {
|
||||
int i = 0;
|
||||
for (i = 0; i < count; i++) {
|
||||
(void)memcpy(output + area * i, input + in_offset[i], area * sizeof(float));
|
||||
(void)memcpy(output + area * i, input + in_offset[i], (size_t)(area) * sizeof(float));
|
||||
}
|
||||
return NNACL_OK;
|
||||
}
|
||||
|
|
|
@ -41,7 +41,7 @@ void PackLstmBias(float *dst, const float *src, int batch, int col, int col_alig
|
|||
for (int i = 0; i < unidirectional_batch; i++) {
|
||||
const float *src_batch = src + i * col;
|
||||
float *dst_batch = dst + i * col_align;
|
||||
memcpy(dst_batch, src_batch, col * sizeof(float));
|
||||
memcpy(dst_batch, src_batch, col * (int)sizeof(float));
|
||||
}
|
||||
if (is_bidirectional) {
|
||||
const float *backward_src = src + batch * col;
|
||||
|
|
|
@ -263,9 +263,9 @@ void RowMajor2Col12Major_arm32(const float *src_c, float *dst_c, size_t col) {
|
|||
void RowMajor2Col12Major(const float *src_ptr, float *dst_ptr, int row, int col) {
|
||||
const float *src_r = src_ptr;
|
||||
float *dst_r = dst_ptr;
|
||||
size_t ri = 0;
|
||||
int ri = 0;
|
||||
for (; ri < (row / C12NUM * C12NUM); ri += C12NUM) {
|
||||
size_t ci = 0;
|
||||
int ci = 0;
|
||||
for (; ci < (col / C4NUM * C4NUM); ci += C4NUM) {
|
||||
const float *src_c = src_r + ci;
|
||||
float *dst_c = dst_r + ci * C12NUM;
|
||||
|
@ -340,7 +340,7 @@ void RowMajor2Col12Major(const float *src_ptr, float *dst_ptr, int row, int col)
|
|||
for (; ci < col; ci++) {
|
||||
const float *src_c = src_r + ci;
|
||||
float *dst_c = dst_r + ci * C12NUM;
|
||||
for (size_t i = 0; i < C12NUM; i++) {
|
||||
for (int i = 0; i < C12NUM; i++) {
|
||||
dst_c[i] = src_c[i * col];
|
||||
}
|
||||
}
|
||||
|
@ -348,16 +348,15 @@ void RowMajor2Col12Major(const float *src_ptr, float *dst_ptr, int row, int col)
|
|||
dst_r += C12NUM * col;
|
||||
}
|
||||
for (; ri < row; ri++, dst_r++, src_r += col) {
|
||||
for (size_t i = 0; i < col; i++) {
|
||||
for (int i = 0; i < col; i++) {
|
||||
dst_r[i * C12NUM] = src_r[i];
|
||||
}
|
||||
}
|
||||
for (; ri < UP_ROUND(row, C12NUM); ri++, dst_r++) {
|
||||
for (size_t i = 0; i < col; i++) {
|
||||
for (int i = 0; i < col; i++) {
|
||||
dst_r[i * C12NUM] = 0;
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
#ifdef ENABLE_ARM64
|
||||
|
@ -532,20 +531,20 @@ void RowMajor2Col8Major_arm32(const float *src_c, float *dst_c, size_t col) {
|
|||
#endif
|
||||
#endif
|
||||
void RowMajor2Col8Major(const float *src_ptr, float *dst_ptr, int row, int col) {
|
||||
size_t row8 = row / C8NUM * C8NUM;
|
||||
int row8 = row / C8NUM * C8NUM;
|
||||
#ifdef ENABLE_ARM64
|
||||
size_t col_skip = col / C8NUM * C8NUM;
|
||||
int col_skip = col / C8NUM * C8NUM;
|
||||
int skip_size = C8NUM;
|
||||
#else
|
||||
size_t col_skip = col / C4NUM * C4NUM;
|
||||
int col_skip = col / C4NUM * C4NUM;
|
||||
int skip_size = C4NUM;
|
||||
#endif
|
||||
const float *src_r = src_ptr;
|
||||
float *dst_r = dst_ptr;
|
||||
|
||||
size_t ri = 0;
|
||||
int ri = 0;
|
||||
for (; ri < row8; ri += C8NUM) {
|
||||
size_t ci = 0;
|
||||
int ci = 0;
|
||||
for (; ci < col_skip; ci += skip_size) {
|
||||
const float *src_c = src_r + ci;
|
||||
float *dst_c = dst_r + ci * C8NUM;
|
||||
|
@ -593,7 +592,7 @@ void RowMajor2Col8Major(const float *src_ptr, float *dst_ptr, int row, int col)
|
|||
for (; ci < col; ci++) {
|
||||
const float *src_c = src_r + ci;
|
||||
float *dst_c = dst_r + ci * C8NUM;
|
||||
for (size_t i = 0; i < C8NUM; i++) {
|
||||
for (int i = 0; i < C8NUM; i++) {
|
||||
dst_c[i] = src_c[i * col];
|
||||
}
|
||||
}
|
||||
|
@ -601,29 +600,28 @@ void RowMajor2Col8Major(const float *src_ptr, float *dst_ptr, int row, int col)
|
|||
dst_r += C8NUM * col;
|
||||
}
|
||||
for (; ri < row; ri++, src_r += col, dst_r++) {
|
||||
for (size_t i = 0; i < col; i++) {
|
||||
for (int i = 0; i < col; i++) {
|
||||
dst_r[i * C8NUM] = src_r[i];
|
||||
}
|
||||
}
|
||||
|
||||
for (; ri < UP_ROUND(row, C8NUM); ri++, dst_r++) {
|
||||
for (size_t i = 0; i < col; i++) {
|
||||
for (int i = 0; i < col; i++) {
|
||||
dst_r[i * C8NUM] = 0;
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
void RowMajor2Col16Major(const float *src_ptr, float *dst_ptr, int row, int col) {
|
||||
size_t row16 = row / C16NUM * C16NUM;
|
||||
size_t col_skip = col / C4NUM * C4NUM;
|
||||
int row16 = row / C16NUM * C16NUM;
|
||||
int col_skip = col / C4NUM * C4NUM;
|
||||
int skip_size = C4NUM;
|
||||
const float *src_r = src_ptr;
|
||||
float *dst_r = dst_ptr;
|
||||
|
||||
size_t ri = 0;
|
||||
int ri = 0;
|
||||
for (; ri < row16; ri += C16NUM) {
|
||||
size_t ci = 0;
|
||||
int ci = 0;
|
||||
for (; ci < col_skip; ci += skip_size) {
|
||||
const float *src_c = src_r + ci;
|
||||
float *dst_c = dst_r + ci * C16NUM;
|
||||
|
@ -636,7 +634,7 @@ void RowMajor2Col16Major(const float *src_ptr, float *dst_ptr, int row, int col)
|
|||
for (; ci < col; ci++) {
|
||||
const float *src_c = src_r + ci;
|
||||
float *dst_c = dst_r + ci * C16NUM;
|
||||
for (size_t i = 0; i < C16NUM; i++) {
|
||||
for (int i = 0; i < C16NUM; i++) {
|
||||
dst_c[i] = src_c[i * col];
|
||||
}
|
||||
}
|
||||
|
@ -644,21 +642,20 @@ void RowMajor2Col16Major(const float *src_ptr, float *dst_ptr, int row, int col)
|
|||
dst_r += C16NUM * col;
|
||||
}
|
||||
for (; ri < row; ri++) {
|
||||
for (size_t i = 0; i < col; i++) {
|
||||
for (int i = 0; i < col; i++) {
|
||||
dst_r[i * C16NUM] = src_r[i];
|
||||
}
|
||||
src_r += col;
|
||||
dst_r += 1;
|
||||
}
|
||||
|
||||
size_t total_row = UP_ROUND(row, C16NUM);
|
||||
int total_row = UP_ROUND(row, C16NUM);
|
||||
for (; ri < total_row; ri++) {
|
||||
for (size_t i = 0; i < col; i++) {
|
||||
for (int i = 0; i < col; i++) {
|
||||
dst_r[i * C16NUM] = 0;
|
||||
}
|
||||
dst_r += 1;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
void RowMajor2Col32Major(const float *src_ptr, float *dst_ptr, int row, int col) {
|
||||
|
@ -680,15 +677,15 @@ void RowMajor2Col32Major(const float *src_ptr, float *dst_ptr, int row, int col)
|
|||
}
|
||||
|
||||
void RowMajor2Col6Major(const float *src_ptr, float *dst_ptr, int row, int col) {
|
||||
size_t totalRow = UP_ROUND(row, C6NUM);
|
||||
size_t row6 = row / C6NUM * C6NUM;
|
||||
size_t col8 = col / C8NUM * C8NUM;
|
||||
int totalRow = UP_ROUND(row, C6NUM);
|
||||
int row6 = row / C6NUM * C6NUM;
|
||||
int col8 = col / C8NUM * C8NUM;
|
||||
const float *src_r = src_ptr;
|
||||
float *dst_r = dst_ptr;
|
||||
|
||||
size_t ri = 0;
|
||||
int ri = 0;
|
||||
for (; ri < row6; ri += C6NUM) {
|
||||
size_t ci = 0;
|
||||
int ci = 0;
|
||||
for (; ci < col8; ci += C8NUM) {
|
||||
const float *src_c = src_r + ci;
|
||||
float *dst_c = dst_r + ci * C6NUM;
|
||||
|
@ -753,7 +750,7 @@ void RowMajor2Col6Major(const float *src_ptr, float *dst_ptr, int row, int col)
|
|||
for (; ci < col; ci++) {
|
||||
const float *src_c = src_r + ci;
|
||||
float *dst_c = dst_r + ci * C6NUM;
|
||||
for (size_t i = 0; i < C6NUM; i++) {
|
||||
for (int i = 0; i < C6NUM; i++) {
|
||||
dst_c[i] = src_c[i * col];
|
||||
}
|
||||
}
|
||||
|
@ -762,7 +759,7 @@ void RowMajor2Col6Major(const float *src_ptr, float *dst_ptr, int row, int col)
|
|||
}
|
||||
|
||||
for (; ri < row; ri++) {
|
||||
for (size_t i = 0; i < col; i++) {
|
||||
for (int i = 0; i < col; i++) {
|
||||
dst_r[i * C6NUM] = src_r[i];
|
||||
}
|
||||
src_r += col;
|
||||
|
@ -770,30 +767,29 @@ void RowMajor2Col6Major(const float *src_ptr, float *dst_ptr, int row, int col)
|
|||
}
|
||||
|
||||
for (; ri < totalRow; ri++) {
|
||||
for (size_t i = 0; i < col; i++) {
|
||||
for (int i = 0; i < col; i++) {
|
||||
dst_r[i * C6NUM] = 0;
|
||||
}
|
||||
dst_r += 1;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
void RowMajor2Col4Major(const float *src_ptr, float *dst_ptr, int row, int col) {
|
||||
size_t total_row = UP_ROUND(row, C4NUM);
|
||||
size_t row4 = row / C4NUM * C4NUM;
|
||||
size_t col4 = col / C4NUM * C4NUM;
|
||||
int total_row = UP_ROUND(row, C4NUM);
|
||||
int row4 = row / C4NUM * C4NUM;
|
||||
int col4 = col / C4NUM * C4NUM;
|
||||
const float *src_r = src_ptr;
|
||||
float *dst_r = dst_ptr;
|
||||
|
||||
size_t ri = 0;
|
||||
int ri = 0;
|
||||
for (; ri < row4; ri += C4NUM) {
|
||||
size_t ci = 0;
|
||||
int ci = 0;
|
||||
for (; ci < col4; ci += C4NUM) {
|
||||
const float *src_c = src_r + ci;
|
||||
float *dst_c = dst_r + ci * C4NUM;
|
||||
|
||||
#ifdef ENABLE_ARM32
|
||||
size_t stride = col * 4;
|
||||
int stride = col * 4;
|
||||
asm volatile(
|
||||
"mov r10, %[src_c]\n"
|
||||
"mov r12, %[dst_c]\n"
|
||||
|
@ -840,8 +836,8 @@ void RowMajor2Col4Major(const float *src_ptr, float *dst_ptr, int row, int col)
|
|||
_mm_storeu_ps(dst_c + 8, dst2);
|
||||
_mm_storeu_ps(dst_c + 12, dst3);
|
||||
#else
|
||||
for (int tr = 0; tr < C4NUM; tr++) {
|
||||
for (int tc = 0; tc < C4NUM; tc++) {
|
||||
for (size_t tr = 0; tr < C4NUM; tr++) {
|
||||
for (size_t tc = 0; tc < C4NUM; tc++) {
|
||||
dst_c[tc * C4NUM + tr] = src_c[tr * col + tc];
|
||||
}
|
||||
}
|
||||
|
@ -850,7 +846,7 @@ void RowMajor2Col4Major(const float *src_ptr, float *dst_ptr, int row, int col)
|
|||
for (; ci < col; ci++) {
|
||||
const float *src_c = src_r + ci;
|
||||
float *dst_c = dst_r + ci * C4NUM;
|
||||
for (size_t i = 0; i < C4NUM; i++) {
|
||||
for (int i = 0; i < C4NUM; i++) {
|
||||
dst_c[i] = src_c[i * col];
|
||||
}
|
||||
}
|
||||
|
@ -858,7 +854,7 @@ void RowMajor2Col4Major(const float *src_ptr, float *dst_ptr, int row, int col)
|
|||
dst_r += C4NUM * col;
|
||||
}
|
||||
for (; ri < row; ri++) {
|
||||
for (size_t i = 0; i < col; i++) {
|
||||
for (int i = 0; i < col; i++) {
|
||||
dst_r[i * C4NUM] = src_r[i];
|
||||
}
|
||||
src_r += col;
|
||||
|
@ -866,12 +862,11 @@ void RowMajor2Col4Major(const float *src_ptr, float *dst_ptr, int row, int col)
|
|||
}
|
||||
|
||||
for (; ri < total_row; ri++) {
|
||||
for (size_t i = 0; i < col; i++) {
|
||||
for (int i = 0; i < col; i++) {
|
||||
dst_r[i * C4NUM] = 0;
|
||||
}
|
||||
dst_r += 1;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
#ifndef ENABLE_ARM
|
||||
|
|
|
@ -23,16 +23,22 @@ void Pad(const float *input_data, float *output_data, const int *input_shape, co
|
|||
if (thread_num == 0) {
|
||||
return;
|
||||
}
|
||||
int in[4], out[4];
|
||||
int in[DEFAULT_PAD_NDIMS], out[DEFAULT_PAD_NDIMS];
|
||||
for (in[0] = 0; in[0] < input_shape[0]; in[0]++) {
|
||||
out[0] = in[0] + paddings[0];
|
||||
for (in[1] = tid; in[1] < input_shape[1]; in[1] += thread_num) {
|
||||
out[1] = in[1] + paddings[2];
|
||||
for (in[2] = 0; in[2] < input_shape[2]; in[2]++) {
|
||||
out[2] = in[2] + paddings[4];
|
||||
float *dst = output_data + offset(output_shape, out[0], out[1], out[2], paddings[6]);
|
||||
const float *src = input_data + offset(input_shape, in[0], in[1], in[2], 0);
|
||||
memcpy(dst, src, input_shape[3] * sizeof(float));
|
||||
for (in[3] = 0; in[3] < input_shape[3]; in[3]++) {
|
||||
out[3] = in[3] + paddings[6];
|
||||
for (in[4] = 0; in[4] < input_shape[4]; in[4]++) {
|
||||
out[4] = in[4] + paddings[8];
|
||||
float *dst = output_data + Offset6d(output_shape, out) + paddings[10];
|
||||
const float *src = input_data + Offset6d(input_shape, in);
|
||||
memcpy(dst, src, input_shape[5] * (int)(sizeof(float)));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -57,8 +63,7 @@ int TransOut2InputDimIndex(int out_dim_index, int left_pad, int in_dim, int offs
|
|||
|
||||
int GetInputFlattenIndex(int out_flatten_index, const int *input_shape, const PadParameter *pad_param) {
|
||||
int in_flatten_index = 0;
|
||||
int i;
|
||||
for (i = 0; i < COMM_SHAPE_SIZE; ++i) {
|
||||
for (int i = 0; i < DEFAULT_PAD_NDIMS; ++i) {
|
||||
int left_pad = pad_param->paddings_[i * 2];
|
||||
NNACL_CHECK_ZERO_RETURN_ERR(pad_param->out_strides[i])
|
||||
int out_dim_index = out_flatten_index / pad_param->out_strides[i];
|
||||
|
|
|
@ -510,8 +510,8 @@ int ResizeNearestNeighbor(const float *input_data, float *output_data, const int
|
|||
} else {
|
||||
input_x = (int)(floorf(actual_x));
|
||||
}
|
||||
int in_offset = offset(input_shape, batch, input_y, input_x, 0);
|
||||
int out_offset = offset(output_shape, batch, y, x, 0);
|
||||
int in_offset = Offset(input_shape, batch, input_y, input_x, 0);
|
||||
int out_offset = Offset(output_shape, batch, y, x, 0);
|
||||
memcpy(output_data + out_offset, input_data + in_offset, c * sizeof(float));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -20,10 +20,8 @@
|
|||
#include "nnacl/nnacl_utils.h"
|
||||
|
||||
int Reverse(const float *input, float *output, size_t elem_size, int *index) {
|
||||
for (int i = 0; i < elem_size; i++) {
|
||||
for (size_t i = 0; i < elem_size; i++) {
|
||||
NNACL_ASSERT(index[i] >= 0);
|
||||
}
|
||||
for (int i = 0; i < elem_size; i++) {
|
||||
output[index[i]] = input[i];
|
||||
}
|
||||
return NNACL_OK;
|
||||
|
|
|
@ -23,7 +23,7 @@ int DoScatterND(float *output_ptr, const float *update, int *output_unit_offsets
|
|||
return NNACL_ERR;
|
||||
}
|
||||
for (int i = 0; i < num_units; i++) {
|
||||
(void)memcpy(output_ptr + output_unit_offsets[i], update + unit_size * i, unit_size * sizeof(float));
|
||||
(void)memcpy(output_ptr + output_unit_offsets[i], update + unit_size * i, (size_t)(unit_size) * sizeof(float));
|
||||
}
|
||||
return NNACL_OK;
|
||||
}
|
||||
|
|
|
@ -25,7 +25,7 @@ void SpliceFp32(const float *src_data, int src_row, int src_col, const SplicePar
|
|||
forward_index++;
|
||||
const float *tmp_src_data = src_data + r_off * src_col;
|
||||
float *tmp_dst_data = dst_row_data + off * src_col;
|
||||
memcpy(tmp_dst_data, tmp_src_data, src_col * sizeof(float));
|
||||
memcpy(tmp_dst_data, tmp_src_data, (size_t)(src_col) * sizeof(float));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -70,7 +70,7 @@ int DoStridedSliceIntFp64Bool(const void *in_data, void *out_data, StridedSliceP
|
|||
if (param->num_axes_ < DIMENSION_8D) {
|
||||
PadStridedSliceParameterTo8D(param);
|
||||
}
|
||||
size_t dim_offset[DIMENSION_8D - 1];
|
||||
int dim_offset[DIMENSION_8D - 1];
|
||||
dim_offset[6] = in_shape[7];
|
||||
dim_offset[5] = in_shape[6] * dim_offset[6];
|
||||
dim_offset[4] = in_shape[5] * dim_offset[5];
|
||||
|
@ -132,7 +132,7 @@ int DoStridedSlice(const void *in_data, void *out_data, StridedSliceParameter *p
|
|||
if (param->num_axes_ < DIMENSION_8D) {
|
||||
PadStridedSliceParameterTo8D(param);
|
||||
}
|
||||
size_t dim_offset[DIMENSION_8D - 1];
|
||||
int dim_offset[DIMENSION_8D - 1];
|
||||
dim_offset[6] = in_shape[7];
|
||||
dim_offset[5] = in_shape[6] * dim_offset[6];
|
||||
dim_offset[4] = in_shape[5] * dim_offset[5];
|
||||
|
|
|
@ -180,15 +180,15 @@ void TransposeDimsFp32(const float *in_data, float *out_data, const int *output_
|
|||
int *strides = (int *)(transpose_param->strides_);
|
||||
int *out_strides = (int *)(transpose_param->out_strides_);
|
||||
int num_axes = transpose_param->num_axes_;
|
||||
size_t data_size = (*out_strides) * output_shape[0];
|
||||
size_t offset_size = UP_DIV(data_size, thread_num);
|
||||
size_t task_offset = offset_size * task_id;
|
||||
int data_size = (*out_strides) * output_shape[0];
|
||||
int offset_size = UP_DIV(data_size, thread_num);
|
||||
int task_offset = offset_size * task_id;
|
||||
int count = data_size - task_offset;
|
||||
if (count <= 0) {
|
||||
return;
|
||||
}
|
||||
count = MSMIN(offset_size, count);
|
||||
for (size_t idx = task_offset; idx < task_offset + count; ++idx) {
|
||||
for (int idx = task_offset; idx < task_offset + count; ++idx) {
|
||||
int pos = idx;
|
||||
int output_idx = 0;
|
||||
int input_idx = 0;
|
||||
|
|
|
@ -45,7 +45,7 @@ void WinogradInputTransform(const float *input_data, float *trans_input, float *
|
|||
int dst_plane_offset = c * in_channel;
|
||||
for (int ic = 0; ic < ic4; ic++) {
|
||||
// clear tmp buffer
|
||||
memset(tmp_data, 0, input_unit * input_unit * C4NUM * sizeof(float));
|
||||
memset(tmp_data, 0, input_unit * input_unit * C4NUM * (int)(sizeof(float)));
|
||||
|
||||
int real_c = in_channel - ic * C4NUM;
|
||||
real_c = real_c > C4NUM ? C4NUM : real_c;
|
||||
|
@ -87,7 +87,7 @@ void WinogradInputTransform(const float *input_data, float *trans_input, float *
|
|||
// input transform
|
||||
const int tile_num = C12NUM;
|
||||
int dst_ic4_offset = dst_plane_offset + ic * C4NUM;
|
||||
size_t dst_step = tile_num * in_channel;
|
||||
int dst_step = tile_num * in_channel;
|
||||
float *trans_input_ptr = trans_input + dst_ic4_offset;
|
||||
func(tmp_data, trans_input_ptr, C4NUM, dst_step, real_c);
|
||||
}
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
|
@ -17,6 +17,7 @@
|
|||
#include <math.h>
|
||||
#include "nnacl/op_base.h"
|
||||
#include "nnacl/fp32/arithmetic_fp32.h"
|
||||
#include "nnacl/fp32/exp_fp32.h"
|
||||
#include "nnacl/fp32_grad/activation_grad.h"
|
||||
#include "nnacl/errorcode.h"
|
||||
|
||||
|
@ -110,3 +111,27 @@ int GeluGrad(const float *src0, const float *src1, size_t length, float *dst) {
|
|||
}
|
||||
return NNACL_OK;
|
||||
}
|
||||
|
||||
int SoftplusGrad(const float *src0, const float *src1, int length, float *dst) {
|
||||
int i = 0;
|
||||
#if defined(ENABLE_AVX)
|
||||
for (; i <= length - C8NUM; i += C8NUM) {
|
||||
simd_exp_avx(-(MS_LD256_F32(src1 + i)), dst + i);
|
||||
MS_ST256_F32(dst + i,
|
||||
MS_DIV256_F32(MS_LD256_F32(src0 + i), MS_ADD256_F32(MS_MOV256_F32(1.0f), MS_LD256_F32(dst + i))));
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_ARM) || defined(ENABLE_SSE)
|
||||
for (; i <= length - C4NUM; i += C4NUM) {
|
||||
simd_exp(MS_SUBQ_F32(MS_MOVQ_F32(0.0f), MS_LDQ_F32(src1 + i)), dst + i);
|
||||
MS_STQ_F32(dst + i, MS_DIVQ_F32(MS_LDQ_F32(src0 + i), MS_ADDQ_F32(MS_MOVQ_F32(1.0f), MS_LDQ_F32(dst + i))));
|
||||
}
|
||||
#endif
|
||||
|
||||
for (; i < length; ++i) {
|
||||
single_exp(-src1[i], dst + i);
|
||||
dst[i] = src0[i] / (1.0f + dst[i]);
|
||||
}
|
||||
return NNACL_OK;
|
||||
}
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
|
@ -39,6 +39,7 @@ int HSwishGrad(const float *src0, const float *src1, size_t length, float *dst);
|
|||
int HSigmoidGrad(const float *src0, const float *src1, size_t length, float *dst);
|
||||
int EluGrad(const float *src0, const float *src1, size_t length, float *dst, float alpha);
|
||||
int GeluGrad(const float *src0, const float *src1, size_t length, float *dst);
|
||||
int SoftplusGrad(const float *src, const float *src1, int length, float *dst);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
|
@ -231,7 +231,7 @@ static void RowMajor2Col12MajorStride(const float *src_ptr, float *dst_ptr, size
|
|||
for (; ci < col; ci++) {
|
||||
const float *src_c = src_r + ci;
|
||||
float *dst_c = dst_r + ci * C12NUM;
|
||||
for (size_t i = 0; i < C12NUM; i++) {
|
||||
for (int i = 0; i < C12NUM; i++) {
|
||||
dst_c[i] = src_c[i * lead];
|
||||
}
|
||||
}
|
||||
|
@ -240,7 +240,7 @@ static void RowMajor2Col12MajorStride(const float *src_ptr, float *dst_ptr, size
|
|||
}
|
||||
|
||||
for (; ri < row; ri++) {
|
||||
for (size_t i = 0; i < col; i++) {
|
||||
for (int i = 0; i < col; i++) {
|
||||
dst_r[i * C12NUM] = src_r[i];
|
||||
}
|
||||
src_r += lead;
|
||||
|
@ -248,12 +248,11 @@ static void RowMajor2Col12MajorStride(const float *src_ptr, float *dst_ptr, size
|
|||
}
|
||||
|
||||
for (; ri < row_up_12; ri++) {
|
||||
for (size_t i = 0; i < col; i++) {
|
||||
for (int i = 0; i < col; i++) {
|
||||
dst_r[i * C12NUM] = 0;
|
||||
}
|
||||
dst_r += 1;
|
||||
}
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -261,10 +260,10 @@ static void RowMajor2Col8MajorStride(const float *src_ptr, float *dst_ptr, size_
|
|||
size_t row8 = row / C8NUM * C8NUM;
|
||||
#ifdef ENABLE_ARM64
|
||||
size_t col_skip = col / C8NUM * C8NUM;
|
||||
int skip_size = C8NUM;
|
||||
size_t skip_size = C8NUM;
|
||||
#else
|
||||
size_t col_skip = col / C4NUM * C4NUM;
|
||||
int skip_size = C4NUM;
|
||||
size_t skip_size = C4NUM;
|
||||
#endif
|
||||
const float *src_r = src_ptr;
|
||||
float *dst_r = dst_ptr;
|
||||
|
@ -450,7 +449,7 @@ static void RowMajor2Col8MajorStride(const float *src_ptr, float *dst_ptr, size_
|
|||
for (; ci < col; ci++) {
|
||||
const float *src_c = src_r + ci;
|
||||
float *dst_c = dst_r + ci * C8NUM;
|
||||
for (size_t i = 0; i < C8NUM; i++) {
|
||||
for (int i = 0; i < C8NUM; i++) {
|
||||
dst_c[i] = src_c[i * lead];
|
||||
}
|
||||
}
|
||||
|
@ -458,7 +457,7 @@ static void RowMajor2Col8MajorStride(const float *src_ptr, float *dst_ptr, size_
|
|||
dst_r += C8NUM * col;
|
||||
}
|
||||
for (; ri < row; ri++) {
|
||||
for (size_t i = 0; i < col; i++) {
|
||||
for (int i = 0; i < col; i++) {
|
||||
dst_r[i * C8NUM] = src_r[i];
|
||||
}
|
||||
src_r += lead;
|
||||
|
|
|
@ -64,11 +64,11 @@ void ReduceSumByAxes(const float *input, const int *input_dims, float *output, c
|
|||
if (output_dims[idx] != input_dims[idx]) same_shape = 0;
|
||||
}
|
||||
if (same_shape) {
|
||||
memcpy(output, input, num_outputs * sizeof(float));
|
||||
memcpy(output, input, (size_t)(num_outputs) * sizeof(float));
|
||||
return;
|
||||
}
|
||||
|
||||
memset(output, 0, num_outputs * sizeof(float)); // zero output
|
||||
memset(output, 0, (size_t)(num_outputs) * sizeof(float)); // zero output
|
||||
|
||||
int input_iter[8] = {0};
|
||||
int axes[5] = {0};
|
||||
|
|
|
@ -37,13 +37,13 @@ void SoftmaxGrad(const float *input_ptr, const float *yt_ptr, float *output_ptr,
|
|||
for (int i = 0; i < inner_size * input_shape[axis]; i++) sum_mul[i] = 1.0;
|
||||
for (int i = 0; i < n_dim; i++) dim *= input_shape[i];
|
||||
dim /= outter_size;
|
||||
memcpy(output_ptr, yt_ptr, ele_size * sizeof(float));
|
||||
memcpy(output_ptr, yt_ptr, (size_t)(ele_size) * sizeof(float));
|
||||
|
||||
const int M = input_shape[axis];
|
||||
const int N = inner_size;
|
||||
for (int i = 0; i < outter_size; i++) {
|
||||
int outter_offset = i * dim;
|
||||
memset(sum_data, 0.0f, inner_size * sizeof(float));
|
||||
memset(sum_data, 0, (size_t)(inner_size) * sizeof(float));
|
||||
for (int k = 0; k < inner_size; k++) {
|
||||
int inner_offset = outter_offset + k;
|
||||
for (int j = 0; j < input_shape[axis]; j++) {
|
||||
|
|
|
@ -20,7 +20,7 @@
|
|||
static size_t CalcIndex(const int *shape, size_t size, int i, size_t pos) {
|
||||
size_t res = 1;
|
||||
for (size_t j = 0; j < size; j++) {
|
||||
res *= shape[(i + 1) + j];
|
||||
res *= shape[((size_t)(i) + 1) + j];
|
||||
}
|
||||
return (pos / res % shape[i]);
|
||||
}
|
||||
|
@ -37,7 +37,7 @@ int DoStridedSliceGrad(const float *inputs, float *output, const int *dx_shape,
|
|||
const int *s = param->strides_;
|
||||
const int *b = param->begins_;
|
||||
for (int i = 0; i < DIMENSION_8D; i++) {
|
||||
size *= param->in_shape_[i];
|
||||
size *= (size_t)(param->in_shape_[i]);
|
||||
}
|
||||
|
||||
for (size_t pos = 0; pos < size; pos++) {
|
||||
|
|
|
@ -56,13 +56,13 @@ int AddnInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **o
|
|||
for (size_t d = 0; d < inputs[max_dims_idx]->shape_size_; ++d) {
|
||||
size_t max_dim = 0;
|
||||
for (size_t i = 0; i < inputs_size; ++i) {
|
||||
size_t shift = max_dims - inputs[i]->shape_size_;
|
||||
size_t dim = (i < shift) ? 1 : inputs[i]->shape_[d];
|
||||
size_t shift = max_dims - (size_t)(inputs[i]->shape_size_);
|
||||
size_t dim = (i < shift) ? 1 : (size_t)(inputs[i]->shape_[d]);
|
||||
if (dim > max_dim) {
|
||||
max_dim = dim;
|
||||
}
|
||||
}
|
||||
output->shape_[d] = max_dim; // set the biggest dimension in the output tensor
|
||||
output->shape_[d] = (int)(max_dim); // set the biggest dimension in the output tensor
|
||||
}
|
||||
|
||||
return NNACL_OK;
|
||||
|
|
|
@ -17,8 +17,8 @@
|
|||
#include "nnacl/infer/affine_infer.h"
|
||||
#include "nnacl/infer/infer_register.h"
|
||||
|
||||
int MatmulInfer(AffineParameter *param, int a_shape[MAX_SHAPE_SIZE], size_t a_shape_size, int b_shape[MAX_SHAPE_SIZE],
|
||||
size_t b_shape_size) {
|
||||
int MatmulInfer(const AffineParameter *param, int a_shape[MAX_SHAPE_SIZE], size_t a_shape_size,
|
||||
int b_shape[MAX_SHAPE_SIZE], size_t b_shape_size) {
|
||||
MatMulParameter *matmul_param = param->matmul_parameter_;
|
||||
if (matmul_param->a_transpose_) {
|
||||
if (a_shape_size < 2) {
|
||||
|
|
|
@ -56,8 +56,8 @@ int ArgMinMaxInferShape(const TensorC *const *inputs, size_t inputs_size, Tensor
|
|||
int output_shape[MAX_SHAPE_SIZE] = {0};
|
||||
size_t output_shape_size = 0;
|
||||
ShapeSet(output_shape, &output_shape_size, input->shape_, input->shape_size_);
|
||||
size_t input_shape_size = input->shape_size_;
|
||||
int axis = param->axis_ < 0 ? param->axis_ + (int)input_shape_size : param->axis_;
|
||||
int input_shape_size = (int)input->shape_size_;
|
||||
int axis = param->axis_ < 0 ? param->axis_ + input_shape_size : param->axis_;
|
||||
if (axis >= input_shape_size || axis < 0) {
|
||||
return NNACL_PARAM_INVALID;
|
||||
}
|
||||
|
|
|
@ -55,10 +55,10 @@ int ArithmeticGradInferShape(const TensorC *const *inputs, size_t inputs_size, T
|
|||
|
||||
if (GetElementNum(dx1) < GetElementNum(dx2)) {
|
||||
param->ndim_ = in_shape1_size;
|
||||
param->in_elements_num0_ = param->ndim_;
|
||||
param->in_elements_num1_ = param->ndim_;
|
||||
param->out_elements_num_ = param->ndim_;
|
||||
int fill_dim_num = in_shape1_size - in_shape0_size; // This will not work for batch!
|
||||
param->in_elements_num0_ = (int)param->ndim_;
|
||||
param->in_elements_num1_ = (int)param->ndim_;
|
||||
param->out_elements_num_ = (int)param->ndim_;
|
||||
size_t fill_dim_num = in_shape1_size - in_shape0_size; // This will not work for batch!
|
||||
int j = 0;
|
||||
for (unsigned int i = 0; i < in_shape1_size; i++) {
|
||||
if (i < fill_dim_num) {
|
||||
|
@ -76,7 +76,7 @@ int ArithmeticGradInferShape(const TensorC *const *inputs, size_t inputs_size, T
|
|||
param->out_elements_num_ = param->ndim_;
|
||||
param->broadcasting_ = true;
|
||||
int j = 0;
|
||||
int fill_dim_num = in_shape0_size - in_shape1_size;
|
||||
size_t fill_dim_num = in_shape0_size - in_shape1_size;
|
||||
for (unsigned int i = 0; i < in_shape0_size; i++) {
|
||||
if (i < fill_dim_num) {
|
||||
param->in_shape1_[i] = 1;
|
||||
|
|
|
@ -66,7 +66,7 @@ int AudioSpectrogramInferShape(const TensorC *const *inputs, size_t inputs_size,
|
|||
int sample_sub_window = input->shape_[0] - param->window_size_;
|
||||
output_shape[1] = sample_sub_window < 0 ? 0 : 1 + sample_sub_window / param->stride_;
|
||||
// compute fft length
|
||||
int fft_length = GetFftLength(param->window_size_);
|
||||
int fft_length = (int)GetFftLength(param->window_size_);
|
||||
output_shape[2] = fft_length / 2 + 1;
|
||||
SetShapeArray(output, output_shape, 3);
|
||||
return NNACL_OK;
|
||||
|
|
|
@ -33,8 +33,8 @@ int BiasGradInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC
|
|||
int inshape[MAX_SHAPE_SIZE];
|
||||
size_t inshape_size = 0;
|
||||
ShapeSet(inshape, &inshape_size, in0->shape_, in0->shape_size_);
|
||||
int ndim = inshape_size;
|
||||
for (int i = 0; i < ndim - 1; i++) {
|
||||
size_t ndim = inshape_size;
|
||||
for (size_t i = 0; i < ndim - 1; i++) {
|
||||
inshape[i] = 1;
|
||||
}
|
||||
SetDataTypeFormat(out, in0);
|
||||
|
|
|
@ -111,12 +111,12 @@ int BroadcastToInferShape(const TensorC *const *inputs, size_t inputs_size, Tens
|
|||
const int *input_shape = input->shape_;
|
||||
size_t input_shape_size = input->shape_size_;
|
||||
int shape[MAX_SHAPE_SIZE];
|
||||
int input_shape_index = input_shape_size - 1;
|
||||
int input_shape_index = (int)(input_shape_size)-1;
|
||||
if (input_shape_size > dst_shape_size) {
|
||||
return NNACL_ERR;
|
||||
}
|
||||
|
||||
for (int i = dst_shape_size - 1; i >= 0; --i) {
|
||||
for (int i = (int)(dst_shape_size)-1; i >= 0; --i) {
|
||||
if (dst_shape[i] < 0) {
|
||||
return NNACL_ERR;
|
||||
}
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
#include <string.h>
|
||||
#include "nnacl/infer/infer_register.h"
|
||||
|
||||
#ifdef ENABLE_CONTROL_TENSORLIST
|
||||
int MallocTensorListData(TensorListC *tensor_list, TypeIdC dtype, const vvector *tensor_shape) {
|
||||
// This function will create a new tensors_
|
||||
// Your must to set shape(param2: tensor_shape) and data_type_(tensors_data_type_ = param1: dtype) of each tensor in
|
||||
|
@ -35,7 +36,7 @@ int MallocTensorListData(TensorListC *tensor_list, TypeIdC dtype, const vvector
|
|||
return NNACL_NULL_PTR;
|
||||
}
|
||||
memset(tensor_list->tensors_, 0, tensor_list->element_num_ * sizeof(TensorC));
|
||||
for (int i = 0; i < tensor_list->element_num_; ++i) {
|
||||
for (size_t i = 0; i < tensor_list->element_num_; ++i) {
|
||||
tensor_list->tensors_[i].format_ = Format_NHWC;
|
||||
tensor_list->tensors_[i].data_type_ = dtype;
|
||||
ShapeSet(tensor_list->tensors_[i].shape_, &(tensor_list->tensors_[i].shape_size_), tensor_shape->shape_[i],
|
||||
|
@ -69,6 +70,7 @@ bool TensorListIsFullyDefined(const int *shape, size_t shape_size) {
|
|||
}
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
|
||||
int CheckAugmentNull(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs, size_t outputs_size,
|
||||
const OpParameter *parameter) {
|
||||
|
@ -157,7 +159,7 @@ void SetShapeTensor(TensorC *dst, const TensorC *src) {
|
|||
}
|
||||
|
||||
void SetShapeArray(TensorC *dst, const int *src, size_t src_size) {
|
||||
for (size_t i = 0; i < src_size; i++) {
|
||||
for (size_t i = 0; i < src_size && i < MAX_SHAPE_SIZE; i++) {
|
||||
dst->shape_[i] = src[i];
|
||||
}
|
||||
dst->shape_size_ = src_size;
|
||||
|
@ -286,13 +288,17 @@ int GetDimensionSize(const TensorC *tensor, const size_t index) {
|
|||
}
|
||||
|
||||
void ShapeSet(int *dst_shape, size_t *dst_shape_size, const int *src_shape, size_t src_shape_size) {
|
||||
for (size_t i = 0; i < src_shape_size; i++) {
|
||||
size_t i = 0;
|
||||
for (; i < src_shape_size && i < MAX_SHAPE_SIZE; i++) {
|
||||
dst_shape[i] = src_shape[i];
|
||||
}
|
||||
*dst_shape_size = src_shape_size;
|
||||
*dst_shape_size = i;
|
||||
}
|
||||
|
||||
void ShapePush(int *shape, size_t *shape_size, int value) {
|
||||
if (*shape_size >= MAX_SHAPE_SIZE) {
|
||||
return;
|
||||
}
|
||||
shape[*shape_size] = value;
|
||||
*shape_size = *shape_size + 1;
|
||||
}
|
||||
|
@ -301,6 +307,9 @@ int ShapeInsert(int *shape, size_t *shape_size, int index, int value) {
|
|||
if (index < 0 || index > *shape_size) {
|
||||
return NNACL_ERR;
|
||||
}
|
||||
if (*shape_size >= MAX_SHAPE_SIZE) {
|
||||
return NNACL_ERR;
|
||||
}
|
||||
for (int i = *shape_size; i > index; i--) {
|
||||
shape[i] = shape[i - 1];
|
||||
}
|
||||
|
@ -325,7 +334,7 @@ bool ShapeEqual(const int *shape0, size_t shape0_size, const int *shape1, size_t
|
|||
if (shape0_size != shape1_size) {
|
||||
return false;
|
||||
}
|
||||
for (int i = 0; i < shape0_size; i++) {
|
||||
for (size_t i = 0; i < shape0_size; i++) {
|
||||
if (shape0[i] != shape1[i]) {
|
||||
return false;
|
||||
}
|
||||
|
@ -401,96 +410,6 @@ int FftInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **ou
|
|||
return NNACL_OK;
|
||||
}
|
||||
|
||||
int VectorCInit(VectorC *vc, size_t per_malloc_size) {
|
||||
if (per_malloc_size == 0) {
|
||||
return NNACL_ERR;
|
||||
}
|
||||
vc->data_ = (int *)malloc(per_malloc_size * sizeof(int));
|
||||
if (vc->data_ == NULL) {
|
||||
return NNACL_ERR;
|
||||
}
|
||||
vc->size_ = 0;
|
||||
vc->max_size_ = per_malloc_size;
|
||||
vc->per_malloc_size_ = per_malloc_size;
|
||||
return NNACL_OK;
|
||||
}
|
||||
|
||||
int VectorCSet(VectorC *vc, const int *src_shape, size_t src_shape_size) {
|
||||
if (src_shape_size == 0) {
|
||||
vc->size_ = 0;
|
||||
} else {
|
||||
free(vc->data_);
|
||||
if (vc->per_malloc_size_ == 0) {
|
||||
return NNACL_ERR;
|
||||
}
|
||||
vc->max_size_ = (src_shape_size / vc->per_malloc_size_ + 1) * vc->per_malloc_size_;
|
||||
vc->data_ = (int *)malloc(sizeof(int) * vc->max_size_);
|
||||
if (vc->data_ == NULL) {
|
||||
return NNACL_ERR;
|
||||
}
|
||||
for (size_t i = 0; i < src_shape_size; i++) {
|
||||
vc->data_[i] = src_shape[i];
|
||||
}
|
||||
vc->size_ = src_shape_size;
|
||||
}
|
||||
return NNACL_OK;
|
||||
}
|
||||
|
||||
int VectorCPush(VectorC *vc, int value) {
|
||||
if (vc->size_ + 1 > vc->max_size_) {
|
||||
int *tmp = (int *)malloc(vc->per_malloc_size_ * sizeof(int) + vc->max_size_ * sizeof(int));
|
||||
if (tmp == NULL) {
|
||||
return NNACL_ERR;
|
||||
}
|
||||
memcpy(tmp, vc->data_, vc->size_ * sizeof(int));
|
||||
free(vc->data_);
|
||||
vc->data_ = tmp;
|
||||
vc->max_size_ = vc->max_size_ + vc->per_malloc_size_;
|
||||
}
|
||||
vc->data_[vc->size_] = value;
|
||||
vc->size_++;
|
||||
return NNACL_OK;
|
||||
}
|
||||
|
||||
int VectorCInsert(VectorC *vc, int index, int value) {
|
||||
if (vc->size_ + 1 > vc->max_size_) {
|
||||
int *tmp = (int *)malloc(vc->per_malloc_size_ * sizeof(int) + vc->max_size_ * sizeof(int));
|
||||
if (tmp == NULL) {
|
||||
return NNACL_ERR;
|
||||
}
|
||||
memcpy(tmp, vc->data_, vc->size_ * sizeof(int));
|
||||
free(vc->data_);
|
||||
vc->data_ = tmp;
|
||||
vc->max_size_ = vc->max_size_ + vc->per_malloc_size_;
|
||||
}
|
||||
memmove(vc->data_ + index + 1, vc->data_ + index, (vc->size_ - index) * sizeof(int));
|
||||
vc->data_[index] = value;
|
||||
vc->size_++;
|
||||
return NNACL_OK;
|
||||
}
|
||||
|
||||
void VectorCErase(VectorC *vc, int index) {
|
||||
memmove(vc->data_ + index, vc->data_ + index + 1, (vc->size_ - index - 1) * sizeof(int));
|
||||
vc->size_--;
|
||||
}
|
||||
|
||||
bool VectorCEqual(const VectorC *vc1, const VectorC *vc2) {
|
||||
if (vc1->size_ != vc2->size_) {
|
||||
return false;
|
||||
}
|
||||
for (size_t i = 0; i < vc1->size_; i++) {
|
||||
if (vc1->data_[i] != vc2->data_[i]) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void VectorCFree(VectorC *vc) {
|
||||
free(vc->data_);
|
||||
vc->data_ = NULL;
|
||||
}
|
||||
|
||||
bool InferFlag(const TensorC *const *inputs, size_t inputs_size) {
|
||||
if (inputs == NULL) {
|
||||
return false;
|
||||
|
@ -499,18 +418,22 @@ bool InferFlag(const TensorC *const *inputs, size_t inputs_size) {
|
|||
if (inputs[i] == NULL) {
|
||||
return false;
|
||||
}
|
||||
#ifdef ENABLE_CONTROL_TENSORLIST
|
||||
if (inputs[i]->data_type_ == kObjectTypeTensorType) {
|
||||
TensorListC *input_tensor_list = (TensorListC *)inputs[i];
|
||||
if (input_tensor_list->shape_value_ == -1) {
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
#endif
|
||||
for (size_t j = 0; j < inputs[i]->shape_size_; ++j) {
|
||||
if (inputs[i]->shape_[j] == -1) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
#ifdef ENABLE_CONTROL_TENSORLIST
|
||||
}
|
||||
#endif
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -138,6 +138,7 @@ typedef struct vvector {
|
|||
size_t size_; // number of shapes
|
||||
} vvector;
|
||||
|
||||
#ifdef ENABLE_CONTROL_TENSORLIST
|
||||
typedef struct TensorListC {
|
||||
bool is_ready_;
|
||||
int data_type_;
|
||||
|
@ -150,6 +151,7 @@ typedef struct TensorListC {
|
|||
size_t element_shape_size_;
|
||||
TensorC *tensors_;
|
||||
} TensorListC;
|
||||
#endif
|
||||
|
||||
typedef struct VectorC {
|
||||
int *data_;
|
||||
|
@ -158,9 +160,11 @@ typedef struct VectorC {
|
|||
size_t per_malloc_size_;
|
||||
} VectorC;
|
||||
|
||||
#ifdef ENABLE_CONTROL_TENSORLIST
|
||||
int MallocTensorListData(TensorListC *tensor_list, TypeIdC dtype, const vvector *tensor_shape);
|
||||
int TensorListMergeShape(int *element_shape, size_t *element_shape_size, const int *tmp, size_t tmp_size);
|
||||
bool TensorListIsFullyDefined(const int *shape, size_t shape_size);
|
||||
#endif
|
||||
|
||||
int GetBatch(const TensorC *tensor);
|
||||
int GetHeight(const TensorC *tensor);
|
||||
|
@ -202,13 +206,6 @@ int CommonInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC *
|
|||
int FftInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs, size_t outputs_size,
|
||||
const OpParameter *parameter);
|
||||
|
||||
int VectorCInit(VectorC *vc, size_t per_malloc_size);
|
||||
int VectorCSet(VectorC *vc, const int *src_shape, size_t src_shape_size);
|
||||
int VectorCPush(VectorC *vc, int value);
|
||||
int VectorCInsert(VectorC *vc, int index, int value);
|
||||
void VectorCErase(VectorC *vc, int index);
|
||||
bool VectorCEqual(const VectorC *vc1, const VectorC *vc2);
|
||||
void VectorCFree(VectorC *vc);
|
||||
bool InferFlag(const TensorC *const *inputs, size_t inputs_size);
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
|
|
@ -54,8 +54,13 @@ int ConcatInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC *
|
|||
}
|
||||
int output_axis_dim = input0_shape[axis];
|
||||
for (size_t i = 1; i < inputs_size; ++i) {
|
||||
if (inputs[i]->shape_size_ != input0_shape_size) {
|
||||
return NNACL_PARAM_INVALID;
|
||||
size_t input_i_shape_size = inputs[i]->shape_size_;
|
||||
if (input_i_shape_size != input0_shape_size) {
|
||||
if (input_i_shape_size != 0) {
|
||||
return NNACL_PARAM_INVALID;
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
int shape_tmp[MAX_SHAPE_SIZE] = {0};
|
||||
size_t shape_tmp_size = 0;
|
||||
|
|
|
@ -37,7 +37,7 @@ int ConstantOfShapeInferShape(const TensorC *const *inputs, size_t inputs_size,
|
|||
return NNACL_ERR;
|
||||
}
|
||||
int out_shape[MAX_SHAPE_SIZE];
|
||||
size_t out_shape_size = size;
|
||||
int out_shape_size = size;
|
||||
switch (in_tensor->data_type_) {
|
||||
case kNumberTypeInt32: {
|
||||
int32_t *in_data = (int32_t *)(in_tensor->data_);
|
||||
|
|
|
@ -34,7 +34,10 @@ int Conv2dGradFilterInferShape(const TensorC *const *inputs, size_t inputs_size,
|
|||
if (inputs[2]->shape_size_ < 1 || inputs[2]->data_ == NULL) {
|
||||
return NNACL_ERR;
|
||||
}
|
||||
size_t filter_shape_size = inputs[2]->shape_[0];
|
||||
if (inputs[2]->shape_[0] < 0) {
|
||||
return NNACL_ERR;
|
||||
}
|
||||
size_t filter_shape_size = (size_t)(inputs[2]->shape_[0]);
|
||||
if (filter_shape_size != 4) {
|
||||
return NNACL_ERR;
|
||||
}
|
||||
|
|
|
@ -40,16 +40,16 @@ int Conv2dGradInputInferShape(const TensorC *const *inputs, size_t inputs_size,
|
|||
if (inputs[2]->shape_size_ < 1 || inputs[2]->data_ == NULL) {
|
||||
return NNACL_ERR;
|
||||
}
|
||||
size_t shape_size = inputs[2]->shape_[0];
|
||||
if (shape_size != 4) {
|
||||
size_t data_size = (size_t)inputs[2]->shape_[0];
|
||||
if (data_size != 4) {
|
||||
return NNACL_ERR;
|
||||
}
|
||||
int shape[MAX_SHAPE_SIZE];
|
||||
const int nchw2nhwc[4] = {0, 2, 3, 1};
|
||||
for (int i = 0; i < shape_size; i++) {
|
||||
for (size_t i = 0; i < data_size; i++) {
|
||||
shape[i] = *((int *)(inputs[2]->data_) + nchw2nhwc[i]);
|
||||
}
|
||||
SetShapeArray(out, shape, shape_size);
|
||||
SetShapeArray(out, shape, data_size);
|
||||
|
||||
return NNACL_OK;
|
||||
}
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue