[feat] [assistant] [I3T96T] add new Dataset operator CMUARCTICDataset

djc 2021-08-22 16:26:45 +08:00
parent c5bc9af49f
commit b077aa1cab
1432 changed files with 21434 additions and 10802 deletions

View File

@ -24,6 +24,9 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
-Wno-return-std-move -Wno-unused-private-field -Wno-unused-lambda-capture -Wno-sign-compare \
-Wno-overloaded-virtual -Wno-unneeded-internal-declaration -Wno-unused-variable -Wno-pessimizing-move \
-Wno-inconsistent-missing-override -DHALF_ENABLE_CPP11_USER_LITERALS=0 -D_FORTIFY_SOURCE=2")
elseif(ENABLE_SYM_FILE)
set(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -O2 -g -ggdb -Wl,--allow-shlib-undefined \
-DHALF_ENABLE_CPP11_USER_LITERALS=0 -D_FORTIFY_SOURCE=2")
else()
set(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -O2 -Wl,--allow-shlib-undefined \
-DHALF_ENABLE_CPP11_USER_LITERALS=0 -D_FORTIFY_SOURCE=2")

View File

@ -27,7 +27,7 @@ usage()
echo " [-P on|off] [-z [on|off]] [-M on|off] [-V 10.1|11.1|310|910] [-I arm64|arm32|x86_64] [-K] \\"
echo " [-B on|off] [-E] [-l on|off] [-n full|lite|off] [-H on|off] \\"
echo " [-A on|off] [-S on|off] [-k on|off] [-W sse|neon|avx|avx512|off] \\"
echo " [-L Tensor-RT path] \\"
echo " [-L Tensor-RT path] [-y on|off] \\"
echo ""
echo "Options:"
echo " -d Debug mode"
@ -64,6 +64,7 @@ usage()
echo " -W Enable x86_64 SSE or AVX instruction set, use [sse|neon|avx|avx512|off], default off for lite and avx for CPU"
echo " -H Enable hidden"
echo " -L Link and specify Tensor-RT library path, default disable Tensor-RT lib linking"
echo " -y Compile the symbol table switch and save the symbol table to the directory output"
}
# check value of input is 'on' or 'off'
@ -122,8 +123,9 @@ checkopts()
TENSORRT_HOME=""
USER_ENABLE_DUMP_IR=false
USER_ENABLE_DEBUGGER=false
ENABLE_SYM_FILE="off"
# Process the options
while getopts 'drvj:c:t:hb:s:a:g:p:ie:m:l:I:RP:D:zM:V:K:B:En:A:S:k:W:H:L:' opt
while getopts 'drvj:c:t:hb:s:a:g:p:ie:m:l:I:RP:D:zM:V:K:B:En:A:S:k:W:H:L:y' opt
do
CASE_SENSIVE_ARG=${OPTARG}
OPTARG=$(echo ${OPTARG} | tr '[A-Z]' '[a-z]')
@ -140,6 +142,9 @@ checkopts()
exit 1
fi
;;
y)
ENABLE_SYM_FILE="on"
;;
r)
DEBUG_MODE="off"
;;
@ -442,6 +447,9 @@ build_mindspore()
if [[ -n "$TRAIN_MODE" ]]; then
CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_${TRAIN_MODE}=ON"
fi
if [[ "X$ENABLE_SYM_FILE" = "Xon" ]]; then
CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_SYM_FILE=ON"
fi
if [[ "X$ENABLE_ASAN" = "Xon" ]]; then
CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_ASAN=ON"
fi

View File

@ -1,10 +1,10 @@
if(MSVC)
set(flatbuffers_CXXFLAGS "${CMAKE_CXX_FLAGS}")
set(flatbuffers_CFLAGS "${CMAKE_CXX_FLAGS}")
set(flatbuffers_CFLAGS "${CMAKE_C_FLAGS}")
set(flatbuffers_LDFLAGS "${CMAKE_SHARED_LINKER_FLAGS}")
else()
set(flatbuffers_CXXFLAGS "-D_FORTIFY_SOURCE=2 -O2")
set(flatbuffers_CFLAGS "-D_FORTIFY_SOURCE=2 -O2")
set(flatbuffers_CXXFLAGS "-fPIC -fPIE -D_FORTIFY_SOURCE=2 -O2 -fstack-protector-strong")
set(flatbuffers_CFLAGS "-fPIC -fPIE -D_FORTIFY_SOURCE=2 -O2 -fstack-protector-strong")
endif()
if(WIN32)

View File

@ -89,7 +89,6 @@ if(ENABLE_MINDDATA)
include(${CMAKE_SOURCE_DIR}/cmake/external_libs/tinyxml2.cmake)
include(${CMAKE_SOURCE_DIR}/cmake/external_libs/cppjieba.cmake)
include(${CMAKE_SOURCE_DIR}/cmake/external_libs/sentencepiece.cmake)
include(${CMAKE_SOURCE_DIR}/cmake/external_libs/ffmpeg.cmake)
endif()
if(ENABLE_MINDDATA)

View File

@ -25,6 +25,7 @@ option(ENABLE_ACL "enable acl" OFF)
option(ENABLE_GLIBCXX "enable_glibcxx" OFF)
option(MODE_ASCEND_ALL "supports all ascend platform" OFF)
option(MODE_ASCEND_ACL "supports ascend acl mode only" OFF)
option(ENABLE_SYM_FILE "enable sym file" OFF)
if(NOT ENABLE_D AND NOT ENABLE_TESTCASES AND NOT ENABLE_ACL AND NOT ENABLE_GE)
set(ENABLE_GLIBCXX ON)

View File

@ -12,6 +12,8 @@ set(CPACK_TEMPORARY_PACKAGE_FILE_NAME ${BUILD_PATH}/package/mindspore)
set(CPACK_TEMPORARY_INSTALL_DIRECTORY ${BUILD_PATH}/package/mindspore)
set(CPACK_PACK_ROOT_DIR ${BUILD_PATH}/package/)
set(CPACK_CMAKE_SOURCE_DIR ${CMAKE_SOURCE_DIR})
set(CPACK_ENABLE_SYM_FILE ${ENABLE_SYM_FILE})
set(CPACK_CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE})
if(ENABLE_GE)
set(CPACK_MS_BACKEND "ge")
set(CPACK_MS_TARGET "ascend or cpu")
@ -125,17 +127,6 @@ if(ENABLE_MINDDATA)
DESTINATION ${INSTALL_LIB_DIR} RENAME libicudata.so.67 COMPONENT mindspore)
install(FILES ${icu4c_LIBPATH}/libicui18n.so.67.1
DESTINATION ${INSTALL_LIB_DIR} RENAME libicui18n.so.67 COMPONENT mindspore)
install(FILES ${ffmpeg_LIBPATH}/libavcodec.so.58.91.100
DESTINATION ${INSTALL_LIB_DIR} RENAME libavcodec.so.58 COMPONENT mindspore)
install(FILES ${ffmpeg_LIBPATH}/libavformat.so.58.45.100
DESTINATION ${INSTALL_LIB_DIR} RENAME libavformat.so.58 COMPONENT mindspore)
install(FILES ${ffmpeg_LIBPATH}/libavutil.so.56.51.100
DESTINATION ${INSTALL_LIB_DIR} RENAME libavutil.so.56 COMPONENT mindspore)
install(FILES ${ffmpeg_LIBPATH}/libswresample.so.3.7.100
DESTINATION ${INSTALL_LIB_DIR} RENAME libswresample.so.3 COMPONENT mindspore)
install(FILES ${ffmpeg_LIBPATH}/libswscale.so.5.7.100
DESTINATION ${INSTALL_LIB_DIR} RENAME libswscale.so.5 COMPONENT mindspore)
endif()
if(ENABLE_CPU)

View File

@ -77,6 +77,48 @@ set(ENV{BACKEND_TARGET} ${CPACK_MS_TARGET})
set(ENV{MS_PACKAGE_NAME} ${CPACK_MS_PACKAGE_NAME})
set(ENV{COMMIT_ID} ${GIT_COMMIT_ID})
file(GLOB DEBUG_SYM
${MS_PACK_ROOT_DIR}/mindspore/*.so
${MS_PACK_ROOT_DIR}/mindspore/lib/*.so
)
file(GLOB DEBUG_STRIP_SYM
${MS_PACK_ROOT_DIR}/mindspore/*.so
${MS_PACK_ROOT_DIR}/mindspore/lib/*.so*
)
set(CMAKE_OBJCOPY $ENV{CROSS_COMPILE}objcopy)
set(CMAKE_STRIP $ENV{CROSS_COMPILE}strip)
if(CPACK_ENABLE_SYM_FILE)
foreach(schema ${DEBUG_SYM})
execute_process(
COMMAND ${CMAKE_OBJCOPY} "--only-keep-debug" ${schema} ${schema}.sym
WORKING_DIRECTORY ${MS_PACK_ROOT_DIR}
)
endforeach()
endif()
if("${CPACK_CMAKE_BUILD_TYPE}" STREQUAL "Release")
foreach(schema ${DEBUG_STRIP_SYM})
execute_process(
COMMAND ${CMAKE_STRIP} ${schema}
WORKING_DIRECTORY ${MS_PACK_ROOT_DIR}
)
endforeach()
endif()
file(GLOB DEBUG_SYM_FILE
${MS_PACK_ROOT_DIR}/mindspore/*.sym
${MS_PACK_ROOT_DIR}/mindspore/lib/*.sym
)
if(CPACK_ENABLE_SYM_FILE)
file(MAKE_DIRECTORY ${MS_ROOT_DIR}/debug_info)
file(COPY ${DEBUG_SYM_FILE} DESTINATION ${MS_ROOT_DIR}/debug_info/)
file(REMOVE_RECURSE ${DEBUG_SYM_FILE})
endif()
execute_process(
COMMAND ${PYTHON} ${MS_ROOT_DIR}/setup.py "bdist_wheel"
WORKING_DIRECTORY ${MS_PACK_ROOT_DIR}
@ -104,3 +146,16 @@ file(COPY ${MS_PACK_ROOT_DIR}/${NEW_FILE_NAME} DESTINATION ${MS_ROOT_DIR}/output
file(SHA256 ${MS_ROOT_DIR}/output/${NEW_FILE_NAME} SHA256_VAR)
file(WRITE ${MS_ROOT_DIR}/output/${NEW_FILE_NAME}.sha256 ${SHA256_VAR} " " ${NEW_FILE_NAME})
set(CMAKE_TAR $ENV{CROSS_COMPILE}tar)
if(CPACK_ENABLE_SYM_FILE)
file(MAKE_DIRECTORY ${MS_ROOT_DIR}/output/${PACKAGE_NAME}-${VERSION}-${PY_TAGS}-${PLATFORM_TAG})
file(COPY ${MS_ROOT_DIR}/debug_info/ DESTINATION
${MS_ROOT_DIR}/output/${PACKAGE_NAME}-${VERSION}-${PY_TAGS}-${PLATFORM_TAG}/)
execute_process(COMMAND
${CMAKE_COMMAND} -E ${CMAKE_TAR} cfv
${MS_ROOT_DIR}/output/${PACKAGE_NAME}-${VERSION}-${PY_TAGS}-${PLATFORM_TAG}.zip
${MS_ROOT_DIR}/output/${PACKAGE_NAME}-${VERSION}-${PY_TAGS}-${PLATFORM_TAG}/ --format=zip
WORKING_DIRECTORY ${MS_ROOT_DIR})
file(REMOVE_RECURSE ${MS_ROOT_DIR}/debug_info)
file(REMOVE_RECURSE ${MS_ROOT_DIR}/output/${PACKAGE_NAME}-${VERSION}-${PY_TAGS}-${PLATFORM_TAG})
endif()

View File

@ -91,18 +91,6 @@ if(ENABLE_MINDDATA)
DESTINATION ${INSTALL_LIB_DIR}
COMPONENT mindspore
)
file(GLOB_RECURSE FFMPEG_LIB_LIST
${ffmpeg_LIBPATH}/libavcodec*
${ffmpeg_LIBPATH}/libavformat*
${ffmpeg_LIBPATH}/libavutil*
${ffmpeg_LIBPATH}/libswresample*
${ffmpeg_LIBPATH}/libswscale*
)
install(
FILES ${FFMPEG_LIB_LIST}
DESTINATION ${INSTALL_LIB_DIR}
COMPONENT mindspore
)
endif()
# CPU mode

View File

@ -42,7 +42,6 @@ set(opencv_LIBPATH ${opencv_LIBPATH}/../bin/)
set(jpeg_turbo_LIBPATH ${jpeg_turbo_LIBPATH}/../bin/)
set(sqlite_LIBPATH ${sqlite_LIBPATH}/../bin/)
set(tinyxml2_LIBPATH ${tinyxml2_LIBPATH}/../bin/)
set(ffmpeg_LIBPATH ${ffmpeg_LIBPATH}/../bin/)
message("offline debugger does not support windows system temporarily")
@ -98,18 +97,6 @@ if(ENABLE_MINDDATA)
DESTINATION ${INSTALL_LIB_DIR}
COMPONENT mindspore
)
file(GLOB_RECURSE FFMPEG_LIB_LIST
${ffmpeg_LIBPATH}/libavcodec*
${ffmpeg_LIBPATH}/libavformat*
${ffmpeg_LIBPATH}/libavutil*
${ffmpeg_LIBPATH}/libswresample*
${ffmpeg_LIBPATH}/libswscale*
)
install(
FILES ${FFMPEG_LIB_LIST}
DESTINATION ${INSTALL_LIB_DIR}
COMPONENT mindspore
)
endif()
if(ENABLE_CPU)

View File

@ -1,2 +1,4 @@
approvers:
- zhoufeng54
reviewers:
- HW_KK

View File

@ -58,8 +58,11 @@ RUN apt install -y libffi-dev libssl-dev zlib1g-dev libbz2-dev libncurses5-dev \
&& make install -j4 \
&& rm -f /usr/local/bin/python \
&& rm -f /usr/local/bin/pip \
&& rm -f /usr/local/lib/libpython3.7m.so.1.0 \
&& ln -s ${PYTHON_ROOT_PATH}/bin/python3.7 /usr/local/bin/python \
&& ln -s ${PYTHON_ROOT_PATH}/bin/pip3.7 /usr/local/bin/pip \
&& ln -s ${PYTHON_ROOT_PATH}/lib/libpython3.7m.so.1.0 /usr/local/lib/libpython3.7m.so.1.0 \
&& ldconfig \
&& rm -rf /tmp/cpython-3.7.5 \
&& rm -f /tmp/v3.7.5.tar.gz

View File

@ -51,13 +51,16 @@ RUN apt install -y libffi-dev libssl-dev zlib1g-dev libbz2-dev libncurses5-dev \
&& tar -xvf v3.7.5.tar.gz \
&& cd /tmp/cpython-3.7.5 \
&& mkdir -p ${PYTHON_ROOT_PATH} \
&& ./configure --prefix=${PYTHON_ROOT_PATH} \
&& ./configure --prefix=${PYTHON_ROOT_PATH} --enable-shared \
&& make -j4 \
&& make install -j4 \
&& rm -f /usr/local/bin/python \
&& rm -f /usr/local/bin/pip \
&& rm -f /usr/local/lib/libpython3.7m.so.1.0 \
&& ln -s ${PYTHON_ROOT_PATH}/bin/python3.7 /usr/local/bin/python \
&& ln -s ${PYTHON_ROOT_PATH}/bin/pip3.7 /usr/local/bin/pip \
&& ln -s ${PYTHON_ROOT_PATH}/lib/libpython3.7m.so.1.0 /usr/local/lib/libpython3.7m.so.1.0 \
&& ldconfig \
&& rm -rf /tmp/cpython-3.7.5 \
&& rm -f /tmp/v3.7.5.tar.gz

View File

@ -1,4 +1,4 @@
FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04
FROM nvidia/cuda:11.1-cudnn8-devel-ubuntu18.04
MAINTAINER leonwanghui <leon.wanghui@huawei.com>
@ -43,7 +43,7 @@ RUN DEBIAN_FRONTEND=noninteractive apt install -y \
libnuma-dev
# Configure cuDNN (v8.0.5)
RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7.6.5 /usr/local/cuda/lib64/libcudnn.so
RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.8.0.5 /usr/local/cuda/lib64/libcudnn.so
# Set bash
RUN echo "dash dash/sh boolean false" | debconf-set-selections
@ -62,8 +62,11 @@ RUN apt install -y libffi-dev libssl-dev zlib1g-dev libbz2-dev libncurses5-dev \
&& make install -j4 \
&& rm -f /usr/local/bin/python \
&& rm -f /usr/local/bin/pip \
&& rm -f /usr/local/lib/libpython3.7m.so.1.0 \
&& ln -s ${PYTHON_ROOT_PATH}/bin/python3.7 /usr/local/bin/python \
&& ln -s ${PYTHON_ROOT_PATH}/bin/pip3.7 /usr/local/bin/pip \
&& ln -s ${PYTHON_ROOT_PATH}/lib/libpython3.7m.so.1.0 /usr/local/lib/libpython3.7m.so.1.0 \
&& ldconfig \
&& rm -rf /tmp/cpython-3.7.5 \
&& rm -f /tmp/v3.7.5.tar.gz

View File

@ -1,4 +1,4 @@
FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04
FROM nvidia/cuda:11.1-cudnn8-devel-ubuntu18.04
MAINTAINER leonwanghui <leon.wanghui@huawei.com>
@ -53,13 +53,16 @@ RUN apt install -y libffi-dev libssl-dev zlib1g-dev libbz2-dev libncurses5-dev \
&& tar -xvf v3.7.5.tar.gz \
&& cd /tmp/cpython-3.7.5 \
&& mkdir -p ${PYTHON_ROOT_PATH} \
&& ./configure --prefix=${PYTHON_ROOT_PATH} \
&& ./configure --prefix=${PYTHON_ROOT_PATH} --enable-shared \
&& make -j4 \
&& make install -j4 \
&& rm -f /usr/local/bin/python \
&& rm -f /usr/local/bin/pip \
&& rm -f /usr/local/lib/libpython3.7m.so.1.0 \
&& ln -s ${PYTHON_ROOT_PATH}/bin/python3.7 /usr/local/bin/python \
&& ln -s ${PYTHON_ROOT_PATH}/bin/pip3.7 /usr/local/bin/pip \
&& ln -s ${PYTHON_ROOT_PATH}/lib/libpython3.7m.so.1.0 /usr/local/lib/libpython3.7m.so.1.0 \
&& ldconfig \
&& rm -rf /tmp/cpython-3.7.5 \
&& rm -f /tmp/v3.7.5.tar.gz

View File

@ -38,12 +38,19 @@ class Allocator;
class Delegate;
class DeviceInfoContext;
/// \brief Context is used to store environment variables during execution.
class MS_API Context {
public:
Context();
~Context() = default;
/// \brief Set the number of threads at runtime. This option is only valid for MindSpore Lite.
///
/// \param[in] thread_num the number of threads at runtime.
void SetThreadNum(int32_t thread_num);
/// \brief Get the current thread number setting.
///
/// \return The current thread number setting.
int32_t GetThreadNum() const;
/// \brief Set the thread affinity to CPU cores.
@ -60,6 +67,10 @@ class MS_API Context {
void SetDelegate(const std::shared_ptr<Delegate> &delegate);
std::shared_ptr<Delegate> GetDelegate() const;
/// \brief Get a mutable reference of DeviceInfoContext vector in this context. Only MindSpore Lite supports
/// heterogeneous scenarios with multiple members in the vector.
///
/// \return Mutable reference of DeviceInfoContext vector in this context.
std::vector<std::shared_ptr<DeviceInfoContext>> &MutableDeviceInfo();
private:
@ -67,14 +78,24 @@ class MS_API Context {
std::shared_ptr<Data> data_;
};
/// \brief DeviceInfoContext defines different device contexts.
class MS_API DeviceInfoContext : public std::enable_shared_from_this<DeviceInfoContext> {
public:
struct Data;
DeviceInfoContext();
virtual ~DeviceInfoContext() = default;
/// \brief Get the type of this DeviceInfoContext.
///
/// \return Type of this DeviceInfoContext.
virtual enum DeviceType GetDeviceType() const = 0;
/// \brief Provides an RTTI-like cast that works when the -fno-rtti compilation option is turned on: converts this
/// DeviceInfoContext to a shared pointer of type T, and returns nullptr if the conversion fails.
///
/// \param T Type
/// \return A pointer of type T after conversion. If the conversion fails, it will be nullptr.
template <class T>
std::shared_ptr<T> Cast() {
static_assert(std::is_base_of<DeviceInfoContext, T>::value, "Wrong cast type.");
@ -98,27 +119,60 @@ class MS_API DeviceInfoContext : public std::enable_shared_from_this<DeviceInfoC
std::shared_ptr<Data> data_;
};
/// \brief Derived from DeviceInfoContext, the configuration of the model running on the CPU. This option is only valid
/// for MindSpore Lite.
class MS_API CPUDeviceInfo : public DeviceInfoContext {
public:
/// \brief Get the type of this DeviceInfoContext.
///
/// \return Type of this DeviceInfoContext.
enum DeviceType GetDeviceType() const override { return DeviceType::kCPU; };
/// \brief Set whether to enable float16 inference.
///
/// \param[in] is_fp16 Enable float16 inference or not.
void SetEnableFP16(bool is_fp16);
/// \brief Get whether float16 inference is enabled.
///
/// \return Whether float16 inference is enabled.
bool GetEnableFP16() const;
};
/// \brief Derived from DeviceInfoContext, the configuration of the model running on the NPU. This option is only valid
/// for MindSpore Lite.
class MS_API KirinNPUDeviceInfo : public DeviceInfoContext {
public:
/// \brief Get the type of this DeviceInfoContext.
///
/// \return Type of this DeviceInfoContext.
enum DeviceType GetDeviceType() const override { return DeviceType::kKirinNPU; };
/// \brief Set the NPU frequency.
///
/// \param[in] frequency Can be set to 1 (low power consumption), 2 (balanced), 3 (high performance), 4 (extreme
/// performance), default as 3.
void SetFrequency(int frequency);
/// \brief Get the NPU frequency.
///
/// \return NPU frequency
int GetFrequency() const;
};
/// \brief Derived from DeviceInfoContext, the configuration of the model running on the GPU.
class MS_API GPUDeviceInfo : public DeviceInfoContext {
public:
/// \brief Get the type of this DeviceInfoContext.
///
/// \return Type of this DeviceInfoContext.
enum DeviceType GetDeviceType() const override { return DeviceType::kGPU; };
/// \brief Set device id.
///
/// \param[in] device_id The device id.
void SetDeviceID(uint32_t device_id);
/// \brief Get the device id.
///
/// \return The device id.
uint32_t GetDeviceID() const;
void SetGpuTrtInferMode(bool gpu_trt_infer_mode);
@ -127,8 +181,15 @@ class MS_API GPUDeviceInfo : public DeviceInfoContext {
inline void SetPrecisionMode(const std::string &precison_mode);
inline std::string GetPrecisionMode() const;
/// \brief Set whether to enable float16 inference.
///
/// \param[in] is_fp16 Enable float16 inference or not.
void SetEnableFP16(bool is_fp16);
/// \brief Get whether float16 inference is enabled.
///
/// \return Whether float16 inference is enabled.
bool GetEnableFP16() const;
private:
void SetPrecisionMode(const std::vector<char> &precision_mode);
std::vector<char> GetPrecisionModeChar() const;
@ -139,52 +200,113 @@ void GPUDeviceInfo::SetPrecisionMode(const std::string &precision_mode) {
}
std::string GPUDeviceInfo::GetPrecisionMode() const { return CharToString(GetPrecisionModeChar()); }
/// \brief Derived from DeviceInfoContext, the configuration of the model running on the Ascend910. This option is
/// invalid for MindSpore Lite.
class MS_API Ascend910DeviceInfo : public DeviceInfoContext {
public:
/// \brief Get the type of this DeviceInfoContext.
///
/// \return Type of this DeviceInfoContext.
enum DeviceType GetDeviceType() const override { return DeviceType::kAscend910; };
/// \brief Set device id.
///
/// \param[in] device_id The device id.
void SetDeviceID(uint32_t device_id);
/// \brief Get the device id.
///
/// \return The device id.
uint32_t GetDeviceID() const;
};
/// \brief Derived from DeviceInfoContext, the configuration of the model running on the Ascend310. This option is
/// invalid for MindSpore Lite.
class MS_API Ascend310DeviceInfo : public DeviceInfoContext {
public:
/// \brief Get the type of this DeviceInfoContext.
///
/// \return Type of this DeviceInfoContext.
enum DeviceType GetDeviceType() const override { return DeviceType::kAscend310; };
/// \brief Set device id.
///
/// \param[in] device_id The device id.
void SetDeviceID(uint32_t device_id);
/// \brief Get the device id.
///
/// \return The device id.
uint32_t GetDeviceID() const;
inline void SetDumpConfigPath(const std::string &cfg_path);
inline std::string GetDumpConfigPath() const;
// aipp config file
/// \brief Set AIPP configuration file path.
///
/// \param[in] cfg_path AIPP configuration file path.
inline void SetInsertOpConfigPath(const std::string &cfg_path);
/// \brief Get AIPP configuration file path.
///
/// \return AIPP configuration file path.
inline std::string GetInsertOpConfigPath() const;
// nchw or nhwc
/// \brief Set format of model inputs.
///
/// \param[in] format Optional "NCHW", "NHWC", etc.
inline void SetInputFormat(const std::string &format);
/// \brief Get format of model inputs.
///
/// \return The format of model inputs.
inline std::string GetInputFormat() const;
// Mandatory while dynamic batch: e.g. "input_op_name1: 1,2,3,4;input_op_name2: 4,3,2,1"
/// \brief Set shape of model inputs.
///
/// \param[in] shape e.g. "input_op_name1: 1,2,3,4;input_op_name2: 4,3,2,1".
inline void SetInputShape(const std::string &shape);
/// \brief Get shape of model inputs.
///
/// \return The shape of model inputs.
inline std::string GetInputShape() const;
/// \brief Set shape of model inputs.
///
/// \param[in] shape e.g. {{1, {1,2,3,4}}, {2, {4,3,2,1}}} means the first input shape 1,2,3,4 and the second input
/// shape 4,3,2,1.
void SetInputShapeMap(const std::map<int, std::vector<int>> &shape);
/// \brief Get shape of model inputs.
///
/// \return The shape of model inputs.
std::map<int, std::vector<int>> GetInputShapeMap() const;
void SetDynamicBatchSize(const std::vector<size_t> &dynamic_batch_size);
inline std::string GetDynamicBatchSize() const;
// FP32, UINT8 or FP16, default as FP32
/// \brief Set type of model outputs.
///
/// \param[in] output_type FP32, UINT8 or FP16, default as FP32.
void SetOutputType(enum DataType output_type);
/// \brief Get type of model outputs.
///
/// \return The set type of model outputs.
enum DataType GetOutputType() const;
// "force_fp16", "allow_fp32_to_fp16", "must_keep_origin_dtype" or "allow_mix_precision", default as "force_fp16"
/// \brief Set precision mode of model.
///
/// \param[in] precision_mode Optional "force_fp16", "allow_fp32_to_fp16", "must_keep_origin_dtype" and
/// "allow_mix_precision", "force_fp16" is set as default
inline void SetPrecisionMode(const std::string &precision_mode);
/// \brief Get precision mode of model.
///
/// \return The set precision mode.
inline std::string GetPrecisionMode() const;
// Optional "high_performance" and "high_precision", "high_performance" is set as default
/// \brief Set op select implementation mode.
///
/// \param[in] op_select_impl_mode Optional "high_performance" and "high_precision", "high_performance" is set as
/// default.
inline void SetOpSelectImplMode(const std::string &op_select_impl_mode);
/// \brief Get op select implementation mode.
///
/// \return The set op select implementation mode.
inline std::string GetOpSelectImplMode() const;
inline void SetFusionSwitchConfigPath(const std::string &cfg_path);
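Taken together, the documentation added above describes a builder-style flow: create a Context, attach one or more DeviceInfoContext objects through MutableDeviceInfo(), and use Cast<T>() in place of RTTI. A minimal usage sketch, not part of this commit; the function name and the exact header path are illustrative:

#include <memory>
#include "include/api/context.h"

std::shared_ptr<mindspore::Context> MakeGpuContext() {
  auto context = std::make_shared<mindspore::Context>();
  context->SetThreadNum(4);  // only takes effect on MindSpore Lite

  auto gpu_info = std::make_shared<mindspore::GPUDeviceInfo>();
  gpu_info->SetDeviceID(0);
  gpu_info->SetEnableFP16(true);
  context->MutableDeviceInfo().push_back(gpu_info);

  // Cast<T>() is the documented RTTI substitute: casting the GPU info to
  // CPUDeviceInfo fails at runtime and yields a null shared pointer.
  auto as_cpu = gpu_info->Cast<mindspore::CPUDeviceInfo>();
  (void)as_cpu;  // as_cpu == nullptr here
  return context;
}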

View File

@ -37,32 +37,75 @@ class Metrics;
namespace dataset {
class Dataset;
} // namespace dataset
/// \brief The Model class is used to define a MindSpore model, facilitating computational graph management.
class MS_API Model {
public:
Model();
~Model();
Model(const Model &) = delete;
void operator=(const Model &) = delete;
/// \brief Builds a model so that it can run on a device.
///
/// \param[in] graph GraphCell is a derivative of Cell. Cell is not available currently. GraphCell can be constructed
/// from Graph, for example, model.Build(GraphCell(graph), context).
/// \param[in] model_context A context used to store options during execution.
/// \param[in] train_cfg A config used by training.
///
/// \return Status.
Status Build(GraphCell graph, const std::shared_ptr<Context> &model_context = nullptr,
const std::shared_ptr<TrainCfg> &train_cfg = nullptr);
/// \brief Resizes the shapes of inputs.
///
/// \param[in] inputs A vector that includes all input tensors in order.
/// \param[in] dims Defines the new shapes of inputs, should be consistent with inputs.
///
/// \return Status.
Status Resize(const std::vector<MSTensor> &inputs, const std::vector<std::vector<int64_t>> &dims);
/// \brief Run model inference.
///
/// \param[in] inputs A vector where model inputs are arranged in sequence.
/// \param[out] outputs A pointer to a vector into which the model outputs are filled in sequence.
/// \param[in] before CallBack before predict.
/// \param[in] after CallBack after predict.
///
/// \return Status.
Status Predict(const std::vector<MSTensor> &inputs, std::vector<MSTensor> *outputs,
const MSKernelCallBack &before = nullptr, const MSKernelCallBack &after = nullptr);
/// \brief Obtains all input tensors of the model.
///
/// \return The vector that includes all input tensors.
std::vector<MSTensor> GetInputs();
/// \brief Obtains the input tensor of the model by name.
///
/// \return The input tensor with the given name; if the name is not found, an invalid tensor is returned.
inline MSTensor GetInputByTensorName(const std::string &tensor_name);
Status InitMetrics(std::vector<Metrics *> metrics);
std::vector<Metrics *> GetMetrics();
/// \brief Obtains all output tensors of the model.
///
/// \return The vector that includes all output tensors.
std::vector<MSTensor> GetOutputs();
/// \brief Obtains names of all output tensors of the model.
///
/// \return A vector that includes names of all output tensors.
inline std::vector<std::string> GetOutputTensorNames();
/// \brief Obtains the output tensor of the model by name.
///
/// \return The output tensor with the given name; if the name is not found, an invalid tensor is returned.
inline MSTensor GetOutputByTensorName(const std::string &tensor_name);
inline std::vector<MSTensor> GetOutputsByNodeName(const std::string &tensor_name);
/// \brief Check whether the given device supports the model type.
///
/// \param[in] device_type Device type; options are kGPU, kAscend910, etc.
/// \param[in] model_type The type of model file, options are ModelType::kMindIR, ModelType::kOM.
///
/// \return Is supported or not.
static bool CheckModelSupport(enum DeviceType device_type, ModelType model_type);
Status SetTrainMode(bool train);
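The Build/Predict contract documented above lends itself to a short sketch. Illustrative only: RunOnce is not a MindSpore function, error handling is reduced to comparing the returned Status against kSuccess, and the header path is assumed.

#include <memory>
#include <vector>
#include "include/api/model.h"

mindspore::Status RunOnce(mindspore::Graph graph,
                          const std::shared_ptr<mindspore::Context> &ctx) {
  mindspore::Model model;
  mindspore::Status status = model.Build(mindspore::GraphCell(graph), ctx);
  if (status != mindspore::kSuccess) {
    return status;
  }
  std::vector<mindspore::MSTensor> inputs = model.GetInputs();
  // ... fill each input via MutableData()/DataSize() before predicting ...
  std::vector<mindspore::MSTensor> outputs;
  return model.Predict(inputs, &outputs);
}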

View File

@ -27,13 +27,43 @@
#include "include/api/dual_abi_helper.h"
namespace mindspore {
/// \brief The Serialization class is used to summarize methods for reading and writing model files.
class MS_API Serialization {
public:
/// \brief Loads a model file from memory buffer.
///
/// \param[in] model_data A buffer filled by model file.
/// \param[in] data_size The size of the buffer.
/// \param[in] model_type The type of the model file, options are ModelType::kMindIR, ModelType::kOM.
/// \param[out] graph The output parameter, an object that saves graph data.
/// \param[in] dec_key The decryption key, key length is 16, 24, or 32.
/// \param[in] dec_mode The decryption mode, options are AES-GCM, AES-CBC.
///
/// \return Status.
inline static Status Load(const void *model_data, size_t data_size, ModelType model_type, Graph *graph,
const Key &dec_key = {}, const std::string &dec_mode = kDecModeAesGcm);
/// \brief Loads a model file from a path. This is not supported on MindSpore Lite.
///
/// \param[in] file The path of model file.
/// \param[in] model_type The type of the model file, options are ModelType::kMindIR, ModelType::kOM.
/// \param[out] graph The output parameter, an object that saves graph data.
/// \param[in] dec_key The decryption key, key length is 16, 24, or 32.
/// \param[in] dec_mode The decryption mode, options are AES-GCM, AES-CBC.
///
/// \return Status.
inline static Status Load(const std::string &file, ModelType model_type, Graph *graph, const Key &dec_key = {},
const std::string &dec_mode = kDecModeAesGcm);
/// \brief Loads multiple models from multiple files. MindSpore Lite does not provide this feature.
///
/// \param[in] files The path of model files.
/// \param[in] model_type The type of the model files, options are ModelType::kMindIR, ModelType::kOM.
/// \param[out] graphs The output parameter, a vector of objects that save graph data.
/// \param[in] dec_key The decryption key, key length is 16, 24, or 32.
/// \param[in] dec_mode The decryption mode, options are AES-GCM, AES-CBC.
///
/// \return Status.
inline static Status Load(const std::vector<std::string> &files, ModelType model_type, std::vector<Graph> *graphs,
const Key &dec_key = {}, const std::string &dec_mode = kDecModeAesGcm);
static Status SetParameters(const std::map<std::string, Buffer> &parameters, Model *model);
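A sketch of the file-path Load overload feeding Model::Build. Illustrative: LoadAndBuild is not a MindSpore API, the header paths are assumed, and the dec_key/dec_mode defaults mean the file is read without decryption.

#include <memory>
#include <string>
#include "include/api/model.h"
#include "include/api/serialization.h"

mindspore::Status LoadAndBuild(const std::string &mindir_path,
                               const std::shared_ptr<mindspore::Context> &ctx,
                               mindspore::Model *model) {
  mindspore::Graph graph;
  mindspore::Status status =
      mindspore::Serialization::Load(mindir_path, mindspore::ModelType::kMindIR, &graph);
  if (status != mindspore::kSuccess) {
    return status;
  }
  return model->Build(mindspore::GraphCell(graph), ctx);
}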

View File

@ -25,11 +25,17 @@
#include "include/api/dual_abi_helper.h"
#include "include/api/format.h"
#ifndef MS_API
#ifdef _WIN32
#ifdef BUILDING_DLL
#define MS_API __declspec(dllexport)
#else
#define MS_API __declspec(dllimport)
#endif
#else
#define MS_API __attribute__((visibility("default")))
#endif
#endif
namespace mindspore {
enum ModelType : uint32_t {
@ -64,18 +70,64 @@ struct QuantParam {
};
class Allocator;
/// \brief The MSTensor class defines a tensor in MindSpore.
class MS_API MSTensor {
public:
class Impl;
/// \brief Creates a MSTensor object whose data needs to be copied before being accessed by Model; must be used in
/// pairs with DestroyTensorPtr.
///
/// \param[in] name The name of the MSTensor.
/// \param[in] type The data type of the MSTensor.
/// \param[in] shape The shape of the MSTensor.
/// \param[in] data The data pointer that points to allocated memory.
/// \param[in] data_len The length of the memory, in bytes.
///
/// \return A pointer of MSTensor.
static inline MSTensor *CreateTensor(const std::string &name, DataType type, const std::vector<int64_t> &shape,
const void *data, size_t data_len) noexcept;
/// \brief Creates a MSTensor object whose data can be directly accessed by Model; must be used in pairs with
/// DestroyTensorPtr.
///
/// \param[in] name The name of the MSTensor.
/// \param[in] type The data type of the MSTensor.
/// \param[in] shape The shape of the MSTensor.
/// \param[in] data The data pointer that points to allocated memory.
/// \param[in] data_len The length of the memory, in bytes.
///
/// \return A pointer of MSTensor.
static inline MSTensor *CreateRefTensor(const std::string &name, DataType type, const std::vector<int64_t> &shape,
const void *data, size_t data_len) noexcept;
/// \brief Creates a MSTensor object whose device data can be directly accessed by Model; must be used in pairs with
/// DestroyTensorPtr.
///
/// \param[in] name The name of the MSTensor.
/// \param[in] type The data type of the MSTensor.
/// \param[in] shape The shape of the MSTensor.
/// \param[in] data The data pointer that points to device memory.
/// \param[in] data_len The length of the memory, in bytes.
///
/// \return A pointer of MSTensor.
static inline MSTensor *CreateDevTensor(const std::string &name, DataType type, const std::vector<int64_t> &shape,
const void *data, size_t data_len) noexcept;
/// \brief Create a string type MSTensor object whose data can be accessed by Model only after being copied; must be
/// used in pairs with DestroyTensorPtr.
///
/// \param[in] name The name of the MSTensor.
/// \param[in] str A vector container containing several strings.
///
/// \return A pointer of MSTensor.
static inline MSTensor *StringsToTensor(const std::string &name, const std::vector<std::string> &str);
/// \brief Parse the string type MSTensor object into strings.
///
/// \param[in] tensor A MSTensor object.
///
/// \return A vector container containing several strings.
static inline std::vector<std::string> TensorToStrings(const MSTensor &tensor);
/// \brief Destroy an object created by Clone, StringsToTensor, CreateRefTensor, CreateDevTensor or CreateTensor. Do
/// not use it to destroy MSTensor from other sources.
///
/// \param[in] tensor A MSTensor object.
static void DestroyTensorPtr(MSTensor *tensor) noexcept;
MSTensor();
@ -85,19 +137,51 @@ class MS_API MSTensor {
explicit MSTensor(std::nullptr_t);
~MSTensor();
/// \brief Obtains the name of the MSTensor.
///
/// \return The name of the MSTensor.
inline std::string Name() const;
/// \brief Obtains the data type of the MSTensor.
///
/// \return The data type of the MSTensor.
enum DataType DataType() const;
/// \brief Obtains the shape of the MSTensor.
///
/// \return The shape of the MSTensor.
const std::vector<int64_t> &Shape() const;
/// \brief Obtains the number of elements of the MSTensor.
///
/// \return The number of elements of the MSTensor.
int64_t ElementNum() const;
/// \brief Obtains a shared pointer to the copy of data of the MSTensor. The data can be read on host.
///
/// \return A shared pointer to the copy of data of the MSTensor.
std::shared_ptr<const void> Data() const;
/// \brief Obtains the pointer to the data of the MSTensor. If the MSTensor is a device tensor, the data cannot be
/// accessed directly on host.
///
/// \return A pointer to the data of the MSTensor.
void *MutableData();
/// \brief Obtains the length of the data of the MSTensor, in bytes.
///
/// \return The length of the data of the MSTensor, in bytes.
size_t DataSize() const;
/// \brief Gets the boolean value that indicates whether the memory of MSTensor is on device.
///
/// \return The boolean value that indicates whether the memory of MSTensor is on device.
bool IsDevice() const;
/// \brief Gets a deep copy of the MSTensor, must be used in pairs with DestroyTensorPtr.
///
/// \return A pointer that points to a deep copy of the MSTensor.
MSTensor *Clone() const;
/// \brief Gets the boolean value that indicates whether the MSTensor is valid.
///
/// \return The boolean value that indicates whether the MSTensor is valid.
bool operator==(std::nullptr_t) const;
/// \brief Gets the boolean value that indicates whether the MSTensor is valid.
///
/// \return The boolean value that indicates whether the MSTensor is valid.
bool operator!=(std::nullptr_t) const;
bool operator==(const MSTensor &tensor) const;
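The CreateTensor/DestroyTensorPtr pairing documented above in a minimal sketch; the function name is illustrative. CreateTensor copies the buffer, so the stack array can safely go out of scope after the call.

#include "include/api/types.h"

void MakeAndReleaseTensor() {
  float data[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  mindspore::MSTensor *t = mindspore::MSTensor::CreateTensor(
      "x", mindspore::DataType::kNumberTypeFloat32, {2, 2}, data, sizeof(data));
  if (t == nullptr) {
    return;
  }
  // For this tensor: ElementNum() == 4 and DataSize() == 16 bytes.
  mindspore::MSTensor::DestroyTensorPtr(t);  // pairs with CreateTensor
}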

View File

@ -23,6 +23,7 @@ from itertools import repeat, zip_longest
from collections import deque
from collections.abc import Iterable
import numpy as np
from mindspore import context
from mindspore import log as logger
from mindspore.common import dtype as mstype
from mindspore._c_expression import Tensor as Tensor_
@ -846,6 +847,10 @@ class Validator:
"""Returns an empty Tensor."""
return Tensor_(dtype, shape)
@staticmethod
def check_type_support(dtype, device, supported_dtypes):
return dtype in supported_dtypes or not context.get_context('device_target') == device
def check_input_format(input_param):
"""Judge input format."""

View File

@ -21,7 +21,7 @@ from . import model
def estimate_ops(json_str: str):
    """Call costmodel to estimate ops."""
    """Call cost model to estimate ops."""
    try:
        json_obj = json.loads(json_str)
        graph_descs = json_obj["graph_desc"]
@ -38,7 +38,7 @@ def estimate_ops(json_str: str):
def estimate_calulation_amount(json_str: str):
    """Call costmodel to estimate calculation amount of op."""
    """Call cost model to estimate calculation amount of op."""
    try:
        graph_desc = json.loads(json_str)
        comp = model.load_composite(graph_desc)

View File

@ -24,7 +24,7 @@ from . import utils
def split_with_json(json_str, flags_str):
    """Call costmodel to split GraphKernel"""
    """Call cost model to split GraphKernel"""
    try:
        graph_desc = json.loads(json_str)
        flags = json.loads(flags_str)

View File

@ -50,11 +50,6 @@ def _compile_akg_task_gpu(json_strs, attrs):
        if not res:
            raise ValueError("Compile error, args: {}! build attrs: {}".format(json_str, attrs))

    pid_path = os.path.realpath("./cuda_meta_" + str(os.getpid()))
    if os.path.exists(pid_path):
        copy_json(pid_path, os.path.realpath("./cuda_meta_" + str(os.getppid())))
        shutil.rmtree(pid_path)

def _compile_akg_task_ascend(json_strs, attrs):
    """

View File

@ -159,12 +159,17 @@ def resolve_symbol(namespace, symbol):
if getattr(resolve_, "__hash__") is None:
return resolve_
# Raise NotImplementedError when parsing the numpy methods, but not the numpy constant.
if namespace.name == "numpy" and isinstance(resolve_, (types.FunctionType, types.MethodType, types.ModuleType)):
raise NotImplementedError(
f"MindSpore does not support to use the numpy methods in the function construct with the graph mode.")
# If need trope the obj
if resolve_ in convert_object_map:
resolve_ = convert_object_map.get(resolve_)
logger.debug("convert resolve = %r", resolve_)
if resolve_ == NO_IMPLEMENT:
raise NotImplementedError(f"Not support for `{symbol}`")
raise NotImplementedError(f"Not support for `{symbol}`.")
except Exception as e:
if isinstance(e, NotImplementedError):
raise e

View File

@ -1312,7 +1312,8 @@ def sum(x, axis=None, dtype=None, keepdims=False, initial=None): # pylint: disab
>>> print(input_x.sum(axis=1))
[10. 35.]
"""
dtype = x.dtype if dtype is None else dtype
input_x = x.astype(mstype.int32) if x.dtype == mstype.bool_ else x
dtype = input_x.dtype if dtype is None else dtype
if not isinstance(keepdims, int):
const_utils.raise_type_error("integer argument expected")
if initial is not None and not isinstance(initial, (int, float, bool)):
@ -1322,14 +1323,14 @@ def sum(x, axis=None, dtype=None, keepdims=False, initial=None): # pylint: disab
    else:
        axis = check_and_canonicalize_axes(axis, x.ndim)

    if x.dtype == mstype.bool_:
        x = x.astype("int32")
    if not check_type_support(input_x.dtype, 'GPU', (mstype.float64, mstype.float32, mstype.float16)):
        input_x = input_x.astype(mstype.float32)
    if 0 in x.shape:
        x = const_utils.make_tensor([0], x.dtype)
    if keepdims:
        res = _reduce_sum_keepdims(x, axis)
        res = _reduce_sum_keepdims(input_x, axis)
    else:
        res = _reduce_sum_default(x, axis)
        res = _reduce_sum_default(input_x, axis)
    if initial is not None:
        res += initial
    return res.astype(dtype)
@ -1648,6 +1649,7 @@ get_log2_size = constexpr(validator.get_log2_size)
check_axis_type = constexpr(validator.check_axis_type)
check_and_canonicalize_axes = constexpr(validator.check_and_canonicalize_axes)
empty_compile = constexpr(validator.empty_compile)
check_type_support = constexpr(validator.check_type_support)
def tensor_bool(x):

View File

@ -325,7 +325,7 @@ endif()
set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE)
set_property(SOURCE "pipeline/jit/init.cc" PROPERTY
COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_PIPELINE)
pybind11_add_module(_c_expression "pipeline/jit/init.cc")
pybind11_add_module(_c_expression NO_EXTRAS "pipeline/jit/init.cc")
MESSAGE(STATUS "operation system is ${CMAKE_SYSTEM}")
if(CMAKE_SYSTEM_NAME MATCHES "Linux")

View File

@ -35,6 +35,7 @@ if(ENABLE_CPU)
"cpu/fl/*.cc"
"cpu/ps/*.cc"
"cpu/quantum/*.cc"
"cpu/pyfunc/*.cc"
)
if(NOT ENABLE_MPI)

View File

@ -16,6 +16,11 @@
#include "backend/kernel_compiler/akg/akg_kernel_build.h"
#include <stdio.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>
#include <algorithm>
#include <map>
#include <memory>
@ -23,6 +28,7 @@
#include <unordered_set>
#include <utility>
#include <vector>
#include <iostream>
#include "nlohmann/json.hpp"
#include "ir/dtype.h"
#include "ir/func_graph.h"
@ -34,9 +40,320 @@
namespace mindspore {
namespace kernel {
#define INIT_SET_FROM_2D_ARRAY(set_var, list_idx) \
std::set<size_t> set_var(kernel_lists_[list_idx], kernel_lists_[list_idx] + kernel_lists_[list_idx][kMaxKernelNum_]);
#define LIST_BEGIN(list_idx) kernel_lists_[list_idx]
#define LIST_END(list_idx) (kernel_lists_[list_idx] + kernel_lists_[list_idx][kMaxKernelNum_])
#define RESET_LIST_SIZE(list_idx, val) kernel_lists_[list_idx][kMaxKernelNum_] = val
#define INCREASE_LIST_SIZE(list_idx, val) kernel_lists_[list_idx][kMaxKernelNum_] += val
constexpr int32_t PROCESS_NUM = 16;
constexpr int32_t TIME_OUT = 300;
bool AkgKernelPool::LockMng::TryLock() {
// Try to lock up to 100 times; return an error if the lock cannot be acquired.
uint32_t trial = 100;
int32_t ret = -1;
while (trial > 0) {
ret = lockf(fd_, F_TLOCK, 0);
if (ret == 0 || (errno != EACCES && errno != EAGAIN)) {
break;
}
trial--;
usleep(5000);
}
if (ret == -1) {
MS_LOG(ERROR) << "Failed to acquire the lock, errno:" << strerror(errno) << ".";
return false;
}
return true;
}
void AkgKernelPool::LockMng::Unlock() {
auto ret = lockf(fd_, F_ULOCK, 0);
if (ret == -1) {
MS_LOG(ERROR) << "Failed to release the lock, errno:" << strerror(errno);
}
}
std::string AkgKernelPool::GetCurrentPath() {
char cwd[PATH_MAX];
char *ret = getcwd(cwd, sizeof(cwd));
if (ret == nullptr) {
MS_LOG(ERROR) << "Get current work directory failed, errno:" << strerror(errno);
return "";
}
char abspath[PATH_MAX];
char *res = realpath(cwd, abspath);
if (res == nullptr) {
MS_LOG(ERROR) << "Change to realpath failed, errno:" << strerror(errno);
return "";
}
return std::string(abspath);
}
void *AkgKernelPool::CreateSharedMem(const std::string &path) {
is_creator_ = false;
auto hash_id = std::hash<std::string>()(path);
auto key_id = static_cast<key_t>(hash_id);
auto mem_size = sizeof(size_t) * kListNum_ * (kMaxKernelNum_ + 1) + 512;
{
LockMng lock(fd_);
if (!lock.locked_) {
MS_LOG(ERROR) << "Failed to acquire lock.";
return nullptr;
}
// check if the shared memory exists or not.
// remove shared memory if exists and the nattach is 0
struct shmid_ds buf;
auto id = shmget(key_id, mem_size, 0);
if (id != -1) {
auto ret = shmctl(id, IPC_STAT, &buf);
if (ret == -1) {
MS_LOG(ERROR) << "Failed to get the info of shared memory, errno:" << strerror(errno);
return nullptr;
}
if (buf.shm_nattch == 0) {
ret = shmctl(id, IPC_RMID, nullptr);
if (ret < 0) {
MS_LOG(EXCEPTION) << "Realse shared_mem failed, errno:" << strerror(errno);
}
}
}
}
LockMng lock(fd_);
if (!lock.locked_) {
MS_LOG(ERROR) << "Failed to acquire lock.";
return nullptr;
}
shm_id_ = shmget(key_id, mem_size, IPC_CREAT | IPC_EXCL | 0600);
if (shm_id_ == -1) {
if (errno == EEXIST) {
shm_id_ = shmget(key_id, mem_size, 0);
}
if (shm_id_ == -1) {
MS_LOG(ERROR) << "Create shared_mem failed, error no:" << strerror(errno);
return nullptr;
}
} else {
is_creator_ = true;
}
auto local_addr = shmat(shm_id_, nullptr, 0);
if (local_addr == reinterpret_cast<void *>(-1)) {
MS_LOG(ERROR) << "Attach to shared_mem failed, error no:" << strerror(errno);
return nullptr;
}
if (is_creator_) {
(void)memset(local_addr, 0, mem_size);
}
return local_addr;
}
int32_t AkgKernelPool::Init(const std::vector<JsonNodePair> &build_args) {
auto cp = GetCurrentPath();
if (cp.empty()) {
return -1;
}
fd_ = open(kKeyName_, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
if (fd_ == -1) {
MS_LOG(ERROR) << "open file <" << kKeyName_ << "> failed, errno:" << strerror(errno);
return -1;
}
auto addr = CreateSharedMem(cp);
if (addr == nullptr) {
return -1;
}
InitKernelLists(addr);
auto ret = AddKernels(build_args);
if (ret != 0) {
MS_LOG(ERROR) << "AkgKernelPool AddKernels failed.";
return -1;
}
return 0;
}
AkgKernelPool::~AkgKernelPool() {
// Detach shared memory
auto ret = shmdt(reinterpret_cast<void *>(kernel_lists_[0]));
if (ret < 0) {
MS_LOG(EXCEPTION) << "Shared_mem detach failed, errno:" << strerror(errno);
}
// Release shared memory
if (is_creator_) {
ret = shmctl(shm_id_, IPC_RMID, nullptr);
if (ret < 0) {
MS_LOG(EXCEPTION) << "Realse shared_mem failed, errno:" << strerror(errno);
}
}
// Close key file
if (fd_ != -1) {
(void)close(fd_);
}
}
int32_t AkgKernelPool::AddKernels(const std::vector<JsonNodePair> &build_args) {
LockMng lock(fd_);
if (!lock.locked_) {
MS_LOG(ERROR) << "Failed to acquire lock.";
return -1;
}
INIT_SET_FROM_2D_ARRAY(todo_list, kToDoIdx_);
INIT_SET_FROM_2D_ARRAY(doing_list, kDoingIdx_);
INIT_SET_FROM_2D_ARRAY(done_list, kDoneIdx_);
for (const auto &[json_generator, anf_node] : build_args) {
MS_EXCEPTION_IF_NULL(anf_node);
auto kernel_name = json_generator.kernel_name();
auto hash_id = std::hash<std::string>()(kernel_name);
if (self_kernel_ids_.count(hash_id) != 0) {
MS_LOG(ERROR) << "Duplicated hash_id in list.";
return -1;
}
self_kernel_ids_.emplace(hash_id);
}
std::set<size_t> diff_from_todo;
std::set<size_t> diff_from_doing;
std::set<size_t> diff_from_done;
// Add each unique kernel only once, so check whether it already exists in todo_list, doing_list, or done_list.
std::set_difference(self_kernel_ids_.begin(), self_kernel_ids_.end(), todo_list.begin(), todo_list.end(),
std::inserter(diff_from_todo, diff_from_todo.begin()));
std::set_difference(diff_from_todo.begin(), diff_from_todo.end(), doing_list.begin(), doing_list.end(),
std::inserter(diff_from_doing, diff_from_doing.begin()));
std::set_difference(diff_from_doing.begin(), diff_from_doing.end(), done_list.begin(), done_list.end(),
std::inserter(diff_from_done, diff_from_done.begin()));
auto new_kernel_size = diff_from_done.size();
if (new_kernel_size + todo_list.size() > static_cast<size_t>(kMaxKernelNum_)) {
MS_LOG(ERROR) << "The size of kernels is " << new_kernel_size << ", while the left space of the pool is "
<< kMaxKernelNum_ - todo_list.size();
return -1;
}
std::copy(diff_from_done.begin(), diff_from_done.end(), LIST_END(kToDoIdx_));
INCREASE_LIST_SIZE(kToDoIdx_, new_kernel_size);
return 0;
}
int32_t AkgKernelPool::FetchKernels(std::set<size_t> *out) {
LockMng lock(fd_);
if (!lock.locked_) {
MS_LOG(ERROR) << "Failed to acquire lock.";
return -1;
}
std::set<size_t> left_in_todo_list;
// filter out kernels which belong to other processes
auto FilterBySelfList = [&left_in_todo_list, &out, this](size_t id) {
if (this->self_kernel_ids_.count(id) != 0) {
out->emplace(id);
} else {
left_in_todo_list.emplace(id);
}
};
std::for_each(LIST_BEGIN(kToDoIdx_), LIST_END(kToDoIdx_), FilterBySelfList);
std::copy(out->begin(), out->end(), LIST_END(kDoingIdx_));
INCREASE_LIST_SIZE(kDoingIdx_, out->size());
std::copy(left_in_todo_list.begin(), left_in_todo_list.end(), LIST_BEGIN(kToDoIdx_));
RESET_LIST_SIZE(kToDoIdx_, left_in_todo_list.size());
return 0;
}
int32_t AkgKernelPool::UpdateAndWait(const std::set<size_t> &ids) {
if (!ids.empty()) {
LockMng lock(fd_);
if (!lock.locked_) {
MS_LOG(ERROR) << "Failed to acquire lock.";
return -1;
}
// update the state of finished kernels to `done`
std::copy(ids.begin(), ids.end(), LIST_END(kDoneIdx_));
INCREASE_LIST_SIZE(kDoneIdx_, ids.size());
// delete the finished kernels from doing_list
std::vector<size_t> left_in_doing_list;
INIT_SET_FROM_2D_ARRAY(doing_list, kDoingIdx_);
std::set_difference(doing_list.begin(), doing_list.end(), ids.begin(), ids.end(),
std::inserter(left_in_doing_list, left_in_doing_list.begin()));
std::copy(left_in_doing_list.begin(), left_in_doing_list.end(), LIST_BEGIN(kDoingIdx_));
RESET_LIST_SIZE(kDoingIdx_, left_in_doing_list.size());
}
auto ret = Wait();
if (ret != 0) {
MS_LOG(ERROR) << "AkgKernelPool Wait failed.";
return -1;
}
return 0;
}
int32_t AkgKernelPool::Wait() {
// wait until all the kernels which belong to this process finish compiling
uint32_t trials = 1000;
while (trials > 0) {
{
LockMng lock(fd_);
if (!lock.locked_) {
MS_LOG(ERROR) << "Failed to acquire lock.";
return -1;
}
INIT_SET_FROM_2D_ARRAY(done_list, kDoneIdx_);
if (std::all_of(self_kernel_ids_.begin(), self_kernel_ids_.end(),
[&done_list](size_t id) { return done_list.count(id) != 0; })) {
return 0;
}
}
usleep(1000000);
trials--;
}
MS_LOG(ERROR) << "Time out while wait kernel compiling";
return -1;
}
std::vector<std::string> AkgKernelBuilder::GetNotCachedKernelJsons(const std::vector<JsonNodePair> &build_args) {
// Remove cached nodes, gather unique nodes, and collect repeated nodes which need postprocessing.
std::vector<std::string> jsons;
@ -66,6 +383,31 @@ std::vector<std::string> AkgKernelBuilder::GetNotCachedKernelJsons(const std::ve
return jsons;
}
std::vector<JsonNodePair> AkgKernelBuilder::GetNotCachedKernels(const std::vector<JsonNodePair> &build_args) {
std::unordered_set<std::string> kernel_name_set;
std::vector<JsonNodePair> new_build_args;
for (const auto &[json_generator, anf_node] : build_args) {
MS_EXCEPTION_IF_NULL(anf_node);
auto kernel_name = json_generator.kernel_name();
auto cached_kernel_pack = AkgSearchCache(kernel_name);
if (cached_kernel_pack != nullptr) {
MS_LOG(DEBUG) << "Use cached kernel, kernel_name[" << kernel_name << "], fullname_with_scope["
<< anf_node->fullname_with_scope() << "].";
AkgSetKernelMod(cached_kernel_pack, json_generator, anf_node);
continue;
}
if (kernel_name_set.count(kernel_name) != 0) {
repeat_nodes_.push_back({json_generator, anf_node});
continue;
}
kernel_name_set.insert(kernel_name);
new_build_args.push_back({json_generator, anf_node});
}
return new_build_args;
}
bool AkgKernelBuilder::InsertToCache(const std::vector<JsonNodePair> &build_args) {
for (const auto &[json_generator, anf_node] : build_args) {
auto kernel_name = json_generator.kernel_name();
@ -97,32 +439,77 @@ bool AkgKernelBuilder::HandleRepeatNodes() {
return true;
}
std::vector<std::string> AkgKernelBuilder::GetKernelJsonsByHashId(const std::vector<JsonNodePair> &build_args,
std::set<size_t> fetched_ids) {
std::vector<std::string> jsons;
for (const auto &[json_generator, anf_node] : build_args) {
MS_EXCEPTION_IF_NULL(anf_node);
auto kernel_name = json_generator.kernel_name();
auto hash_id = std::hash<std::string>()(kernel_name);
if (fetched_ids.count(hash_id) == 0) {
continue;
}
auto kernel_json = json_generator.kernel_json_str();
AkgSaveJsonInfo(kernel_name, kernel_json);
jsons.push_back(kernel_json);
}
return jsons;
}
bool AkgKernelBuilder::AkgOpParallelBuild(const std::vector<JsonNodePair> &build_args) {
repeat_nodes_.clear();
auto jsons = GetNotCachedKernelJsons(build_args);
if (jsons.empty()) {
auto new_build_args = GetNotCachedKernels(build_args);
if (new_build_args.empty()) {
return true;
}
auto client = GetClient();
MS_EXCEPTION_IF_NULL(client);
if (!client->AkgStart(PROCESS_NUM, TIME_OUT)) {
MS_LOG(ERROR) << "Akg start failed.";
AkgKernelPool kp;
auto ret = kp.Init(new_build_args);
if (ret != 0) {
MS_LOG(ERROR) << "AkgKernelPool init failed.";
return false;
}
auto attrs = CollectBuildAttrs();
if (!attrs.empty() && !client->AkgSendAttr(attrs)) {
MS_LOG(ERROR) << "Akg send attr failed.";
std::set<size_t> fetched_ids;
ret = kp.FetchKernels(&fetched_ids);
if (ret != 0) {
MS_LOG(ERROR) << "AkgKernelPool FetchKernels failed.";
return false;
}
if (!client->AkgSendData(jsons)) {
MS_LOG(ERROR) << "Akg send data failed.";
return false;
}
if (!client->AkgWait()) {
MS_LOG(ERROR) << "Akg compile failed.";
if (!fetched_ids.empty()) {
auto jsons = GetKernelJsonsByHashId(new_build_args, fetched_ids);
auto client = GetClient();
MS_EXCEPTION_IF_NULL(client);
if (!client->AkgStart(PROCESS_NUM, TIME_OUT)) {
MS_LOG(ERROR) << "Akg start failed.";
return false;
}
auto attrs = CollectBuildAttrs();
if (!attrs.empty() && !client->AkgSendAttr(attrs)) {
MS_LOG(ERROR) << "Akg send attr failed.";
return false;
}
if (!client->AkgSendData(jsons)) {
MS_LOG(ERROR) << "Akg send data failed.";
return false;
}
if (!client->AkgWait()) {
MS_LOG(ERROR) << "Akg compile failed.";
return false;
}
}
ret = kp.UpdateAndWait(fetched_ids);
if (ret != 0) {
MS_LOG(ERROR) << "AkgKernelPool UpdateAndWait failed.";
return false;
}
// All unique done here, cache them and set kernel.
if (!InsertToCache(build_args)) {
MS_LOG(ERROR) << "Insert cache failed.";

View File

@ -17,10 +17,13 @@
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_AKG_KERNEL_BUILD_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_AKG_KERNEL_BUILD_H_
#include <sys/shm.h>
#include <string>
#include <utility>
#include <vector>
#include <map>
#include <set>
#include "ir/anf.h"
#include "backend/kernel_compiler/kernel.h"
#include "backend/session/kernel_build_client.h"
@ -45,12 +48,84 @@ class AkgKernelBuilder {
private:
std::vector<std::string> GetNotCachedKernelJsons(const std::vector<JsonNodePair> &build_args);
std::vector<JsonNodePair> GetNotCachedKernels(const std::vector<JsonNodePair> &build_args);
std::vector<std::string> GetKernelJsonsByHashId(const std::vector<JsonNodePair> &build_args,
std::set<size_t> fetched_ids);
bool InsertToCache(const std::vector<JsonNodePair> &build_args);
bool HandleRepeatNodes();
bool AkgOpParallelBuild(const std::vector<JsonNodePair> &build_args);
std::vector<JsonNodePair> repeat_nodes_;
std::string CollectBuildAttrs();
};
class AkgKernelPool {
public:
class LockMng {
public:
explicit LockMng(int32_t fd) {
fd_ = fd;
locked_ = TryLock();
}
virtual ~LockMng() {
if (locked_) {
Unlock();
}
}
bool locked_{false};
private:
bool TryLock();
void Unlock();
int32_t fd_{-1};
};
public:
AkgKernelPool() = default;
virtual ~AkgKernelPool();
int32_t Init(const std::vector<JsonNodePair> &build_args);
int32_t FetchKernels(std::set<size_t> *out);
int32_t UpdateAndWait(const std::set<size_t> &ids);
constexpr inline static size_t kMaxKernelNum_{1000};
constexpr inline static key_t kSharedMemKey_{0x57565845};
// allocate memory for todo_list, doing_list, done_list
constexpr inline static size_t kListNum_{3};
constexpr inline static auto kKeyName_ = "./akg_build_tmp.key";
constexpr inline static int32_t kToDoIdx_ = 0;
constexpr inline static int32_t kDoingIdx_ = 1;
constexpr inline static int32_t kDoneIdx_ = 2;
private:
void *CreateSharedMem(const std::string &path);
std::string GetCurrentPath();
inline void InitKernelLists(void *addr) {
kernel_lists_[kToDoIdx_] = reinterpret_cast<size_t *>(addr);
kernel_lists_[kDoingIdx_] = kernel_lists_[kToDoIdx_] + kMaxKernelNum_ + 1;
kernel_lists_[kDoneIdx_] = kernel_lists_[kDoingIdx_] + kMaxKernelNum_ + 1;
}
int32_t AddKernels(const std::vector<JsonNodePair> &kernel_jsons);
int32_t Wait();
int32_t shm_id_{-1};
bool is_creator_{false};
int32_t fd_{-1};
// includes 3 lists: todo_list, doing_list, done_list.
// each list has kMaxKernelNum_ + 1 elements, and the count of elements in each list
// is stored in kernel_lists_[xx][kMaxKernelNum_]
size_t *kernel_lists_[kListNum_]{nullptr, nullptr, nullptr};
std::set<size_t> self_kernel_ids_;
};
} // namespace kernel
} // namespace mindspore
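The pool coordinates concurrent builds through three shared-memory lists (todo, doing, done) guarded by a lockf-based file lock. A usage sketch that mirrors AkgOpParallelBuild in the .cc above; BuildThroughPool is illustrative, not part of the commit:

#include <set>
#include <vector>
#include "backend/kernel_compiler/akg/akg_kernel_build.h"

bool BuildThroughPool(const std::vector<mindspore::kernel::JsonNodePair> &build_args) {
  mindspore::kernel::AkgKernelPool pool;
  if (pool.Init(build_args) != 0) {        // attach shared memory, register kernel ids
    return false;
  }
  std::set<size_t> claimed;
  if (pool.FetchKernels(&claimed) != 0) {  // move our pending ids from todo to doing
    return false;
  }
  // ... compile the kernels whose hash ids are in `claimed` ...
  return pool.UpdateAndWait(claimed) == 0; // mark them done, wait for the rest
}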

View File

@ -44,8 +44,10 @@ KernelPackPtr AkgAscendKernelBuilder::AkgInsertCache(const std::string &kernel_n
void AkgAscendKernelBuilder::AkgSetKernelMod(const KernelPackPtr &kernel_pack,
const AkgKernelJsonGenerator &json_generator, const AnfNodePtr &anf_node) {
auto kernel_mod_ptr = std::make_shared<AkgKernelMod>(kernel_pack);
auto kernel_json_info = kernel_pack->kernel_json_info();
kernel_mod_ptr->SetInputSizeList(json_generator.input_size_list());
kernel_mod_ptr->SetOutputSizeList(json_generator.output_size_list());
kernel_mod_ptr->SetWorkspaceSizeList(kernel_json_info.workspaces);
AnfAlgo::SetKernelMod(kernel_mod_ptr, anf_node.get());
}

View File

@ -49,7 +49,7 @@ const std::vector<size_t> &AkgKernelMod::GetOutputSizeList() const { return outp
const std::vector<size_t> &AkgKernelMod::GetWorkspaceSizeList() const { return workspace_size_list_; }
bool AkgKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
bool AkgKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs, void *stream_ptr) {
if (stream_ptr == nullptr) {
MS_LOG(ERROR) << "stream_ptr should not be nullptr.";
@ -74,6 +74,10 @@ bool AkgKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::vect
[](const AddressPtr &input) -> void * { return input->addr; });
(void)std::transform(std::begin(outputs), std::end(outputs), std::back_inserter(runtime_args),
[](const AddressPtr &output) -> void * { return output->addr; });
if (!workspace.empty()) {
(void)std::transform(std::begin(workspace), std::end(workspace), std::back_inserter(runtime_args),
[](const AddressPtr &addr) -> void * { return addr->addr; });
}
rtL2Ctrl_t *l2ctrl = nullptr;
auto stream = static_cast<rtStream_t *>(stream_ptr);
@ -86,7 +90,8 @@ bool AkgKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::vect
return true;
}
std::vector<TaskInfoPtr> AkgKernelMod::GenTask(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
std::vector<TaskInfoPtr> AkgKernelMod::GenTask(const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs, uint32_t stream_id) {
if (kernel_pack_ == nullptr) {
MS_LOG(EXCEPTION) << "kernel pack should not be nullptr.";
@ -107,6 +112,10 @@ std::vector<TaskInfoPtr> AkgKernelMod::GenTask(const std::vector<AddressPtr> &in
[](const AddressPtr &input) -> void * { return input->addr; });
(void)std::transform(std::begin(outputs), std::end(outputs), std::back_inserter(output_data_addrs),
[](const AddressPtr &output) -> void * { return output->addr; });
if (!workspace.empty()) {
(void)std::transform(std::begin(workspace), std::end(workspace), std::back_inserter(workspace_addrs),
[](const AddressPtr &workspace) -> void * { return workspace->addr; });
}
uint32_t block_dim = DEFAULT_BLOCK_DIM;  // default block dim equals 1.
auto func_stub = KernelManager::GenFuncStub(*kernel_pack_, false, &block_dim);

View File

@ -39,8 +39,10 @@ KernelPackPtr AkgGpuKernelBuilder::AkgInsertCache(const std::string &kernel_name
void AkgGpuKernelBuilder::AkgSetKernelMod(const KernelPackPtr &kernel_pack,
const AkgKernelJsonGenerator &json_generator, const AnfNodePtr &anf_node) {
auto kernel_mod_ptr = std::make_shared<GpuKernelMod>(kernel_pack);
auto kernel_json_info = kernel_pack->kernel_json_info();
kernel_mod_ptr->SetInputSizeList(json_generator.input_size_list());
kernel_mod_ptr->SetOutputSizeList(json_generator.output_size_list());
kernel_mod_ptr->SetWorkspaceSizeList(kernel_json_info.workspaces);
AnfAlgo::SetKernelMod(kernel_mod_ptr, anf_node.get());
}

View File

@ -92,13 +92,15 @@ void GpuKernelMod::SetInputSizeList(const std::vector<size_t> &size_list) { inpu
void GpuKernelMod::SetOutputSizeList(const std::vector<size_t> &size_list) { output_size_list_ = size_list; }
void GpuKernelMod::SetWorkspaceSizeList(const std::vector<size_t> &size_list) { workspace_size_list_ = size_list; }
const std::vector<size_t> &GpuKernelMod::GetInputSizeList() const { return input_size_list_; }
const std::vector<size_t> &GpuKernelMod::GetOutputSizeList() const { return output_size_list_; }
const std::vector<size_t> &GpuKernelMod::GetWorkspaceSizeList() const { return workspace_size_list_; }
bool GpuKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
bool GpuKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs, void *stream_ptr) {
if (stream_ptr == nullptr) {
MS_LOG(ERROR) << "stream_ptr should not be nullptr.";
@ -122,6 +124,10 @@ bool GpuKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::vect
[](const AddressPtr &input) -> void * { return reinterpret_cast<void *>(&(input->addr)); });
(void)std::transform(std::begin(outputs), std::end(outputs), std::back_inserter(runtimeargs),
[](const AddressPtr &output) -> void * { return reinterpret_cast<void *>(&(output->addr)); });
if (!workspace.empty()) {
(void)std::transform(std::begin(workspace), std::end(workspace), std::back_inserter(runtimeargs),
[](const AddressPtr &addr) -> void * { return addr->addr; });
}
result = cuLaunchKernel(kernel_addr, thread_info[0], thread_info[1], thread_info[2], thread_info[3], thread_info[4],
thread_info[5], 0, reinterpret_cast<CUstream>(stream_ptr),
reinterpret_cast<void **>(&runtimeargs[0]), 0);

View File

@ -60,6 +60,7 @@ class GpuKernelMod : public KernelMod {
void SetInputSizeList(const std::vector<size_t> &size_list);
void SetOutputSizeList(const std::vector<size_t> &size_list);
void SetWorkspaceSizeList(const std::vector<size_t> &size_list);
const std::vector<size_t> &GetInputSizeList() const override;
const std::vector<size_t> &GetOutputSizeList() const override;
const std::vector<size_t> &GetWorkspaceSizeList() const override;

View File

@ -141,14 +141,8 @@ FusionType GetFusionTypeByName(const std::string &name) {
return iter->first;
}
void KernelMeta::Initialize(int pid) {
if (pid == -1) {
kernel_meta_path_ = std::string(kGpuKernelMeta) + "_" + std::to_string(getpid()) + "/";
} else {
kernel_meta_path_ = std::string(kGpuKernelMeta) + "_" + std::to_string(pid) + "/";
}
// remove old kernel cache
RemoveKernelCache();
void KernelMeta::Initialize() {
kernel_meta_path_ = std::string(kGpuKernelMeta) + "/";
#if defined(_WIN32) || defined(_WIN64)
auto ret = mkdir(kernel_meta_path_.c_str());
@ -161,21 +155,6 @@ void KernelMeta::Initialize(int pid) {
initialized_ = true;
}
void KernelMeta::RemoveKernelCache() {
DIR *dir = opendir(kernel_meta_path_.c_str());
if (dir == nullptr) {
return;
}
struct dirent *entry;
while ((entry = readdir(dir)) != nullptr) {
std::string kernel_file = entry->d_name;
std::string kernel_file_realpath = kernel_meta_path_ + kernel_file;
(void)remove(kernel_file_realpath.c_str());
}
(void)closedir(dir);
(void)rmdir(kernel_meta_path_.c_str());
}
std::string KernelMeta::Search(const std::string &kernel_name) const {
if (!initialized_) {
return "";
@ -227,7 +206,7 @@ KernelPackPtr SearchCache(const std::string &kernel_name, const std::string &pro
KernelPackPtr kernel_pack = std::make_shared<KernelPack>();
// Just a temporary solution.
if (!kernel_pack->ReadFromJsonFile(kernel_json, processor)) {
MS_LOG(DEBUG) << "Read cache json and bin file failed[" << kernel_json << "].";
MS_LOG(ERROR) << "Read cache json and bin file failed[" << kernel_json << "].";
return nullptr;
} else {
return kernel_pack;
@ -250,7 +229,7 @@ KernelPackPtr InsertCache(const std::string &kernel_name, const std::string &pro
(void)kernel_json.append(kernel_name).append(kJsonSuffix);
KernelPackPtr kernel_pack = std::make_shared<KernelPack>();
if (!kernel_pack->ReadFromJsonFile(kernel_json, processor)) {
MS_LOG(DEBUG) << "Read json and bin file failed[" << kernel_json << "].";
MS_LOG(ERROR) << "Read json and bin file failed[" << kernel_json << "].";
return nullptr;
}
@ -714,6 +693,9 @@ void GetFuncGraphOutputNodes(const FuncGraphPtr &func_graph, std::vector<AnfNode
for (size_t input_idx = 1; input_idx < cnode->inputs().size(); ++input_idx) {
auto input_node = cnode->input(input_idx);
MS_EXCEPTION_IF_NULL(input_node);
if (input_node->isa<CNode>() && AnfAlgo::GetInputTensorNum(input_node) == 0) {
continue;
}
output_list->push_back(AnfAlgo::VisitKernel(input_node, 0).first);
}
} else {

View File

@ -55,8 +55,7 @@ using KernelMetaPtr = std::shared_ptr<KernelMetaInfo>;
class KernelMeta {
public:
KernelMeta() = default;
void Initialize(int pid);
void RemoveKernelCache();
void Initialize();
std::string Search(const std::string &kernel_name) const;
bool Insert(const std::string &kernel_name, const std::string &kernel_json);
std::string kernel_meta_path() const { return kernel_meta_path_; }

View File

@ -26,46 +26,26 @@ namespace mindspore {
namespace kernel {
constexpr size_t kSizeFloat16 = sizeof(float16);
constexpr size_t kSizeFloat32 = sizeof(float);
constexpr size_t kScalarIndex = 0;
constexpr size_t kAdamWeightDecayInputSize = 9;
constexpr size_t kAdamWeightDecayOutputSize = 3;
void AdamWeightDecayCPUKernel::ParallelForAdam(const CTask &task, size_t count) {
auto max_thread_num = common::ThreadPool::GetInstance().GetSyncRunThreadNum();
const float block_size = 128.0;
const float align_size = 16.0;
size_t thread_num = count < block_size * max_thread_num ? std::ceil(count / block_size) : max_thread_num;
std::vector<common::Task> tasks;
size_t start = 0;
size_t once_compute_size = align_size * std::ceil(count / (align_size * thread_num));
while (start < count) {
size_t end = (start + once_compute_size) > count ? count : (start + once_compute_size);
auto block = [&, start, end]() {
task(start, end);
return common::SUCCESS;
};
tasks.emplace_back(block);
start += once_compute_size;
}
common::ThreadPool::GetInstance().SyncRun(tasks);
}
template <typename T, typename S>
void AdamWeightDecayCPUKernel::LaunchFusedAdam(const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> &outputs) {
auto var = reinterpret_cast<T *>(inputs[0]->addr);
auto m = reinterpret_cast<T *>(inputs[1]->addr);
auto v = reinterpret_cast<T *>(inputs[2]->addr);
auto lr = reinterpret_cast<T *>(inputs[3]->addr)[0];
auto beta1 = reinterpret_cast<T *>(inputs[4]->addr)[0];
auto beta2 = reinterpret_cast<T *>(inputs[5]->addr)[0];
auto epsilon = reinterpret_cast<T *>(inputs[6]->addr)[0];
auto decay = reinterpret_cast<T *>(inputs[7]->addr)[0];
auto gradient16 = reinterpret_cast<S *>(inputs[8]->addr);
void AdamWeightDecayCPUKernel::LaunchFusedAdam(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &) {
auto var = reinterpret_cast<T *>(inputs[VAR]->addr);
auto m = reinterpret_cast<T *>(inputs[M]->addr);
auto v = reinterpret_cast<T *>(inputs[V]->addr);
auto lr = reinterpret_cast<T *>(inputs[LR]->addr)[kScalarIndex];
auto beta1 = reinterpret_cast<T *>(inputs[BETA1]->addr)[kScalarIndex];
auto beta2 = reinterpret_cast<T *>(inputs[BETA2]->addr)[kScalarIndex];
auto epsilon = reinterpret_cast<T *>(inputs[EPSILON]->addr)[kScalarIndex];
auto decay = reinterpret_cast<T *>(inputs[DECAY]->addr)[kScalarIndex];
auto gradient16 = reinterpret_cast<S *>(inputs[GRAD]->addr);
const auto beta1_minus = 1 - beta1;
const auto beta2_minus = 1 - beta2;
// multithreading
size_t lens = inputs[0]->size > 0 ? static_cast<size_t>(inputs[0]->size / sizeof(float)) : 1;
size_t lens = inputs[VAR]->size > 0 ? static_cast<size_t>(inputs[VAR]->size / sizeof(float)) : 1;
std::function<void(size_t, size_t)> task;
task = [&](size_t start, size_t end) {
@ -81,28 +61,27 @@ void AdamWeightDecayCPUKernel::LaunchFusedAdam(const std::vector<AddressPtr> &in
var[i] -= lr * update;
}
};
ParallelForAdam(task, lens);
CPUKernelUtils::ParallelFor(task, lens);
}
template <typename T>
void AdamWeightDecayCPUKernel::LaunchAdamWeightDecay(const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> &outputs) {
auto var = reinterpret_cast<T *>(inputs[0]->addr);
auto m = reinterpret_cast<T *>(inputs[1]->addr);
auto v = reinterpret_cast<T *>(inputs[2]->addr);
auto lr = reinterpret_cast<T *>(inputs[3]->addr)[0];
auto beta1 = reinterpret_cast<T *>(inputs[4]->addr)[0];
auto beta2 = reinterpret_cast<T *>(inputs[5]->addr)[0];
auto epsilon = reinterpret_cast<T *>(inputs[6]->addr)[0];
auto decay = reinterpret_cast<T *>(inputs[7]->addr)[0];
auto gradient = reinterpret_cast<T *>(inputs[8]->addr);
const std::vector<AddressPtr> &) {
auto var = reinterpret_cast<T *>(inputs[VAR]->addr);
auto m = reinterpret_cast<T *>(inputs[M]->addr);
auto v = reinterpret_cast<T *>(inputs[V]->addr);
auto lr = reinterpret_cast<T *>(inputs[LR]->addr)[kScalarIndex];
auto beta1 = reinterpret_cast<T *>(inputs[BETA1]->addr)[kScalarIndex];
auto beta2 = reinterpret_cast<T *>(inputs[BETA2]->addr)[kScalarIndex];
auto epsilon = reinterpret_cast<T *>(inputs[EPSILON]->addr)[kScalarIndex];
auto decay = reinterpret_cast<T *>(inputs[DECAY]->addr)[kScalarIndex];
auto gradient = reinterpret_cast<T *>(inputs[GRAD]->addr);
const auto beta1_minus = 1 - beta1;
const auto beta2_minus = 1 - beta2;
// multithreading
size_t lens = inputs[0]->size > 0 ? static_cast<size_t>(inputs[0]->size / sizeof(float)) : 1;
size_t lens = inputs[VAR]->size > 0 ? static_cast<size_t>(inputs[VAR]->size / sizeof(float)) : 1;
std::function<void(size_t, size_t)> task;
task = [&](size_t start, size_t end) {
size_t i = AdamWeightDecayFp32(var, m, v, lr, beta1, beta2, epsilon, decay, gradient, start, end);
// remaining
@ -114,14 +93,14 @@ void AdamWeightDecayCPUKernel::LaunchAdamWeightDecay(const std::vector<AddressPt
var[i] -= lr * update;
}
};
ParallelForAdam(task, lens);
CPUKernelUtils::ParallelFor(task, lens);
}
void AdamWeightDecayCPUKernel::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
std::vector<size_t> var_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
gradient_dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 8);
std::vector<size_t> var_shape = AnfAlgo::GetInputDeviceShape(kernel_node, VAR);
dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, VAR);
gradient_dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, GRAD);
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != kAdamWeightDecayInputSize) {
MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but AdamWeightDecay needs 9 inputs.";
@ -155,12 +134,12 @@ void AdamWeightDecayCPUKernel::CheckParam(const std::vector<kernel::AddressPtr>
}
size_t elem1_size = elem_num_ * kSizeFloat32;
size_t elem2_size = gradient_dtype_ == kNumberTypeFloat16 ? elem_num_ * kSizeFloat16 : elem1_size;
if (inputs[0]->size != elem1_size || inputs[1]->size != elem1_size || inputs[2]->size != elem1_size ||
inputs[8]->size != elem2_size) {
if (inputs[VAR]->size != elem1_size || inputs[M]->size != elem1_size || inputs[V]->size != elem1_size ||
inputs[GRAD]->size != elem2_size) {
MS_LOG(EXCEPTION) << "Input data size is incorrect!";
}
if (inputs[3]->size != kSizeFloat32 || inputs[4]->size != kSizeFloat32 || inputs[5]->size != kSizeFloat32 ||
inputs[6]->size != kSizeFloat32 || inputs[7]->size != kSizeFloat32) {
if (inputs[LR]->size != kSizeFloat32 || inputs[BETA1]->size != kSizeFloat32 || inputs[BETA2]->size != kSizeFloat32 ||
inputs[EPSILON]->size != kSizeFloat32 || inputs[DECAY]->size != kSizeFloat32) {
MS_LOG(EXCEPTION) << "The attributes beta, lr, epsilon and weight decay must be float!";
}
}
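
For reference, the scalars validated above feed the AdamW update that both launch paths apply per element (visible in the task lambdas ending with var[i] -= lr * update); no bias-correction terms appear in this kernel:

$m \leftarrow \beta_1 m + (1 - \beta_1)\,g$
$v \leftarrow \beta_2 v + (1 - \beta_2)\,g^2$
$\mathrm{var} \leftarrow \mathrm{var} - lr\bigl(m/(\sqrt{v} + \epsilon) + \mathrm{decay}\cdot\mathrm{var}\bigr)$

where $g$ is the gradient (float16 in the fused path, float otherwise).
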

View File

@ -32,7 +32,6 @@ class AdamWeightDecayCPUKernel : public CPUKernel {
const std::vector<AddressPtr> &outputs) override;
private:
void ParallelForAdam(const CTask &task, size_t count);
void CheckParam(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
template <typename T, typename S>
void LaunchFusedAdam(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
@ -41,6 +40,7 @@ class AdamWeightDecayCPUKernel : public CPUKernel {
size_t elem_num_{0};
TypeId dtype_{kTypeUnknown};
TypeId gradient_dtype_{kTypeUnknown};
enum input_list_ { VAR, M, V, LR, BETA1, BETA2, EPSILON, DECAY, GRAD };
};
MS_REG_CPU_KERNEL(AdamWeightDecay,
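
The input_list_ enum above replaces the magic indices (inputs[3], inputs[8]) used previously; a minimal sketch of the pattern, with hypothetical names:

#include <vector>

// Named positions for a fixed parameter list, mirroring input_list_ above.
enum InputIdx { VAR, M, V, LR, BETA1, BETA2, EPSILON, DECAY, GRAD };

float LearningRate(const std::vector<const float *> &inputs) {
  return inputs[LR][0];  // self-documenting, unlike inputs[3][0]
}

int main() {
  const float lr = 0.001f;
  std::vector<const float *> inputs(9, nullptr);
  inputs[LR] = &lr;
  return LearningRate(inputs) == 0.001f ? 0 : 1;
}
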

View File

@ -76,27 +76,10 @@ void ApplyAdagradCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs,
// multithreading
size_t length = inputs[0]->size / sizeof(T);
size_t max_thread_num = std::thread::hardware_concurrency();
size_t use_thread_num = length < 128 * max_thread_num ? std::ceil(length / 128.0) : max_thread_num;
std::vector<std::thread> threads;
threads.reserve(use_thread_num);
size_t start = 0;
const size_t batch_size = (length + use_thread_num - 1) / use_thread_num;
if (batch_size == 0) {
MS_LOG(EXCEPTION) << "Error occurred while launching the kernel";
return;
}
while (start < length) {
size_t end = (start + batch_size) > length ? length : (start + batch_size);
threads.emplace_back(
std::thread(&ApplyAdagradCPUKernel::LaunchApplyAdagrad<T *>, this, var, accum, lr, gradient, start, end));
start += batch_size;
}
for (auto &it : threads) {
it.join();
}
auto task = [this, &var, &accum, lr, gradient](size_t start, size_t end) {
LaunchApplyAdagrad(var, accum, lr, gradient, start, end);
};
CPUKernelUtils::ParallelForAutoSearch(task, length, &parallel_search_info_);
// Copy result to output tensor
auto output_var = reinterpret_cast<T *>(outputs[0]->addr);
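
The refactor above hands the chunking policy to a shared helper instead of spawning raw threads per launch; a minimal, self-contained sketch of what such a parallel-for does (the real ParallelForAutoSearch additionally tunes the chunk size across calls):

#include <algorithm>
#include <functional>
#include <thread>
#include <vector>

// Split [0, total) into contiguous chunks and run `task` on each in parallel.
void ParallelFor(const std::function<void(size_t, size_t)> &task, size_t total,
                 size_t num_threads) {
  const size_t chunk = (total + num_threads - 1) / num_threads;
  std::vector<std::thread> workers;
  for (size_t start = 0; start < total; start += chunk) {
    workers.emplace_back(task, start, std::min(total, start + chunk));
  }
  for (auto &w : workers) w.join();
}

int main() {
  std::vector<float> data(1000, 1.0f);
  ParallelFor([&](size_t s, size_t e) {
    for (size_t i = s; i < e; ++i) data[i] *= 2.0f;
  }, data.size(), 4);
  return 0;
}
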

View File

@ -13,10 +13,12 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/cpu/arithmetic_logic_cpu_kernel.h"
#include <cmath>
#include <string>
#include <map>
#include "backend/kernel_compiler/cpu/arithmetic_logic_cpu_kernel.h"
#include <functional>
#include "runtime/device/cpu/cpu_device_address.h"
namespace mindspore {
@ -29,7 +31,9 @@ void ArithmeticLogicCPUKernel<T>::Less(const T *input1, const T *input2, bool *o
auto iter = base_iter;
iter.SetPos(start);
for (size_t i = start; i < end; i++) {
out[i] = input1[iter.GetInputPosA()] < input2[iter.GetInputPosB()];
auto x = input1[iter.GetInputPosA()];
auto y = input2[iter.GetInputPosB()];
out[i] = std::less<T>()(x, y);
iter.GenNextPos();
}
};
@ -37,7 +41,9 @@ void ArithmeticLogicCPUKernel<T>::Less(const T *input1, const T *input2, bool *o
} else {
base_iter.SetPos(0);
for (size_t i = 0; i < output_size_; i++) {
out[i] = input1[base_iter.GetInputPosA()] < input2[base_iter.GetInputPosB()];
auto x = input1[base_iter.GetInputPosA()];
auto y = input2[base_iter.GetInputPosB()];
out[i] = std::less<T>()(x, y);
base_iter.GenNextPos();
}
}
@ -50,7 +56,9 @@ void ArithmeticLogicCPUKernel<T>::Equal(const T *input1, const T *input2, bool *
auto iter = base_iter;
iter.SetPos(start);
for (size_t i = start; i < end; i++) {
out[i] = input1[iter.GetInputPosA()] == input2[iter.GetInputPosB()];
auto x = input1[iter.GetInputPosA()];
auto y = input2[iter.GetInputPosB()];
out[i] = std::equal_to<T>()(x, y);
iter.GenNextPos();
}
};
@ -64,7 +72,9 @@ void ArithmeticLogicCPUKernel<T>::NotEqual(const T *input1, const T *input2, boo
auto iter = base_iter;
iter.SetPos(start);
for (size_t i = start; i < end; i++) {
out[i] = input1[iter.GetInputPosA()] != input2[iter.GetInputPosB()];
auto x = input1[iter.GetInputPosA()];
auto y = input2[iter.GetInputPosB()];
out[i] = std::not_equal_to<T>()(x, y);
iter.GenNextPos();
}
};
@ -106,7 +116,9 @@ void ArithmeticLogicCPUKernel<T>::Greater(const T *input1, const T *input2, bool
auto iter = base_iter;
iter.SetPos(start);
for (size_t i = start; i < end; i++) {
out[i] = input1[iter.GetInputPosA()] > input2[iter.GetInputPosB()];
auto x = input1[iter.GetInputPosA()];
auto y = input2[iter.GetInputPosB()];
out[i] = std::greater<T>()(x, y);
iter.GenNextPos();
}
};
@ -120,7 +132,9 @@ void ArithmeticLogicCPUKernel<T>::GreaterEqual(const T *input1, const T *input2,
auto iter = base_iter;
iter.SetPos(start);
for (size_t i = start; i < end; i++) {
out[i] = input1[iter.GetInputPosA()] >= input2[iter.GetInputPosB()];
auto x = input1[iter.GetInputPosA()];
auto y = input2[iter.GetInputPosB()];
out[i] = std::greater_equal<T>()(x, y);
iter.GenNextPos();
}
};
@ -134,7 +148,9 @@ void ArithmeticLogicCPUKernel<T>::LessEqual(const T *input1, const T *input2, bo
auto iter = base_iter;
iter.SetPos(start);
for (size_t i = start; i < end; i++) {
out[i] = input1[iter.GetInputPosA()] <= input2[iter.GetInputPosB()];
auto x = input1[iter.GetInputPosA()];
auto y = input2[iter.GetInputPosB()];
out[i] = std::less_equal<T>()(x, y);
iter.GenNextPos();
}
};
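
Each comparison now goes through the <functional> function objects, which keeps the six broadcast loops textually identical; one common motivation is collecting the direct floating-point == comparisons behind a single named object. The function objects are exact equivalents of the operators:

#include <functional>
#include <iostream>

int main() {
  float a = 1.0f, b = 2.0f;
  std::cout << std::boolalpha
            << std::less<float>()(a, b) << ' '             // same result as (a < b)
            << std::equal_to<float>()(a, b) << ' '         // same result as (a == b)
            << std::greater_equal<float>()(a, b) << '\n';  // same result as (a >= b)
  return 0;
}
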

View File

@ -1,5 +1,5 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
* Copyright 2019-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.

View File

@ -21,6 +21,7 @@
#include <string>
#include "runtime/device/kernel_info.h"
#include "runtime/device/cpu/kernel_select_cpu.h"
namespace mindspore {
namespace kernel {
@ -111,6 +112,11 @@ std::pair<bool, size_t> CPUKernelFactory::CPUKernelAttrCheck(const std::string &
MS_LOG(INFO) << "CPU kernel op[" << kernel_name << "] is not registered!";
return std::make_pair(false, 0);
}
if (device::cpu::IsDynamicParamKernel(kernel_name)) {
return std::make_pair(true, 0);
}
auto kernel_attrs = GetSupportedKernelAttrList(kernel_name);
if (kernel_attrs[0].GetInputSize() == 0 && kernel_attrs[0].GetOutputSize() == 0) {
auto op_info_ptr = mindspore::kernel::OpLib::FindOp(kernel_name, kernel::OpImplyType::kCPU);

View File

@ -43,9 +43,9 @@ void DropoutGradCpuBwdKernel::InitKernel(const CNodePtr &kernel_node) {
bool DropoutGradCpuBwdKernel::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
const std::vector<AddressPtr> &outputs) {
if (dtype_ == kNumberTypeFloat16) {
DropoutBackwardKernel<float16>(inputs, outputs, num_count_, keep_prob_);
DropoutBackwardKernel<float16>(inputs, outputs, keep_prob_);
} else if (dtype_ == kNumberTypeFloat32) {
DropoutBackwardKernel<float>(inputs, outputs, num_count_, keep_prob_);
DropoutBackwardKernel<float>(inputs, outputs, keep_prob_);
} else {
MS_LOG(ERROR) << "Input data type: " << dtype_ << " is not supported by the DropoutGrad CPU kernel.";
}
@ -55,8 +55,7 @@ bool DropoutGradCpuBwdKernel::Launch(const std::vector<AddressPtr> &inputs, cons
template <typename T>
void DropoutGradCpuBwdKernel::DropoutBackwardKernel(const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> &outputs, size_t num_count,
float keep_prob) {
const std::vector<AddressPtr> &outputs, float keep_prob) {
auto *output = reinterpret_cast<T *>(outputs[0]->addr);
const auto *input = reinterpret_cast<T *>(inputs[0]->addr);
const auto *mask = reinterpret_cast<T *>(inputs[1]->addr);
@ -70,7 +69,7 @@ void DropoutGradCpuBwdKernel::DropoutBackwardKernel(const std::vector<AddressPtr
input_tmp[i] = static_cast<float>(input[i]);
mask_tmp[i] = static_cast<float>(mask[i]);
}
DropoutGrad(input_tmp, mask_tmp, output_tmp, num_count_, scale);
DropoutGrad(input_tmp, mask_tmp, output_tmp, SizeToInt(num_count_), scale);
for (size_t i = 0; i < num_count_; ++i) {
output[i] = static_cast<float16>(output_tmp[i]);
}
@ -78,7 +77,7 @@ void DropoutGradCpuBwdKernel::DropoutBackwardKernel(const std::vector<AddressPtr
delete[] output_tmp;
delete[] mask_tmp;
} else if constexpr (std::is_same_v<T, float>) {
DropoutGrad(input, mask, output, num_count_, scale);
DropoutGrad(input, mask, output, SizeToInt(num_count_), scale);
}
}
} // namespace kernel

View File

@ -40,7 +40,7 @@ class DropoutGradCpuBwdKernel : public CPUKernel {
TypeId dtype_{kTypeUnknown};
template <typename T>
void DropoutBackwardKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs,
size_t num_count, float keep_prob);
float keep_prob);
};
MS_REG_CPU_KERNEL(DropoutGrad, KernelAttr(), DropoutGradCpuBwdKernel);

View File

@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -13,8 +13,10 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <map>
#include "backend/kernel_compiler/cpu/eltwise_grad_cpu_kernel.h"
#include <string>
#include <map>
#include "common/thread_pool.h"
#include "runtime/device/cpu/cpu_device_address.h"
#include "nnacl/fp32_grad/activation_grad.h"
@ -25,50 +27,50 @@ namespace mindspore {
namespace kernel {
template <typename T>
void EltWiseGradCPUKernel<T>::ReluGrad(const T *input1, const T *input2, T *out, size_t start, size_t end) const {
if constexpr (std::is_same_v<T, float>) {
int ret = ::ReluGrad(input1 + start, input2 + start, end - start, out + start);
if (ret == NNACL_ERR) {
MS_LOG(EXCEPTION) << "ReLUGrad failed.";
}
} else {
if constexpr (!std::is_same<T, float>::value) {
MS_LOG(EXCEPTION) << "ReLUGrad only supports float";
}
int ret = ::ReluGrad(input1 + start, input2 + start, end - start, out + start);
if (ret == NNACL_ERR) {
MS_LOG(EXCEPTION) << "ReLUGrad execution failed.";
}
}
template <typename T>
void EltWiseGradCPUKernel<T>::ReLU6Grad(const T *input1, const T *input2, T *out, size_t start, size_t end) const {
if constexpr (std::is_same_v<T, float>) {
int ret = ::Relu6Grad(input1 + start, input2 + start, end - start, out + start);
if (ret == NNACL_ERR) {
MS_LOG(EXCEPTION) << "ReLU6Grad failed.";
}
} else {
if constexpr (!std::is_same<T, float>::value) {
MS_LOG(EXCEPTION) << "ReLU6Grad only supports float";
}
int ret = ::Relu6Grad(input1 + start, input2 + start, end - start, out + start);
if (ret == NNACL_ERR) {
MS_LOG(EXCEPTION) << "ReLU6Grad execution failed.";
}
}
template <typename T>
void EltWiseGradCPUKernel<T>::AbsGrad(const T *input1, const T *input2, T *out, size_t start, size_t end) const {
if constexpr (std::is_same_v<T, float>) {
int ret = ::ElementAbsGrad(input1 + start, input2 + start, out + start, end - start);
if (ret == NNACL_ERR) {
MS_LOG(EXCEPTION) << "AbsGrad failed.";
}
} else {
if constexpr (!std::is_same<T, float>::value) {
MS_LOG(EXCEPTION) << "AbsGrad only supports float";
}
int ret = ::ElementAbsGrad(input1 + start, input2 + start, out + start, end - start);
if (ret == NNACL_ERR) {
MS_LOG(EXCEPTION) << "AbsGrad execution failed.";
}
}
template <typename T>
void EltWiseGradCPUKernel<T>::SigmoidGrad(const T *input1, const T *input2, T *out, size_t start, size_t end) const {
if constexpr (std::is_same_v<T, float>) {
int ret = ::SigmoidGrad(input2 + start, input1 + start, end - start, out + start);
if (ret == NNACL_ERR) {
MS_LOG(EXCEPTION) << "SigmoidGrad failed.";
}
} else {
if constexpr (!std::is_same<T, float>::value) {
MS_LOG(EXCEPTION) << "SigmoidGrad only supports float";
}
int ret = ::SigmoidGrad(input2 + start, input1 + start, end - start, out + start);
if (ret == NNACL_ERR) {
MS_LOG(EXCEPTION) << "SigmoidGrad execution failed.";
}
}
template <typename T>
@ -80,14 +82,14 @@ void EltWiseGradCPUKernel<T>::SqrtGrad(const T *input1, const T *input2, T *out,
template <typename T>
void EltWiseGradCPUKernel<T>::TanhGrad(const T *input1, const T *input2, T *out, size_t start, size_t end) const {
if constexpr (std::is_same_v<T, float>) {
int ret = ::TanhGrad(input2 + start, input1 + start, end - start, out + start);
if (ret == NNACL_ERR) {
MS_LOG(EXCEPTION) << "TanhGrad failed.";
}
} else {
if constexpr (!std::is_same<T, float>::value) {
MS_LOG(EXCEPTION) << "TanhGrad only supports float";
}
int ret = ::TanhGrad(input2 + start, input1 + start, end - start, out + start);
if (ret == NNACL_ERR) {
MS_LOG(EXCEPTION) << "TanhGrad execution failed.";
}
}
template <typename T>
@ -207,6 +209,18 @@ void EltWiseGradCPUKernel<T>::AcoshGrad(const T *input1, const T *input2, T *out
}
}
template <typename T>
void EltWiseGradCPUKernel<T>::SoftplusGrad(const T *input1, const T *input2, T *out, size_t start, size_t end) const {
if constexpr (!std::is_same<T, float>::value) {
MS_LOG(EXCEPTION) << "SoftplusGrad only supports float";
}
int ret = ::SoftplusGrad(input1 + start, input2 + start, end - start, out + start);
if (ret == NNACL_ERR) {
MS_LOG(EXCEPTION) << "SoftplusGrad execution failed.";
}
}
template <typename T>
void EltWiseGradCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
@ -219,12 +233,19 @@ bool EltWiseGradCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inpu
const std::vector<kernel::AddressPtr> &outputs) {
static const std::map<std::string,
std::function<void(EltWiseGradCPUKernel *, const T *, const T *, T *, size_t, size_t)>>
elt_map{{"ReluGrad", &EltWiseGradCPUKernel<T>::ReluGrad}, {"ReLU6Grad", &EltWiseGradCPUKernel<T>::ReLU6Grad},
{"SigmoidGrad", &EltWiseGradCPUKernel<T>::SigmoidGrad}, {"AbsGrad", &EltWiseGradCPUKernel<T>::AbsGrad},
{"TanhGrad", &EltWiseGradCPUKernel<T>::TanhGrad}, {"SqrtGrad", &EltWiseGradCPUKernel<T>::SqrtGrad},
{"GeLUGrad", &EltWiseGradCPUKernel<T>::GeluGrad}, {"AsinGrad", &EltWiseGradCPUKernel<T>::AsinGrad},
{"ACosGrad", &EltWiseGradCPUKernel<T>::ACosGrad}, {"AtanGrad", &EltWiseGradCPUKernel<T>::AtanGrad},
{"AsinhGrad", &EltWiseGradCPUKernel<T>::AsinhGrad}, {"AcoshGrad", &EltWiseGradCPUKernel<T>::AcoshGrad}};
elt_map{{prim::kPrimReluGrad->name(), &EltWiseGradCPUKernel<T>::ReluGrad},
{prim::kPrimRelu6Grad->name(), &EltWiseGradCPUKernel<T>::ReLU6Grad},
{prim::kPrimSigmoidGrad->name(), &EltWiseGradCPUKernel<T>::SigmoidGrad},
{prim::kPrimAbsGrad->name(), &EltWiseGradCPUKernel<T>::AbsGrad},
{prim::kPrimTanhGrad->name(), &EltWiseGradCPUKernel<T>::TanhGrad},
{prim::kPrimSqrtGrad->name(), &EltWiseGradCPUKernel<T>::SqrtGrad},
{prim::kPrimGeLUGrad->name(), &EltWiseGradCPUKernel<T>::GeluGrad},
{prim::kPrimAsinGrad->name(), &EltWiseGradCPUKernel<T>::AsinGrad},
{prim::kPrimACosGrad->name(), &EltWiseGradCPUKernel<T>::ACosGrad},
{prim::kPrimAtanGrad->name(), &EltWiseGradCPUKernel<T>::AtanGrad},
{prim::kPrimAsinhGrad->name(), &EltWiseGradCPUKernel<T>::AsinhGrad},
{prim::kPrimAcoshGrad->name(), &EltWiseGradCPUKernel<T>::AcoshGrad},
{prim::kPrimSoftplusGrad->name(), &EltWiseGradCPUKernel<T>::SoftplusGrad}};
if (inputs.size() < 2 || outputs.size() != 1) {
MS_LOG(ERROR) << kernel_name_ << " requires at least 2 inputs and 1 output, but got " << inputs.size()
<< " inputs and " << outputs.size() << " outputs.";
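
The elt_map above binds op names to member functions; an analogous, reduced sketch of that dispatch pattern (names hypothetical):

#include <functional>
#include <iostream>
#include <map>
#include <string>

struct GradKernels {
  void ReluGrad() { std::cout << "ReluGrad\n"; }
  void TanhGrad() { std::cout << "TanhGrad\n"; }
};

int main() {
  static const std::map<std::string, void (GradKernels::*)()> table{
      {"ReluGrad", &GradKernels::ReluGrad}, {"TanhGrad", &GradKernels::TanhGrad}};
  GradKernels kernels;
  std::invoke(table.at("TanhGrad"), kernels);  // prints "TanhGrad"
  return 0;
}
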

View File

@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -48,6 +48,7 @@ class EltWiseGradCPUKernel : public CPUKernel {
void AtanGrad(const T *input1, const T *input2, T *out, size_t start, size_t end) const;
void AsinhGrad(const T *input1, const T *input2, T *out, size_t start, size_t end) const;
void AcoshGrad(const T *input1, const T *input2, T *out, size_t start, size_t end) const;
void SoftplusGrad(const T *input1, const T *input2, T *out, size_t start, size_t end) const;
std::string kernel_name_ = "";
};
@ -103,6 +104,10 @@ MS_REG_CPU_KERNEL_T(
AcoshGrad,
KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
EltWiseGradCPUKernel, float);
MS_REG_CPU_KERNEL_T(
SoftplusGrad,
KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
EltWiseGradCPUKernel, float);
} // namespace kernel
} // namespace mindspore

View File

@ -13,39 +13,47 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/cpu/mkldnn/eltwise_cpu_kernel.h"
#include <string>
#include <unordered_map>
#include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h"
#include "runtime/device/cpu/cpu_device_address.h"
#include "utils/ms_utils.h"
namespace mindspore {
namespace kernel {
namespace {
struct DescParam {
dnnl::algorithm algorithm;
float alpha = 0.f;
float beta = 0.f;
};
} // namespace
dnnl::eltwise_forward::desc EltWiseCPUKernel::GetForwardEltwiseDesc(const CNodePtr &kernel_node,
const dnnl::memory::desc src_desc) {
static const std::unordered_map<std::string, DescParam> eltWiseOpDescMap{
{prim::kPrimRelu->name(), DescParam{dnnl::algorithm::eltwise_relu}},
{prim::kPrimRelu6->name(), DescParam{dnnl::algorithm::eltwise_clip, 0.f, 6.f}},
{prim::kPrimAbs->name(), DescParam{dnnl::algorithm::eltwise_abs}},
{prim::kPrimExp->name(), DescParam{dnnl::algorithm::eltwise_exp}},
{prim::kPrimLog->name(), DescParam{dnnl::algorithm::eltwise_log}},
{prim::kPrimSigmoid->name(), DescParam{dnnl::algorithm::eltwise_logistic}},
{prim::kPrimSqrt->name(), DescParam{dnnl::algorithm::eltwise_sqrt}},
{prim::kPrimSquare->name(), DescParam{dnnl::algorithm::eltwise_square}},
{prim::kPrimTanh->name(), DescParam{dnnl::algorithm::eltwise_tanh}},
{prim::kPrimElu->name(), DescParam{dnnl::algorithm::eltwise_elu, 1.f, 0.f}},
{prim::kPrimSoftplus->name(), DescParam{dnnl::algorithm::eltwise_soft_relu}},
};
std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
if (kernel_name == "ReLU") {
return dnnl::eltwise_forward::desc(DnnlForward, dnnl::algorithm::eltwise_relu, src_desc, 0.0);
} else if (kernel_name == "ReLU6") {
return dnnl::eltwise_forward::desc(DnnlForward, dnnl::algorithm::eltwise_clip, src_desc, 0.0, 6.0);
} else if (kernel_name == "Abs") {
return dnnl::eltwise_forward::desc(DnnlForward, dnnl::algorithm::eltwise_abs, src_desc);
} else if (kernel_name == "Exp") {
return dnnl::eltwise_forward::desc(DnnlForward, dnnl::algorithm::eltwise_exp, src_desc);
} else if (kernel_name == "Log") {
return dnnl::eltwise_forward::desc(DnnlForward, dnnl::algorithm::eltwise_log, src_desc);
} else if (kernel_name == "Sigmoid") {
return dnnl::eltwise_forward::desc(DnnlForward, dnnl::algorithm::eltwise_logistic, src_desc);
} else if (kernel_name == "Sqrt") {
return dnnl::eltwise_forward::desc(DnnlForward, dnnl::algorithm::eltwise_sqrt, src_desc);
} else if (kernel_name == "Square") {
return dnnl::eltwise_forward::desc(DnnlForward, dnnl::algorithm::eltwise_square, src_desc);
} else if (kernel_name == "Tanh") {
return dnnl::eltwise_forward::desc(DnnlForward, dnnl::algorithm::eltwise_tanh, src_desc);
} else if (kernel_name == "Elu") {
return dnnl::eltwise_forward::desc(DnnlForward, dnnl::algorithm::eltwise_elu, src_desc, 1.0);
} else {
MS_LOG(EXCEPTION) << "Eltwise operators don't support " << kernel_name;
const auto desc_pair = eltWiseOpDescMap.find(kernel_name);
if (desc_pair == eltWiseOpDescMap.end()) {
MS_LOG(EXCEPTION) << "EltWiseCPUKernel does not support " << kernel_name;
}
return dnnl::eltwise_forward::desc(DnnlForward, desc_pair->second.algorithm, src_desc, desc_pair->second.alpha,
desc_pair->second.beta);
}
void EltWiseCPUKernel::InitKernel(const CNodePtr &kernel_node) {
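
The map above turns a ten-branch if/else chain into one table lookup, with the per-op alpha/beta constants carried in DescParam; a reduced sketch of the same idea, with the dnnl types replaced by stand-ins:

#include <iostream>
#include <stdexcept>
#include <string>
#include <unordered_map>

// Stand-in for dnnl::algorithm plus the per-op constants.
struct DescParam {
  int algorithm;
  float alpha = 0.f;
  float beta = 0.f;
};

DescParam Lookup(const std::string &op) {
  static const std::unordered_map<std::string, DescParam> table{
      {"ReLU", {/*eltwise_relu*/ 0}},
      {"ReLU6", {/*eltwise_clip*/ 1, 0.f, 6.f}},
      {"Elu", {/*eltwise_elu*/ 2, 1.f, 0.f}}};
  const auto it = table.find(op);
  if (it == table.end()) throw std::runtime_error("unsupported op: " + op);
  return it->second;
}

int main() {
  const auto p = Lookup("ReLU6");
  std::cout << p.alpha << " " << p.beta << '\n';  // prints "0 6"
  return 0;
}
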

View File

@ -1,5 +1,5 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
* Copyright 2019-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -56,6 +56,8 @@ MS_REG_CPU_KERNEL(Square, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutpu
EltWiseCPUKernel);
MS_REG_CPU_KERNEL(Tanh, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
EltWiseCPUKernel);
MS_REG_CPU_KERNEL(Softplus, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
EltWiseCPUKernel);
} // namespace kernel
} // namespace mindspore

View File

@ -36,6 +36,24 @@ file(GLOB KERNEL_SRC
${NNACL_DIR}/fp32_grad/*.c
)
if(MSLITE_STRING_KERNEL)
file(GLOB KERNEL_SRC_INFER_STRING
${NNACL_DIR}/infer/string/*.c
)
set(KERNEL_SRC
${KERNEL_SRC}
${KERNEL_SRC_INFER_STRING}
)
endif()
if(MSLITE_CONTROL_TENSORLIST)
file(GLOB KERNEL_SRC_INFER_CONTROL_TENSORLIST
${NNACL_DIR}/infer/control/*.c
)
set(KERNEL_SRC
${KERNEL_SRC}
${KERNEL_SRC_INFER_CONTROL_TENSORLIST}
)
endif()
if(PLATFORM_ARM64)
file(GLOB ASSEMBLY_SRC ${NNACL_DIR}/assembly/arm64/*.S)
set_property(SOURCE ${ASSEMBLY_SRC} PROPERTY LANGUAGE C)

View File

@ -5,7 +5,8 @@
//void MatmulInt8DpNeon64(const int8_t *a, const int8_t *b, int8_t *dst, int row8, int col8, int deep4,
// const int *a_sums, const int *bias, int act_min, int act_max, int out_zp,
// int *multiplier, int *left_shift, int *right_shift, int row, int col, int stride, int peroc);
// const int *multiplier, const int *left_shift, const int *right_shift, int row,
// int col, int stride, int peroc);
// x0: a(left matrix ptr)
// x1: b(right matrix ptr)

View File

@ -4,8 +4,9 @@
.align 5
//void MatmulInt8DpOpt(const int8_t *a, const int8_t *b, int8_t *dst, int row, int col, int deep4, const int *a_sums,
// const int *bias, int act_min, int act_max, int out_zp, int32_t *multiplier, int32_t *left_shift,
// int32_t *right_shift, size_t stride, size_t filter_peroc, int32_t *filter_zp)
// const int *bias, int act_min, int act_max, int out_zp, const int32_t *multiplier,
// const int32_t *left_shift, const int32_t *right_shift, size_t stride, size_t filter_peroc,
// const int32_t *filter_zp)
// x0: a(left matrix ptr)
// x1: b(right matrix ptr)

View File

@ -23,19 +23,19 @@ void BatchToSpaceNoCropForNHWC(const void *input, void *output, const int *in_sh
int in_h = in_shape[1];
int in_w = in_shape[2];
int in_c = in_shape[3];
size_t stride_h = block_w * out_n;
size_t output_offset = 0;
size_t copy_size = in_c * data_size;
size_t in_stride_h = in_w * in_c;
size_t in_stride_n = in_stride_h * in_h;
int stride_h = block_w * out_n;
int output_offset = 0;
int copy_size = in_c * data_size;
int in_stride_h = in_w * in_c;
int in_stride_n = in_stride_h * in_h;
for (int n = 0; n < out_n; ++n) {
for (int h = 0; h < in_h; ++h) {
size_t h_offset = h * in_stride_h;
int h_offset = h * in_stride_h;
for (int bh = 0; bh < block_h; ++bh) {
for (int w = 0; w < in_w; ++w) {
size_t w_offset = w * in_c;
int w_offset = w * in_c;
for (int bw = 0; bw < block_w; ++bw) {
size_t in_offset = in_stride_n * (bh * stride_h + bw * out_n + n) + w_offset + h_offset;
int in_offset = in_stride_n * (bh * stride_h + bw * out_n + n) + w_offset + h_offset;
memcpy((int8_t *)output + output_offset, (int8_t *)input + in_offset * data_size, copy_size);
output_offset += copy_size;
}
@ -49,6 +49,9 @@ void BatchToSpaceForNHWC(const void *input, void *output, const int *in_shape, i
const int *crops, int data_size) {
int block_h = block[0];
int block_w = block[1];
if (block_h == 0 || block_w == 0) {
return;
}
int in_h = in_shape[1];
int in_w = in_shape[2];
int in_c = in_shape[3];
@ -61,27 +64,27 @@ void BatchToSpaceForNHWC(const void *input, void *output, const int *in_shape, i
int w_end = MSMIN((in_w * block_w - crops[3]) / block_w + 1, in_w);
int w_valid_end = in_w * block_w - crops[3] - 1;
size_t stride_h = block_w * out_n;
size_t output_offset = 0;
size_t copy_size = in_c * data_size;
size_t in_stride_h = in_w * in_c;
size_t in_stride_n = in_stride_h * in_h;
int stride_h = block_w * out_n;
int output_offset = 0;
int copy_size = in_c * data_size;
int in_stride_h = in_w * in_c;
int in_stride_n = in_stride_h * in_h;
for (int n = 0; n < out_n; ++n) {
for (int h = h_start; h < h_end; ++h) {
size_t h_offset = h * in_stride_h;
int h_offset = h * in_stride_h;
for (int bh = 0; bh < block_h; ++bh) {
size_t h_index = h * block_h + bh;
int h_index = h * block_h + bh;
if (h_index < h_valid_begin || h_index > h_valid_end) {
continue;
}
for (int w = w_start; w < w_end; ++w) {
size_t w_offset = w * in_c;
int w_offset = w * in_c;
for (int bw = 0; bw < block_w; ++bw) {
size_t w_index = w * block_w + bw;
int w_index = w * block_w + bw;
if (w_index < w_valid_begin || w_index > w_valid_end) {
continue;
}
size_t in_offset = in_stride_n * (bh * stride_h + bw * out_n + n) + w_offset + h_offset;
int in_offset = in_stride_n * (bh * stride_h + bw * out_n + n) + w_offset + h_offset;
memcpy((int8_t *)output + output_offset, (int8_t *)input + in_offset * data_size, copy_size);
output_offset += copy_size;
}
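
One likely motivation for the size_t-to-int switch above: the crop arithmetic in these kernels involves subtraction, and an unsigned intermediate that dips below zero silently wraps to a huge value instead of failing a bounds check. A two-line illustration:

#include <cstddef>
#include <iostream>

int main() {
  size_t u = 2;
  int s = 2;
  std::cout << (u - 3 < 100) << '\n';    // 0: (size_t)(2 - 3) wraps to a huge value
  std::cout << ((s - 3) < 100) << '\n';  // 1: -1 < 100, as intended
  return 0;
}
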

View File

@ -62,7 +62,7 @@ void pad_input_shape(int *input_shape, int input_shape_len, int output_shape_len
shape_info->input_shape_size_ = dim_max + 1; \
\
size_t before_dim_elements_num = accumulate(input_shape, 0, dim_max - 1); \
size_t after_dim_elements_num = input_shape[dim_max]; \
size_t after_dim_elements_num = (size_t)(input_shape[dim_max]); \
size_t dim_broadcast_rate = (size_t)(output_shape[dim_max] / input_shape[dim_max]); \
for (size_t i = 0; i < before_dim_elements_num; ++i) { \
const type *in_ptr = input + i * after_dim_elements_num; \

View File

@ -24,15 +24,18 @@ void Concat(void **input, int input_num, int axis, int **inputs_output_shape, si
}
int after_axis_size = data_size;
for (size_t i = axis + 1; i < shape_size; ++i) {
for (size_t i = (size_t)(axis) + 1; i < shape_size; ++i) {
after_axis_size *= inputs_output_shape[0][i];
}
int axis_offset = 0;
uint8_t *dst_base = (output);
size_t output_stride = after_axis_size * inputs_output_shape[input_num][axis];
int output_stride = after_axis_size * inputs_output_shape[input_num][axis];
for (int i = 0; i < input_num; ++i) {
const uint8_t *src_base = (input[i]);
size_t input_stride = after_axis_size * inputs_output_shape[i][axis];
if (inputs_output_shape[i] == NULL) {
continue;
}
int input_stride = after_axis_size * inputs_output_shape[i][axis];
int offset = UP_DIV(input_stride, thread_num);
int count = input_stride - offset * task_id;
if (count <= 0) {

View File

@ -22,17 +22,17 @@ void DepthToSpaceForNHWC(const void *input, void *output, const int *in_shape, c
int32_t in_shape_dim1 = in_shape[1];
size_t copy_size = block_size * param->out_stride_dim2_ * param->data_type_size_;
for (int i = 0; i < in_shape[0]; ++i) {
size_t in_offset_n = i * param->in_stride_dim0_;
size_t out_offset_n = i * param->out_stride_dim0_;
int in_offset_n = i * param->in_stride_dim0_;
int out_offset_n = i * param->out_stride_dim0_;
for (int j = 0; j < in_shape_dim1; ++j) {
size_t in_offset_h = in_offset_n + j * param->in_stride_dim1_;
size_t out_offset_h = out_offset_n + j * block_size * param->out_stride_dim1_;
int in_offset_h = in_offset_n + j * param->in_stride_dim1_;
int out_offset_h = out_offset_n + j * block_size * param->out_stride_dim1_;
for (int k = 0; k < in_shape_dim2; ++k) {
size_t in_offset_w = in_offset_h + k * param->in_stride_dim2_;
size_t out_offset_w = out_offset_h + k * block_size * param->out_stride_dim2_;
int in_offset_w = in_offset_h + k * param->in_stride_dim2_;
int out_offset_w = out_offset_h + k * block_size * param->out_stride_dim2_;
for (int l = 0; l < block_size; ++l) {
size_t out_offset = (out_offset_w + l * param->out_stride_dim1_) * param->data_type_size_;
size_t in_offset = (in_offset_w + l * block_size * param->out_stride_dim2_) * param->data_type_size_;
int out_offset = (out_offset_w + l * param->out_stride_dim1_) * param->data_type_size_;
int in_offset = (in_offset_w + l * block_size * param->out_stride_dim2_) * param->data_type_size_;
memcpy((int8_t *)output + out_offset, (int8_t *)input + in_offset, copy_size);
}
}

View File

@ -118,7 +118,9 @@ int B(const float *poly_array, float *matrix_b, int in_unit) {
float matrix_t[MAX_LEN]; // n * in_unit
T(poly_array, matrix_t, n);
LT(poly_array, matrix_lt, n);
if (LT(poly_array, matrix_lt, n) != NNACL_OK) {
return NNACL_ERR;
}
MatrixTranspose(matrix_lt, matrix_l, n, n);
MatrixMultiply(matrix_l, matrix_t, matrix_b, n, n, in_unit);
matrix_b[in_unit * in_unit - 1] = 1;

View File

@ -47,43 +47,43 @@ void DoSlice(const void *input, void *output, SliceParameter *param, int thread_
int8_t *int8_in = (int8_t *)input;
int8_t *int8_out = (int8_t *)output;
size_t out_stride[8];
int out_stride[8];
out_stride[7] = 1;
for (int i = 6; i >= 0; --i) {
out_stride[i] = out_stride[i + 1] * param->size_[i + 1];
}
size_t count_per_thread = UP_DIV(param->size_[5], param->op_parameter_.thread_num_);
size_t thread_begin = thread_id * count_per_thread;
size_t thread_end = MSMIN(param->size_[5], thread_begin + count_per_thread);
size_t copy_size = param->size_[7] * data_size;
size_t in_stride[8];
int count_per_thread = UP_DIV(param->size_[5], param->op_parameter_.thread_num_);
int thread_begin = thread_id * count_per_thread;
int thread_end = MSMIN(param->size_[5], thread_begin + count_per_thread);
int copy_size = param->size_[7] * data_size;
int in_stride[8];
in_stride[7] = 1;
for (int i = 6; i >= 0; --i) {
in_stride[i] = param->shape_[i + 1] * in_stride[i + 1];
}
for (int ii = 0; ii < param->size_[0]; ++ii) {
size_t out_offset0 = ii * out_stride[0];
size_t in_offset0 = (ii + param->begin_[0]) * in_stride[0] + param->begin_[7];
int out_offset0 = ii * out_stride[0];
int in_offset0 = (ii + param->begin_[0]) * in_stride[0] + param->begin_[7];
for (int jj = 0; jj < param->size_[1]; ++jj) {
size_t out_offset1 = jj * out_stride[1] + out_offset0;
size_t in_offset1 = (jj + param->begin_[1]) * in_stride[1] + in_offset0;
int out_offset1 = jj * out_stride[1] + out_offset0;
int in_offset1 = (jj + param->begin_[1]) * in_stride[1] + in_offset0;
for (int kk = 0; kk < param->size_[2]; ++kk) {
size_t out_offset2 = kk * out_stride[2] + out_offset1;
size_t in_offset2 = (kk + param->begin_[2]) * in_stride[2] + in_offset1;
int out_offset2 = kk * out_stride[2] + out_offset1;
int in_offset2 = (kk + param->begin_[2]) * in_stride[2] + in_offset1;
for (int ll = 0; ll < param->size_[3]; ++ll) {
size_t out_offset3 = ll * out_stride[3] + out_offset2;
size_t in_offset3 = (ll + param->begin_[3]) * in_stride[3] + in_offset2;
int out_offset3 = ll * out_stride[3] + out_offset2;
int in_offset3 = (ll + param->begin_[3]) * in_stride[3] + in_offset2;
for (int i = 0; i < param->size_[4]; ++i) {
size_t out_offset4 = i * out_stride[4] + out_offset3;
size_t in_offset4 = (i + param->begin_[4]) * in_stride[4] + in_offset3;
for (size_t j = thread_begin; j < thread_end; ++j) {
size_t out_offset5 = j * out_stride[5] + out_offset4;
size_t in_offset5 = (j + param->begin_[5]) * in_stride[5] + in_offset4;
int out_offset4 = i * out_stride[4] + out_offset3;
int in_offset4 = (i + param->begin_[4]) * in_stride[4] + in_offset3;
for (int j = thread_begin; j < thread_end; ++j) {
int out_offset5 = j * out_stride[5] + out_offset4;
int in_offset5 = (j + param->begin_[5]) * in_stride[5] + in_offset4;
for (int k = 0; k < param->size_[6]; ++k) {
size_t out_offset6 = k * out_stride[6] + out_offset5;
size_t in_offset6 = (k + param->begin_[6]) * in_stride[6] + in_offset5;
int out_offset6 = k * out_stride[6] + out_offset5;
int in_offset6 = (k + param->begin_[6]) * in_stride[6] + in_offset5;
memcpy(int8_out + out_offset6 * data_size, int8_in + in_offset6 * data_size, copy_size);
}
}
@ -105,8 +105,8 @@ void DoSliceNoParallel(const void *input, void *output, SliceParameter *param, i
int8_t *int8_in = (int8_t *)input;
int8_t *int8_out = (int8_t *)output;
size_t copy_size = param->size_[7] * data_size;
size_t in_stride[8];
int copy_size = param->size_[7] * data_size;
int in_stride[8];
in_stride[7] = 1;
for (int i = 6; i >= 0; --i) {
in_stride[i] = param->shape_[i + 1] * in_stride[i + 1];
@ -115,9 +115,9 @@ void DoSliceNoParallel(const void *input, void *output, SliceParameter *param, i
for (int i = 0; i < DIMENSION_8D; ++i) {
axis_copy_flag[i] = WhetherCopyByAxis(param->begin_, param->end_, param->shape_, i);
}
size_t out_offset = 0;
int out_offset = 0;
for (int32_t dim0 = param->begin_[0]; dim0 < param->end_[0]; ++dim0) {
size_t in_offset0 = dim0 * in_stride[0] + param->begin_[7];
int in_offset0 = dim0 * in_stride[0] + param->begin_[7];
#define FAST_COPY_IF_NEED(rank) \
if (axis_copy_flag[rank]) { \
int left_block_num = param->end_[rank] - dim##rank; \
@ -128,24 +128,24 @@ void DoSliceNoParallel(const void *input, void *output, SliceParameter *param, i
continue; \
}
FAST_COPY_IF_NEED(0);
for (size_t dim1 = param->begin_[1]; dim1 < param->end_[1]; ++dim1) {
size_t in_offset1 = dim1 * in_stride[1] + in_offset0;
for (int dim1 = param->begin_[1]; dim1 < param->end_[1]; ++dim1) {
int in_offset1 = dim1 * in_stride[1] + in_offset0;
FAST_COPY_IF_NEED(1);
for (int32_t dim2 = param->begin_[2]; dim2 < param->end_[2]; ++dim2) {
size_t in_offset2 = in_offset1 + dim2 * in_stride[2];
int in_offset2 = in_offset1 + dim2 * in_stride[2];
FAST_COPY_IF_NEED(2);
for (int32_t dim3 = param->begin_[3]; dim3 < param->end_[3]; ++dim3) {
size_t in_offset3 = in_offset2 + dim3 * in_stride[3];
int in_offset3 = in_offset2 + dim3 * in_stride[3];
FAST_COPY_IF_NEED(3);
for (int32_t dim4 = param->begin_[4]; dim4 < param->end_[4]; ++dim4) {
size_t in_offset4 = in_offset3 + dim4 * in_stride[4];
int in_offset4 = in_offset3 + dim4 * in_stride[4];
FAST_COPY_IF_NEED(4);
for (int32_t dim5 = param->begin_[5]; dim5 < param->end_[5]; ++dim5) {
size_t in_offset5 = in_offset4 + dim5 * in_stride[5];
int in_offset5 = in_offset4 + dim5 * in_stride[5];
FAST_COPY_IF_NEED(5);
#undef FAST_COPY_IF_NEED
for (int32_t dim6 = param->begin_[6]; dim6 < param->end_[6]; ++dim6) {
size_t in_offset6 = in_offset5 + dim6 * in_stride[6];
int in_offset6 = in_offset5 + dim6 * in_stride[6];
memcpy(int8_out + out_offset * data_size, int8_in + in_offset6 * data_size, copy_size);
out_offset += param->size_[7];
}

View File

@ -21,10 +21,6 @@
int DoSplit(void *in_data, void **out_data, const int *input_shape, int offset, int num_unit,
SplitParameter *split_param, int data_size) {
if (in_data == NULL || out_data == NULL) {
return NNACL_ERR;
}
int8_t *int8_in = (int8_t *)in_data;
int num_split = split_param->num_split_;

View File

@ -26,15 +26,15 @@ void DoCopyData(const uint8_t *input_data, uint8_t *output_data, size_t size, si
}
int DoTileOneDimension(uint8_t *input_data, uint8_t *output_data, size_t dim, const TileParameter *parameter) {
size_t src_dim_size = parameter->in_shape_[dim];
int src_dim_size = parameter->in_shape_[dim];
if (dim == parameter->in_dim_ - 1) {
DoCopyData(input_data, output_data, src_dim_size, parameter->data_size_, parameter->multiples_[dim]);
return 0;
}
for (size_t i = 0; i < src_dim_size; ++i) {
for (size_t j = 0; j < parameter->multiples_[dim]; ++j) {
size_t in_pos = parameter->in_strides_[dim] * i;
size_t out_pos = parameter->out_strides_[dim] * (i + j * src_dim_size);
for (int i = 0; i < src_dim_size; ++i) {
for (int j = 0; j < parameter->multiples_[dim]; ++j) {
int in_pos = parameter->in_strides_[dim] * i;
int out_pos = parameter->out_strides_[dim] * (i + j * src_dim_size);
DoTileOneDimension(input_data + in_pos * parameter->data_size_, output_data + out_pos * parameter->data_size_,
dim + 1, parameter);
}

View File

@ -18,20 +18,20 @@
#define MINDSPORE_NNACL_BASE_TILE_H_
#include "nnacl/op_base.h"
#define MAX_TILE_DIM_SIZE 8
typedef struct TileParameter {
// primitive parameter
OpParameter op_parameter_;
int multiples_[8];
int dims_[8];
int multiples_[MAX_TILE_DIM_SIZE];
int dims_[MAX_TILE_DIM_SIZE];
size_t dims_size_;
size_t multiples_size_;
// shape correlative
int in_shape_[8];
int out_shape_[8];
int in_strides_[8];
int out_strides_[8];
int in_shape_[MAX_TILE_DIM_SIZE];
int out_shape_[MAX_TILE_DIM_SIZE];
int in_strides_[MAX_TILE_DIM_SIZE];
int out_strides_[MAX_TILE_DIM_SIZE];
// other parameter
int in_dim_;

View File

@ -184,7 +184,7 @@
for (int i = dims - 1; i > 0; --i) { \
*(size + i - 1) = *(size + i) * output_shape[i]; \
} \
for (size_t idx = 0; idx < (*size) * output_shape[0]; ++idx) { \
for (int idx = 0; idx < (*size) * output_shape[0]; ++idx) { \
int pos = idx; \
int output_idx = 0; \
int input_idx = 0; \
@ -215,7 +215,7 @@
return; \
} \
count = MSMIN(offset_size, count); \
for (size_t idx = task_offset; idx < task_offset + count; ++idx) { \
for (int idx = task_offset; idx < task_offset + count; ++idx) { \
int pos = idx; \
int output_idx = 0; \
int input_idx = 0; \

View File

@ -16,15 +16,19 @@
#include "nnacl/common_func.h"
int offset(const int *shape, const int dim0, const int dim1, const int dim2, const int dim3) {
int Offset(const int *shape, const int dim0, const int dim1, const int dim2, const int dim3) {
return ((dim0 * shape[1] + dim1) * shape[2] + dim2) * shape[3] + dim3;
}
int offsetComm(const int *shape, const int dim0, const int dim1, const int dim2) {
int OffsetComm(const int *shape, const int dim0, const int dim1, const int dim2) {
return ((dim0 * shape[1] + dim1) * shape[2] + dim2) * shape[3];
}
int offset4d(const int *shape, const int *dims) { return offset(shape, dims[0], dims[1], dims[2], dims[3]); }
int Offset4d(const int *shape, const int *dims) { return Offset(shape, dims[0], dims[1], dims[2], dims[3]); }
int Offset6d(const int *shape, const int *dims) {
return ((OffsetComm(shape, dims[0], dims[1], dims[2]) + dims[3]) * shape[4] + dims[4]) * shape[5];
}
int8_t MinInt8(int8_t a, int8_t b) { return b ^ ((a ^ b) & -(a < b)); }
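
Note that Offset6d folds only the first five indices and then scales by shape[5]; the innermost offset is added by the caller (PadFp16 below adds paddings[10] for exactly this reason). A small self-check of the row-major arithmetic:

#include <cassert>

int Offset6d(const int *shape, const int *dims) {
  return ((((dims[0] * shape[1] + dims[1]) * shape[2] + dims[2]) * shape[3] + dims[3]) *
              shape[4] + dims[4]) * shape[5];
}

int main() {
  const int shape[6] = {2, 3, 4, 5, 6, 7};
  const int dims[6] = {1, 2, 3, 4, 5, 0};
  // Row-major offset of element (1,2,3,4,5) with the last dimension left at 0.
  assert(Offset6d(shape, dims) == ((((1 * 3 + 2) * 4 + 3) * 5 + 4) * 6 + 5) * 7);
  return 0;
}
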

View File

@ -36,9 +36,10 @@ void ReluFp32C8(float *data, float *dst, int ele_num);
void Relu6Fp32C8(float *data, float *dst, int ele_num);
#endif
#endif
int offset(const int *shape, const int dim0, const int dim1, const int dim2, const int dim3);
int offsetComm(const int *shape, const int dim0, const int dim1, const int dim2);
int offset4d(const int *shape, const int *dims);
int Offset(const int *shape, const int dim0, const int dim1, const int dim2, const int dim3);
int OffsetComm(const int *shape, const int dim0, const int dim1, const int dim2);
int Offset4d(const int *shape, const int *dims);
int Offset6d(const int *shape, const int *dims);
static inline bool isAddOverflow(int32_t x, int32_t y) {
int32_t sum = x + y;

View File

@ -19,16 +19,22 @@
void PadFp16(const float16_t *input_data, float16_t *output_data, const int *input_shape, const int *output_shape,
const int *paddings, const int tid, const int thread_num) {
int in[4], out[4];
int in[DEFAULT_PAD_NDIMS], out[DEFAULT_PAD_NDIMS];
for (in[0] = 0; in[0] < input_shape[0]; in[0]++) {
out[0] = in[0] + paddings[0];
for (in[1] = tid; in[1] < input_shape[1]; in[1] += thread_num) {
out[1] = in[1] + paddings[2];
for (in[2] = 0; in[2] < input_shape[2]; in[2]++) {
out[2] = in[2] + paddings[4];
float16_t *dst = output_data + offset(output_shape, out[0], out[1], out[2], paddings[6]);
const float16_t *src = input_data + offset(input_shape, in[0], in[1], in[2], 0);
memcpy(dst, src, input_shape[3] * sizeof(float16_t));
for (in[3] = 0; in[3] < input_shape[3]; in[3]++) {
out[3] = in[3] + paddings[6];
for (in[4] = 0; in[4] < input_shape[4]; in[4]++) {
out[4] = in[4] + paddings[8];
float16_t *dst = output_data + Offset6d(output_shape, out) + paddings[10];
const float16_t *src = input_data + Offset6d(input_shape, in);
memcpy(dst, src, input_shape[5] * sizeof(float16_t));
}
}
}
}
}

View File

@ -152,17 +152,15 @@ int AdamDeltaFp32(float *delta, float *m, float *v, float lr, float beta1, float
return NNACL_OK;
}
int AdamWeightDecayFp32(float *var, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay,
const float *gradient, size_t start, size_t end) {
size_t AdamWeightDecayFp32(float *var, float *m, float *v, float lr, float beta1, float beta2, float epsilon,
float decay, const float *gradient, size_t start, size_t end) {
size_t c1 = start;
#ifdef ENABLE_AVX512
const float beta1_minus = 1 - beta1;
const float beta2_minus = 1 - beta2;
struct AVX_Data beta1_r, beta2_r, beta1_minus_r, beta2_minus_r, lr_neg_r, epsilon_r, decay_r;
beta1_r.data = _mm512_set1_ps(beta1);
beta2_r.data = _mm512_set1_ps(beta2);
beta1_minus_r.data = _mm512_set1_ps(beta1_minus);
beta2_minus_r.data = _mm512_set1_ps(beta2_minus);
beta1_minus_r.data = _mm512_set1_ps(1.0f - beta1);
beta2_minus_r.data = _mm512_set1_ps(1.0f - beta2);
lr_neg_r.data = _mm512_set1_ps(-lr);
epsilon_r.data = _mm512_set1_ps(epsilon);
decay_r.data = _mm512_set1_ps(decay);
@ -260,17 +258,15 @@ int AdamWeightDecayFp32(float *var, float *m, float *v, float lr, float beta1, f
return c1;
}
int FusedAdamFp32(float *var, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay,
const int16_t *gradient16, size_t start, size_t end) {
size_t FusedAdamFp32(float *var, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay,
const int16_t *gradient16, size_t start, size_t end) {
size_t c1 = start;
#ifdef ENABLE_AVX512
const float beta1_minus = 1 - beta1;
const float beta2_minus = 1 - beta2;
struct AVX_Data beta1_r, beta2_r, beta1_minus_r, beta2_minus_r, lr_neg_r, epsilon_r, decay_r;
beta1_r.data = _mm512_set1_ps(beta1);
beta2_r.data = _mm512_set1_ps(beta2);
beta1_minus_r.data = _mm512_set1_ps(beta1_minus);
beta2_minus_r.data = _mm512_set1_ps(beta2_minus);
beta1_minus_r.data = _mm512_set1_ps(1.0f - beta1);
beta2_minus_r.data = _mm512_set1_ps(1.0f - beta2);
lr_neg_r.data = _mm512_set1_ps(-lr);
epsilon_r.data = _mm512_set1_ps(epsilon);
decay_r.data = _mm512_set1_ps(decay);

View File

@ -71,10 +71,10 @@ int AdamFp32(float *var, float *m, float *v, float lr, float beta1, float beta2,
size_t start, size_t end, bool use_nesterov);
int AdamDeltaFp32(float *delta, float *m, float *v, float lr, float beta1, float beta2, float epsilon,
const float *gradient, size_t start, size_t end, bool use_nesterov);
int AdamWeightDecayFp32(float *var, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay,
const float *gradient, size_t start, size_t end);
int FusedAdamFp32(float *var, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay,
const int16_t *gradient16, size_t start, size_t end);
size_t AdamWeightDecayFp32(float *var, float *m, float *v, float lr, float beta1, float beta2, float epsilon,
float decay, const float *gradient, size_t start, size_t end);
size_t FusedAdamFp32(float *var, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay,
const int16_t *gradient16, size_t start, size_t end);
#ifdef __cplusplus
}
#endif
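
The changed return type encodes a common SIMD convention: the routine consumes whole vector blocks inside [start, end) and returns the first index it did not touch, leaving the scalar tail to the caller (the "// remaining" loop in the CPU kernel above). A minimal sketch of the contract, with a hypothetical helper standing in for the AVX-512 body:

#include <cstddef>

// Process full 16-float blocks of [start, end); return the first untouched index.
size_t ScaleBlocks(float *x, float s, size_t start, size_t end) {
  const size_t kBlock = 16;  // one AVX-512 register's worth of floats
  size_t i = start;
  for (; i + kBlock <= end; i += kBlock) {
    for (size_t j = 0; j < kBlock; ++j) x[i + j] *= s;  // stand-in for the SIMD body
  }
  return i;
}

int main() {
  float buf[37];
  for (size_t i = 0; i < 37; ++i) buf[i] = 1.0f;
  const size_t tail = ScaleBlocks(buf, 2.0f, 0, 37);  // tail == 32
  for (size_t i = tail; i < 37; ++i) buf[i] *= 2.0f;  // scalar remainder
  return buf[36] == 2.0f ? 0 : 1;
}
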

View File

@ -49,8 +49,8 @@ void ArgMaxTopK1(const float *input, void *output, float *output_value, const Ar
float *outputfp32 = (float *)output;
int *outputint = (int *)output;
for (int i = 0; i < pre_axis_count; ++i) {
size_t output_offset = i * after_axis_count;
size_t input_offset = output_offset * axis_count;
int output_offset = i * after_axis_count;
int input_offset = output_offset * axis_count;
for (int j = 0; j < after_axis_count; ++j) {
float value = -FLT_MAX;
int index = 0;
@ -79,8 +79,8 @@ void ArgMinTopK1(const float *input, void *output, float *output_value, const Ar
float *outputfp32 = (float *)output;
int *outputint = (int *)output;
for (int i = 0; i < pre_axis_count; ++i) {
size_t output_offset = i * after_axis_count;
size_t input_offset = output_offset * axis_count;
int output_offset = i * after_axis_count;
int input_offset = output_offset * axis_count;
for (int j = 0; j < after_axis_count; ++j) {
float value = FLT_MAX;
int index = 0;
@ -109,13 +109,13 @@ void ArgMinMaxDim0(const float *input, void *output, float *output_value, const
int *outputint = (int *)output;
for (int32_t i = 0; i < param->in_strides_[0]; ++i) {
for (int j = 0; j < in_shape[0]; ++j) {
size_t offset = param->in_strides_[0] * j + i;
int offset = param->in_strides_[0] * j + i;
param->arg_elements_[j].index_ = j;
param->arg_elements_[j].data_.f_data_ = input[offset];
}
qsort(param->arg_elements_, in_shape[0], sizeof(ArgElement), *compare_func);
for (int j = 0; j < param->topk_; ++j) {
size_t out_offset = j * param->out_strides_[0] + i;
int out_offset = j * param->out_strides_[0] + i;
if (param->out_value_) {
outputfp32[out_offset] = param->arg_elements_[j].data_.f_data_;
} else {
@ -135,17 +135,17 @@ void ArgMinMaxDim1(const float *input, void *output, float *output_value, const
int *outputint = (int *)output;
int in_shape1 = in_shape[1];
for (int i = 0; i < in_shape[0]; ++i) {
size_t in_dim0_offset = i * param->in_strides_[0];
size_t out_dim0_offset = i * param->out_strides_[0];
int in_dim0_offset = i * param->in_strides_[0];
int out_dim0_offset = i * param->out_strides_[0];
for (int j = 0; j < param->in_strides_[1]; ++j) {
for (int k = 0; k < in_shape1; ++k) {
size_t offset = param->in_strides_[1] * k + in_dim0_offset + j;
int offset = param->in_strides_[1] * k + in_dim0_offset + j;
param->arg_elements_[k].index_ = k;
param->arg_elements_[k].data_.f_data_ = input[offset];
}
qsort(param->arg_elements_, in_shape1, sizeof(ArgElement), *compare_func);
for (int k = 0; k < param->topk_; ++k) {
size_t out_offset = out_dim0_offset + j + k * param->out_strides_[1];
int out_offset = out_dim0_offset + j + k * param->out_strides_[1];
if (param->out_value_) {
outputfp32[out_offset] = param->arg_elements_[k].data_.f_data_;
} else {
@ -167,20 +167,20 @@ void ArgMinMaxDim2(const float *input, void *output, float *output_value, const
float *outputfp32 = (float *)output;
int *outputint = (int *)output;
for (int i = 0; i < in_shape[0]; ++i) {
size_t in_dim0_offset = i * param->in_strides_[0];
size_t out_dim0_offset = i * param->out_strides_[0];
int in_dim0_offset = i * param->in_strides_[0];
int out_dim0_offset = i * param->out_strides_[0];
for (int j = 0; j < in_shape1; ++j) {
size_t in_dim1_offset = j * param->in_strides_[1] + in_dim0_offset;
size_t out_dim1_offset = j * param->out_strides_[1] + out_dim0_offset;
int in_dim1_offset = j * param->in_strides_[1] + in_dim0_offset;
int out_dim1_offset = j * param->out_strides_[1] + out_dim0_offset;
for (int k = 0; k < param->in_strides_[2]; ++k) {
for (int l = 0; l < in_shape2; ++l) {
size_t offset = param->in_strides_[2] * l + k + in_dim1_offset;
int offset = param->in_strides_[2] * l + k + in_dim1_offset;
param->arg_elements_[l].index_ = l;
param->arg_elements_[l].data_.f_data_ = input[offset];
}
qsort(param->arg_elements_, in_shape2, sizeof(ArgElement), *compare_func);
for (int l = 0; l < param->topk_; ++l) {
size_t out_offset = out_dim1_offset + k + l * param->out_strides_[2];
int out_offset = out_dim1_offset + k + l * param->out_strides_[2];
if (param->out_value_) {
outputfp32[out_offset] = param->arg_elements_[l].data_.f_data_;
} else {
@ -203,26 +203,26 @@ void ArgMinMaxDim3(const float *input, void *output, float *output_value, const
float *outputfp32 = (float *)output;
int *outputint = (int *)output;
for (int i = 0; i < in_shape[0]; ++i) {
size_t in_dim0_offset = i * param->in_strides_[0];
size_t out_dim0_offset = i * param->out_strides_[0];
int in_dim0_offset = i * param->in_strides_[0];
int out_dim0_offset = i * param->out_strides_[0];
for (int j = 0; j < in_shape1; ++j) {
size_t in_dim1_offset = j * param->in_strides_[1] + in_dim0_offset;
size_t out_dim1_offset = j * param->out_strides_[1] + out_dim0_offset;
int in_dim1_offset = j * param->in_strides_[1] + in_dim0_offset;
int out_dim1_offset = j * param->out_strides_[1] + out_dim0_offset;
for (int k = 0; k < in_shape2; ++k) {
size_t in_dim2_offset = k * param->in_strides_[2] + in_dim1_offset;
size_t out_dim2_offset = k * param->out_strides_[2] + out_dim1_offset;
int in_dim2_offset = k * param->in_strides_[2] + in_dim1_offset;
int out_dim2_offset = k * param->out_strides_[2] + out_dim1_offset;
for (int l = 0; l < in_shape3; ++l) {
size_t offset = l + in_dim2_offset;
int offset = l + in_dim2_offset;
param->arg_elements_[l].index_ = l;
param->arg_elements_[l].data_.f_data_ = input[offset];
}
qsort(param->arg_elements_, in_shape3, sizeof(ArgElement), *compare_func);
for (int l = 0; l < param->topk_; ++l) {
size_t out_offset = out_dim2_offset + l;
int out_offset = out_dim2_offset + l;
if (param->out_value_) {
outputfp32[out_offset] = param->arg_elements_[l].data_.f_data_;
} else {
outputint[out_offset] = param->arg_elements_[l].index_;
outputint[out_offset] = (int)(param->arg_elements_[l].index_);
}
if (output_value != NULL) {
output_value[out_offset] = param->arg_elements_[l].data_.f_data_;

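Editor's note: the ArgMinMaxDim* kernels above all share one selection pattern — every candidate along the reduced axis is packed into an ArgElement, the array is sorted with qsort using a direction-dependent comparator, and the first topk_ entries are emitted. A minimal self-contained sketch of that pattern follows; the ArgElement here is simplified to a plain float, whereas the real nnacl struct carries a union over several data types.

#include <stdio.h>
#include <stdlib.h>

typedef struct {
  int index_;
  float data_;
} ArgElement;

/* Sort descending by value, i.e. argmax ordering. */
static int CompareDesc(const void *a, const void *b) {
  float fa = ((const ArgElement *)a)->data_;
  float fb = ((const ArgElement *)b)->data_;
  return (fa < fb) - (fa > fb);
}

/* Write the indices of the topk largest values of input[0..n) into out_index. */
static void TopK(const float *input, int n, int topk, int *out_index) {
  ArgElement *elems = (ArgElement *)malloc((size_t)n * sizeof(ArgElement));
  if (elems == NULL) return;
  for (int i = 0; i < n; ++i) {
    elems[i].index_ = i;
    elems[i].data_ = input[i];
  }
  qsort(elems, (size_t)n, sizeof(ArgElement), CompareDesc);
  for (int k = 0; k < topk; ++k) {
    out_index[k] = elems[k].index_;
  }
  free(elems);
}

int main(void) {
  const float data[] = {0.3f, 2.5f, -1.0f, 7.0f, 0.9f};
  int top2[2];
  TopK(data, 5, 2, top2);
  printf("%d %d\n", top2[0], top2[1]); /* 3 1 */
  return 0;
}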
View File

@ -21,10 +21,10 @@ void PostConvFuncComm(const float *src_ptr_, float *out_ptr, const float *bias_p
if (size == 0) {
return;
}
for (int oc = 0; oc < output_channel; oc++) {
for (size_t oc = 0; oc < output_channel; oc++) {
int oc_div = oc / size;
int oc_mod = oc % size;
for (int hw = 0; hw < plane_size; hw++) {
for (int hw = 0; hw < (int)plane_size; hw++) {
int src_index = oc_div * size * plane_stride + hw * size + oc_mod;
int dst_index = hw * oc_stride + oc;
float value = src_ptr_[src_index];

View File

@ -52,7 +52,8 @@ int ConvDw(float *output_data, const float *input_data, const float *weight_data
int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->input_h_ - ih_origin, conv_param->dilation_h_));
for (int ow = 0; ow < conv_param->output_w_; ow++) {
memcpy(dst_data + ow * conv_param->output_channel_, bias_data, conv_param->output_channel_ * sizeof(float));
memcpy(dst_data + ow * conv_param->output_channel_, bias_data,
conv_param->output_channel_ * (int)(sizeof(float)));
}
for (int kh = start_kh; kh < end_kh; kh++) {
int ih = ih_origin + conv_param->dilation_w_ * kh;
@ -764,10 +765,10 @@ void ConvDwFp32IndirectRow(float *output, float **input, const float *weights, c
int output_width, int input_stride, bool relu, bool relu6, int kernel) {
do {
float **in = input;
size_t c = channels;
size_t c = (size_t)channels;
const float *w = weights;
float *out = output;
memcpy(out, bias, channels * sizeof(float));
memcpy(out, bias, channels * (int)sizeof(float));
for (; c >= C4NUM; c -= C4NUM) {
for (int i = 0; i < C4NUM; i++) {
for (int k = 0; k < kernel; k++) {

View File

@ -61,7 +61,7 @@ void DeConvPostFp32C8(const float *src, float *tmp, const float *bias, float *ds
for (int c = 0; c < oc8; c += 8) {
float *dst_ptr = tmp + c * output_plane;
const float *src_ptr = src + c * in_plane_round * kernel_plane;
memset(dst_ptr, 0, output_plane * C8NUM * sizeof(float));
memset(dst_ptr, 0, output_plane * C8NUM * (int)sizeof(float));
for (int ih = 0; ih < conv_param->input_h_; ih++) {
for (int iw = 0; iw < conv_param->input_w_; iw++) {

View File

@ -43,7 +43,7 @@ int CopyData(float *input_data, const int *ids, float *output_data, int num,
parameter->is_regulated_[ids[num]] = true;
}
memcpy(out_data, in_data, sizeof(float) * parameter->layer_size_);
memcpy(out_data, in_data, sizeof(float) * (size_t)(parameter->layer_size_));
return NNACL_OK;
}
@ -52,7 +52,7 @@ int EmbeddingLookup(float *input_data, const int *ids, float *output_data, const
if (parameter->op_parameter_.thread_num_ == 0) {
return NNACL_PARAM_INVALID;
}
for (size_t i = task_id; i < parameter->ids_size_; i += parameter->op_parameter_.thread_num_) {
for (int i = task_id; i < parameter->ids_size_; i += parameter->op_parameter_.thread_num_) {
int ret = CopyData(input_data, ids, output_data, i, parameter);
if (ret != NNACL_OK) {
return ret;

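Editor's note: many hunks in this commit, like the EmbeddingLookup loop above, change a size_t induction variable to int when the loop bound is a signed field (or make both sides unsigned). The motivation is the usual arithmetic-conversion trap, sketched below with an assumed negative count for illustration.

#include <stdio.h>

int main(void) {
  int ids_size = -1; /* hypothetical error value, for illustration only */
  int entered = 0;
  /* ids_size is converted to size_t in the comparison, becoming SIZE_MAX,
   * so the loop body runs even though the logical count is negative. */
  for (size_t i = 0; i < ids_size; ++i) {
    entered = 1;
    break;
  }
  printf("loop entered: %d\n", entered); /* prints 1 -- the hazard */
  return 0;
}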
View File

@ -21,7 +21,7 @@
int GatherNd(const float *input, float *output, const int *in_offset, int area, int count) {
int i = 0;
for (i = 0; i < count; i++) {
(void)memcpy(output + area * i, input + in_offset[i], area * sizeof(float));
(void)memcpy(output + area * i, input + in_offset[i], (size_t)(area) * sizeof(float));
}
return NNACL_OK;
}

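Editor's note: the GatherNd change above, like the ScatterND and Splice hunks further down, moves the byte-count arithmetic for memcpy into size_t via an explicit cast. A hedged sketch of the pattern, with the validity check an assumed caller would want before the cast:

#include <stddef.h>
#include <string.h>

/* Copy `count` floats; reject negative counts before they would wrap to a
 * huge size_t inside the memcpy size expression. */
static int CopyFloats(float *dst, const float *src, int count) {
  if (count < 0) {
    return -1;
  }
  memcpy(dst, src, (size_t)count * sizeof(float));
  return 0;
}

int main(void) {
  float a[3] = {1.f, 2.f, 3.f}, b[3];
  return CopyFloats(b, a, 3);
}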
View File

@ -41,7 +41,7 @@ void PackLstmBias(float *dst, const float *src, int batch, int col, int col_alig
for (int i = 0; i < unidirectional_batch; i++) {
const float *src_batch = src + i * col;
float *dst_batch = dst + i * col_align;
memcpy(dst_batch, src_batch, col * sizeof(float));
memcpy(dst_batch, src_batch, col * (int)sizeof(float));
}
if (is_bidirectional) {
const float *backward_src = src + batch * col;

View File

@ -263,9 +263,9 @@ void RowMajor2Col12Major_arm32(const float *src_c, float *dst_c, size_t col) {
void RowMajor2Col12Major(const float *src_ptr, float *dst_ptr, int row, int col) {
const float *src_r = src_ptr;
float *dst_r = dst_ptr;
size_t ri = 0;
int ri = 0;
for (; ri < (row / C12NUM * C12NUM); ri += C12NUM) {
size_t ci = 0;
int ci = 0;
for (; ci < (col / C4NUM * C4NUM); ci += C4NUM) {
const float *src_c = src_r + ci;
float *dst_c = dst_r + ci * C12NUM;
@ -340,7 +340,7 @@ void RowMajor2Col12Major(const float *src_ptr, float *dst_ptr, int row, int col)
for (; ci < col; ci++) {
const float *src_c = src_r + ci;
float *dst_c = dst_r + ci * C12NUM;
for (size_t i = 0; i < C12NUM; i++) {
for (int i = 0; i < C12NUM; i++) {
dst_c[i] = src_c[i * col];
}
}
@ -348,16 +348,15 @@ void RowMajor2Col12Major(const float *src_ptr, float *dst_ptr, int row, int col)
dst_r += C12NUM * col;
}
for (; ri < row; ri++, dst_r++, src_r += col) {
for (size_t i = 0; i < col; i++) {
for (int i = 0; i < col; i++) {
dst_r[i * C12NUM] = src_r[i];
}
}
for (; ri < UP_ROUND(row, C12NUM); ri++, dst_r++) {
for (size_t i = 0; i < col; i++) {
for (int i = 0; i < col; i++) {
dst_r[i * C12NUM] = 0;
}
}
return;
}
#ifdef ENABLE_ARM64
@ -532,20 +531,20 @@ void RowMajor2Col8Major_arm32(const float *src_c, float *dst_c, size_t col) {
#endif
#endif
void RowMajor2Col8Major(const float *src_ptr, float *dst_ptr, int row, int col) {
size_t row8 = row / C8NUM * C8NUM;
int row8 = row / C8NUM * C8NUM;
#ifdef ENABLE_ARM64
size_t col_skip = col / C8NUM * C8NUM;
int col_skip = col / C8NUM * C8NUM;
int skip_size = C8NUM;
#else
size_t col_skip = col / C4NUM * C4NUM;
int col_skip = col / C4NUM * C4NUM;
int skip_size = C4NUM;
#endif
const float *src_r = src_ptr;
float *dst_r = dst_ptr;
size_t ri = 0;
int ri = 0;
for (; ri < row8; ri += C8NUM) {
size_t ci = 0;
int ci = 0;
for (; ci < col_skip; ci += skip_size) {
const float *src_c = src_r + ci;
float *dst_c = dst_r + ci * C8NUM;
@ -593,7 +592,7 @@ void RowMajor2Col8Major(const float *src_ptr, float *dst_ptr, int row, int col)
for (; ci < col; ci++) {
const float *src_c = src_r + ci;
float *dst_c = dst_r + ci * C8NUM;
for (size_t i = 0; i < C8NUM; i++) {
for (int i = 0; i < C8NUM; i++) {
dst_c[i] = src_c[i * col];
}
}
@ -601,29 +600,28 @@ void RowMajor2Col8Major(const float *src_ptr, float *dst_ptr, int row, int col)
dst_r += C8NUM * col;
}
for (; ri < row; ri++, src_r += col, dst_r++) {
for (size_t i = 0; i < col; i++) {
for (int i = 0; i < col; i++) {
dst_r[i * C8NUM] = src_r[i];
}
}
for (; ri < UP_ROUND(row, C8NUM); ri++, dst_r++) {
for (size_t i = 0; i < col; i++) {
for (int i = 0; i < col; i++) {
dst_r[i * C8NUM] = 0;
}
}
return;
}
void RowMajor2Col16Major(const float *src_ptr, float *dst_ptr, int row, int col) {
size_t row16 = row / C16NUM * C16NUM;
size_t col_skip = col / C4NUM * C4NUM;
int row16 = row / C16NUM * C16NUM;
int col_skip = col / C4NUM * C4NUM;
int skip_size = C4NUM;
const float *src_r = src_ptr;
float *dst_r = dst_ptr;
size_t ri = 0;
int ri = 0;
for (; ri < row16; ri += C16NUM) {
size_t ci = 0;
int ci = 0;
for (; ci < col_skip; ci += skip_size) {
const float *src_c = src_r + ci;
float *dst_c = dst_r + ci * C16NUM;
@ -636,7 +634,7 @@ void RowMajor2Col16Major(const float *src_ptr, float *dst_ptr, int row, int col)
for (; ci < col; ci++) {
const float *src_c = src_r + ci;
float *dst_c = dst_r + ci * C16NUM;
for (size_t i = 0; i < C16NUM; i++) {
for (int i = 0; i < C16NUM; i++) {
dst_c[i] = src_c[i * col];
}
}
@ -644,21 +642,20 @@ void RowMajor2Col16Major(const float *src_ptr, float *dst_ptr, int row, int col)
dst_r += C16NUM * col;
}
for (; ri < row; ri++) {
for (size_t i = 0; i < col; i++) {
for (int i = 0; i < col; i++) {
dst_r[i * C16NUM] = src_r[i];
}
src_r += col;
dst_r += 1;
}
size_t total_row = UP_ROUND(row, C16NUM);
int total_row = UP_ROUND(row, C16NUM);
for (; ri < total_row; ri++) {
for (size_t i = 0; i < col; i++) {
for (int i = 0; i < col; i++) {
dst_r[i * C16NUM] = 0;
}
dst_r += 1;
}
return;
}
void RowMajor2Col32Major(const float *src_ptr, float *dst_ptr, int row, int col) {
@ -680,15 +677,15 @@ void RowMajor2Col32Major(const float *src_ptr, float *dst_ptr, int row, int col)
}
void RowMajor2Col6Major(const float *src_ptr, float *dst_ptr, int row, int col) {
size_t totalRow = UP_ROUND(row, C6NUM);
size_t row6 = row / C6NUM * C6NUM;
size_t col8 = col / C8NUM * C8NUM;
int totalRow = UP_ROUND(row, C6NUM);
int row6 = row / C6NUM * C6NUM;
int col8 = col / C8NUM * C8NUM;
const float *src_r = src_ptr;
float *dst_r = dst_ptr;
size_t ri = 0;
int ri = 0;
for (; ri < row6; ri += C6NUM) {
size_t ci = 0;
int ci = 0;
for (; ci < col8; ci += C8NUM) {
const float *src_c = src_r + ci;
float *dst_c = dst_r + ci * C6NUM;
@ -753,7 +750,7 @@ void RowMajor2Col6Major(const float *src_ptr, float *dst_ptr, int row, int col)
for (; ci < col; ci++) {
const float *src_c = src_r + ci;
float *dst_c = dst_r + ci * C6NUM;
for (size_t i = 0; i < C6NUM; i++) {
for (int i = 0; i < C6NUM; i++) {
dst_c[i] = src_c[i * col];
}
}
@ -762,7 +759,7 @@ void RowMajor2Col6Major(const float *src_ptr, float *dst_ptr, int row, int col)
}
for (; ri < row; ri++) {
for (size_t i = 0; i < col; i++) {
for (int i = 0; i < col; i++) {
dst_r[i * C6NUM] = src_r[i];
}
src_r += col;
@ -770,30 +767,29 @@ void RowMajor2Col6Major(const float *src_ptr, float *dst_ptr, int row, int col)
}
for (; ri < totalRow; ri++) {
for (size_t i = 0; i < col; i++) {
for (int i = 0; i < col; i++) {
dst_r[i * C6NUM] = 0;
}
dst_r += 1;
}
return;
}
void RowMajor2Col4Major(const float *src_ptr, float *dst_ptr, int row, int col) {
size_t total_row = UP_ROUND(row, C4NUM);
size_t row4 = row / C4NUM * C4NUM;
size_t col4 = col / C4NUM * C4NUM;
int total_row = UP_ROUND(row, C4NUM);
int row4 = row / C4NUM * C4NUM;
int col4 = col / C4NUM * C4NUM;
const float *src_r = src_ptr;
float *dst_r = dst_ptr;
size_t ri = 0;
int ri = 0;
for (; ri < row4; ri += C4NUM) {
size_t ci = 0;
int ci = 0;
for (; ci < col4; ci += C4NUM) {
const float *src_c = src_r + ci;
float *dst_c = dst_r + ci * C4NUM;
#ifdef ENABLE_ARM32
size_t stride = col * 4;
int stride = col * 4;
asm volatile(
"mov r10, %[src_c]\n"
"mov r12, %[dst_c]\n"
@ -840,8 +836,8 @@ void RowMajor2Col4Major(const float *src_ptr, float *dst_ptr, int row, int col)
_mm_storeu_ps(dst_c + 8, dst2);
_mm_storeu_ps(dst_c + 12, dst3);
#else
for (int tr = 0; tr < C4NUM; tr++) {
for (int tc = 0; tc < C4NUM; tc++) {
for (size_t tr = 0; tr < C4NUM; tr++) {
for (size_t tc = 0; tc < C4NUM; tc++) {
dst_c[tc * C4NUM + tr] = src_c[tr * col + tc];
}
}
@ -850,7 +846,7 @@ void RowMajor2Col4Major(const float *src_ptr, float *dst_ptr, int row, int col)
for (; ci < col; ci++) {
const float *src_c = src_r + ci;
float *dst_c = dst_r + ci * C4NUM;
for (size_t i = 0; i < C4NUM; i++) {
for (int i = 0; i < C4NUM; i++) {
dst_c[i] = src_c[i * col];
}
}
@ -858,7 +854,7 @@ void RowMajor2Col4Major(const float *src_ptr, float *dst_ptr, int row, int col)
dst_r += C4NUM * col;
}
for (; ri < row; ri++) {
for (size_t i = 0; i < col; i++) {
for (int i = 0; i < col; i++) {
dst_r[i * C4NUM] = src_r[i];
}
src_r += col;
@ -866,12 +862,11 @@ void RowMajor2Col4Major(const float *src_ptr, float *dst_ptr, int row, int col)
}
for (; ri < total_row; ri++) {
for (size_t i = 0; i < col; i++) {
for (int i = 0; i < col; i++) {
dst_r[i * C4NUM] = 0;
}
dst_r += 1;
}
return;
}
#ifndef ENABLE_ARM

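Editor's note: the RowMajor2Col{4,6,8,12,16}Major variants above all implement the same transform for different tile heights — rows are grouped into tiles, each tile is stored column-major, and the last partial tile is zero-padded up to UP_ROUND(row, tile). A scalar reference version, parameterized on the tile height (the name RowMajor2ColTile is illustrative, not part of the nnacl API):

#include <stdio.h>

#define UP_ROUND(x, y) ((((x) + (y) - 1) / (y)) * (y))

static void RowMajor2ColTile(const float *src, float *dst, int row, int col, int tile) {
  int total_row = UP_ROUND(row, tile);
  for (int r = 0; r < total_row; ++r) {
    int tile_idx = r / tile;
    int in_tile = r % tile;
    for (int c = 0; c < col; ++c) {
      float v = (r < row) ? src[r * col + c] : 0.0f; /* zero-pad the tail tile */
      dst[(tile_idx * col + c) * tile + in_tile] = v;
    }
  }
}

int main(void) {
  float src[6] = {1, 2, 3, 4, 5, 6}; /* 3x2 row-major */
  float dst[4 * 2];                  /* UP_ROUND(3, 4) = 4 rows, 2 cols */
  RowMajor2ColTile(src, dst, 3, 2, 4);
  printf("%.0f %.0f %.0f %.0f\n", dst[0], dst[1], dst[2], dst[3]); /* 1 3 5 0 */
  return 0;
}

Calling this with tile = 12 reproduces the scalar fallback path of RowMajor2Col12Major; the per-platform assembly and SSE branches in the diff are hand-tuned versions of the same layout.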
View File

@ -23,16 +23,22 @@ void Pad(const float *input_data, float *output_data, const int *input_shape, co
if (thread_num == 0) {
return;
}
int in[4], out[4];
int in[DEFAULT_PAD_NDIMS], out[DEFAULT_PAD_NDIMS];
for (in[0] = 0; in[0] < input_shape[0]; in[0]++) {
out[0] = in[0] + paddings[0];
for (in[1] = tid; in[1] < input_shape[1]; in[1] += thread_num) {
out[1] = in[1] + paddings[2];
for (in[2] = 0; in[2] < input_shape[2]; in[2]++) {
out[2] = in[2] + paddings[4];
float *dst = output_data + offset(output_shape, out[0], out[1], out[2], paddings[6]);
const float *src = input_data + offset(input_shape, in[0], in[1], in[2], 0);
memcpy(dst, src, input_shape[3] * sizeof(float));
for (in[3] = 0; in[3] < input_shape[3]; in[3]++) {
out[3] = in[3] + paddings[6];
for (in[4] = 0; in[4] < input_shape[4]; in[4]++) {
out[4] = in[4] + paddings[8];
float *dst = output_data + Offset6d(output_shape, out) + paddings[10];
const float *src = input_data + Offset6d(input_shape, in);
memcpy(dst, src, input_shape[5] * (int)(sizeof(float)));
}
}
}
}
}
@ -57,8 +63,7 @@ int TransOut2InputDimIndex(int out_dim_index, int left_pad, int in_dim, int offs
int GetInputFlattenIndex(int out_flatten_index, const int *input_shape, const PadParameter *pad_param) {
int in_flatten_index = 0;
int i;
for (i = 0; i < COMM_SHAPE_SIZE; ++i) {
for (int i = 0; i < DEFAULT_PAD_NDIMS; ++i) {
int left_pad = pad_param->paddings_[i * 2];
NNACL_CHECK_ZERO_RETURN_ERR(pad_param->out_strides[i])
int out_dim_index = out_flatten_index / pad_param->out_strides[i];

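Editor's note: the rewritten Pad loop above indexes through six dimensions with an Offset6d() helper whose definition is not shown in this hunk. The following is therefore an assumed row-major implementation consistent with how it is called; the 4-D Offset() used by the resize hunk below follows the same scheme with four axes.

#include <stdio.h>

#define DEFAULT_PAD_NDIMS 6

/* Row-major flattened offset over a 6-D shape -- an assumption matching the
 * Offset6d(shape, index) call sites in the Pad diff. */
static int Offset6d(const int shape[DEFAULT_PAD_NDIMS], const int idx[DEFAULT_PAD_NDIMS]) {
  int offset = 0;
  for (int i = 0; i < DEFAULT_PAD_NDIMS; ++i) {
    offset = offset * shape[i] + idx[i];
  }
  return offset;
}

int main(void) {
  int shape[6] = {1, 1, 1, 2, 3, 4};
  int idx[6] = {0, 0, 0, 1, 2, 3};
  printf("%d\n", Offset6d(shape, idx)); /* 23, the last of 24 elements */
  return 0;
}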
View File

@ -510,8 +510,8 @@ int ResizeNearestNeighbor(const float *input_data, float *output_data, const int
} else {
input_x = (int)(floorf(actual_x));
}
int in_offset = offset(input_shape, batch, input_y, input_x, 0);
int out_offset = offset(output_shape, batch, y, x, 0);
int in_offset = Offset(input_shape, batch, input_y, input_x, 0);
int out_offset = Offset(output_shape, batch, y, x, 0);
memcpy(output_data + out_offset, input_data + in_offset, c * sizeof(float));
}
}

View File

@ -20,10 +20,8 @@
#include "nnacl/nnacl_utils.h"
int Reverse(const float *input, float *output, size_t elem_size, int *index) {
for (int i = 0; i < elem_size; i++) {
for (size_t i = 0; i < elem_size; i++) {
NNACL_ASSERT(index[i] >= 0);
}
for (int i = 0; i < elem_size; i++) {
output[index[i]] = input[i];
}
return NNACL_OK;

View File

@ -23,7 +23,7 @@ int DoScatterND(float *output_ptr, const float *update, int *output_unit_offsets
return NNACL_ERR;
}
for (int i = 0; i < num_units; i++) {
(void)memcpy(output_ptr + output_unit_offsets[i], update + unit_size * i, unit_size * sizeof(float));
(void)memcpy(output_ptr + output_unit_offsets[i], update + unit_size * i, (size_t)(unit_size) * sizeof(float));
}
return NNACL_OK;
}

View File

@ -25,7 +25,7 @@ void SpliceFp32(const float *src_data, int src_row, int src_col, const SplicePar
forward_index++;
const float *tmp_src_data = src_data + r_off * src_col;
float *tmp_dst_data = dst_row_data + off * src_col;
memcpy(tmp_dst_data, tmp_src_data, src_col * sizeof(float));
memcpy(tmp_dst_data, tmp_src_data, (size_t)(src_col) * sizeof(float));
}
}
}

View File

@ -70,7 +70,7 @@ int DoStridedSliceIntFp64Bool(const void *in_data, void *out_data, StridedSliceP
if (param->num_axes_ < DIMENSION_8D) {
PadStridedSliceParameterTo8D(param);
}
size_t dim_offset[DIMENSION_8D - 1];
int dim_offset[DIMENSION_8D - 1];
dim_offset[6] = in_shape[7];
dim_offset[5] = in_shape[6] * dim_offset[6];
dim_offset[4] = in_shape[5] * dim_offset[5];
@ -132,7 +132,7 @@ int DoStridedSlice(const void *in_data, void *out_data, StridedSliceParameter *p
if (param->num_axes_ < DIMENSION_8D) {
PadStridedSliceParameterTo8D(param);
}
size_t dim_offset[DIMENSION_8D - 1];
int dim_offset[DIMENSION_8D - 1];
dim_offset[6] = in_shape[7];
dim_offset[5] = in_shape[6] * dim_offset[6];
dim_offset[4] = in_shape[5] * dim_offset[5];

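Editor's note: both DoStridedSliceIntFp64Bool and DoStridedSlice above build the same 8-D stride table by hand. Written as a loop, the construction is:

#include <stdio.h>

#define DIMENSION_8D 8

/* dim_offset[i] = number of elements spanned by one step along axis i,
 * i.e. the product of all inner dimensions (axis 7 has implicit stride 1). */
static void BuildDimOffsets(const int in_shape[DIMENSION_8D], int dim_offset[DIMENSION_8D - 1]) {
  dim_offset[DIMENSION_8D - 2] = in_shape[DIMENSION_8D - 1];
  for (int i = DIMENSION_8D - 3; i >= 0; --i) {
    dim_offset[i] = in_shape[i + 1] * dim_offset[i + 1];
  }
}

int main(void) {
  int shape[DIMENSION_8D] = {1, 1, 1, 1, 2, 3, 4, 5};
  int off[DIMENSION_8D - 1];
  BuildDimOffsets(shape, off);
  printf("%d %d %d\n", off[6], off[5], off[4]); /* 5 20 60 */
  return 0;
}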
View File

@ -180,15 +180,15 @@ void TransposeDimsFp32(const float *in_data, float *out_data, const int *output_
int *strides = (int *)(transpose_param->strides_);
int *out_strides = (int *)(transpose_param->out_strides_);
int num_axes = transpose_param->num_axes_;
size_t data_size = (*out_strides) * output_shape[0];
size_t offset_size = UP_DIV(data_size, thread_num);
size_t task_offset = offset_size * task_id;
int data_size = (*out_strides) * output_shape[0];
int offset_size = UP_DIV(data_size, thread_num);
int task_offset = offset_size * task_id;
int count = data_size - task_offset;
if (count <= 0) {
return;
}
count = MSMIN(offset_size, count);
for (size_t idx = task_offset; idx < task_offset + count; ++idx) {
for (int idx = task_offset; idx < task_offset + count; ++idx) {
int pos = idx;
int output_idx = 0;
int input_idx = 0;

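Editor's note: the thread partitioning in TransposeDimsFp32 — now done in plain int — divides the flattened output into equal chunks and clamps the last one. Extracted as a stand-alone sketch:

#include <stdio.h>

#define UP_DIV(x, y) (((x) + (y) - 1) / (y))
#define MSMIN(a, b) ((a) < (b) ? (a) : (b))

/* Split data_size elements into thread_num contiguous chunks, exactly as the
 * kernel does per task_id; trailing tasks may get an empty range. */
int main(void) {
  int data_size = 10, thread_num = 4;
  for (int task_id = 0; task_id < thread_num; ++task_id) {
    int offset_size = UP_DIV(data_size, thread_num);
    int task_offset = offset_size * task_id;
    int count = data_size - task_offset;
    if (count <= 0) {
      printf("task %d: idle\n", task_id);
      continue;
    }
    count = MSMIN(offset_size, count);
    printf("task %d: [%d, %d)\n", task_id, task_offset, task_offset + count);
  }
  return 0;
}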
View File

@ -45,7 +45,7 @@ void WinogradInputTransform(const float *input_data, float *trans_input, float *
int dst_plane_offset = c * in_channel;
for (int ic = 0; ic < ic4; ic++) {
// clear tmp buffer
memset(tmp_data, 0, input_unit * input_unit * C4NUM * sizeof(float));
memset(tmp_data, 0, input_unit * input_unit * C4NUM * (int)(sizeof(float)));
int real_c = in_channel - ic * C4NUM;
real_c = real_c > C4NUM ? C4NUM : real_c;
@ -87,7 +87,7 @@ void WinogradInputTransform(const float *input_data, float *trans_input, float *
// input transform
const int tile_num = C12NUM;
int dst_ic4_offset = dst_plane_offset + ic * C4NUM;
size_t dst_step = tile_num * in_channel;
int dst_step = tile_num * in_channel;
float *trans_input_ptr = trans_input + dst_ic4_offset;
func(tmp_data, trans_input_ptr, C4NUM, dst_step, real_c);
}

View File

@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -17,6 +17,7 @@
#include <math.h>
#include "nnacl/op_base.h"
#include "nnacl/fp32/arithmetic_fp32.h"
#include "nnacl/fp32/exp_fp32.h"
#include "nnacl/fp32_grad/activation_grad.h"
#include "nnacl/errorcode.h"
@ -110,3 +111,27 @@ int GeluGrad(const float *src0, const float *src1, size_t length, float *dst) {
}
return NNACL_OK;
}
int SoftplusGrad(const float *src0, const float *src1, int length, float *dst) {
int i = 0;
#if defined(ENABLE_AVX)
for (; i <= length - C8NUM; i += C8NUM) {
simd_exp_avx(-(MS_LD256_F32(src1 + i)), dst + i);
MS_ST256_F32(dst + i,
MS_DIV256_F32(MS_LD256_F32(src0 + i), MS_ADD256_F32(MS_MOV256_F32(1.0f), MS_LD256_F32(dst + i))));
}
#endif
#if defined(ENABLE_ARM) || defined(ENABLE_SSE)
for (; i <= length - C4NUM; i += C4NUM) {
simd_exp(MS_SUBQ_F32(MS_MOVQ_F32(0.0f), MS_LDQ_F32(src1 + i)), dst + i);
MS_STQ_F32(dst + i, MS_DIVQ_F32(MS_LDQ_F32(src0 + i), MS_ADDQ_F32(MS_MOVQ_F32(1.0f), MS_LDQ_F32(dst + i))));
}
#endif
for (; i < length; ++i) {
single_exp(-src1[i], dst + i);
dst[i] = src0[i] / (1.0f + dst[i]);
}
return NNACL_OK;
}

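Editor's note: the new SoftplusGrad above vectorizes with AVX/NEON/SSE where available and falls back to a scalar tail. The underlying math: softplus(x) = ln(1 + e^x), so its derivative is the logistic sigmoid and the gradient is dy * sigmoid(x). A scalar reference using libm, equivalent to the kernel's tail loop (src0 is dy, src1 is x):

#include <math.h>
#include <stdio.h>

static void SoftplusGradScalar(const float *dy, const float *x, int length, float *dx) {
  for (int i = 0; i < length; ++i) {
    dx[i] = dy[i] / (1.0f + expf(-x[i])); /* dy * sigmoid(x) */
  }
}

int main(void) {
  float dy[3] = {1.0f, 1.0f, 1.0f}, x[3] = {-2.0f, 0.0f, 2.0f}, dx[3];
  SoftplusGradScalar(dy, x, 3, dx);
  printf("%f %f %f\n", dx[0], dx[1], dx[2]); /* ~0.119 0.5 0.881 */
  return 0;
}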
View File

@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -39,6 +39,7 @@ int HSwishGrad(const float *src0, const float *src1, size_t length, float *dst);
int HSigmoidGrad(const float *src0, const float *src1, size_t length, float *dst);
int EluGrad(const float *src0, const float *src1, size_t length, float *dst, float alpha);
int GeluGrad(const float *src0, const float *src1, size_t length, float *dst);
int SoftplusGrad(const float *src, const float *src1, int length, float *dst);
#ifdef __cplusplus
}

View File

@ -231,7 +231,7 @@ static void RowMajor2Col12MajorStride(const float *src_ptr, float *dst_ptr, size
for (; ci < col; ci++) {
const float *src_c = src_r + ci;
float *dst_c = dst_r + ci * C12NUM;
for (size_t i = 0; i < C12NUM; i++) {
for (int i = 0; i < C12NUM; i++) {
dst_c[i] = src_c[i * lead];
}
}
@ -240,7 +240,7 @@ static void RowMajor2Col12MajorStride(const float *src_ptr, float *dst_ptr, size
}
for (; ri < row; ri++) {
for (size_t i = 0; i < col; i++) {
for (int i = 0; i < col; i++) {
dst_r[i * C12NUM] = src_r[i];
}
src_r += lead;
@ -248,12 +248,11 @@ static void RowMajor2Col12MajorStride(const float *src_ptr, float *dst_ptr, size
}
for (; ri < row_up_12; ri++) {
for (size_t i = 0; i < col; i++) {
for (int i = 0; i < col; i++) {
dst_r[i * C12NUM] = 0;
}
dst_r += 1;
}
return;
}
#endif
@ -261,10 +260,10 @@ static void RowMajor2Col8MajorStride(const float *src_ptr, float *dst_ptr, size_
size_t row8 = row / C8NUM * C8NUM;
#ifdef ENABLE_ARM64
size_t col_skip = col / C8NUM * C8NUM;
int skip_size = C8NUM;
size_t skip_size = C8NUM;
#else
size_t col_skip = col / C4NUM * C4NUM;
int skip_size = C4NUM;
size_t skip_size = C4NUM;
#endif
const float *src_r = src_ptr;
float *dst_r = dst_ptr;
@ -450,7 +449,7 @@ static void RowMajor2Col8MajorStride(const float *src_ptr, float *dst_ptr, size_
for (; ci < col; ci++) {
const float *src_c = src_r + ci;
float *dst_c = dst_r + ci * C8NUM;
for (size_t i = 0; i < C8NUM; i++) {
for (int i = 0; i < C8NUM; i++) {
dst_c[i] = src_c[i * lead];
}
}
@ -458,7 +457,7 @@ static void RowMajor2Col8MajorStride(const float *src_ptr, float *dst_ptr, size_
dst_r += C8NUM * col;
}
for (; ri < row; ri++) {
for (size_t i = 0; i < col; i++) {
for (int i = 0; i < col; i++) {
dst_r[i * C8NUM] = src_r[i];
}
src_r += lead;

View File

@ -64,11 +64,11 @@ void ReduceSumByAxes(const float *input, const int *input_dims, float *output, c
if (output_dims[idx] != input_dims[idx]) same_shape = 0;
}
if (same_shape) {
memcpy(output, input, num_outputs * sizeof(float));
memcpy(output, input, (size_t)(num_outputs) * sizeof(float));
return;
}
memset(output, 0, num_outputs * sizeof(float)); // zero output
memset(output, 0, (size_t)(num_outputs) * sizeof(float)); // zero output
int input_iter[8] = {0};
int axes[5] = {0};

View File

@ -37,13 +37,13 @@ void SoftmaxGrad(const float *input_ptr, const float *yt_ptr, float *output_ptr,
for (int i = 0; i < inner_size * input_shape[axis]; i++) sum_mul[i] = 1.0;
for (int i = 0; i < n_dim; i++) dim *= input_shape[i];
dim /= outter_size;
memcpy(output_ptr, yt_ptr, ele_size * sizeof(float));
memcpy(output_ptr, yt_ptr, (size_t)(ele_size) * sizeof(float));
const int M = input_shape[axis];
const int N = inner_size;
for (int i = 0; i < outter_size; i++) {
int outter_offset = i * dim;
memset(sum_data, 0.0f, inner_size * sizeof(float));
memset(sum_data, 0, (size_t)(inner_size) * sizeof(float));
for (int k = 0; k < inner_size; k++) {
int inner_offset = outter_offset + k;
for (int j = 0; j < input_shape[axis]; j++) {

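Editor's note: one small fix above is that the old code passed 0.0f as memset's fill argument. memset takes an int fill byte, so the float only worked through implicit conversion; passing integer 0 is the correct form, and byte-zeroing a float buffer is valid because all-zero bytes encode 0.0f in IEEE-754.

#include <stdio.h>
#include <string.h>

int main(void) {
  float sum_data[4] = {1.f, 2.f, 3.f, 4.f};
  memset(sum_data, 0, sizeof(sum_data)); /* int fill byte, not 0.0f */
  printf("%f\n", sum_data[0]);           /* 0.000000 */
  return 0;
}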
View File

@ -20,7 +20,7 @@
static size_t CalcIndex(const int *shape, size_t size, int i, size_t pos) {
size_t res = 1;
for (size_t j = 0; j < size; j++) {
res *= shape[(i + 1) + j];
res *= shape[((size_t)(i) + 1) + j];
}
return (pos / res % shape[i]);
}
@ -37,7 +37,7 @@ int DoStridedSliceGrad(const float *inputs, float *output, const int *dx_shape,
const int *s = param->strides_;
const int *b = param->begins_;
for (int i = 0; i < DIMENSION_8D; i++) {
size *= param->in_shape_[i];
size *= (size_t)(param->in_shape_[i]);
}
for (size_t pos = 0; pos < size; pos++) {

View File

@ -56,13 +56,13 @@ int AddnInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **o
for (size_t d = 0; d < inputs[max_dims_idx]->shape_size_; ++d) {
size_t max_dim = 0;
for (size_t i = 0; i < inputs_size; ++i) {
size_t shift = max_dims - inputs[i]->shape_size_;
size_t dim = (i < shift) ? 1 : inputs[i]->shape_[d];
size_t shift = max_dims - (size_t)(inputs[i]->shape_size_);
size_t dim = (i < shift) ? 1 : (size_t)(inputs[i]->shape_[d]);
if (dim > max_dim) {
max_dim = dim;
}
}
output->shape_[d] = max_dim; // set the biggest dimension in the output tensor
output->shape_[d] = (int)(max_dim); // set the biggest dimension in the output tensor
}
return NNACL_OK;

View File

@ -17,8 +17,8 @@
#include "nnacl/infer/affine_infer.h"
#include "nnacl/infer/infer_register.h"
int MatmulInfer(AffineParameter *param, int a_shape[MAX_SHAPE_SIZE], size_t a_shape_size, int b_shape[MAX_SHAPE_SIZE],
size_t b_shape_size) {
int MatmulInfer(const AffineParameter *param, int a_shape[MAX_SHAPE_SIZE], size_t a_shape_size,
int b_shape[MAX_SHAPE_SIZE], size_t b_shape_size) {
MatMulParameter *matmul_param = param->matmul_parameter_;
if (matmul_param->a_transpose_) {
if (a_shape_size < 2) {

View File

@ -56,8 +56,8 @@ int ArgMinMaxInferShape(const TensorC *const *inputs, size_t inputs_size, Tensor
int output_shape[MAX_SHAPE_SIZE] = {0};
size_t output_shape_size = 0;
ShapeSet(output_shape, &output_shape_size, input->shape_, input->shape_size_);
size_t input_shape_size = input->shape_size_;
int axis = param->axis_ < 0 ? param->axis_ + (int)input_shape_size : param->axis_;
int input_shape_size = (int)input->shape_size_;
int axis = param->axis_ < 0 ? param->axis_ + input_shape_size : param->axis_;
if (axis >= input_shape_size || axis < 0) {
return NNACL_PARAM_INVALID;
}

View File

@ -55,10 +55,10 @@ int ArithmeticGradInferShape(const TensorC *const *inputs, size_t inputs_size, T
if (GetElementNum(dx1) < GetElementNum(dx2)) {
param->ndim_ = in_shape1_size;
param->in_elements_num0_ = param->ndim_;
param->in_elements_num1_ = param->ndim_;
param->out_elements_num_ = param->ndim_;
int fill_dim_num = in_shape1_size - in_shape0_size; // This will not work for batch!
param->in_elements_num0_ = (int)param->ndim_;
param->in_elements_num1_ = (int)param->ndim_;
param->out_elements_num_ = (int)param->ndim_;
size_t fill_dim_num = in_shape1_size - in_shape0_size; // This will not work for batch!
int j = 0;
for (unsigned int i = 0; i < in_shape1_size; i++) {
if (i < fill_dim_num) {
@ -76,7 +76,7 @@ int ArithmeticGradInferShape(const TensorC *const *inputs, size_t inputs_size, T
param->out_elements_num_ = param->ndim_;
param->broadcasting_ = true;
int j = 0;
int fill_dim_num = in_shape0_size - in_shape1_size;
size_t fill_dim_num = in_shape0_size - in_shape1_size;
for (unsigned int i = 0; i < in_shape0_size; i++) {
if (i < fill_dim_num) {
param->in_shape1_[i] = 1;

View File

@ -66,7 +66,7 @@ int AudioSpectrogramInferShape(const TensorC *const *inputs, size_t inputs_size,
int sample_sub_window = input->shape_[0] - param->window_size_;
output_shape[1] = sample_sub_window < 0 ? 0 : 1 + sample_sub_window / param->stride_;
// compute fft length
int fft_length = GetFftLength(param->window_size_);
int fft_length = (int)GetFftLength(param->window_size_);
output_shape[2] = fft_length / 2 + 1;
SetShapeArray(output, output_shape, 3);
return NNACL_OK;

View File

@ -33,8 +33,8 @@ int BiasGradInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC
int inshape[MAX_SHAPE_SIZE];
size_t inshape_size = 0;
ShapeSet(inshape, &inshape_size, in0->shape_, in0->shape_size_);
int ndim = inshape_size;
for (int i = 0; i < ndim - 1; i++) {
size_t ndim = inshape_size;
for (size_t i = 0; i < ndim - 1; i++) {
inshape[i] = 1;
}
SetDataTypeFormat(out, in0);

View File

@ -111,12 +111,12 @@ int BroadcastToInferShape(const TensorC *const *inputs, size_t inputs_size, Tens
const int *input_shape = input->shape_;
size_t input_shape_size = input->shape_size_;
int shape[MAX_SHAPE_SIZE];
int input_shape_index = input_shape_size - 1;
int input_shape_index = (int)(input_shape_size)-1;
if (input_shape_size > dst_shape_size) {
return NNACL_ERR;
}
for (int i = dst_shape_size - 1; i >= 0; --i) {
for (int i = (int)(dst_shape_size)-1; i >= 0; --i) {
if (dst_shape[i] < 0) {
return NNACL_ERR;
}

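Editor's note: BroadcastToInferShape walks the destination shape from the last axis and matches input dims right-aligned. The compatibility core of that walk, as a hedged stand-alone check (the real function also writes the resolved output shape):

#include <stdio.h>

/* An input dim is broadcastable to the target dim if it is equal or 1;
 * shapes are compared right-aligned. */
static int CanBroadcast(const int *in, int in_size, const int *dst, int dst_size) {
  if (in_size > dst_size) {
    return 0;
  }
  int i = in_size - 1;
  for (int d = dst_size - 1; d >= 0 && i >= 0; --d, --i) {
    if (in[i] != dst[d] && in[i] != 1) {
      return 0;
    }
  }
  return 1;
}

int main(void) {
  int in[2] = {1, 4}, dst[3] = {2, 3, 4};
  printf("%d\n", CanBroadcast(in, 2, dst, 3)); /* 1 */
  return 0;
}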
View File

@ -18,6 +18,7 @@
#include <string.h>
#include "nnacl/infer/infer_register.h"
#ifdef ENABLE_CONTROL_TENSORLIST
int MallocTensorListData(TensorListC *tensor_list, TypeIdC dtype, const vvector *tensor_shape) {
// This function will create a new tensors_
// You must set the shape (param2: tensor_shape) and data_type_ (tensors_data_type_ = param1: dtype) of each tensor in
@ -35,7 +36,7 @@ int MallocTensorListData(TensorListC *tensor_list, TypeIdC dtype, const vvector
return NNACL_NULL_PTR;
}
memset(tensor_list->tensors_, 0, tensor_list->element_num_ * sizeof(TensorC));
for (int i = 0; i < tensor_list->element_num_; ++i) {
for (size_t i = 0; i < tensor_list->element_num_; ++i) {
tensor_list->tensors_[i].format_ = Format_NHWC;
tensor_list->tensors_[i].data_type_ = dtype;
ShapeSet(tensor_list->tensors_[i].shape_, &(tensor_list->tensors_[i].shape_size_), tensor_shape->shape_[i],
@ -69,6 +70,7 @@ bool TensorListIsFullyDefined(const int *shape, size_t shape_size) {
}
return true;
}
#endif
int CheckAugmentNull(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs, size_t outputs_size,
const OpParameter *parameter) {
@ -157,7 +159,7 @@ void SetShapeTensor(TensorC *dst, const TensorC *src) {
}
void SetShapeArray(TensorC *dst, const int *src, size_t src_size) {
for (size_t i = 0; i < src_size; i++) {
for (size_t i = 0; i < src_size && i < MAX_SHAPE_SIZE; i++) {
dst->shape_[i] = src[i];
}
dst->shape_size_ = src_size;
@ -286,13 +288,17 @@ int GetDimensionSize(const TensorC *tensor, const size_t index) {
}
void ShapeSet(int *dst_shape, size_t *dst_shape_size, const int *src_shape, size_t src_shape_size) {
for (size_t i = 0; i < src_shape_size; i++) {
size_t i = 0;
for (; i < src_shape_size && i < MAX_SHAPE_SIZE; i++) {
dst_shape[i] = src_shape[i];
}
*dst_shape_size = src_shape_size;
*dst_shape_size = i;
}
void ShapePush(int *shape, size_t *shape_size, int value) {
if (*shape_size >= MAX_SHAPE_SIZE) {
return;
}
shape[*shape_size] = value;
*shape_size = *shape_size + 1;
}
@ -301,6 +307,9 @@ int ShapeInsert(int *shape, size_t *shape_size, int index, int value) {
if (index < 0 || index > *shape_size) {
return NNACL_ERR;
}
if (*shape_size >= MAX_SHAPE_SIZE) {
return NNACL_ERR;
}
for (int i = *shape_size; i > index; i--) {
shape[i] = shape[i - 1];
}
@ -325,7 +334,7 @@ bool ShapeEqual(const int *shape0, size_t shape0_size, const int *shape1, size_t
if (shape0_size != shape1_size) {
return false;
}
for (int i = 0; i < shape0_size; i++) {
for (size_t i = 0; i < shape0_size; i++) {
if (shape0[i] != shape1[i]) {
return false;
}
@ -401,96 +410,6 @@ int FftInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **ou
return NNACL_OK;
}
int VectorCInit(VectorC *vc, size_t per_malloc_size) {
if (per_malloc_size == 0) {
return NNACL_ERR;
}
vc->data_ = (int *)malloc(per_malloc_size * sizeof(int));
if (vc->data_ == NULL) {
return NNACL_ERR;
}
vc->size_ = 0;
vc->max_size_ = per_malloc_size;
vc->per_malloc_size_ = per_malloc_size;
return NNACL_OK;
}
int VectorCSet(VectorC *vc, const int *src_shape, size_t src_shape_size) {
if (src_shape_size == 0) {
vc->size_ = 0;
} else {
free(vc->data_);
if (vc->per_malloc_size_ == 0) {
return NNACL_ERR;
}
vc->max_size_ = (src_shape_size / vc->per_malloc_size_ + 1) * vc->per_malloc_size_;
vc->data_ = (int *)malloc(sizeof(int) * vc->max_size_);
if (vc->data_ == NULL) {
return NNACL_ERR;
}
for (size_t i = 0; i < src_shape_size; i++) {
vc->data_[i] = src_shape[i];
}
vc->size_ = src_shape_size;
}
return NNACL_OK;
}
int VectorCPush(VectorC *vc, int value) {
if (vc->size_ + 1 > vc->max_size_) {
int *tmp = (int *)malloc(vc->per_malloc_size_ * sizeof(int) + vc->max_size_ * sizeof(int));
if (tmp == NULL) {
return NNACL_ERR;
}
memcpy(tmp, vc->data_, vc->size_ * sizeof(int));
free(vc->data_);
vc->data_ = tmp;
vc->max_size_ = vc->max_size_ + vc->per_malloc_size_;
}
vc->data_[vc->size_] = value;
vc->size_++;
return NNACL_OK;
}
int VectorCInsert(VectorC *vc, int index, int value) {
if (vc->size_ + 1 > vc->max_size_) {
int *tmp = (int *)malloc(vc->per_malloc_size_ * sizeof(int) + vc->max_size_ * sizeof(int));
if (tmp == NULL) {
return NNACL_ERR;
}
memcpy(tmp, vc->data_, vc->size_ * sizeof(int));
free(vc->data_);
vc->data_ = tmp;
vc->max_size_ = vc->max_size_ + vc->per_malloc_size_;
}
memmove(vc->data_ + index + 1, vc->data_ + index, (vc->size_ - index) * sizeof(int));
vc->data_[index] = value;
vc->size_++;
return NNACL_OK;
}
void VectorCErase(VectorC *vc, int index) {
memmove(vc->data_ + index, vc->data_ + index + 1, (vc->size_ - index - 1) * sizeof(int));
vc->size_--;
}
bool VectorCEqual(const VectorC *vc1, const VectorC *vc2) {
if (vc1->size_ != vc2->size_) {
return false;
}
for (size_t i = 0; i < vc1->size_; i++) {
if (vc1->data_[i] != vc2->data_[i]) {
return false;
}
}
return true;
}
void VectorCFree(VectorC *vc) {
free(vc->data_);
vc->data_ = NULL;
}
bool InferFlag(const TensorC *const *inputs, size_t inputs_size) {
if (inputs == NULL) {
return false;
@ -499,18 +418,22 @@ bool InferFlag(const TensorC *const *inputs, size_t inputs_size) {
if (inputs[i] == NULL) {
return false;
}
#ifdef ENABLE_CONTROL_TENSORLIST
if (inputs[i]->data_type_ == kObjectTypeTensorType) {
TensorListC *input_tensor_list = (TensorListC *)inputs[i];
if (input_tensor_list->shape_value_ == -1) {
return false;
}
} else {
#endif
for (size_t j = 0; j < inputs[i]->shape_size_; ++j) {
if (inputs[i]->shape_[j] == -1) {
return false;
}
}
#ifdef ENABLE_CONTROL_TENSORLIST
}
#endif
}
return true;
}

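Editor's note: several of the common_infer.c changes above add MAX_SHAPE_SIZE guards so the shape helpers fail closed instead of writing past the fixed-size array. The pattern, reduced to a compilable sketch:

#include <stdio.h>

#define MAX_SHAPE_SIZE 8

/* Pushes beyond MAX_SHAPE_SIZE are silently dropped, matching the early
 * return the diff adds to ShapePush. */
static void ShapePush(int *shape, size_t *shape_size, int value) {
  if (*shape_size >= MAX_SHAPE_SIZE) {
    return;
  }
  shape[*shape_size] = value;
  *shape_size += 1;
}

int main(void) {
  int shape[MAX_SHAPE_SIZE];
  size_t size = 0;
  for (int i = 0; i < 10; ++i) ShapePush(shape, &size, i); /* last two dropped */
  printf("size = %zu\n", size); /* 8 */
  return 0;
}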
View File

@ -138,6 +138,7 @@ typedef struct vvector {
size_t size_; // number of shapes
} vvector;
#ifdef ENABLE_CONTROL_TENSORLIST
typedef struct TensorListC {
bool is_ready_;
int data_type_;
@ -150,6 +151,7 @@ typedef struct TensorListC {
size_t element_shape_size_;
TensorC *tensors_;
} TensorListC;
#endif
typedef struct VectorC {
int *data_;
@ -158,9 +160,11 @@ typedef struct VectorC {
size_t per_malloc_size_;
} VectorC;
#ifdef ENABLE_CONTROL_TENSORLIST
int MallocTensorListData(TensorListC *tensor_list, TypeIdC dtype, const vvector *tensor_shape);
int TensorListMergeShape(int *element_shape, size_t *element_shape_size, const int *tmp, size_t tmp_size);
bool TensorListIsFullyDefined(const int *shape, size_t shape_size);
#endif
int GetBatch(const TensorC *tensor);
int GetHeight(const TensorC *tensor);
@ -202,13 +206,6 @@ int CommonInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC *
int FftInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs, size_t outputs_size,
const OpParameter *parameter);
int VectorCInit(VectorC *vc, size_t per_malloc_size);
int VectorCSet(VectorC *vc, const int *src_shape, size_t src_shape_size);
int VectorCPush(VectorC *vc, int value);
int VectorCInsert(VectorC *vc, int index, int value);
void VectorCErase(VectorC *vc, int index);
bool VectorCEqual(const VectorC *vc1, const VectorC *vc2);
void VectorCFree(VectorC *vc);
bool InferFlag(const TensorC *const *inputs, size_t inputs_size);
#ifdef __cplusplus

View File

@ -54,8 +54,13 @@ int ConcatInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC *
}
int output_axis_dim = input0_shape[axis];
for (size_t i = 1; i < inputs_size; ++i) {
if (inputs[i]->shape_size_ != input0_shape_size) {
return NNACL_PARAM_INVALID;
size_t input_i_shape_size = inputs[i]->shape_size_;
if (input_i_shape_size != input0_shape_size) {
if (input_i_shape_size != 0) {
return NNACL_PARAM_INVALID;
} else {
continue;
}
}
int shape_tmp[MAX_SHAPE_SIZE] = {0};
size_t shape_tmp_size = 0;

View File

@ -37,7 +37,7 @@ int ConstantOfShapeInferShape(const TensorC *const *inputs, size_t inputs_size,
return NNACL_ERR;
}
int out_shape[MAX_SHAPE_SIZE];
size_t out_shape_size = size;
int out_shape_size = size;
switch (in_tensor->data_type_) {
case kNumberTypeInt32: {
int32_t *in_data = (int32_t *)(in_tensor->data_);

View File

@ -34,7 +34,10 @@ int Conv2dGradFilterInferShape(const TensorC *const *inputs, size_t inputs_size,
if (inputs[2]->shape_size_ < 1 || inputs[2]->data_ == NULL) {
return NNACL_ERR;
}
size_t filter_shape_size = inputs[2]->shape_[0];
if (inputs[2]->shape_[0] < 0) {
return NNACL_ERR;
}
size_t filter_shape_size = (size_t)(inputs[2]->shape_[0]);
if (filter_shape_size != 4) {
return NNACL_ERR;
}

View File

@ -40,16 +40,16 @@ int Conv2dGradInputInferShape(const TensorC *const *inputs, size_t inputs_size,
if (inputs[2]->shape_size_ < 1 || inputs[2]->data_ == NULL) {
return NNACL_ERR;
}
size_t shape_size = inputs[2]->shape_[0];
if (shape_size != 4) {
size_t data_size = (size_t)inputs[2]->shape_[0];
if (data_size != 4) {
return NNACL_ERR;
}
int shape[MAX_SHAPE_SIZE];
const int nchw2nhwc[4] = {0, 2, 3, 1};
for (int i = 0; i < shape_size; i++) {
for (size_t i = 0; i < data_size; i++) {
shape[i] = *((int *)(inputs[2]->data_) + nchw2nhwc[i]);
}
SetShapeArray(out, shape, shape_size);
SetShapeArray(out, shape, data_size);
return NNACL_OK;
}

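Editor's note: finally, the Conv2dGradInput hunk reads a four-element NCHW shape from the weights tensor and permutes it into NHWC with the index table {0, 2, 3, 1}. The permutation in isolation:

#include <stdio.h>

int main(void) {
  const int nchw[4] = {8, 3, 224, 224}; /* example N, C, H, W */
  const int nchw2nhwc[4] = {0, 2, 3, 1};
  int nhwc[4];
  for (int i = 0; i < 4; ++i) {
    nhwc[i] = nchw[nchw2nhwc[i]];
  }
  printf("%d %d %d %d\n", nhwc[0], nhwc[1], nhwc[2], nhwc[3]); /* 8 224 224 3 */
  return 0;
}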
Some files were not shown because too many files have changed in this diff