syn code 0706

This commit is contained in:
changzherui 2020-07-06 15:28:10 +08:00
commit 17da929b82
1120 changed files with 38701 additions and 55861 deletions

1
.gitignore vendored
View File

@ -26,6 +26,7 @@ cmake-build-debug
*_pb2.py
*.pb.h
*.pb.cc
*.pb
# Object files
*.o

View File

@ -12,7 +12,7 @@ if (NOT CMAKE_SYSTEM_NAME MATCHES "Windows")
endif ()
if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
set(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -O2 -Werror -Wno-return-std-move -Wno-unused-private-field -Wno-unused-lambda-capture -Wno-sign-compare -Wno-overloaded-virtual -Wno-unneeded-internal-declaration -Wno-unused-variable -Wno-pessimizing-move -Wno-inconsistent-missing-override -DHALF_ENABLE_CPP11_USER_LITERALS=0 -D_FORTIFY_SOURCE=2")
set(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -O2 -Werror -Wno-return-std-move -Wno-unused-private-field -Wno-unused-lambda-capture -Wno-sign-compare -Wno-overloaded-virtual -Wno-unneeded-internal-declaration -Wno-unused-variable -Wno-pessimizing-move -Wno-inconsistent-missing-override -DHALF_ENABLE_CPP11_USER_LITERALS=0 -D_FORTIFY_SOURCE=2")
else()
set(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -O2 -Wl,--allow-shlib-undefined -DHALF_ENABLE_CPP11_USER_LITERALS=0 -D_FORTIFY_SOURCE=2")
endif()
@ -38,6 +38,10 @@ if (NOT Patch_FOUND)
endif ()
message(PATCH_EXECUTABLE = ${Patch_EXECUTABLE})
if (ENABLE_AKG AND ENABLE_D)
add_subdirectory("${CMAKE_SOURCE_DIR}/akg")
endif()
include(${CMAKE_SOURCE_DIR}/cmake/mind_expression.cmake)
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/third_party/flatbuffers/include)
@ -86,10 +90,6 @@ if (ENABLE_GE OR ENABLE_D OR ENABLE_TESTCASES)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/graphengine/third_party/fwkacllib/inc/toolchain)
endif()
if (ENABLE_AKG AND ENABLE_D)
add_subdirectory("${CMAKE_SOURCE_DIR}/akg")
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden")
add_subdirectory(mindspore/ccsrc)
if (ENABLE_TESTCASES)

View File

@ -29,7 +29,7 @@ enrichment of the AI software/hardware application ecosystem.
<img src="docs/MindSpore-architecture.png" alt="MindSpore Architecture" width="600"/>
For more details please check out our [Architecture Guide](https://www.mindspore.cn/docs/en/0.3.0-alpha/architecture.html).
For more details please check out our [Architecture Guide](https://www.mindspore.cn/docs/en/master/architecture.html).
### Automatic Differentiation
@ -66,7 +66,6 @@ MindSpore offers build options across multiple backends:
| Ascend910 | Ubuntu-x86 | ✔️ |
| | EulerOS-x86 | ✔️ |
| | EulerOS-aarch64 | ✔️ |
| GPU CUDA 9.2 | Ubuntu-x86 | ✔️ |
| GPU CUDA 10.1 | Ubuntu-x86 | ✔️ |
| CPU | Ubuntu-x86 | ✔️ |
| | Windows-x86 | ✔️ |
@ -76,7 +75,7 @@ For installation using `pip`, take `CPU` and `Ubuntu-x86` build version as an ex
1. Download whl from [MindSpore download page](https://www.mindspore.cn/versions/en), and install the package.
```
pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/0.3.0-alpha/MindSpore/cpu/ubuntu_x86/mindspore-0.3.0-cp37-cp37m-linux_x86_64.whl
pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/0.5.0-beta/MindSpore/cpu/ubuntu_x86/mindspore-0.5.0-cp37-cp37m-linux_x86_64.whl
```
2. Run the following command to verify the install.
@ -133,8 +132,8 @@ currently the containerized build options are supported as follows:
For `CPU` backend, you can directly pull and run the latest stable image using the below command:
```
docker pull mindspore/mindspore-cpu:0.3.0-alpha
docker run -it mindspore/mindspore-cpu:0.3.0-alpha /bin/bash
docker pull mindspore/mindspore-cpu:0.5.0-beta
docker run -it mindspore/mindspore-cpu:0.5.0-beta /bin/bash
```
* GPU
@ -151,8 +150,8 @@ currently the containerized build options are supported as follows:
Then you can pull and run the latest stable image using the below command:
```
docker pull mindspore/mindspore-gpu:0.3.0-alpha
docker run -it --runtime=nvidia --privileged=true mindspore/mindspore-gpu:0.3.0-alpha /bin/bash
docker pull mindspore/mindspore-gpu:0.5.0-beta
docker run -it --runtime=nvidia --privileged=true mindspore/mindspore-gpu:0.5.0-beta /bin/bash
```
To test if the docker image works, please execute the python code below and check the output:
@ -187,7 +186,7 @@ please check out [docker](docker/README.md) repo for the details.
## Quickstart
See the [Quick Start](https://www.mindspore.cn/tutorial/en/0.3.0-alpha/quick_start/quick_start.html)
See the [Quick Start](https://www.mindspore.cn/tutorial/en/master/quick_start/quick_start.html)
to implement the image classification.
## Docs

View File

@ -1,3 +1,75 @@
# Release 0.5.0-beta
## Major Features and Improvements
### Ascend 910 Training and Inference Framework
* New models
* ResNext50: a simple, highly modularized network architecture using aggregated residual transformations for image classification on ImageNet 2012 dataset.
* MASS: a pre-training method for sequence to sequence based language generation tasks on Text Summarization and Conversational Response Generation using News Crawls 2007-2017 dataset, Gigaword corpus and Cornell movie dialog corpus.
* Transformer: a neural network architecture for language understanding on WMT 2014 English-German dataset.
* GCN: Graph Convolutional Networks for the task of classification of nodes in a graph on Cora and Citeseer datasets.
* GAT: an attention-based graph neural network for node classification on Cora and CiteSeer datasets.
* Frontend and user interface
* Support tensor value and assignment of mixed tensor index in graph mode.
* Support tensor comparison, len operator, constexpr syntax, value and assignment of tensor index in pynative mode.
* Support converting MindSpore IR to pb format for infer model.
* Support print operator to write data directly on the hard disk.
* Add the double recursive programming solution for very high speed parallel strategy search in automatic parallel.
* User interfaces change log
* Allow the learning rate of AdamWeightDecayDynamicLR and Lamb to be 0([!1826](https://gitee.com/mindspore/mindspore/pulls/1826))
* Restrict the input parameters of the entire network to Tensor([!1967](https://gitee.com/mindspore/mindspore/pulls/1967))
* Turn shape and dtype into attributes instead of interfaces([!1919](https://gitee.com/mindspore/mindspore/pulls/1919))
* Delete multitypefungraph([!2116](https://gitee.com/mindspore/mindspore/pulls/2116))
* Refactor the callback module in an encapsulated way, use _CallbackManager instead of _build_callbacks([!2236](https://gitee.com/mindspore/mindspore/pulls/2236))
* Delete EmbeddingLookup([!2163](https://gitee.com/mindspore/mindspore/pulls/2163))
* Checkpoint add model_type([!2517](https://gitee.com/mindspore/mindspore/pulls/2517))
* Executor and performance optimization
* Heterogeneous execution on CPU and Ascend devices is supported and has been verified with the Wide&Deep model.
* Quantitative training of MobileNetV2, Lenet and Resnet50 on Ascend-910 is supported.
* Support new fusion architecture, which can do fusion optimization across graphs and kernels to improve execution speed.
* Data processing, augmentation, and save format
* Support data processing pipeline performance profiling.
* Support public dataset loading, such as CLUE and Coco.
* Support more text processing, such as more tokenizers and vocab data.
* Support MindRecord padded data.
### Other Hardware Support
* GPU platform
* New model supported: Bert / Wide&Deep.
* Support setting max device memory.
* CPU platform
* New model supported: LSTM.
## Bugfixes
* Models
* Bert, Move Bert from `example` to `model_zoo`, optimize network for better performance. ([!1902](https://gitee.com/mindspore/mindspore/pulls/1902))
* VGG16, Move VGG16 from `example` to `model_zoo`, optimize network for better accuracy. ([!2645](https://gitee.com/mindspore/mindspore/pulls/2645))
* Alexnet, modify parameter setting to improve accuracy ([!1364](https://gitee.com/mindspore/mindspore/pulls/2370))
* Wide&Deep, Move Wide&Deep from `example` to `model_zoo`, optimize network for better performance. ([!2221](https://gitee.com/mindspore/mindspore/pulls/2221))
* Python API
* Fix bug in auto cast([!1766](https://gitee.com/mindspore/mindspore/pulls/1766))
* Fix bug of register_backward_hook([!2148](https://gitee.com/mindspore/mindspore/pulls/2148))
* Fix bug of tuple args in pynative mode([!1878](https://gitee.com/mindspore/mindspore/pulls/1878))
* Fix bug of checking numbers of arguments and graph parameters([!1701](https://gitee.com/mindspore/mindspore/pulls/1701))
* Executor
* Fix bug of loading input data repeatedly in pynative mode([!1966](https://gitee.com/mindspore/mindspore/pulls/1966))
* Fix bug of list cannot be used as input in pynative mode([!1765](https://gitee.com/mindspore/mindspore/pulls/1765))
* Fix bug of kernel select ([!2103](https://gitee.com/mindspore/mindspore/pulls/2103))
* Fix bug of pattern matching for batchnorm fusion in the case of auto mix precision.([!1851](https://gitee.com/mindspore/mindspore/pulls/1851))
* Fix bug of generating hccl's kernel info.([!2393](https://gitee.com/mindspore/mindspore/pulls/2393))
* GPU platform
* Fix bug of summary feature invalid([!2173](https://gitee.com/mindspore/mindspore/pulls/2173))
* Data processing
* Fix bug of Cifar dataset reading([!2096](https://gitee.com/mindspore/mindspore/pulls/2096))
* Fix bug of C++ behavior in RandomCropAndResize([!2026](https://gitee.com/mindspore/mindspore/pulls/2026))
* Fix the bug of mindrecord shuffle([!2420](https://gitee.com/mindspore/mindspore/pulls/2420))
## Contributors
Thanks go to these wonderful people:
Alexey Shevlyakov, avakh, baihuawei, BowenK, buxue, caifubi, caojian05, Cathy Wong, changzherui, chenfei, chengxianbin, chenhaozhe, chenjianping, chentingting, chenzomi, chujinjin, Danish Farid, dayschan, dengwentao, dinghao, etone-chan, fangzehua, fary86, geekun, Giancarlo Colmenares, gong chen, gukecai, guohongzilong, hangangqiang, heleiwang, hesham, He Wei, hexia, hongxing, huangdongrun, huanghui, islam_amin, Jamie Nisbet, Jesse Lee, jiangjinsheng, jiangzhiwen, jinyaohui, jjfeing, jojobugfree, Jonathan Yan, jonyguo, Junhan Hu, Kang, kingfo, kouzhenzhong, kpy, kswang, laiyongqiang, leopz, liangzelang, lichenever, lihongkang, Li Hongzhang, lilei, limingqi107, lirongzhen1, liubuyu, liuchongming74, liuwenhao4, liuxiao, Lixia Chen, liyanliu, liyong, lizhenyu, lvliang, Mahdi, Margaret_wangrui, meixiaowei, ms_yan, nhussain, ougongchang, panfengfeng, panyifeng, peilinwang, Peilin Wang, pkuliuliu, qianlong, rick_sanchez, shibeiji, Shida He, shijianning, simson, sunsuodong, suteng, Tinazhang, Tron Zhang, unknown, VectorSL, wandongdong, wangcong, wangdongxu, wangdongxu6, wanghua, wangnan39, Wei Luning, wenchunjiang, wenkai, wilfChen, WilliamLian, wukesong, Xian Weizhao, Xiaoda Zhang, xiefangqi, xulei2020, xunxue, xutianchun, Yang, yanghaitao, yanghaitao1, yanghaoran, yangjie, yangjie159, YangLuo, Yanjun Peng, yankai, yanzhenxiang2020, yao_yf, Yi Huaijie, yoonlee666, yuchaojie, yujianfeng, zhangzhongpeng, zhangdengcheng, Zhang Qinghua, zhangyinxia, zhangz0911gm, zhaojichen, zhaoting, zhaozhenlong, zhoufeng, zhouneng, zhousiyi, Zirui Wu, Ziyan, zjun, ZPaC, lihongzhang, wangdongxu
Contributions of any kind are welcome!
# Release 0.3.0-alpha
## Major Features and Improvements

2
akg

@ -1 +1 @@
Subproject commit c460176523d039c8995f1d71089753725ebc0792
Subproject commit df57a6cf9450e347d1854687d1fe66a420ee3b35

View File

@ -25,7 +25,7 @@ usage()
echo "Usage:"
echo "bash build.sh [-d] [-r] [-v] [-c on|off] [-t on|off] [-g on|off] [-h] [-b ge] [-m infer|train] \\"
echo " [-a on|off] [-Q on|off] [-p on|off] [-i] [-L] [-R] [-D on|off] [-j[n]] [-e gpu|d|cpu] \\"
echo " [-P on|off] [-z [on|off]] [-M on|off] [-V 9.2|10.1] [-I] [-K]"
echo " [-P on|off] [-z [on|off]] [-M on|off] [-V 9.2|10.1] [-I] [-K] [-B on|off] [-E]"
echo ""
echo "Options:"
echo " -d Debug mode"
@ -50,10 +50,12 @@ usage()
echo " -D Enable dumping of function graph ir, default on"
echo " -z Compile dataset & mindrecord, default on"
echo " -M Enable MPI and NCCL for GPU training, gpu default on"
echo " -V Specify the minimum required cuda version, default CUDA 9.2"
echo " -V Specify the minimum required cuda version, default CUDA 10.1"
echo " -I Compile predict, default off"
echo " -K Compile with AKG, default off"
echo " -K Compile with AKG, default on"
echo " -s Enable serving module, default off"
echo " -B Enable debugger, default off"
echo " -E Enable IBVERBS for parameter server, default off"
}
# check value of input is 'on' or 'off'
@ -88,14 +90,17 @@ checkopts()
ENABLE_DUMP_IR="on"
COMPILE_MINDDATA="on"
ENABLE_MPI="off"
CUDA_VERSION="9.2"
CUDA_VERSION="10.1"
COMPILE_PREDICT="off"
USE_GLOG="on"
PREDICT_PLATFORM=""
ENABLE_AKG="on"
ENABLE_SERVING="off"
ENABLE_DEBUGGER="off"
ENABLE_IBVERBS="off"
# Process the options
while getopts 'drvj:c:t:hsb:a:g:p:ie:m:I:LRP:Q:D:zM:V:K:s' opt
while getopts 'drvj:c:t:hsb:a:g:p:ie:m:I:LRP:Q:D:zM:V:K:sB:E' opt
do
OPTARG=$(echo ${OPTARG} | tr '[A-Z]' '[a-z]')
case "${opt}" in
@ -191,6 +196,10 @@ checkopts()
usage
exit 1
fi
if [[ "X$OPTARG" == "X9.2" ]]; then
echo "Unsupported CUDA version 9.2"
exit 1
fi
CUDA_VERSION="$OPTARG"
;;
P)
@ -240,6 +249,15 @@ checkopts()
ENABLE_SERVING="on"
echo "enable serving"
;;
B)
    # -B takes an on|off argument (getopts spec 'B:'). Honour the validated
    # value instead of unconditionally enabling the debugger, so that
    # "-B off" really leaves ENABLE_DEBUGGER off.
    check_on_off $OPTARG B
    ENABLE_DEBUGGER="$OPTARG"
    if [[ "X$ENABLE_DEBUGGER" == "Xon" ]]; then
        echo "enable debugger"
    fi
    ;;
E)
ENABLE_IBVERBS="on"
echo "enable IBVERBS for parameter server"
;;
*)
echo "Unknown option ${opt}!"
usage
@ -322,7 +340,13 @@ build_mindspore()
if [[ "X$ENABLE_SERVING" = "Xon" ]]; then
CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_SERVING=ON"
fi
if [[ "X$ENABLE_DEBUGGER" = "Xon" ]]; then
CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_DEBUGGER=ON"
fi
if [[ "X$ENABLE_IBVERBS" = "Xon" ]]; then
CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_IBVERBS=ON"
fi
echo "${CMAKE_ARGS}"
if [[ "X$INC_BUILD" = "Xoff" ]]; then
cmake ${CMAKE_ARGS} ../..
@ -446,9 +470,9 @@ build_predict()
cd "${BASEPATH}/predict/output/"
if [[ "$PREDICT_PLATFORM" == "x86_64" ]]; then
tar -cf MSPredict-0.3.0-linux_x86_64.tar.gz include/ lib/ --warning=no-file-changed
tar -cf MSPredict-0.5.0-linux_x86_64.tar.gz include/ lib/ --warning=no-file-changed
elif [[ "$PREDICT_PLATFORM" == "arm64" ]]; then
tar -cf MSPredict-0.3.0-linux_aarch64.tar.gz include/ lib/ --warning=no-file-changed
tar -cf MSPredict-0.5.0-linux_aarch64.tar.gz include/ lib/ --warning=no-file-changed
fi
echo "success to build predict project!"
}

View File

@ -0,0 +1,14 @@
# Abseil (release 20200225.2), built in Release mode as position-independent
# code. Only the components listed under LIBS are imported.
mindspore_add_pkg(absl
        VER 20200225.2
        LIBS absl_strings absl_throw_delegate absl_raw_logging_internal absl_int128 absl_bad_optional_access
        URL https://github.com/abseil/abseil-cpp/archive/20200225.2.tar.gz
        MD5 73f2b6e72f1599a9139170c29482ddc4
        CMAKE_OPTION -DCMAKE_BUILD_TYPE:STRING=Release -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=TRUE)

include_directories(${absl_INC})

# Publish every imported absl::<lib> target under the mindspore:: namespace as well.
foreach(_absl_component
        absl_strings
        absl_throw_delegate
        absl_raw_logging_internal
        absl_int128
        absl_bad_optional_access)
    add_library(mindspore::${_absl_component} ALIAS absl::${_absl_component})
endforeach()

View File

@ -0,0 +1,12 @@
# c-ares 1.15.0, configured as a Release build of the static library only
# (shared library off, static library on, static library built with PIC).
# The imported c-ares::cares target is re-exported as mindspore::cares.
mindspore_add_pkg(c-ares
VER 1.15.0
LIBS cares
URL https://github.com/c-ares/c-ares/releases/download/cares-1_15_0/c-ares-1.15.0.tar.gz
MD5 d2391da274653f7643270623e822dff7
CMAKE_OPTION -DCMAKE_BUILD_TYPE:STRING=Release
-DCARES_SHARED:BOOL=OFF
-DCARES_STATIC:BOOL=ON
-DCARES_STATIC_PIC:BOOL=ON)
include_directories(${c-ares_INC})
add_library(mindspore::cares ALIAS c-ares::cares)

View File

@ -0,0 +1,110 @@
# gRPC 1.27.3 package definition.
# Relies on protobuf_ROOT, absl_ROOT, c-ares_ROOT and zlib_ROOT as well as the
# mindspore::z / mindspore::cares / mindspore::absl_* targets already being
# defined when this file is processed.
set(grpc_USE_STATIC_LIBS ON)
# Per-platform compile flags: Darwin silences -Wuninitialized, the other
# platforms silence -Wmaybe-uninitialized; only the default (neither Darwin nor
# Windows) branch additionally sets -D_GLIBCXX_USE_CXX11_ABI=0.
if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
set(grpc_CXXFLAGS "-fstack-protector-all -Wno-uninitialized -Wno-unused-parameter -fPIC -fvisibility=hidden -D_FORTIFY_SOURCE=2 -O2")
elseif (${CMAKE_SYSTEM_NAME} MATCHES "Windows")
set(grpc_CXXFLAGS "-fstack-protector-all -Wno-maybe-uninitialized -Wno-unused-parameter -fPIC -fvisibility=hidden -D_FORTIFY_SOURCE=2 -O2")
else()
set(grpc_CXXFLAGS "-fstack-protector-all -Wno-maybe-uninitialized -Wno-unused-parameter -fPIC -fvisibility=hidden -D_FORTIFY_SOURCE=2 -D_GLIBCXX_USE_CXX11_ABI=0 -O2")
endif()
set(grpc_LDFLAGS "-Wl,-z,relro,-z,now,-z,noexecstack")
# Locate the protobuf CMake config directory: use lib64 when that directory
# exists under protobuf_ROOT, otherwise fall back to lib.
if (EXISTS ${protobuf_ROOT}/lib64)
set(_FINDPACKAGE_PROTOBUF_CONFIG_DIR "${protobuf_ROOT}/lib64/cmake/protobuf")
else()
set(_FINDPACKAGE_PROTOBUF_CONFIG_DIR "${protobuf_ROOT}/lib/cmake/protobuf")
endif()
message("grpc using Protobuf_DIR : " ${_FINDPACKAGE_PROTOBUF_CONFIG_DIR})
# Same lib64-or-lib lookup for the absl CMake config directory.
if (EXISTS ${absl_ROOT}/lib64)
set(_FINDPACKAGE_ABSL_CONFIG_DIR "${absl_ROOT}/lib64/cmake/absl")
else()
set(_FINDPACKAGE_ABSL_CONFIG_DIR "${absl_ROOT}/lib/cmake/absl")
endif()
message("grpc using absl_DIR : " ${_FINDPACKAGE_ABSL_CONFIG_DIR})
# Forward OPENSSL_ROOT_DIR to the gRPC build only when the caller has set it;
# otherwise this expands to an empty argument.
set(_CMAKE_ARGS_OPENSSL_ROOT_DIR "")
if (OPENSSL_ROOT_DIR)
set(_CMAKE_ARGS_OPENSSL_ROOT_DIR "-DOPENSSL_ROOT_DIR:PATH=${OPENSSL_ROOT_DIR}")
endif()
# Build gRPC with every dependency provider set to "package" (protobuf, zlib,
# absl, c-ares, ssl) and the corresponding directories passed in explicitly.
# NOTE(review): the c-ares config path is hard-coded to lib/ and has no lib64
# fallback like protobuf/absl above - confirm this is intended.
mindspore_add_pkg(grpc
VER 1.27.3
LIBS grpc++ grpc gpr upb address_sorting
EXE grpc_cpp_plugin
URL https://github.com/grpc/grpc/archive/v1.27.3.tar.gz
MD5 0c6c3fc8682d4262dd0e5e6fabe1a7e2
CMAKE_OPTION -DCMAKE_BUILD_TYPE:STRING=Release
-DgRPC_INSTALL:BOOL=ON
-DgRPC_BUILD_TESTS:BOOL=OFF
-DgRPC_PROTOBUF_PROVIDER:STRING=package
-DgRPC_PROTOBUF_PACKAGE_TYPE:STRING=CONFIG
-DProtobuf_DIR:PATH=${_FINDPACKAGE_PROTOBUF_CONFIG_DIR}
-DgRPC_ZLIB_PROVIDER:STRING=package
-DZLIB_ROOT:PATH=${zlib_ROOT}
-DgRPC_ABSL_PROVIDER:STRING=package
-Dabsl_DIR:PATH=${_FINDPACKAGE_ABSL_CONFIG_DIR}
-DgRPC_CARES_PROVIDER:STRING=package
-Dc-ares_DIR:PATH=${c-ares_ROOT}/lib/cmake/c-ares
-DgRPC_SSL_PROVIDER:STRING=package
${_CMAKE_ARGS_OPENSSL_ROOT_DIR}
)
include_directories(${grpc_INC})
# mindspore::grpc++ is the single target consumers link against; all other
# libraries are attached to grpc::grpc++ as INTERFACE link dependencies below.
add_library(mindspore::grpc++ ALIAS grpc::grpc++)
# link other grpc libs
target_link_libraries(grpc::grpc++ INTERFACE grpc::grpc grpc::gpr grpc::upb grpc::address_sorting)
# link built dependencies
target_link_libraries(grpc::grpc++ INTERFACE mindspore::z)
target_link_libraries(grpc::grpc++ INTERFACE mindspore::cares)
target_link_libraries(grpc::grpc++ INTERFACE mindspore::absl_strings mindspore::absl_throw_delegate
mindspore::absl_raw_logging_internal mindspore::absl_int128 mindspore::absl_bad_optional_access)
# link system openssl
find_package(OpenSSL REQUIRED)
target_link_libraries(grpc::grpc++ INTERFACE OpenSSL::SSL OpenSSL::Crypto)
# ms_grpc_generate(<c_var> <h_var> <proto> [<proto> ...])
#
# Adds one custom command per .proto file that runs protoc with the gRPC C++
# plugin and writes the generated sources into ${CMAKE_BINARY_DIR}/proto.
#
#   c_var  - name of the caller's variable that receives the generated
#            <name>.pb.cc and <name>.grpc.pb.cc paths
#   h_var  - name of the caller's variable that receives the generated
#            <name>.pb.h and <name>.grpc.pb.h paths
#   ARGN   - the .proto files to compile; calling without any is reported via
#            message(SEND_ERROR) and the function returns without setting
#            either output variable
#
# All listed outputs are marked GENERATED so they may be referenced before
# they exist on disk.
function(ms_grpc_generate c_var h_var)
    if(NOT ARGN)
        message(SEND_ERROR "Error: ms_grpc_generate() called without any proto files")
        return()
    endif()

    # Start from empty result lists regardless of what the caller's variables held.
    set(${c_var})
    set(${h_var})

    foreach(file ${ARGN})
        get_filename_component(abs_file ${file} ABSOLUTE)
        get_filename_component(file_name ${file} NAME_WE)
        # The proto's own directory is the only import path handed to protoc.
        get_filename_component(file_dir ${abs_file} PATH)

        list(APPEND ${c_var} "${CMAKE_BINARY_DIR}/proto/${file_name}.pb.cc")
        list(APPEND ${h_var} "${CMAKE_BINARY_DIR}/proto/${file_name}.pb.h")
        list(APPEND ${c_var} "${CMAKE_BINARY_DIR}/proto/${file_name}.grpc.pb.cc")
        list(APPEND ${h_var} "${CMAKE_BINARY_DIR}/proto/${file_name}.grpc.pb.h")

        add_custom_command(
                OUTPUT "${CMAKE_BINARY_DIR}/proto/${file_name}.pb.cc"
                "${CMAKE_BINARY_DIR}/proto/${file_name}.pb.h"
                "${CMAKE_BINARY_DIR}/proto/${file_name}.grpc.pb.cc"
                "${CMAKE_BINARY_DIR}/proto/${file_name}.grpc.pb.h"
                WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
                COMMAND ${CMAKE_COMMAND} -E make_directory "${CMAKE_BINARY_DIR}/proto"
                COMMAND protobuf::protoc --version
                COMMAND protobuf::protoc -I${file_dir} --cpp_out=${CMAKE_BINARY_DIR}/proto
                --grpc_out=${CMAKE_BINARY_DIR}/proto --plugin=protoc-gen-grpc=$<TARGET_FILE:grpc::grpc_cpp_plugin> ${abs_file}
                DEPENDS protobuf::protoc grpc::grpc_cpp_plugin ${abs_file}
                COMMENT "Running C++ gRPC compiler on ${file}" VERBATIM)
    endforeach()

    set_source_files_properties(${${c_var}} ${${h_var}} PROPERTIES GENERATED TRUE)
    # Hand the collected lists back to the caller's scope.
    set(${c_var} ${${c_var}} PARENT_SCOPE)
    set(${h_var} ${${h_var}} PARENT_SCOPE)
endfunction()

View File

@ -0,0 +1,14 @@
# ps-lite, pinned to a fixed upstream commit archive and patched locally.
# It is built through mindspore_add_pkg's ONLY_MAKE mode: headers are collected
# from include/* and libraries from build/* of the source tree.
set(pslite_USE_STATIC_LIBS ON)

# Evaluate ENABLE_IBVERBS as a CMake boolean. The previous
# `${ENABLE_IBVERBS} STREQUAL "ON"` form is a hard error when the variable is
# empty or undefined and silently ignores other true values (1, TRUE, on, ...).
# pslite_CXXFLAGS is forwarded verbatim on the make command line by the
# ONLY_MAKE branch, which is why it carries a make variable assignment.
if (ENABLE_IBVERBS)
    set(pslite_CXXFLAGS "USE_IBVERBS=1")
endif()

mindspore_add_pkg(pslite
        LIBS ps
        URL https://github.com/dmlc/ps-lite/archive/34fd45cae457d59850fdcb2066467778d0673f21.zip
        MD5 393c0e27b68bfaf96718caa3aa96f5a3
        PATCHES ${CMAKE_SOURCE_DIR}/third_party/patch/pslite/ps_lite.patch001
        ONLY_MAKE True
        ONLY_MAKE_INCS include/*
        ONLY_MAKE_LIBS build/*)

include_directories(${pslite_INC})
add_library(mindspore::pslite ALIAS pslite::ps)

View File

@ -0,0 +1,5 @@
# zeromq 4.1.4 source archive, registered through HEAD_ONLY with "./" as the
# header location, so no LIBS are declared here.
# NOTE(review): presumably fetched only so that the ps-lite build can find the
# zeromq sources - confirm against third_party/patch/pslite/ps_lite.patch001.
mindspore_add_pkg(zeromq
VER 4.1.4
HEAD_ONLY ./
URL https://raw.githubusercontent.com/mli/deps/master/build/zeromq-4.1.4.tar.gz
MD5 a611ecc93fffeb6d058c0e6edf4ad4fb)

View File

@ -0,0 +1,9 @@
# zlib 1.2.11, built in Release mode. The imported zlib::z target is
# re-exported as mindspore::z.
mindspore_add_pkg(zlib
VER 1.2.11
LIBS z
URL https://github.com/madler/zlib/archive/v1.2.11.tar.gz
MD5 0095d2d2d1f3442ce1318336637b695f
CMAKE_OPTION -DCMAKE_BUILD_TYPE:STRING=Release)
include_directories(${zlib_INC})
add_library(mindspore::z ALIAS zlib::z)

View File

@ -14,12 +14,26 @@ include(${CMAKE_SOURCE_DIR}/cmake/external_libs/eigen.cmake)
include(${CMAKE_SOURCE_DIR}/cmake/external_libs/json.cmake)
include(${CMAKE_SOURCE_DIR}/cmake/dependency_securec.cmake)
include(${CMAKE_SOURCE_DIR}/cmake/external_libs/protobuf.cmake)
if (ENABLE_DEBUGGER)
# build dependencies of gRPC
include(${CMAKE_SOURCE_DIR}/cmake/external_libs/absl.cmake)
include(${CMAKE_SOURCE_DIR}/cmake/external_libs/c-ares.cmake)
include(${CMAKE_SOURCE_DIR}/cmake/external_libs/zlib.cmake)
# build gRPC
include(${CMAKE_SOURCE_DIR}/cmake/external_libs/grpc.cmake)
endif()
include(${CMAKE_SOURCE_DIR}/cmake/external_libs/pybind11.cmake)
MESSAGE("go to link flatbuffers")
include(${CMAKE_SOURCE_DIR}/cmake/external_libs/flatbuffers.cmake)
if(USE_GLOG)
include(${CMAKE_SOURCE_DIR}/cmake/external_libs/glog.cmake)
endif()
if (NOT ${CMAKE_SYSTEM_NAME} MATCHES "Windows")
include(${CMAKE_SOURCE_DIR}/cmake/external_libs/zeromq.cmake)
include(${CMAKE_SOURCE_DIR}/cmake/external_libs/pslite.cmake)
endif()
find_package(Python3)
include_directories(${Python3_INCLUDE_DIRS})

View File

@ -17,6 +17,8 @@ option(ENABLE_DUMP_E2E "Enable dump e2e file, default on" OFF)
option(ENABLE_DUMP_IR "Enable dump funciton graph ir, default on" ON)
option(ENABLE_MPI "enable mpi" OFF)
option(ENABLE_AKG "enable akg" OFF)
option(ENABLE_DEBUGGER "enable debugger" OFF)
option(ENABLE_IBVERBS "enable IBVERBS for parameter server" OFF)
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
if (WIN32)
@ -112,3 +114,7 @@ endif()
if(ENABLE_DUMP_E2E)
add_compile_definitions(ENABLE_DUMP_E2E)
endif()
if(ENABLE_DEBUGGER)
add_compile_definitions(ENABLE_DEBUGGER)
endif()

View File

@ -128,6 +128,11 @@ if (ENABLE_MPI)
DESTINATION ${INSTALL_BASE_DIR}
COMPONENT mindspore
)
install(
TARGETS mpi_adapter
DESTINATION ${INSTALL_LIB_DIR}
COMPONENT mindspore
)
endif ()
if (ENABLE_GPU)

View File

@ -206,7 +206,7 @@ function(mindspore_add_pkg pkg_name )
set(options )
set(oneValueArgs URL MD5 GIT_REPOSITORY GIT_TAG VER EXE DIR HEAD_ONLY CMAKE_PATH RELEASE LIB_PATH CUSTOM_CMAKE)
set(multiValueArgs CMAKE_OPTION LIBS PRE_CONFIGURE_COMMAND CONFIGURE_COMMAND BUILD_OPTION INSTALL_INCS INSTALL_LIBS PATCHES SUBMODULES SOURCEMODULES)
set(multiValueArgs CMAKE_OPTION LIBS PRE_CONFIGURE_COMMAND CONFIGURE_COMMAND BUILD_OPTION INSTALL_INCS INSTALL_LIBS PATCHES SUBMODULES SOURCEMODULES ONLY_MAKE ONLY_MAKE_INCS ONLY_MAKE_LIBS)
cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN} )
if (NOT PKG_LIB_PATH)
@ -290,7 +290,7 @@ function(mindspore_add_pkg pkg_name )
foreach(_PATCH_FILE ${PKG_PATCHES})
get_filename_component(_PATCH_FILE_NAME ${_PATCH_FILE} NAME)
set(_LF_PATCH_FILE ${CMAKE_BINARY_DIR}/_ms_patch/${_PATCH_FILE_NAME})
configure_file(${_PATCH_FILE} ${_LF_PATCH_FILE} NEWLINE_STYLE LF)
configure_file(${_PATCH_FILE} ${_LF_PATCH_FILE} NEWLINE_STYLE LF @ONLY)
message("patching ${${pkg_name}_SOURCE_DIR} -p1 < ${_LF_PATCH_FILE}")
execute_process(COMMAND ${Patch_EXECUTABLE} -p1 INPUT_FILE ${_LF_PATCH_FILE}
@ -324,6 +324,16 @@ function(mindspore_add_pkg pkg_name )
target_include_directories(${pkg_name} INTERFACE ${${pkg_name}_INC})
endif ()
elseif (PKG_ONLY_MAKE)
__exec_cmd(COMMAND ${CMAKE_MAKE_PROGRAM} ${${pkg_name}_CXXFLAGS} -j${THNUM}
WORKING_DIRECTORY ${${pkg_name}_SOURCE_DIR})
set(PKG_INSTALL_INCS ${PKG_ONLY_MAKE_INCS})
set(PKG_INSTALL_LIBS ${PKG_ONLY_MAKE_LIBS})
file(GLOB ${pkg_name}_INSTALL_INCS ${${pkg_name}_SOURCE_DIR}/${PKG_INSTALL_INCS})
file(GLOB ${pkg_name}_INSTALL_LIBS ${${pkg_name}_SOURCE_DIR}/${PKG_INSTALL_LIBS})
file(COPY ${${pkg_name}_INSTALL_INCS} DESTINATION ${${pkg_name}_BASE_DIR}/include)
file(COPY ${${pkg_name}_INSTALL_LIBS} DESTINATION ${${pkg_name}_BASE_DIR}/lib)
elseif (PKG_CMAKE_OPTION)
# in cmake
file(MAKE_DIRECTORY ${${pkg_name}_SOURCE_DIR}/_build)

View File

@ -0,0 +1,67 @@
# MindSpore 0.5.0-beta CPU image: Ubuntu 18.04 + Python 3.7.5 built from source
# + the released MindSpore CPU wheel.
FROM ubuntu:18.04

MAINTAINER leonwanghui <leon.wanghui@huawei.com>

# Set env
ENV PYTHON_ROOT_PATH /usr/local/python-3.7.5
ENV PATH /usr/local/bin:$PATH

# Install base tools
RUN apt update \
    && DEBIAN_FRONTEND=noninteractive apt install -y \
    vim \
    wget \
    curl \
    xz-utils \
    net-tools \
    openssh-client \
    git \
    ntpdate \
    tzdata \
    tcl \
    sudo \
    bash-completion

# Install compile tools
RUN DEBIAN_FRONTEND=noninteractive apt install -y \
    gcc \
    g++ \
    zlibc \
    make \
    libgmp-dev \
    patch \
    autoconf \
    libtool \
    automake \
    flex

# Set bash (answer "no" to "use dash as /bin/sh", then re-run the dash configuration)
RUN echo "dash dash/sh boolean false" | debconf-set-selections
RUN DEBIAN_FRONTEND=noninteractive dpkg-reconfigure dash

# Install python (v3.7.5): build from source into PYTHON_ROOT_PATH, expose it as
# /usr/local/bin/python and /usr/local/bin/pip, then remove the build tree.
RUN apt install -y libffi-dev libssl-dev zlib1g-dev libbz2-dev libncurses5-dev \
    libgdbm-dev libgdbm-compat-dev liblzma-dev libreadline-dev libsqlite3-dev \
    && cd /tmp \
    && wget https://github.com/python/cpython/archive/v3.7.5.tar.gz \
    && tar -xvf v3.7.5.tar.gz \
    && cd /tmp/cpython-3.7.5 \
    && mkdir -p ${PYTHON_ROOT_PATH} \
    && ./configure --prefix=${PYTHON_ROOT_PATH} \
    && make -j4 \
    && make install -j4 \
    && rm -f /usr/local/bin/python \
    && rm -f /usr/local/bin/pip \
    && ln -s ${PYTHON_ROOT_PATH}/bin/python3.7 /usr/local/bin/python \
    && ln -s ${PYTHON_ROOT_PATH}/bin/pip3.7 /usr/local/bin/pip \
    && rm -rf /tmp/cpython-3.7.5 \
    && rm -f /tmp/v3.7.5.tar.gz

# Set pip source.
# The mirror is addressed over HTTPS so packages are not downloaded over an
# unauthenticated plain-HTTP connection; with a verified TLS endpoint the
# former "trusted-host" override is no longer needed and is dropped.
RUN mkdir -pv /root/.pip \
    && echo "[global]" > /root/.pip/pip.conf \
    && echo "index-url=https://mirrors.aliyun.com/pypi/simple/" >> /root/.pip/pip.conf

# Install MindSpore cpu whl package
RUN pip install --no-cache-dir https://ms-release.obs.cn-north-4.myhuaweicloud.com/0.5.0-beta/MindSpore/cpu/ubuntu_x86/mindspore-0.5.0-cp37-cp37m-linux_x86_64.whl

View File

@ -0,0 +1,83 @@
# MindSpore 0.5.0-beta GPU image: CUDA 10.1 / cuDNN 7 runtime on Ubuntu 18.04
# + NCCL 2.4.8 + Python 3.7.5 and OpenMPI 3.1.5 built from source
# + the released MindSpore cuda-10.1 wheel.
FROM nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04

MAINTAINER leonwanghui <leon.wanghui@huawei.com>

# Set env
ENV PYTHON_ROOT_PATH /usr/local/python-3.7.5
ENV OMPI_ROOT_PATH /usr/local/openmpi-3.1.5
ENV PATH ${OMPI_ROOT_PATH}/bin:/usr/local/bin:$PATH
ENV LD_LIBRARY_PATH ${OMPI_ROOT_PATH}/lib:$LD_LIBRARY_PATH

# Install base tools
RUN apt update \
    && DEBIAN_FRONTEND=noninteractive apt install -y \
    vim \
    wget \
    curl \
    xz-utils \
    net-tools \
    openssh-client \
    git \
    ntpdate \
    tzdata \
    tcl \
    sudo \
    bash-completion

# Install compile tools (NCCL runtime and dev packages are pinned to the cuda10.1 build)
RUN DEBIAN_FRONTEND=noninteractive apt install -y \
    gcc \
    g++ \
    zlibc \
    make \
    libgmp-dev \
    patch \
    autoconf \
    libtool \
    automake \
    flex \
    libnccl2=2.4.8-1+cuda10.1 \
    libnccl-dev=2.4.8-1+cuda10.1

# Set bash (answer "no" to "use dash as /bin/sh", then re-run the dash configuration)
RUN echo "dash dash/sh boolean false" | debconf-set-selections
RUN DEBIAN_FRONTEND=noninteractive dpkg-reconfigure dash

# Install python (v3.7.5): build from source into PYTHON_ROOT_PATH, expose it as
# /usr/local/bin/python and /usr/local/bin/pip, then remove the build tree.
RUN apt install -y libffi-dev libssl-dev zlib1g-dev libbz2-dev libncurses5-dev \
    libgdbm-dev libgdbm-compat-dev liblzma-dev libreadline-dev libsqlite3-dev \
    && cd /tmp \
    && wget https://github.com/python/cpython/archive/v3.7.5.tar.gz \
    && tar -xvf v3.7.5.tar.gz \
    && cd /tmp/cpython-3.7.5 \
    && mkdir -p ${PYTHON_ROOT_PATH} \
    && ./configure --prefix=${PYTHON_ROOT_PATH} \
    && make -j4 \
    && make install -j4 \
    && rm -f /usr/local/bin/python \
    && rm -f /usr/local/bin/pip \
    && ln -s ${PYTHON_ROOT_PATH}/bin/python3.7 /usr/local/bin/python \
    && ln -s ${PYTHON_ROOT_PATH}/bin/pip3.7 /usr/local/bin/pip \
    && rm -rf /tmp/cpython-3.7.5 \
    && rm -f /tmp/v3.7.5.tar.gz

# Set pip source.
# The mirror is addressed over HTTPS so packages are not downloaded over an
# unauthenticated plain-HTTP connection; with a verified TLS endpoint the
# former "trusted-host" override is no longer needed and is dropped.
RUN mkdir -pv /root/.pip \
    && echo "[global]" > /root/.pip/pip.conf \
    && echo "index-url=https://mirrors.aliyun.com/pypi/simple/" >> /root/.pip/pip.conf

# Install openmpi (v3.1.5): build from source into OMPI_ROOT_PATH, then remove the build tree.
RUN cd /tmp \
    && wget https://download.open-mpi.org/release/open-mpi/v3.1/openmpi-3.1.5.tar.gz \
    && tar -xvf openmpi-3.1.5.tar.gz \
    && cd /tmp/openmpi-3.1.5 \
    && mkdir -p ${OMPI_ROOT_PATH} \
    && ./configure --prefix=${OMPI_ROOT_PATH} \
    && make -j4 \
    && make install -j4 \
    && rm -rf /tmp/openmpi-3.1.5 \
    && rm -f /tmp/openmpi-3.1.5.tar.gz

# Install MindSpore cuda-10.1 whl package
RUN pip install --no-cache-dir https://ms-release.obs.cn-north-4.myhuaweicloud.com/0.5.0-beta/MindSpore/gpu/ubuntu_x86/cuda-10.1/mindspore_gpu-0.5.0-cp37-cp37m-linux_x86_64.whl

View File

@ -1,82 +0,0 @@
# Guideline to Convert Training Data CLUERNER2020 to MindRecord For Bert Fine Tuning
<!-- TOC -->
- [What does the example do](#what-does-the-example-do)
- [How to use the example to process CLUERNER2020](#how-to-use-the-example-to-process-cluerner2020)
- [Download CLUERNER2020 and unzip](#download-cluerner2020-and-unzip)
- [Generate MindRecord](#generate-mindrecord)
- [Create MindDataset By MindRecord](#create-minddataset-by-mindrecord)
<!-- /TOC -->
## What does the example do
This example is based on the [CLUERNER2020](https://www.cluebenchmarks.com/introduce.html) training data. It generates MindRecord files, which are then used for the Bert fine-tuning process.
1. run.sh: generate MindRecord entry script
2. run_read.py: create MindDataset by MindRecord entry script.
- create_dataset.py: use MindDataset to read MindRecord to generate dataset.
## How to use the example to process CLUERNER2020
Download CLUERNER2020, convert it to MindRecord, use MindDataset to read MindRecord.
### Download CLUERNER2020 and unzip
1. Download the training data zip.
> [CLUERNER2020 dataset download address](https://www.cluebenchmarks.com/introduce.html) **-> 任务介绍 -> CLUENER 细粒度命名实体识别 -> cluener下载链接**
2. Unzip the training data to dir example/nlp_to_mindrecord/CLUERNER2020/data/cluener_public.
```
unzip -d {your-mindspore}/example/nlp_to_mindrecord/CLUERNER2020/data/cluener_public cluener_public.zip
```
### Generate MindRecord
1. Run the run.sh script.
```bash
bash run.sh
```
2. Output like this:
```
...
[INFO] ME(17603:139620983514944,MainProcess):2020-04-28-16:56:12.498.235 [mindspore/mindrecord/filewriter.py:313] The list of mindrecord files created are: ['data/train.mindrecord'], and the list of index files are: ['data/train.mindrecord.db']
...
[INFO] ME(17603,python):2020-04-28-16:56:13.400.175 [mindspore/ccsrc/mindrecord/io/shard_writer.cc:667] WriteRawData] Write 1 records successfully.
[INFO] ME(17603,python):2020-04-28-16:56:13.400.863 [mindspore/ccsrc/mindrecord/io/shard_writer.cc:667] WriteRawData] Write 1 records successfully.
[INFO] ME(17603,python):2020-04-28-16:56:13.401.534 [mindspore/ccsrc/mindrecord/io/shard_writer.cc:667] WriteRawData] Write 1 records successfully.
[INFO] ME(17603,python):2020-04-28-16:56:13.402.179 [mindspore/ccsrc/mindrecord/io/shard_writer.cc:667] WriteRawData] Write 1 records successfully.
[INFO] ME(17603,python):2020-04-28-16:56:13.402.702 [mindspore/ccsrc/mindrecord/io/shard_writer.cc:667] WriteRawData] Write 1 records successfully.
...
[INFO] ME(17603:139620983514944,MainProcess):2020-04-28-16:56:13.431.208 [mindspore/mindrecord/filewriter.py:313] The list of mindrecord files created are: ['data/dev.mindrecord'], and the list of index files are: ['data/dev.mindrecord.db']
```
3. Generate files like this:
```bash
$ ls output/
dev.mindrecord dev.mindrecord.db README.md train.mindrecord train.mindrecord.db
```
### Create MindDataset By MindRecord
1. Run the run_read.sh script.
```bash
bash run_read.sh
```
2. Output like this:
```
...
example 1340: input_ids: [ 101 3173 1290 4852 7676 3949 122 3299 123 126 3189 4510 8020 6381 5442 7357 2590 3636 8021 7676 3949 4294 1166 6121 3124 1277 6121 3124 7270 2135 3295 5789 3326 123 126 3189 1355 6134 1093 1325 3173 2399 6590 6791 8024 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
example 1340: input_mask: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
example 1340: segment_ids: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
example 1340: label_ids: [ 0 18 19 20 2 4 0 0 0 0 0 0 0 34 36 26 27 28 0 34 35 35 35 35 35 35 35 35 35 36 26 27 28 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
example 1341: input_ids: [ 101 1728 711 4293 3868 1168 2190 2150 3791 934 3633 3428 4638 6237 7025 8024 3297 1400 5310 3362 6206 5023 5401 1744 3297 7770 3791 7368 976 1139 1104 2137 511 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
example 1341: input_mask: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
example 1341: segment_ids: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
example 1341: label_ids: [ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 18 19 19 19 19 20 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
...
```

View File

@ -1,36 +0,0 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""create MindDataset by MindRecord"""
import mindspore.dataset as ds
def create_dataset(data_file):
    """Read a MindRecord file through MindDataset and print each sample.

    For every row yielded by the dict iterator the ``input_ids``,
    ``input_mask``, ``segment_ids`` and ``label_ids`` columns are printed,
    followed by a progress line after each 1000 rows and a final row total.

    Args:
        data_file: passed unchanged to ``ds.MindDataset`` as ``dataset_file``.
    """
    # One (message template, column name) pair per printed column, in print order.
    column_reports = (
        ("example {}: input_ids: {}", 'input_ids'),
        ("example {}: input_mask: {}", 'input_mask'),
        ("example {}: segment_ids: {}", 'segment_ids'),
        ("example {}: label_ids: {}", 'label_ids'),
    )
    reader_count = 4
    dataset = ds.MindDataset(dataset_file=data_file, num_parallel_workers=reader_count, shuffle=True)
    rows_read = 0
    for row in dataset.create_dict_iterator():
        # Samples are numbered from zero: the label is the count of rows read so far.
        for template, column in column_reports:
            print(template.format(rows_read, row[column]))
        rows_read += 1
        if not rows_read % 1000:
            print("read rows: {}".format(rows_read))
    print("total rows: {}".format(rows_read))
if __name__ == '__main__':
    # Dump the training split first, then the dev split.
    for record_path in ('output/train.mindrecord', 'output/dev.mindrecord'):
        create_dataset(record_path)

View File

@ -1 +0,0 @@
cluener_public

View File

@ -1 +0,0 @@
## output dir

View File

@ -1,40 +0,0 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
# Remove the results of any previous run.
rm -f output/train.mindrecord* output/dev.mindrecord*

# The third-party source directory and the patch for it must both be present.
[ -d "../../../third_party/to_mindrecord/CLUERNER2020" ] || {
    echo "The patch base dir ../../../third_party/to_mindrecord/CLUERNER2020 is not exist."
    exit 1
}

[ -f "../../../third_party/patch/to_mindrecord/CLUERNER2020/data_processor_seq.patch" ] || {
    echo "The patch file ../../../third_party/patch/to_mindrecord/CLUERNER2020/data_processor_seq.patch is not exist."
    exit 1
}

# patch for data_processor_seq.py; the result is written to data_processor_seq_patched.py
if ! patch -p0 -d ../../../third_party/to_mindrecord/CLUERNER2020/ -o data_processor_seq_patched.py < ../../../third_party/patch/to_mindrecord/CLUERNER2020/data_processor_seq.patch; then
    echo "Patch ../../../third_party/to_mindrecord/CLUERNER2020/data_processor_seq.py failed"
    exit 1
fi

# use patched script
python ../../../third_party/to_mindrecord/CLUERNER2020/data_processor_seq_patched.py \
    --vocab_file=../../../third_party/to_mindrecord/CLUERNER2020/vocab.txt \
    --label2id_file=../../../third_party/to_mindrecord/CLUERNER2020/label2id.json

View File

@ -1 +0,0 @@
## The input dataset

View File

@ -1,173 +0,0 @@
# Guideline to Convert Training Data enwiki to MindRecord For Bert Pre Training
<!-- TOC -->
- [What does the example do](#what-does-the-example-do)
- [How to use the example to process enwiki](#how-to-use-the-example-to-process-enwiki)
- [Download enwiki training data](#download-enwiki-training-data)
- [Process the enwiki](#process-the-enwiki)
- [Generate MindRecord](#generate-mindrecord)
- [Create MindDataset By MindRecord](#create-minddataset-by-mindrecord)
<!-- /TOC -->
## What does the example do
This example is based on [enwiki](https://dumps.wikimedia.org/enwiki) training data, generating MindRecord file, and finally used for Bert network training.
1. run.sh: entry script that generates the MindRecord files.
2. run_read.sh: entry script that creates a MindDataset from the MindRecord files.
- create_dataset.py: uses MindDataset to read the MindRecord files and build the dataset.
## How to use the example to process enwiki
Download enwiki data, process it, convert it to MindRecord, use MindDataset to read MindRecord.
### Download enwiki training data
> [enwiki dataset download address](https://dumps.wikimedia.org/enwiki) **-> 20200501 -> enwiki-20200501-pages-articles-multistream.xml.bz2**
### Process the enwiki
1. Please follow the steps in [process enwiki](https://github.com/mlperf/training/tree/master/language_model/tensorflow/bert)
- All permissions of this step belong to the link address website.
### Generate MindRecord
1. Run the run.sh script.
```
bash run.sh input_dir output_dir vocab_file
```
- input_dir: the directory which contains files like 'part-00251-of-00500'.
- output_dir: which will store the output mindrecord files.
- vocab_file: the vocab file, which you can download from another open-source project.
2. The output looks like this:
```
...
Begin preprocess Wed Jun 10 09:21:23 CST 2020
Begin preprocess input file: /mnt/data/results/part-00000-of-00500
Begin output file: part-00000-of-00500.mindrecord
Total task: 510, processing: 1
Begin preprocess input file: /mnt/data/results/part-00001-of-00500
Begin output file: part-00001-of-00500.mindrecord
Total task: 510, processing: 2
Begin preprocess input file: /mnt/data/results/part-00002-of-00500
Begin output file: part-00002-of-00500.mindrecord
Total task: 510, processing: 3
Begin preprocess input file: /mnt/data/results/part-00003-of-00500
Begin output file: part-00003-of-00500.mindrecord
Total task: 510, processing: 4
Begin preprocess input file: /mnt/data/results/part-00004-of-00500
Begin output file: part-00004-of-00500.mindrecord
Total task: 510, processing: 4
...
```
3. The generated files look like this:
```bash
$ ls {your_output_dir}/
part-00000-of-00500.mindrecord part-00000-of-00500.mindrecord.db part-00001-of-00500.mindrecord part-00001-of-00500.mindrecord.db part-00002-of-00500.mindrecord part-00002-of-00500.mindrecord.db ...
```
### Create MindDataset By MindRecord
1. Run the run_read.sh script.
```bash
bash run_read.sh input_dir
```
- input_dir: the directory which contains mindrecord files.
2. The output looks like this:
```
...
example 633: input_ids: [ 101 2043 19781 4305 2140 4520 2041 1010 103 2034 2455 2002
7879 2003 1996 2455 1997 103 26378 4160 1012 102 7291 2001
1996 103 1011 2343 1997 6327 1010 3423 1998 103 4262 2005
1996 2118 1997 2329 3996 103 102 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0]
example 633: input_mask: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
example 633: segment_ids: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
example 633: masked_lm_positions: [ 8 17 20 25 33 41 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0]
example 633: masked_lm_ids: [ 1996 16137 1012 3580 2451 1012 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0]
example 633: masked_lm_weights: [1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0.]
example 633: next_sentence_labels: [1]
...
```

View File

@ -1,43 +0,0 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""create MindDataset by MindRecord"""
import argparse
import mindspore.dataset as ds
def create_dataset(data_file):
    """Read MindRecord data through MindDataset and print each sample.

    For every row yielded by the dict iterator the BERT pre-training columns
    listed below are printed, followed by a progress line after each 1000
    rows and a final row total.

    Args:
        data_file: passed unchanged to ``ds.MindDataset`` as ``dataset_file``.
    """
    # One (message template, column name) pair per printed column, in print order.
    column_reports = (
        ("example {}: input_ids: {}", 'input_ids'),
        ("example {}: input_mask: {}", 'input_mask'),
        ("example {}: segment_ids: {}", 'segment_ids'),
        ("example {}: masked_lm_positions: {}", 'masked_lm_positions'),
        ("example {}: masked_lm_ids: {}", 'masked_lm_ids'),
        ("example {}: masked_lm_weights: {}", 'masked_lm_weights'),
        ("example {}: next_sentence_labels: {}", 'next_sentence_labels'),
    )
    reader_count = 4
    dataset = ds.MindDataset(dataset_file=data_file, num_parallel_workers=reader_count, shuffle=True)
    rows_read = 0
    for row in dataset.create_dict_iterator():
        # Samples are numbered from zero: the label is the count of rows read so far.
        for template, column in column_reports:
            print(template.format(rows_read, row[column]))
        rows_read += 1
        if not rows_read % 1000:
            print("read rows: {}".format(rows_read))
    print("total rows: {}".format(rows_read))
if __name__ == '__main__':
    # Command-line entry point: every path given after -i/--input_file is
    # collected into one list, which is handed to create_dataset() as a whole.
    parser = argparse.ArgumentParser()
    # required=True makes a missing -i fail with a usage error instead of
    # forwarding None to create_dataset().
    parser.add_argument("-i", "--input_file", nargs='+', type=str, required=True,
                        help='Input mindrecord file')
    args = parser.parse_args()
    create_dataset(args.input_file)

View File

@ -1,133 +0,0 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
# Validate the command line: run.sh input_dir output_dir vocab_file
if [ $# -ne 3 ]; then
    echo "Usage: $0 input_dir output_dir vocab_file"
    exit 1
fi

# Every expansion is quoted so that an empty argument or a path containing
# spaces is tested as a single word instead of slipping past the check.
if [ ! -d "$1" ]; then
    echo "The input dir: $1 is not exist."
    exit 1
fi

if [ ! -d "$2" ]; then
    echo "The output dir: $2 is not exist."
    exit 1
fi

if [ ! -f "$3" ]; then
    echo "The vocab file: $3 is not exist."
    exit 1
fi

# Discard the previous results only after all three arguments have been validated.
rm -fr -- "$2"/*.mindrecord*

data_dir=$1
output_dir=$2
vocab_file=$3

# Filled in by getdir(): input paths, the matching output names, and the next free slot.
file_list=()
output_filename=()
file_index=0
# Recursively collect every non-directory entry below directory $1.
#
# For each file found, its path is stored in the global array `file_list` and
# the matching output name (the file's base name with spaces removed, plus
# ".mindrecord") in the global array `output_filename`, both at index
# `file_index`, which is then advanced by one.
#
# Entries are enumerated with a glob instead of parsing `ls`, so names that
# contain whitespace stay intact, and no scratch file is written to the
# current directory.
function getdir() {
    local entry base_name
    for entry in "$1"/*; do
        # An empty directory leaves the pattern unexpanded; skip that placeholder
        # (a dangling symlink is still kept, as `ls` would have listed it).
        if [ ! -e "$entry" ] && [ ! -L "$entry" ]; then
            continue
        fi
        if [ -d "$entry" ]; then
            getdir "$entry"
        else
            file_list[$file_index]=$entry
            base_name=${entry##*/}
            output_filename[$file_index]="${base_name// /}.mindrecord"
            file_index=$((file_index + 1))
        fi
    done
}
# Collect the input files and their output names into file_list / output_filename.
getdir "${data_dir}"
# echo "The input files: "${file_list[@]}
# echo "The output files: "${output_filename[@]}

# The third-party conversion script and the patch for it must both be present.
if [ ! -d "../../../third_party/to_mindrecord/zhwiki" ]; then
    echo "The patch base dir ../../../third_party/to_mindrecord/zhwiki is not exist."
    exit 1
fi

if [ ! -f "../../../third_party/patch/to_mindrecord/zhwiki/create_pretraining_data.patch" ]; then
    echo "The patch file ../../../third_party/patch/to_mindrecord/zhwiki/create_pretraining_data.patch is not exist."
    exit 1
fi

# patch for create_pretraining_data.py
if ! patch -p0 -d ../../../third_party/to_mindrecord/zhwiki/ -o create_pretraining_data_patched.py < ../../../third_party/patch/to_mindrecord/zhwiki/create_pretraining_data.patch; then
    echo "Patch ../../../third_party/to_mindrecord/zhwiki/create_pretraining_data.py failed"
    exit 1
fi

# Allow about two thirds of the CPU cores to run converters at the same time,
# but never fewer than one: with a budget of zero the throttle below could
# wait forever once the only child had already exited.
num_cpu_core=$(grep -c '^processor' /proc/cpuinfo 2>/dev/null)
max_parallel_jobs=$(( ${num_cpu_core:-0} / 3 * 2 ))
if [ "$max_parallel_jobs" -lt 1 ]; then
    max_parallel_jobs=1
fi

echo "Begin preprocess `date`"

# using patched script to generate mindrecord
# Children are tracked through this shell's own job table (jobs -rp) rather
# than by grepping `ps`, so converters started elsewhere are not counted.
pids=()
for index in "${!file_list[@]}"; do
    echo "Begin preprocess input file: ${file_list[$index]}"
    echo "Begin output file: ${output_filename[$index]}"
    python ../../../third_party/to_mindrecord/zhwiki/create_pretraining_data_patched.py \
        --input_file="${file_list[$index]}" \
        --output_file="${output_dir}/${output_filename[$index]}" \
        --partition_number=1 \
        --vocab_file="${vocab_file}" \
        --do_lower_case=True \
        --max_seq_length=512 \
        --max_predictions_per_seq=76 \
        --masked_lm_prob=0.15 \
        --random_seed=12345 \
        --dupe_factor=10 >"/tmp/${output_filename[$index]}.log" 2>&1 &
    pids+=("$!")
    process_count=$(jobs -rp | wc -l)
    echo "Total task: ${#file_list[*]}, processing: ${process_count}"
    # Hold back the next launch until a slot is free.
    while [ "$(jobs -rp | wc -l)" -ge "$max_parallel_jobs" ]; do
        sleep 2
    done
done

# Report progress until every background converter has exited.
process_num=$(jobs -rp | wc -l)
while [ "$process_num" -ne 0 ]; do
    echo "There are still ${process_num} preprocess running ..."
    sleep 2
    process_num=$(jobs -rp | wc -l)
done

# Collect the exit status of each converter so that failures are not reported as success.
failed_count=0
for pid in "${pids[@]}"; do
    wait "$pid" || failed_count=$((failed_count + 1))
done

if [ "$failed_count" -ne 0 ]; then
    echo "Preprocess failed for ${failed_count} input file(s), see the logs under /tmp."
    echo "End preprocess `date`"
    exit 1
fi

echo "Preprocess all the data success."
echo "End preprocess `date`"

View File

@ -1,113 +0,0 @@
# Guideline to Convert Training Data zhwiki to MindRecord For Bert Pre Training
<!-- TOC -->
- [What does the example do](#what-does-the-example-do)
- [Run simple test](#run-simple-test)
- [How to use the example to process zhwiki](#how-to-use-the-example-to-process-zhwiki)
- [Download zhwiki training data](#download-zhwiki-training-data)
- [Extract the zhwiki](#extract-the-zhwiki)
- [Generate MindRecord](#generate-mindrecord)
- [Create MindDataset By MindRecord](#create-minddataset-by-mindrecord)
<!-- /TOC -->
## What does the example do
This example is based on [zhwiki](https://dumps.wikimedia.org/zhwiki) training data, generating MindRecord file, and finally used for Bert network training.
1. run.sh: entry script that generates the MindRecord files.
2. run_read.sh: entry script that creates a MindDataset from the MindRecord files.
- create_dataset.py: uses MindDataset to read the MindRecord files and build the dataset.
## Run simple test
Follow these steps:
```bash
bash run_simple.sh # generate output/simple.mindrecord* by ../../../third_party/to_mindrecord/zhwiki/sample_text.txt
bash run_read_simple.sh # use MindDataset to read output/simple.mindrecord*
```
## How to use the example to process zhwiki
Download zhwiki data, extract it, convert it to MindRecord, use MindDataset to read MindRecord.
### Download zhwiki training data
> [zhwiki dataset download address](https://dumps.wikimedia.org/zhwiki) **-> 20200401 -> zhwiki-20200401-pages-articles-multistream.xml.bz2**
- put the zhwiki-20200401-pages-articles-multistream.xml.bz2 in {your-mindspore}/example/nlp_to_mindrecord/zhwiki/data directory.
### Extract the zhwiki
1. Download [wikiextractor](https://github.com/attardi/wikiextractor) script to {your-mindspore}/example/nlp_to_mindrecord/zhwiki/data directory.
```
$ ls data/
README.md wikiextractor zhwiki-20200401-pages-articles-multistream.xml.bz2
```
2. Extract the zhwiki.
```bash
python data/wikiextractor/WikiExtractor.py data/zhwiki-20200401-pages-articles-multistream.xml.bz2 --processes 4 --templates data/template --bytes 8M --min_text_length 0 --filter_disambig_pages --output data/extract
```
3. The extracted output looks like this:
```
$ ls data/extract
AA AB
```
### Generate MindRecord
1. Run the run.sh script.
```
bash run.sh
```
> Caution: This process may be slow, so please be patient. If your machine does not have enough memory and CPU, it is recommended that you modify the script to generate the MindRecord files step by step.
2. The output looks like this:
```
patching file create_pretraining_data_patched.py (read from create_pretraining_data.py)
Begin preprocess input file: ./data/extract/AA/wiki_00
Begin output file: AAwiki_00.mindrecord
Total task: 5, processing: 1
Begin preprocess input file: ./data/extract/AA/wiki_01
Begin output file: AAwiki_01.mindrecord
Total task: 5, processing: 2
Begin preprocess input file: ./data/extract/AA/wiki_02
Begin output file: AAwiki_02.mindrecord
Total task: 5, processing: 3
Begin preprocess input file: ./data/extract/AB/wiki_02
Begin output file: ABwiki_02.mindrecord
Total task: 5, processing: 4
...
```
3. The generated files look like this:
```bash
$ ls output/
AAwiki_00.mindrecord AAwiki_00.mindrecord.db AAwiki_01.mindrecord AAwiki_01.mindrecord.db AAwiki_02.mindrecord AAwiki_02.mindrecord.db ... ABwiki_00.mindrecord ABwiki_00.mindrecord.db ...
```
### Create MindDataset By MindRecord
1. Run the run_read.sh script.
```bash
bash run_read.sh
```
2. The output looks like this:
```
...
example 74: input_ids: [ 101 8168 118 12847 8783 9977 15908 117 8256 9245 11643 8168 8847 8588 11575 8154 8228 143 8384 8376 9197 10241 103 10564 11421 8199 12268 112 161 8228 11541 9586 8436 8174 8363 9864 9702 103 103 119 103 9947 10564 103 8436 8806 11479 103 8912 119 103 103 103 12209 8303 103 8757 8824 117 8256 103 8619 8168 11541 102 11684 8196 103 8228 8847 11523 117 9059 9064 12410 8358 8181 10764 117 11167 11706 9920 148 8332 11390 8936 8205 10951 11997 103 8154 117 103 8670 10467 112 161 10951 13139 12413 117 10288 143 10425 8205 152 10795 8472 8196 103 161 12126 9172 13129 12106 8217 8174 12244 8205 143 103 8461 8277 10628 160 8221 119 102]
example 74: input_mask: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
example 74: segment_ids: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
example 74: masked_lm_positions: [ 6 22 37 38 40 43 47 50 51 52 55 60 67 76 89 92 98 109 120 0]
example 74: masked_lm_ids: [ 8118 8165 8329 8890 8554 8458 119 8850 8565 10392 8174 11467 10291 8181 8549 12718 13139 112 158 0]
example 74: masked_lm_weights: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0.]
example 74: next_sentence_labels: [0]
...
```

View File

@ -1,43 +0,0 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""create MindDataset by MindRecord"""
import argparse
import mindspore.dataset as ds
def create_dataset(data_file):
    """Read MindRecord data through MindDataset and print each sample.

    For every row yielded by the dict iterator the BERT pre-training columns
    listed below are printed, followed by a progress line after each 1000
    rows and a final row total.

    Args:
        data_file: passed unchanged to ``ds.MindDataset`` as ``dataset_file``.
    """
    # One (message template, column name) pair per printed column, in print order.
    column_reports = (
        ("example {}: input_ids: {}", 'input_ids'),
        ("example {}: input_mask: {}", 'input_mask'),
        ("example {}: segment_ids: {}", 'segment_ids'),
        ("example {}: masked_lm_positions: {}", 'masked_lm_positions'),
        ("example {}: masked_lm_ids: {}", 'masked_lm_ids'),
        ("example {}: masked_lm_weights: {}", 'masked_lm_weights'),
        ("example {}: next_sentence_labels: {}", 'next_sentence_labels'),
    )
    reader_count = 4
    dataset = ds.MindDataset(dataset_file=data_file, num_parallel_workers=reader_count, shuffle=True)
    rows_read = 0
    for row in dataset.create_dict_iterator():
        # Samples are numbered from zero: the label is the count of rows read so far.
        for template, column in column_reports:
            print(template.format(rows_read, row[column]))
        rows_read += 1
        if not rows_read % 1000:
            print("read rows: {}".format(rows_read))
    print("total rows: {}".format(rows_read))
if __name__ == '__main__':
    # Command-line entry point: every path given after -i/--input_file is
    # collected into one list, which is handed to create_dataset() as a whole.
    parser = argparse.ArgumentParser()
    # required=True makes a missing -i fail with a usage error instead of
    # forwarding None to create_dataset().
    parser.add_argument("-i", "--input_file", nargs='+', type=str, required=True,
                        help='Input mindrecord file')
    args = parser.parse_args()
    create_dataset(args.input_file)

View File

@ -1,3 +0,0 @@
wikiextractor/
zhwiki-20200401-pages-articles-multistream.xml.bz2
extract/

View File

@ -1 +0,0 @@
## The input dataset

View File

@ -1 +0,0 @@
## Output the mindrecord

View File

@ -1,112 +0,0 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
# Start from a clean output directory.
rm -f output/*.mindrecord*

# Root directory that getdir() scans for input files.
data_dir="./data/extract"

# Parallel arrays (input path / output name) and the next index to fill.
declare -a file_list=()
declare -a output_filename=()
file_index=0
# Recursively collect every non-directory entry below directory $1.
#
# For each file found, its path is stored in the global array `file_list` and
# the matching output name in the global array `output_filename`, both at
# index `file_index`, which is then advanced by one. The output name is the
# name of the file's parent directory followed by the file's base name, with
# spaces removed and ".mindrecord" appended (e.g. AA/wiki_00 gives
# AAwiki_00.mindrecord).
#
# Entries are enumerated with a glob instead of parsing `ls`, so names that
# contain whitespace stay intact, and no scratch file is written to the
# current directory.
function getdir() {
    local entry base_name parent_name
    for entry in "$1"/*; do
        # An empty directory leaves the pattern unexpanded; skip that placeholder
        # (a dangling symlink is still kept, as `ls` would have listed it).
        if [ ! -e "$entry" ] && [ ! -L "$entry" ]; then
            continue
        fi
        if [ -d "$entry" ]; then
            getdir "$entry"
        else
            file_list[$file_index]=$entry
            base_name=${entry##*/}
            # Last path component of the directory that holds the file.
            parent_name=${entry%/*}
            parent_name=${parent_name##*/}
            base_name="${parent_name}${base_name}"
            output_filename[$file_index]="${base_name// /}.mindrecord"
            file_index=$((file_index + 1))
        fi
    done
}
# Collect the input files and their output names into file_list / output_filename.
getdir "${data_dir}"
# echo "The input files: "${file_list[@]}
# echo "The output files: "${output_filename[@]}

# The third-party conversion script and the patch for it must both be present.
if [ ! -d "../../../third_party/to_mindrecord/zhwiki" ]; then
    echo "The patch base dir ../../../third_party/to_mindrecord/zhwiki is not exist."
    exit 1
fi

if [ ! -f "../../../third_party/patch/to_mindrecord/zhwiki/create_pretraining_data.patch" ]; then
    echo "The patch file ../../../third_party/patch/to_mindrecord/zhwiki/create_pretraining_data.patch is not exist."
    exit 1
fi

# patch for create_pretraining_data.py
if ! patch -p0 -d ../../../third_party/to_mindrecord/zhwiki/ -o create_pretraining_data_patched.py < ../../../third_party/patch/to_mindrecord/zhwiki/create_pretraining_data.patch; then
    echo "Patch ../../../third_party/to_mindrecord/zhwiki/create_pretraining_data.py failed"
    exit 1
fi

# Allow about two thirds of the CPU cores to run converters at the same time,
# but never fewer than one: with a budget of zero the throttle below could
# wait forever once the only child had already exited.
num_cpu_core=$(grep -c '^processor' /proc/cpuinfo 2>/dev/null)
max_parallel_jobs=$(( ${num_cpu_core:-0} / 3 * 2 ))
if [ "$max_parallel_jobs" -lt 1 ]; then
    max_parallel_jobs=1
fi

echo "Begin preprocess `date`"

# using patched script to generate mindrecord
# Children are tracked through this shell's own job table (jobs -rp) rather
# than by grepping `ps`, so converters started elsewhere are not counted.
pids=()
for index in "${!file_list[@]}"; do
    echo "Begin preprocess input file: ${file_list[$index]}"
    echo "Begin output file: ${output_filename[$index]}"
    python ../../../third_party/to_mindrecord/zhwiki/create_pretraining_data_patched.py \
        --input_file="${file_list[$index]}" \
        --output_file="output/${output_filename[$index]}" \
        --partition_number=1 \
        --vocab_file=../../../third_party/to_mindrecord/zhwiki/vocab.txt \
        --do_lower_case=True \
        --max_seq_length=128 \
        --max_predictions_per_seq=20 \
        --masked_lm_prob=0.15 \
        --random_seed=12345 \
        --dupe_factor=10 >"/tmp/${output_filename[$index]}.log" 2>&1 &  # user defined
    pids+=("$!")
    process_count=$(jobs -rp | wc -l)
    echo "Total task: ${#file_list[*]}, processing: ${process_count}"
    # Hold back the next launch until a slot is free.
    while [ "$(jobs -rp | wc -l)" -ge "$max_parallel_jobs" ]; do
        sleep 2
    done
done

# Report progress until every background converter has exited.
process_num=$(jobs -rp | wc -l)
while [ "$process_num" -ne 0 ]; do
    echo "There are still ${process_num} preprocess running ..."
    sleep 2
    process_num=$(jobs -rp | wc -l)
done

# Collect the exit status of each converter so that failures are not reported as success.
failed_count=0
for pid in "${pids[@]}"; do
    wait "$pid" || failed_count=$((failed_count + 1))
done

if [ "$failed_count" -ne 0 ]; then
    echo "Preprocess failed for ${failed_count} input file(s), see the logs under /tmp."
    echo "End preprocess `date`"
    exit 1
fi

echo "Preprocess all the data success."
echo "End preprocess `date`"

View File

@ -1,47 +0,0 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
# Remove the results of any previous run.
rm -f output/simple.mindrecord*

# The third-party source directory and the patch for it must both be present.
[ -d "../../../third_party/to_mindrecord/zhwiki" ] || {
    echo "The patch base dir ../../../third_party/to_mindrecord/zhwiki is not exist."
    exit 1
}

[ -f "../../../third_party/patch/to_mindrecord/zhwiki/create_pretraining_data.patch" ] || {
    echo "The patch file ../../../third_party/patch/to_mindrecord/zhwiki/create_pretraining_data.patch is not exist."
    exit 1
}

# patch for create_pretraining_data.py; the result is written to create_pretraining_data_patched.py
if ! patch -p0 -d ../../../third_party/to_mindrecord/zhwiki/ -o create_pretraining_data_patched.py < ../../../third_party/patch/to_mindrecord/zhwiki/create_pretraining_data.patch; then
    echo "Patch ../../../third_party/to_mindrecord/zhwiki/create_pretraining_data.py failed"
    exit 1
fi

# using patched script to generate mindrecord
python ../../../third_party/to_mindrecord/zhwiki/create_pretraining_data_patched.py \
    --input_file=../../../third_party/to_mindrecord/zhwiki/sample_text.txt \
    --output_file=output/simple.mindrecord \
    --partition_number=4 \
    --vocab_file=../../../third_party/to_mindrecord/zhwiki/vocab.txt \
    --do_lower_case=True \
    --max_seq_length=128 \
    --max_predictions_per_seq=20 \
    --masked_lm_prob=0.15 \
    --random_seed=12345 \
    --dupe_factor=10 # user defined

View File

@ -1,137 +0,0 @@
# ResNet-50 Example
## Description
This is an example of training ResNet-50 with CIFAR-10 dataset in MindSpore.
## Requirements
- Install [MindSpore](https://www.mindspore.cn/install/en).
- Download the dataset CIFAR-10
> Unzip the CIFAR-10 dataset to any path you want and the folder structure should include train and eval dataset as follows:
> ```
> .
> ├── cifar-10-batches-bin # train dataset
> └── cifar-10-verify-bin # infer dataset
> ```
## Example structure
```shell
.
├── config.py # parameter configuration
├── dataset.py # data preprocessing
├── eval.py # infer script
├── lr_generator.py # generate learning rate for each step
├── run_distribute_train.sh # launch distributed training(8 pcs)
├── run_infer.sh # launch inference
├── run_standalone_train.sh # launch standalone training(1 pcs)
└── train.py # train script
```
## Parameter configuration
Parameters for both training and inference can be set in config.py.
```
"class_num": 10, # dataset class num
"batch_size": 32, # batch size of input tensor
"loss_scale": 1024, # loss scale
"momentum": 0.9, # momentum
"weight_decay": 1e-4, # weight decay
"epoch_size": 90, # only valid for training, which is always 1 for inference
"buffer_size": 100, # number of queue size in data preprocessing
"image_height": 224, # image height
"image_width": 224, # image width
"save_checkpoint": True, # whether save checkpoint or not
"save_checkpoint_steps": 195, # the step interval between two checkpoints. By default, the last checkpoint will be saved after the last step
"keep_checkpoint_max": 10, # only keep the last keep_checkpoint_max checkpoint
"save_checkpoint_path": "./", # path to save checkpoint
"warmup_epochs": 5, # number of warmup epoch
"lr_decay_mode": "poly", # decay mode can be selected in steps, poly and default
"lr_init": 0.01, # initial learning rate
"lr_end": 0.00001, # final learning rate
"lr_max": 0.1, # maximum learning rate
```
## Running the example
### Train
#### Usage
```
# distributed training
Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH]
# standalone training
Usage: sh run_standalone_train.sh [DATASET_PATH]
```
#### Launch
```
# distribute training example
sh run_distribute_train.sh rank_table.json ~/cifar-10-batches-bin
# standalone training example
sh run_standalone_train.sh ~/cifar-10-batches-bin
```
> About rank_table.json, you can refer to the [distributed training tutorial](https://www.mindspore.cn/tutorial/en/master/advanced_use/distributed_training.html).
#### Result
Training results are stored in the example path, in a folder whose name begins with "train" or "train_parallel". There you can find the checkpoint files together with log output like the following.
```
# distribute training result(8 pcs)
epoch: 1 step: 195, loss is 1.9601055
epoch: 2 step: 195, loss is 1.8555021
epoch: 3 step: 195, loss is 1.6707983
epoch: 4 step: 195, loss is 1.8162166
epoch: 5 step: 195, loss is 1.393667
```
### Infer
#### Usage
```
# infer
Usage: sh run_infer.sh [DATASET_PATH] [CHECKPOINT_PATH]
```
#### Launch
```
# infer example
sh run_infer.sh ~/cifar-10-verify-bin ~/resnet50_cifar10/train_parallel0/resnet-90_195.ckpt
```
> checkpoint can be produced in training process.
#### Result
Inference results are stored in the example path, in a folder named "infer". There you can find log output like the following.
```
result: {'acc': 0.91446314102564111} ckpt=~/resnet50_cifar10/train_parallel0/resnet-90_195.ckpt
```
### Running on GPU
```
# distributed training example
mpirun -n 8 python train.py --dataset_path=~/cifar-10-batches-bin --device_target="GPU" --run_distribute=True
# standalone training example
python train.py --dataset_path=~/cifar-10-batches-bin --device_target="GPU"
# infer example
python eval.py --dataset_path=~/cifar-10-verify-bin --device_target="GPU" --checkpoint_path=resnet-90_195.ckpt
```

View File

@ -1,81 +0,0 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
create train or eval dataset.
"""
import os
import mindspore.common.dtype as mstype
import mindspore.dataset.engine as de
import mindspore.dataset.transforms.vision.c_transforms as C
import mindspore.dataset.transforms.c_transforms as C2
from mindspore.communication.management import init, get_rank, get_group_size
from config import config
def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend"):
    """
    Build the CIFAR-10 input pipeline used for training or evaluation.

    Args:
        dataset_path(string): the path of dataset.
        do_train(bool): whether dataset is used for train or eval.
        repeat_num(int): the repeat times of dataset. Default: 1
        batch_size(int): the batch size of dataset. Default: 32
        target(str): the device target. Default: Ascend

    Returns:
        dataset
    """
    # Work out how many shards exist and which one this process reads.
    if target != "Ascend":
        # Non-Ascend targets query the communication group (initialised with NCCL).
        init("nccl")
        rank_id = get_rank()
        device_num = get_group_size()
    else:
        # On Ascend the sharding information comes from environment variables.
        device_num = int(os.getenv("DEVICE_NUM"))
        rank_id = int(os.getenv("RANK_ID"))

    # Sharding arguments are only passed when more than one device takes part.
    shard_args = {}
    if device_num != 1:
        shard_args = {"num_shards": device_num, "shard_id": rank_id}
    ds = de.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True, **shard_args)

    # Random augmentation is applied to training data only.
    image_ops = []
    if do_train:
        image_ops.append(C.RandomCrop((32, 32), (4, 4, 4, 4)))
        image_ops.append(C.RandomHorizontalFlip(prob=0.5))

    # Resize / rescale / normalize / layout change are applied in both modes.
    image_ops.extend([
        C.Resize((config.image_height, config.image_width)),
        C.Rescale(1.0 / 255.0, 0.0),
        C.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010]),
        C.HWC2CHW()
    ])

    label_cast = C2.TypeCast(mstype.int32)

    ds = ds.map(input_columns="label", num_parallel_workers=8, operations=label_cast)
    ds = ds.map(input_columns="image", num_parallel_workers=8, operations=image_ops)

    # Batch first (incomplete trailing batch dropped), then repeat.
    batched = ds.batch(batch_size, drop_remainder=True)
    return batched.repeat(repeat_num)

View File

@ -1,72 +0,0 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
eval.
"""
import os
import argparse
from dataset import create_dataset
from config import config
from mindspore import context
from mindspore.model_zoo.resnet import resnet50
from mindspore.parallel._auto_parallel_context import auto_parallel_context
from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits
from mindspore.train.model import Model, ParallelMode
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from mindspore.communication.management import init, get_group_size
def _str2bool(value):
    """
    Convert a command-line string into a bool.

    argparse's ``type=bool`` calls ``bool()`` on the raw string, so every
    non-empty value (including "False" and "0") becomes True. This helper
    interprets the text instead.

    Args:
        value(str or bool): raw option value.

    Returns:
        bool

    Raises:
        argparse.ArgumentTypeError: if the text is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    text = value.strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    # The empty string is kept as False, matching the old bool('') result.
    if text in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError("Boolean value expected, but got '%s'." % value)


parser = argparse.ArgumentParser(description='Image classification')
parser.add_argument('--run_distribute', type=_str2bool, default=False, help='Run distribute')
parser.add_argument('--device_num', type=int, default=1, help='Device num.')
parser.add_argument('--do_train', type=_str2bool, default=False, help='Do train or not.')
parser.add_argument('--do_eval', type=_str2bool, default=True, help='Do eval or not.')
parser.add_argument('--checkpoint_path', type=str, default=None, help='Checkpoint file path')
parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path')
parser.add_argument('--device_target', type=str, default='Ascend', help='Device target')
args_opt = parser.parse_args()

if __name__ == '__main__':
    target = args_opt.device_target
    context.set_context(mode=context.GRAPH_MODE, device_target=target, save_graphs=False)
    # Parallel context is only configured when not evaluating and distribution is requested.
    if not args_opt.do_eval and args_opt.run_distribute:
        if target == "Ascend":
            device_id = int(os.getenv('DEVICE_ID'))
            context.set_context(device_id=device_id)
            context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
                                              mirror_mean=True)
            auto_parallel_context().set_all_reduce_fusion_split_indices([140])
            init()
        elif target == "GPU":
            init("nccl")
            context.set_auto_parallel_context(device_num=get_group_size(), parallel_mode=ParallelMode.DATA_PARALLEL,
                                              mirror_mean=True)

    epoch_size = config.epoch_size
    net = resnet50(class_num=config.class_num)
    loss = SoftmaxCrossEntropyWithLogits(sparse=True)

    if args_opt.do_eval:
        dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=False, batch_size=config.batch_size,
                                 target=target)
        step_size = dataset.get_dataset_size()

        # Load trained weights when a checkpoint is given; otherwise the net keeps its initial weights.
        if args_opt.checkpoint_path:
            param_dict = load_checkpoint(args_opt.checkpoint_path)
            load_param_into_net(net, param_dict)
        net.set_train(False)

        model = Model(net, loss_fn=loss, metrics={'acc'})
        res = model.eval(dataset)
        print("result:", res, "ckpt=", args_opt.checkpoint_path)

View File

@ -1,97 +0,0 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""train_imagenet."""
import os
import argparse
import numpy as np
from dataset import create_dataset
from lr_generator import get_lr
from config import config
from mindspore import context
from mindspore import Tensor
from mindspore.model_zoo.resnet import resnet50
from mindspore.parallel._auto_parallel_context import auto_parallel_context
from mindspore.nn.optim.momentum import Momentum
from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits
from mindspore.train.model import Model, ParallelMode
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
from mindspore.train.loss_scale_manager import FixedLossScaleManager
from mindspore.communication.management import init, get_rank, get_group_size
def _str2bool(value):
    """
    Convert a command-line string into a bool.

    argparse's ``type=bool`` calls ``bool()`` on the raw string, so every
    non-empty value (including "False" and "0") becomes True. This helper
    interprets the text instead.

    Args:
        value(str or bool): raw option value.

    Returns:
        bool

    Raises:
        argparse.ArgumentTypeError: if the text is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    text = value.strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    # The empty string is kept as False, matching the old bool('') result.
    if text in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError("Boolean value expected, but got '%s'." % value)


parser = argparse.ArgumentParser(description='Image classification')
parser.add_argument('--run_distribute', type=_str2bool, default=False, help='Run distribute')
parser.add_argument('--device_num', type=int, default=1, help='Device num.')
parser.add_argument('--do_train', type=_str2bool, default=True, help='Do train or not.')
parser.add_argument('--do_eval', type=_str2bool, default=False, help='Do eval or not.')
parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path')
parser.add_argument('--device_target', type=str, default='Ascend', help='Device target')
args_opt = parser.parse_args()

if __name__ == '__main__':
    target = args_opt.device_target
    ckpt_save_dir = config.save_checkpoint_path
    context.set_context(mode=context.GRAPH_MODE, device_target=target, save_graphs=False)
    np.random.seed(1)
    # Distributed setup is only performed for training runs that request it.
    if not args_opt.do_eval and args_opt.run_distribute:
        if target == "Ascend":
            device_id = int(os.getenv('DEVICE_ID'))
            context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False, device_id=device_id,
                                enable_auto_mixed_precision=True)
            init()
            context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
                                              mirror_mean=True)
            auto_parallel_context().set_all_reduce_fusion_split_indices([107, 160])
            ckpt_save_dir = config.save_checkpoint_path
        elif target == "GPU":
            context.set_context(mode=context.GRAPH_MODE, device_target="GPU", save_graphs=False)
            init("nccl")
            context.set_auto_parallel_context(device_num=get_group_size(), parallel_mode=ParallelMode.DATA_PARALLEL,
                                              mirror_mean=True)
            # Each GPU rank writes its checkpoints into its own sub-directory.
            ckpt_save_dir = config.save_checkpoint_path + "ckpt_" + str(get_rank()) + "/"

    epoch_size = config.epoch_size
    net = resnet50(class_num=config.class_num)

    if args_opt.do_train:
        dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=True,
                                 repeat_num=epoch_size, batch_size=config.batch_size, target=target)
        step_size = dataset.get_dataset_size()

        loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
        lr = Tensor(get_lr(global_step=0, lr_init=config.lr_init, lr_end=config.lr_end, lr_max=config.lr_max,
                           warmup_epochs=config.warmup_epochs, total_epochs=epoch_size, steps_per_epoch=step_size,
                           lr_decay_mode='poly'))
        opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config.momentum,
                       config.weight_decay, config.loss_scale)
        if target == 'GPU':
            # The GPU path replaces the optimizer with one built without weight decay / loss scale
            # and builds the model without a loss-scale manager.
            loss = SoftmaxCrossEntropyWithLogits(sparse=True, is_grad=False, reduction='mean')
            opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config.momentum)
            model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'})
        else:
            loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
            model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'},
                          amp_level="O2", keep_batchnorm_fp32=False)

        time_cb = TimeMonitor(data_size=step_size)
        loss_cb = LossMonitor()
        cb = [time_cb, loss_cb]
        if config.save_checkpoint:
            # save_checkpoint_epochs is expressed in epochs; the callback config expects steps.
            config_ck = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_epochs*step_size,
                                         keep_checkpoint_max=config.keep_checkpoint_max)
            ckpt_cb = ModelCheckpoint(prefix="resnet", directory=ckpt_save_dir, config=config_ck)
            cb += [ckpt_cb]
        model.train(epoch_size, dataset, callbacks=cb)

View File

@ -1,150 +0,0 @@
# ResNet-50 Example
## Description
This is an example of training ResNet-50 with ImageNet2012 dataset in MindSpore.
## Requirements
- Install [MindSpore](https://www.mindspore.cn/install/en).
- Download the dataset ImageNet2012
> Unzip the ImageNet2012 dataset to any path you want and the folder structure should include train and eval dataset as follows:
> ```
> .
> ├── ilsvrc # train dataset
> └── ilsvrc_eval # infer dataset
> ```
## Example structure
```shell
.
├── crossentropy.py # CrossEntropy loss function
├── config.py # parameter configuration
├── dataset.py # data preprocessing
├── eval.py # infer script
├── lr_generator.py # generate learning rate for each step
├── run_distribute_train.sh # launch distributed training(8 pcs)
├── run_infer.sh # launch inference
├── run_standalone_train.sh # launch standalone training(1 pcs)
└── train.py # train script
```
## Parameter configuration
Parameters for both training and inference can be set in config.py.
```
"class_num": 1001, # dataset class number
"batch_size": 32, # batch size of input tensor
"loss_scale": 1024, # loss scale
"momentum": 0.9, # momentum optimizer
"weight_decay": 1e-4, # weight decay
"epoch_size": 90, # only valid for taining, which is always 1 for inference
"pretrained_epoch_size": 1, # epoch size that model has been trained before load pretrained checkpoint
"buffer_size": 1000, # number of queue size in data preprocessing
"image_height": 224, # image height
"image_width": 224, # image width
"save_checkpoint": True, # whether save checkpoint or not
"save_checkpoint_epochs": 1, # the epoch interval between two checkpoints. By default, the last checkpoint will be saved after the last epoch
"keep_checkpoint_max": 10, # only keep the last keep_checkpoint_max checkpoint
"save_checkpoint_path": "./", # path to save checkpoint relative to the executed path
"warmup_epochs": 0, # number of warmup epoch
"lr_decay_mode": "cosine", # decay mode for generating learning rate
"label_smooth": True, # label smooth
"label_smooth_factor": 0.1, # label smooth factor
"lr_init": 0, # initial learning rate
"lr_max": 0.1, # maximum learning rate
```
## Running the example
### Train
#### Usage
```
# distributed training
Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
# standalone training
Usage: sh run_standalone_train.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
```
#### Launch
```bash
# distributed training example(8 pcs)
sh run_distribute_train.sh rank_table_8p.json dataset/ilsvrc
# If you want to load pretrained ckpt file
sh run_distribute_train.sh rank_table_8p.json dataset/ilsvrc ./pretrained.ckpt
# standalone training example(1 pcs)
sh run_standalone_train.sh dataset/ilsvrc
# If you want to load pretrained ckpt file
sh run_standalone_train.sh dataset/ilsvrc ./pretrained.ckpt
```
> About rank_table.json, you can refer to the [distributed training tutorial](https://www.mindspore.cn/tutorial/en/master/advanced_use/distributed_training.html).
#### Result
Training results are stored in the example path, in a folder whose name begins with "train" or "train_parallel". There you can find the checkpoint files together with results like the following in the log.
```
# distribute training result(8 pcs)
epoch: 1 step: 5004, loss is 4.8995576
epoch: 2 step: 5004, loss is 3.9235563
epoch: 3 step: 5004, loss is 3.833077
epoch: 4 step: 5004, loss is 3.2795618
epoch: 5 step: 5004, loss is 3.1978393
```
### Infer
#### Usage
```
# infer
Usage: sh run_infer.sh [DATASET_PATH] [CHECKPOINT_PATH]
```
#### Launch
```bash
# infer with checkpoint
sh run_infer.sh dataset/ilsvrc_eval train_parallel0/resnet-90_5004.ckpt
```
> checkpoint can be produced in training process.
#### Result
Inference results are stored in the example path, in a folder named "infer". There you can find results like the following in the log.
```
result: {'acc': 0.7671054737516005} ckpt=train_parallel0/resnet-90_5004.ckpt
```
### Running on GPU
```
# distributed training example
mpirun -n 8 python train.py --dataset_path=dataset/ilsvrc/train --device_target="GPU" --run_distribute=True
# standalone training example
python train.py --dataset_path=dataset/ilsvrc/train --device_target="GPU"
# standalone training example with pretrained checkpoint
python train.py --dataset_path=dataset/ilsvrc/train --device_target="GPU" --pre_trained=pretrained.ckpt
# infer example
python eval.py --dataset_path=dataset/ilsvrc/val --device_target="GPU" --checkpoint_path=resnet-90_5004.ckpt
```

View File

@ -1,85 +0,0 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
create train or eval dataset.
"""
import os
import mindspore.common.dtype as mstype
import mindspore.dataset.engine as de
import mindspore.dataset.transforms.vision.c_transforms as C
import mindspore.dataset.transforms.c_transforms as C2
from mindspore.communication.management import init, get_rank, get_group_size
def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend"):
    """
    Build the image-folder input pipeline used for training or evaluation.

    Args:
        dataset_path(string): the path of dataset.
        do_train(bool): whether dataset is used for train or eval.
        repeat_num(int): the repeat times of dataset. Default: 1
        batch_size(int): the batch size of dataset. Default: 32
        target(str): the device target. Default: Ascend

    Returns:
        dataset
    """
    # Work out how many shards exist and which one this process reads.
    if target != "Ascend":
        # Non-Ascend targets query the communication group (initialised with NCCL).
        init("nccl")
        rank_id = get_rank()
        device_num = get_group_size()
    else:
        # On Ascend the sharding information comes from environment variables.
        device_num = int(os.getenv("DEVICE_NUM"))
        rank_id = int(os.getenv("RANK_ID"))

    # Sharding arguments are only passed when more than one device takes part.
    shard_args = {}
    if device_num != 1:
        shard_args = {"num_shards": device_num, "shard_id": rank_id}
    ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=True, **shard_args)

    image_size = 224
    # Per-channel statistics, scaled by 255 before being handed to Normalize.
    mean = [0.485 * 255, 0.456 * 255, 0.406 * 255]
    std = [0.229 * 255, 0.224 * 255, 0.225 * 255]

    # Training uses random crop + flip; evaluation uses decode, resize and center crop.
    if do_train:
        spatial_ops = [
            C.RandomCropDecodeResize(image_size, scale=(0.08, 1.0), ratio=(0.75, 1.333)),
            C.RandomHorizontalFlip(prob=0.5)
        ]
    else:
        spatial_ops = [
            C.Decode(),
            C.Resize((256, 256)),
            C.CenterCrop(image_size)
        ]
    # Both modes finish with normalization and the HWC -> CHW layout change.
    image_ops = spatial_ops + [C.Normalize(mean=mean, std=std), C.HWC2CHW()]

    label_cast = C2.TypeCast(mstype.int32)

    ds = ds.map(input_columns="image", num_parallel_workers=8, operations=image_ops)
    ds = ds.map(input_columns="label", num_parallel_workers=8, operations=label_cast)

    # Batch first (incomplete trailing batch dropped), then repeat.
    batched = ds.batch(batch_size, drop_remainder=True)
    return batched.repeat(repeat_num)

View File

@ -1,62 +0,0 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
eval.
"""
import os
import argparse
from dataset import create_dataset
from config import config
from mindspore import context
from mindspore.model_zoo.resnet import resnet50
from mindspore.train.model import Model
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from crossentropy import CrossEntropy
def _str2bool(value):
    """
    Convert a command-line string into a bool.

    argparse's ``type=bool`` calls ``bool()`` on the raw string, so every
    non-empty value (including "False" and "0") becomes True. This helper
    interprets the text instead.

    Args:
        value(str or bool): raw option value.

    Returns:
        bool

    Raises:
        argparse.ArgumentTypeError: if the text is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    text = value.strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    # The empty string is kept as False, matching the old bool('') result.
    if text in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError("Boolean value expected, but got '%s'." % value)


parser = argparse.ArgumentParser(description='Image classification')
parser.add_argument('--run_distribute', type=_str2bool, default=False, help='Run distribute')
parser.add_argument('--device_num', type=int, default=1, help='Device num.')
parser.add_argument('--do_train', type=_str2bool, default=False, help='Do train or not.')
parser.add_argument('--do_eval', type=_str2bool, default=True, help='Do eval or not.')
parser.add_argument('--checkpoint_path', type=str, default=None, help='Checkpoint file path')
parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path')
parser.add_argument('--device_target', type=str, default='Ascend', help='Device target')
args_opt = parser.parse_args()

# The execution context is configured at import time, before the main guard.
target = args_opt.device_target
context.set_context(mode=context.GRAPH_MODE, device_target=target, save_graphs=False)
if target == "Ascend":
    device_id = int(os.getenv('DEVICE_ID'))
    context.set_context(device_id=device_id)

if __name__ == '__main__':
    net = resnet50(class_num=config.class_num)

    # A disabled label smoothing is expressed as a smoothing factor of zero.
    if not config.use_label_smooth:
        config.label_smooth_factor = 0.0
    loss = CrossEntropy(smooth_factor=config.label_smooth_factor, num_classes=config.class_num)

    if args_opt.do_eval:
        dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=False, batch_size=config.batch_size,
                                 target=target)
        step_size = dataset.get_dataset_size()

        # Load trained weights when a checkpoint is given; otherwise the net keeps its initial weights.
        if args_opt.checkpoint_path:
            param_dict = load_checkpoint(args_opt.checkpoint_path)
            load_param_into_net(net, param_dict)
        net.set_train(False)

        model = Model(net, loss_fn=loss, metrics={'acc'})
        res = model.eval(dataset)
        print("result:", res, "ckpt=", args_opt.checkpoint_path)

View File

@ -1,122 +0,0 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""train_imagenet."""
import os
import argparse
import numpy as np
from dataset import create_dataset
from lr_generator import get_lr
from config import config
from mindspore import context
from mindspore import Tensor
from mindspore.model_zoo.resnet import resnet50
from mindspore.parallel._auto_parallel_context import auto_parallel_context
from mindspore.nn.optim.momentum import Momentum
from mindspore.train.model import Model, ParallelMode
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
from mindspore.train.loss_scale_manager import FixedLossScaleManager
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from mindspore.communication.management import init, get_rank, get_group_size
import mindspore.nn as nn
import mindspore.common.initializer as weight_init
from crossentropy import CrossEntropy
def _str2bool(value):
    """
    Convert a command-line string into a bool.

    argparse's ``type=bool`` calls ``bool()`` on the raw string, so every
    non-empty value (including "False" and "0") becomes True. This helper
    interprets the text instead.

    Args:
        value(str or bool): raw option value.

    Returns:
        bool

    Raises:
        argparse.ArgumentTypeError: if the text is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    text = value.strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    # The empty string is kept as False, matching the old bool('') result.
    if text in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError("Boolean value expected, but got '%s'." % value)


parser = argparse.ArgumentParser(description='Image classification')
parser.add_argument('--run_distribute', type=_str2bool, default=False, help='Run distribute')
parser.add_argument('--device_num', type=int, default=1, help='Device num.')
parser.add_argument('--do_train', type=_str2bool, default=True, help='Do train or not.')
parser.add_argument('--do_eval', type=_str2bool, default=False, help='Do eval or not.')
parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path')
parser.add_argument('--device_target', type=str, default='Ascend', help='Device target')
parser.add_argument('--pre_trained', type=str, default=None, help='Pretrained checkpoint path')
args_opt = parser.parse_args()

if __name__ == '__main__':
    target = args_opt.device_target
    ckpt_save_dir = config.save_checkpoint_path
    context.set_context(mode=context.GRAPH_MODE, device_target=target, save_graphs=False)
    np.random.seed(1)
    # Distributed setup is only performed for training runs that request it.
    if not args_opt.do_eval and args_opt.run_distribute:
        if target == "Ascend":
            device_id = int(os.getenv('DEVICE_ID'))
            context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False, device_id=device_id,
                                enable_auto_mixed_precision=True)
            init()
            context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
                                              mirror_mean=True)
            auto_parallel_context().set_all_reduce_fusion_split_indices([107, 160])
            ckpt_save_dir = config.save_checkpoint_path
        elif target == "GPU":
            context.set_context(mode=context.GRAPH_MODE, device_target="GPU", save_graphs=False)
            init("nccl")
            context.set_auto_parallel_context(device_num=get_group_size(), parallel_mode=ParallelMode.DATA_PARALLEL,
                                              mirror_mean=True)
            # Each GPU rank writes its checkpoints into its own sub-directory.
            ckpt_save_dir = config.save_checkpoint_path + "ckpt_" + str(get_rank()) + "/"

    epoch_size = config.epoch_size
    net = resnet50(class_num=config.class_num)

    # weight init
    if args_opt.pre_trained:
        # Resume from a checkpoint and train only the remaining epochs.
        param_dict = load_checkpoint(args_opt.pre_trained)
        load_param_into_net(net, param_dict)
        epoch_size = config.epoch_size - config.pretrained_epoch_size
    else:
        # Fresh run: re-initialise convolution and dense weights.
        for _, cell in net.cells_and_names():
            if isinstance(cell, nn.Conv2d):
                cell.weight.default_input = weight_init.initializer(weight_init.XavierUniform(),
                                                                    cell.weight.default_input.shape,
                                                                    cell.weight.default_input.dtype).to_tensor()
            if isinstance(cell, nn.Dense):
                cell.weight.default_input = weight_init.initializer(weight_init.TruncatedNormal(),
                                                                    cell.weight.default_input.shape,
                                                                    cell.weight.default_input.dtype).to_tensor()
    # A disabled label smoothing is expressed as a smoothing factor of zero.
    if not config.use_label_smooth:
        config.label_smooth_factor = 0.0

    loss = CrossEntropy(smooth_factor=config.label_smooth_factor, num_classes=config.class_num)

    if args_opt.do_train:
        dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=True,
                                 repeat_num=epoch_size, batch_size=config.batch_size, target=target)
        step_size = dataset.get_dataset_size()

        loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
        # The schedule is generated for the full run, then the already-trained prefix is cut off.
        lr = get_lr(lr_init=config.lr_init, lr_end=0.0, lr_max=config.lr_max, warmup_epochs=config.warmup_epochs,
                    total_epochs=config.epoch_size, steps_per_epoch=step_size, lr_decay_mode='cosine')
        if args_opt.pre_trained:
            lr = lr[config.pretrained_epoch_size * step_size:]
        lr = Tensor(lr)

        opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config.momentum,
                       config.weight_decay, config.loss_scale)
        if target == "Ascend":
            model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'},
                          amp_level="O2", keep_batchnorm_fp32=False)
        elif target == "GPU":
            model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'})

        time_cb = TimeMonitor(data_size=step_size)
        loss_cb = LossMonitor()
        cb = [time_cb, loss_cb]
        if config.save_checkpoint:
            # save_checkpoint_epochs is expressed in epochs; the callback config expects steps.
            config_ck = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_epochs*step_size,
                                         keep_checkpoint_max=config.keep_checkpoint_max)
            ckpt_cb = ModelCheckpoint(prefix="resnet", directory=ckpt_save_dir, config=config_ck)
            cb += [ckpt_cb]
        model.train(epoch_size, dataset, callbacks=cb)

View File

@ -593,6 +593,17 @@ def check_bool(input_param):
raise TypeError("Input type must be bool!")
def check_string(input_param, valid_values):
    """
    String type judgment.

    Args:
        input_param: value to validate.
        valid_values: collection of accepted strings (list, tuple, set, ...).

    Returns:
        str, `input_param` unchanged when it is a str contained in `valid_values`.

    Raises:
        ValueError: if `input_param` is not a str or is not one of `valid_values`.
    """
    if isinstance(input_param, str) and input_param in valid_values:
        return input_param
    if len(valid_values) == 1:
        # next(iter(...)) instead of [0] so single-element sets and other
        # non-indexable collections still produce the intended ValueError.
        only_value = next(iter(valid_values))
        raise ValueError(f'Input should be str and must be {only_value},'
                         f' but got {input_param}.')
    raise ValueError(f'Input should be str and must be one of {valid_values},'
                     f' but got {input_param}.')
def check_input_format(input_param):
"""Judge input format."""
if input_param == "NCHW":

View File

@ -19,6 +19,7 @@
import ast
import types
import inspect
import hashlib
from textwrap import dedent
from dataclasses import is_dataclass
import asttokens
@ -319,7 +320,6 @@ def get_dataclass_methods(cls):
if isinstance(getattr(cls, name), (types.FunctionType,))}
return methods
class Parser:
"""
Parser python code to ast tree.
@ -327,7 +327,10 @@ class Parser:
Args:
fn(FunctionType/MethodType): Need parse object instance.
parse_method(ExtendInfoOfParseObj): Extend information for parse the function.
ast_cache: Dictionary for caching ast tree.
"""
ast_cache = {}
def __init__(self, fn: (types.FunctionType, types.MethodType), parse_method=None) -> None:
self.fn = fn
self.parse_method = parse_method
@ -348,11 +351,15 @@ class Parser:
tree = None
if isinstance(self.fn, (types.FunctionType, types.MethodType)):
original_src = inspect.getsource(self.fn)
src = dedent(original_src)
self.col_offset = \
len(original_src.split('\n')[0]) - len(src.split('\n')[0])
logger.debug("get source = %s", src)
tree = asttokens.ASTTokens(src, parse=True).tree
hexstr = hashlib.sha256(original_src.encode()).hexdigest()
tree = Parser.ast_cache.get(hexstr)
if not tree:
src = dedent(original_src)
self.col_offset = \
len(original_src.split('\n')[0]) - len(src.split('\n')[0])
logger.debug("get source = %s", src)
tree = asttokens.ASTTokens(src, parse=True).tree
Parser.ast_cache[hexstr] = tree
else:
logger.error("Fn type is invalid")
return tree

View File

@ -17,6 +17,7 @@
"""Resources for ast tree parse."""
import ast
import math
from mindspore import IndexedSlices
from mindspore.ops.composite import multitype_ops
from mindspore.ops import functional as F, composite as C
from . import standard_method as M
@ -111,10 +112,11 @@ convert_object_map = {
# system function
T.len: M.ms_len,
T.bool: M.bool_,
T.map: C.HyperMap(),
T.map: C.Map(),
T.partial: F.partial,
T.zip: C.zip_operation,
T.print: F.print_,
T.enumerate: M.enumerate_,
# custom define operation
T.iter: M.ms_iter,
@ -135,4 +137,7 @@ convert_object_map = {
math.sin: NO_IMPLEMENT,
math.cos: NO_IMPLEMENT,
math.tan: NO_IMPLEMENT,
# user defined
IndexedSlices: F.make_indexed_slices,
}

View File

@ -104,6 +104,15 @@ def bool_(x):
return x.__bool__()
def enumerate_(x, start=0):
"""
Enumerate list or tuple.

Args:
x: the sequence to enumerate; its type is validated as list or tuple.
start: index paired with the first element. Default: 0

Returns:
The result of zipping the index range [start, start + len(x)) with x.
"""
# NOTE(review): this function is registered for the `enumerate` builtin in the
# parser resources, so it is presumably compiled in graph mode where `zip` and
# `range` resolve to framework operations rather than the Python builtins; confirm.
x_type = F.typeof(x)
ret = ()
# check_is_tuple_or_list returns True for list/tuple types and raises TypeError otherwise,
# so the empty-tuple default above is only a placeholder.
if check_is_tuple_or_list(x_type, "enumerate"):
ret = zip(range(start, start + len(x)), x)
return ret
def while_cond(x):
"""For while condtion, if the condition is a tensor, the loop will not be unrolled"""
if F.issubclass_(F.typeof(x), F.typeof(mstype.tensor)):
@ -113,6 +122,13 @@ def while_cond(x):
return x
@constexpr
def check_is_tuple_or_list(x, op_name):
    """
    Validate that `x` is a list type or a tuple type.

    Args:
        x: the type object to inspect.
        op_name(str): operation name used in the error message.

    Returns:
        bool, always True when the check passes.

    Raises:
        TypeError: if `x` is neither a list type nor a tuple type.
    """
    accepted_types = (mstype.list_type, mstype.tuple_type)
    if not isinstance(x, accepted_types):
        raise TypeError(f"For '{op_name}', the input parameter should be tuple or list, but got {x}.")
    return True
@constexpr
def check_is_tensor_bool_cond(shp):
"""check if tensor is a bool condition"""

View File

@ -27,7 +27,7 @@ from operator import ( # noqa
# support system function call
from builtins import ( # noqa
bool, getattr, setattr, len, iter, next, pow, range, map, zip, print
bool, getattr, setattr, len, iter, next, pow, range, map, zip, print, enumerate
)
# support functools
@ -44,7 +44,7 @@ __all__ = ['add', 'sub', 'mul', 'truediv', 'floordiv', 'mod', 'eq', 'ne', 'lt',
'not_', 'and_', 'or_', 'xor', 'lshift', 'rshift', 'invert', 'is_', 'is_not', 'contains',
'matmul', 'getitem', 'setitem',
'bool', 'getattr', 'setattr', 'len', 'iter', 'next', 'pow', 'range', 'map', 'zip',
'partial', 'print',
'partial', 'print', 'enumerate',
'exp', 'log', 'sin', 'cos', 'tan']

View File

@ -71,6 +71,17 @@ message("onnx proto path is :" ${ONNX_PROTO})
ms_protobuf_generate(ONNX_PROTO_SRCS ONNX_PROTO_HDRS ${ONNX_PROTO})
list(APPEND MINDSPORE_PROTO_LIST ${ONNX_PROTO_SRCS})
if (ENABLE_DEBUGGER)
# debugger: compile proto files
include_directories("${CMAKE_BINARY_DIR}/debug/debugger")
file(GLOB_RECURSE DEBUGGER_PROTO_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "debug/debugger/debug_graph.proto")
ms_protobuf_generate(DEBUGGER_PROTO_SRCS DEBUGGER_PROTO_HDRS ${DEBUGGER_PROTO_LIST})
file(GLOB_RECURSE DEBUGGER_GRPC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "debug/debugger/debug_grpc.proto")
ms_grpc_generate(DEBUGGER_GRPC_SRCS DEBUGGER_GRPC_HDRS ${DEBUGGER_GRPC_LIST})
list(APPEND MINDSPORE_PROTO_LIST ${DEBUGGER_PROTO_SRCS})
list(APPEND MINDSPORE_PROTO_LIST ${DEBUGGER_GRPC_SRCS})
endif ()
if (ENABLE_DUMP_PROTO)
include_directories(${CMAKE_BINARY_DIR})
file(GLOB_RECURSE PROTO_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "utils/node_strategy.proto")
@ -125,12 +136,21 @@ endforeach ()
set_property(SOURCE ${SUB_OBJECTS_SRC} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_ME)
add_library(mindspore STATIC ${SUB_OBJECTS_SRC})
target_link_libraries(proto_input mindspore::protobuf)
if (ENABLE_DEBUGGER)
# debugger: link grpc
target_link_libraries(proto_input mindspore::grpc++)
endif()
target_link_libraries(mindspore proto_input)
if (ENABLE_CPU AND ENABLE_MPI)
target_link_libraries(mindspore securec mindspore::flatbuffers mindspore::ompi)
if (ENABLE_MPI)
target_link_libraries(mindspore securec mindspore::flatbuffers mpi_adapter)
else ()
target_link_libraries(mindspore securec mindspore::flatbuffers)
endif ()
if (NOT WIN32)
target_link_libraries(mindspore dl)
endif()
@ -210,6 +230,10 @@ else ()
target_link_libraries(_c_expression PRIVATE -Wl,--whole-archive mindspore -Wl,--no-whole-archive)
target_link_libraries(_c_expression PRIVATE mindspore::pybind11_module)
target_link_libraries(_c_expression PRIVATE mindspore_gvar)
target_link_libraries(_c_expression PRIVATE mindspore::pslite mindspore::protobuf ${zeromq_DIRPATH}/zmq_install/lib/libzmq.a)
if (${ENABLE_IBVERBS} STREQUAL "ON")
target_link_libraries(_c_expression PRIVATE ibverbs rdmacm)
endif()
endif ()
if (USE_GLOG)
@ -217,6 +241,7 @@ if (USE_GLOG)
endif ()
if (ENABLE_DUMP_PROTO)
message("add protobuf lib to c_expression")
target_link_libraries(_c_expression PRIVATE mindspore::protobuf)
endif ()
@ -256,10 +281,11 @@ endif ()
if (USE_GLOG)
target_link_libraries(inference PRIVATE mindspore::glog)
else()
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
target_link_options(inference PRIVATE -Wl,-init,mindspore_log_init)
elseif (CMAKE_SYSTEM_NAME MATCHES "Darwin")
set_target_properties(inference PROPERTIES MACOSX_RPATH ON)
endif ()
endif()
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
target_link_options(inference PRIVATE -Wl,-init,common_log_init)
elseif (CMAKE_SYSTEM_NAME MATCHES "Darwin")
set_target_properties(inference PROPERTIES MACOSX_RPATH ON)
endif ()

View File

@ -15,6 +15,7 @@
*/
#include "dataset/api/de_pipeline.h"
#include <algorithm>
#include <set>
#include <map>
@ -45,7 +46,7 @@
namespace mindspore {
namespace dataset {
using pFunction = Status (DEPipeline::*)(const py::dict &, std::shared_ptr<DatasetOp> *);
using pFunction = Status (DEPipeline::*)(const py::dict &, std::shared_ptr<DatasetOp> *, std::shared_ptr<DatasetOp> *);
static std::unordered_map<uint32_t, pFunction> g_parse_op_func_ = {
{kShuffle, &DEPipeline::ParseShuffleOp},
@ -107,18 +108,44 @@ DEPipeline::~DEPipeline() {
}
// Function to add a Node to the Execution Tree.
// Looks up the parse function registered for op_name in g_parse_op_func_, runs it on args, and
// stores the resulting operator pointers in *output under the keys "top" and "bottom".
// Returns an unexpected-status error if op_name has no registered parse function or if the
// parse function did not produce a top operator.
Status DEPipeline::AddNodeToTree(const OpName &op_name, const py::dict &args, DsOpPtr *out) {
// For each operator, Parse through the list of arguments,
// then call the respective builder/constructor.
Status DEPipeline::AddNodeToTree(const OpName &op_name, const py::dict &args, py::dict *output) {
// For each operator, parse through the list of arguments, then call the respective builder/constructor.
// Note that each call to the parse function may result in building more than one dataset operator.
// For example, one call to ParseNNNOp may result in multiple internal C nodes:
// nodeA
// |
// nodeB
// |
// nodeC
// However, the python side dataset is more abstract, and it does not know about the potential subtree that
// is being built here. Since the python api is hooking tree nodes together (parent/child hookups), the
// python side needs to know about nodeA and nodeC to be able to appropriately hook up parent and child
// to this subtree.
// Thus, it is required that both the top-most parent and bottom-most child are returned from the parse
// function.
DsOpPtr top = nullptr;
DsOpPtr bottom = nullptr;
auto iter = g_parse_op_func_.find(op_name);
if (iter != g_parse_op_func_.end()) {
pFunction func = iter->second;
RETURN_IF_NOT_OK((this->*func)(args, out));
RETURN_IF_NOT_OK((this->*func)(args, &top, &bottom));
if (top == nullptr) {
RETURN_STATUS_UNEXPECTED("An operator was parsed but it did not produce a C node.");
}
// It is not required that the parse function always produces the bottom pointer. If it's still null,
// then set top and bottom to be the same operator.
if (bottom == nullptr) bottom = top;
// Pack these pointers into a py dict so that we can return both back to python.
(*output)["top"] = top;
(*output)["bottom"] = bottom;
} else {
RETURN_STATUS_UNEXPECTED("No such Op");
}
// Associate current dataset op node with the tree.
// NOTE(review): some parse functions (e.g. ParseMapOp) already call tree_->AssociateNode on the
// operator they hand back as top -- confirm AssociateNode tolerates being called twice for one op.
RETURN_IF_NOT_OK(tree_->AssociateNode(*out));
RETURN_IF_NOT_OK(tree_->AssociateNode(top));
return Status::OK();
}
// Function to add a child and parent relationship.
@ -300,7 +327,8 @@ Status DEPipeline::SetBatchParameters(const py::dict &args) {
return Status::OK();
}
Status DEPipeline::ParseShuffleOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr) {
Status DEPipeline::ParseShuffleOp(const py::dict &args, std::shared_ptr<DatasetOp> *top,
std::shared_ptr<DatasetOp> *bottom) {
std::shared_ptr<ShuffleOp::Builder> builder = std::make_shared<ShuffleOp::Builder>();
if (!args["buffer_size"].is_none()) {
(void)builder->SetShuffleSize(ToInt(args["buffer_size"]));
@ -322,7 +350,7 @@ Status DEPipeline::ParseShuffleOp(const py::dict &args, std::shared_ptr<DatasetO
std::shared_ptr<ShuffleOp> op;
RETURN_IF_NOT_OK(builder->Build(&op));
*ptr = op;
*top = op;
return Status::OK();
}
@ -350,7 +378,8 @@ Status DEPipeline::BuildMindrecordSamplerChain(const py::handle &handle,
return Status::OK();
}
Status DEPipeline::ParseMindRecordOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr) {
Status DEPipeline::ParseMindRecordOp(const py::dict &args, std::shared_ptr<DatasetOp> *top,
std::shared_ptr<DatasetOp> *bottom) {
if (args["dataset_file"].is_none()) {
std::string err_msg = "Error: at least one of dataset_files is missing";
RETURN_STATUS_UNEXPECTED(err_msg);
@ -403,13 +432,15 @@ Status DEPipeline::ParseMindRecordOp(const py::dict &args, std::shared_ptr<Datas
std::shared_ptr<MindRecordOp> op;
RETURN_IF_NOT_OK(builder->Build(&op));
num_rows_ = op->num_rows();
*ptr = op;
*top = op;
return Status::OK();
}
Status DEPipeline::ParseMapOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr) {
std::shared_ptr<MapOp::Builder> builder = std::make_shared<MapOp::Builder>();
Status DEPipeline::ParseMapOp(const py::dict &args, std::shared_ptr<DatasetOp> *top,
std::shared_ptr<DatasetOp> *bottom) {
MapOp::Builder map_builder;
std::vector<std::shared_ptr<TensorOp>> tensor_op_list;
std::vector<std::string> project_columns;
if (args["operations"].is_none()) RETURN_STATUS_UNEXPECTED("Error: 'operations' is not set. \n");
@ -419,15 +450,15 @@ Status DEPipeline::ParseMapOp(const py::dict &args, std::shared_ptr<DatasetOp> *
if (!value.is_none()) {
if (key == "input_columns") {
std::vector<std::string> in_col_names = ToStringVector(args["input_columns"]);
(void)builder->SetInColNames(in_col_names);
(void)map_builder.SetInColNames(in_col_names);
} else if (key == "output_columns") {
(void)builder->SetOutColNames(ToStringVector(value));
(void)map_builder.SetOutColNames(ToStringVector(value));
} else if (key == "columns_order") {
(void)builder->SetColOrder(ToStringVector(value));
project_columns = ToStringVector(value);
} else if (key == "num_parallel_workers") {
(void)builder->SetNumWorkers(ToInt(value));
(void)map_builder.SetNumWorkers(ToInt(value));
} else if (key == "prefetch_size") {
(void)builder->SetOpConnectorSize(ToInt(value));
(void)map_builder.SetOpConnectorSize(ToInt(value));
} else if (key == "operations") {
py::handle tensor_ops = args["operations"];
// operation can be a list of TensorOps or a single TensorOp.
@ -445,20 +476,34 @@ Status DEPipeline::ParseMapOp(const py::dict &args, std::shared_ptr<DatasetOp> *
}
}
if (tensor_op_list.empty()) RETURN_STATUS_UNEXPECTED("Error: tensor_op is invalid or not set.");
(void)builder->SetTensorFuncs(std::move(tensor_op_list));
(void)map_builder.SetTensorFuncs(std::move(tensor_op_list));
} else {
RETURN_STATUS_UNEXPECTED("Error: Unhandled key: " + key);
}
}
}
std::shared_ptr<MapOp> op;
RETURN_IF_NOT_OK(builder->Build(&op));
*ptr = op;
std::shared_ptr<MapOp> map_op;
RETURN_IF_NOT_OK(map_builder.Build(&map_op));
RETURN_IF_NOT_OK(tree_->AssociateNode(map_op));
*top = map_op;
// Add a project op over top of the map if the user wanted to reposition the columns
if (!project_columns.empty()) {
ProjectOp::Builder proj_builder(project_columns);
std::shared_ptr<ProjectOp> proj_op;
RETURN_IF_NOT_OK(proj_builder.Build(&proj_op));
RETURN_IF_NOT_OK(tree_->AssociateNode(proj_op));
RETURN_IF_NOT_OK(proj_op->AddChild(map_op));
*top = proj_op;
*bottom = map_op;
}
return Status::OK();
}
Status DEPipeline::ParseFilterOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr) {
Status DEPipeline::ParseFilterOp(const py::dict &args, std::shared_ptr<DatasetOp> *top,
std::shared_ptr<DatasetOp> *bottom) {
std::shared_ptr<FilterOp::Builder> builder = std::make_shared<FilterOp::Builder>();
if (args["predicate"].is_none()) {
@ -489,11 +534,12 @@ Status DEPipeline::ParseFilterOp(const py::dict &args, std::shared_ptr<DatasetOp
std::shared_ptr<FilterOp> op;
RETURN_IF_NOT_OK(builder->Build(&op));
*ptr = op;
*top = op;
return Status::OK();
}
Status DEPipeline::ParseRepeatOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr) {
Status DEPipeline::ParseRepeatOp(const py::dict &args, std::shared_ptr<DatasetOp> *top,
std::shared_ptr<DatasetOp> *bottom) {
if (args["count"].is_none()) {
std::string err_msg = "Error: count is invalid or not set.";
RETURN_STATUS_UNEXPECTED(err_msg);
@ -501,22 +547,24 @@ Status DEPipeline::ParseRepeatOp(const py::dict &args, std::shared_ptr<DatasetOp
repeat_num_ = ToInt(args["count"]);
std::shared_ptr<RepeatOp> op;
RETURN_IF_NOT_OK(RepeatOp::Builder(ToInt(args["count"])).Build(&op));
*ptr = op;
*top = op;
return Status::OK();
}
// Builds a SkipOp from the python argument dict.
// args["count"] is required: when it is None an unexpected-status error is returned.
// The built operator is written to the output pointer; `bottom` is not assigned by this function.
Status DEPipeline::ParseSkipOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr) {
Status DEPipeline::ParseSkipOp(const py::dict &args, std::shared_ptr<DatasetOp> *top,
std::shared_ptr<DatasetOp> *bottom) {
if (args["count"].is_none()) {
std::string err_msg = "Error: count is invalid or not set.";
RETURN_STATUS_UNEXPECTED(err_msg);
}
std::shared_ptr<SkipOp> op;
// The count is converted with ToInt and handed to the SkipOp builder's constructor.
RETURN_IF_NOT_OK(SkipOp::Builder(ToInt(args["count"])).Build(&op));
*ptr = op;
*top = op;
return Status::OK();
}
Status DEPipeline::ParseGeneratorOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr) {
Status DEPipeline::ParseGeneratorOp(const py::dict &args, std::shared_ptr<DatasetOp> *top,
std::shared_ptr<DatasetOp> *bottom) {
std::shared_ptr<GeneratorOp::Builder> builder = std::make_shared<GeneratorOp::Builder>();
for (auto arg : args) {
std::string key = py::str(arg.first);
@ -538,11 +586,12 @@ Status DEPipeline::ParseGeneratorOp(const py::dict &args, std::shared_ptr<Datase
}
std::shared_ptr<GeneratorOp> op;
RETURN_IF_NOT_OK(builder->Build(&op));
*ptr = op;
*top = op;
return Status::OK();
}
Status DEPipeline::ParseBatchOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr) {
Status DEPipeline::ParseBatchOp(const py::dict &args, std::shared_ptr<DatasetOp> *top,
std::shared_ptr<DatasetOp> *bottom) {
std::shared_ptr<BatchOp::Builder> builder;
if (py::isinstance<py::int_>(args["batch_size"])) {
batch_size_ = ToInt(args["batch_size"]);
@ -582,11 +631,12 @@ Status DEPipeline::ParseBatchOp(const py::dict &args, std::shared_ptr<DatasetOp>
std::shared_ptr<BatchOp> op;
RETURN_IF_NOT_OK(builder->Build(&op));
*ptr = op;
*top = op;
return Status::OK();
}
Status DEPipeline::ParseBucketBatchByLengthOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr) {
Status DEPipeline::ParseBucketBatchByLengthOp(const py::dict &args, std::shared_ptr<DatasetOp> *top,
std::shared_ptr<DatasetOp> *bottom) {
std::vector<std::string> mandatory_arguments = {"length_dependent_columns", "bucket_boundaries",
"bucket_batch_sizes"};
for (auto name : mandatory_arguments) {
@ -632,11 +682,12 @@ Status DEPipeline::ParseBucketBatchByLengthOp(const py::dict &args, std::shared_
std::shared_ptr<BucketBatchByLengthOp> op;
RETURN_IF_NOT_OK(builder->Build(&op));
*ptr = op;
*top = op;
return Status::OK();
}
Status DEPipeline::ParseBarrierOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr) {
Status DEPipeline::ParseBarrierOp(const py::dict &args, std::shared_ptr<DatasetOp> *top,
std::shared_ptr<DatasetOp> *bottom) {
std::shared_ptr<BarrierOp::Builder> builder = std::make_shared<BarrierOp::Builder>();
// Right now barrier should only take num_rows_per_buffer = 1
// The reason for this is because having it otherwise can lead to blocking issues
@ -656,11 +707,12 @@ Status DEPipeline::ParseBarrierOp(const py::dict &args, std::shared_ptr<DatasetO
std::shared_ptr<BarrierOp> op;
RETURN_IF_NOT_OK(builder->Build(&op));
*ptr = op;
*top = op;
return Status::OK();
}
Status DEPipeline::ParseDeviceQueueOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr) {
Status DEPipeline::ParseDeviceQueueOp(const py::dict &args, std::shared_ptr<DatasetOp> *top,
std::shared_ptr<DatasetOp> *bottom) {
int32_t prefetch_size = 0;
if (args.contains("prefetch_size")) {
if (args["prefetch_size"].is_none()) {
@ -687,11 +739,12 @@ Status DEPipeline::ParseDeviceQueueOp(const py::dict &args, std::shared_ptr<Data
}
std::shared_ptr<DeviceQueueOp> op;
RETURN_IF_NOT_OK(builder->Build(&op));
*ptr = op;
*top = op;
return Status::OK();
}
Status DEPipeline::ParseRenameOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr) {
Status DEPipeline::ParseRenameOp(const py::dict &args, std::shared_ptr<DatasetOp> *top,
std::shared_ptr<DatasetOp> *bottom) {
std::vector<std::string> in_col_names;
std::vector<std::string> out_col_names;
std::shared_ptr<RenameOp::Builder> builder = std::make_shared<RenameOp::Builder>();
@ -718,48 +771,57 @@ Status DEPipeline::ParseRenameOp(const py::dict &args, std::shared_ptr<DatasetOp
(void)builder->SetOutColNames(out_col_names);
std::shared_ptr<RenameOp> op;
RETURN_IF_NOT_OK(builder->Build(&op));
*ptr = op;
*top = op;
return Status::OK();
}
// Builds a TakeOp from the python argument dict.
// args["count"] is required: when it is None an unexpected-status error is returned.
// The built operator is written to the output pointer; `bottom` is not assigned by this function.
Status DEPipeline::ParseTakeOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr) {
Status DEPipeline::ParseTakeOp(const py::dict &args, std::shared_ptr<DatasetOp> *top,
std::shared_ptr<DatasetOp> *bottom) {
if (args["count"].is_none()) {
std::string err_msg = "Error: count is invalid or not set.";
RETURN_STATUS_UNEXPECTED(err_msg);
}
std::shared_ptr<TakeOp> op;
// The count is converted with ToInt and handed to the TakeOp builder's constructor.
RETURN_IF_NOT_OK(TakeOp::Builder(ToInt(args["count"])).Build(&op));
*ptr = op;
*top = op;
return Status::OK();
}
// Builds a ZipOp with a default-constructed builder; `args` is not read by this function.
// The built operator is written to the output pointer; `bottom` is not assigned by this function.
Status DEPipeline::ParseZipOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr) {
Status DEPipeline::ParseZipOp(const py::dict &args, std::shared_ptr<DatasetOp> *top,
std::shared_ptr<DatasetOp> *bottom) {
std::shared_ptr<ZipOp::Builder> builder = std::make_shared<ZipOp::Builder>();
std::shared_ptr<ZipOp> op;
RETURN_IF_NOT_OK(builder->Build(&op));
*ptr = op;
*top = op;
return Status::OK();
}
// Builds a ConcatOp with a default-constructed builder; `args` is not read by this function.
// The built operator is written to the output pointer; `bottom` is not assigned by this function.
Status DEPipeline::ParseConcatOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr) {
Status DEPipeline::ParseConcatOp(const py::dict &args, std::shared_ptr<DatasetOp> *top,
std::shared_ptr<DatasetOp> *bottom) {
std::shared_ptr<ConcatOp::Builder> builder = std::make_shared<ConcatOp::Builder>();
std::shared_ptr<ConcatOp> op;
RETURN_IF_NOT_OK(builder->Build(&op));
*ptr = op;
*top = op;
return Status::OK();
}
Status DEPipeline::ParseTFReaderOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr) {
Status DEPipeline::ParseTFReaderOp(const py::dict &args, std::shared_ptr<DatasetOp> *top,
std::shared_ptr<DatasetOp> *bottom) {
// Required arguments
std::vector<std::string> files_list;
std::shared_ptr<TFReaderOp::Builder> builder = std::make_shared<TFReaderOp::Builder>();
if (!args["dataset_files"].is_none()) {
(void)builder->SetDatasetFilesList(ToStringVector(args["dataset_files"]));
files_list = ToStringVector(args["dataset_files"]);
(void)builder->SetDatasetFilesList(files_list);
} else {
std::string err_msg = "Error: at least one of dataset_files or schema_file is missing";
RETURN_STATUS_UNEXPECTED(err_msg);
}
std::vector<std::string> columns_to_load;
bool schema_exists = false;
bool shuffle_required = false;
int64_t num_devices = 0;
int64_t total_rows = 0;
// Optional arguments
for (auto arg : args) {
std::string key = py::str(arg.first);
@ -773,13 +835,15 @@ Status DEPipeline::ParseTFReaderOp(const py::dict &args, std::shared_ptr<Dataset
} else if (key == "shuffle_files") {
(void)builder->SetShuffleFiles(ToBool(value));
} else if (key == "shuffle_global") {
(void)builder->SetShuffleGlobal(ToBool(value));
shuffle_required = ToBool(value);
} else if (key == "schema_file_path" || key == "schema_json_string") {
schema_exists = true;
} else if (key == "num_samples") {
(void)builder->setTotalRows(ToInt(value));
total_rows = ToInt(value);
(void)builder->setTotalRows(total_rows);
} else if (key == "num_shards") {
(void)builder->SetNumDevices(ToInt(value));
num_devices = ToInt(value);
(void)builder->SetNumDevices(num_devices);
} else if (key == "shard_id") {
(void)builder->SetDeviceId(ToInt(value));
} else if (key == "shard_equal_rows") {
@ -796,13 +860,33 @@ Status DEPipeline::ParseTFReaderOp(const py::dict &args, std::shared_ptr<Dataset
}
(void)builder->SetDataSchema(std::move(schema));
}
std::shared_ptr<TFReaderOp> op;
RETURN_IF_NOT_OK(builder->Build(&op));
*ptr = op;
std::shared_ptr<TFReaderOp> tf_op;
RETURN_IF_NOT_OK(builder->Build(&tf_op));
RETURN_IF_NOT_OK(tree_->AssociateNode(tf_op));
*top = tf_op;
if (shuffle_required) {
const boolean estimate = true;
const int64_t workers = 8;
std::shared_ptr<DatasetOp> shuffle_op = nullptr;
int64_t shuffle_size = 0;
int64_t num_rows = 0;
// First, get the number of rows in the dataset via estimate and then compute the shuffle size
RETURN_IF_NOT_OK(TFReaderOp::CountTotalRows(&num_rows, files_list, workers, estimate));
RETURN_IF_NOT_OK(ComputeShuffleSize(files_list.size(), num_devices, num_rows, total_rows, &shuffle_size));
// Add the shuffle op over top of this op and return the subtree (top/bottom) to caller
RETURN_IF_NOT_OK(AddShuffleOp(shuffle_size, tf_op, &shuffle_op));
*top = shuffle_op;
*bottom = tf_op;
}
return Status::OK();
}
Status DEPipeline::ParseProjectOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr) {
Status DEPipeline::ParseProjectOp(const py::dict &args, std::shared_ptr<DatasetOp> *top,
std::shared_ptr<DatasetOp> *bottom) {
if (args["columns"].is_none()) {
std::string err_msg = "Error: columns is missing";
RETURN_STATUS_UNEXPECTED(err_msg);
@ -811,11 +895,12 @@ Status DEPipeline::ParseProjectOp(const py::dict &args, std::shared_ptr<DatasetO
std::shared_ptr<ProjectOp::Builder> builder = std::make_shared<ProjectOp::Builder>(columns_to_project);
std::shared_ptr<ProjectOp> op;
RETURN_IF_NOT_OK(builder->Build(&op));
*ptr = op;
*top = op;
return Status::OK();
}
Status DEPipeline::ParseImageFolderOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr) {
Status DEPipeline::ParseImageFolderOp(const py::dict &args, std::shared_ptr<DatasetOp> *top,
std::shared_ptr<DatasetOp> *bottom) {
// Required arguments
if (args["dataset_dir"].is_none()) {
std::string err_msg = "Error: No dataset path specified";
@ -846,11 +931,12 @@ Status DEPipeline::ParseImageFolderOp(const py::dict &args, std::shared_ptr<Data
}
std::shared_ptr<ImageFolderOp> op;
RETURN_IF_NOT_OK(builder->Build(&op));
*ptr = op;
*top = op;
return Status::OK();
}
Status DEPipeline::ParseManifestOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr) {
Status DEPipeline::ParseManifestOp(const py::dict &args, std::shared_ptr<DatasetOp> *top,
std::shared_ptr<DatasetOp> *bottom) {
// Required arguments
if (args["dataset_file"].is_none()) {
std::string err_msg = "Error: No dataset files specified for manifest";
@ -881,11 +967,12 @@ Status DEPipeline::ParseManifestOp(const py::dict &args, std::shared_ptr<Dataset
}
std::shared_ptr<ManifestOp> op;
RETURN_IF_NOT_OK(builder->Build(&op));
*ptr = op;
*top = op;
return Status::OK();
}
Status DEPipeline::ParseVOCOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr) {
Status DEPipeline::ParseVOCOp(const py::dict &args, std::shared_ptr<DatasetOp> *top,
std::shared_ptr<DatasetOp> *bottom) {
if (args["dataset_dir"].is_none()) {
std::string err_msg = "Error: No dataset path specified";
RETURN_STATUS_UNEXPECTED(err_msg);
@ -924,11 +1011,13 @@ Status DEPipeline::ParseVOCOp(const py::dict &args, std::shared_ptr<DatasetOp> *
}
std::shared_ptr<VOCOp> op;
RETURN_IF_NOT_OK(builder->Build(&op));
*ptr = op;
*top = op;
return Status::OK();
}
Status DEPipeline::ParseCocoOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr) {
Status DEPipeline::ParseCocoOp(const py::dict &args, std::shared_ptr<DatasetOp> *top,
std::shared_ptr<DatasetOp> *bottom) {
if (args["dataset_dir"].is_none()) {
std::string err_msg = "Error: No dataset path specified";
RETURN_STATUS_UNEXPECTED(err_msg);
@ -965,11 +1054,12 @@ Status DEPipeline::ParseCocoOp(const py::dict &args, std::shared_ptr<DatasetOp>
}
std::shared_ptr<CocoOp> op;
RETURN_IF_NOT_OK(builder->Build(&op));
*ptr = op;
*top = op;
return Status::OK();
}
Status DEPipeline::ParseCifar10Op(const py::dict &args, std::shared_ptr<DatasetOp> *ptr) {
Status DEPipeline::ParseCifar10Op(const py::dict &args, std::shared_ptr<DatasetOp> *top,
std::shared_ptr<DatasetOp> *bottom) {
// Required arguments
if (args["dataset_dir"].is_none()) {
std::string err_msg = "Error: No dataset path specified";
@ -998,11 +1088,12 @@ Status DEPipeline::ParseCifar10Op(const py::dict &args, std::shared_ptr<DatasetO
std::shared_ptr<CifarOp> op;
RETURN_IF_NOT_OK(builder->Build(&op));
*ptr = op;
*top = op;
return Status::OK();
}
Status DEPipeline::ParseCifar100Op(const py::dict &args, std::shared_ptr<DatasetOp> *ptr) {
Status DEPipeline::ParseCifar100Op(const py::dict &args, std::shared_ptr<DatasetOp> *top,
std::shared_ptr<DatasetOp> *bottom) {
// Required arguments
if (args["dataset_dir"].is_none()) {
std::string err_msg = "Error: No dataset path specified";
@ -1031,11 +1122,12 @@ Status DEPipeline::ParseCifar100Op(const py::dict &args, std::shared_ptr<Dataset
std::shared_ptr<CifarOp> op;
RETURN_IF_NOT_OK(builder->Build(&op));
*ptr = op;
*top = op;
return Status::OK();
}
Status DEPipeline::ParseRandomDataOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr) {
Status DEPipeline::ParseRandomDataOp(const py::dict &args, std::shared_ptr<DatasetOp> *top,
std::shared_ptr<DatasetOp> *bottom) {
// Required arguments
RandomDataOp::Builder builder;
@ -1072,13 +1164,14 @@ Status DEPipeline::ParseRandomDataOp(const py::dict &args, std::shared_ptr<Datas
}
std::shared_ptr<RandomDataOp> op;
RETURN_IF_NOT_OK(builder.Build(&op));
*ptr = op;
*top = op;
return Status::OK();
}
int32_t DEPipeline::GetNumClasses() const { return num_classes_; }
Status DEPipeline::ParseMnistOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr) {
Status DEPipeline::ParseMnistOp(const py::dict &args, std::shared_ptr<DatasetOp> *top,
std::shared_ptr<DatasetOp> *bottom) {
// Required arguments
if (args["dataset_dir"].is_none()) {
std::string err_msg = "Error: No dataset path specified";
@ -1104,11 +1197,12 @@ Status DEPipeline::ParseMnistOp(const py::dict &args, std::shared_ptr<DatasetOp>
}
std::shared_ptr<MnistOp> op;
RETURN_IF_NOT_OK(builder->Build(&op));
*ptr = op;
*top = op;
return Status::OK();
}
Status DEPipeline::ParseCelebAOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr) {
Status DEPipeline::ParseCelebAOp(const py::dict &args, std::shared_ptr<DatasetOp> *top,
std::shared_ptr<DatasetOp> *bottom) {
// Required arguments
if (args["dataset_dir"].is_none()) {
std::string err_msg = "Error: No dataset path specified";
@ -1143,19 +1237,24 @@ Status DEPipeline::ParseCelebAOp(const py::dict &args, std::shared_ptr<DatasetOp
std::shared_ptr<CelebAOp> op;
RETURN_IF_NOT_OK(builder->Build(&op));
*ptr = op;
*top = op;
return Status::OK();
}
Status DEPipeline::ParseTextFileOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr) {
Status DEPipeline::ParseTextFileOp(const py::dict &args, std::shared_ptr<DatasetOp> *top,
std::shared_ptr<DatasetOp> *bottom) {
// Required arguments
std::vector<std::string> files_list;
std::shared_ptr<TextFileOp::Builder> builder = std::make_shared<TextFileOp::Builder>();
if (!args["dataset_files"].is_none()) {
(void)builder->SetTextFilesList(ToStringVector(args["dataset_files"]));
files_list = ToStringVector(args["dataset_files"]);
(void)builder->SetTextFilesList(files_list);
} else {
RETURN_STATUS_UNEXPECTED("Error: dataset_files is missing");
}
// Optional arguments
bool shuffle_required = false;
int64_t num_devices = 0;
for (auto arg : args) {
std::string key = py::str(arg.first);
py::handle value = arg.second;
@ -1165,19 +1264,38 @@ Status DEPipeline::ParseTextFileOp(const py::dict &args, std::shared_ptr<Dataset
} else if (key == "shuffle_files") {
(void)builder->SetShuffleFiles(ToBool(value));
} else if (key == "shuffle_global") {
(void)builder->SetShuffleGlobal(ToBool(value));
shuffle_required = ToBool(value);
} else if (key == "num_samples") {
(void)builder->SetTotalRows(ToInt(value));
} else if (key == "num_shards") {
(void)builder->SetNumDevices(ToInt(value));
num_devices = ToInt(value);
(void)builder->SetNumDevices(num_devices);
} else if (key == "shard_id") {
(void)builder->SetDeviceId(ToInt(value));
}
}
}
std::shared_ptr<TextFileOp> op;
RETURN_IF_NOT_OK(builder->Build(&op));
*ptr = op;
std::shared_ptr<TextFileOp> txt_op;
RETURN_IF_NOT_OK(builder->Build(&txt_op));
RETURN_IF_NOT_OK(tree_->AssociateNode(txt_op));
*top = txt_op;
if (shuffle_required) {
std::shared_ptr<DatasetOp> shuffle_op = nullptr;
int64_t shuffle_size = 0;
int64_t num_rows = 0;
// First, get the number of rows in the dataset and then compute the shuffle size
RETURN_IF_NOT_OK(TextFileOp::CountAllFileRows(files_list, &num_rows));
RETURN_IF_NOT_OK(ComputeShuffleSize(files_list.size(), num_devices, num_rows, 0, &shuffle_size));
// Add the shuffle op over top of this op and return the subtree (top/bottom) to caller
RETURN_IF_NOT_OK(AddShuffleOp(shuffle_size, txt_op, &shuffle_op));
*top = shuffle_op;
*bottom = txt_op;
}
return Status::OK();
}
@ -1208,7 +1326,8 @@ Status DEPipeline::ParsePadInfo(py::handle value, PadInfo *pad_info) {
return Status::OK();
}
Status DEPipeline::ParseBuildVocabOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr) {
Status DEPipeline::ParseBuildVocabOp(const py::dict &args, std::shared_ptr<DatasetOp> *top,
std::shared_ptr<DatasetOp> *bottom) {
std::shared_ptr<BuildVocabOp::Builder> builder = std::make_shared<BuildVocabOp::Builder>();
for (auto arg : args) {
std::string key = py::str(arg.first);
@ -1235,18 +1354,23 @@ Status DEPipeline::ParseBuildVocabOp(const py::dict &args, std::shared_ptr<Datas
}
std::shared_ptr<BuildVocabOp> op;
RETURN_IF_NOT_OK(builder->Build(&op));
*ptr = op;
*top = op;
return Status::OK();
}
Status DEPipeline::ParseClueOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr) {
Status DEPipeline::ParseClueOp(const py::dict &args, std::shared_ptr<DatasetOp> *top,
std::shared_ptr<DatasetOp> *bottom) {
std::vector<std::string> files_list;
std::shared_ptr<ClueOp::Builder> builder = std::make_shared<ClueOp::Builder>();
if (!args["dataset_files"].is_none()) {
(void)builder->SetClueFilesList(ToStringVector(args["dataset_files"]));
files_list = ToStringVector(args["dataset_files"]);
(void)builder->SetClueFilesList(files_list);
} else {
RETURN_STATUS_UNEXPECTED("Error: dataset_files is missing");
}
// Optional arguments
bool shuffle_required = false;
int64_t num_devices = 0;
for (auto arg : args) {
std::string key = py::str(arg.first);
py::handle value = arg.second;
@ -1256,11 +1380,12 @@ Status DEPipeline::ParseClueOp(const py::dict &args, std::shared_ptr<DatasetOp>
} else if (key == "shuffle_files") {
(void)builder->SetShuffleFiles(ToBool(value));
} else if (key == "shuffle_global") {
(void)builder->SetShuffleGlobal(ToBool(value));
shuffle_required = ToBool(value);
} else if (key == "num_samples") {
(void)builder->SetNumSamples(ToInt(value));
} else if (key == "num_shards") {
(void)builder->SetNumDevices(ToInt(value));
num_devices = ToInt(value);
(void)builder->SetNumDevices(num_devices);
} else if (key == "shard_id") {
(void)builder->SetDeviceId(ToInt(value));
} else if (key == "cols_to_keyword") {
@ -1276,9 +1401,76 @@ Status DEPipeline::ParseClueOp(const py::dict &args, std::shared_ptr<DatasetOp>
}
}
}
std::shared_ptr<ClueOp> op;
RETURN_IF_NOT_OK(builder->Build(&op));
*ptr = op;
std::shared_ptr<ClueOp> clue_op;
RETURN_IF_NOT_OK(builder->Build(&clue_op));
RETURN_IF_NOT_OK(tree_->AssociateNode(clue_op));
*top = clue_op;
if (shuffle_required) {
std::shared_ptr<DatasetOp> shuffle_op = nullptr;
int64_t shuffle_size = 0;
int64_t num_rows = 0;
// First, get the number of rows in the dataset and then compute the shuffle size
RETURN_IF_NOT_OK(ClueOp::CountAllFileRows(files_list, &num_rows));
RETURN_IF_NOT_OK(ComputeShuffleSize(files_list.size(), num_devices, num_rows, 0, &shuffle_size));
// Add the shuffle op over top of this op and return the subtree (top/bottom) to caller
RETURN_IF_NOT_OK(AddShuffleOp(shuffle_size, clue_op, &shuffle_op));
*top = shuffle_op;
*bottom = clue_op;
}
return Status::OK();
}
// Helper that stacks a new ShuffleOp on top of the operator currently being built.
//
// shuffle_size - value passed to the shuffle builder via SetShuffleSize.
// input_op     - operator that becomes the child of the new ShuffleOp.
// shuffle_op   - output: receives the newly built ShuffleOp.
//
// Any failure from building the op, associating it with the tree, or attaching the child is
// returned to the caller as-is.
Status DEPipeline::AddShuffleOp(int64_t shuffle_size, std::shared_ptr<DatasetOp> input_op,
                                std::shared_ptr<DatasetOp> *shuffle_op) {
  std::shared_ptr<ShuffleOp> shuffle_node = nullptr;
  ShuffleOp::Builder builder;
  (void)builder.SetShuffleSize(shuffle_size);
  RETURN_IF_NOT_OK(builder.Build(&shuffle_node));

  // Register the new op with the tree first, then hang the input beneath it:
  //
  //   ShuffleOp
  //       |
  //   input_op
  //
  RETURN_IF_NOT_OK(tree_->AssociateNode(shuffle_node));
  RETURN_IF_NOT_OK(shuffle_node->AddChild(input_op));

  *shuffle_op = shuffle_node;
  return Status::OK();
}
// Common code for computing a default shuffle size.
//
// num_files    - number of dataset files; must be greater than 0 because it is used as a divisor.
// num_devices  - shard count; when > 0 the row count is divided across shards, rounding up.
// num_rows     - number of rows in the whole dataset.
// total_rows   - optional cap on the row count; ops without such a directive pass 0 (no cap).
// shuffle_size - output: the computed shuffle size.
//
// Returns an unexpected-status error when num_files is not positive; Status::OK() otherwise.
Status DEPipeline::ComputeShuffleSize(int64_t num_files, int64_t num_devices, int64_t num_rows, int64_t total_rows,
                                      int64_t *shuffle_size) {
  const int64_t average_files_multiplier = 4;
  // NOTE(review): despite its name, this value acts as a lower bound on the result because it is
  // combined with std::max below -- confirm that a floor (not a cap) is what is intended.
  const int64_t shuffle_max = 10000;

  // An empty file list would otherwise cause an integer division by zero below.
  if (num_files <= 0) {
    RETURN_STATUS_UNEXPECTED("Error: cannot compute shuffle size, the number of dataset files must be greater than 0.");
  }

  // Adjust the num rows per shard if sharding was given (round up on an uneven split)
  if (num_devices > 0) {
    if (num_rows % num_devices == 0) {
      num_rows = num_rows / num_devices;
    } else {
      num_rows = (num_rows / num_devices) + 1;
    }
  }

  // Cap based on total rows directive. Some ops do not have this and give value of 0.
  if (total_rows > 0) {
    num_rows = std::min(num_rows, total_rows);
  }

  // get the average per file
  const int64_t avg_rows_per_file = num_rows / num_files;

  *shuffle_size = std::max(avg_rows_per_file * average_files_multiplier, shuffle_max);
  return Status::OK();
}
} // namespace dataset

View File

@ -77,7 +77,7 @@ class DEPipeline {
~DEPipeline();
// Function to add a Node to the Execution Tree.
Status AddNodeToTree(const OpName &op_name, const py::dict &args, DsOpPtr *out);
Status AddNodeToTree(const OpName &op_name, const py::dict &args, py::dict *output);
// Function to add a child and parent relationship.
static Status AddChildToParentNode(const DsOpPtr &child_op, const DsOpPtr &parent_op);
@ -104,73 +104,74 @@ class DEPipeline {
int GetRepeatCount() const;
Status ParseShuffleOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr);
Status ParseShuffleOp(const py::dict &args, std::shared_ptr<DatasetOp> *top, std::shared_ptr<DatasetOp> *bottom);
Status ParseMindRecordOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr);
Status ParseMindRecordOp(const py::dict &args, std::shared_ptr<DatasetOp> *top, std::shared_ptr<DatasetOp> *bottom);
Status BuildMindrecordSamplerChain(const py::handle &handle,
std::vector<std::shared_ptr<mindrecord::ShardOperator>> *operators,
int num_padded);
Status ParseMapOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr);
Status ParseMapOp(const py::dict &args, std::shared_ptr<DatasetOp> *top, std::shared_ptr<DatasetOp> *bottom);
Status ParseFilterOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr);
Status ParseFilterOp(const py::dict &args, std::shared_ptr<DatasetOp> *top, std::shared_ptr<DatasetOp> *bottom);
Status ParseRepeatOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr);
Status ParseRepeatOp(const py::dict &args, std::shared_ptr<DatasetOp> *top, std::shared_ptr<DatasetOp> *bottom);
Status ParseSkipOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr);
Status ParseSkipOp(const py::dict &args, std::shared_ptr<DatasetOp> *top, std::shared_ptr<DatasetOp> *bottom);
Status ParseBatchOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr);
Status ParseBatchOp(const py::dict &args, std::shared_ptr<DatasetOp> *top, std::shared_ptr<DatasetOp> *bottom);
Status ParseBucketBatchByLengthOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr);
Status ParseBucketBatchByLengthOp(const py::dict &args, std::shared_ptr<DatasetOp> *top,
std::shared_ptr<DatasetOp> *bottom);
Status ParseBarrierOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr);
Status ParseBarrierOp(const py::dict &args, std::shared_ptr<DatasetOp> *top, std::shared_ptr<DatasetOp> *bottom);
Status ParseGeneratorOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr);
Status ParseGeneratorOp(const py::dict &args, std::shared_ptr<DatasetOp> *top, std::shared_ptr<DatasetOp> *bottom);
Status ParseRenameOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr);
Status ParseRenameOp(const py::dict &args, std::shared_ptr<DatasetOp> *top, std::shared_ptr<DatasetOp> *bottom);
Status ParseTakeOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr);
Status ParseTakeOp(const py::dict &args, std::shared_ptr<DatasetOp> *top, std::shared_ptr<DatasetOp> *bottom);
Status ParseZipOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr);
Status ParseZipOp(const py::dict &args, std::shared_ptr<DatasetOp> *top, std::shared_ptr<DatasetOp> *bottom);
Status ParseConcatOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr);
Status ParseConcatOp(const py::dict &args, std::shared_ptr<DatasetOp> *top, std::shared_ptr<DatasetOp> *bottom);
Status ParseDeviceQueueOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr);
Status ParseDeviceQueueOp(const py::dict &args, std::shared_ptr<DatasetOp> *top, std::shared_ptr<DatasetOp> *bottom);
Status ParseTFReaderOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr);
Status ParseTFReaderOp(const py::dict &args, std::shared_ptr<DatasetOp> *top, std::shared_ptr<DatasetOp> *bottom);
Status ParseProjectOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr);
Status ParseProjectOp(const py::dict &args, std::shared_ptr<DatasetOp> *top, std::shared_ptr<DatasetOp> *bottom);
Status ParseImageFolderOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr);
Status ParseImageFolderOp(const py::dict &args, std::shared_ptr<DatasetOp> *top, std::shared_ptr<DatasetOp> *bottom);
Status ParseManifestOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr);
Status ParseManifestOp(const py::dict &args, std::shared_ptr<DatasetOp> *top, std::shared_ptr<DatasetOp> *bottom);
Status ParseVOCOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr);
Status ParseVOCOp(const py::dict &args, std::shared_ptr<DatasetOp> *top, std::shared_ptr<DatasetOp> *bottom);
Status ParseCocoOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr);
Status ParseCocoOp(const py::dict &args, std::shared_ptr<DatasetOp> *top, std::shared_ptr<DatasetOp> *bottom);
Status ParseCifar10Op(const py::dict &args, std::shared_ptr<DatasetOp> *ptr);
Status ParseCifar10Op(const py::dict &args, std::shared_ptr<DatasetOp> *top, std::shared_ptr<DatasetOp> *bottom);
Status ParseCifar100Op(const py::dict &args, std::shared_ptr<DatasetOp> *ptr);
Status ParseCifar100Op(const py::dict &args, std::shared_ptr<DatasetOp> *top, std::shared_ptr<DatasetOp> *bottom);
Status ParseRandomDataOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr);
Status ParseRandomDataOp(const py::dict &args, std::shared_ptr<DatasetOp> *top, std::shared_ptr<DatasetOp> *bottom);
void PrintTree();
int32_t GetNumClasses() const;
Status ParseMnistOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr);
Status ParseMnistOp(const py::dict &args, std::shared_ptr<DatasetOp> *top, std::shared_ptr<DatasetOp> *bottom);
Status SetBatchParameters(const py::dict &args);
Status ParseCelebAOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr);
Status ParseCelebAOp(const py::dict &args, std::shared_ptr<DatasetOp> *top, std::shared_ptr<DatasetOp> *bottom);
Status ParseTextFileOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr);
Status ParseTextFileOp(const py::dict &args, std::shared_ptr<DatasetOp> *top, std::shared_ptr<DatasetOp> *bottom);
Status ParseBuildVocabOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr);
Status ParseBuildVocabOp(const py::dict &args, std::shared_ptr<DatasetOp> *top, std::shared_ptr<DatasetOp> *bottom);
Status ParseClueOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr);
Status ParseClueOp(const py::dict &args, std::shared_ptr<DatasetOp> *top, std::shared_ptr<DatasetOp> *bottom);
private:
// Execution tree that links the dataset operators.
@ -180,6 +181,25 @@ class DEPipeline {
static Status ParsePadInfo(py::handle value, PadInfo *pad_info);
/// \brief Helper function to inject a shuffle operator over top of the current operation being built.
/// \param[in] shuffle_size The size to use in the shuffle buffer
/// \param[in] input_op The operator to build shuffle on top of
/// \param[out] shuffle_op The top node of the created subtree (subtree contains two nodes). In this case it will be
/// the shuffle operator
/// \return Status return code
Status AddShuffleOp(int64_t shuffle_size, std::shared_ptr<DatasetOp> input_op,
std::shared_ptr<DatasetOp> *shuffle_op);
/// \brief Helper function to compute the shuffle size
/// \param[in] num_files The number of files in the dataset
/// \param[in] num_devices The number of devices in the dataset
/// \param[in] num_rows The number of rows in the dataset
/// \param[in] total_rows An upper bound on the total rows in the dataset
/// \param[out] shuffle_size The resultant computed shuffle size
/// \return Status return code
Status ComputeShuffleSize(int64_t num_files, int64_t num_devices, int64_t num_rows, int64_t total_rows,
int64_t *shuffle_size);
int batch_size_;
int repeat_num_;
int num_rows_;

View File

@ -63,12 +63,14 @@
#include "dataset/kernels/image/random_horizontal_flip_bbox_op.h"
#include "dataset/kernels/image/random_horizontal_flip_op.h"
#include "dataset/kernels/image/random_resize_op.h"
#include "dataset/kernels/image/random_resize_with_bbox_op.h"
#include "dataset/kernels/image/random_rotation_op.h"
#include "dataset/kernels/image/random_vertical_flip_op.h"
#include "dataset/kernels/image/random_vertical_flip_with_bbox_op.h"
#include "dataset/kernels/image/rescale_op.h"
#include "dataset/kernels/image/resize_bilinear_op.h"
#include "dataset/kernels/image/resize_op.h"
#include "dataset/kernels/image/resize_with_bbox_op.h"
#include "dataset/kernels/image/uniform_aug_op.h"
#include "dataset/kernels/no_op.h"
#include "dataset/text/kernels/jieba_tokenizer_op.h"
@ -116,9 +118,9 @@ void bindDEPipeline(py::module *m) {
.def(
"AddNodeToTree",
[](DEPipeline &de, const OpName &op_name, const py::dict &args) {
DsOpPtr op;
THROW_IF_ERROR(de.AddNodeToTree(op_name, args, &op));
return op;
py::dict out;
THROW_IF_ERROR(de.AddNodeToTree(op_name, args, &out));
return out;
},
py::return_value_policy::reference)
.def_static("AddChildToParentNode",
@ -348,6 +350,18 @@ void bindTensorOps1(py::module *m) {
.def(py::init<int32_t, int32_t, InterpolationMode>(), py::arg("targetHeight"),
py::arg("targetWidth") = ResizeOp::kDefWidth, py::arg("interpolation") = ResizeOp::kDefInterpolation);
(void)py::class_<ResizeWithBBoxOp, TensorOp, std::shared_ptr<ResizeWithBBoxOp>>(
*m, "ResizeWithBBoxOp", "Tensor operation to resize an image. Takes height, width and mode.")
.def(py::init<int32_t, int32_t, InterpolationMode>(), py::arg("targetHeight"),
py::arg("targetWidth") = ResizeWithBBoxOp::kDefWidth,
py::arg("interpolation") = ResizeWithBBoxOp::kDefInterpolation);
(void)py::class_<RandomResizeWithBBoxOp, TensorOp, std::shared_ptr<RandomResizeWithBBoxOp>>(
*m, "RandomResizeWithBBoxOp",
"Tensor operation to resize an image using a randomly selected interpolation. Takes height and width.")
.def(py::init<int32_t, int32_t>(), py::arg("targetHeight"),
py::arg("targetWidth") = RandomResizeWithBBoxOp::kDefTargetWidth);
(void)py::class_<UniformAugOp, TensorOp, std::shared_ptr<UniformAugOp>>(
*m, "UniformAugOp", "Tensor operation to apply random augmentation(s).")
.def(py::init<std::vector<std::shared_ptr<TensorOp>>, int32_t>(), py::arg("operations"),

View File

@ -41,6 +41,7 @@ Status ConfigManager::FromJson(const nlohmann::json &j) {
set_worker_connector_size(j.value("workerConnectorSize", worker_connector_size_));
set_op_connector_size(j.value("opConnectorSize", op_connector_size_));
set_seed(j.value("seed", seed_));
set_monitor_sampling_interval(j.value("monitorSamplingInterval", monitor_sampling_interval_));
return Status::OK();
}

View File

@ -18,7 +18,6 @@
#include "utils/log_adapter.h"
#include "dataset/core/pybind_support.h"
#include "dataset/util/de_error.h"
namespace mindspore {
namespace dataset {

View File

@ -152,7 +152,7 @@ Tensor::Tensor(const std::vector<std::string> &strings, const TensorShape &shape
this->data_end_ = data_ + offset_arr[i];
DS_ASSERT(num_bytes == 0);
MS_ASSERT(num_bytes == 0);
if (shape.known()) Tensor::Reshape(shape);
}
Tensor::Tensor(const dataengine::BytesList &bytes_list, const TensorShape &shape)
@ -191,7 +191,7 @@ Tensor::Tensor(const dataengine::BytesList &bytes_list, const TensorShape &shape
data_end_ = data_ + offset_arr[i];
DS_ASSERT(num_bytes == 0);
MS_ASSERT(num_bytes == 0);
if (shape.known()) Tensor::Reshape(shape);
}
Status Tensor::CreateTensor(std::shared_ptr<Tensor> *ptr, TensorImpl tensor_impl, const TensorShape &shape,
@ -420,7 +420,7 @@ bool Tensor::operator==(const Tensor &rhs) const {
// Description: A function that print the value as specified by its index
void Tensor::PrintItemAt(const std::vector<dsize_t> &index, std::ostream &out) const {
Status rc;
DS_ASSERT(data_);
MS_ASSERT(data_);
switch (type_.value()) {
CASE_PRINT_HEX(DataType::DE_BOOL, bool);

View File

@ -33,7 +33,6 @@
#include "dataset/core/data_type.h"
#include "dataset/core/tensor_shape.h"
#include "dataset/util/allocator.h"
#include "dataset/util/de_error.h"
#include "dataset/util/status.h"
#include "proto/example.pb.h"

View File

@ -22,7 +22,6 @@
#include "common/utils.h"
#include "utils/log_adapter.h"
#include "dataset/core/constants.h"
#include "dataset/util/de_error.h"
namespace mindspore {
namespace dataset {

View File

@ -97,13 +97,15 @@ class Connector {
// Pops one element for the calling consumer.
// The caller waits on cv_ until expect_consumer_ equals its worker_id, then takes the front element of
// the queue selected by pop_from_ and advances both the producer cursor (pop_from_) and the consumer
// cursor (expect_consumer_) by one, wrapping at num_producers_ / num_consumers_ respectively.
// @param worker_id The id of the consumer thread calling this method.
// @param result Output location handed to PopFront to receive the element.
// @return Status - OK on success; otherwise the status propagated by RETURN_IF_NOT_OK from Wait/PopFront.
virtual Status Pop(int32_t worker_id, // The worker-id of the caller. See the requirement at the top of this file.
T *result) noexcept {
{
// NOTE(review): DS_ASSERT and MS_ASSERT check the same condition; they look like the old and new
// spelling of one line — confirm which macro should remain.
DS_ASSERT(worker_id < num_consumers_);
MS_ASSERT(worker_id < num_consumers_);
// m_ is held for the rest of this inner scope, covering the wait and all cursor updates.
std::unique_lock<std::mutex> lk(m_);
RETURN_IF_NOT_OK(cv_.Wait(&lk, [this, worker_id]() { return expect_consumer_ == worker_id; }));
RETURN_IF_NOT_OK(queues_[pop_from_]->PopFront(result));
// Next pop reads from the next producer queue (round-robin).
pop_from_ = (pop_from_ + 1) % num_producers_;
// Count the element that was just handed out.
out_buffers_count_++;
// Hand the turn to the next consumer (round-robin).
expect_consumer_ = (expect_consumer_ + 1) % num_consumers_;
}
// The lock scope has ended here; wake the waiters so the next expected consumer can proceed.
cv_.NotifyAll();
return Status::OK();
}
@ -114,19 +116,21 @@ class Connector {
// @param worker_id The id of a worker thread calling this method.
// @param el A const lvalue element to be passed/added/pushed.
// Adds a copy of el to the internal queue that belongs to the given worker.
// @param worker_id The id of a worker thread calling this method; used as the index into queues_.
// @param el A const lvalue element to be passed/added/pushed.
// @return Status - whatever the queue's Add() call returns.
Status Push(int32_t worker_id, const T &el) noexcept {
// NOTE(review): the DS_ASSERT and MS_ASSERT pairs check identical conditions; they look like the old and
// new spelling of the same two lines — confirm which macro should remain.
DS_ASSERT(worker_id < static_cast<int32_t>(queues_.size()));
DS_ASSERT(queues_[worker_id] != nullptr);
MS_ASSERT(worker_id < static_cast<int32_t>(queues_.size()));
MS_ASSERT(queues_[worker_id] != nullptr);
// No connector-level lock is taken here; the element goes straight to the worker's own queue.
return (queues_[worker_id]->Add(el));
}
// Getter for the number of elements popped from this connector since the counter was last reset.
// @return The current value of the atomic out_buffers_count_ counter.
auto out_buffers_count() const {
  const auto popped_so_far = out_buffers_count_.load();
  return popped_so_far;
}
// Add an element into the DbConnector without the overhead of synchronization.
// It may block when the internal queue is full.
// The element passed to this function will be forwarded into the internal queue.
// @param worker_id The id of a worker thread calling this method.
// @param el An element to be passed/added/pushed.
// Forwards el into the internal queue that belongs to the given worker (rvalue overload).
// @param worker_id The id of a worker thread calling this method; used as the index into queues_.
// @param el An element to be passed/added/pushed; it is forwarded to the queue's Add().
// @return Status - whatever the queue's Add() call returns.
virtual Status Push(int32_t worker_id, T &&el) noexcept {
// NOTE(review): the DS_ASSERT and MS_ASSERT pairs check identical conditions; they look like the old and
// new spelling of the same two lines — confirm which macro should remain.
DS_ASSERT(worker_id < static_cast<int32_t>(queues_.size()));
DS_ASSERT(queues_[worker_id] != nullptr);
MS_ASSERT(worker_id < static_cast<int32_t>(queues_.size()));
MS_ASSERT(queues_[worker_id] != nullptr);
return (queues_[worker_id]->Add(std::forward<T>(el)));
}
@ -138,6 +142,7 @@ class Connector {
}
expect_consumer_ = 0;
pop_from_ = 0;
out_buffers_count_ = 0;
MS_LOG(DEBUG) << "Connector counters reset.";
}
@ -198,6 +203,7 @@ class Connector {
// Used in the Pop(), when a thread call pop() but it is not the expect_consumer_.
std::mutex m_;
CondVar cv_;
std::atomic<std::int64_t> out_buffers_count_ = 0;
};
} // namespace dataset
} // namespace mindspore

View File

@ -27,7 +27,6 @@
#include "dataset/util/status.h"
#include "dataset/core/tensor_shape.h"
#include "utils/log_adapter.h"
#include "dataset/util/de_error.h"
namespace mindspore {
namespace dataset {
@ -184,35 +183,7 @@ TensorShape ColDescriptor::shape() const {
const char DataSchema::DEFAULT_DATA_SCHEMA_FILENAME[] = "datasetSchema.json";
// Constructor 1: Simple constructor. Sets the dataset type to DatasetType::kUnknown and the row count to 0;
// every other member is default-constructed.
DataSchema::DataSchema() : dataset_type_(DatasetType::kUnknown), num_rows_(0) {}
// Maps the textual dataset type to its DatasetType enum value.
// @param type - The dataset type string to convert
// @return DatasetType::kArrow for "ARROW", DatasetType::kTf for "TF", DatasetType::kUnknown otherwise
DatasetType DataSchema::GetDatasetTYpeFromString(const std::string &type) const {
  // Each recognised spelling returns immediately; the comparison is exact (case-sensitive).
  if (type == "ARROW") {
    return DatasetType::kArrow;
  }
  if (type == "TF") {
    return DatasetType::kTf;
  }
  // Anything else is reported as an unknown dataset type.
  return DatasetType::kUnknown;
}
// Reads only the dataset-type information from a json schema file; column info is not touched.
// @param schema_file_path - Path to the schema file to read
// @return Status - OK on success, an unexpected-error status if reading or parsing throws
Status DataSchema::LoadDatasetType(const std::string &schema_file_path) {
  try {
    std::ifstream schema_stream{schema_file_path};
    nlohmann::json schema_json;
    schema_stream >> schema_json;

    // Dataset type: keep both the raw string and its enum form. A missing key yields "".
    dataset_type_str_ = schema_json.value("datasetType", "");
    dataset_type_ = GetDatasetTYpeFromString(dataset_type_str_);

    // Directory structure is optional as well and defaults to "".
    dir_structure_ = schema_json.value("directoryStructure", "");
  } catch (const std::exception &) {
    // Any exception raised above is converted into a Status return code.
    RETURN_STATUS_UNEXPECTED("Schema file failed to load");
  }
  return Status::OK();
}
// Constructor: sets the row count to 0; every other member is default-constructed.
DataSchema::DataSchema() : num_rows_(0) {}
// Internal helper function. Parses the json schema file in any order and produces a schema that
// does not follow any particular order (json standard does not enforce any ordering protocol).
@ -400,8 +371,6 @@ Status DataSchema::LoadSchemaString(const std::string &schema_json_string,
nlohmann::json js = nlohmann::json::parse(schema_json_string);
RETURN_IF_NOT_OK(PreLoadExceptionCheck(js));
num_rows_ = js.value("numRows", 0);
dataset_type_str_ = js.value("datasetType", "");
dataset_type_ = GetDatasetTYpeFromString(dataset_type_str_);
nlohmann::json column_tree = js.at("columns");
if (column_tree.empty()) {
RETURN_STATUS_UNEXPECTED("columns is null");
@ -425,22 +394,16 @@ DataSchema::~DataSchema() = default;
// Getter for the ColDescriptor by index
// @param idx - Position of the column descriptor inside col_descs_
// @return Const reference to the descriptor stored at idx
const ColDescriptor &DataSchema::column(int32_t idx) const {
// NOTE(review): DS_ASSERT and MS_ASSERT check the same upper bound; they look like the old and new
// spelling of one line — confirm which macro should remain. Neither checks for a negative idx.
DS_ASSERT(idx < static_cast<int>(col_descs_.size()));
MS_ASSERT(idx < static_cast<int>(col_descs_.size()));
return col_descs_[idx];
}
// A print method typically used for debugging
// @param out - The output stream to write output to
// NOTE(review): the body seems to interleave two versions of this method (one that also reports the
// dataset type, one that only lists the columns) — confirm which statements belong in the file.
void DataSchema::Print(std::ostream &out) const {
// Dataset type string, or a placeholder when none has been set
out << "Dataset type string : (";
if (dataset_type_str_.empty()) {
out << "none specified)\n";
} else {
out << dataset_type_str_ << ")\n";
}
// One line per column descriptor, using ColDescriptor's stream operator
out << "Dataset schema: (";
for (const auto &col_desc : col_descs_) {
out << col_desc << "\n";
}
// Numeric value of the dataset type enum
out << "Dataset type: " << static_cast<uint32_t>(dataset_type_) << "\n";
}
// Adds a column descriptor to the schema

View File

@ -30,196 +30,176 @@
namespace mindspore {
namespace dataset {
// A simple class to provide meta info about a column.
/// \class ColDescriptor data_schema.h
/// \brief A simple class to provide meta info about a column: its name, data type, rank,
/// tensor implementation and, optionally, a fixed shape.
class ColDescriptor {
public:
// Constructor 1: Simple constructor that leaves things uninitialized.
/// \brief Constructor 1: Simple constructor that leaves things uninitialized.
ColDescriptor();
// Constructor 2: Main constructor
// @param col_name - The name of the column
// @param col_type - The DE Datatype of the column
// @param tensor_impl - The (initial) type of tensor implementation for the column
// @param rank - The number of dimension of the data
// @param in_shape - optional argument for input shape
/// \brief Constructor 2: Main constructor
/// \param[in] col_name - The name of the column
/// \param[in] col_type - The DE Datatype of the column
/// \param[in] tensor_impl - The (initial) type of tensor implementation for the column
/// \param[in] rank - The number of dimensions of the data
/// \param[in] in_shape - optional argument for input shape (defaults to nullptr)
ColDescriptor(const std::string &col_name, DataType col_type, TensorImpl tensor_impl, int32_t rank,
const TensorShape *in_shape = nullptr);
// Explicit copy constructor is required
// @param in_cd - the source ColDescriptor
/// \brief Explicit copy constructor is required
/// \param[in] in_cd - the source ColDescriptor
ColDescriptor(const ColDescriptor &in_cd);
// Assignment overload
// @param in_cd - the source ColDescriptor
/// \brief Assignment overload
/// \param[in] in_cd - the source ColDescriptor
/// \return Reference to a ColDescriptor (the implementation is not visible here)
ColDescriptor &operator=(const ColDescriptor &in_cd);
// Destructor
/// \brief Destructor
~ColDescriptor();
// A print method typically used for debugging
// @param out - The output stream to write output to
/// \brief A print method typically used for debugging
/// \param[in] out - The output stream to write output to
void Print(std::ostream &out) const;
// Given a number of elements, this function will compute what the actual Tensor shape would be.
// If there is no starting TensorShape in this column, or if there is a shape but it contains
// an unknown dimension, then the output shape returned shall resolve dimensions as needed.
// @param num_elements - The number of elements in the data for a Tensor
// @param out_shape - The materialized output Tensor shape
// @return Status - The error code return
/// \brief Given a number of elements, this function will compute what the actual Tensor shape would be.
/// If there is no starting TensorShape in this column, or if there is a shape but it contains
/// an unknown dimension, then the output shape returned shall resolve dimensions as needed.
/// \param[in] num_elements - The number of elements in the data for a Tensor
/// \param[inout] out_shape - The materialized output Tensor shape
/// \return Status - The error code return
Status MaterializeTensorShape(int32_t num_elements, TensorShape *out_shape) const;
// << Stream output operator overload
// @notes This allows you to write the debug print info using stream operators
// @param out - reference to the output stream being overloaded
// @param cd - reference to the ColDescriptor to display
// @return - the output stream must be returned
/// \brief << Stream output operator overload
/// This allows you to write the debug print info using stream operators
/// \param[in] out - reference to the output stream being overloaded
/// \param[in] cd - reference to the ColDescriptor to display
/// \return - the output stream must be returned
friend std::ostream &operator<<(std::ostream &out, const ColDescriptor &cd) {
// Delegate to Print() so both entry points produce the same text
cd.Print(out);
return out;
}
// getter function
// @return The column's DataType
/// \brief getter function
/// \return The column's DataType
DataType type() const { return type_; }
// getter function
// @return The column's rank
/// \brief getter function
/// \return The column's rank
int32_t rank() const { return rank_; }
// getter function
// @return The column's name
/// \brief getter function
/// \return The column's name
std::string name() const { return col_name_; }
// getter function
// @return The column's shape
/// \brief getter function
/// \return The column's shape
TensorShape shape() const;
// getter function
// @return True if the column has an assigned fixed shape, false otherwise.
/// \brief getter function
/// \return True if the column has an assigned fixed shape, false otherwise.
bool hasShape() const { return tensor_shape_ != nullptr; }
// getter function
// @return The column's tensor implementation type
/// \brief getter function
/// \return The column's tensor implementation type
TensorImpl tensorImpl() const { return tensor_impl_; }
private:
DataType type_; // The column's type
int32_t rank_; // The rank for this column (number of dimensions)
// NOTE(review): tensor_impl_ is declared twice below; the two lines differ only in the trailing period of
// the comment and look like the old and new form of one line — confirm that only one should remain.
TensorImpl tensor_impl_; // The initial flavour of the tensor for this column.
TensorImpl tensor_impl_; // The initial flavour of the tensor for this column
std::unique_ptr<TensorShape> tensor_shape_; // The fixed shape (if given by user)
std::string col_name_; // The name of the column
};
// A list of the columns.
/// \class DataSchema data_schema.h
/// \brief A list of the columns (as ColDescriptor entries) plus schema-level meta info such as the row count.
class DataSchema {
public:
// Constructor
/// \brief Constructor
DataSchema();
// Destructor
/// \brief Destructor
~DataSchema();
// Populates the schema with a dataset type from a json file. It does not populate any of the
// column info. To populate everything, use LoadSchemaFile() afterwards.
// @param schema_file_path - Absolute path to the schema file to use for getting dataset type info.
// @return Status - The error code return
Status LoadDatasetType(const std::string &schema_file_path);
// Parses a schema json file and populates the columns and meta info.
// @param schema_file_path - the schema file that has the column's info to load
// @param columns_to_load - list of strings for columns to load. if empty, assumes all columns.
// @return Status - The error code return
/// \brief Parses a schema json file and populates the columns and meta info.
/// \param[in] schema_file_path - the schema file that has the column's info to load
/// \param[in] columns_to_load - list of strings for columns to load. if empty, assumes all columns.
/// \return Status - The error code return
Status LoadSchemaFile(const std::string &schema_file_path, const std::vector<std::string> &columns_to_load);
// Parses a schema JSON string and populates the columns and meta info.
// @param schema_json_string - the schema JSON string that has the column's info to load
// @param columns_to_load - list of strings for columns to load. if empty, assumes all columns.
// @return Status - The error code return
/// \brief Parses a schema JSON string and populates the columns and meta info.
/// \param[in] schema_json_string - the schema JSON string that has the column's info to load
/// \param[in] columns_to_load - list of strings for columns to load. if empty, assumes all columns.
/// \return Status - The error code return
Status LoadSchemaString(const std::string &schema_json_string, const std::vector<std::string> &columns_to_load);
// A print method typically used for debugging
// @param out - The output stream to write output to
/// \brief A print method typically used for debugging
/// \param[in] out - The output stream to write output to
void Print(std::ostream &out) const;
// << Stream output operator overload
// @notes This allows you to write the debug print info using stream operators
// @param out - reference to the output stream being overloaded
// @param ds - reference to the DataSchema to display
// @return - the output stream must be returned
/// \brief << Stream output operator overload. This allows you to write the debug print info using stream operators
/// \param[in] out - reference to the output stream being overloaded
/// \param[in] ds - reference to the DataSchema to display
/// \return - the output stream must be returned
friend std::ostream &operator<<(std::ostream &out, const DataSchema &ds) {
// Delegate to Print() so both entry points produce the same text
ds.Print(out);
return out;
}
// Adds a column descriptor to the schema
// @param cd - The ColDescriptor to add
// @return Status - The error code return
/// \brief Adds a column descriptor to the schema
/// \param[in] cd - The ColDescriptor to add
/// \return Status - The error code return
Status AddColumn(const ColDescriptor &cd);
// Setter
// @param in_type - The Dataset type to set into the schema
void set_dataset_type(DatasetType in_type) { dataset_type_ = in_type; }
// getter
// @return The dataset type of the schema
DatasetType dataset_type() const { return dataset_type_; }
// getter
// @param idx - The index of the column descriptor to return
// @return The reference to a ColDescriptor to get (const version)
/// \brief getter
/// \param[in] idx - The index of the column descriptor to return
/// \return The reference to a ColDescriptor to get (const version)
const ColDescriptor &column(int32_t idx) const;
// getter
// @return The number of columns in the schema
/// \brief getter
/// \return The number of columns in the schema
int32_t NumColumns() const { return col_descs_.size(); }
/// \brief Checks whether the schema holds any columns
/// \return True if NumColumns() is 0, false otherwise
bool Empty() const { return NumColumns() == 0; }
// getter
// @return A copy of the directory structure string
std::string dir_structure() const { return dir_structure_; }
// getter
// @return A copy of the string that represents the type of dataset
std::string dataset_type_str() const { return dataset_type_str_; }
/// \brief getter
/// \return The number of rows read from schema
int64_t num_rows() const { return num_rows_; }
// File name constant for the data schema file; the value is defined in the implementation file.
static const char DEFAULT_DATA_SCHEMA_FILENAME[];
// Loops through all columns in the schema and returns a map with the column
// name to column index number.
// @param out_column_name_map - The output map of columns names to column index
// @return Status - The error code return
/// \brief Loops through all columns in the schema and returns a map with the column name to column index number.
/// \param[inout] out_column_name_map - The output map of columns names to column index
/// \return Status - The error code return
Status GetColumnNameMap(std::unordered_map<std::string, int32_t> *out_column_name_map);
private:
// Internal helper function. Parses the json schema file in any order and produces a schema that
// does not follow any particular order (json standard does not enforce any ordering protocol).
// This one produces a schema that contains all of the columns from the schema file.
// @param column_tree - The nlohmann tree from the json file to parse
// @return Status - The error code return
/// \brief Internal helper function. Parses the json schema file in any order and produces a schema that
/// does not follow any particular order (json standard does not enforce any ordering protocol).
/// This one produces a schema that contains all of the columns from the schema file.
/// \param[in] column_tree - The nlohmann tree from the json file to parse
/// \return Status - The error code return
Status AnyOrderLoad(nlohmann::json column_tree);
// Internal helper function. For each input column name, perform a lookup to the json document to
// find the matching column. When the match is found, process that column to build the column
// descriptor and add to the schema in the order in which the input column names are given.
// @param column_tree - The nlohmann tree from the json file to parse
// @param columns_to_load - list of strings for the columns to add to the schema
// @return Status - The error code return
/// \brief Internal helper function. For each input column name, perform a lookup to the json document to
/// find the matching column. When the match is found, process that column to build the column
/// descriptor and add to the schema in the order in which the input column names are given.
/// \param[in] column_tree - The nlohmann tree from the json file to parse
/// \param[in] columns_to_load - list of strings for the columns to add to the schema
/// \return Status - The error code return
Status ColumnOrderLoad(nlohmann::json column_tree, const std::vector<std::string> &columns_to_load);
// Internal helper function. Given the json tree for a given column, load it into our schema.
// @param column_child_tree - The nlohmann child tree for a given column to load.
// @param col_name - The string name of the column for that subtree.
// @return Status - The error code return
/// \brief Internal helper function. Given the json tree for a given column, load it into our schema.
/// \param[in] column_child_tree - The nlohmann child tree for a given column to load.
/// \param[in] col_name - The string name of the column for that subtree.
/// \return Status - The error code return
Status ColumnLoad(nlohmann::json column_child_tree, const std::string &col_name);
// Internal helper function. Performs sanity checks on the json file setup.
// @param js - The nlohmann tree for the schema file
// @return Status - The error code return
/// \brief Internal helper function. Performs sanity checks on the json file setup.
/// \param[in] js - The nlohmann tree for the schema file
/// \return Status - The error code return
Status PreLoadExceptionCheck(const nlohmann::json &js);
// Internal helper function. Converts a dataset type string into its DatasetType enum value.
// @param type - The dataset type string to convert
// @return The matching DatasetType
DatasetType GetDatasetTYpeFromString(const std::string &type) const;
std::vector<ColDescriptor> col_descs_; // Vector of column descriptors
std::string dataset_type_str_; // A string that represents the type of dataset
DatasetType dataset_type_; // The numeric form of the dataset type from enum
std::string dir_structure_; // Implicit or flatten
int64_t num_rows_; // Row count reported by num_rows()
};
} // namespace dataset

View File

@ -27,7 +27,7 @@
namespace mindspore {
namespace dataset {
// Constructor of the IteratorBase
// Starts with no current buffer and with the eof flag cleared.
// NOTE(review): two definitions of the constructor follow; they look like the old and new form of one
// line (the second no longer initializes first_row_) — confirm which one belongs in the file.
IteratorBase::IteratorBase() : curr_buffer_(nullptr), eof_handled_(false), first_row_(true) {}
IteratorBase::IteratorBase() : curr_buffer_(nullptr), eof_handled_(false) {}
// Destructor of the IteratorBase (compiler-generated)
IteratorBase::~IteratorBase() = default;
@ -51,13 +51,10 @@ Status IteratorBase::GetNextAsMap(TensorMap *out_map) {
// The column name mapping comes from the source operator that is producing the data into the iterator.
// To avoid having to fetch this for every time, we'll take a local copy of the column name id mapping
// and save in the iterator. We only have to do this once. All subsequent iterations use the same mapping.
// Note: This can only be done after the first row has been produced, as this guarantees that the child has
// its column mapping set up.
if (first_row_) {
if (col_name_id_map_.empty()) {
// Determine the column name map by calling the derived class method to retrieve the column
// name map
col_name_id_map_ = this->GetColumnNameMap();
first_row_ = false;
}
// Populate the out map from the row and return it

View File

@ -72,7 +72,6 @@ class IteratorBase {
protected:
std::unique_ptr<DataBuffer> curr_buffer_; // holds the current buffer
bool eof_handled_; // T/F if this op got an eof
bool first_row_; // internal tracking for first row case
std::unordered_map<std::string, int32_t> col_name_id_map_;
};

View File

@ -144,9 +144,6 @@ Status BarrierOp::prepare(TensorQTable *const table) {
table->push_back(std::move(new_row));
// Assign the column name id map
RETURN_IF_NOT_OK(DatasetOp::AssignColMapFromChild());
// the update code below shouldn't do anything bad if the column name already exists.
return Status::OK();
}

View File

@ -76,7 +76,6 @@ Status BatchOp::operator()() {
std::unique_ptr<TensorQTable> table = std::make_unique<TensorQTable>();
child_iterator_ = std::make_unique<ChildIterator>(this, 0, 0);
RETURN_IF_NOT_OK(child_iterator_->FetchNextTensorRow(&new_row));
RETURN_IF_NOT_OK(DatasetOp::AssignColMapFromChild()); // must come after the first fetch above
int32_t cur_batch_size = 0;
RETURN_IF_NOT_OK(GetBatchSize(&cur_batch_size, CBatchInfo(0, 0, 0)));
while (child_iterator_->eof_handled() == false) {
@ -410,7 +409,7 @@ Status BatchOp::UnpackPadInfo(const PadInfo &pad_info,
// Visitor accept method for NodePass
// @param p - The node pass whose RunOnNode overload for BatchOp is invoked
// @param modified - Output flag forwarded unchanged to RunOnNode
// @return Status - The status returned by RunOnNode
Status BatchOp::Accept(NodePass *p, bool *modified) {
// Downcast shared pointer then call visitor
return p->RunOnNode(std::static_pointer_cast<BatchOp>(shared_from_this()), modified);
// NOTE(review): this second return is unreachable as written. It looks like the replacement form of the
// statement above (shared_from_base instead of an explicit static_pointer_cast) — confirm which should remain.
return p->RunOnNode(shared_from_base<BatchOp>(), modified);
}
} // namespace dataset

View File

@ -115,7 +115,6 @@ Status BucketBatchByLengthOp::operator()() {
TensorRow current_row;
child_iterator_ = std::make_unique<ChildIterator>(this, 0, 0);
RETURN_IF_NOT_OK(child_iterator_->FetchNextTensorRow(&current_row));
RETURN_IF_NOT_OK(AssignColMapFromChild());
while (!child_iterator_->eof_handled()) {
while (!current_row.empty()) {
int32_t element_length;

View File

@ -86,7 +86,6 @@ Status BuildVocabOp::operator()() {
child_iterator_ = std::make_unique<ChildIterator>(this, 0, 0);
TensorRow new_row;
RETURN_IF_NOT_OK(child_iterator_->FetchNextTensorRow(&new_row));
RETURN_IF_NOT_OK(AssignColMapFromChild());
if (!col_names_.empty()) {
col_ids_.reserve(col_names_.size());
for (std::string col : col_names_) {

View File

@ -66,12 +66,6 @@ Status ConcatOp::operator()() {
std::unique_ptr<DataBuffer> buf;
RETURN_IF_NOT_OK(child_[0]->GetNextBuffer(&buf));
// Obtain columns_name_id_map from child_[0]
column_name_id_map_ = child_[0]->column_name_id_map();
if (column_name_id_map_.empty()) {
RETURN_STATUS_UNEXPECTED("Child column name map cannot be empty!");
}
int eof_count = 0;
while (eof_count != children_num_) {
for (int i = 0; i < children_num_; i++) {
@ -115,17 +109,13 @@ Status ConcatOp::Verify(int32_t id, const std::unique_ptr<DataBuffer> &buf) {
buf->GetRow(0, &new_row);
if (id == 0) {
// Obtain the column name, data type and data rank in child[0]
column_name_id_ = child_[id]->column_name_id_map();
// Obtain the data type and data rank in child[0]
for (auto item : new_row) {
data_type_.push_back(item->type());
data_rank_.push_back(item->Rank());
}
} else {
// Compare the column name, data type and data rank with these in child[0]
if (child_[id]->column_name_id_map() != column_name_id_) {
RETURN_STATUS_UNEXPECTED("The column name or column order is not the same with previous dataset.");
}
// Compare the data type and data rank with these in child[0]
int32_t index = 0;
for (auto item : new_row) {
if ((item->type() != data_type_[index]) || item->Rank() != data_rank_[index++]) {
@ -138,7 +128,27 @@ Status ConcatOp::Verify(int32_t id, const std::unique_ptr<DataBuffer> &buf) {
Status ConcatOp::PrepareNodePostAction() {
RETURN_IF_NOT_OK(PipelineOp::PrepareNodePostAction());
tree_->AddToRepeatStack(shared_from_this());
tree_->AddToEOEOpStack(shared_from_this());
return Status::OK();
}
// Computes this op's column name map. ConcatOp overrides the base-class ComputeColMap because it has
// more than one child: the map is copied from child 0, and every other child must expose an identical map.
// @return Status - OK when the map was assigned (or was already set, in which case a warning is logged);
//                  an unexpected-error status when there are no children, when child 0 has an empty map,
//                  or when any other child's map differs from child 0's.
Status ConcatOp::ComputeColMap() {
  if (!column_name_id_map_.empty()) {
    MS_LOG(WARNING) << "Column name map is already set!";
    return Status::OK();
  }
  // Guard the child_[0] access below.
  if (child_.empty()) {
    RETURN_STATUS_UNEXPECTED("ConcatOp has no child to take the column name map from!");
  }
  // Obtain columns_name_id_map from child_[0]
  column_name_id_map_ = child_[0]->column_name_id_map();
  if (column_name_id_map_.empty()) {
    RETURN_STATUS_UNEXPECTED("Child column name map cannot be empty!");
  }
  // Verify the remaining children have the same column name map. Child 0 is the reference,
  // so the comparison starts at index 1; size_t matches the type of child_.size().
  for (size_t i = 1; i < child_.size(); ++i) {
    if (child_[i]->column_name_id_map() != column_name_id_map_) {
      RETURN_STATUS_UNEXPECTED("The column name or column order is not the same with previous dataset.");
    }
  }
  return Status::OK();
}
} // namespace dataset

View File

@ -85,6 +85,10 @@ class ConcatOp : public PipelineOp {
// @return Name of the current Op
std::string Name() const override { return "ConcatOp"; }
// Private function for computing the assignment of the column name map.
// @return - Status
Status ComputeColMap() override;
private:
Status Verify(int32_t id, const std::unique_ptr<DataBuffer> &buf);

View File

@ -18,29 +18,31 @@
#include <iomanip>
#include <iostream>
#include <memory>
#include <regex>
#include <utility>
#include <string>
#include <algorithm>
#include "dataset/engine/execution_tree.h"
#include "dataset/engine/datasetops/device_queue_op.h"
#include "dataset/engine/datasetops/source/sampler/sampler.h"
#include "dataset/engine/data_buffer.h"
#include "dataset/engine/db_connector.h"
#include "dataset/engine/opt/pass.h"
#include "utils/system/crc32c.h"
#include "utils/log_adapter.h"
namespace mindspore {
namespace dataset {
// Constructor
DatasetOp::DatasetOp(int32_t op_connector_size)
DatasetOp::DatasetOp(int32_t op_connector_size, std::shared_ptr<Sampler> sampler)
: oc_queue_size_(op_connector_size),
sampler_(sampler),
operator_id_(kInvalidOperatorId),
tree_(nullptr),
state_(OpState::kDeOpIdle),
op_ctrl_flags_(kDeOpNone),
out_connector_(nullptr),
first_fetch_(true) {
out_connector_(nullptr) {
// The operator starts out with an invalid operator id. The only way to
// get it out of invalid state is to assign the operator to an execution tree.
}
@ -105,13 +107,58 @@ Status DatasetOp::InsertAsParent(std::shared_ptr<DatasetOp> to_add) {
void DatasetOp::AddParent(DatasetOp *parent) { parent_.push_back(parent); }
// Removes a parent operator from this operator
void DatasetOp::RemoveParent(DatasetOp *parent) {
void DatasetOp::RemoveParent(const DatasetOp *parent) {
parent_.erase(std::remove(parent_.begin(), parent_.end(), parent), parent_.end());
}
// Removes this node from the tree and connects its parent and child together.
// Only supported when this op has at most one parent and at most one child.
//
// Scenarios when removing node B (arrows point from child to parent):
//   A -> B -> C : A's parent entry for B is replaced by C, and C's child entry for B is replaced by A
//   A -> B      : B is the root; A's parent list is cleared and A is assigned as the new tree root
//   B -> C      : B is a leaf; only B's entry is erased from C's child list
//
// The parent may own several children, so the entry that refers to this op is looked up instead of
// assuming it lives at index 0. All validation happens before any link is modified.
// @return Status - OK on success; an unexpected-error status if the op has more than one parent or child,
//                  if the parent/child links are inconsistent, or if a root op has no tree to re-root.
Status DatasetOp::Remove() {
  if (parent_.size() > 1) {
    std::string err_msg("No support for op removal if the operator has more than one parent");
    RETURN_STATUS_UNEXPECTED(err_msg);
  }
  if (child_.size() > 1) {
    std::string err_msg("No support for op removal if the operator has more than one child");
    RETURN_STATUS_UNEXPECTED(err_msg);
  }

  DatasetOp *const parent_op = parent_.empty() ? nullptr : parent_[0];
  // Keep a local reference to the child: once the parent lets go of this op further down,
  // "this" may no longer be alive, so members must not be read after that point.
  std::shared_ptr<DatasetOp> child_op;
  if (!child_.empty()) {
    child_op = child_[0];
  }

  // Locate the entry in the parent's child list that refers to this op.
  std::vector<std::shared_ptr<DatasetOp>>::iterator slot_in_parent;
  if (parent_op != nullptr) {
    slot_in_parent = std::find_if(parent_op->child_.begin(), parent_op->child_.end(),
                                  [this](const std::shared_ptr<DatasetOp> &c) { return c.get() == this; });
    if (slot_in_parent == parent_op->child_.end()) {
      RETURN_STATUS_UNEXPECTED("Op removal failed: the parent does not list this operator as a child");
    }
  }

  // First re-link the child (if any) so that it no longer points at this op.
  if (child_op != nullptr) {
    if (parent_op != nullptr) {
      // Replace the child's back pointer to this op with a back pointer to our parent.
      auto back_ptr = std::find(child_op->parent_.begin(), child_op->parent_.end(), this);
      if (back_ptr == child_op->parent_.end()) {
        RETURN_STATUS_UNEXPECTED("Op removal failed: the child does not list this operator as a parent");
      }
      *back_ptr = parent_op;
    } else {
      // We don't have a parent, so we are the root node being removed.
      // Clear the parent list of our child so that it becomes the new root.
      if (tree_ == nullptr) {
        RETURN_STATUS_UNEXPECTED("Op removal failed: the operator is not assigned to an execution tree");
      }
      child_op->parent_.clear();
      // NOTE(review): the result of AssignRoot is ignored here, as before; if it reports a Status,
      // consider propagating it.
      tree_->AssignRoot(child_op);
    }
  }

  // Next, if we had a parent, hand our child to it (or just drop our entry if we were a leaf).
  // This must be the last step: it may release the final reference to this op.
  if (parent_op != nullptr) {
    if (child_op != nullptr) {
      *slot_in_parent = child_op;
    } else {
      parent_op->child_.erase(slot_in_parent);
    }
  }
  return Status::OK();
}
// Getter function to get a shared pointer to our child.
std::shared_ptr<DatasetOp> DatasetOp::child(int32_t child_index) const {
DS_ASSERT(child_index < static_cast<int>(child_.size()));
MS_ASSERT(child_index < static_cast<int>(child_.size()));
// Return a shared pointer
return child_[child_index];
}
@ -151,6 +198,9 @@ void DatasetOp::Print(std::ostream &out, bool show_all) const {
}
out << "\nConnector queue size : " << oc_queue_size_ << "\nOperator control flags : 0x" << std::hex
<< std::setw(8) << std::setfill('0') << op_ctrl_flags_ << std::dec << std::setfill(' ');
if (sampler_) {
sampler_->Print(out, show_all);
}
}
}
@ -223,11 +273,10 @@ Status DatasetOp::PrepareNodePreAction() {
Status DatasetOp::PrepareNodePostAction() {
// If this op does not have any children and it is in a repeat path of the tree...
if (child_.empty() && BitTest(op_ctrl_flags_, kDeOpRepeated)) {
// push ourselves onto the tree repeat stack. Later, the repeat operator
// push ourselves onto the eoe operator stack. Later, a repeat/epoch ctrl operator
// above us will consume them.
tree_->AddToRepeatStack(shared_from_this());
tree_->AddToEOEOpStack(shared_from_this());
}
// Creating Connector object for each op.
// The consumer of the root node is assumed to be one thread.
// If multiple threads are consuming from the root node, they will get the ordered data in round robin fashion.
@ -240,6 +289,10 @@ Status DatasetOp::PrepareNodePostAction() {
RETURN_IF_NOT_OK(out_connector_->Register(tree_->AllTasks()));
}
RETURN_IF_NOT_OK(this->RegisterWorkerConnectors());
// Generate the column name map for the current op.
RETURN_IF_NOT_OK(this->ComputeColMap());
return Status::OK();
}
@ -262,38 +315,84 @@ std::string DatasetOp::ColumnNameMapAsString() const {
return outStr;
}
// A helper function for providing assignment of the column name map.
// This grabs the map from child 0 and assigns it into this op.
// Can only be used if number of children is 1.
Status DatasetOp::AssignColMapFromChild() {
// Computing the assignment of the column name map.
// This just inherits the column map from its first child, can only be used if the number of children is 1.
// Operations changing the column map must overwrite this function.
Status DatasetOp::ComputeColMap() {
if (child_.size() > 1) {
RETURN_STATUS_UNEXPECTED("Assigning column name map from child only works for single-child operators.");
}
// Assign the correct column name map to this op by taking it from the input child.
// This must be done AFTER the first fetch, but only needs to be done once by the first worker to
// do the first fetch.
if (first_fetch_) {
// If there was a single worker, or this is being called from a master thread in a parallel op,
// then the mutex is not really needed here, although it's harmless.
std::unique_lock<std::mutex> lock(column_name_map_mutex_);
// If the map has not been set up yet, then we are the first one in to set it up. The first_fetch_ (dirty read)
// bool allows us to avoid acquiring the lock if the map has already been set.
if (column_name_id_map_.empty()) {
column_name_id_map_ = child_[0]->column_name_id_map();
if (column_name_id_map_.empty()) {
column_name_id_map_ = child_[0]->column_name_id_map();
first_fetch_ = false;
if (column_name_id_map_.empty()) {
RETURN_STATUS_UNEXPECTED("Child column name map cannot be empty!");
}
RETURN_STATUS_UNEXPECTED("Child column name map cannot be empty!");
}
MS_LOG(DEBUG) << "Setting column map after first fetch:\n" << DatasetOp::ColumnNameMapAsString();
MS_LOG(DEBUG) << "Setting column map:\n" << DatasetOp::ColumnNameMapAsString();
} else {
MS_LOG(WARNING) << "Column name map is already set!";
}
return Status::OK();
}
// Base-class pre-visit entry point for a NodePass.
// Only reached when the derived op class does not provide its own PreAccept; the pass
// receives this op as a shared_ptr to the DatasetOp base.
Status DatasetOp::PreAccept(NodePass *p, bool *modified) {
  std::shared_ptr<DatasetOp> self = shared_from_this();
  return p->PreRunOnNode(self, modified);
}
// Base-class visit entry point for a NodePass.
// Only reached when the derived op class does not provide its own Accept; the pass
// receives this op as a shared_ptr to the DatasetOp base.
Status DatasetOp::Accept(NodePass *p, bool *modified) {
  std::shared_ptr<DatasetOp> self = shared_from_this();
  return p->RunOnNode(self, modified);
}
// Common prepare-phase helper for leaf ops: decides what happens to this op's sampler.
//  - If a sampler is present and the tree's prepare flags contain kDePrepCache, the sampler is pushed
//    onto the tree's sampler stack (the shared pointer is copied, not moved).
//  - If a sampler is present, the cache flag is not set and the op is not a random access op,
//    an unexpected-error status is returned.
//  - In every non-error case, an op that is not a random access op drops its sampler_ reference.
// @param random_access_op - indicates if this is a mappable random access leaf or not
// @return Status - The error code return
Status DatasetOp::SaveSamplerForCache(bool random_access_op) {
  if (sampler_ != nullptr) {
    const bool under_cache = BitTest(tree_->PrepareFlags(), ExecutionTree::kDePrepCache);
    if (!under_cache && !random_access_op) {
      // A sampler exists, but we are not in a caching tree and we are not a random access mappable leaf.
      // That type of leaf does not use sampling unless there's a cache to hook it into.
      RETURN_STATUS_UNEXPECTED(
        "Non-mappable leaf op has a sampler, but it only supports sampling if there is a cache after it in the tree");
    }
    if (under_cache) {
      // Hand the sampler to the tree so that the cache handling above us can pick it up.
      tree_->AddToSamplerStack(sampler_);
      MS_LOG(INFO) << "Preparing a leaf op: passing sampler up the tree for Cache handling.";
    }
  }

  // A non-mappable leaf has no further use for the sampler itself, so release it from the base.
  if (!random_access_op) {
    sampler_.reset();
  }
  return Status::OK();
}
// Computes a CRC for the given operator from the text that the op's tree prints for it.
// Lines for fields that should not influence the checksum are stripped from the text first:
//  - the operator control flags,
//  - the device id fields (to allow cache sharing for a distributed run of the same pipeline),
//  - the cache crc and server cache id (these differ between creating a new cache_client and
//    re-using the same cache_client later).
// @param op - the operator to generate the CRC for
// @return the masked crc32c value of the filtered text
uint32_t DatasetOp::GenerateCRC(const std::shared_ptr<DatasetOp> &op) {
  std::stringstream printed;
  op->tree_->Print(printed, op);
  std::string text = printed.str();

  // Each pattern removes one whole field line; they are applied one after another, in this order.
  const char *const ignored_fields[] = {
    "Operator control flags.*\n", "Device id.*\n", "device_id.*\n", "Cache crc.*\n", "Server cache id.*\n",
  };
  for (const char *field_pattern : ignored_fields) {
    text = std::regex_replace(text, std::regex(field_pattern), "");
  }

  return system::Crc32c::GetMaskCrc32cValue(text.c_str(), text.length());
}
} // namespace dataset
} // namespace mindspore

View File

@ -34,8 +34,10 @@ class DataBuffer;
class NodePass;
// The base class DatasetOp is the main tree node. It is an abstract class, so
// the actual implementation of the operators will be derived from here.
class Sampler;
/// \brief The base class DatasetOp is the main tree node. It is an abstract class, so
/// the actual implementation of the operators will be derived from here.
class DatasetOp : public std::enable_shared_from_this<DatasetOp> {
// Allow execution tree to access internal members
friend class ExecutionTree;
@ -53,109 +55,114 @@ class DatasetOp : public std::enable_shared_from_this<DatasetOp> {
// Flags that control operator runtime behaviours
enum OpState { kDeOpRunning = 0, kDeOpIdle = 1, kDeOpTerminated };
// Constructor
// @param op_connector_size - The size for the output connector of this operator.
explicit DatasetOp(int32_t op_connector_size);
/// Constructor
/// \param op_connector_size - The size for the output connector of this operator.
/// \param sampler - The sampler for the op
explicit DatasetOp(int32_t op_connector_size, std::shared_ptr<Sampler> sampler);
// Destructor
/// Destructor
virtual ~DatasetOp() { tree_ = nullptr; }
// Adds a operator to become our child.
// @param child - shared pointer to the child to add.
/// Adds a operator to become our child.
/// \param child - shared pointer to the child to add.
Status AddChild(std::shared_ptr<DatasetOp> child);
// Remove a operator from our children.
// @param child - shared pointer to the child to remove.
/// Remove a operator from our children.
/// \param child - shared pointer to the child to remove.
Status RemoveChild(std::shared_ptr<DatasetOp> child);
// Getter function to get a shared pointer to our child
// @param child_index - An operator can have n children. Indicates choose which child to return.
/// \brief Removes this node from the tree and connects its parent/child together.
/// \return Status error code returned
Status Remove();
/// \brief Getter function to get a shared pointer to our child
/// \param child_index - An operator can have n children. Indicates choose which child to return.
std::shared_ptr<DatasetOp> child(int32_t child_index) const;
// Inserts a operator as the parent current op.
// Inserted op will become the sole parent of the current op.
// The existing parent of the current op will be transferred to the inserted op.
/// \brief Inserts a operator as the parent current op.
/// Inserted op will become the sole parent of the current op.
/// The existing parent of the current op will be transferred to the inserted op.
Status InsertAsParent(std::shared_ptr<DatasetOp> to_add);
// Creates the connector within this operator
// @param num_producers - number of threads that write into this connector
// @param num_consumers - number of threads that read from this connector
/// \brief Creates the connector within this operator
/// \param num_producers - number of threads that write into this connector
/// \param num_consumers - number of threads that read from this connector
void CreateConnector(int32_t num_producers, int32_t num_consumers);
// A print method typically used for debugging
// @param out - The output stream to write output to
// @param show_all - A bool to control if you want to show all info or just a summary
/// \brief A print method typically used for debugging
/// \param out - The output stream to write output to
/// \param show_all - A bool to control if you want to show all info or just a summary
virtual void Print(std::ostream &out, bool show_all) const;
// << Stream output operator overload
// @notes This allows you to write the debug print info using stream operators
// @param out - reference to the output stream being overloaded
// @param dO - reference to the DatasetOp to display
// @return - the output stream must be returned
/// \brief << Stream output operator overload
/// \notes This allows you to write the debug print info using stream operators
/// \param out - reference to the output stream being overloaded
/// \param dO - reference to the DatasetOp to display
/// \return - the output stream must be returned
friend std::ostream &operator<<(std::ostream &out, const DatasetOp &dO) {
dO.Print(out, false);
return out;
}
// Class functor operator ().
// DatasetOps operate by launching a thread (see ExecutionTree).
// This pure virtual version makes the requirement that derived classes must provide a functor
// that will execute their main runtime loop code.
// @return Status - The error code return
/// \brief Class functor operator ().
/// DatasetOps operate by launching a thread (see ExecutionTree).
/// This pure virtual version makes the requirement that derived classes must provide a functor
/// that will execute their main runtime loop code.
/// \return Status - The error code return
virtual Status operator()() = 0;
// Gets the next buffer from the given child
// @notes See GetNextInput for similar function that has built-in message handling
// @param p_buffer - The shared pointer for the fetched buffer to return (by reference)
// @param worker_id - The worker id
// @return Status - The error code return
/// \brief Gets the next buffer from the given child
/// \notes See GetNextInput for similar function that has built-in message handling
/// \param p_buffer - The shared pointer for the fetched buffer to return (by reference)
/// \param worker_id - The worker id
/// \return Status - The error code return
virtual Status GetNextBuffer(std::unique_ptr<DataBuffer> *p_buffer, int32_t worker_id) {
return GetNextBuffer(p_buffer, worker_id, false);
}
// Gets the next buffer from the given child
// @notes See GetNextInput for similar function that has built-in message handling
// @param p_buffer - The shared pointer for the fetched buffer to return (by reference)
// @return Status - The error code return
/// \brief Gets the next buffer from the given child
/// \notes See GetNextInput for similar function that has built-in message handling
/// \param p_buffer - The shared pointer for the fetched buffer to return (by reference)
/// \return Status - The error code return
virtual Status GetNextBuffer(std::unique_ptr<DataBuffer> *p_buffer) { return GetNextBuffer(p_buffer, 0, false); }
// Gets the next buffer from the given child
// @notes See GetNextInput for similar function that has built-in message handling
// @param p_buffer - The shared pointer for the fetched buffer to return (by reference)
// @param worker_id - The worker id
// @param retry_if_eoe Set this flag to true to allow calling pop() again after the first pop() returns EOE.
// @return Status - The error code return
/// \brief Gets the next buffer from the given child
/// \notes See GetNextInput for similar function that has built-in message handling
/// \param p_buffer - The shared pointer for the fetched buffer to return (by reference)
/// \param worker_id - The worker id
/// \param retry_if_eoe Set this flag to true to allow calling pop() again after the first pop() returns EOE.
/// \return Status - The error code return
virtual Status GetNextBuffer(std::unique_ptr<DataBuffer> *p_buffer, int32_t worker_id, bool retry_if_eoe);
// Gets the next buffer from the given child . This function also has built-in eoe and eof
// message handling so that child classes don't have to manually code pass-through logic when
// those messages are received.
// @param p_buffer - The shared pointer for the fetched buffer to return (by reference)
// @param worker_id - The worker id
// @return Status - The error code return
/// \brief Gets the next buffer from the given child . This function also has built-in eoe and eof
/// message handling so that child classes don't have to manually code pass-through logic when
/// those messages are received.
/// \param p_buffer - The shared pointer for the fetched buffer to return (by reference)
/// \param worker_id - The worker id
/// \return Status - The error code return
Status GetNextInput(std::unique_ptr<DataBuffer> *p_buffer, int32_t worker_id = 0, int32_t child_index = 0);
// Performs handling for when an eoe message is received.
// The base class implementation simply flows the eoe message to output. Derived classes
// may override if they need to perform special eoe handling.
// @param worker_id - The worker id
// @return Status - The error code return
/// \brief Performs handling for when an eoe message is received.
/// The base class implementation simply flows the eoe message to output. Derived classes
/// may override if they need to perform special eoe handling.
/// \param worker_id - The worker id
/// \return Status - The error code return
virtual Status EoeReceived(int32_t worker_id);
// Performs handling for when an eof message is received.
// The base class implementation simply flows the eof message to output. Derived classes
// may override if they need to perform special eof handling.
// @param worker_id - The worker id
// @return Status - The error code return
/// \brief Performs handling for when an eof message is received.
/// The base class implementation simply flows the eof message to output. Derived classes
/// may override if they need to perform special eof handling.
/// \param worker_id - The worker id
/// \return Status - The error code return
virtual Status EofReceived(int32_t worker_id);
// Derived classes may implement the reset function if the operator is stateful and needs
// specific reset handling that is not contained in this common code version of the reset
// @return Status - The error code return
/// \brief Derived classes may implement the reset function if the operator is stateful and needs
/// specific reset handling that is not contained in this common code version of the reset
/// \return Status - The error code return
virtual Status Reset();
// This calls the reset function on this subtree in pre-order
// @return Status - The error code return
/// \brief This calls the reset function on this subtree in pre-order
/// \return Status - The error code return
virtual Status ResetSubtree() {
RETURN_IF_NOT_OK(Reset());
for (const auto &c : child_) {
@ -164,64 +171,68 @@ class DatasetOp : public std::enable_shared_from_this<DatasetOp> {
return Status::OK();
}
// During tree prepare phase, operators may have specific pre-operations to perform depending on
// their role.
// @notes Derived versions of this function should always call it's superclass version first
// before providing their own implementations.
/// \brief During tree prepare phase, operators may have specific pre-operations to perform depending on
/// their role.
/// \notes Derived versions of this function should always call it's superclass version first
/// before providing their own implementations.
virtual Status PrepareNodePreAction();
// During tree prepare phase, operators may have specific post-operations to perform depending on
// their role.
// @notes Derived versions of this function should always call it's superclass version first
// before providing their own implementations.
/// \brief During tree prepare phase, operators may have specific post-operations to perform depending on
/// their role.
/// \notes Derived versions of this function should always call it's superclass version first
/// before providing their own implementations.
virtual Status PrepareNodePostAction();
// Getter function
// @return The operator id
/// \brief Getter function
/// \return The operator id
int32_t id() const { return operator_id_; }
// Getter function
// @return The prepare flags
/// \brief Getter function
/// \return The prepare flags
virtual uint32_t PrepareFlags() const;
// Getter function
// @return The number of workers in this op
/// \brief Getter function
/// \return The number of workers in this op
virtual int32_t num_workers() const = 0;
// Getter function
// @return The number of threads consuming from previous op.
/// \brief Getter function
/// \return The number of threads consuming from previous op.
virtual int32_t num_consumers() const = 0;
// Getter function
// @return The number of threads producing to the output connector.
/// \brief Getter function
/// \return The number of threads producing to the output connector.
virtual int32_t num_producers() const = 0;
// Getter function
// @return T/F if this is an inlined operator
/// \brief Getter function
/// \return T/F if this is an inlined operator
bool inlined() const { return (oc_queue_size_ == 0); }
// Setter function
// @return Sets the control flags
/// \brief Setter function
/// \return Sets the control flags
void set_control_flag(uint64_t flag) { BitSet(&op_ctrl_flags_, flag); }
// Register the internal worker connectors. No op unless it is a parallel op
// @return Status
/// \brief Passes the given flag to BitClear on the operator control flags (counterpart of set_control_flag)
/// \param flag - the control flag bit(s) to clear
void ClearControlFlag(uint64_t flag) { BitClear(&op_ctrl_flags_, flag); }
/// \brief Register the internal worker connectors. No op unless it is a parallel op
/// \return Status
virtual Status RegisterWorkerConnectors() { return Status::OK(); }
// Getter for the column name mapping
// @return The returned map
/// \brief Getter for the column name mapping
/// \return The returned map
std::unordered_map<std::string, int32_t> column_name_id_map() const { return column_name_id_map_; }
// Checks if the column name map has been set up yet for this op
// @return - T/F if the operator has the map set up
/// \brief Checks if the column name map has been set up yet for this op
/// \return - true if the column name map is non-empty (i.e. it has been set up), false otherwise
bool HasColumnNameMap() const { return !column_name_id_map_.empty(); }
// gives a string output for the column map for handy debug printing
// @return - the column name map as a string
/// \brief gives a string output for the column map for handy debug printing
/// \return - the column name map as a string
std::string ColumnNameMapAsString() const;
// Getter function
// @return connector size of current op
/// \brief Getter function
/// \return connector size of current op
int32_t ConnectorSize() const {
if (!inlined()) {
return out_connector_->size();
@ -230,8 +241,13 @@ class DatasetOp : public std::enable_shared_from_this<DatasetOp> {
return ChildOpConnectorSize();
}
// Getter function
// @return connector size of current op
/// \brief Reports the number of buffers sent out by this op's output connector
/// \return the connector's out_buffers_count() as int64_t, or -1 if this op has no output connector
int64_t ConnectorOutBufferCount() const {
  if (out_connector_ == nullptr) {
    return -1;
  }
  return static_cast<int64_t>(out_connector_->out_buffers_count());
}
/// \brief Getter function
/// \return connector size of current op
int32_t ConnectorCapacity() const {
if (!inlined()) {
return out_connector_->capacity();
@ -240,51 +256,84 @@ class DatasetOp : public std::enable_shared_from_this<DatasetOp> {
return ChildOpConnectorCapacity();
}
// Getter function
// @return connector size of child op
/// \brief Getter function
/// \return connector size of child op
int32_t ChildOpConnectorSize(int32_t child_index = 0) const { return child_[child_index]->ConnectorSize(); }
// Getter function
// @return connector capacity of child op
/// \brief Getter function
/// \return connector capacity of child op
int32_t ChildOpConnectorCapacity(int32_t child_index = 0) const { return child_[child_index]->ConnectorCapacity(); }
// Children Getter
// @return Vector of Children
/// \brief Children Getter
/// \return Vector of Children
std::vector<std::shared_ptr<DatasetOp>> Children() const { return child_; }
// Base method for NodePass visit.
// Subclass needs to override this if it requires special node visit access.
// Check "dataset/engine/opt/pass.h" for more details.
// @return Statue of the node visit
/// \brief Base method for NodePass pre-visit. A tree walk consists of walking down the tree and also walking back up
/// in a depth-first order. PreAccept is the node visit on the way down, whereas the regular Accept is the main
/// visit on the way back up the tree during a post-order traversal. Subclass needs to override this if it
/// requires special node visit access. Check "dataset/engine/opt/pass.h" for more details.
/// \param[in] p The node to visit
/// \param[out] modified Indicator if the node was modified
/// \return Status of the node visit
virtual Status PreAccept(NodePass *p, bool *modified);
/// \brief Base method for NodePass visit. Subclass needs to override this if it requires special node visit access.
/// Check "dataset/engine/opt/pass.h" for more details.
/// \param[in] p The node to visit
/// \param[out] modified Indicator if the node was modified
/// \return Status of the node visit
virtual Status Accept(NodePass *p, bool *modified);
// Op name getter
// @return Name of the current Op
/// Op name getter
/// \return Name of the current Op
virtual std::string Name() const { return "DatasetOp"; }
// Execution Tree getter
// @return Pointer to the ExecutionTree the current op belongs to, no ownership
/// Execution Tree getter
/// \return Pointer to the ExecutionTree the current op belongs to, no ownership
ExecutionTree *Tree() { return tree_; }
/// Getter for the sampler
/// \return Shared pointer to the sampler (may return nullptr)
std::shared_ptr<Sampler> sampler() { return sampler_; }
/// Computes a CRC value for the operator
static uint32_t GenerateCRC(const std::shared_ptr<DatasetOp> &op);
/// \brief A helper templated function for casting "this" pointer to shared_ptr<derived>
/// Similar to shared_from_this, except this one will give you the derived class as shared_ptr
/// \return A shared_ptr casted to the derived class
template <typename Derived>
std::shared_ptr<Derived> shared_from_base() {
return std::static_pointer_cast<Derived>(shared_from_this());
}
protected:
// Adds a parent operator to this operator
// @notes External callers do not have access to this function.
// @param parent - The parent node to add
/// Adds a parent operator to this operator
/// \notes External callers do not have access to this function.
/// \param parent - The parent node to add
void AddParent(DatasetOp *parent);
// Removes a parent operator from this operator
// @notes External callers do not have access to this function.
// @param parent - The parent node to remove
void RemoveParent(DatasetOp *parent);
/// Removes a parent operator from this operator
/// \notes External callers do not have access to this function.
/// \param parent - The parent node to remove
void RemoveParent(const DatasetOp *parent);
// A helper function for providing an assignment of the column name map.
// This grabs the map from child 0 and assigns it into this op.
// Can only be used if number of children is 1.
// @return - Status
Status AssignColMapFromChild();
/// Compute the current op's column map using its child's column map.
/// Called during the tree post-prepare phase in PrepareNodePostAction.
/// This base implementation just inherits the map from child 0, and can only be used if the number of children is 1.
/// Operations that change the column map inherited from the child must override this function.
/// \return - Status
virtual Status ComputeColMap();
/// A helper function with some common code that leaf nodes can use during
/// pre/pare phase for checking if they need to assign a sampler to the cache.
/// \param random_access_op - indicate if this is a mappable random access leaf or not
/// \return - Status
Status SaveSamplerForCache(bool random_access_op);
std::vector<std::shared_ptr<DatasetOp>> child_; // Child nodes
std::vector<DatasetOp *> parent_; // Parent nodes. No ownership
std::shared_ptr<Sampler> sampler_; // Some leaf ops might have a sampler
int32_t oc_queue_size_; // Capacity for each out_connector_
int32_t operator_id_; // Generated id for the node
ExecutionTree *tree_; // Back pointer to our tree.
@ -292,18 +341,17 @@ class DatasetOp : public std::enable_shared_from_this<DatasetOp> {
uint32_t op_ctrl_flags_; // Flags for the operator
std::unique_ptr<DbConnector> out_connector_; // Output Connector
std::unordered_map<std::string, int32_t> column_name_id_map_; // Mapping between col index and col name
bool first_fetch_; // For use when setting column map
std::mutex column_name_map_mutex_; // For protecting shared access to the column map
private:
/// Sets the operator id.
/// \notes No public interface. Only the class itself, or its friend the execution tree, can set
///     this
/// \param op_id - the id value to store into operator_id_
void set_id(int32_t op_id) {
  operator_id_ = op_id;
}
/// Sets the tree into the op so that the operator has a back pointer to the tree.
/// The pointer is stored as-is in tree_.
/// \param tree - the tree to assign to the op.
void set_tree(ExecutionTree *tree) {
  tree_ = tree;
}
};
} // namespace dataset

View File

@ -313,7 +313,7 @@ void DeviceQueueOp::Print(std::ostream &out, bool show_all) const {
// Visitor accept method for NodePass
// Hands this operator, typed as DeviceQueueOp, to the pass.
// @param p - the node pass whose RunOnNode is invoked
// @param modified - forwarded unchanged to RunOnNode
// @return Status - whatever RunOnNode returns
Status DeviceQueueOp::Accept(NodePass *p, bool *modified) {
  // Downcast shared pointer then call visitor
  return p->RunOnNode(shared_from_base<DeviceQueueOp>(), modified);
}
} // namespace dataset

View File

@ -126,9 +126,6 @@ Status FilterOp::WorkerEntry(int32_t worker_id) {
continue;
}
// Now that the first fetch is in, use the helper function to assign the column name map to this op.
RETURN_IF_NOT_OK(DatasetOp::AssignColMapFromChild());
RETURN_IF_NOT_OK(CheckColumns(in_buffer.get(), &in_columns_));
// if the databuffer was all filtered, it is marked as kFilterEmpty.
@ -264,7 +261,7 @@ Status FilterOp::InvokePredicateFunc(const TensorRow &input, bool *out_predicate
// Visitor accept method for NodePass
// Hands this operator, typed as FilterOp, to the pass.
// @param p - the node pass whose RunOnNode is invoked
// @param modified - forwarded unchanged to RunOnNode
// @return Status - whatever RunOnNode returns
Status FilterOp::Accept(NodePass *p, bool *modified) {
  // Downcast shared pointer then call visitor
  return p->RunOnNode(shared_from_base<FilterOp>(), modified);
}
} // namespace dataset
} // namespace mindspore

View File

@ -54,20 +54,19 @@ Status MapOp::Builder::sanityCheck() const {
Status MapOp::Builder::Build(std::shared_ptr<MapOp> *ptr) {
RETURN_IF_NOT_OK(sanityCheck());
*ptr = std::make_shared<MapOp>(std::move(build_in_col_names_), std::move(build_out_col_names_),
std::move(build_tensor_funcs_), std::move(build_col_order_), build_num_workers_,
build_op_connector_size_, build_perf_mode_);
std::move(build_tensor_funcs_), build_num_workers_, build_op_connector_size_,
build_perf_mode_);
return Status::OK();
}
// Constructor of MapOp
MapOp::MapOp(const std::vector<std::string> &in_col_names, const std::vector<std::string> &out_col_names,
std::vector<std::shared_ptr<TensorOp>> tensor_funcs, const std::vector<std::string> &columns_order,
int32_t num_workers, int32_t op_connector_size, bool perf_mode)
std::vector<std::shared_ptr<TensorOp>> tensor_funcs, int32_t num_workers, int32_t op_connector_size,
bool perf_mode)
: ParallelOp(num_workers, op_connector_size),
tfuncs_(std::move(tensor_funcs)),
in_columns_(in_col_names),
out_columns_(out_col_names),
columns_order_(columns_order),
perf_mode_(perf_mode) {
// If caller didn't specify the out_col_names, assume they are same as the in_columns.
if (out_columns_.empty() || out_columns_[0].empty()) {
@ -101,7 +100,7 @@ void MapOp::Print(std::ostream &out, bool show_all) const {
}
out << "\n TensorOps:";
for (size_t i = 0; i < tfuncs_.size(); i++) {
out << " " << tfuncs_[i];
out << " " << *(tfuncs_[i].get());
}
out << "\n\n";
}
@ -156,14 +155,15 @@ Status MapOp::WorkerEntry(int32_t worker_id) {
// initializations that happen after the first fetch.
RETURN_IF_NOT_OK(FetchNextBuffer(&in_buffer, worker_id));
// Initialize details related to column selections and column map by calling WorkerEntryInit.
// WorkerEntryInit contains thread-safe lock to ensure that this init work is only performed once
// by the first worker to enter the codepath. All other threads will share the const info that
// gets set up here going forward.
// Sanity check the databuffer.
// Special case: if there's more threads than buffers, some threads simply get the final control
// messages (eoe/eof), and so they will not perform the init work.
// messages (eoe/eof), and so they will not perform the check.
if (!in_buffer->eoe() && !in_buffer->eof()) {
RETURN_IF_NOT_OK(WorkerEntryInit(in_buffer.get()));
int32_t num_rows = in_buffer->NumRows();
int32_t num_cols = in_buffer->NumCols();
if (num_rows == 0 || num_cols == 0) {
RETURN_STATUS_UNEXPECTED("MapOp is getting an empty DataBuffer.");
}
}
// Now that init work is done, drop into the main fetching loop.
@ -258,63 +258,18 @@ Status MapOp::WorkerCompute(DataBuffer *in_buffer, TensorQTable *new_tensor_tabl
return Status::OK();
}
// initialize some internal data structure used by WorkerEntry()
Status MapOp::WorkerEntryInit(const DataBuffer *in_buf) {
int32_t num_rows = in_buf->NumRows();
int32_t num_cols = in_buf->NumCols();
if (num_rows == 0 || num_cols == 0) {
RETURN_STATUS_UNEXPECTED("MapOp is getting an empty DataBuffer.");
Status MapOp::ComputeColMap() {
// If the map has not been set up yet in the base class, then set it up
if (column_name_id_map_.empty()) {
std::unordered_map<std::string, int32_t> current_name_id_map = child_[0]->column_name_id_map();
// Initialize private variables
RETURN_IF_NOT_OK(InitPrivateVariable(&current_name_id_map));
// Create the final column name to index mapping in the base class field
CreateFinalColMap(&current_name_id_map);
MS_LOG(DEBUG) << "Column name map for map op set: " << this->ColumnNameMapAsString();
} else {
MS_LOG(WARNING) << "Column name map is already set!";
}
// We can't use AssignColMapFromChild() here since we need to modify the column map. We need to be threadsafe
// though for saving the final map in the op, so use the lock here.
if (first_fetch_) {
std::unique_lock<std::mutex> lock(column_name_map_mutex_);
// If the map has not been set up yet in the base class, then we are the first one in to set it up
// (and we are under protection of the mutex lock)
if (column_name_id_map_.empty()) {
std::unordered_map<std::string, int32_t> current_name_id_map = child_[0]->column_name_id_map();
// If input_columns is empty(), The col at index-0 will be picked.
if (in_columns_.empty()) {
for (const auto &pair : current_name_id_map) {
if (pair.second == 0) {
MS_LOG(INFO) << "Input columns empty for map op, will apply to the first column in the current table.";
in_columns_.push_back(pair.first);
break;
}
}
// If caller didn't specify the out_col_names, assume they are same as the input_columns.
// This was done in the constructor, but if input columns was empty to start we have to redo it here.
if (out_columns_.empty() || out_columns_[0].empty()) {
out_columns_ = in_columns_;
}
}
// Before we continue, issue a sanity check to make sure the input columns from user and the incoming
// columns from child are correct
RETURN_IF_NOT_OK(this->ValidateInColumns(current_name_id_map));
// initialize keep_input_columns, true means to keep the column.
keep_input_columns_.resize(num_cols, true);
for (const auto &col_name : in_columns_) {
int32_t missed = current_name_id_map[col_name];
keep_input_columns_[missed] = false;
}
// initialize to_process_indices.
for (const auto &col_name : in_columns_) {
to_process_indices_.push_back(current_name_id_map[col_name]);
}
// Create the final column name to index mapping in the base class field
CreateFinalColMap(&current_name_id_map);
first_fetch_ = false;
}
} // mutex lock will release here
MS_LOG(DEBUG) << "Column name map for map op set: " << this->ColumnNameMapAsString();
return Status::OK();
}
@ -330,6 +285,42 @@ Status MapOp::ValidateInColumns(const std::unordered_map<std::string, int32_t> &
return Status::OK();
}
// Sets up the members that depend on the child's column map: the default in_columns_ / out_columns_,
// keep_input_columns_ and to_process_indices_.
// @param col_name_id_map - the column name to index mapping obtained from the child operator
// @return - Status (the result of ValidateInColumns on failure, OK otherwise)
Status MapOp::InitPrivateVariable(std::unordered_map<std::string, int32_t> *col_name_id_map) {
  // If input_columns is empty(), The col at index-0 will be picked.
  if (in_columns_.empty()) {
    for (const auto &name_and_id : *col_name_id_map) {
      if (name_and_id.second != 0) {
        continue;
      }
      MS_LOG(INFO) << "Input columns empty for map op, will apply to the first column in the current table.";
      in_columns_.push_back(name_and_id.first);
      break;
    }

    // If caller didn't specify the out_col_names, assume they are same as the input_columns.
    // This was done in the constructor, but if input columns was empty to start we have to redo it here.
    const bool no_out_columns = out_columns_.empty() || out_columns_[0].empty();
    if (no_out_columns) {
      out_columns_ = in_columns_;
    }
  }

  // Before we continue, issue a sanity check to make sure the input columns from user and the incoming
  // columns from child are correct
  RETURN_IF_NOT_OK(this->ValidateInColumns(*col_name_id_map));

  // Every column starts out as "keep" (true); each input column is then flipped to false and its
  // index is recorded as one to process.
  keep_input_columns_.resize(col_name_id_map->size(), true);
  for (const auto &col_name : in_columns_) {
    const int32_t col_idx = (*col_name_id_map)[col_name];
    keep_input_columns_[col_idx] = false;
    to_process_indices_.push_back(col_idx);
  }
  return Status::OK();
}
// Create the final column name to index mapping and get indices of the columns this mapop does not use.
void MapOp::CreateFinalColMap(std::unordered_map<std::string, int32_t> *col_name_id_map) {
std::unordered_map<std::string, int32_t> final_col_name_id_map;
@ -376,7 +367,7 @@ void MapOp::CreateFinalColMap(std::unordered_map<std::string, int32_t> *col_name
// Visitor accept method for NodePass
// Hands this operator, typed as MapOp, to the pass.
// @param p - the node pass whose RunOnNode is invoked
// @param modified - forwarded unchanged to RunOnNode
// @return Status - whatever RunOnNode returns
Status MapOp::Accept(NodePass *p, bool *modified) {
  // Downcast shared pointer then call visitor
  return p->RunOnNode(shared_from_base<MapOp>(), modified);
}
} // namespace dataset
} // namespace mindspore

View File

@ -93,13 +93,6 @@ class MapOp : public ParallelOp {
return *this;
}
// Setter method.
// @return Builder setter method returns reference to the builder.
Builder &SetColOrder(const std::vector<std::string> &col_order_) {
build_col_order_ = col_order_;
return *this;
}
// Setter method.
// @return Builder setter method returns reference to the builder.
Builder &SetNumWorkers(int32_t num_workers) {
@ -130,7 +123,6 @@ class MapOp : public ParallelOp {
std::vector<std::string> build_in_col_names_;
std::vector<std::string> build_out_col_names_;
std::vector<std::shared_ptr<TensorOp>> build_tensor_funcs_;
std::vector<std::string> build_col_order_;
int32_t build_num_workers_;
int32_t build_op_connector_size_;
bool build_perf_mode_; // Default true.
@ -145,12 +137,11 @@ class MapOp : public ParallelOp {
// @param in_col_names A list of input column names (should match the input/output \p tensorFuncs).
// @param out_col_names A list of output column names (should match the input/output \p tensorFuncs).
// @param tensor_funcs A list of TensorOp pointers for MapOp to apply to each data.
// @param columns_order names A full list of column names (should match the whole dataset view post \p tensorFuncs).
// @param num_workers The number of worker threads.
// @param op_connector_size The size of each queue in the connector.
MapOp(const std::vector<std::string> &in_col_names, const std::vector<std::string> &out_col_names,
std::vector<std::shared_ptr<TensorOp>> tensor_funcs, const std::vector<std::string> &columns_order,
int32_t num_workers, int32_t op_connector_size, bool perf_mode);
std::vector<std::shared_ptr<TensorOp>> tensor_funcs, int32_t num_workers, int32_t op_connector_size,
bool perf_mode);
// Destructor
~MapOp() = default;
@ -190,10 +181,6 @@ class MapOp : public ParallelOp {
// @return Name of the current Op
std::string Name() const override { return "MapOp"; }
// Columns order getter
// @return The post map columns order
std::vector<std::string> const &ColumnsOrder() const { return columns_order_; }
private:
// Local queues where worker threads can pop from.
// Popping directly from the Connector can block if the previous designated threads haven't pop.
@ -215,9 +202,6 @@ class MapOp : public ParallelOp {
// Indices of the columns to process.
std::vector<size_t> to_process_indices_;
// Variable to store the column_order of all columns post tensorOps
std::vector<std::string> columns_order_;
// Performance mode is when the main thread creates local queues, pulls databuffers from the previous
// op's Connector and distributes them to the local queues. Workers pull from the local queues.
// If this flag is false, each worker pulls directly from the Connector. This use less resources
@ -258,15 +242,18 @@ class MapOp : public ParallelOp {
// @param col_name_id_map The column name to index mapping obtained from child operator
void CreateFinalColMap(std::unordered_map<std::string, int32_t> *col_name_id_map);
// Private function that initialize some internal data structure used by WorkerEntry()
// @param in_buf A raw pointer to the DataBuffer. A raw pointer is fine because this function does not manage memory
// and is not shared with other threads.
Status WorkerEntryInit(const DataBuffer *in_buf);
// Validating if each of the input_columns exists in the DataBuffer.
// @param - the column map to check
// @return - status return code
Status ValidateInColumns(const std::unordered_map<std::string, int32_t> &col_name_id_map);
// Private function for computing the assignment of the column name map.
// @return - Status
Status ComputeColMap() override;
// Private function for initializing private variables such as in_columns_, out_columns_.
// @return - Status
Status InitPrivateVariable(std::unordered_map<std::string, int32_t> *col_name_id_map);
};
} // namespace dataset
} // namespace mindspore

View File

@ -26,8 +26,8 @@
namespace mindspore {
namespace dataset {
// Constructor
ParallelOp::ParallelOp(int32_t num_workers, int32_t op_connector_size)
: DatasetOp(op_connector_size),
ParallelOp::ParallelOp(int32_t num_workers, int32_t op_connector_size, std::shared_ptr<Sampler> sampler)
: DatasetOp(op_connector_size, sampler),
num_workers_(num_workers),
num_producers_(num_workers),
worker_connector_size_(1),

View File

@ -38,7 +38,8 @@ class ParallelOp : public DatasetOp {
// Constructor
// @param num_workers
// @param op_connector_size - size of the output connector for this operator
ParallelOp(int32_t num_workers, int32_t op_connector_size);
// @param sampler - The sampler for the op
ParallelOp(int32_t num_workers, int32_t op_connector_size, std::shared_ptr<Sampler> sampler = nullptr);
// Destructor
~ParallelOp() = default;

View File

@ -20,7 +20,8 @@
namespace mindspore {
namespace dataset {
// Constructor
PipelineOp::PipelineOp(int32_t op_connector_size) : DatasetOp(op_connector_size) {}
PipelineOp::PipelineOp(int32_t op_connector_size, std::shared_ptr<Sampler> sampler)
: DatasetOp(op_connector_size, sampler) {}
// A print method typically used for debugging
void PipelineOp::Print(std::ostream &out, bool show_all) const {

View File

@ -32,7 +32,8 @@ class PipelineOp : public DatasetOp {
// Constructor
// @param op_connector_size - size of the output connector
// @return Builder setter method returns reference to the builder.
explicit PipelineOp(int32_t op_connector_size);
// @param sampler - The sampler for the op
explicit PipelineOp(int32_t op_connector_size, std::shared_ptr<Sampler> sampler = nullptr);
// Destructor
~PipelineOp() = default;

View File

@ -74,24 +74,6 @@ void ProjectOp::Print(std::ostream &out, bool show_all) const {
Status ProjectOp::GetNextBuffer(std::unique_ptr<DataBuffer> *p_buffer, int32_t worker_id, bool retry_if_eoe) {
RETURN_IF_NOT_OK(child_[0]->GetNextBuffer(p_buffer, worker_id, retry_if_eoe));
if (!((*p_buffer)->eoe()) && !((*p_buffer)->eof())) {
// Only for the first buffer fetched, get the column map of the incoming data and save it
// into our own column name map after making the appropriate mods
// We cannot use the super class AssignColMapFromChild here because we're making a modification of the
// map from the child map.
if (first_fetch_) {
std::unordered_map<std::string, int32_t> child_column_name_mapping = child_[0]->column_name_id_map();
for (size_t i = 0; i < columns_to_project_.size(); i++) {
std::string &current_column = columns_to_project_[i];
if (child_column_name_mapping.find(current_column) == child_column_name_mapping.end()) {
std::string err_msg = "ProjectOp: column " + current_column + " does not exist in child operator.";
RETURN_STATUS_UNEXPECTED(err_msg);
}
// Setup the new column name mapping for ourself (base class field)
column_name_id_map_[current_column] = i;
projected_column_indices_.push_back(child_column_name_mapping[current_column]);
}
first_fetch_ = false; // we only need to do this path once
}
RETURN_IF_NOT_OK(Project(p_buffer));
}
return Status::OK();
@ -149,7 +131,29 @@ Status ProjectOp::EofReceived(int32_t worker_id) { return Status::OK(); }
// Visitor accept method for NodePass
// Hands this operator, typed as ProjectOp, to the pass.
// @param p - the node pass whose RunOnNode is invoked
// @param modified - forwarded unchanged to RunOnNode
// @return Status - whatever RunOnNode returns
Status ProjectOp::Accept(NodePass *p, bool *modified) {
  // Downcast shared pointer then call visitor
  return p->RunOnNode(shared_from_base<ProjectOp>(), modified);
}
// Compute the column map and save it into our own column name map
// We cannot use the super class ComputeColMap here because we're making a modification of the
// map from the child map.
// The new name map and the projected indices are staged in locals and committed only after every
// projected column has been found in the child's map. A failure therefore leaves this op's state
// untouched, so a later call does not mistake a half-built map for "already set".
// @return - Status: an error if a projected column is missing from the child's column map, OK otherwise
Status ProjectOp::ComputeColMap() {
  if (!column_name_id_map_.empty()) {
    MS_LOG(WARNING) << "Column name map is already set!";
    return Status::OK();
  }

  const std::unordered_map<std::string, int32_t> child_column_name_mapping = child_[0]->column_name_id_map();
  std::unordered_map<std::string, int32_t> staged_name_id_map;
  std::vector<int32_t> staged_projected_indices;
  staged_projected_indices.reserve(columns_to_project_.size());

  for (size_t i = 0; i < columns_to_project_.size(); i++) {
    const std::string &current_column = columns_to_project_[i];
    // Single lookup: the iterator both answers "does it exist" and provides the child's index.
    const auto child_entry = child_column_name_mapping.find(current_column);
    if (child_entry == child_column_name_mapping.end()) {
      std::string err_msg = "ProjectOp: column " + current_column + " does not exist in child operator.";
      RETURN_STATUS_UNEXPECTED(err_msg);
    }
    // The projected column's new index is its position in columns_to_project_
    staged_name_id_map[current_column] = static_cast<int32_t>(i);
    staged_projected_indices.push_back(child_entry->second);
  }

  // Everything validated: commit the new column name mapping (base class field) and the indices.
  column_name_id_map_.swap(staged_name_id_map);
  projected_column_indices_.insert(projected_column_indices_.end(), staged_projected_indices.begin(),
                                   staged_projected_indices.end());
  return Status::OK();
}
} // namespace dataset
} // namespace mindspore

View File

@ -116,6 +116,10 @@ class ProjectOp : public PipelineOp {
std::vector<int32_t> projected_column_indices_;
Status Project(std::unique_ptr<DataBuffer> *data_buffer);
// Computing the assignment of the column name map.
// @return - Status
Status ComputeColMap() override;
};
} // namespace dataset
} // namespace mindspore

View File

@ -69,12 +69,6 @@ Status RenameOp::operator()() {
RETURN_STATUS_UNEXPECTED(err_msg);
}
// First, populate the column map from the input child.
// This will not be the final map for output from this op.
RETURN_IF_NOT_OK(DatasetOp::AssignColMapFromChild());
// core rename functionality only needs to happen once, to identify the new column names/indexes
RETURN_IF_NOT_OK(RenameColumns());
while (curr_buffer->eof() == false) {
while (curr_buffer->eoe() == false) {
// push the renamed input buffer
@ -95,45 +89,52 @@ Status RenameOp::operator()() {
return Status::OK();
}
// renames the columns
Status RenameOp::RenameColumns() {
// iterate over my index in input vector, find the corresponding position
std::unordered_map<std::string, int32_t> new_col_name_id_map = {};
// parameter for input check
size_t found = 0;
// Rename core functionality to compute the new column name id map.
// We need to overwrite the super class ComputeColMap here because we're making a modification of the
// map from the child map.
Status RenameOp::ComputeColMap() {
if (column_name_id_map_.empty()) {
column_name_id_map_ = child_[0]->column_name_id_map();
// iterate over my index in input vector, find the corresponding position
std::unordered_map<std::string, int32_t> new_col_name_id_map = {};
// parameter for input check
size_t found = 0;
// iterate over all the pairs and if there is a name match with rename, rename the column and add it to new map
// by doing it this way we recreate a new ColNameIdMap and allow for switching
for (const auto &pair : column_name_id_map_) {
std::string name = pair.first;
int32_t id = pair.second;
// find name
std::vector<std::string>::iterator it;
it = std::find(in_columns_.begin(), in_columns_.end(), name);
// for c input checks here we have to count the number of times we find the stuff in in_columns_
// because we iterate over the mInputList n times
if (it != in_columns_.end()) {
// found
found += 1;
int index = std::distance(in_columns_.begin(), it);
MS_LOG(DEBUG) << "Rename operator index found " << index << " value " << id << ".";
// iterate over all the pairs and if there is a name match with rename, rename the column and add it to new map
// by doing it this way we recreate a new ColNameIdMap and allow for switching
for (const auto &pair : column_name_id_map_) {
std::string name = pair.first;
int32_t id = pair.second;
// find name
std::vector<std::string>::iterator it;
it = std::find(in_columns_.begin(), in_columns_.end(), name);
// for c input checks here we have to count the number of times we find the stuff in in_columns_
// because we iterate over the mInputList n times
if (it != in_columns_.end()) {
// found
found += 1;
int index = std::distance(in_columns_.begin(), it);
MS_LOG(DEBUG) << "Rename operator index found " << index << " value " << id << ".";
new_col_name_id_map[out_columns_[index]] = id;
} else {
// not found
MS_LOG(DEBUG) << "Rename operator index not found: " << id << " is the column id.";
new_col_name_id_map[name] = id;
new_col_name_id_map[out_columns_[index]] = id;
} else {
// not found
MS_LOG(DEBUG) << "Rename operator index not found: " << id << " is the column id.";
new_col_name_id_map[name] = id;
}
}
// only checks number of renamed columns have been found, this input check doesn't check everything
if (found != in_columns_.size()) {
MS_LOG(DEBUG) << "Rename operator column names found: " << found << " out of " << in_columns_.size() << ".";
std::string err_msg = "Renamed column doesn't exist in dataset";
RETURN_STATUS_UNEXPECTED(err_msg);
}
}
// only checks number of renamed columns have been found, this input check doesn't check everything
if (found != in_columns_.size()) {
MS_LOG(DEBUG) << "Rename operator column names found: " << found << " out of " << in_columns_.size() << ".";
std::string err_msg = "Renamed column doesn't exist in dataset";
RETURN_STATUS_UNEXPECTED(err_msg);
}
// Now, overwrite our column map with the new renamed columns/id's
column_name_id_map_ = new_col_name_id_map;
// Now, overwrite our column map with the new renamed columns/id's
column_name_id_map_ = new_col_name_id_map;
} else {
MS_LOG(WARNING) << "Column name map is already set!";
}
return Status::OK();
}
@ -175,7 +176,7 @@ Status RenameOp::EoeReceived(int32_t) {
// Visitor accept method for NodePass
// Hands this operator, typed as RenameOp, to the pass.
// @param p - the node pass whose RunOnNode is invoked
// @param modified - forwarded unchanged to RunOnNode
// @return Status - whatever RunOnNode returns
Status RenameOp::Accept(NodePass *p, bool *modified) {
  // Downcast shared pointer then call visitor
  return p->RunOnNode(shared_from_base<RenameOp>(), modified);
}
} // namespace dataset
} // namespace mindspore

View File

@ -122,7 +122,9 @@ class RenameOp : public PipelineOp {
protected:
// Rename core functionality
Status RenameColumns();
// Computing the assignment of the new column name map.
// @return - Status
Status ComputeColMap() override;
// Variable to store the input column names
std::vector<std::string> in_columns_;

View File

@ -82,14 +82,14 @@ void RepeatOp::Print(std::ostream &out, bool show_all) const {
Status RepeatOp::PrepareNodePostAction() {
// Run any common code from super class first before adding our own specific logic
RETURN_IF_NOT_OK(PipelineOp::PrepareNodePostAction());
std::shared_ptr<DatasetOp> leaf_op = tree_->PopFromRepeatStack();
std::shared_ptr<DatasetOp> leaf_op = tree_->PopFromEOEOpStack();
while (leaf_op != nullptr) {
// Track the leaf operators that are under this repeat op.
eoe_ops_.push_back(leaf_op);
leaf_op = tree_->PopFromRepeatStack();
leaf_op = tree_->PopFromEOEOpStack();
}
// Push ourselves to the stack in case one of our ascendants is repeat too.
tree_->AddToRepeatStack(shared_from_this());
tree_->AddToEOEOpStack(shared_from_this());
return Status::OK();
}
@ -123,8 +123,6 @@ Status RepeatOp::GetNextBuffer(std::unique_ptr<DataBuffer> *p_buffer, int32_t wo
if (buf->eof()) {
RETURN_IF_NOT_OK(EofReceived(worker_id));
}
// Update the column name map if needed
RETURN_IF_NOT_OK(DatasetOp::AssignColMapFromChild());
*p_buffer = std::move(buf);
return Status::OK();
}
@ -192,7 +190,7 @@ int32_t RepeatOp::num_producers() const {
// Visitor accept method for NodePass
// Hands this operator, typed as RepeatOp, to the pass.
// @param p - the node pass whose RunOnNode is invoked
// @param modified - forwarded unchanged to RunOnNode
// @return Status - whatever RunOnNode returns
Status RepeatOp::Accept(NodePass *p, bool *modified) {
  // Downcast shared pointer then call visitor
  return p->RunOnNode(shared_from_base<RepeatOp>(), modified);
}
} // namespace dataset
} // namespace mindspore

View File

@ -266,9 +266,6 @@ Status ShuffleOp::InitShuffleBuffer() {
RETURN_STATUS_UNEXPECTED("Unable to fetch a single row for shuffle buffer.");
}
// Now that a first fetch is done, assign the column map for this operator
RETURN_IF_NOT_OK(DatasetOp::AssignColMapFromChild());
// Now fill the rest of the shuffle buffer until we are unable to get the next row or we reached
// the desired shuffle buffer size.
while (!new_row.empty() && shuffle_buffer_->size() < static_cast<size_t>(shuffle_size_ - 1)) {
@ -301,7 +298,7 @@ Status ShuffleOp::EoeReceived(int32_t worker_id) {
// Visitor accept method for NodePass
// Hands this operator, typed as ShuffleOp, to the pass.
// @param p - the node pass whose RunOnNode is invoked
// @param modified - forwarded unchanged to RunOnNode
// @return Status - whatever RunOnNode returns
Status ShuffleOp::Accept(NodePass *p, bool *modified) {
  // Downcast shared pointer then call visitor
  return p->RunOnNode(shared_from_base<ShuffleOp>(), modified);
}
} // namespace dataset
} // namespace mindspore

View File

@ -86,9 +86,6 @@ Status SkipOp::operator()() {
std::unique_ptr<DataBuffer> curr_buffer;
RETURN_IF_NOT_OK(GetNextInput(&curr_buffer));
// After the first buffer fetch above we can do the one-time assign of the column name map
RETURN_IF_NOT_OK(DatasetOp::AssignColMapFromChild());
while (curr_buffer->eof() == false) {
// Reset count
skip_count_ = 0;
@ -133,7 +130,7 @@ Status SkipOp::EofReceived(int32_t worker_id) {
// Visitor accept method for NodePass
// Hands this operator, typed as SkipOp, to the pass.
// @param p - the node pass whose RunOnNode is invoked
// @param modified - forwarded unchanged to RunOnNode
// @return Status - whatever RunOnNode returns
Status SkipOp::Accept(NodePass *p, bool *modified) {
  // Downcast shared pointer then call visitor
  return p->RunOnNode(shared_from_base<SkipOp>(), modified);
}
} // namespace dataset
} // namespace mindspore

View File

@ -70,20 +70,14 @@ Status CelebAOp::Builder::SanityCheck() {
CelebAOp::CelebAOp(int32_t num_workers, int32_t rows_per_buffer, const std::string &dir, int32_t queue_size,
bool decode, const std::string &dataset_type, const std::set<std::string> &exts,
std::unique_ptr<DataSchema> schema, std::shared_ptr<Sampler> sampler)
: ParallelOp(num_workers, queue_size),
: ParallelOp(num_workers, queue_size, std::move(sampler)),
rows_per_buffer_(rows_per_buffer),
folder_path_(dir),
decode_(decode),
extensions_(exts),
data_schema_(std::move(schema)),
sampler_(std::move(sampler)),
num_rows_in_attr_file_(0),
dataset_type_(dataset_type) {
// Set the column name map (base class field)
for (int32_t index = 0; index < data_schema_->NumColumns(); index++) {
column_name_id_map_[data_schema_->column(index).name()] = index;
}
attr_info_queue_ = std::make_unique<Queue<std::vector<std::string>>>(queue_size);
io_block_queues_.Init(num_workers_, queue_size);
}
@ -413,5 +407,17 @@ Status CelebAOp::Reset() {
wp_.Set(); // wake up master thread after reset is done
return Status::OK();
}
// Fills the base-class column name map from data_schema_: each column's name maps to its position
// in the schema. If the map already has entries, only a warning is logged.
// @return - Status (OK on both paths)
Status CelebAOp::ComputeColMap() {
  if (!column_name_id_map_.empty()) {
    MS_LOG(WARNING) << "Column name map is already set!";
    return Status::OK();
  }

  // Set the column name map (base class field)
  for (int32_t col_pos = 0; col_pos < data_schema_->NumColumns(); col_pos++) {
    column_name_id_map_[data_schema_->column(col_pos).name()] = col_pos;
  }
  return Status::OK();
}
} // namespace dataset
} // namespace mindspore

View File

@ -212,12 +212,15 @@ class CelebAOp : public ParallelOp, RandomAccessOp {
// @return Status - The error code return
Status Reset() override;
// Private function for computing the assignment of the column name map.
// @return - Status
Status ComputeColMap() override;
int32_t rows_per_buffer_;
std::string folder_path_; // directory of celeba folder
bool decode_;
std::set<std::string> extensions_; // extensions allowed
std::unique_ptr<DataSchema> data_schema_;
std::shared_ptr<Sampler> sampler_;
std::unique_ptr<Queue<std::vector<std::string>>> attr_info_queue_;
int64_t num_rows_in_attr_file_; // rows number specified in attr file
QueueList<std::unique_ptr<IOBlock>> io_block_queues_;

View File

@ -79,18 +79,13 @@ Status CifarOp::Builder::SanityCheck() {
CifarOp::CifarOp(CifarType type, int32_t num_works, int32_t rows_per_buf, const std::string &file_dir,
int32_t queue_size, std::unique_ptr<DataSchema> data_schema, std::shared_ptr<Sampler> sampler)
: ParallelOp(num_works, queue_size),
: ParallelOp(num_works, queue_size, std::move(sampler)),
cifar_type_(type),
rows_per_buffer_(rows_per_buf),
folder_path_(file_dir),
data_schema_(std::move(data_schema)),
sampler_(std::move(sampler)),
row_cnt_(0),
buf_cnt_(0) {
// set the column name map (base class field)
for (uint32_t i = 0; i < data_schema_->NumColumns(); ++i) {
column_name_id_map_[data_schema_->column(i).name()] = i;
}
constexpr uint64_t kUtilQueueSize = 512;
cifar_raw_data_block_ = std::make_unique<Queue<std::vector<unsigned char>>>(kUtilQueueSize);
io_block_queues_.Init(num_workers_, queue_size);
@ -454,5 +449,17 @@ Status CifarOp::CountTotalRows(const std::string &dir, bool isCIFAR10, int64_t *
return Status::OK();
}
}
// Fills the base-class column name map from data_schema_: each column's name maps to its position
// in the schema. If the map already has entries, only a warning is logged.
// The index is an int32_t so that it matches the map's int32_t value type and the equivalent loop in
// CelebAOp::ComputeColMap, instead of storing an unsigned counter into a signed slot.
// @return - Status (OK on both paths)
Status CifarOp::ComputeColMap() {
  // set the column name map (base class field)
  if (column_name_id_map_.empty()) {
    for (int32_t i = 0; i < data_schema_->NumColumns(); ++i) {
      column_name_id_map_[data_schema_->column(i).name()] = i;
    }
  } else {
    MS_LOG(WARNING) << "Column name map is already set!";
  }
  return Status::OK();
}
} // namespace dataset
} // namespace mindspore

View File

@ -208,11 +208,14 @@ class CifarOp : public ParallelOp, public RandomAccessOp {
// @return Status - The error code return
Status GetClassIds(std::map<int32_t, std::vector<int64_t>> *cls_ids) const override;
// Private function for computing the assignment of the column name map.
// @return - Status
Status ComputeColMap() override;
CifarType cifar_type_;
int32_t rows_per_buffer_;
std::string folder_path_;
std::unique_ptr<DataSchema> data_schema_;
std::shared_ptr<Sampler> sampler_;
int64_t row_cnt_;
int64_t buf_cnt_;

View File

@ -31,11 +31,7 @@
namespace mindspore {
namespace dataset {
ClueOp::Builder::Builder()
: builder_device_id_(0),
builder_num_devices_(1),
builder_num_samples_(0),
builder_shuffle_files_(false),
builder_shuffle_global_(false) {
: builder_device_id_(0), builder_num_devices_(1), builder_num_samples_(0), builder_shuffle_files_(false) {
std::shared_ptr<ConfigManager> config_manager = GlobalContext::config_manager();
builder_num_workers_ = config_manager->num_parallel_workers();
builder_op_connector_size_ = config_manager->op_connector_size();
@ -66,8 +62,8 @@ Status ClueOp::Builder::Build(std::shared_ptr<ClueOp> *op) {
std::shared_ptr<ClueOp> clue_op = std::make_shared<ClueOp>(
builder_num_workers_, builder_rows_per_buffer_, builder_num_samples_, builder_worker_connector_size_, ck_map,
builder_clue_files_list_, builder_op_connector_size_, builder_shuffle_files_, builder_shuffle_global_,
builder_num_devices_, builder_device_id_);
builder_clue_files_list_, builder_op_connector_size_, builder_shuffle_files_, builder_num_devices_,
builder_device_id_);
RETURN_IF_NOT_OK(clue_op->Init());
*op = std::move(clue_op);
@ -87,7 +83,7 @@ std::vector<std::string> ClueOp::Builder::split(const std::string &s, char delim
ClueOp::ClueOp(int32_t num_workers, int64_t rows_per_buffer, int64_t num_samples, int32_t worker_connector_size,
ColKeyMap cols_to_keyword, std::vector<std::string> clue_files_list, int32_t op_connector_size,
bool shuffle_files, bool shuffle_global, int32_t num_device, int32_t device_id)
bool shuffle_files, int32_t num_device, int32_t device_id)
: ParallelOp(num_workers, op_connector_size),
rows_per_buffer_(rows_per_buffer),
num_rows_per_shard_(0),
@ -98,7 +94,6 @@ ClueOp::ClueOp(int32_t num_workers, int64_t rows_per_buffer, int64_t num_samples
load_jagged_connector_(true),
cols_to_keyword_(cols_to_keyword),
shuffle_files_(shuffle_files),
shuffle_global_(shuffle_global),
finished_reading_dataset_(false),
num_devices_(num_device),
device_id_(device_id),
@ -112,13 +107,6 @@ Status ClueOp::Init() {
int32_t safe_queue_size = static_cast<int32_t>(std::ceil(clue_files_list_.size() / num_workers_) + 1);
io_block_queues_.Init(num_workers_, safe_queue_size);
// Set the column name mapping (base class field)
int count = 0;
for (auto &p : cols_to_keyword_) {
column_name_id_map_[p.first] = count;
count++;
}
RETURN_IF_NOT_OK(ParallelOp::CreateWorkerConnector(worker_connector_size_));
jagged_buffer_connector_ = std::make_unique<JaggedConnector>(num_workers_, 1, worker_connector_size_);
@ -549,5 +537,19 @@ Status ClueOp::CountAllFileRows(const std::vector<std::string> &files, int64_t *
}
return Status::OK();
}
// Fills the column name mapping (base class field) from cols_to_keyword_.
// If the map already holds entries it is left untouched and a warning is logged.
// @return Status - always Status::OK()
Status ClueOp::ComputeColMap() {
  // Guard: never overwrite a map that has already been populated.
  if (!column_name_id_map_.empty()) {
    MS_LOG(WARNING) << "Column name map is already set!";
    return Status::OK();
  }
  // Ids are handed out sequentially, following the iteration order of cols_to_keyword_.
  int next_id = 0;
  for (auto &entry : cols_to_keyword_) {
    column_name_id_map_[entry.first] = next_id++;
  }
  return Status::OK();
}
} // namespace dataset
} // namespace mindspore

View File

@ -104,13 +104,6 @@ class ClueOp : public ParallelOp {
return *this;
}
// Setter method.
// @return Builder - setter method returns reference to the builder.
Builder &SetShuffleGlobal(bool shuffle_global) {
builder_shuffle_global_ = shuffle_global;
return *this;
}
// Setter method.
// @return Builder - setter method returns reference to the builder.
Builder &SetNumSamples(int64_t num_samples) {
@ -139,15 +132,13 @@ class ClueOp : public ParallelOp {
int32_t builder_worker_connector_size_;
std::vector<std::string> builder_clue_files_list_;
bool builder_shuffle_files_;
bool builder_shuffle_global_;
std::map<std::string, std::string> builder_cols_to_keyword_;
};
// Constructor of ClueOp
// @param shuffle_global - whether or not to shuffle the entire dataset.
ClueOp(int32_t num_workers, int64_t rows_per_buffer, int64_t num_samples, int32_t worker_connector_size,
ColKeyMap cols_to_keyword, std::vector<std::string> clue_files_list, int32_t op_connector_size,
bool shuffle_files, bool shuffle_global, int32_t num_devices, int32_t device_id);
bool shuffle_files, int32_t num_devices, int32_t device_id);
// Default destructor
~ClueOp() = default;
@ -182,10 +173,6 @@ class ClueOp : public ParallelOp {
// @return Vector of the input file names
std::vector<std::string> FileNames() { return clue_files_list_; }
// Global shuffle flag getter
// @return Bool - whether this Op requires global shuffle
bool RequireGlobalShuffle() { return shuffle_global_; }
private:
// The entry point for when workers are launched.
// @param worker_id - the id of the worker that is executing this function.
@ -263,9 +250,12 @@ class ClueOp : public ParallelOp {
// @return Status - the error code returned.
Status GetValue(const nlohmann::json &js, std::vector<std::string> key_chain, std::shared_ptr<Tensor> *t);
// Private function for computing the assignment of the column name map.
// @return - Status
Status ComputeColMap() override;
int32_t device_id_;
bool shuffle_files_;
bool shuffle_global_;
bool finished_reading_dataset_;
int32_t num_devices_;
int64_t rows_per_buffer_;

View File

@ -129,10 +129,6 @@ CocoOp::CocoOp(const TaskType &task_type, const std::string &image_folder_path,
rows_per_buffer_(rows_per_buffer),
sampler_(std::move(sampler)),
data_schema_(std::move(data_schema)) {
// Set the column name map (base class field)
for (int32_t i = 0; i < data_schema_->NumColumns(); ++i) {
column_name_id_map_[data_schema_->column(i).name()] = i;
}
io_block_queues_.Init(num_workers_, queue_size);
}
@ -627,5 +623,17 @@ Status CocoOp::GetClassIndexing(const std::string &dir, const std::string &file,
*output_class_indexing = op->label_index_;
return Status::OK();
}
// Fills the column name map (base class field) from the data schema.
// If the map already holds entries it is left untouched and a warning is logged.
// @return Status - always Status::OK()
Status CocoOp::ComputeColMap() {
  // Guard: never overwrite a map that has already been populated.
  if (!column_name_id_map_.empty()) {
    MS_LOG(WARNING) << "Column name map is already set!";
    return Status::OK();
  }
  // Associate every schema column name with its positional index.
  for (int32_t col_idx = 0; col_idx < data_schema_->NumColumns(); ++col_idx) {
    column_name_id_map_[data_schema_->column(col_idx).name()] = col_idx;
  }
  return Status::OK();
}
} // namespace dataset
} // namespace mindspore

View File

@ -306,6 +306,10 @@ class CocoOp : public ParallelOp, public RandomAccessOp {
template <typename T>
Status SearchNodeInJson(nlohmann::json input_tree, std::string node_name, T *output_node);
// Private function for computing the assignment of the column name map.
// @return - Status
Status ComputeColMap() override;
bool decode_;
int64_t row_cnt_;
int64_t buf_cnt_;

View File

@ -94,12 +94,6 @@ void GeneratorOp::Dealloc() noexcept {
Status GeneratorOp::Init() {
// Reset BufferID
buffer_id_ = 0;
// Setup column names map (base class field)
if (column_name_id_map_.empty()) {
for (int i = 0; i < column_names_.size(); ++i) {
column_name_id_map_[column_names_[i]] = i;
}
}
Status ret;
{
// Acquire Python GIL
@ -255,7 +249,19 @@ Status GeneratorOp::Reset() {
// Visitor accept method for NodePass
Status GeneratorOp::Accept(NodePass *p, bool *modified) {
// Downcast shared pointer then call visitor
return p->RunOnNode(std::static_pointer_cast<GeneratorOp>(shared_from_this()), modified);
return p->RunOnNode(shared_from_base<GeneratorOp>(), modified);
}
// Fills the column name map (base class field) from column_names_.
// Each name is mapped to its position in column_names_. If the map already
// holds entries it is left untouched and a warning is logged.
// @return Status - always Status::OK()
Status GeneratorOp::ComputeColMap() {
  // Setup column names map (base class field)
  if (column_name_id_map_.empty()) {
    // Convert the size once so the loop compares int with int rather than
    // mixing a signed index with the unsigned value returned by size().
    const int num_columns = static_cast<int>(column_names_.size());
    for (int i = 0; i < num_columns; ++i) {
      column_name_id_map_[column_names_[i]] = i;
    }
  } else {
    MS_LOG(WARNING) << "Column name map is already set!";
  }
  return Status::OK();
}
} // namespace dataset
} // namespace mindspore

View File

@ -150,6 +150,10 @@ class GeneratorOp : public PipelineOp {
Status PyRowToTensorRow(py::object py_data, TensorRow *tensor_row);
Status FillBuffer(TensorQTable *tt);
// Private function for computing the assignment of the column name map.
// @return - Status
Status ComputeColMap() override;
};
#pragma GCC visibility pop

View File

@ -65,7 +65,7 @@ ImageFolderOp::ImageFolderOp(int32_t num_wkrs, int32_t rows_per_buffer, std::str
bool recursive, bool do_decode, const std::set<std::string> &exts,
const std::map<std::string, int32_t> &map, std::unique_ptr<DataSchema> data_schema,
std::shared_ptr<Sampler> sampler)
: ParallelOp(num_wkrs, queue_size),
: ParallelOp(num_wkrs, queue_size, std::move(sampler)),
rows_per_buffer_(rows_per_buffer),
folder_path_(file_dir),
recursive_(recursive),
@ -73,15 +73,10 @@ ImageFolderOp::ImageFolderOp(int32_t num_wkrs, int32_t rows_per_buffer, std::str
extensions_(exts),
class_index_(map),
data_schema_(std::move(data_schema)),
sampler_(std::move(sampler)),
row_cnt_(0),
buf_cnt_(0),
sampler_ind_(0),
dirname_offset_(0) {
// Set the column name map (base class field)
for (int32_t i = 0; i < data_schema_->NumColumns(); ++i) {
column_name_id_map_[data_schema_->column(i).name()] = i;
}
folder_name_queue_ = std::make_unique<Queue<std::string>>(num_wkrs * queue_size);
image_name_queue_ = std::make_unique<Queue<FolderImagesPair>>(num_wkrs * queue_size);
io_block_queues_.Init(num_workers_, queue_size);
@ -108,7 +103,7 @@ Status ImageFolderOp::PrescanMasterEntry(const std::string &filedir) {
// following loop puts the 2 level of shuffles together into 1 vector
for (size_t ind = 0; ind < v.size(); ++ind) {
while (v[ind]->second.empty() == false) {
DS_ASSERT(!(v[ind]->first.empty())); // make sure that v[ind]->first.substr(1) is not out of bound
MS_ASSERT(!(v[ind]->first.empty())); // make sure that v[ind]->first.substr(1) is not out of bound
v[ind]->second.front()->second = class_index_.empty() ? ind : class_index_[v[ind]->first.substr(1)];
image_label_pairs_.push_back(v[ind]->second.front());
v[ind]->second.pop();
@ -416,7 +411,19 @@ Status ImageFolderOp::CountRowsAndClasses(const std::string &path, const std::se
// Visitor accept method for NodePass
Status ImageFolderOp::Accept(NodePass *p, bool *modified) {
// Downcast shared pointer then call visitor
return p->RunOnNode(std::static_pointer_cast<ImageFolderOp>(shared_from_this()), modified);
return p->RunOnNode(shared_from_base<ImageFolderOp>(), modified);
}
// Fills the column name map (base class field) from the data schema.
// If the map already holds entries it is left untouched and a warning is logged.
// @return Status - always Status::OK()
Status ImageFolderOp::ComputeColMap() {
  // Guard: never overwrite a map that has already been populated.
  if (!column_name_id_map_.empty()) {
    MS_LOG(WARNING) << "Column name map is already set!";
    return Status::OK();
  }
  // Associate every schema column name with its positional index.
  for (int32_t col_idx = 0; col_idx < data_schema_->NumColumns(); ++col_idx) {
    column_name_id_map_[data_schema_->column(col_idx).name()] = col_idx;
  }
  return Status::OK();
}
} // namespace dataset
} // namespace mindspore

View File

@ -248,6 +248,10 @@ class ImageFolderOp : public ParallelOp, public RandomAccessOp {
// @return Status - The error code return
Status Reset() override;
// Private function for computing the assignment of the column name map.
// @return - Status
Status ComputeColMap() override;
int32_t rows_per_buffer_;
std::string folder_path_; // directory of image folder
bool recursive_;
@ -255,7 +259,6 @@ class ImageFolderOp : public ParallelOp, public RandomAccessOp {
std::set<std::string> extensions_; // extensions allowed
std::map<std::string, int32_t> class_index_;
std::unique_ptr<DataSchema> data_schema_;
std::shared_ptr<Sampler> sampler_;
int64_t row_cnt_;
int64_t buf_cnt_;
int64_t sampler_ind_;

View File

@ -64,7 +64,7 @@ Status ManifestOp::Builder::SanityCheck() {
ManifestOp::ManifestOp(int32_t num_works, int32_t rows_per_buffer, std::string file, int32_t queue_size, bool decode,
const std::map<std::string, int32_t> &class_index, std::unique_ptr<DataSchema> data_schema,
std::shared_ptr<Sampler> sampler, std::string usage)
: ParallelOp(num_works, queue_size),
: ParallelOp(num_works, queue_size, std::move(sampler)),
rows_per_buffer_(rows_per_buffer),
io_block_pushed_(0),
row_cnt_(0),
@ -72,14 +72,9 @@ ManifestOp::ManifestOp(int32_t num_works, int32_t rows_per_buffer, std::string f
data_schema_(std::move(data_schema)),
file_(file),
class_index_(class_index),
sampler_(std::move(sampler)),
decode_(decode),
usage_(usage),
buf_cnt_(0) {
// Set the column name map (base class field)
for (int32_t i = 0; i < data_schema_->NumColumns(); ++i) {
column_name_id_map_[data_schema_->column(i).name()] = i;
}
io_block_queues_.Init(num_workers_, queue_size);
(void)std::transform(usage_.begin(), usage_.end(), usage_.begin(), ::tolower);
}
@ -420,5 +415,17 @@ Status ManifestOp::GetClassIndexing(const std::string &file, const py::dict &dic
return Status::OK();
}
// Fills the column name map (base class field) from the data schema.
// If the map already holds entries it is left untouched and a warning is logged.
// @return Status - always Status::OK()
Status ManifestOp::ComputeColMap() {
  // Guard: never overwrite a map that has already been populated.
  if (!column_name_id_map_.empty()) {
    MS_LOG(WARNING) << "Column name map is already set!";
    return Status::OK();
  }
  // Associate every schema column name with its positional index.
  for (int32_t col_idx = 0; col_idx < data_schema_->NumColumns(); ++col_idx) {
    column_name_id_map_[data_schema_->column(col_idx).name()] = col_idx;
  }
  return Status::OK();
}
} // namespace dataset
} // namespace mindspore

View File

@ -219,6 +219,10 @@ class ManifestOp : public ParallelOp, public RandomAccessOp {
// @return Status - The error code return
Status CountDatasetInfo();
// Private function for computing the assignment of the column name map.
// @return - Status
Status ComputeColMap() override;
int32_t rows_per_buffer_;
int64_t io_block_pushed_;
int64_t row_cnt_;
@ -226,7 +230,6 @@ class ManifestOp : public ParallelOp, public RandomAccessOp {
std::unique_ptr<DataSchema> data_schema_;
std::string file_; // file that store the information of images
std::map<std::string, int32_t> class_index_;
std::shared_ptr<Sampler> sampler_;
bool decode_;
std::string usage_;
int64_t buf_cnt_;

View File

@ -196,10 +196,6 @@ Status MindRecordOp::Init() {
data_schema_ = std::move(tmp_schema);
}
for (int i = 0; i < static_cast<int>(columns_to_load_.size()); i++) {
column_name_id_map_[columns_to_load_[i]] = i;
}
return Status::OK();
}
@ -500,7 +496,18 @@ Status MindRecordOp::CountTotalRows(const std::vector<std::string> dataset_path,
// Visitor accept method for NodePass
Status MindRecordOp::Accept(NodePass *p, bool *modified) {
// Downcast shared pointer then call visitor
return p->RunOnNode(std::static_pointer_cast<MindRecordOp>(shared_from_this()), modified);
return p->RunOnNode(shared_from_base<MindRecordOp>(), modified);
}
// Fills the column name map from columns_to_load_, mapping each name to its
// position in that list. If the map already holds entries it is left untouched
// and a warning is logged.
// @return Status - always Status::OK()
Status MindRecordOp::ComputeColMap() {
  // Guard: never overwrite a map that has already been populated.
  if (!column_name_id_map_.empty()) {
    MS_LOG(WARNING) << "Column name map is already set!";
    return Status::OK();
  }
  const int num_columns = static_cast<int>(columns_to_load_.size());
  for (int col_idx = 0; col_idx < num_columns; ++col_idx) {
    column_name_id_map_[columns_to_load_[col_idx]] = col_idx;
  }
  return Status::OK();
}
} // namespace dataset
} // namespace mindspore

View File

@ -234,6 +234,10 @@ class MindRecordOp : public ParallelOp {
Status FetchBlockBuffer(const int32_t &buffer_id);
// Private function for computing the assignment of the column name map.
// @return - Status
Status ComputeColMap() override;
int32_t rows_per_buffer_; // The number of requested rows per buffer.
std::vector<std::string> dataset_file_; // dataset files
bool load_dataset_; // load dataset from single file or not

Some files were not shown because too many files have changed in this diff Show More