commit
4eb91ba6f9
|
@ -12,6 +12,7 @@ if(SUPPORT_TRAIN)
|
|||
set(RUNTIME_LIB_DIR ${RUNTIME_PKG_NAME}/train/lib)
|
||||
set(MIND_DATA_INC_DIR ${RUNTIME_PKG_NAME}/train/include/dataset)
|
||||
set(TURBO_DIR ${RUNTIME_PKG_NAME}/train/third_party/libjpeg-turbo)
|
||||
set(SECUREC_DIR ${RUNTIME_PKG_NAME}/train/third_party/securec)
|
||||
set(MINDSPORE_LITE_LIB_NAME libmindspore-lite-train)
|
||||
set(BENCHMARK_NAME benchmark_train)
|
||||
set(BENCHMARK_ROOT_DIR ${RUNTIME_PKG_NAME}/tools/benchmark_train)
|
||||
|
@ -21,6 +22,7 @@ else()
|
|||
set(RUNTIME_LIB_DIR ${RUNTIME_PKG_NAME}/inference/lib)
|
||||
set(MIND_DATA_INC_DIR ${RUNTIME_PKG_NAME}/inference/include/dataset)
|
||||
set(TURBO_DIR ${RUNTIME_PKG_NAME}/inference/third_party/libjpeg-turbo)
|
||||
set(SECUREC_DIR ${RUNTIME_PKG_NAME}/inference/third_party/securec)
|
||||
set(MINDSPORE_LITE_LIB_NAME libmindspore-lite)
|
||||
set(BENCHMARK_NAME benchmark)
|
||||
set(BENCHMARK_ROOT_DIR ${RUNTIME_PKG_NAME}/tools/benchmark)
|
||||
|
@ -40,21 +42,33 @@ if(BUILD_MINDDATA STREQUAL "full")
|
|||
|
||||
if(PLATFORM_ARM64)
|
||||
file(GLOB JPEGTURBO_LIB_LIST ${jpeg_turbo_LIBPATH}/*.so)
|
||||
install(FILES ${TOP_DIR}/mindspore/lite/build/minddata/libminddata-lite.so
|
||||
DESTINATION ${RUNTIME_LIB_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME})
|
||||
install(FILES ${TOP_DIR}/mindspore/lite/build/minddata/libminddata-lite.so DESTINATION
|
||||
${RUNTIME_LIB_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME})
|
||||
install(FILES ${TOP_DIR}/mindspore/lite/build/minddata/libminddata-lite_static.a DESTINATION
|
||||
${RUNTIME_LIB_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME})
|
||||
install(FILES ${JPEGTURBO_LIB_LIST} DESTINATION ${TURBO_DIR}/lib COMPONENT ${RUNTIME_COMPONENT_NAME})
|
||||
install(FILES ${TOP_DIR}/mindspore/lite/build/securec/src/libsecurec.a
|
||||
DESTINATION ${SECUREC_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME})
|
||||
elseif(PLATFORM_ARM32)
|
||||
file(GLOB JPEGTURBO_LIB_LIST ${jpeg_turbo_LIBPATH}/*.so)
|
||||
install(FILES ${TOP_DIR}/mindspore/lite/build/minddata/libminddata-lite.so DESTINATION
|
||||
${RUNTIME_LIB_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME})
|
||||
install(FILES ${TOP_DIR}/mindspore/lite/build/minddata/libminddata-lite_static.a DESTINATION
|
||||
${RUNTIME_LIB_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME})
|
||||
install(FILES ${JPEGTURBO_LIB_LIST} DESTINATION ${TURBO_DIR}/lib COMPONENT ${RUNTIME_COMPONENT_NAME})
|
||||
install(FILES ${TOP_DIR}/mindspore/lite/build/securec/src/libsecurec.a
|
||||
DESTINATION ${SECUREC_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME})
|
||||
else()
|
||||
install(FILES ${TOP_DIR}/mindspore/lite/build/minddata/libminddata-lite.so DESTINATION
|
||||
${RUNTIME_LIB_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME})
|
||||
install(FILES ${TOP_DIR}/mindspore/lite/build/minddata/libminddata-lite_static.a DESTINATION
|
||||
${RUNTIME_LIB_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME})
|
||||
install(FILES ${jpeg_turbo_LIBPATH}/libjpeg.so.62.3.0 DESTINATION ${TURBO_DIR}/lib
|
||||
RENAME libjpeg.so.62 COMPONENT ${RUNTIME_COMPONENT_NAME})
|
||||
install(FILES ${jpeg_turbo_LIBPATH}/libturbojpeg.so.0.2.0 DESTINATION ${TURBO_DIR}/lib
|
||||
RENAME libturbojpeg.so.0 COMPONENT ${RUNTIME_COMPONENT_NAME})
|
||||
install(FILES ${TOP_DIR}/mindspore/lite/build/securec/src/libsecurec.a
|
||||
DESTINATION ${SECUREC_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME})
|
||||
endif()
|
||||
endif()
|
||||
|
||||
|
|
|
@ -289,7 +289,15 @@ if(BUILD_MINDDATA STREQUAL "full")
|
|||
${MINDDATA_FULL_SRC}
|
||||
)
|
||||
|
||||
add_library(minddata-lite_static STATIC
|
||||
${MINDDATA_KERNELS_IMAGE_LITE_CV_FILES}
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/../src/common/log_adapter.cc
|
||||
${CORE_DIR}/utils/ms_utils.cc
|
||||
${MINDDATA_FULL_SRC}
|
||||
)
|
||||
|
||||
add_dependencies(minddata-lite fbs_src)
|
||||
add_dependencies(minddata-lite_static fbs_src)
|
||||
|
||||
find_package(Threads REQUIRED)
|
||||
target_link_libraries(minddata-lite
|
||||
|
@ -299,6 +307,13 @@ if(BUILD_MINDDATA STREQUAL "full")
|
|||
mindspore::json
|
||||
Threads::Threads
|
||||
)
|
||||
target_link_libraries(minddata-lite_static
|
||||
securec
|
||||
mindspore::jpeg_turbo
|
||||
mindspore::turbojpeg
|
||||
mindspore::json
|
||||
Threads::Threads
|
||||
)
|
||||
|
||||
# ref: https://github.com/android/ndk/issues/1202
|
||||
if(PLATFORM_ARM32)
|
||||
|
@ -307,10 +322,12 @@ if(BUILD_MINDDATA STREQUAL "full")
|
|||
MESSAGE(FATAL_ERROR "Cannot find libclang_rt.builtins-arm-androi2d.a in $ENV{ANDROID_NDK}")
|
||||
endif()
|
||||
target_link_libraries(minddata-lite ${LIBCLANG_RT_LIB})
|
||||
target_link_libraries(minddata-lite_static ${LIBCLANG_RT_LIB})
|
||||
endif()
|
||||
|
||||
if(PLATFORM_ARM32 OR PLATFORM_ARM64)
|
||||
target_link_libraries(minddata-lite log)
|
||||
target_link_libraries(minddata-lite_static log)
|
||||
elseif()
|
||||
endif()
|
||||
elseif(BUILD_MINDDATA STREQUAL "wrapper")
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
debug.txt
|
|
@ -0,0 +1,33 @@
|
|||
cmake_minimum_required(VERSION 3.15.5)
|
||||
project(MinddataCropper)
|
||||
|
||||
add_compile_definitions(_GLIBCXX_USE_CXX11_ABI=0)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17 -fPIE -fPIC -Wl,--allow-shlib-undefined -s")
|
||||
|
||||
file(GLOB minddata_OBJ CONFIGURE_DEPENDS "tmp/*.o")
|
||||
|
||||
if(NOT minddata_OBJ)
|
||||
message(FATAL_ERROR "Your code is not using any MindData functionality.\n \
|
||||
... libminddata-lite_min.so is not needed\n... Terminating crop.sh")
|
||||
endif()
|
||||
|
||||
message(STATUS ${CMAKE_CXX_COMPILER})
|
||||
|
||||
add_custom_command(
|
||||
OUTPUT libminddata-lite_min.so
|
||||
PRE_BUILD
|
||||
COMMAND ${CMAKE_CXX_COMPILER}
|
||||
-shared
|
||||
-o libminddata-lite_min.so
|
||||
${minddata_OBJ}
|
||||
${EXTERNAL_DEPS}
|
||||
-pthread
|
||||
-std=c++17
|
||||
-fPIE -fPIC
|
||||
-s
|
||||
)
|
||||
|
||||
add_custom_target(
|
||||
minddata-lite ALL
|
||||
DEPENDS libminddata-lite_min.so
|
||||
)
|
|
@ -0,0 +1,71 @@
|
|||
|
||||
# Objective
|
||||
|
||||
The goal of this tool is to allow the user to reduce the size of MindData lite package they ship with their code.
|
||||
|
||||
# How to run
|
||||
|
||||
This tool has two parts: the first part only needs to be run once, when the source code for mindspore is changed
|
||||
while the second part should be run every time the user code changes.
|
||||
|
||||
Note that you need to run this tool on the server side if you are planning to use your code on an edge device.
|
||||
|
||||
## Step 1: Configure the cropper tool
|
||||
|
||||
You need to have mindspore installed on your system to run this python script.
|
||||
Additionally, you need to have the mindspore source code present in your system
|
||||
as this script processes mindspore's source code.
|
||||
|
||||
To execute the first part simply run:
|
||||
|
||||
```console
|
||||
python cropper_configure.py
|
||||
```
|
||||
|
||||
## Step 2: Crop the MindData lite package
|
||||
|
||||
The second part needs to be run every time the user adds or removes one of MD operators in their code.
|
||||
|
||||
For the second part, you need to run:
|
||||
|
||||
```console
|
||||
./crop.sh -p <path to mindspore package> <source files>
|
||||
```
|
||||
|
||||
Note that you need to provide the name of all files that are using any of the MindData functionalities.
|
||||
|
||||
`ANDROID_NDK` environment variable needs to be set as well if the target device is android.
|
||||
|
||||
Example: `./crop.sh -p ~/mindspore/ foo.cc foo.h bar.cc bar.h`
|
||||
|
||||
This code will create the __libminddata-lite_min.so__ library specific to your code and will also print for you a list of
|
||||
shared objects that your code depends on (including __libminddata-lite\_min.so__).
|
||||
Note that you need to copy these files to your target device and set the linker flag accordingly.
|
||||
|
||||
# How it works
|
||||
|
||||
The first step (configuration) creates a few of files that are needed in the second step.
|
||||
These files include _dependencies.txt_, _associations.txt_, and _debug.txt_.
|
||||
While the third file (_debug.txt_) is only for debugging purposes (debugging cropper tool),
|
||||
the other two files are used in the second part.
|
||||
_associations.txt_ contains the entry points (IR level source files) for ops that the user may use in their code.
|
||||
The other file, _dependencies.txt_, contains all dependencies for all those entry points.
|
||||
|
||||
When the user runs the crop script, _parser.py_ will be run on their code to find the ops they have used.
|
||||
Afterwards, the text files will be used to keep the needed object files
|
||||
(by removing unnecessary object files from the static library containing all of them).
|
||||
Finally, the remaining object files will be used to create a new shared object file (_libminddata-lite\_min.so_).
|
||||
|
||||
# Requirements
|
||||
|
||||
Step 1:
|
||||
|
||||
* Python3
|
||||
* mindspore
|
||||
* mindspore source code
|
||||
|
||||
Step 2:
|
||||
|
||||
* Python3
|
||||
* cmake
|
||||
* Android NDK (if target device is android)
|
File diff suppressed because one or more lines are too long
|
@ -0,0 +1,154 @@
|
|||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
""" build MindData lite minimum library """
|
||||
|
||||
import glob
|
||||
import itertools
|
||||
import json
|
||||
from operator import itemgetter
|
||||
import os
|
||||
from pprint import pprint
|
||||
import sys
|
||||
import warnings
|
||||
|
||||
import parser
|
||||
|
||||
DEPENDENCY_FILENAME = 'dependencies.txt'
|
||||
ASSOCIATION_FILENAME = 'associations.txt'
|
||||
ALL_DEPS_FILENAME = 'needed_dependencies.txt'
|
||||
OBJECTS_DIR = 'tmp/'
|
||||
|
||||
ESSENTIAL_OBJECTS = [
|
||||
# 'types.cc.o',
|
||||
# 'tensor_impl.cc.o',
|
||||
'random_sampler.cc.o', # default value for datasets (may not exist in their code)
|
||||
'random_sampler_ir.cc.o', # default value for datasets (may not exist in their code)
|
||||
]
|
||||
|
||||
|
||||
def load_dependencies():
|
||||
"""
|
||||
Read dependencies.txt and load it into a dict.
|
||||
|
||||
:return: a dict containing list of dependencies for almost any file in MindData lite
|
||||
"""
|
||||
if not os.path.isfile(DEPENDENCY_FILENAME):
|
||||
raise FileNotFoundError("dependency file ({}) does not exist.\n"
|
||||
"Please run cropper_configure.py first.".format(DEPENDENCY_FILENAME))
|
||||
with open(DEPENDENCY_FILENAME) as f:
|
||||
dep_dict = json.load(f)
|
||||
return dep_dict
|
||||
|
||||
|
||||
def load_associations():
|
||||
"""
|
||||
Read associations.txt and load it into a dict.
|
||||
|
||||
:return: a dict containing entry point (a filename) for each op
|
||||
"""
|
||||
if not os.path.isfile(ASSOCIATION_FILENAME):
|
||||
raise FileNotFoundError("association file ({}) does not exist.\n"
|
||||
"Please run cropper_configure.py first.".format(ASSOCIATION_FILENAME))
|
||||
with open(ASSOCIATION_FILENAME) as f:
|
||||
_dict = json.load(f)
|
||||
return _dict
|
||||
|
||||
|
||||
def get_unique_dependencies(dependencies_dict, associations_dict, user_ops):
|
||||
"""
|
||||
Find which dependencies we need to include according to the ops found in the user code.
|
||||
|
||||
:param dependencies_dict: a dict containing list of dependencies for almost any file in MindData lite
|
||||
:param associations_dict: a dcit containing entry point (a filename) for each op
|
||||
:param user_ops: a list of ops found in the user code
|
||||
:return: a list of dependencies needed based on the user code
|
||||
"""
|
||||
selected_entries = [] # itemgetter(*user_ops)(associations_dict)
|
||||
for op in user_ops:
|
||||
print('{} --> {}'.format(op, associations_dict[op]))
|
||||
selected_entries.append(associations_dict[op])
|
||||
selected_files = itemgetter(*selected_entries)(dependencies_dict)
|
||||
selected_files = list(itertools.chain(*selected_files))
|
||||
return sorted(list(set().union(selected_files)))
|
||||
|
||||
|
||||
def remove_unused_objects(final_deps, essentials, all_object_files):
|
||||
"""
|
||||
Remove object files that are determined to be NOT needed to run user code
|
||||
as they are not in the dependencies of user code.
|
||||
|
||||
:param final_deps: a list of dependencies needed based on the user code
|
||||
:param essentials: essential objects that should not be removed from final lib
|
||||
:param all_object_files: a lsit of all objects available in our static library
|
||||
:return: None
|
||||
"""
|
||||
# find objects which are not part of any dependency (lstrip is needed for remove '_' added in crop.sh)
|
||||
to_be_removed = [x for x in all_object_files if not any(x.lstrip('_')[:-5] in y for y in final_deps)]
|
||||
# keep the ones that are not an essential object file. (lstrip is needed for remove '_' added in crop.sh)
|
||||
to_be_removed = [x for x in to_be_removed if not any(x.lstrip('_') in y for y in essentials)]
|
||||
|
||||
print('Removing:', len(to_be_removed), 'unused objects.')
|
||||
pprint(sorted(to_be_removed))
|
||||
for filename in to_be_removed:
|
||||
os.remove(os.path.join(OBJECTS_DIR, filename))
|
||||
|
||||
|
||||
def main():
|
||||
# load tables created using cropper.py
|
||||
dependencies_dict = load_dependencies()
|
||||
associations_dict = load_associations()
|
||||
|
||||
# get all objects filename
|
||||
all_object_files = [x[x.rfind('/') + 1:] for x in glob.glob('{}*.o'.format(OBJECTS_DIR))]
|
||||
print("All Obj files: {}".format(len(all_object_files)))
|
||||
|
||||
# find ops in user code
|
||||
my_parser = parser.SimpleParser()
|
||||
temp = [my_parser.parse(x) for x in user_code_filenames]
|
||||
user_ops = set(itertools.chain(*temp))
|
||||
print('user ops: {}'.format(user_ops))
|
||||
|
||||
# user is not using any MindData op
|
||||
if not user_ops:
|
||||
warnings.warn('No MindData Ops detected in your code...')
|
||||
remove_unused_objects([], [], all_object_files)
|
||||
with open(os.path.join(OBJECTS_DIR, ALL_DEPS_FILENAME), 'w') as _:
|
||||
pass
|
||||
exit(0)
|
||||
|
||||
# find dependencies required (based on user ops)
|
||||
unique_deps = get_unique_dependencies(dependencies_dict, associations_dict, user_ops)
|
||||
print('Unique Deps (.h): {}'.format(len(unique_deps)))
|
||||
print('Unique Deps (.cc): {}'.format(len(list(filter(lambda x: x[-2:] == 'cc', unique_deps)))))
|
||||
|
||||
# add essential files to dependency files
|
||||
final_deps = set(unique_deps + dependencies_dict['ESSENTIAL'])
|
||||
print('Total Deps (.h): {}'.format(len(final_deps)))
|
||||
|
||||
# delete the rest of the object files from directory.
|
||||
remove_unused_objects(final_deps, ESSENTIAL_OBJECTS, all_object_files)
|
||||
|
||||
# write all dependencies to the file (for extracting external ones)
|
||||
with open(os.path.join(OBJECTS_DIR, ALL_DEPS_FILENAME), 'w') as fout:
|
||||
fout.write("\n".join(unique_deps) + '\n')
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# get user code filename(s) as argument(s) to code
|
||||
if len(sys.argv) <= 1:
|
||||
print("usage: python build_lib.py <xxx.y> [<xxx.z>]")
|
||||
exit(1)
|
||||
user_code_filenames = sys.argv[1:]
|
||||
main()
|
|
@ -0,0 +1,188 @@
|
|||
#!/bin/bash
|
||||
|
||||
usage()
|
||||
{
|
||||
echo "Usage:"
|
||||
echo "bash crop.sh -p <path-to-mindspore-directory> <source-file> [<more-source-files>] \\"
|
||||
echo "bash crop.sh -h \\"
|
||||
echo ""
|
||||
echo "Options:"
|
||||
echo " -p path to mindspore directory"
|
||||
echo " -h print usage"
|
||||
|
||||
}
|
||||
|
||||
# check and set options
|
||||
checkopts()
|
||||
{
|
||||
while getopts ':p:h' opt
|
||||
do
|
||||
OPTARG=$(echo ${OPTARG} | tr '[A-Z]' '[a-z]')
|
||||
case "${opt}" in
|
||||
p)
|
||||
MINDSPORE_PATH="$(cd "${OPTARG}" &> /dev/null && pwd )"
|
||||
;;
|
||||
h)
|
||||
usage
|
||||
exit 1
|
||||
;;
|
||||
*)
|
||||
echo "Unknown option: \"${OPTARG}\""
|
||||
usage
|
||||
exit 1
|
||||
esac
|
||||
done
|
||||
}
|
||||
|
||||
checkopts "$@"
|
||||
|
||||
# exit if less than 3 args are given by user
|
||||
if [ $# -lt 3 ]; then
|
||||
usage
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# exit if mindspore path is not given by user
|
||||
if [ -z "${MINDSPORE_PATH}" ]; then
|
||||
echo -e "\e[31mPlease set MINDSPORE_PATH environment variable.\e[0m"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
ORIGNAL_PATH="$PWD"
|
||||
FILE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
|
||||
|
||||
# getting absolute paths for user provided filenames
|
||||
USER_CODES=""
|
||||
for i in "${@:OPTIND}";
|
||||
do
|
||||
USER_CODES+="$(cd "$(dirname "${i}" )" &> /dev/null && pwd )/$(basename "${i}")"
|
||||
done
|
||||
# exit if user has not given any argument as their code
|
||||
if [ -z "${USER_CODES}" ]; then
|
||||
echo -e "\e[31mPlease provide your file names as arguments.\e[0m"
|
||||
exit 1
|
||||
fi
|
||||
echo "Provided files: $USER_CODES"
|
||||
|
||||
echo "MS PATH: $MINDSPORE_PATH"
|
||||
echo "CWD: $ORIGNAL_PATH"
|
||||
echo "File PATH: $FILE_PATH"
|
||||
|
||||
|
||||
cd $FILE_PATH
|
||||
|
||||
MD_LIB_FILENAME="libminddata-lite_static.a"
|
||||
|
||||
# locate original MindData lite library
|
||||
MD_LIB_PATH=`find $MINDSPORE_PATH -name "${MD_LIB_FILENAME}" | head -n 1`
|
||||
if [ -z "${MD_LIB_PATH}" ]; then
|
||||
echo -e "\e[31mMindData lite static library could not be found.\e[0m"
|
||||
cd $ORIGNAL_PATH
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
||||
# extract all objects of static lib to tmp/
|
||||
mkdir -p tmp
|
||||
cp $MD_LIB_PATH tmp
|
||||
cd tmp
|
||||
# extract objects with identical names by prepending (one or more) '_' to their names
|
||||
# (this scruipt supports more than 2 duplicate filenames)
|
||||
DUPLICATES=`ar t "${MD_LIB_FILENAME}" | sort | uniq -d`
|
||||
for dup in $DUPLICATES;
|
||||
do
|
||||
i=0
|
||||
prepend_var="_"
|
||||
while :
|
||||
do
|
||||
i=$((i + 1))
|
||||
# check if more duplicates are available (break otherwise)
|
||||
error_output=$(ar xN $i "${MD_LIB_FILENAME}" $dup 2>&1)
|
||||
if [ -n "$error_output" ]; then
|
||||
break
|
||||
fi
|
||||
mv $dup "${prepend_var}${dup}"
|
||||
prepend_var="${prepend_var}_"
|
||||
done
|
||||
done
|
||||
|
||||
# extract unique files from static library
|
||||
UNIQUES=`ar t "${MD_LIB_FILENAME}" | sort | uniq -u`
|
||||
ar x "${MD_LIB_FILENAME}" ${UNIQUES}
|
||||
cd ..
|
||||
|
||||
# remove unused object files
|
||||
# write needed depsendencies to tmp/needed_dependencies.txt
|
||||
python build_lib.py ${USER_CODES}
|
||||
retVal=$?
|
||||
if [ $retVal -ne 0 ]; then
|
||||
cd $ORIGNAL_PATH
|
||||
exit 1
|
||||
fi
|
||||
|
||||
LD_SEP='\n'
|
||||
EX_SEP=$';'
|
||||
LD_PATHS=""
|
||||
EXTERNAL_DEPS=""
|
||||
|
||||
# locate external dependencies for MindData lite
|
||||
LIBJPEG_PATH=`find $MINDSPORE_PATH -name "libjpeg.so*" | head -n 1`
|
||||
LIBTURBOJPEG_PATH=`find $MINDSPORE_PATH -name "libturbojpeg.so*" | head -n 1`
|
||||
LIBSECUREC_PATH=`find $MINDSPORE_PATH -name libsecurec.a | head -n 1`
|
||||
|
||||
# resolve symbolc links
|
||||
if [ "$(uname)" == "Darwin" ]; then
|
||||
c=$(file -b "$(readlink $LIBJPEG_PATH)")
|
||||
elif [ "$(expr substr "$(uname -s)" 1 5)" == "Linux" ]; then
|
||||
c=$(file -b "$(readlink -f $LIBJPEG_PATH)")
|
||||
fi
|
||||
# detect system architecture
|
||||
IFS="," read -r -a array <<< "$c"
|
||||
TARGET_ARCHITECTURE=${array[1]##* }
|
||||
echo "Architecture: $TARGET_ARCHITECTURE"
|
||||
|
||||
# exit if $ANDROID_NDK is not set by user for ARM32 or ARM64
|
||||
if [ "$TARGET_ARCHITECTURE" == "ARM64" ]; then
|
||||
if [ -z "${ANDROID_NDK}" ]; then
|
||||
echo -e "\e[31mPlease set ANDROID_NDK environment variable.\e[0m"
|
||||
cd $ORIGNAL_PATH
|
||||
exit 1
|
||||
fi
|
||||
elif [ "$TARGET_ARCHITECTURE" == "ARM32" ]; then
|
||||
if [ -z "${ANDROID_NDK}" ]; then
|
||||
echo -e "\e[31mPlease set ANDROID_NDK environment variable.\e[0m"
|
||||
cd $ORIGNAL_PATH
|
||||
exit 1
|
||||
fi
|
||||
# add LIBCLANG_RT_LIB for ARM32
|
||||
LIBCLANG_RT_LIB=`find $ANDROID_NDK -name libclang_rt.builtins-arm-android.a | head -n 1`
|
||||
EXTERNAL_DEPS=${EXTERNAL_DEPS}${LIBCLANG_RT_LIB}${EX_SEP}
|
||||
else
|
||||
echo "No need for ANDROID_NDK"
|
||||
fi
|
||||
# Note: add .a files only to EXTERNAL_DEPS.
|
||||
if grep -q 'jpeg' "tmp/needed_dependencies.txt"; then
|
||||
LD_PATHS=${LD_PATHS}${LIBJPEG_PATH}${LD_SEP}
|
||||
LD_PATHS=${LD_PATHS}${LIBTURBOJPEG_PATH}${LD_SEP}
|
||||
EXTERNAL_DEPS=${EXTERNAL_DEPS}${LIBJPEG_PATH}${EX_SEP}
|
||||
EXTERNAL_DEPS=${EXTERNAL_DEPS}${LIBTURBOJPEG_PATH}${EX_SEP}
|
||||
fi
|
||||
# we always need securec library
|
||||
EXTERNAL_DEPS=${EXTERNAL_DEPS}${LIBSECUREC_PATH}${EX_SEP}
|
||||
|
||||
# create .so lib from remaining object files
|
||||
cmake -S . -B . \
|
||||
-DEXTERNAL_DEPS="${EXTERNAL_DEPS}" \
|
||||
-DARCHITECTURE=$TARGET_ARCHITECTURE
|
||||
|
||||
# no dependencies to MindData lite
|
||||
retVal=$?
|
||||
if [ $retVal -eq 0 ]; then
|
||||
make
|
||||
echo -e "\e[32mLibrary was built successfully, The new list of MindData-related dependencies is as follows:\e[0m"
|
||||
echo -e "\e[36m$LD_PATHS$PWD/libminddata-lite_min.so\e[0m"
|
||||
fi
|
||||
|
||||
rm -rf tmp/
|
||||
|
||||
cd $ORIGNAL_PATH
|
|
@ -0,0 +1,389 @@
|
|||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
""" configure cropper tool """
|
||||
|
||||
from functools import lru_cache
|
||||
import glob
|
||||
import json
|
||||
import os
|
||||
import queue
|
||||
import re
|
||||
import subprocess
|
||||
|
||||
from mindspore import log as logger
|
||||
|
||||
DEFINE_STR = "-DENABLE_ANDROID -DENABLE_ARM -DENABLE_ARM64 -DENABLE_NEON -DNO_DLIB -DUSE_ANDROID_LOG -DANDROID"
|
||||
|
||||
ASSOCIATIONS_FILENAME = 'associations.txt'
|
||||
DEPENDENCIES_FILENAME = 'dependencies.txt'
|
||||
ERRORS_FILENAME = 'debug.txt'
|
||||
OUTPUT_LOCATION = "mindspore/lite/tools/dataset/cropper"
|
||||
|
||||
# needed for gcc command for include directories
|
||||
MANUAL_HEADERS = [
|
||||
".",
|
||||
"mindspore",
|
||||
"mindspore/ccsrc",
|
||||
"mindspore/ccsrc/minddata/dataset",
|
||||
"mindspore/ccsrc/minddata/dataset/kernels/image",
|
||||
"mindspore/core",
|
||||
"mindspore/lite",
|
||||
]
|
||||
|
||||
# To stop gcc command once reaching these external headers
|
||||
# (not all of them may be used now in MindData lite)
|
||||
EXTERNAL_DEPS = [
|
||||
"graphengine/inc/external",
|
||||
"akg/third_party/fwkacllib/inc",
|
||||
"third_party",
|
||||
"third_party/securec/include",
|
||||
"build/mindspore/_deps/sqlite-src",
|
||||
"build/mindspore/_deps/pybind11-src/include",
|
||||
"build/mindspore/_deps/tinyxml2-src",
|
||||
"build/mindspore/_deps/jpeg_turbo-src",
|
||||
"build/mindspore/_deps/jpeg_turbo-src/_build",
|
||||
"build/mindspore/_deps/icu4c-src/icu4c/source/i18n",
|
||||
"build/mindspore/_deps/icu4c-src/icu4c/source/common",
|
||||
"mindspore/lite/build/_deps/tinyxml2-src",
|
||||
"mindspore/lite/build/_deps/jpeg_turbo-src",
|
||||
"mindspore/lite/build/_deps/jpeg_turbo-src/_build",
|
||||
"mindspore/lite/build/_deps/nlohmann_json-src",
|
||||
]
|
||||
|
||||
# API files which the corresponding objects and all objects for their dependencies must always be included.
|
||||
ESSENTIAL_FILES_1 = [
|
||||
"api/data_helper.cc",
|
||||
"api/datasets.cc",
|
||||
"api/execute.cc",
|
||||
"api/iterator.cc",
|
||||
]
|
||||
|
||||
# API files which the corresponding objects must always be included.
|
||||
# (corresponding IR files will be included according to user ops)
|
||||
ESSENTIAL_FILES_2 = [
|
||||
"api/text.cc",
|
||||
"api/transforms.cc",
|
||||
"api/samplers.cc",
|
||||
"api/vision.cc",
|
||||
]
|
||||
|
||||
DATASET_PATH = "mindspore/ccsrc/minddata/dataset"
|
||||
|
||||
OPS_DIRS = [
|
||||
"engine/ir/datasetops",
|
||||
"engine/ir/datasetops/source",
|
||||
"engine/ir/datasetops/source/samplers",
|
||||
"kernels/ir/vision",
|
||||
"kernels/ir/data",
|
||||
"text/ir/kernels",
|
||||
]
|
||||
|
||||
|
||||
def extract_classname_samplers(header_content):
|
||||
"""
|
||||
Use regex to find class names in header files of samplers
|
||||
|
||||
:param header_content: string containing header of a sampler IR file
|
||||
:return: list of sampler classes found
|
||||
"""
|
||||
return re.findall(r"(?<=class )[\w\d_]+(?=Obj : )", header_content)
|
||||
|
||||
|
||||
def extract_classname_source_node(header_content):
|
||||
"""
|
||||
Use regex to find class names in header files of source nodes
|
||||
|
||||
:param header_content: string containing header of a source node IR file
|
||||
:return: list of source node classes found
|
||||
"""
|
||||
return re.findall(r"(?<=class )[\w\d_]+(?=Node : )", header_content)
|
||||
|
||||
|
||||
def extract_classname_nonsource_node(header_content):
|
||||
"""
|
||||
Use regex to find class names in header files of non-source nodes
|
||||
|
||||
:param header_content: string containing header of a non-source IR file
|
||||
:return: list of non-source node classes found
|
||||
"""
|
||||
return re.findall(r"(?<=class )[\w\d_]+(?=Node : )", header_content)
|
||||
|
||||
|
||||
def extract_classname_vision(header_content):
|
||||
"""
|
||||
Use regex to find class names in header files of vision ops
|
||||
|
||||
:param header_content: string containing header of a vision op IR file
|
||||
:return: list of vision ops found
|
||||
"""
|
||||
return re.findall(r"(?<=class )[\w\d_]+(?=Operation : )", header_content)
|
||||
|
||||
|
||||
def extract_classname_data(header_content):
|
||||
"""
|
||||
Use regex to find class names in header files of data ops
|
||||
|
||||
:param header_content: string containing header of a data op IR file
|
||||
:return: list of data ops found
|
||||
"""
|
||||
return re.findall(r"(?<=class )[\w\d_]+(?=Operation : )", header_content)
|
||||
|
||||
|
||||
def extract_classname_text(header_content):
|
||||
"""
|
||||
Use regex to find class names in header files of text ops
|
||||
|
||||
:param header_content: string containing header of a text op IR file
|
||||
:return: list of text ops found
|
||||
"""
|
||||
return re.findall(r"(?<=class )[\w\d_]+(?=Operation : )", header_content)
|
||||
|
||||
|
||||
# For each op type (directory) store the corresponding function which extracts op name
|
||||
registered_functions = {
|
||||
os.path.join(DATASET_PATH, 'engine/ir/datasetops/source/samplers'): extract_classname_samplers,
|
||||
os.path.join(DATASET_PATH, 'engine/ir/datasetops/source'): extract_classname_source_node,
|
||||
os.path.join(DATASET_PATH, 'engine/ir/datasetops'): extract_classname_nonsource_node,
|
||||
os.path.join(DATASET_PATH, 'kernels/ir/vision'): extract_classname_vision,
|
||||
os.path.join(DATASET_PATH, 'kernels/ir/data'): extract_classname_data,
|
||||
os.path.join(DATASET_PATH, 'text/ir/kernels'): extract_classname_text,
|
||||
}
|
||||
|
||||
|
||||
def get_headers():
|
||||
"""
|
||||
Get the headers flag: "-Ixx/yy -Ixx/zz ..."
|
||||
|
||||
:return: a string to be passed to compiler
|
||||
"""
|
||||
headers_paths = MANUAL_HEADERS + EXTERNAL_DEPS
|
||||
|
||||
output = "-I{}/".format("/ -I".join(headers_paths))
|
||||
|
||||
return output
|
||||
|
||||
|
||||
@lru_cache(maxsize=1024)
|
||||
def get_dependencies_of_file(headers_flag, filename):
|
||||
"""
|
||||
Create dependency list for a file (file0.cc):
|
||||
file0.cc.o: file1.h, file2.h, ...
|
||||
|
||||
:param headers_flag: string containing headers include paths with -I prepended to them.
|
||||
:param filename: a string containing path of a file.
|
||||
:return: a list of file names [file0.cc, file1.h, file2.h, file3.h] and error string
|
||||
"""
|
||||
command = 'gcc -MM -MG {0} {1} {2}'.format(filename, DEFINE_STR, headers_flag)
|
||||
stdout, stderr = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
|
||||
deps = re.split(r'[\s\\]+', stdout.decode('utf-8').strip(), flags=re.MULTILINE)[1:]
|
||||
|
||||
return deps, stderr.decode('utf-8')
|
||||
|
||||
|
||||
def needs_processing(dep_cc, processed_cc, queue_cc_set):
|
||||
"""
|
||||
Determine if a file's dependencies need to be processed.
|
||||
|
||||
:param dep_cc: the candidate file to be processed by gcc
|
||||
:param processed_cc: set of files that have been already processed.
|
||||
:param queue_cc_set: files currently in the queue (to be processed)
|
||||
:return: boolean, whether the file should be further processed by gcc.
|
||||
"""
|
||||
# don't add the file to the queue if already processed
|
||||
if dep_cc in processed_cc:
|
||||
return False
|
||||
# don't add the file to the queue if it is already there
|
||||
if dep_cc in queue_cc_set:
|
||||
return False
|
||||
# if file doesn't exist, don't process as it will cause error (may happen for cache)
|
||||
if not os.path.isfile(dep_cc):
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def build_source_file_path(dep_h):
|
||||
"""
|
||||
Given the path to a header file, find the path for the associated source file.
|
||||
- if an external dependency, return "EXTERNAL"
|
||||
- if not found, keep the header file's path
|
||||
|
||||
:param dep_h: a string containing path to the header file
|
||||
:return: dep_cc: a string containing path to the source file
|
||||
"""
|
||||
for x in EXTERNAL_DEPS:
|
||||
if x in dep_h:
|
||||
dep_cc = "EXTERNAL"
|
||||
return dep_cc
|
||||
if 'include/api/types.h' in dep_h:
|
||||
dep_cc = "mindspore/ccsrc/cxx_api/types.cc"
|
||||
return dep_cc
|
||||
dep_cc = dep_h.replace('.hpp', '.cc').replace('.h', '.cc')
|
||||
if not os.path.isfile(dep_cc):
|
||||
dep_cc = dep_h
|
||||
return dep_cc
|
||||
|
||||
|
||||
def get_all_dependencies_of_file(headers_flag, filename):
|
||||
"""
|
||||
Create dependency list for a file (incl. all source files needed).
|
||||
|
||||
:param headers_flag: string containing headers include paths with -I prepended to them.
|
||||
:param filename: a string containing path of a file.
|
||||
:return: all dependencies of that file and the error string
|
||||
"""
|
||||
errors = []
|
||||
# a queue to process files
|
||||
queue_cc = queue.SimpleQueue()
|
||||
# a set of items that have ever been in queue_cc (faster access time)
|
||||
queue_cc_set = set()
|
||||
# store processed files
|
||||
processed_cc = set()
|
||||
|
||||
# add the source file to the queue
|
||||
queue_cc.put(filename)
|
||||
queue_cc_set.add(filename)
|
||||
|
||||
while not queue_cc.empty():
|
||||
# process the first item in the queue
|
||||
curr_cc = queue_cc.get()
|
||||
deps, error = get_dependencies_of_file(headers_flag, curr_cc)
|
||||
errors.append(error)
|
||||
processed_cc.add(curr_cc)
|
||||
# prepare its dependencies for processing
|
||||
for dep_h in deps:
|
||||
dep_cc = build_source_file_path(dep_h)
|
||||
# ignore if marked as an external dependency
|
||||
if dep_cc == "EXTERNAL":
|
||||
processed_cc.add(dep_h)
|
||||
continue
|
||||
# add to queue if needs processing
|
||||
if needs_processing(dep_cc, processed_cc, queue_cc_set):
|
||||
queue_cc.put(dep_cc)
|
||||
queue_cc_set.add(dep_cc)
|
||||
logger.debug('file: {} | deps: {}'.format(filename[filename.rfind('/') + 1:], len(processed_cc)))
|
||||
|
||||
return list(processed_cc), "".join(errors)
|
||||
|
||||
|
||||
def get_deps_essential(headers_flag):
    """
    Return dependencies required for any run (essential).

    :param headers_flag: string containing headers include paths with -I prepended to them.
    :return: a list of essential files, and the error string
    """
    unique_essentials = set()
    error_parts = []

    # ESSENTIAL_FILES_1: the files themselves plus everything they depend on
    for name in ESSENTIAL_FILES_1:
        path = os.path.join(DATASET_PATH, name)
        deps, err = get_all_dependencies_of_file(headers_flag, path)
        error_parts.append(err)
        unique_essentials.update(deps)
        unique_essentials.add(path)

    # ESSENTIAL_FILES_2: only the files themselves (IR files are split)
    unique_essentials.update(
        os.path.join(DATASET_PATH, name) for name in ESSENTIAL_FILES_2)

    return list(unique_essentials), "".join(error_parts)
|
||||
|
||||
|
||||
def get_deps_non_essential(headers_flag):
    """
    Find the entry points (IR Level) for each op and write them in associations dict.
    Starting from these entry point, recursively find the dependencies for each file and write in a dict.

    :param headers_flag: string containing headers include paths with -I prepended to them.
    :return: dependencies dict, associations dict, the error string
    :raises ValueError: if a directory has no registered regex function, or an
        op source file has no matching header file.
    """
    dependencies = dict()  # what files each file imports
    associations = dict()  # what file each op is defined in (IR level)
    errors = []
    for dirname in [os.path.join(DATASET_PATH, x) for x in OPS_DIRS]:
        # Get the proper regex function for this directory
        if dirname not in registered_functions:
            # single formatted message instead of a multi-arg (tuple-style) ValueError
            raise ValueError(
                "Directory has no registered regex function: {}".format(dirname))
        extract_classname = registered_functions[dirname]
        # iterate over source files in the directory
        for src_filename in glob.glob("{}/*.cc".format(dirname)):
            # get the dependencies of source file
            deps, err = get_all_dependencies_of_file(headers_flag, src_filename)
            dependencies[src_filename] = deps
            errors.append(err)
            # locate the corresponding header file and read it.
            # glob guarantees the '.cc' suffix, so replace only the suffix
            # (str.replace would also rewrite any '.cc' elsewhere in the path)
            header_filename = src_filename[:-len('.cc')] + '.h'
            if not os.path.isfile(header_filename):
                # include the missing path so the failure is diagnosable
                raise ValueError(
                    "Header file doesn't exist: {}".format(header_filename))
            with open(header_filename, 'r') as f:
                content = f.read().strip()
            # extract ops from header file
            ops = extract_classname(content)
            # add the op to associations table (normalized: lowercase, no underscores)
            for raw_op in ops:
                op = raw_op.lower().replace('_', '')
                associations[op] = src_filename
    return dependencies, associations, "".join(errors)
|
||||
|
||||
|
||||
def main():
    """
    Configure the cropper tool by creating associations.txt and dependencies.txt
    """
    error_parts = []
    dependencies = {}

    # single string of include paths, each prefixed with '-I'
    headers_flag = get_headers()

    # dependencies required for every run go under the 'ESSENTIAL' key
    essential_deps, err = get_deps_essential(headers_flag)
    dependencies['ESSENTIAL'] = essential_deps
    error_parts.append(err)
    logger.debug('len(ESSENTIAL): {}'.format(len(dependencies['ESSENTIAL'])))

    # per-op dependencies and op -> source-file associations
    other_dependencies, all_associations, err = get_deps_non_essential(headers_flag)
    dependencies.update(other_dependencies)
    error_parts.append(err)

    # persist the three outputs
    with open(os.path.join(OUTPUT_LOCATION, DEPENDENCIES_FILENAME), "w") as f:
        json.dump(dependencies, f)

    with open(os.path.join(OUTPUT_LOCATION, ASSOCIATIONS_FILENAME), "w") as f:
        json.dump(all_associations, f)

    with open(os.path.join(OUTPUT_LOCATION, ERRORS_FILENAME), "w") as f:
        f.write("".join(error_parts))
|
||||
|
||||
|
||||
if __name__ == "__main__":

    logger.info('STARTING: cropper_configure.py ')

    # remember the launch directory so it can be restored afterwards
    saved_cwd = os.getcwd()
    this_dir = os.path.dirname(os.path.abspath(__file__))

    try:
        # run from the mindspore repository root (five levels up from this script)
        os.chdir(os.path.join(this_dir, "../../../../.."))
        main()
    except (OSError, IndexError, KeyError):
        logger.error('FAILED: cropper_configure.py!')
        raise
    else:
        logger.info('SUCCESS: cropper_configure.py ')
    finally:
        # always restore the caller's working directory
        os.chdir(saved_cwd)
|
File diff suppressed because one or more lines are too long
|
@ -0,0 +1,84 @@
|
|||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
""" extract ops from user code """
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from functools import lru_cache
|
||||
import json
|
||||
import os
|
||||
|
||||
ASSOCIATION_FILENAME = 'associations.txt'
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
def _load_ops_names():
    """
    Get the name of all ops available in MindData lite.

    Reads the associations file (op name -> defining source file, JSON) and
    returns the op names. Cached, so the file is read at most once.

    :return: a list of all available ops in MindData lite
    """
    with open(os.path.expanduser(ASSOCIATION_FILENAME), 'r') as f:
        _dict = json.load(f)
    # Materialize a list so the cached value matches the documented return
    # type (a dict_keys view would also keep the whole dict alive).
    return list(_dict.keys())
|
||||
|
||||
|
||||
class Parser(ABC):
    """
    Abstract Base Class for parsers that look up ops in user code.

    Subclasses implement `parse`; the op vocabulary is loaded once per
    instance from the associations file.
    """

    def __init__(self):
        # names of all ops available in MindData lite
        self._all_ops = _load_ops_names()

    @abstractmethod
    def parse(self, user_filename):
        """
        Find the ops detected in the user code.

        :param user_filename: string, name of file containing user code
        :return: list of ops found in the user code
        """
|
||||
|
||||
|
||||
class SimpleParser(Parser):
    """
    A simple parser that works by string matching:
    an op counts as used if its name appears anywhere in the code text.
    """

    def parse(self, user_filename):
        """
        Find and return ops in the user file.

        :param user_filename: filename of user code
        :return: a list of ops present in the file
        :raises FileNotFoundError: if `user_filename` is not a file
        """
        if not os.path.isfile(user_filename):
            raise FileNotFoundError("file does not exist: {}".format(user_filename))
        with open(user_filename) as f:
            data = f.read().strip()
        return self._simple_string_match(data)

    def _simple_string_match(self, user_text):
        """
        Find and return ops in the user code (provided as a string).

        :param user_text: string containing user code
        :return: a list of ops found in the user_text
        """
        # normalize once; op names are already lowercase without underscores
        needle_space = user_text.strip().lower()
        found = []
        for op in self._all_ops:
            if op in needle_space:
                found.append(op)
        return found
|
Loading…
Reference in New Issue