!15560 Optimize cache_admin tool to allow destroying multiple sessions

From: @lixiachen
Reviewed-by: @robingrosman, @nsyca
Signed-off-by: @robingrosman
Committed by mindspore-ci-bot on 2021-04-27 07:55:29 +08:00 (via Gitee)
commit 83cdb8bb38
5 changed files with 85 additions and 25 deletions
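
Before this change, cache_admin accepted exactly one session ID per --destroy_session (-d) invocation, so destroying several cached sessions meant re-running the tool once per session. With the vector-based argument handling added below, a single invocation takes several IDs. A minimal usage sketch (session IDs are illustrative):

```bash
# previously: one session per call
cache_admin --destroy_session 1234
cache_admin --destroy_session 5678

# after this commit: several sessions in one call (-d is the short form)
cache_admin --destroy_session 1234 5678
```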

File 1 of 5: cache_admin argument handler (C++ source)

@@ -17,6 +17,7 @@
 #include <sys/types.h>
 #include <sys/wait.h>
 #include <unistd.h>
+#include <algorithm>
 #include <cerrno>
 #include <iomanip>
 #include <iostream>
@@ -39,7 +40,6 @@ const char CacheAdminArgHandler::kServerBinary[] = "cache_server";
 CacheAdminArgHandler::CacheAdminArgHandler()
     : port_(kCfgDefaultCachePort),
-      session_id_(0),
       num_workers_(kDefaultNumWorkers),
       shm_mem_sz_(kDefaultSharedMemorySizeInGB),
       log_level_(kDefaultLogLevel),
@@ -102,6 +102,52 @@ CacheAdminArgHandler::CacheAdminArgHandler()
 CacheAdminArgHandler::~CacheAdminArgHandler() = default;
 
+Status CacheAdminArgHandler::AssignArg(std::string option, std::vector<uint32_t> *out_arg,
+                                       std::stringstream *arg_stream, CommandId command_id) {
+  // Detect if the user tried to provide this argument more than once
+  ArgValue selected_arg = arg_map_[option];
+  if (used_args_[selected_arg]) {
+    std::string err_msg = "The " + option + " argument was given more than once.";
+    return Status(StatusCode::kMDSyntaxError, err_msg);
+  }
+
+  // Flag that this arg is used now
+  used_args_[selected_arg] = true;
+
+  // Some options are just arguments, for example "--port 50052" is not a command, it's just an argument.
+  // Other options are actual commands, for example "--destroy_session 1234". This executes the destroy session.
+  // If this option is also a command, make sure there have not been multiple commands given before assigning it.
+  if (command_id != CommandId::kCmdUnknown) {
+    if (command_id_ != CommandId::kCmdUnknown) {
+      std::string err_msg = "Only one command at a time is allowed. Invalid command: " + option;
+      return Status(StatusCode::kMDSyntaxError, err_msg);
+    } else {
+      command_id_ = command_id;
+    }
+  }
+
+  uint32_t value_as_uint;
+  while (arg_stream->rdbuf()->in_avail() != 0) {
+    *arg_stream >> value_as_uint;
+    if (arg_stream->fail()) {
+      arg_stream->clear();
+      std::string value_as_string;
+      *arg_stream >> value_as_string;
+      std::string err_msg = "Invalid numeric value: " + value_as_string;
+      return Status(StatusCode::kMDSyntaxError, err_msg);
+    } else {
+      out_arg->push_back(value_as_uint);
+    }
+  }
+
+  if (out_arg->empty()) {
+    std::string err_msg = option + " option requires an argument field. Syntax: " + option + " <field>";
+    return Status(StatusCode::kMDSyntaxError, err_msg);
+  }
+
+  return Status::OK();
+}
+
 Status CacheAdminArgHandler::AssignArg(std::string option, int32_t *out_arg, std::stringstream *arg_stream,
                                        CommandId command_id) {
   // Detect if the user tried to provide this argument more than once
@@ -269,11 +315,7 @@ Status CacheAdminArgHandler::ParseArgStream(std::stringstream *arg_stream) {
       break;
     }
     case ArgValue::kArgDestroySession: {
-      // session_id is an unsigned type. We may need to template the AssignArg function so that
-      // it can handle different flavours of integers instead of just int32_t.
-      int32_t session_int;
-      RETURN_IF_NOT_OK(AssignArg(tok, &session_int, arg_stream, CommandId::kCmdDestroySession));
-      session_id_ = session_int;
+      RETURN_IF_NOT_OK(AssignArg(tok, &session_ids_, arg_stream, CommandId::kCmdDestroySession));
      break;
    }
    case ArgValue::kArgNumWorkers: {
@@ -376,11 +418,13 @@ Status CacheAdminArgHandler::RunCommand() {
       CacheClientGreeter comm(hostname_, port_, 1);
       RETURN_IF_NOT_OK(comm.ServiceStart());
       CacheClientInfo cinfo;
-      cinfo.set_session_id(session_id_);
-      auto rq = std::make_shared<DropSessionRequest>(cinfo);
-      RETURN_IF_NOT_OK(comm.HandleRequest(rq));
-      RETURN_IF_NOT_OK(rq->Wait());
-      std::cout << "Drop session successfully for server on port " << std::to_string(port_) << std::endl;
+      for (session_id_type id : session_ids_) {
+        cinfo.set_session_id(id);
+        auto rq = std::make_shared<DropSessionRequest>(cinfo);
+        RETURN_IF_NOT_OK(comm.HandleRequest(rq));
+        RETURN_IF_NOT_OK(rq->Wait());
+        std::cout << "Drop session " << id << " successfully for server on port " << std::to_string(port_) << std::endl;
+      }
       break;
     }
     case CommandId::kCmdListSessions: {
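
Since each DropSessionRequest carries a single session ID, the loop above issues one request per ID and waits on each before moving to the next; the confirmation message now also echoes which session was dropped. Assuming the two illustrative sessions from earlier and the default port (kCfgDefaultCachePort, 50052 unless overridden with --port), the output would look roughly like:

```text
Drop session 1234 successfully for server on port 50052
Drop session 5678 successfully for server on port 50052
```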

File 2 of 5: cache_admin argument handler (C++ header)

@@ -22,6 +22,7 @@
 #include <string>
 #include <sstream>
 #include <thread>
+#include <vector>
 #include "minddata/dataset/util/status.h"
 #include "minddata/dataset/engine/cache/cache_client.h"
@@ -94,6 +95,9 @@ class CacheAdminArgHandler {
   Status AssignArg(std::string option, float *out_arg, std::stringstream *arg_stream,
                    CommandId command_id = CommandId::kCmdUnknown);
 
+  Status AssignArg(std::string option, std::vector<uint32_t> *out_arg, std::stringstream *arg_stream,
+                   CommandId command_id = CommandId::kCmdUnknown);
+
   Status Validate();
 
   CommandId command_id_;
@@ -102,7 +106,7 @@ class CacheAdminArgHandler {
   int32_t shm_mem_sz_;
   int32_t log_level_;
   float memory_cap_ratio_;
-  session_id_type session_id_;
+  std::vector<session_id_type> session_ids_;
   std::string hostname_;
   std::string spill_dir_;
   std::string trailing_args_;

File 3 of 5: ResNet README (evaluation-while-training examples)

@@ -332,16 +332,16 @@ bash run_parameter_server_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012]
 #### Evaluation while training
 
 ```bash
-# evaluation while distributed training Ascend example:
+# evaluation with distributed training Ascend example:
 bash run_distribute_train.sh [resnet18|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)
 
-# evaluation while standalone training Ascend example:
+# evaluation with standalone training Ascend example:
 bash run_standalone_train.sh [resnet18|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)
 
-# evaluation while distributed training GPU example:
+# evaluation with distributed training GPU example:
 bash run_distribute_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)
 
-# evaluation while standalone training GPU example:
+# evaluation with standalone training GPU example:
 bash run_standalone_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)
 ```
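
For concreteness, a hedged instantiation of the first Ascend example above, with illustrative paths (the rank table and dataset locations depend on the local setup):

```bash
# run distributed training on Ascend with evaluation enabled;
# all paths below are placeholders for the local environment
bash run_distribute_train.sh resnet50 imagenet2012 ./rank_table_8pcs.json \
     /data/imagenet2012/train True /data/imagenet2012/val
```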

File 4 of 5: ResNet dataset creation script (Python)

@@ -34,8 +34,8 @@ def create_dataset1(dataset_path, do_train, repeat_num=1, batch_size=32, target=
         batch_size(int): the batch size of dataset. Default: 32
         target(str): the device target. Default: Ascend
         distribute(bool): data for distribute or not. Default: False
-        enable_cache(bool): whether tensor caching service is used for eval.
-        cache_session_id(int): If enable_cache, cache session_id need to be provided.
+        enable_cache(bool): whether tensor caching service is used for eval. Default: False
+        cache_session_id(int): If enable_cache, cache session_id needs to be provided. Default: None
 
     Returns:
         dataset
@@ -104,8 +104,8 @@ def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, target=
         batch_size(int): the batch size of dataset. Default: 32
         target(str): the device target. Default: Ascend
         distribute(bool): data for distribute or not. Default: False
-        enable_cache(bool): whether tensor caching service is used for eval.
-        cache_session_id(int): If enable_cache, cache session_id need to be provided.
+        enable_cache(bool): whether tensor caching service is used for eval. Default: False
+        cache_session_id(int): If enable_cache, cache session_id needs to be provided. Default: None
 
     Returns:
         dataset
@@ -182,8 +182,8 @@ def create_dataset3(dataset_path, do_train, repeat_num=1, batch_size=32, target=
         batch_size(int): the batch size of dataset. Default: 32
         target(str): the device target. Default: Ascend
         distribute(bool): data for distribute or not. Default: False
-        enable_cache(bool): whether tensor caching service is used for eval.
-        cache_session_id(int): If enable_cache, cache session_id need to be provided.
+        enable_cache(bool): whether tensor caching service is used for eval. Default: False
+        cache_session_id(int): If enable_cache, cache session_id needs to be provided. Default: None
 
     Returns:
         dataset
@@ -259,8 +259,8 @@ def create_dataset4(dataset_path, do_train, repeat_num=1, batch_size=32, target=
         batch_size(int): the batch size of dataset. Default: 32
         target(str): the device target. Default: Ascend
         distribute(bool): data for distribute or not. Default: False
-        enable_cache(bool): whether tensor caching service is used for eval.
-        cache_session_id(int): If enable_cache, cache session_id need to be provided.
+        enable_cache(bool): whether tensor caching service is used for eval. Default: False
+        cache_session_id(int): If enable_cache, cache session_id needs to be provided. Default: None
 
     Returns:
         dataset
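
The cache_session_id documented above is the same ID that the cache_admin tool manages: a typical workflow generates a session, feeds its ID to the dataset pipeline, and destroys it afterwards. A hedged sketch of that flow (the eval-script flags follow the pattern this model's README documents; the exact parameter spelling may differ per script):

```bash
cache_admin --start      # start the cache server
cache_admin -g           # generate a cache session; note the printed session ID

# pass the printed ID into the pipeline (illustrative flags):
python eval.py --enable_cache=True --cache_session_id=<printed id> ...

# with this commit, leftover sessions can be destroyed in one call:
cache_admin --destroy_session <id1> <id2>
```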

File 5 of 5: cache test script (shell)

@@ -16,7 +16,7 @@
 # source the globals and functions for use with cache testing
 export SKIP_ADMIN_COUNTER=false
-declare failed_tests
+declare session_id failed_tests
 . cachetest_lib.sh
 
 echo
@@ -160,6 +160,18 @@ cmd="${CACHE_ADMIN} -d 99999"
 CacheAdminCmd "${cmd}" 1
 HandleRcExit $? 0 0
 
+# generate two new sessions to test multi-destroy
+GetSession
+HandleRcExit $? 0 0
+session_id1=$session_id
+GetSession
+HandleRcExit $? 0 0
+session_id2=$session_id
+
+# test multi-session destroy
+cmd="${CACHE_ADMIN} -d ${session_id1} ${session_id2}"
+CacheAdminCmd "${cmd}" 0
+HandleRcExit $? 0 0
+
 # stop cache server at this point
 StopServer
 HandleRcExit $? 1 1