forked from mindspore-Ecosystem/mindspore

!15560 Optimize cache_admin tool to allow multiple session destroy

From: @lixiachen
Reviewed-by: @robingrosman, @nsyca
Signed-off-by: @robingrosman

Commit: 83cdb8bb38
@@ -17,6 +17,7 @@
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#include <algorithm>
#include <cerrno>
#include <iomanip>
#include <iostream>
@@ -39,7 +40,6 @@ const char CacheAdminArgHandler::kServerBinary[] = "cache_server";

CacheAdminArgHandler::CacheAdminArgHandler()
    : port_(kCfgDefaultCachePort),
-     session_id_(0),
      num_workers_(kDefaultNumWorkers),
      shm_mem_sz_(kDefaultSharedMemorySizeInGB),
      log_level_(kDefaultLogLevel),
@@ -102,6 +102,52 @@ CacheAdminArgHandler::CacheAdminArgHandler()

CacheAdminArgHandler::~CacheAdminArgHandler() = default;

+Status CacheAdminArgHandler::AssignArg(std::string option, std::vector<uint32_t> *out_arg,
+                                       std::stringstream *arg_stream, CommandId command_id) {
+  // Detect if the user tried to provide this argument more than once
+  ArgValue selected_arg = arg_map_[option];
+  if (used_args_[selected_arg]) {
+    std::string err_msg = "The " + option + " argument was given more than once.";
+    return Status(StatusCode::kMDSyntaxError, err_msg);
+  }
+
+  // Flag that this arg is used now
+  used_args_[selected_arg] = true;
+
+  // Some options are just arguments, for example "--port 50052" is not a command, it's just a argument.
+  // Other options are actual commands, for example "--destroy_session 1234". This executes the destroy session.
+  // If this option is also a command, make sure there has not been multiple commands given before assigning it.
+  if (command_id != CommandId::kCmdUnknown) {
+    if (command_id_ != CommandId::kCmdUnknown) {
+      std::string err_msg = "Only one command at a time is allowed. Invalid command: " + option;
+      return Status(StatusCode::kMDSyntaxError, err_msg);
+    } else {
+      command_id_ = command_id;
+    }
+  }
+
+  uint32_t value_as_uint;
+  while (arg_stream->rdbuf()->in_avail() != 0) {
+    *arg_stream >> value_as_uint;
+    if (arg_stream->fail()) {
+      arg_stream->clear();
+      std::string value_as_string;
+      *arg_stream >> value_as_string;
+      std::string err_msg = "Invalid numeric value: " + value_as_string;
+      return Status(StatusCode::kMDSyntaxError, err_msg);
+    } else {
+      out_arg->push_back(value_as_uint);
+    }
+  }
+
+  if (out_arg->empty()) {
+    std::string err_msg = option + " option requires an argument field. Syntax: " + option + " <field>";
+    return Status(StatusCode::kMDSyntaxError, err_msg);
+  }
+
+  return Status::OK();
+}
+
Status CacheAdminArgHandler::AssignArg(std::string option, int32_t *out_arg, std::stringstream *arg_stream,
                                       CommandId command_id) {
  // Detect if the user tried to provide this argument more than once
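The core of the new overload above is the multi-value parsing loop: it drains the remaining whitespace-separated tokens from the argument stream, pushes each token that parses as an unsigned integer into `out_arg`, rejects the whole option on the first non-numeric token, and finally requires at least one value. The snippet below is a minimal standalone sketch of that pattern; `ParseIds`, its error-reporting convention, and the sample inputs are illustrative only and are not part of the MindSpore sources.

```cpp
#include <cstdint>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Illustrative stand-in for the parsing loop in the new AssignArg overload:
// returns false on the first token that is not a valid unsigned integer,
// or when no value was supplied at all.
bool ParseIds(std::stringstream *arg_stream, std::vector<uint32_t> *out_arg, std::string *err_msg) {
  uint32_t value_as_uint;
  while (arg_stream->rdbuf()->in_avail() != 0) {
    *arg_stream >> value_as_uint;
    if (arg_stream->fail()) {
      arg_stream->clear();
      std::string value_as_string;
      *arg_stream >> value_as_string;
      *err_msg = "Invalid numeric value: " + value_as_string;
      return false;
    }
    out_arg->push_back(value_as_uint);
  }
  if (out_arg->empty()) {
    *err_msg = "option requires an argument field";
    return false;
  }
  return true;
}

int main() {
  // "1234 5678" parses into two session ids; "1234 abc" is rejected as a whole.
  for (const char *input : {"1234 5678", "1234 abc"}) {
    std::stringstream ss(input);
    std::vector<uint32_t> ids;
    std::string err;
    if (ParseIds(&ss, &ids, &err)) {
      std::cout << input << " -> " << ids.size() << " id(s)" << std::endl;
    } else {
      std::cout << input << " -> error: " << err << std::endl;
    }
  }
  return 0;
}
```

Because the loop only stops when the stream is exhausted, the destroy-session option (`--destroy_session`, invoked as `-d` in the test script further down) can now take any number of session IDs in one invocation, while single-value options keep using the existing `int32_t` overload.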
@@ -269,11 +315,7 @@ Status CacheAdminArgHandler::ParseArgStream(std::stringstream *arg_stream) {
      break;
    }
    case ArgValue::kArgDestroySession: {
-      // session_id is an unsigned type. We may need to template the AssignArg function so that
-      // it can handle different flavours of integers instead of just int32_t.
-      int32_t session_int;
-      RETURN_IF_NOT_OK(AssignArg(tok, &session_int, arg_stream, CommandId::kCmdDestroySession));
-      session_id_ = session_int;
+      RETURN_IF_NOT_OK(AssignArg(tok, &session_ids_, arg_stream, CommandId::kCmdDestroySession));
      break;
    }
    case ArgValue::kArgNumWorkers: {
@@ -376,11 +418,13 @@ Status CacheAdminArgHandler::RunCommand() {
      CacheClientGreeter comm(hostname_, port_, 1);
      RETURN_IF_NOT_OK(comm.ServiceStart());
      CacheClientInfo cinfo;
-      cinfo.set_session_id(session_id_);
-      auto rq = std::make_shared<DropSessionRequest>(cinfo);
-      RETURN_IF_NOT_OK(comm.HandleRequest(rq));
-      RETURN_IF_NOT_OK(rq->Wait());
-      std::cout << "Drop session successfully for server on port " << std::to_string(port_) << std::endl;
+      for (session_id_type id : session_ids_) {
+        cinfo.set_session_id(id);
+        auto rq = std::make_shared<DropSessionRequest>(cinfo);
+        RETURN_IF_NOT_OK(comm.HandleRequest(rq));
+        RETURN_IF_NOT_OK(rq->Wait());
+        std::cout << "Drop session " << id << " successfully for server on port " << std::to_string(port_) << std::endl;
+      }
      break;
    }
    case CommandId::kCmdListSessions: {
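The change to `RunCommand` keeps a single connection to the cache server open and fans the drop out over every parsed ID: each session gets its own `DropSessionRequest` and its own status line, and `RETURN_IF_NOT_OK` stops the loop at the first failure, so later IDs are not attempted. Below is a schematic sketch of that control flow with a hypothetical `DropOneSession` stub standing in for the real `CacheClientGreeter`/`DropSessionRequest` round trip; the stub, the `DropSessions` helper, and the sample IDs and port are assumptions for illustration, not MindSpore API.

```cpp
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

using session_id_type = uint32_t;  // assumption: an unsigned ID type like the one used by the cache client

// Hypothetical stand-in for building a DropSessionRequest, sending it over the
// already-open connection, and waiting for the reply. Returns false on failure.
bool DropOneSession(session_id_type id, std::string *err) {
  if (id == 0) {  // pretend session 0 does not exist on the server
    *err = "Session " + std::to_string(id) + " not found";
    return false;
  }
  return true;
}

// Mirrors the shape of the new loop in RunCommand: report each session
// individually and stop at the first failed drop.
bool DropSessions(const std::vector<session_id_type> &session_ids, int32_t port) {
  for (session_id_type id : session_ids) {
    std::string err;
    if (!DropOneSession(id, &err)) {
      std::cerr << err << std::endl;
      return false;
    }
    std::cout << "Drop session " << id << " successfully for server on port " << port << std::endl;
  }
  return true;
}

int main() {
  DropSessions({1234, 5678}, 50052);     // both IDs dropped, one status line each
  DropSessions({1234, 0, 5678}, 50052);  // stops at the bad ID; 5678 is never attempted
  return 0;
}
```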
@@ -22,6 +22,7 @@
#include <string>
#include <sstream>
#include <thread>
#include <vector>
#include "minddata/dataset/util/status.h"
#include "minddata/dataset/engine/cache/cache_client.h"

@@ -94,6 +95,9 @@ class CacheAdminArgHandler {
  Status AssignArg(std::string option, float *out_arg, std::stringstream *arg_stream,
                   CommandId command_id = CommandId::kCmdUnknown);

+  Status AssignArg(std::string option, std::vector<uint32_t> *out_arg, std::stringstream *arg_stream,
+                   CommandId command_id = CommandId::kCmdUnknown);
+
  Status Validate();

  CommandId command_id_;

@@ -102,7 +106,7 @@ class CacheAdminArgHandler {
  int32_t shm_mem_sz_;
  int32_t log_level_;
  float memory_cap_ratio_;
-  session_id_type session_id_;
+  std::vector<session_id_type> session_ids_;
  std::string hostname_;
  std::string spill_dir_;
  std::string trailing_args_;
@@ -332,16 +332,16 @@ bash run_parameter_server_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet201

#### Evaluation while training

```bash
-# evaluation while distributed training Ascend example:
+# evaluation with distributed training Ascend example:
bash run_distribute_train.sh [resnet18|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)

-# evaluation while standalone training Ascend example:
+# evaluation with standalone training Ascend example:
bash run_standalone_train.sh [resnet18|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)

-# evaluation while distributed training GPU example:
+# evaluation with distributed training GPU example:
bash run_distribute_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)

-# evaluation while standalone training GPU example:
+# evaluation with standalone training GPU example:
bash run_standalone_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)
```
@@ -34,8 +34,8 @@ def create_dataset1(dataset_path, do_train, repeat_num=1, batch_size=32, target=
        batch_size(int): the batch size of dataset. Default: 32
        target(str): the device target. Default: Ascend
        distribute(bool): data for distribute or not. Default: False
-        enable_cache(bool): whether tensor caching service is used for eval.
-        cache_session_id(int): If enable_cache, cache session_id need to be provided.
+        enable_cache(bool): whether tensor caching service is used for eval. Default: False
+        cache_session_id(int): If enable_cache, cache session_id need to be provided. Default: None

    Returns:
        dataset

@@ -104,8 +104,8 @@ def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, target=
        batch_size(int): the batch size of dataset. Default: 32
        target(str): the device target. Default: Ascend
        distribute(bool): data for distribute or not. Default: False
-        enable_cache(bool): whether tensor caching service is used for eval.
-        cache_session_id(int): If enable_cache, cache session_id need to be provided.
+        enable_cache(bool): whether tensor caching service is used for eval. Default: False
+        cache_session_id(int): If enable_cache, cache session_id need to be provided. Default: None

    Returns:
        dataset

@@ -182,8 +182,8 @@ def create_dataset3(dataset_path, do_train, repeat_num=1, batch_size=32, target=
        batch_size(int): the batch size of dataset. Default: 32
        target(str): the device target. Default: Ascend
        distribute(bool): data for distribute or not. Default: False
-        enable_cache(bool): whether tensor caching service is used for eval.
-        cache_session_id(int): If enable_cache, cache session_id need to be provided.
+        enable_cache(bool): whether tensor caching service is used for eval. Default: False
+        cache_session_id(int): If enable_cache, cache session_id need to be provided. Default: None

    Returns:
        dataset

@@ -259,8 +259,8 @@ def create_dataset4(dataset_path, do_train, repeat_num=1, batch_size=32, target=
        batch_size(int): the batch size of dataset. Default: 32
        target(str): the device target. Default: Ascend
        distribute(bool): data for distribute or not. Default: False
-        enable_cache(bool): whether tensor caching service is used for eval.
-        cache_session_id(int): If enable_cache, cache session_id need to be provided.
+        enable_cache(bool): whether tensor caching service is used for eval. Default: False
+        cache_session_id(int): If enable_cache, cache session_id need to be provided. Default: None

    Returns:
        dataset
@@ -16,7 +16,7 @@

# source the globals and functions for use with cache testing
export SKIP_ADMIN_COUNTER=false
-declare failed_tests
+declare session_id failed_tests
. cachetest_lib.sh
echo

@@ -160,6 +160,18 @@ cmd="${CACHE_ADMIN} -d 99999"
CacheAdminCmd "${cmd}" 1
HandleRcExit $? 0 0

+# generate two new sessions to test multi-destroy
+GetSession
+HandleRcExit $? 0 0
+session_id1=$session_id
+GetSession
+HandleRcExit $? 0 0
+session_id2=$session_id
+# test multi-session destroy
+cmd="${CACHE_ADMIN} -d ${session_id1} ${session_id2}"
+CacheAdminCmd "${cmd}" 0
+HandleRcExit $? 0 0
+
# stop cache server at this point
StopServer
HandleRcExit $? 1 1