forked from mindspore-Ecosystem/mindspore
!1978 MindData Profiler Infrastructure
Merge pull request !1978 from JunhanHu/perf_monitor
This commit is contained in:
commit
4aed2bf9ef
|
@ -62,6 +62,7 @@ add_dependencies(engine-datasetops-source core)
|
||||||
add_dependencies(engine-datasetops-source-sampler core)
|
add_dependencies(engine-datasetops-source-sampler core)
|
||||||
add_dependencies(engine-datasetops core)
|
add_dependencies(engine-datasetops core)
|
||||||
add_dependencies(engine-opt core)
|
add_dependencies(engine-opt core)
|
||||||
|
add_dependencies(engine-perf core)
|
||||||
add_dependencies(engine-gnn core)
|
add_dependencies(engine-gnn core)
|
||||||
add_dependencies(engine core)
|
add_dependencies(engine core)
|
||||||
add_dependencies(text core)
|
add_dependencies(text core)
|
||||||
|
@ -81,6 +82,7 @@ set(submodules
|
||||||
$<TARGET_OBJECTS:engine-datasetops-source>
|
$<TARGET_OBJECTS:engine-datasetops-source>
|
||||||
$<TARGET_OBJECTS:engine-datasetops-source-sampler>
|
$<TARGET_OBJECTS:engine-datasetops-source-sampler>
|
||||||
$<TARGET_OBJECTS:engine-gnn>
|
$<TARGET_OBJECTS:engine-gnn>
|
||||||
|
$<TARGET_OBJECTS:engine-perf>
|
||||||
$<TARGET_OBJECTS:engine-datasetops>
|
$<TARGET_OBJECTS:engine-datasetops>
|
||||||
$<TARGET_OBJECTS:engine-opt>
|
$<TARGET_OBJECTS:engine-opt>
|
||||||
$<TARGET_OBJECTS:engine>
|
$<TARGET_OBJECTS:engine>
|
||||||
|
|
|
@ -239,11 +239,13 @@ void bindTensor(py::module *m) {
|
||||||
.def("set_worker_connector_size", &ConfigManager::set_worker_connector_size)
|
.def("set_worker_connector_size", &ConfigManager::set_worker_connector_size)
|
||||||
.def("set_op_connector_size", &ConfigManager::set_op_connector_size)
|
.def("set_op_connector_size", &ConfigManager::set_op_connector_size)
|
||||||
.def("set_seed", &ConfigManager::set_seed)
|
.def("set_seed", &ConfigManager::set_seed)
|
||||||
|
.def("set_monitor_sampling_interval", &ConfigManager::set_monitor_sampling_interval)
|
||||||
.def("get_rows_per_buffer", &ConfigManager::rows_per_buffer)
|
.def("get_rows_per_buffer", &ConfigManager::rows_per_buffer)
|
||||||
.def("get_num_parallel_workers", &ConfigManager::num_parallel_workers)
|
.def("get_num_parallel_workers", &ConfigManager::num_parallel_workers)
|
||||||
.def("get_worker_connector_size", &ConfigManager::worker_connector_size)
|
.def("get_worker_connector_size", &ConfigManager::worker_connector_size)
|
||||||
.def("get_op_connector_size", &ConfigManager::op_connector_size)
|
.def("get_op_connector_size", &ConfigManager::op_connector_size)
|
||||||
.def("get_seed", &ConfigManager::seed)
|
.def("get_seed", &ConfigManager::seed)
|
||||||
|
.def("get_monitor_sampling_interval", &ConfigManager::monitor_sampling_interval)
|
||||||
.def("load", [](ConfigManager &c, std::string s) { (void)c.LoadFile(s); });
|
.def("load", [](ConfigManager &c, std::string s) { (void)c.LoadFile(s); });
|
||||||
|
|
||||||
(void)py::class_<Tensor, std::shared_ptr<Tensor>>(*m, "Tensor", py::buffer_protocol())
|
(void)py::class_<Tensor, std::shared_ptr<Tensor>>(*m, "Tensor", py::buffer_protocol())
|
||||||
|
|
|
@ -88,5 +88,7 @@ void ConfigManager::set_op_connector_size(int32_t connector_size) { op_connector
|
||||||
uint32_t ConfigManager::seed() const { return seed_; }
|
uint32_t ConfigManager::seed() const { return seed_; }
|
||||||
|
|
||||||
void ConfigManager::set_seed(uint32_t seed) { seed_ = seed; }
|
void ConfigManager::set_seed(uint32_t seed) { seed_ = seed; }
|
||||||
|
|
||||||
|
void ConfigManager::set_monitor_sampling_interval(uint32_t interval) { monitor_sampling_interval_ = interval; }
|
||||||
} // namespace dataset
|
} // namespace dataset
|
||||||
} // namespace mindspore
|
} // namespace mindspore
|
||||||
|
|
|
@ -111,12 +111,21 @@ class ConfigManager {
|
||||||
// @param seed - The default seed to use
|
// @param seed - The default seed to use
|
||||||
void set_seed(uint32_t seed);
|
void set_seed(uint32_t seed);
|
||||||
|
|
||||||
|
// setter function
|
||||||
|
// @param interval - The setting to apply to the config
|
||||||
|
void set_monitor_sampling_interval(uint32_t interval);
|
||||||
|
|
||||||
|
// getter function
|
||||||
|
// @return The iterval of monitor sampling
|
||||||
|
int32_t monitor_sampling_interval() const { return monitor_sampling_interval_; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
int32_t rows_per_buffer_{kCfgRowsPerBuffer};
|
int32_t rows_per_buffer_{kCfgRowsPerBuffer};
|
||||||
int32_t num_parallel_workers_{kCfgParallelWorkers};
|
int32_t num_parallel_workers_{kCfgParallelWorkers};
|
||||||
int32_t worker_connector_size_{kCfgWorkerConnectorSize};
|
int32_t worker_connector_size_{kCfgWorkerConnectorSize};
|
||||||
int32_t op_connector_size_{kCfgOpConnectorSize};
|
int32_t op_connector_size_{kCfgOpConnectorSize};
|
||||||
uint32_t seed_{kCfgDefaultSeed};
|
uint32_t seed_{kCfgDefaultSeed};
|
||||||
|
uint32_t monitor_sampling_interval_{kCfgMonitorSamplingInterval};
|
||||||
|
|
||||||
// Private helper function that taks a nlohmann json format and populates the settings
|
// Private helper function that taks a nlohmann json format and populates the settings
|
||||||
// @param j - The json nlohmann json info
|
// @param j - The json nlohmann json info
|
||||||
|
|
|
@ -47,6 +47,7 @@ constexpr uint32_t kCfgParallelWorkers = 4;
|
||||||
constexpr uint32_t kCfgWorkerConnectorSize = 16;
|
constexpr uint32_t kCfgWorkerConnectorSize = 16;
|
||||||
constexpr uint32_t kCfgOpConnectorSize = 16;
|
constexpr uint32_t kCfgOpConnectorSize = 16;
|
||||||
constexpr uint32_t kCfgDefaultSeed = std::mt19937::default_seed;
|
constexpr uint32_t kCfgDefaultSeed = std::mt19937::default_seed;
|
||||||
|
constexpr uint32_t kCfgMonitorSamplingInterval = 10;
|
||||||
|
|
||||||
// Invalid OpenCV type should not be from 0 to 7 (opencv4/opencv2/core/hal/interface.h)
|
// Invalid OpenCV type should not be from 0 to 7 (opencv4/opencv2/core/hal/interface.h)
|
||||||
constexpr uint8_t kCVInvalidType = 255;
|
constexpr uint8_t kCVInvalidType = 255;
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
add_subdirectory(datasetops)
|
add_subdirectory(datasetops)
|
||||||
add_subdirectory(opt)
|
add_subdirectory(opt)
|
||||||
add_subdirectory(gnn)
|
add_subdirectory(gnn)
|
||||||
|
add_subdirectory(perf)
|
||||||
if (ENABLE_TDTQUE)
|
if (ENABLE_TDTQUE)
|
||||||
add_subdirectory(tdt)
|
add_subdirectory(tdt)
|
||||||
endif ()
|
endif ()
|
||||||
|
@ -16,7 +17,7 @@ add_library(engine OBJECT
|
||||||
target_include_directories(engine PRIVATE ${pybind11_INCLUDE_DIRS})
|
target_include_directories(engine PRIVATE ${pybind11_INCLUDE_DIRS})
|
||||||
|
|
||||||
if (ENABLE_TDTQUE)
|
if (ENABLE_TDTQUE)
|
||||||
add_dependencies(engine engine-datasetops engine-datasetops-source engine-tdt engine-opt engine-gnn)
|
add_dependencies(engine engine-datasetops engine-datasetops-source engine-tdt engine-opt engine-gnn engine-perf)
|
||||||
else()
|
else()
|
||||||
add_dependencies(engine engine-datasetops engine-datasetops-source engine-opt engine-gnn)
|
add_dependencies(engine engine-datasetops engine-datasetops-source engine-opt engine-gnn engine-perf)
|
||||||
endif ()
|
endif ()
|
||||||
|
|
|
@ -83,7 +83,14 @@ Status IteratorBase::FetchNextTensorRow(TensorRow *out_row) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Constructor of the DatasetIterator
|
// Constructor of the DatasetIterator
|
||||||
DatasetIterator::DatasetIterator(std::shared_ptr<ExecutionTree> exe_tree) : IteratorBase(), root_(exe_tree->root()) {}
|
DatasetIterator::DatasetIterator(std::shared_ptr<ExecutionTree> exe_tree)
|
||||||
|
: IteratorBase(), root_(exe_tree->root()), tracing_(nullptr), cur_batch_num_(0), cur_connector_size_(0) {
|
||||||
|
std::shared_ptr<Tracing> node;
|
||||||
|
Status s = exe_tree->GetProfilingManager()->GetTracingNode(kDatasetIteratorTracingName, &node);
|
||||||
|
if (s.IsOk()) {
|
||||||
|
tracing_ = std::dynamic_pointer_cast<DatasetIteratorTracing>(node);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
DatasetIterator::~DatasetIterator() = default;
|
DatasetIterator::~DatasetIterator() = default;
|
||||||
|
|
||||||
|
@ -101,6 +108,10 @@ Status DatasetIterator::FetchNextTensorRow(TensorRow *out_row) {
|
||||||
|
|
||||||
// Check if we need to get a new DataBuffer to iterate.
|
// Check if we need to get a new DataBuffer to iterate.
|
||||||
if (curr_buffer_ == nullptr || curr_buffer_->NumRows() == 0) {
|
if (curr_buffer_ == nullptr || curr_buffer_->NumRows() == 0) {
|
||||||
|
if (tracing_ != nullptr) {
|
||||||
|
cur_connector_size_ = root_->ConnectorSize();
|
||||||
|
cur_connector_capacity_ = root_->ConnectorCapacity();
|
||||||
|
}
|
||||||
RETURN_IF_NOT_OK(root_->GetNextBuffer(&curr_buffer_));
|
RETURN_IF_NOT_OK(root_->GetNextBuffer(&curr_buffer_));
|
||||||
|
|
||||||
// Since GetNextBuffer was used rather than GetNextInput(), it means we need to manually
|
// Since GetNextBuffer was used rather than GetNextInput(), it means we need to manually
|
||||||
|
@ -121,6 +132,8 @@ Status DatasetIterator::FetchNextTensorRow(TensorRow *out_row) {
|
||||||
}
|
}
|
||||||
eof_handled_ = true;
|
eof_handled_ = true;
|
||||||
curr_buffer_.reset(); // explicitly free the eof buffer
|
curr_buffer_.reset(); // explicitly free the eof buffer
|
||||||
|
// Set tree to Finished state
|
||||||
|
root_->Tree()->SetFinished();
|
||||||
|
|
||||||
return Status::OK();
|
return Status::OK();
|
||||||
}
|
}
|
||||||
|
@ -131,13 +144,18 @@ Status DatasetIterator::FetchNextTensorRow(TensorRow *out_row) {
|
||||||
// flow of an eof up the pipeline by itself.
|
// flow of an eof up the pipeline by itself.
|
||||||
eof_handled_ = true;
|
eof_handled_ = true;
|
||||||
curr_buffer_.reset(); // explicitly free the eof buffer
|
curr_buffer_.reset(); // explicitly free the eof buffer
|
||||||
|
// Set tree to Finished state
|
||||||
|
root_->Tree()->SetFinished();
|
||||||
return Status::OK();
|
return Status::OK();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// If we got this far, now it's time to pop that next row for return to caller
|
// If we got this far, now it's time to pop that next row for return to caller
|
||||||
RETURN_IF_NOT_OK(curr_buffer_->PopRow(out_row));
|
RETURN_IF_NOT_OK(curr_buffer_->PopRow(out_row));
|
||||||
|
if (tracing_ != nullptr) {
|
||||||
|
cur_batch_num_++;
|
||||||
|
tracing_->Record(CONNECTOR_DEPTH, cur_connector_capacity_, cur_batch_num_, cur_connector_size_);
|
||||||
|
}
|
||||||
return Status::OK();
|
return Status::OK();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -24,6 +24,7 @@
|
||||||
#include "dataset/core/tensor.h"
|
#include "dataset/core/tensor.h"
|
||||||
#include "dataset/engine/datasetops/dataset_op.h"
|
#include "dataset/engine/datasetops/dataset_op.h"
|
||||||
#include "dataset/engine/execution_tree.h"
|
#include "dataset/engine/execution_tree.h"
|
||||||
|
#include "dataset/engine/perf/dataset_iterator_tracing.h"
|
||||||
|
|
||||||
namespace mindspore {
|
namespace mindspore {
|
||||||
namespace dataset {
|
namespace dataset {
|
||||||
|
@ -109,6 +110,10 @@ class DatasetIterator : public IteratorBase {
|
||||||
private:
|
private:
|
||||||
std::shared_ptr<DatasetOp> root_; // saves the root of the executionTree
|
std::shared_ptr<DatasetOp> root_; // saves the root of the executionTree
|
||||||
TensorRow device_queue_row_;
|
TensorRow device_queue_row_;
|
||||||
|
std::shared_ptr<DatasetIteratorTracing> tracing_; // trace profiling data
|
||||||
|
int32_t cur_batch_num_; // current batch number,used for profiling
|
||||||
|
int32_t cur_connector_size_; // current connector size of root op,used for profiling
|
||||||
|
int32_t cur_connector_capacity_; // current connector capacity of root op, used for profiling
|
||||||
};
|
};
|
||||||
|
|
||||||
// The ChildIterator derived class is for fetching rows from intermediate nodes of execution tree.
|
// The ChildIterator derived class is for fetching rows from intermediate nodes of execution tree.
|
||||||
|
|
|
@ -189,6 +189,10 @@ class BatchOp : public ParallelOp {
|
||||||
// @return - Status of the node visit.
|
// @return - Status of the node visit.
|
||||||
Status Accept(NodePass *p, bool *modified) override;
|
Status Accept(NodePass *p, bool *modified) override;
|
||||||
|
|
||||||
|
// Op name getter
|
||||||
|
// @return Name of the current Op
|
||||||
|
std::string Name() const override { return "BatchOp"; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// Worker thread for doing the memcpy of batch
|
// Worker thread for doing the memcpy of batch
|
||||||
// @param int32_t param workerId
|
// @param int32_t param workerId
|
||||||
|
|
|
@ -81,6 +81,10 @@ class ConcatOp : public PipelineOp {
|
||||||
// before providing their own implementations.
|
// before providing their own implementations.
|
||||||
Status PrepareNodePostAction() override;
|
Status PrepareNodePostAction() override;
|
||||||
|
|
||||||
|
// Op name getter
|
||||||
|
// @return Name of the current Op
|
||||||
|
std::string Name() const override { return "ConcatOp"; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
Status Verify(int32_t id, const std::unique_ptr<DataBuffer> &buf);
|
Status Verify(int32_t id, const std::unique_ptr<DataBuffer> &buf);
|
||||||
|
|
||||||
|
|
|
@ -38,6 +38,7 @@ DatasetOp::DatasetOp(int32_t op_connector_size)
|
||||||
tree_(nullptr),
|
tree_(nullptr),
|
||||||
state_(OpState::kDeOpIdle),
|
state_(OpState::kDeOpIdle),
|
||||||
op_ctrl_flags_(kDeOpNone),
|
op_ctrl_flags_(kDeOpNone),
|
||||||
|
out_connector_(nullptr),
|
||||||
first_fetch_(true) {
|
first_fetch_(true) {
|
||||||
// The operator starts out with an invalid operator id. The only way to
|
// The operator starts out with an invalid operator id. The only way to
|
||||||
// get it out of invalid state is to assign the operator to an execution tree.
|
// get it out of invalid state is to assign the operator to an execution tree.
|
||||||
|
|
|
@ -51,7 +51,7 @@ class DatasetOp : public std::enable_shared_from_this<DatasetOp> {
|
||||||
};
|
};
|
||||||
|
|
||||||
// Flags that control operator runtime behaviours
|
// Flags that control operator runtime behaviours
|
||||||
enum OpState { kDeOpRunning = 0, kDeOpIdle = 1 };
|
enum OpState { kDeOpRunning = 0, kDeOpIdle = 1, kDeOpTerminated };
|
||||||
|
|
||||||
// Constructor
|
// Constructor
|
||||||
// @param op_connector_size - The size for the output connector of this operator.
|
// @param op_connector_size - The size for the output connector of this operator.
|
||||||
|
@ -213,11 +213,23 @@ class DatasetOp : public std::enable_shared_from_this<DatasetOp> {
|
||||||
|
|
||||||
// Getter function
|
// Getter function
|
||||||
// @return connector size of current op
|
// @return connector size of current op
|
||||||
virtual int32_t ConnectorSize() const { return out_connector_->size(); }
|
int32_t ConnectorSize() const {
|
||||||
|
if (!inlined()) {
|
||||||
|
return out_connector_->size();
|
||||||
|
}
|
||||||
|
// Return -1 for inlined op
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
// Getter function
|
// Getter function
|
||||||
// @return connector size of current op
|
// @return connector size of current op
|
||||||
virtual int32_t ConnectorCapacity() const { return out_connector_->capacity(); }
|
int32_t ConnectorCapacity() const {
|
||||||
|
if (!inlined()) {
|
||||||
|
return out_connector_->size();
|
||||||
|
}
|
||||||
|
// Return -1 for inlined op
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
// Getter function
|
// Getter function
|
||||||
// @return connector size of child op
|
// @return connector size of child op
|
||||||
|
@ -228,7 +240,7 @@ class DatasetOp : public std::enable_shared_from_this<DatasetOp> {
|
||||||
int32_t ChildOpConnectorCapacity(int32_t child_index = 0) const { return child_[child_index]->ConnectorCapacity(); }
|
int32_t ChildOpConnectorCapacity(int32_t child_index = 0) const { return child_[child_index]->ConnectorCapacity(); }
|
||||||
|
|
||||||
// Children Getter
|
// Children Getter
|
||||||
// @return Vector or Children
|
// @return Vector of Children
|
||||||
std::vector<std::shared_ptr<DatasetOp>> Children() const { return child_; }
|
std::vector<std::shared_ptr<DatasetOp>> Children() const { return child_; }
|
||||||
|
|
||||||
// Base method for NodePass visit.
|
// Base method for NodePass visit.
|
||||||
|
@ -237,6 +249,14 @@ class DatasetOp : public std::enable_shared_from_this<DatasetOp> {
|
||||||
// @return Statue of the node visit
|
// @return Statue of the node visit
|
||||||
virtual Status Accept(NodePass *p, bool *modified);
|
virtual Status Accept(NodePass *p, bool *modified);
|
||||||
|
|
||||||
|
// Op name getter
|
||||||
|
// @return Name of the current Op
|
||||||
|
virtual std::string Name() const { return "DatasetOp"; }
|
||||||
|
|
||||||
|
// Execution Tree getter
|
||||||
|
// @return Pointer to the ExecutionTree the current op belongs to, no ownership
|
||||||
|
ExecutionTree *Tree() { return tree_; }
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
// Adds a parent operator to this operator
|
// Adds a parent operator to this operator
|
||||||
// @notes External callers do not have access to this function.
|
// @notes External callers do not have access to this function.
|
||||||
|
|
|
@ -13,25 +13,23 @@
|
||||||
* See the License for the specific language governing permissions and
|
* See the License for the specific language governing permissions and
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
#include "dataset/engine/datasetops/device_queue_op.h"
|
|
||||||
#include <iomanip>
|
#include <iomanip>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
|
|
||||||
#include "dataset/core/config_manager.h"
|
#include "dataset/core/config_manager.h"
|
||||||
#include "dataset/core/global_context.h"
|
#include "dataset/core/global_context.h"
|
||||||
|
#include "dataset/engine/datasetops/device_queue_op.h"
|
||||||
#include "dataset/engine/data_buffer.h"
|
#include "dataset/engine/data_buffer.h"
|
||||||
#include "dataset/engine/dataset_iterator.h"
|
#include "dataset/engine/dataset_iterator.h"
|
||||||
|
#include "dataset/engine/opt/pass.h"
|
||||||
|
#include "dataset/engine/perf/profiling.h"
|
||||||
|
#include "dataset/engine/perf/device_queue_tracing.h"
|
||||||
#include "dataset/util/status.h"
|
#include "dataset/util/status.h"
|
||||||
#include "dataset/util/task_manager.h"
|
#include "dataset/util/task_manager.h"
|
||||||
#include "dataset/engine/opt/pass.h"
|
|
||||||
#include "dataset/util/profiling.h"
|
|
||||||
|
|
||||||
namespace mindspore {
|
namespace mindspore {
|
||||||
namespace dataset {
|
namespace dataset {
|
||||||
#define DEVICE_QUEUE_PROFILING_DATA(type, subtype, batch_num, value) \
|
|
||||||
std::to_string(type) + " " + std::to_string(subtype) + " " + std::to_string(batch_num) + " " + std::to_string(value)
|
|
||||||
|
|
||||||
DeviceQueueOp::DeviceQueueOp(std::string channel_name, DeviceType device_type, int32_t device_id, int32_t prefetch_size,
|
DeviceQueueOp::DeviceQueueOp(std::string channel_name, DeviceType device_type, int32_t device_id, int32_t prefetch_size,
|
||||||
int32_t op_connector_size, int64_t num_batch)
|
int32_t op_connector_size, int64_t num_batch)
|
||||||
: PipelineOp(op_connector_size),
|
: PipelineOp(op_connector_size),
|
||||||
|
@ -101,22 +99,16 @@ Status DeviceQueueOp::SendDataToAscend() {
|
||||||
MS_LOG(INFO) << "Device queue, sending data to Ascend.";
|
MS_LOG(INFO) << "Device queue, sending data to Ascend.";
|
||||||
int64_t total_batch = 0;
|
int64_t total_batch = 0;
|
||||||
bool is_break_loop = false;
|
bool is_break_loop = false;
|
||||||
double batch_start_time, tdt_start_time, end_time;
|
double batch_start_time, end_time;
|
||||||
int32_t batch_cost, tdt_cost;
|
int32_t batch_cost, tdt_cost;
|
||||||
int32_t connector_size = 0;
|
int32_t connector_size = 0;
|
||||||
int32_t connector_capacity;
|
int32_t connector_capacity;
|
||||||
std::shared_ptr<Profiling> profiling_node;
|
std::shared_ptr<DeviceQueueTracing> profiling_node;
|
||||||
bool isProfilingEnable = ProfilingManager::GetInstance().IsProfilingEnable();
|
bool isProfilingEnable = tree_->GetProfilingManager()->IsProfilingEnable();
|
||||||
if (isProfilingEnable) {
|
if (isProfilingEnable) {
|
||||||
std::string file_name = "critical_point_profiling";
|
std::shared_ptr<Tracing> node;
|
||||||
// Here can determine performance bottleneck is in pipeline or in tdt.
|
RETURN_IF_NOT_OK(tree_->GetProfilingManager()->GetTracingNode(kDeviceQueueTracingName, &node));
|
||||||
// Context format of this file "type subtype batchnum value"
|
profiling_node = std::dynamic_pointer_cast<DeviceQueueTracing>(node);
|
||||||
// type:0: time, 1: queue depth
|
|
||||||
// subtype:0: pipeline time, 1: push tdt time, 2: all time
|
|
||||||
// batchnum: batch number
|
|
||||||
// value: value of time(ms) or queue depth
|
|
||||||
profiling_node = std::make_shared<Profiling>(file_name, device_id_);
|
|
||||||
RETURN_IF_NOT_OK(ProfilingManager::GetInstance().RegisterProfilingNode(&profiling_node));
|
|
||||||
batch_start_time = ProfilingTime::GetCurMilliSecond();
|
batch_start_time = ProfilingTime::GetCurMilliSecond();
|
||||||
connector_capacity = ChildOpConnectorCapacity();
|
connector_capacity = ChildOpConnectorCapacity();
|
||||||
}
|
}
|
||||||
|
@ -129,29 +121,23 @@ Status DeviceQueueOp::SendDataToAscend() {
|
||||||
TensorRow currRow;
|
TensorRow currRow;
|
||||||
for (int row_id = 0; row_id < current_buffer->NumRows() && !is_break_loop; row_id++) {
|
for (int row_id = 0; row_id < current_buffer->NumRows() && !is_break_loop; row_id++) {
|
||||||
RETURN_IF_NOT_OK(current_buffer->GetRow(row_id, &currRow));
|
RETURN_IF_NOT_OK(current_buffer->GetRow(row_id, &currRow));
|
||||||
if (isProfilingEnable) {
|
auto status = tdtInstancePtr->hostPush(currRow, true, channel_name_, isProfilingEnable, tdt_cost);
|
||||||
tdt_start_time = ProfilingTime::GetCurMilliSecond();
|
|
||||||
}
|
|
||||||
auto status = tdtInstancePtr->hostPush(currRow, true, channel_name_);
|
|
||||||
if (status == TdtStatus::FAILED) {
|
if (status == TdtStatus::FAILED) {
|
||||||
return Status(StatusCode::kTDTPushFailure, "TDT Push Failed");
|
return Status(StatusCode::kTDTPushFailure, "TDT Push Failed");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (isProfilingEnable) {
|
if (isProfilingEnable) {
|
||||||
end_time = ProfilingTime::GetCurMilliSecond();
|
end_time = ProfilingTime::GetCurMilliSecond();
|
||||||
tdt_cost = (int32_t)(end_time - tdt_start_time);
|
|
||||||
// record push tdt time
|
// record push tdt time
|
||||||
profiling_node->Record(DEVICE_QUEUE_PROFILING_DATA(TIME, TDT_PUSH_TIME, total_batch + 1, tdt_cost));
|
profiling_node->Record(TIME, TDT_PUSH_TIME, total_batch + 1, tdt_cost);
|
||||||
batch_cost = (int32_t)(end_time - batch_start_time);
|
batch_cost = (int32_t)(end_time - batch_start_time);
|
||||||
// record batch time
|
// record batch time
|
||||||
profiling_node->Record(DEVICE_QUEUE_PROFILING_DATA(TIME, BATCH_TIME, total_batch + 1, batch_cost));
|
profiling_node->Record(TIME, BATCH_TIME, total_batch + 1, batch_cost);
|
||||||
// record pipeline time
|
// record pipeline time
|
||||||
profiling_node->Record(
|
profiling_node->Record(TIME, PIPELINE_TIME, total_batch + 1, batch_cost - tdt_cost);
|
||||||
DEVICE_QUEUE_PROFILING_DATA(TIME, PIPELINE_TIME, total_batch + 1, batch_cost - tdt_cost));
|
|
||||||
batch_start_time = end_time;
|
batch_start_time = end_time;
|
||||||
// record connector depth
|
// record connector depth
|
||||||
profiling_node->Record(
|
profiling_node->Record(CONNECTOR_DEPTH, connector_capacity, total_batch + 1, connector_size);
|
||||||
DEVICE_QUEUE_PROFILING_DATA(CONNECTOR_DEPTH, connector_capacity, total_batch + 1, connector_size));
|
|
||||||
}
|
}
|
||||||
total_batch++;
|
total_batch++;
|
||||||
if (num_batch_ > 0 && total_batch == num_batch_) {
|
if (num_batch_ > 0 && total_batch == num_batch_) {
|
||||||
|
@ -171,9 +157,7 @@ Status DeviceQueueOp::SendDataToAscend() {
|
||||||
RETURN_IF_NOT_OK(GetNextInput(¤t_buffer));
|
RETURN_IF_NOT_OK(GetNextInput(¤t_buffer));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (isProfilingEnable) {
|
tree_->SetFinished();
|
||||||
profiling_node->SaveToFile();
|
|
||||||
}
|
|
||||||
MS_LOG(INFO) << "Device queue total batch is " << total_batch << ", number of batches is " << num_batch_ << ".";
|
MS_LOG(INFO) << "Device queue total batch is " << total_batch << ", number of batches is " << num_batch_ << ".";
|
||||||
|
|
||||||
return Status::OK();
|
return Status::OK();
|
||||||
|
|
|
@ -140,6 +140,10 @@ class DeviceQueueOp : public PipelineOp {
|
||||||
// @return - Status of the node visit.
|
// @return - Status of the node visit.
|
||||||
Status Accept(NodePass *p, bool *modified) override;
|
Status Accept(NodePass *p, bool *modified) override;
|
||||||
|
|
||||||
|
// Op name getter
|
||||||
|
// @return Name of the current Op
|
||||||
|
std::string Name() const override { return "DeviceQueueOp"; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// Name: checkExceptions(DataBuffer);
|
// Name: checkExceptions(DataBuffer);
|
||||||
// Description: Check whether the dataBuffer meets the condition for performing DeviceQueueOp
|
// Description: Check whether the dataBuffer meets the condition for performing DeviceQueueOp
|
||||||
|
|
|
@ -127,6 +127,10 @@ class FilterOp : public ParallelOp {
|
||||||
// @return - Status of the node visit.
|
// @return - Status of the node visit.
|
||||||
Status Accept(NodePass *p, bool *modified) override;
|
Status Accept(NodePass *p, bool *modified) override;
|
||||||
|
|
||||||
|
// Op name getter
|
||||||
|
// @return Name of the current Op
|
||||||
|
std::string Name() const override { return "FilterOp"; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// predicate_func python callable which returns a boolean value.
|
// predicate_func python callable which returns a boolean value.
|
||||||
py::function predicate_func_;
|
py::function predicate_func_;
|
||||||
|
|
|
@ -177,6 +177,10 @@ class MapOp : public ParallelOp {
|
||||||
// @return - Status of the node visit.
|
// @return - Status of the node visit.
|
||||||
Status Accept(NodePass *p, bool *modified) override;
|
Status Accept(NodePass *p, bool *modified) override;
|
||||||
|
|
||||||
|
// Op name getter
|
||||||
|
// @return Name of the current Op
|
||||||
|
std::string Name() const override { return "MapOp"; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// Local queues where worker threads can pop from.
|
// Local queues where worker threads can pop from.
|
||||||
// Popping directly from the Connector can block if the previous designated threads haven't pop.
|
// Popping directly from the Connector can block if the previous designated threads haven't pop.
|
||||||
|
|
|
@ -107,6 +107,10 @@ class ProjectOp : public PipelineOp {
|
||||||
// @return - Status of the node visit.
|
// @return - Status of the node visit.
|
||||||
Status Accept(NodePass *p, bool *modified) override;
|
Status Accept(NodePass *p, bool *modified) override;
|
||||||
|
|
||||||
|
// Op name getter
|
||||||
|
// @return Name of the current Op
|
||||||
|
std::string Name() const override { return "ProjectOp"; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
std::vector<std::string> columns_to_project_;
|
std::vector<std::string> columns_to_project_;
|
||||||
std::vector<int32_t> projected_column_indices_;
|
std::vector<int32_t> projected_column_indices_;
|
||||||
|
|
|
@ -116,6 +116,10 @@ class RenameOp : public PipelineOp {
|
||||||
// @return - Status of the node visit.
|
// @return - Status of the node visit.
|
||||||
Status Accept(NodePass *p, bool *modified) override;
|
Status Accept(NodePass *p, bool *modified) override;
|
||||||
|
|
||||||
|
// Op name getter
|
||||||
|
// @return Name of the current Op
|
||||||
|
std::string Name() const override { return "RenameOp"; }
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
// Rename core functionality
|
// Rename core functionality
|
||||||
Status RenameColumns();
|
Status RenameColumns();
|
||||||
|
|
|
@ -124,9 +124,9 @@ class RepeatOp : public PipelineOp {
|
||||||
// @return - Status of the node visit.
|
// @return - Status of the node visit.
|
||||||
Status Accept(NodePass *p, bool *modified) override;
|
Status Accept(NodePass *p, bool *modified) override;
|
||||||
|
|
||||||
virtual int32_t ConnectorSize() const { return child_[0]->ConnectorSize(); }
|
// Op name getter
|
||||||
|
// @return Name of the current Op
|
||||||
virtual int32_t ConnectorCapacity() const { return child_[0]->ConnectorCapacity(); }
|
std::string Name() const override { return "RepeatOp"; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
int32_t max_repeats_; // The number of repeats that the user requested
|
int32_t max_repeats_; // The number of repeats that the user requested
|
||||||
|
|
|
@ -161,6 +161,10 @@ class ShuffleOp : public PipelineOp {
|
||||||
// @return - Status of the node visit.
|
// @return - Status of the node visit.
|
||||||
Status Accept(NodePass *p, bool *modified) override;
|
Status Accept(NodePass *p, bool *modified) override;
|
||||||
|
|
||||||
|
// Op name getter
|
||||||
|
// @return Name of the current Op
|
||||||
|
std::string Name() const override { return "ShuffleOp"; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// Private function to add a new row to the shuffle buffer.
|
// Private function to add a new row to the shuffle buffer.
|
||||||
// @return Status - The error code return
|
// @return Status - The error code return
|
||||||
|
|
|
@ -80,6 +80,10 @@ class SkipOp : public PipelineOp {
|
||||||
// @return - Status of the node visit.
|
// @return - Status of the node visit.
|
||||||
Status Accept(NodePass *p, bool *modified) override;
|
Status Accept(NodePass *p, bool *modified) override;
|
||||||
|
|
||||||
|
// Op name getter
|
||||||
|
// @return Name of the current Op
|
||||||
|
std::string Name() const override { return "SkipOp"; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
int32_t max_skips_; // The number of skips that the user requested
|
int32_t max_skips_; // The number of skips that the user requested
|
||||||
int32_t skip_count_; // A counter for the current number of executed skips
|
int32_t skip_count_; // A counter for the current number of executed skips
|
||||||
|
|
|
@ -169,6 +169,10 @@ class CelebAOp : public ParallelOp, RandomAccessOp {
|
||||||
// @return Status - The error code return
|
// @return Status - The error code return
|
||||||
Status AddIOBlock(std::unique_ptr<DataBuffer> *data_buffer);
|
Status AddIOBlock(std::unique_ptr<DataBuffer> *data_buffer);
|
||||||
|
|
||||||
|
// Op name getter
|
||||||
|
// @return Name of the current Op
|
||||||
|
std::string Name() const { return "CelebAOp"; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// Called first when function is called
|
// Called first when function is called
|
||||||
// @return
|
// @return
|
||||||
|
|
|
@ -155,6 +155,10 @@ class CifarOp : public ParallelOp, public RandomAccessOp {
|
||||||
// @return
|
// @return
|
||||||
static Status CountTotalRows(const std::string &dir, bool isCIFAR10, int64_t *count);
|
static Status CountTotalRows(const std::string &dir, bool isCIFAR10, int64_t *count);
|
||||||
|
|
||||||
|
// Op name getter
|
||||||
|
// @return Name of the current Op
|
||||||
|
std::string Name() const override { return "CifarOp"; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// Initialize Sampler, calls sampler->Init() within
|
// Initialize Sampler, calls sampler->Init() within
|
||||||
// @return Status - The error code return
|
// @return Status - The error code return
|
||||||
|
|
|
@ -127,6 +127,10 @@ class GeneratorOp : public PipelineOp {
|
||||||
// @return - Status of the node visit.
|
// @return - Status of the node visit.
|
||||||
Status Accept(NodePass *p, bool *modified) override;
|
Status Accept(NodePass *p, bool *modified) override;
|
||||||
|
|
||||||
|
// Op name getter
|
||||||
|
// @return Name of the current Op
|
||||||
|
std::string Name() const override { return "GeneratorOp"; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
py::function generator_function_;
|
py::function generator_function_;
|
||||||
std::vector<std::string> column_names_;
|
std::vector<std::string> column_names_;
|
||||||
|
|
|
@ -210,6 +210,10 @@ class ImageFolderOp : public ParallelOp, public RandomAccessOp {
|
||||||
// @return - Status of the node visit.
|
// @return - Status of the node visit.
|
||||||
Status Accept(NodePass *p, bool *modified) override;
|
Status Accept(NodePass *p, bool *modified) override;
|
||||||
|
|
||||||
|
// Op name getter
|
||||||
|
// @return Name of the current Op
|
||||||
|
std::string Name() const override { return "ImageFolderOp"; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// Initialize Sampler, calls sampler->Init() within
|
// Initialize Sampler, calls sampler->Init() within
|
||||||
// @return Status - The error code return
|
// @return Status - The error code return
|
||||||
|
|
|
@ -172,6 +172,10 @@ class ManifestOp : public ParallelOp, public RandomAccessOp {
|
||||||
static Status GetClassIndexing(const std::string &file, const py::dict &dict, const std::string &usage,
|
static Status GetClassIndexing(const std::string &file, const py::dict &dict, const std::string &usage,
|
||||||
std::map<std::string, int32_t> *output_class_indexing);
|
std::map<std::string, int32_t> *output_class_indexing);
|
||||||
|
|
||||||
|
// Op name getter
|
||||||
|
// @return Name of the current Op
|
||||||
|
std::string Name() const override { return "ManifestOp"; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// Initialize Sampler, calls sampler->Init() within
|
// Initialize Sampler, calls sampler->Init() within
|
||||||
// @return Status - The error code return
|
// @return Status - The error code return
|
||||||
|
|
|
@ -218,6 +218,10 @@ class MindRecordOp : public ParallelOp {
|
||||||
// @return - Status of the node visit.
|
// @return - Status of the node visit.
|
||||||
Status Accept(NodePass *p, bool *modified) override;
|
Status Accept(NodePass *p, bool *modified) override;
|
||||||
|
|
||||||
|
// Op name getter
|
||||||
|
// @return Name of the current Op
|
||||||
|
std::string Name() const override { return "MindRecordOp"; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
Status GetBufferFromReader(std::unique_ptr<DataBuffer> *fetched_buffer, int64_t buffer_id, int32_t worker_id);
|
Status GetBufferFromReader(std::unique_ptr<DataBuffer> *fetched_buffer, int64_t buffer_id, int32_t worker_id);
|
||||||
|
|
||||||
|
|
|
@ -152,6 +152,10 @@ class MnistOp : public ParallelOp, public RandomAccessOp {
|
||||||
// @return
|
// @return
|
||||||
static Status CountTotalRows(const std::string &dir, int64_t *count);
|
static Status CountTotalRows(const std::string &dir, int64_t *count);
|
||||||
|
|
||||||
|
// Op name getter
|
||||||
|
// @return Name of the current Op
|
||||||
|
std::string Name() const override { return "MnistOp"; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// Initialize Sampler, calls sampler->Init() within
|
// Initialize Sampler, calls sampler->Init() within
|
||||||
// @return Status - The error code return
|
// @return Status - The error code return
|
||||||
|
|
|
@ -189,6 +189,10 @@ class RandomDataOp : public ParallelOp {
|
||||||
*/
|
*/
|
||||||
int64_t GetTotalRows() const { return total_rows_; }
|
int64_t GetTotalRows() const { return total_rows_; }
|
||||||
|
|
||||||
|
// Op name getter
|
||||||
|
// @return Name of the current Op
|
||||||
|
std::string Name() const override { return "RandomDataOp"; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
/**
|
/**
|
||||||
* The entry point code for when workers are launched
|
* The entry point code for when workers are launched
|
||||||
|
|
|
@ -169,6 +169,10 @@ class TextFileOp : public ParallelOp {
|
||||||
// @return Status - the error coed returned.
|
// @return Status - the error coed returned.
|
||||||
static Status CountAllFileRows(const std::vector<std::string> &files, int64_t *count);
|
static Status CountAllFileRows(const std::vector<std::string> &files, int64_t *count);
|
||||||
|
|
||||||
|
// Op name getter
|
||||||
|
// @return Name of the current Op
|
||||||
|
std::string Name() const override { return "TextFileOp"; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// The entry point for when workers are launched.
|
// The entry point for when workers are launched.
|
||||||
// @param worker_id - the id of the worker that is executing this function.
|
// @param worker_id - the id of the worker that is executing this function.
|
||||||
|
|
|
@ -228,6 +228,10 @@ class TFReaderOp : public ParallelOp {
|
||||||
// @return - Status of the node visit.
|
// @return - Status of the node visit.
|
||||||
Status Accept(NodePass *p, bool *modified) override;
|
Status Accept(NodePass *p, bool *modified) override;
|
||||||
|
|
||||||
|
// Op name getter
|
||||||
|
// @return Name of the current Op
|
||||||
|
std::string Name() const override { return "TFReaderOp"; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// The entry point for when workers are launched.
|
// The entry point for when workers are launched.
|
||||||
// @param worker_id - the id of the worker that is executing this function.
|
// @param worker_id - the id of the worker that is executing this function.
|
||||||
|
|
|
@ -205,6 +205,10 @@ class VOCOp : public ParallelOp, public RandomAccessOp {
|
||||||
static Status GetClassIndexing(const std::string &dir, const std::string &task_type, const std::string &task_mode,
|
static Status GetClassIndexing(const std::string &dir, const std::string &task_type, const std::string &task_mode,
|
||||||
const py::dict &dict, std::map<std::string, int32_t> *output_class_indexing);
|
const py::dict &dict, std::map<std::string, int32_t> *output_class_indexing);
|
||||||
|
|
||||||
|
// Op name getter
|
||||||
|
// @return Name of the current Op
|
||||||
|
std::string Name() const override { return "VOCOp"; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// Initialize Sampler, calls sampler->Init() within
|
// Initialize Sampler, calls sampler->Init() within
|
||||||
// @return Status - The error code return
|
// @return Status - The error code return
|
||||||
|
|
|
@ -90,6 +90,10 @@ class TakeOp : public PipelineOp {
|
||||||
// @return - Status of the node visit.
|
// @return - Status of the node visit.
|
||||||
Status Accept(NodePass *p, bool *modified) override;
|
Status Accept(NodePass *p, bool *modified) override;
|
||||||
|
|
||||||
|
// Op name getter
|
||||||
|
// @return Name of the current Op
|
||||||
|
std::string Name() const override { return "TakeOp"; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
int32_t max_takes_; // The number of takes that the user requested
|
int32_t max_takes_; // The number of takes that the user requested
|
||||||
int32_t take_count_; // A counter for the current number of executed takes
|
int32_t take_count_; // A counter for the current number of executed takes
|
||||||
|
|
|
@ -110,6 +110,10 @@ class ZipOp : public PipelineOp {
|
||||||
// @return - Status of the node visit.
|
// @return - Status of the node visit.
|
||||||
Status Accept(NodePass *p, bool *modified) override;
|
Status Accept(NodePass *p, bool *modified) override;
|
||||||
|
|
||||||
|
// Op name getter
|
||||||
|
// @return Name of the current Op
|
||||||
|
std::string Name() const override { return "ZipOp"; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// Handles preprocessing of the main loop, used when starting new epoch
|
// Handles preprocessing of the main loop, used when starting new epoch
|
||||||
Status prepare(TensorQTable *const table);
|
Status prepare(TensorQTable *const table);
|
||||||
|
|
|
@ -19,9 +19,8 @@
|
||||||
#include "dataset/engine/datasetops/dataset_op.h"
|
#include "dataset/engine/datasetops/dataset_op.h"
|
||||||
#include "dataset/engine/datasetops/shuffle_op.h"
|
#include "dataset/engine/datasetops/shuffle_op.h"
|
||||||
#include "dataset/util/task_manager.h"
|
#include "dataset/util/task_manager.h"
|
||||||
#include "dataset/util/profiling.h"
|
#include "dataset/engine/perf/profiling.h"
|
||||||
|
#include "dataset/engine/perf/monitor.h"
|
||||||
#include "dataset/engine/opt/util/printer_pass.h"
|
|
||||||
|
|
||||||
namespace mindspore {
|
namespace mindspore {
|
||||||
namespace dataset {
|
namespace dataset {
|
||||||
|
@ -30,6 +29,8 @@ ExecutionTree::ExecutionTree() : id_count_(0) {
|
||||||
tg_ = std::make_unique<TaskGroup>();
|
tg_ = std::make_unique<TaskGroup>();
|
||||||
tree_state_ = kDeTStateInit;
|
tree_state_ = kDeTStateInit;
|
||||||
prepare_flags_ = kDePrepNone;
|
prepare_flags_ = kDePrepNone;
|
||||||
|
perf_monitor_ = std::make_unique<Monitor>(this);
|
||||||
|
profiling_manager_ = std::make_unique<ProfilingManager>(this);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Destructor
|
// Destructor
|
||||||
|
@ -121,6 +122,15 @@ Status ExecutionTree::Launch() {
|
||||||
}
|
}
|
||||||
std::ostringstream ss;
|
std::ostringstream ss;
|
||||||
ss << *this;
|
ss << *this;
|
||||||
|
|
||||||
|
// Profiling infrastructures need to be initialized before Op launching
|
||||||
|
if (profiling_manager_->IsProfilingEnable()) {
|
||||||
|
// Setup profiling manager
|
||||||
|
RETURN_IF_NOT_OK(profiling_manager_->Initialize());
|
||||||
|
// Launch Monitor Thread
|
||||||
|
RETURN_IF_NOT_OK(tg_->CreateAsyncTask("Monitor Thread launched", std::ref(*perf_monitor_)));
|
||||||
|
}
|
||||||
|
|
||||||
MS_LOG(DEBUG) << "Printing the tree before launch tasks:\n" << ss.str();
|
MS_LOG(DEBUG) << "Printing the tree before launch tasks:\n" << ss.str();
|
||||||
for (auto itr = this->begin(); itr != this->end(); ++itr) {
|
for (auto itr = this->begin(); itr != this->end(); ++itr) {
|
||||||
// An inlined operator is one that has an output connector size of 0, and it does not
|
// An inlined operator is one that has an output connector size of 0, and it does not
|
||||||
|
@ -133,7 +143,9 @@ Status ExecutionTree::Launch() {
|
||||||
// Set the state of the Operator as running. This only matters in Leaf ops, CacheOp and TakeOp
|
// Set the state of the Operator as running. This only matters in Leaf ops, CacheOp and TakeOp
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
tree_state_ = kDeTStateExecuting;
|
tree_state_ = kDeTStateExecuting;
|
||||||
|
|
||||||
return Status::OK();
|
return Status::OK();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -23,12 +23,14 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include "dataset/engine/datasetops/dataset_op.h"
|
#include "dataset/engine/datasetops/dataset_op.h"
|
||||||
#include "dataset/util/status.h"
|
#include "dataset/util/status.h"
|
||||||
|
#include "mindspore/ccsrc/dataset/engine/perf/profiling.h"
|
||||||
|
|
||||||
namespace mindspore {
|
namespace mindspore {
|
||||||
namespace dataset {
|
namespace dataset {
|
||||||
// Forward declares
|
// Forward declares
|
||||||
class TaskGroup;
|
class TaskGroup;
|
||||||
class DatasetOp;
|
class DatasetOp;
|
||||||
|
class Monitor;
|
||||||
|
|
||||||
class ExecutionTree {
|
class ExecutionTree {
|
||||||
public:
|
public:
|
||||||
|
@ -44,7 +46,8 @@ class ExecutionTree {
|
||||||
kDeTStateBuilding, // The tree is being built, nodes are being added
|
kDeTStateBuilding, // The tree is being built, nodes are being added
|
||||||
kDeTStatePrepare, // The tree has been assigned a root node and is pending prepare
|
kDeTStatePrepare, // The tree has been assigned a root node and is pending prepare
|
||||||
kDeTStateReady, // The tree has been prepared and is ready to be launched
|
kDeTStateReady, // The tree has been prepared and is ready to be launched
|
||||||
kDeTStateExecuting // The tree has been launched and is executing
|
kDeTStateExecuting, // The tree has been launched and is executing
|
||||||
|
kDeTStateFinished // The tree has been drained, dataset iterator received EOF
|
||||||
};
|
};
|
||||||
|
|
||||||
class Iterator {
|
class Iterator {
|
||||||
|
@ -120,7 +123,7 @@ class ExecutionTree {
|
||||||
// Returns an iterator positioned at the start
|
// Returns an iterator positioned at the start
|
||||||
// @return Iterator - The iterator
|
// @return Iterator - The iterator
|
||||||
ExecutionTree::Iterator begin(const std::shared_ptr<DatasetOp> &root = nullptr) const {
|
ExecutionTree::Iterator begin(const std::shared_ptr<DatasetOp> &root = nullptr) const {
|
||||||
return Iterator((root == nullptr) ? root_ : root);
|
return Iterator(root == nullptr ? root_ : root);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Returns an iterator positioned at the end
|
// Returns an iterator positioned at the end
|
||||||
|
@ -207,6 +210,16 @@ class ExecutionTree {
|
||||||
// @return raw pointer to the TaskGroup
|
// @return raw pointer to the TaskGroup
|
||||||
TaskGroup *AllTasks() const { return tg_.get(); }
|
TaskGroup *AllTasks() const { return tg_.get(); }
|
||||||
|
|
||||||
|
// Return if the ExecutionTree is finished (iterator receives EOF).
|
||||||
|
// @return Bool - true is ExecutionTree is finished
|
||||||
|
bool isFinished() const { return tree_state_ == TreeState::kDeTStateFinished; }
|
||||||
|
|
||||||
|
// Set the ExecutionTree to Finished state.
|
||||||
|
void SetFinished() { tree_state_ = TreeState::kDeTStateFinished; }
|
||||||
|
|
||||||
|
// Getter for profiling manager, no ownership
|
||||||
|
ProfilingManager *GetProfilingManager() { return profiling_manager_.get(); }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// A helper functions for doing the recursive printing
|
// A helper functions for doing the recursive printing
|
||||||
// @param dataset_op - The dataset op to print
|
// @param dataset_op - The dataset op to print
|
||||||
|
@ -222,6 +235,8 @@ class ExecutionTree {
|
||||||
uint32_t prepare_flags_; // Flags used during tree prepare
|
uint32_t prepare_flags_; // Flags used during tree prepare
|
||||||
TreeState tree_state_; // Tracking the current tree state
|
TreeState tree_state_; // Tracking the current tree state
|
||||||
std::stack<std::shared_ptr<DatasetOp>> repeat_stack_; // A stack used during prepare phase
|
std::stack<std::shared_ptr<DatasetOp>> repeat_stack_; // A stack used during prepare phase
|
||||||
|
std::unique_ptr<Monitor> perf_monitor_; // Performance Monitor
|
||||||
|
std::unique_ptr<ProfilingManager> profiling_manager_; // Profiling manager
|
||||||
};
|
};
|
||||||
} // namespace dataset
|
} // namespace dataset
|
||||||
} // namespace mindspore
|
} // namespace mindspore
|
||||||
|
|
|
@ -0,0 +1,6 @@
|
||||||
|
add_library(engine-perf OBJECT
|
||||||
|
profiling.cc
|
||||||
|
monitor.cc
|
||||||
|
device_queue_tracing.cc
|
||||||
|
connector_size.cc
|
||||||
|
dataset_iterator_tracing.cc)
|
|
@ -0,0 +1,89 @@
|
||||||
|
/**
|
||||||
|
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
#include "dataset/engine/perf/connector_size.h"
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
#include <fstream>
|
||||||
|
#include <memory>
|
||||||
|
#include <string>
|
||||||
|
#include "dataset/core/config_manager.h"
|
||||||
|
#include "dataset/engine/execution_tree.h"
|
||||||
|
#include "dataset/util/path.h"
|
||||||
|
|
||||||
|
using json = nlohmann::json;
|
||||||
|
namespace mindspore {
|
||||||
|
namespace dataset {
|
||||||
|
using Qrow = std::vector<int>;
|
||||||
|
|
||||||
|
// Sample action
|
||||||
|
Status ConnectorSize::Sample() {
|
||||||
|
Qrow cur_row;
|
||||||
|
std::transform(tree_->begin(), tree_->end(), std::back_inserter(cur_row),
|
||||||
|
[](DatasetOp &op) { return op.ConnectorSize(); });
|
||||||
|
// Push new row of sample
|
||||||
|
sample_table_.push_back(cur_row);
|
||||||
|
return Status::OK();
|
||||||
|
}
|
||||||
|
|
||||||
|
// JSON serializer helper function
|
||||||
|
json ConnectorSize::ParseOpInfo(const DatasetOp &node, const std::vector<int32_t> &size) {
|
||||||
|
auto children = node.Children();
|
||||||
|
std::vector<int32_t> children_id;
|
||||||
|
std::transform(children.begin(), children.end(), std::back_inserter(children_id),
|
||||||
|
[](std::shared_ptr<DatasetOp> op) -> int32_t { return op->id(); });
|
||||||
|
json json_node;
|
||||||
|
json_node["op_id"] = node.id();
|
||||||
|
json_node["op_type"] = node.Name();
|
||||||
|
json_node["num_workers"] = node.num_workers();
|
||||||
|
json metrics;
|
||||||
|
// DeviceQueueOp is a special op,it is not inlined but its output queue is invalid.
|
||||||
|
// So we should not output its queue size.
|
||||||
|
if (!node.inlined() && node.Name() != "DeviceQueueOp") {
|
||||||
|
metrics["output_queue"] = {{"size", size}, {"length", node.ConnectorCapacity()}};
|
||||||
|
}
|
||||||
|
json_node["metrics"] = metrics;
|
||||||
|
if (!children_id.empty()) {
|
||||||
|
json_node["children"] = children_id;
|
||||||
|
}
|
||||||
|
|
||||||
|
return json_node;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Save profiling data to file
|
||||||
|
Status ConnectorSize::SaveToFile() {
|
||||||
|
std::ofstream os(file_path_, std::ios::trunc);
|
||||||
|
uint32_t idx = 0;
|
||||||
|
json output;
|
||||||
|
std::shared_ptr<ConfigManager> cfg = GlobalContext::config_manager();
|
||||||
|
output["sampling_interval"] = cfg->monitor_sampling_interval();
|
||||||
|
// Traverse the ExecutionTree for JSON node generation
|
||||||
|
for (auto &node : *tree_) {
|
||||||
|
std::vector<int32_t> cur_queue_size;
|
||||||
|
std::transform(sample_table_.begin(), sample_table_.end(), std::back_inserter(cur_queue_size),
|
||||||
|
[&](const ConnectorSizeSample &sample) { return sample[idx]; });
|
||||||
|
json json_node = ParseOpInfo(node, cur_queue_size);
|
||||||
|
output["op_info"].push_back(json_node);
|
||||||
|
idx++;
|
||||||
|
}
|
||||||
|
os << output;
|
||||||
|
return Status::OK();
|
||||||
|
}
|
||||||
|
Status ConnectorSize::Init(const std::string &dir_path, const std::string &device_id) {
|
||||||
|
file_path_ = (Path(dir_path) / Path("pipeline_profiling_" + device_id + ".json")).toString();
|
||||||
|
return Status::OK();
|
||||||
|
}
|
||||||
|
} // namespace dataset
|
||||||
|
} // namespace mindspore
|
|
@ -0,0 +1,68 @@
|
||||||
|
/**
|
||||||
|
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
#ifndef MINDSPORE_QUEUE_DEPTH_H
|
||||||
|
#define MINDSPORE_QUEUE_DEPTH_H
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
#include <nlohmann/json.hpp>
|
||||||
|
#include "dataset/engine/perf/profiling.h"
|
||||||
|
#include "dataset/engine/datasetops/dataset_op.h"
|
||||||
|
|
||||||
|
using json = nlohmann::json;
|
||||||
|
|
||||||
|
namespace mindspore {
|
||||||
|
namespace dataset {
|
||||||
|
class ExecutionTree;
|
||||||
|
|
||||||
|
// Connector size sampling samples the output connector size of each op in the pipeline.
|
||||||
|
// It support JSON serialization for external usage.
|
||||||
|
class ConnectorSize : public Sampling {
|
||||||
|
// Connecto size sampling data is stored as a 2D vector
|
||||||
|
// op_0 ... op_m
|
||||||
|
// sample_0 size_0_0 ... size_m_0
|
||||||
|
// ... ... ... ...
|
||||||
|
// sample_n size_0_m ... size_m_n
|
||||||
|
//
|
||||||
|
// A circular buffer will be implemented in the future to make this table more flexible.
|
||||||
|
using ConnectorSizeSample = std::vector<int>;
|
||||||
|
using ConnectorSizeSampleTable = std::vector<ConnectorSizeSample>;
|
||||||
|
|
||||||
|
public:
|
||||||
|
explicit ConnectorSize(ExecutionTree *tree) : tree_(tree) {}
|
||||||
|
|
||||||
|
// Driver function for connector size sampling.
|
||||||
|
// This function samples the connector size of every nodes within the ExecutionTree
|
||||||
|
Status Sample() override;
|
||||||
|
|
||||||
|
std::string Name() const override { return kDeviceQueueTracingName; };
|
||||||
|
|
||||||
|
// Save sampling data to file
|
||||||
|
// @return Status - The error code return
|
||||||
|
Status SaveToFile() override;
|
||||||
|
|
||||||
|
Status Init(const std::string &dir_path, const std::string &device_id);
|
||||||
|
|
||||||
|
// Parse op infomation and transform to json format
|
||||||
|
json ParseOpInfo(const DatasetOp &node, const std::vector<int32_t> &size);
|
||||||
|
|
||||||
|
private:
|
||||||
|
ExecutionTree *tree_ = nullptr; // ExecutionTree pointer
|
||||||
|
ConnectorSizeSampleTable sample_table_; // Dataset structure to store all samples of connector size sampling
|
||||||
|
};
|
||||||
|
} // namespace dataset
|
||||||
|
} // namespace mindspore
|
||||||
|
#endif // MINDSPORE_QUEUE_DEPTH_H
|
|
@ -0,0 +1,64 @@
|
||||||
|
/**
|
||||||
|
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
#include <fstream>
|
||||||
|
#include <string>
|
||||||
|
#include "dataset/engine/perf/dataset_iterator_tracing.h"
|
||||||
|
#include "dataset/util/path.h"
|
||||||
|
|
||||||
|
namespace mindspore {
|
||||||
|
namespace dataset {
|
||||||
|
|
||||||
|
Status DatasetIteratorTracing::Record(const int32_t type, const int32_t extra_info, const int32_t batch_num,
|
||||||
|
const int32_t value) {
|
||||||
|
// Format: "type extra-info batch-num value"
|
||||||
|
// type: 0: time, 1: connector size
|
||||||
|
// extra-info: if type is 0 - 0: pipeline time, 1: push tdt time, 2: batch time
|
||||||
|
// if type is 1 - connector capacity
|
||||||
|
// batch-num: batch number
|
||||||
|
// value: if type is 0 - value is time(ms)
|
||||||
|
// if type is 1 - value is connector size
|
||||||
|
// Examples:
|
||||||
|
// 0 0 20 10 - The 20th batch took 10ms to get data from pipeline.
|
||||||
|
// 1 64 20 5 - Connector size is 5 when get the 20th batch.Connector capacity is 64.
|
||||||
|
std::string data = std::to_string(type) + " " + std::to_string(extra_info) + " " + std::to_string(batch_num) + " " +
|
||||||
|
std::to_string(value);
|
||||||
|
value_.emplace_back(data);
|
||||||
|
return Status::OK();
|
||||||
|
}
|
||||||
|
|
||||||
|
Status DatasetIteratorTracing::SaveToFile() {
|
||||||
|
if (value_.empty()) {
|
||||||
|
return Status::OK();
|
||||||
|
}
|
||||||
|
|
||||||
|
std::ofstream handle(file_path_, std::ios::trunc);
|
||||||
|
if (!handle.is_open()) {
|
||||||
|
RETURN_STATUS_UNEXPECTED("Profiling file can not be opened.");
|
||||||
|
}
|
||||||
|
for (auto value : value_) {
|
||||||
|
handle << value << "\n";
|
||||||
|
}
|
||||||
|
handle.close();
|
||||||
|
|
||||||
|
return Status::OK();
|
||||||
|
}
|
||||||
|
|
||||||
|
Status DatasetIteratorTracing::Init(const std::string &dir_path, const std::string &device_id) {
|
||||||
|
file_path_ = (Path(dir_path) / Path("dataset_iterator_profiling_" + device_id + ".txt")).toString();
|
||||||
|
return Status::OK();
|
||||||
|
}
|
||||||
|
} // namespace dataset
|
||||||
|
} // namespace mindspore
|
|
@ -0,0 +1,51 @@
|
||||||
|
/**
|
||||||
|
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
#ifndef MINDSPORE_DATASET_ITERATOR_TRACING_H
|
||||||
|
#define MINDSPORE_DATASET_ITERATOR_TRACING_H
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
#include "dataset/engine/perf/profiling.h"
|
||||||
|
|
||||||
|
namespace mindspore {
|
||||||
|
namespace dataset {
|
||||||
|
class DatasetIteratorTracing : public Tracing {
|
||||||
|
public:
|
||||||
|
// Constructor
|
||||||
|
DatasetIteratorTracing() = default;
|
||||||
|
|
||||||
|
// Destructor
|
||||||
|
~DatasetIteratorTracing() = default;
|
||||||
|
|
||||||
|
// Record tracing data
|
||||||
|
// @return Status - The error code return
|
||||||
|
Status Record(const int32_t type, const int32_t extra_info, const int32_t batch_num, const int32_t value);
|
||||||
|
|
||||||
|
std::string Name() const override { return kDatasetIteratorTracingName; };
|
||||||
|
|
||||||
|
// Save tracing data to file
|
||||||
|
// @return Status - The error code return
|
||||||
|
Status SaveToFile() override;
|
||||||
|
|
||||||
|
Status Init(const std::string &dir_path, const std::string &device_id);
|
||||||
|
|
||||||
|
private:
|
||||||
|
std::vector<std::string> value_;
|
||||||
|
};
|
||||||
|
} // namespace dataset
|
||||||
|
} // namespace mindspore
|
||||||
|
|
||||||
|
#endif // MINDSPORE_DATASET_ITERATOR_TRACING_H
|
|
@ -0,0 +1,64 @@
|
||||||
|
/**
|
||||||
|
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <fstream>
|
||||||
|
#include <string>
|
||||||
|
#include "dataset/engine/perf/device_queue_tracing.h"
|
||||||
|
#include "dataset/util/path.h"
|
||||||
|
namespace mindspore {
|
||||||
|
namespace dataset {
|
||||||
|
|
||||||
|
Status DeviceQueueTracing::Record(const int32_t type, const int32_t extra_info, const int32_t batch_num,
|
||||||
|
const int32_t value) {
|
||||||
|
// Format: "type extra-info batch-num value"
|
||||||
|
// type: 0: time, 1: connector size
|
||||||
|
// extra-info: if type is 0 - 0: pipeline time, 1: push tdt time, 2: batch time
|
||||||
|
// if type is 1 - connector capacity
|
||||||
|
// batch-num: batch number
|
||||||
|
// value: if type is 0 - value is time(ms)
|
||||||
|
// if type is 1 - value is connector size
|
||||||
|
// Examples:
|
||||||
|
// 0 0 20 10 - The 20th batch took 10ms to get data from pipeline.
|
||||||
|
// 1 64 20 5 - Connector size is 5 when get the 20th batch.Connector capacity is 64.
|
||||||
|
std::string data = std::to_string(type) + " " + std::to_string(extra_info) + " " + std::to_string(batch_num) + " " +
|
||||||
|
std::to_string(value);
|
||||||
|
value_.emplace_back(data);
|
||||||
|
return Status::OK();
|
||||||
|
}
|
||||||
|
|
||||||
|
Status DeviceQueueTracing::SaveToFile() {
|
||||||
|
if (value_.empty()) {
|
||||||
|
return Status::OK();
|
||||||
|
}
|
||||||
|
|
||||||
|
std::ofstream handle(file_path_, std::ios::trunc);
|
||||||
|
if (!handle.is_open()) {
|
||||||
|
RETURN_STATUS_UNEXPECTED("Profiling file can not be opened.");
|
||||||
|
}
|
||||||
|
for (auto value : value_) {
|
||||||
|
handle << value << "\n";
|
||||||
|
}
|
||||||
|
handle.close();
|
||||||
|
|
||||||
|
return Status::OK();
|
||||||
|
}
|
||||||
|
|
||||||
|
Status DeviceQueueTracing::Init(const std::string &dir_path, const std::string &device_id) {
|
||||||
|
file_path_ = (Path(dir_path) / Path("critical_point_profiling_" + device_id + ".txt")).toString();
|
||||||
|
return Status::OK();
|
||||||
|
}
|
||||||
|
} // namespace dataset
|
||||||
|
} // namespace mindspore
|
|
@ -0,0 +1,52 @@
|
||||||
|
/**
|
||||||
|
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef MINDSPORE_DEVICE_QUEUE_TRACING_H
|
||||||
|
#define MINDSPORE_DEVICE_QUEUE_TRACING_H
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
#include "dataset/engine/perf/profiling.h"
|
||||||
|
|
||||||
|
namespace mindspore {
|
||||||
|
namespace dataset {
|
||||||
|
class DeviceQueueTracing : public Tracing {
|
||||||
|
public:
|
||||||
|
// Constructor
|
||||||
|
DeviceQueueTracing() = default;
|
||||||
|
|
||||||
|
// Destructor
|
||||||
|
~DeviceQueueTracing() = default;
|
||||||
|
|
||||||
|
// Record tracing data
|
||||||
|
// @return Status - The error code return
|
||||||
|
Status Record(const int32_t type, const int32_t extra_info, const int32_t batch_num, const int32_t value);
|
||||||
|
|
||||||
|
std::string Name() const override { return "Device Queue Tracing"; };
|
||||||
|
|
||||||
|
// Save tracing data to file
|
||||||
|
// @return Status - The error code return
|
||||||
|
Status SaveToFile() override;
|
||||||
|
|
||||||
|
Status Init(const std::string &dir_path, const std::string &device_id);
|
||||||
|
|
||||||
|
private:
|
||||||
|
std::vector<std::string> value_;
|
||||||
|
};
|
||||||
|
} // namespace dataset
|
||||||
|
} // namespace mindspore
|
||||||
|
|
||||||
|
#endif // MINDSPORE_DEVICE_QUEUE_TRACING_H
|
|
@ -0,0 +1,50 @@
|
||||||
|
/**
|
||||||
|
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <vector>
|
||||||
|
#include "dataset/core/config_manager.h"
|
||||||
|
#include "dataset/engine/perf/monitor.h"
|
||||||
|
#include "dataset/engine/execution_tree.h"
|
||||||
|
|
||||||
|
namespace mindspore {
|
||||||
|
namespace dataset {
|
||||||
|
|
||||||
|
Monitor::Monitor(ExecutionTree *tree) : tree_(tree) {
|
||||||
|
std::shared_ptr<ConfigManager> cfg = GlobalContext::config_manager();
|
||||||
|
sampling_interval_ = cfg->monitor_sampling_interval();
|
||||||
|
}
|
||||||
|
|
||||||
|
Status Monitor::operator()() {
|
||||||
|
// Register this thread with TaskManager to receive proper interrupt signal.
|
||||||
|
TaskManager::FindMe()->Post();
|
||||||
|
|
||||||
|
// Keep sampling if
|
||||||
|
// 1) Monitor Task is not interrupted by TaskManager AND
|
||||||
|
// 2) Iterator has not received EOF
|
||||||
|
while (!this_thread::is_interrupted() && !(tree_->isFinished())) {
|
||||||
|
for (auto &node : tree_->GetProfilingManager()->GetSamplingNodes()) {
|
||||||
|
RETURN_IF_NOT_OK(node.second->Sample());
|
||||||
|
std::this_thread::sleep_for(std::chrono::milliseconds(sampling_interval_));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Output all profiling data upon request.
|
||||||
|
tree_->GetProfilingManager()->SaveProfilingData();
|
||||||
|
return Status::OK();
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace dataset
|
||||||
|
} // namespace mindspore
|
|
@ -0,0 +1,52 @@
|
||||||
|
/**
|
||||||
|
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef MINDSPORE_MONITOR_H
|
||||||
|
#define MINDSPORE_MONITOR_H
|
||||||
|
|
||||||
|
#include <memory>
|
||||||
|
#include <unordered_map>
|
||||||
|
#include <vector>
|
||||||
|
#include "dataset/util/status.h"
|
||||||
|
#include "dataset/engine/perf/profiling.h"
|
||||||
|
|
||||||
|
namespace mindspore {
|
||||||
|
namespace dataset {
|
||||||
|
class ExecutionTree;
|
||||||
|
class Monitor {
|
||||||
|
public:
|
||||||
|
// Monitor object constructor
|
||||||
|
explicit Monitor(ExecutionTree *tree);
|
||||||
|
|
||||||
|
Monitor() = default;
|
||||||
|
|
||||||
|
// Functor for Perf Monitor main loop.
|
||||||
|
// This function will be the entry point of Mindspore::Dataset::Task
|
||||||
|
Status operator()();
|
||||||
|
|
||||||
|
int64_t GetSamplingInterval() { return sampling_interval_; }
|
||||||
|
|
||||||
|
private:
|
||||||
|
int64_t cur_row_;
|
||||||
|
int64_t max_samples_;
|
||||||
|
int64_t sampling_interval_;
|
||||||
|
ExecutionTree *tree_;
|
||||||
|
std::vector<std::shared_ptr<Sampling>> sampling_list_;
|
||||||
|
};
|
||||||
|
} // namespace dataset
|
||||||
|
} // namespace mindspore
|
||||||
|
|
||||||
|
#endif // MINDSPORE_MONITOR_H
|
|
@ -0,0 +1,153 @@
|
||||||
|
/**
|
||||||
|
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
#include "dataset/engine/perf/profiling.h"
|
||||||
|
|
||||||
|
#include <sys/time.h>
|
||||||
|
#include <cstdlib>
|
||||||
|
#include <fstream>
|
||||||
|
#include "common/utils.h"
|
||||||
|
#include "dataset/util/path.h"
|
||||||
|
#include "dataset/engine/perf/monitor.h"
|
||||||
|
#include "dataset/engine/perf/device_queue_tracing.h"
|
||||||
|
#include "dataset/engine/perf/connector_size.h"
|
||||||
|
#include "dataset/engine/perf/dataset_iterator_tracing.h"
|
||||||
|
#include "utils/log_adapter.h"
|
||||||
|
|
||||||
|
namespace mindspore {
|
||||||
|
namespace dataset {
|
||||||
|
|
||||||
|
bool ProfilingManager::IsProfilingEnable() const {
|
||||||
|
auto profiling = common::GetEnv("PROFILING_MODE");
|
||||||
|
if (profiling.empty() || profiling != "true") {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
Status ProfilingManager::Initialize() {
|
||||||
|
// Register nodes based on config
|
||||||
|
std::string dir = common::GetEnv("MINDDATA_PROFILING_DIR");
|
||||||
|
if (dir.empty()) {
|
||||||
|
RETURN_STATUS_UNEXPECTED("Profiling dir is not set.");
|
||||||
|
}
|
||||||
|
char real_path[PATH_MAX] = {0};
|
||||||
|
if (dir.size() >= PATH_MAX) {
|
||||||
|
RETURN_STATUS_UNEXPECTED("Profiling dir is invalid.");
|
||||||
|
}
|
||||||
|
#if defined(_WIN32) || defined(_WIN64)
|
||||||
|
if (_fullpath(real_path, common::SafeCStr(dir), PATH_MAX) == nullptr) {
|
||||||
|
RETURN_STATUS_UNEXPECTED("Profiling dir is invalid.");
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
if (realpath(common::SafeCStr(dir), real_path) == nullptr) {
|
||||||
|
RETURN_STATUS_UNEXPECTED("Profiling dir is invalid.");
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
dir_path_ = real_path;
|
||||||
|
|
||||||
|
// If DEVICE_ID is not set,defult value is 0
|
||||||
|
device_id_ = common::GetEnv("DEVICE_ID");
|
||||||
|
if (device_id_.empty()) {
|
||||||
|
device_id_ = "0";
|
||||||
|
}
|
||||||
|
|
||||||
|
// Register all profiling node.
|
||||||
|
// device_queue node is used for graph mode
|
||||||
|
std::shared_ptr<Tracing> device_queue_tracing = std::make_shared<DeviceQueueTracing>();
|
||||||
|
RETURN_IF_NOT_OK(RegisterTracingNode(device_queue_tracing));
|
||||||
|
// dataset_iterator node is used for graph mode
|
||||||
|
std::shared_ptr<Tracing> dataset_iterator_tracing = std::make_shared<DatasetIteratorTracing>();
|
||||||
|
RETURN_IF_NOT_OK(RegisterTracingNode(dataset_iterator_tracing));
|
||||||
|
|
||||||
|
std::shared_ptr<Sampling> monitor_sampling = std::make_shared<ConnectorSize>(tree_);
|
||||||
|
RETURN_IF_NOT_OK(RegisterSamplingNode(monitor_sampling));
|
||||||
|
|
||||||
|
return Status::OK();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Profiling node registration
|
||||||
|
Status ProfilingManager::RegisterTracingNode(std::shared_ptr<Tracing> node) {
|
||||||
|
// Check if node with the same name has already been registered.
|
||||||
|
auto exist = tracing_nodes_.find(node->Name());
|
||||||
|
if (exist != tracing_nodes_.end()) {
|
||||||
|
return Status(StatusCode::kProfilingError, "Profiling node already exist: " + node->Name());
|
||||||
|
}
|
||||||
|
// Register the node with its name as key.
|
||||||
|
RETURN_IF_NOT_OK(node->Init(dir_path_, device_id_));
|
||||||
|
tracing_nodes_[node->Name()] = node;
|
||||||
|
return Status::OK();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Profiling node getter
|
||||||
|
Status ProfilingManager::GetTracingNode(const std::string &name, std::shared_ptr<Tracing> *node) {
|
||||||
|
// Check if node with the same name has already been registered.
|
||||||
|
auto exist = tracing_nodes_.find(name);
|
||||||
|
if (exist == tracing_nodes_.end()) {
|
||||||
|
return Status(StatusCode::kProfilingError, "Profiling node does not exist: " + name);
|
||||||
|
}
|
||||||
|
// Fetch node.
|
||||||
|
*node = tracing_nodes_[name];
|
||||||
|
return Status::OK();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Profiling node registration
|
||||||
|
Status ProfilingManager::RegisterSamplingNode(std::shared_ptr<Sampling> node) {
|
||||||
|
// Check if node with the same name has already been registered.
|
||||||
|
auto exist = sampling_nodes_.find(node->Name());
|
||||||
|
if (exist != sampling_nodes_.end()) {
|
||||||
|
return Status(StatusCode::kProfilingError, "Profiling node already exist: " + node->Name());
|
||||||
|
}
|
||||||
|
// Register the node with its name as key.
|
||||||
|
RETURN_IF_NOT_OK(node->Init(dir_path_, device_id_));
|
||||||
|
sampling_nodes_[node->Name()] = node;
|
||||||
|
return Status::OK();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Profiling node getter
|
||||||
|
Status ProfilingManager::GetSamplingNode(const std::string &name, std::shared_ptr<Sampling> *node) {
|
||||||
|
// Check if node with the same name has already been registered.
|
||||||
|
auto exist = sampling_nodes_.find(name);
|
||||||
|
if (exist == sampling_nodes_.end()) {
|
||||||
|
return Status(StatusCode::kProfilingError, "Profiling node does not exist: " + name);
|
||||||
|
}
|
||||||
|
// Fetch node.
|
||||||
|
*node = sampling_nodes_[name];
|
||||||
|
return Status::OK();
|
||||||
|
}
|
||||||
|
|
||||||
|
Status ProfilingManager::SaveProfilingData() {
|
||||||
|
if (!IsProfilingEnable()) {
|
||||||
|
return Status::OK();
|
||||||
|
}
|
||||||
|
MS_LOG(INFO) << "Start to save profiling data.";
|
||||||
|
for (auto node : tracing_nodes_) {
|
||||||
|
RETURN_IF_NOT_OK(node.second->SaveToFile());
|
||||||
|
}
|
||||||
|
for (auto node : sampling_nodes_) {
|
||||||
|
RETURN_IF_NOT_OK(node.second->SaveToFile());
|
||||||
|
}
|
||||||
|
MS_LOG(INFO) << "Save profiling data end.";
|
||||||
|
|
||||||
|
return Status::OK();
|
||||||
|
}
|
||||||
|
|
||||||
|
double ProfilingTime::GetCurMilliSecond() {
|
||||||
|
struct timeval tv = {0, 0};
|
||||||
|
(void)gettimeofday(&tv, nullptr);
|
||||||
|
return tv.tv_sec * 1000 + tv.tv_usec / 1000;
|
||||||
|
}
|
||||||
|
} // namespace dataset
|
||||||
|
} // namespace mindspore
|
|
@ -0,0 +1,140 @@
|
||||||
|
/**
|
||||||
|
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
#ifndef DATASET_UTIL_PROFILE_H_
|
||||||
|
#define DATASET_UTIL_PROFILE_H_
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
#include <unordered_map>
|
||||||
|
#include <memory>
|
||||||
|
#include "dataset/util/status.h"
|
||||||
|
|
||||||
|
namespace mindspore {
|
||||||
|
namespace dataset {
|
||||||
|
|
||||||
|
class Monitor;
|
||||||
|
class ExecutionTree;
|
||||||
|
|
||||||
|
const char kDeviceQueueTracingName[] = "Device Queue Tracing";
|
||||||
|
const char kDatasetIteratorTracingName[] = "Dataset Iterator Tracing";
|
||||||
|
const char kConnectorSizeSamplingName[] = "Connector Size Sampling";
|
||||||
|
|
||||||
|
// Profiling is a class of basic unit of profiling action
|
||||||
|
// This base class encapsulate the serialization output logic
|
||||||
|
class Profiling : std::enable_shared_from_this<Profiling> {
|
||||||
|
public:
|
||||||
|
// Constructor
|
||||||
|
Profiling() = default;
|
||||||
|
|
||||||
|
// Destructor
|
||||||
|
virtual ~Profiling() = default;
|
||||||
|
|
||||||
|
virtual Status Init(const std::string &dir_path, const std::string &device_id) = 0;
|
||||||
|
|
||||||
|
// Default serialization file generator
|
||||||
|
virtual Status SaveToFile() = 0;
|
||||||
|
|
||||||
|
// Profiling name
|
||||||
|
virtual std::string Name() const = 0;
|
||||||
|
|
||||||
|
protected:
|
||||||
|
std::string file_path_;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Sampling is a class of profiling which generate samples periodically.
|
||||||
|
class Sampling : public Profiling {
|
||||||
|
public:
|
||||||
|
// Sampling action function. This function will be invoked by performance monitor thread.
|
||||||
|
virtual Status Sample() = 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Tracing is class of profiling which record samples upon request.
|
||||||
|
class Tracing : public Profiling {
|
||||||
|
// Tracing does not define a fixed interface to provide flexible on data recording.
|
||||||
|
};
|
||||||
|
|
||||||
|
// ProfilingManager is a class manages all profiling infrastructure
|
||||||
|
// It serves the following purposes:
|
||||||
|
// 1) Fetch profiling configs from global contexts
|
||||||
|
// 2) Setup all profiling node based on config
|
||||||
|
// 3) Provide access of profiling nodes for profiling actions
|
||||||
|
// 4) Manage profiling data serialization process
|
||||||
|
class ProfilingManager {
|
||||||
|
public:
|
||||||
|
explicit ProfilingManager(ExecutionTree *tree) : tree_(tree) {}
|
||||||
|
|
||||||
|
~ProfilingManager() = default;
|
||||||
|
|
||||||
|
Status Initialize();
|
||||||
|
|
||||||
|
// Save profile data to file
|
||||||
|
// @return Status - The error code return
|
||||||
|
Status SaveProfilingData();
|
||||||
|
|
||||||
|
// Sampling node getter
|
||||||
|
// @param name - The name of the requested node
|
||||||
|
// @param node - Pointer to the shared pointer for the Sampling node
|
||||||
|
// @return Status - The error code return
|
||||||
|
Status GetSamplingNode(const std::string &name, std::shared_ptr<Sampling> *node);
|
||||||
|
|
||||||
|
// Tracing node getter
|
||||||
|
// @param name - The name of the requested node
|
||||||
|
// @param node - Pointer to the shared pointer for the Tracing node
|
||||||
|
// @return Status - The error code return
|
||||||
|
Status GetTracingNode(const std::string &name, std::shared_ptr<Tracing> *node);
|
||||||
|
|
||||||
|
// If profiling is enabled.
|
||||||
|
bool IsProfilingEnable() const;
|
||||||
|
|
||||||
|
std::unordered_map<std::string, std::shared_ptr<Sampling>> &GetSamplingNodes() { return sampling_nodes_; }
|
||||||
|
|
||||||
|
private:
|
||||||
|
std::unordered_map<std::string, std::shared_ptr<Tracing>> tracing_nodes_;
|
||||||
|
|
||||||
|
std::unordered_map<std::string, std::shared_ptr<Sampling>> sampling_nodes_;
|
||||||
|
|
||||||
|
// Register profile node to tree
|
||||||
|
// @param node - Profiling node
|
||||||
|
// @return Status - The error code return
|
||||||
|
Status RegisterTracingNode(std::shared_ptr<Tracing> node);
|
||||||
|
|
||||||
|
// Register profile node to tree
|
||||||
|
// @param node - Profiling node
|
||||||
|
// @return Status - The error code return
|
||||||
|
Status RegisterSamplingNode(std::shared_ptr<Sampling> node);
|
||||||
|
|
||||||
|
ExecutionTree *tree_ = nullptr; // ExecutionTree pointer
|
||||||
|
std::string dir_path_; // where to create profiling file
|
||||||
|
std::string device_id_; // used when create profiling file,filename_deviceid.suffix
|
||||||
|
};
|
||||||
|
|
||||||
|
enum ProfilingType { TIME, CONNECTOR_DEPTH };
|
||||||
|
|
||||||
|
enum ProfilingTimeSubType {
|
||||||
|
PIPELINE_TIME,
|
||||||
|
TDT_PUSH_TIME,
|
||||||
|
BATCH_TIME,
|
||||||
|
INVALID_TIME,
|
||||||
|
};
|
||||||
|
|
||||||
|
class ProfilingTime {
|
||||||
|
public:
|
||||||
|
static double GetCurMilliSecond();
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace dataset
|
||||||
|
} // namespace mindspore
|
||||||
|
#endif
|
|
@ -16,6 +16,7 @@
|
||||||
#include "dataset/engine/tdt/tdt_plugin.h"
|
#include "dataset/engine/tdt/tdt_plugin.h"
|
||||||
#include "common/utils.h"
|
#include "common/utils.h"
|
||||||
#include "utils/log_adapter.h"
|
#include "utils/log_adapter.h"
|
||||||
|
#include "dataset/engine/perf/profiling.h"
|
||||||
|
|
||||||
namespace mindspore {
|
namespace mindspore {
|
||||||
namespace dataset {
|
namespace dataset {
|
||||||
|
@ -28,18 +29,26 @@ std::shared_ptr<TdtPlugin> TdtPlugin::GetInstance() {
|
||||||
return instance_ptr_;
|
return instance_ptr_;
|
||||||
}
|
}
|
||||||
|
|
||||||
TdtStatus TdtPlugin::hostPush(TensorRow ts_row, bool is_wait, std::string channel_name) {
|
TdtStatus TdtPlugin::hostPush(TensorRow ts_row, bool is_wait, std::string channel_name, bool profiling, int32_t &time) {
|
||||||
MS_LOG(INFO) << "TDT channel name is " << channel_name << ".";
|
MS_LOG(INFO) << "TDT channel name is " << channel_name << ".";
|
||||||
std::vector<DataItem> items;
|
std::vector<DataItem> items;
|
||||||
|
double start_time;
|
||||||
auto ret = translate(ts_row, items);
|
auto ret = translate(ts_row, items);
|
||||||
if (ret != SUCCESS) {
|
if (ret != SUCCESS) {
|
||||||
MS_LOG(ERROR) << "TDT converting tensor failed!";
|
MS_LOG(ERROR) << "TDT converting tensor failed!";
|
||||||
return FAILED;
|
return FAILED;
|
||||||
}
|
}
|
||||||
|
if (profiling) {
|
||||||
|
start_time = ProfilingTime::GetCurMilliSecond();
|
||||||
|
}
|
||||||
if (tdt::TdtHostPushData(channel_name, items) != 0) {
|
if (tdt::TdtHostPushData(channel_name, items) != 0) {
|
||||||
MS_LOG(ERROR) << "TDT pushing data failed!";
|
MS_LOG(ERROR) << "TDT pushing data failed!";
|
||||||
return FAILED;
|
return FAILED;
|
||||||
}
|
}
|
||||||
|
if (profiling) {
|
||||||
|
double end_time = ProfilingTime::GetCurMilliSecond();
|
||||||
|
time = (int32_t)(end_time - start_time);
|
||||||
|
}
|
||||||
return SUCCESS;
|
return SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -37,7 +37,7 @@ class TdtPlugin {
|
||||||
public:
|
public:
|
||||||
static std::shared_ptr<TdtPlugin> GetInstance();
|
static std::shared_ptr<TdtPlugin> GetInstance();
|
||||||
|
|
||||||
TdtStatus hostPush(TensorRow ts_row, bool is_wait, std::string channel_name);
|
TdtStatus hostPush(TensorRow ts_row, bool is_wait, std::string channel_name, bool profilig, int32_t &time);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
TdtPlugin() {}
|
TdtPlugin() {}
|
||||||
|
|
|
@ -14,5 +14,4 @@ add_library(utils OBJECT
|
||||||
status.cc
|
status.cc
|
||||||
path.cc
|
path.cc
|
||||||
wait_post.cc
|
wait_post.cc
|
||||||
sig_handler.cc
|
sig_handler.cc)
|
||||||
profiling.cc)
|
|
||||||
|
|
|
@ -1,112 +0,0 @@
|
||||||
/**
|
|
||||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
|
||||||
*
|
|
||||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
* you may not use this file except in compliance with the License.
|
|
||||||
* You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
#include "dataset/util/profiling.h"
|
|
||||||
|
|
||||||
#include <sys/time.h>
|
|
||||||
#include <cstdlib>
|
|
||||||
#include <fstream>
|
|
||||||
#include "dataset/util/path.h"
|
|
||||||
#include "common/utils.h"
|
|
||||||
#include "utils/log_adapter.h"
|
|
||||||
|
|
||||||
namespace mindspore {
|
|
||||||
namespace dataset {
|
|
||||||
Profiling::Profiling(const std::string &file_name, const int32_t device_id)
|
|
||||||
: file_name_(file_name), device_id_(device_id) {}
|
|
||||||
|
|
||||||
Status Profiling::Init() {
|
|
||||||
std::string dir = common::GetEnv("MINDDATA_PROFILING_DIR");
|
|
||||||
if (dir.empty()) {
|
|
||||||
RETURN_STATUS_UNEXPECTED("Profiling dir is not set.");
|
|
||||||
}
|
|
||||||
char real_path[PATH_MAX] = {0};
|
|
||||||
if (dir.size() >= PATH_MAX) {
|
|
||||||
RETURN_STATUS_UNEXPECTED("Profiling dir is invalid.");
|
|
||||||
}
|
|
||||||
#if defined(_WIN32) || defined(_WIN64)
|
|
||||||
if (_fullpath(real_path, common::SafeCStr(dir), PATH_MAX) == nullptr) {
|
|
||||||
RETURN_STATUS_UNEXPECTED("Profiling dir is invalid.");
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
if (realpath(common::SafeCStr(dir), real_path) == nullptr) {
|
|
||||||
RETURN_STATUS_UNEXPECTED("Profiling dir is invalid.");
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
file_path_ = (Path(real_path) / Path(file_name_ + "_" + std::to_string(device_id_) + ".txt")).toString();
|
|
||||||
return Status::OK();
|
|
||||||
}
|
|
||||||
|
|
||||||
Status Profiling::Record(const std::string &data) {
|
|
||||||
value_.emplace_back(data);
|
|
||||||
return Status::OK();
|
|
||||||
}
|
|
||||||
|
|
||||||
Status Profiling::SaveToFile() {
|
|
||||||
if (file_name_.empty()) {
|
|
||||||
RETURN_STATUS_UNEXPECTED("Profiling file name has not been set.");
|
|
||||||
}
|
|
||||||
std::ofstream handle(file_path_, std::ios::app);
|
|
||||||
if (!handle.is_open()) {
|
|
||||||
RETURN_STATUS_UNEXPECTED("Profiling file can not be opened.");
|
|
||||||
}
|
|
||||||
for (auto value : value_) {
|
|
||||||
handle << value << "\n";
|
|
||||||
}
|
|
||||||
handle.close();
|
|
||||||
|
|
||||||
return Status::OK();
|
|
||||||
}
|
|
||||||
|
|
||||||
ProfilingManager &ProfilingManager::GetInstance() {
|
|
||||||
static ProfilingManager instance;
|
|
||||||
return instance;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool ProfilingManager::IsProfilingEnable() const {
|
|
||||||
auto profiling = common::GetEnv("PROFILING_MODE");
|
|
||||||
if (profiling.empty() || profiling != "true") {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
Status ProfilingManager::RegisterProfilingNode(std::shared_ptr<Profiling> *node) {
|
|
||||||
RETURN_IF_NOT_OK((*node)->Init());
|
|
||||||
profiling_node_.emplace_back(*node);
|
|
||||||
return Status::OK();
|
|
||||||
}
|
|
||||||
|
|
||||||
Status ProfilingManager::SaveProfilingData() {
|
|
||||||
if (!IsProfilingEnable()) {
|
|
||||||
return Status::OK();
|
|
||||||
}
|
|
||||||
MS_LOG(INFO) << "Start to save profile data.";
|
|
||||||
for (auto node : profiling_node_) {
|
|
||||||
RETURN_IF_NOT_OK(node->SaveToFile());
|
|
||||||
}
|
|
||||||
MS_LOG(INFO) << "Save profile data end.";
|
|
||||||
|
|
||||||
return Status::OK();
|
|
||||||
}
|
|
||||||
|
|
||||||
double ProfilingTime::GetCurMilliSecond() {
|
|
||||||
struct timeval tv = {0, 0};
|
|
||||||
(void)gettimeofday(&tv, nullptr);
|
|
||||||
return tv.tv_sec * 1000 + tv.tv_usec / 1000;
|
|
||||||
}
|
|
||||||
} // namespace dataset
|
|
||||||
} // namespace mindspore
|
|
|
@ -1,92 +0,0 @@
|
||||||
/**
|
|
||||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
|
||||||
*
|
|
||||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
* you may not use this file except in compliance with the License.
|
|
||||||
* You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
#ifndef DATASET_UTIL_PROFILE_H_
|
|
||||||
#define DATASET_UTIL_PROFILE_H_
|
|
||||||
|
|
||||||
#include <string>
|
|
||||||
#include <vector>
|
|
||||||
#include <memory>
|
|
||||||
#include "dataset/util/status.h"
|
|
||||||
|
|
||||||
namespace mindspore {
|
|
||||||
namespace dataset {
|
|
||||||
enum ProfilingType {
|
|
||||||
TIME,
|
|
||||||
CONNECTOR_DEPTH,
|
|
||||||
};
|
|
||||||
|
|
||||||
enum ProfilingTimeSubType {
|
|
||||||
PIPELINE_TIME,
|
|
||||||
TDT_PUSH_TIME,
|
|
||||||
BATCH_TIME,
|
|
||||||
INVALID_TIME,
|
|
||||||
};
|
|
||||||
|
|
||||||
class Profiling {
|
|
||||||
public:
|
|
||||||
// Constructor
|
|
||||||
Profiling() = default;
|
|
||||||
|
|
||||||
// Constructor if need save profile data to file
|
|
||||||
Profiling(const std::string &file_name, const int32_t device_id);
|
|
||||||
|
|
||||||
// Destructor
|
|
||||||
~Profiling() = default;
|
|
||||||
|
|
||||||
Status Init();
|
|
||||||
|
|
||||||
// Record profile data
|
|
||||||
Status Record(const std::string &data);
|
|
||||||
|
|
||||||
// Save profile data to file if necessary
|
|
||||||
Status SaveToFile();
|
|
||||||
|
|
||||||
private:
|
|
||||||
std::vector<std::string> value_;
|
|
||||||
std::string file_name_;
|
|
||||||
std::string file_path_;
|
|
||||||
int32_t device_id_;
|
|
||||||
};
|
|
||||||
|
|
||||||
class ProfilingManager {
|
|
||||||
public:
|
|
||||||
ProfilingManager() = default;
|
|
||||||
~ProfilingManager() = default;
|
|
||||||
|
|
||||||
static ProfilingManager &GetInstance();
|
|
||||||
|
|
||||||
// Save profile data to file
|
|
||||||
// @return Status - The error code return
|
|
||||||
Status SaveProfilingData();
|
|
||||||
|
|
||||||
// Register profile node to tree
|
|
||||||
// @param node - Profiling node
|
|
||||||
// @return Status - The error code return
|
|
||||||
Status RegisterProfilingNode(std::shared_ptr<Profiling> *node);
|
|
||||||
|
|
||||||
bool IsProfilingEnable() const;
|
|
||||||
|
|
||||||
private:
|
|
||||||
std::vector<std::shared_ptr<Profiling>> profiling_node_;
|
|
||||||
};
|
|
||||||
|
|
||||||
class ProfilingTime {
|
|
||||||
public:
|
|
||||||
static double GetCurMilliSecond();
|
|
||||||
};
|
|
||||||
} // namespace dataset
|
|
||||||
} // namespace mindspore
|
|
||||||
#endif
|
|
|
@ -45,6 +45,9 @@ std::string CodeAsString(const StatusCode c) {
|
||||||
case StatusCode::kDuplicateKey:
|
case StatusCode::kDuplicateKey:
|
||||||
s = "Duplicate key";
|
s = "Duplicate key";
|
||||||
break;
|
break;
|
||||||
|
case StatusCode::kProfilingError:
|
||||||
|
s = "Error encountered while profiling";
|
||||||
|
break;
|
||||||
case StatusCode::kUnexpectedError:
|
case StatusCode::kUnexpectedError:
|
||||||
default:
|
default:
|
||||||
s = "Unexpected error";
|
s = "Unexpected error";
|
||||||
|
|
|
@ -70,6 +70,7 @@ enum class StatusCode : char {
|
||||||
kPythonInterpreterFailure = 7,
|
kPythonInterpreterFailure = 7,
|
||||||
kTDTPushFailure = 8,
|
kTDTPushFailure = 8,
|
||||||
kFileNotExist = 9,
|
kFileNotExist = 9,
|
||||||
|
kProfilingError = 10,
|
||||||
// Make this error code the last one. Add new error code above it.
|
// Make this error code the last one. Add new error code above it.
|
||||||
kUnexpectedError = 127
|
kUnexpectedError = 127
|
||||||
};
|
};
|
||||||
|
|
|
@ -125,6 +125,35 @@ class ConfigurationManager:
|
||||||
"""
|
"""
|
||||||
return self.config.get_num_parallel_workers()
|
return self.config.get_num_parallel_workers()
|
||||||
|
|
||||||
|
def set_monitor_sampling_interval(self, interval):
|
||||||
|
"""
|
||||||
|
Set the default interval(ms) of monitor sampling.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
interval: interval(ms) to be used to performance monitor sampling.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
ValueError: If interval is invalid (<= 0 or > MAX_INT_32).
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
>>> import mindspore.dataset as ds
|
||||||
|
>>> con = ds.engine.ConfigurationManager()
|
||||||
|
>>> # sets the new interval value.
|
||||||
|
>>> con.set_monitor_sampling_interval(100)
|
||||||
|
"""
|
||||||
|
if interval <= 0 or interval > INT32_MAX:
|
||||||
|
raise ValueError("Interval given is not within the required range")
|
||||||
|
self.config.set_monitor_sampling_interval(interval)
|
||||||
|
|
||||||
|
def get_monitor_sampling_interval(self):
|
||||||
|
"""
|
||||||
|
Get the default interval of performance monitor sampling.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Interval: interval(ms) of performance monitor sampling.
|
||||||
|
"""
|
||||||
|
return self.config.get_monitor_sampling_interval()
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
"""
|
"""
|
||||||
String representation of the configurations.
|
String representation of the configurations.
|
||||||
|
|
|
@ -0,0 +1,119 @@
|
||||||
|
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
# ==============================================================================
|
||||||
|
"""
|
||||||
|
Testing profiling support in DE
|
||||||
|
"""
|
||||||
|
import os
|
||||||
|
import numpy as np
|
||||||
|
import mindspore.dataset as ds
|
||||||
|
|
||||||
|
FILES = ["../data/dataset/testTFTestAllTypes/test.data"]
|
||||||
|
DATASET_ROOT = "../data/dataset/testTFTestAllTypes/"
|
||||||
|
SCHEMA_FILE = "../data/dataset/testTFTestAllTypes/datasetSchema.json"
|
||||||
|
|
||||||
|
PIPELINE_FILE = "./pipeline_profiling_1.json"
|
||||||
|
DATASET_ITERATOR_FILE = "./dataset_iterator_profiling_1.txt"
|
||||||
|
|
||||||
|
|
||||||
|
def test_profiling_simple_pipeline():
|
||||||
|
"""
|
||||||
|
Generator -> Shuffle -> Batch
|
||||||
|
"""
|
||||||
|
os.environ['PROFILING_MODE'] = 'true'
|
||||||
|
os.environ['MINDDATA_PROFILING_DIR'] = '.'
|
||||||
|
os.environ['DEVICE_ID'] = '1'
|
||||||
|
|
||||||
|
source = [(np.array([x]),) for x in range(1024)]
|
||||||
|
data1 = ds.GeneratorDataset(source, ["data"])
|
||||||
|
data1 = data1.shuffle(64)
|
||||||
|
data1 = data1.batch(32)
|
||||||
|
|
||||||
|
for _ in data1:
|
||||||
|
pass
|
||||||
|
|
||||||
|
assert os.path.exists(PIPELINE_FILE) is True
|
||||||
|
os.remove(PIPELINE_FILE)
|
||||||
|
assert os.path.exists(DATASET_ITERATOR_FILE) is True
|
||||||
|
os.remove(DATASET_ITERATOR_FILE)
|
||||||
|
del os.environ['PROFILING_MODE']
|
||||||
|
del os.environ['MINDDATA_PROFILING_DIR']
|
||||||
|
|
||||||
|
|
||||||
|
def test_profiling_complex_pipeline():
|
||||||
|
"""
|
||||||
|
Generator -> Map ->
|
||||||
|
-> Zip -> Batch
|
||||||
|
TFReader -> Shuffle ->
|
||||||
|
"""
|
||||||
|
os.environ['PROFILING_MODE'] = 'true'
|
||||||
|
os.environ['MINDDATA_PROFILING_DIR'] = '.'
|
||||||
|
os.environ['DEVICE_ID'] = '1'
|
||||||
|
|
||||||
|
source = [(np.array([x]),) for x in range(1024)]
|
||||||
|
data1 = ds.GeneratorDataset(source, ["gen"])
|
||||||
|
data1 = data1.map("gen", operations=[(lambda x: x + 1)])
|
||||||
|
|
||||||
|
pattern = DATASET_ROOT + "/test.data"
|
||||||
|
data2 = ds.TFRecordDataset(pattern, SCHEMA_FILE, shuffle=ds.Shuffle.FILES)
|
||||||
|
data2 = data2.shuffle(4)
|
||||||
|
|
||||||
|
data3 = ds.zip((data1, data2))
|
||||||
|
|
||||||
|
for _ in data3:
|
||||||
|
pass
|
||||||
|
|
||||||
|
assert os.path.exists(PIPELINE_FILE) is True
|
||||||
|
os.remove(PIPELINE_FILE)
|
||||||
|
assert os.path.exists(DATASET_ITERATOR_FILE) is True
|
||||||
|
os.remove(DATASET_ITERATOR_FILE)
|
||||||
|
del os.environ['PROFILING_MODE']
|
||||||
|
del os.environ['MINDDATA_PROFILING_DIR']
|
||||||
|
|
||||||
|
|
||||||
|
def test_profiling_sampling_iterval():
|
||||||
|
"""
|
||||||
|
Test non-default monitor sampling interval
|
||||||
|
"""
|
||||||
|
os.environ['PROFILING_MODE'] = 'true'
|
||||||
|
os.environ['MINDDATA_PROFILING_DIR'] = '.'
|
||||||
|
os.environ['DEVICE_ID'] = '1'
|
||||||
|
interval_origin = ds.config.get_monitor_sampling_interval()
|
||||||
|
|
||||||
|
ds.config.set_monitor_sampling_interval(30)
|
||||||
|
interval = ds.config.get_monitor_sampling_interval()
|
||||||
|
assert interval == 30
|
||||||
|
|
||||||
|
source = [(np.array([x]),) for x in range(1024)]
|
||||||
|
data1 = ds.GeneratorDataset(source, ["data"])
|
||||||
|
data1 = data1.shuffle(64)
|
||||||
|
data1 = data1.batch(32)
|
||||||
|
|
||||||
|
for _ in data1:
|
||||||
|
pass
|
||||||
|
|
||||||
|
assert os.path.exists(PIPELINE_FILE) is True
|
||||||
|
os.remove(PIPELINE_FILE)
|
||||||
|
assert os.path.exists(DATASET_ITERATOR_FILE) is True
|
||||||
|
os.remove(DATASET_ITERATOR_FILE)
|
||||||
|
|
||||||
|
ds.config.set_monitor_sampling_interval(interval_origin)
|
||||||
|
del os.environ['PROFILING_MODE']
|
||||||
|
del os.environ['MINDDATA_PROFILING_DIR']
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
test_profiling_simple_pipeline()
|
||||||
|
test_profiling_complex_pipeline()
|
||||||
|
test_profiling_sampling_iterval()
|
Loading…
Reference in New Issue