Sync dataset recovery before graph compile & optimize reset performance for random-accessible datasets
parent 5fad95269d
commit 8992c64eb3
@@ -179,6 +179,12 @@ PYBIND_REGISTER(ToDevice, 1, ([](const py::module *m) {
                          }
                          return py::make_tuple(types, shapes);
                        })
+                       .def("GetMbufQueueSize",
+                            [](ToDevice &self) {
+                              size_t queue_size = 0;
+                              THROW_IF_ERROR(self.GetMbufQueueSize(&queue_size));
+                              return queue_size;
+                            })
                        .def("GetSendInfo",
                             [](ToDevice &self) {
                               std::vector<std::vector<double>> send_info;
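The binding above surfaces the C++ status-returning call as a plain Python method: THROW_IF_ERROR converts a failed Status into a Python exception, and the lambda hands back the queue size as an integer. A minimal sketch of the shapes the three bound methods return to Python (the stub class and the GetSendInfo row layout are assumptions, not the real API):

    # Stand-in for the pybind-wrapped ToDevice; values are illustrative only.
    class ToDeviceStub:
        def GetDataInfo(self):
            # (types, shapes) tuple, as built by py::make_tuple above
            return (["int32"], [[32, 224, 224, 3]])

        def GetMbufQueueSize(self):
            # plain integer; the real binding raises if the C++ Status fails
            return 3

        def GetSendInfo(self):
            # list of per-epoch rows of doubles; exact row layout is assumed
            return [[1.0, 0.5, 1.2, 30.7]]

    stub = ToDeviceStub()
    types, shapes = stub.GetDataInfo()
    assert isinstance(stub.GetMbufQueueSize(), int)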
@@ -366,6 +366,17 @@ Status ToDevice::GetDataInfo(std::vector<DataType> *const types, std::vector<Ten
   return Status::OK();
 }
 
+Status ToDevice::GetMbufQueueSize(size_t *queue_size) {
+  RETURN_UNEXPECTED_IF_NULL(queue_size);
+  // tree_.root() must be DataQueueOp
+  std::shared_ptr<DatasetOp> root = std::shared_ptr<DatasetOp>(tree_adapter_->GetRoot());
+  CHECK_FAIL_RETURN_UNEXPECTED(root != nullptr, "Root is a nullptr.");
+  DataQueueOp *op = dynamic_cast<DataQueueOp *>(root.get());
+  CHECK_FAIL_RETURN_UNEXPECTED(op != nullptr, "GetMbufQueueSize only supported by DataQueueOp");
+  RETURN_IF_NOT_OK(op->GetMbufQueueSize(queue_size));
+  return Status::OK();
+}
+
 Status ToDevice::GetSendInfo(std::vector<std::vector<double>> *send_info) {
   RETURN_UNEXPECTED_IF_NULL(send_info);
   // tree_.root() must be DataQueueOp
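ToDevice::GetMbufQueueSize only delegates: it fetches the execution-tree root and refuses to answer unless that root is a DataQueueOp. A rough Python analog of the guard logic, with toy classes standing in for the operator hierarchy:

    class DatasetOp:
        pass

    class DataQueueOpStub(DatasetOp):
        def get_mbuf_queue_size(self):
            return 0  # would query the device channel in the real op

    def query_queue_size(root):
        # mirrors the null check and the dynamic_cast guard above
        if root is None:
            raise RuntimeError("Root is a nullptr.")
        if not isinstance(root, DataQueueOpStub):
            raise RuntimeError("GetMbufQueueSize only supported by DataQueueOp")
        return root.get_mbuf_queue_size()

    print(query_queue_size(DataQueueOpStub()))  # 0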
@@ -251,6 +251,10 @@ class ToDevice : public TreeConsumer {
   /// \return Status error code
   virtual Status GetDataInfo(std::vector<DataType> *types, std::vector<TensorShape> *shapes);
 
+  /// Get data numbers from TDT
+  /// \return Status error code
+  virtual Status GetMbufQueueSize(size_t *queue_size);
+
   /// Get send info in sink mode
   /// \return Status error code
   virtual Status GetSendInfo(std::vector<std::vector<double>> *send_info);
@@ -747,6 +747,17 @@ Status DataQueueOp::GetDataInfo(DATA_INFO *data_info) {
   return Status::OK();
 }
 
+Status DataQueueOp::GetMbufQueueSize(size_t *queue_size) {
+#ifdef WITH_BACKEND
+  if (device_type_ == DeviceType::Ascend) {
+    *queue_size = ascend_data_queue_->QueryQueueSize();
+  } else {
+    *queue_size = 1;
+  }
+#endif
+  return Status::OK();
+}
+
 std::vector<std::vector<double>> DataQueueOp::GetSendInfo() {
   std::vector<std::vector<double>> send_info_per_epoch;
   (void)std::transform(send_summary_.begin(), send_summary_.end(), std::back_inserter(send_info_per_epoch),
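Note the asymmetry in DataQueueOp::GetMbufQueueSize: only the Ascend backend can report a real mbuf depth, while every other device type reports 1, so a caller polling for a non-empty queue never blocks forever. A small sketch of that contract (device names as strings are a simplification):

    def get_mbuf_queue_size(device_type, query_real_depth):
        if device_type == "Ascend":
            return query_real_depth()  # actual element count in the channel
        return 1  # other backends: report non-empty so pollers make progress

    assert get_mbuf_queue_size("GPU", lambda: 0) == 1
    assert get_mbuf_queue_size("Ascend", lambda: 7) == 7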
@@ -83,6 +83,8 @@ class DataQueueOp : public PipelineOp {
 
   Status GetDataInfo(DATA_INFO *data_info);
 
+  Status GetMbufQueueSize(size_t *queue_size);
+
   std::vector<std::vector<double>> GetSendInfo();
 
   // Name: Print()
@@ -16,8 +16,61 @@
 
 #include "minddata/dataset/engine/datasetops/source/sampler/skip_first_epoch_sampler.h"
 
+#include <algorithm>
+#include <memory>
 #include <string>
 
 namespace mindspore {
 namespace dataset {
+Status SkipFirstEpochSamplerRT::GetNextSample(TensorRow *out) {
+  RETURN_UNEXPECTED_IF_NULL(out);
+  if (id_count_ > num_samples_) {
+    RETURN_STATUS_UNEXPECTED(
+      "[Internal ERROR] Sampler index must be less than or equal to num_samples(total rows in dataset), but got:" +
+      std::to_string(id_count_) + ", num_samples_: " + std::to_string(num_samples_));
+  } else if (id_count_ == num_samples_) {
+    (*out) = TensorRow(TensorRow::kFlagEOE);
+  } else {
+    if (HasChildSampler()) {
+      RETURN_IF_NOT_OK(child_[0]->GetNextSample(&child_ids_));
+    }
+
+    std::shared_ptr<Tensor> sampleIds;
+
+    // Compute how many ids are left to pack, and pack this amount into a new Tensor. Respect the setting for
+    // samples per Tensor though.
+    int64_t remaining_ids = num_samples_ - id_count_;
+    int64_t num_elements = std::min(remaining_ids, samples_per_tensor_);
+
+    RETURN_IF_NOT_OK(CreateSamplerTensor(&sampleIds, num_elements));
+
+    if (HasChildSampler()) {
+      std::string err_msg = "Failed to copy full sample ids into child sampler.";
+      int64_t copy_data_length = num_elements * sizeof(int64_t);
+      if (copy_data_length < SECUREC_MEM_MAX_LEN) {
+        int ret_code = memcpy_s(sampleIds->GetMutableBuffer(), copy_data_length,
+                                child_ids_[0]->GetMutableBuffer() + current_id_ * sizeof(int64_t), copy_data_length);
+        CHECK_FAIL_RETURN_UNEXPECTED(ret_code == EOK, err_msg);
+      } else {
+        auto dest = std::memcpy(sampleIds->GetMutableBuffer(),
+                                child_ids_[0]->GetMutableBuffer() + current_id_ * sizeof(int64_t), copy_data_length);
+        CHECK_FAIL_RETURN_UNEXPECTED(dest == sampleIds->GetMutableBuffer(), err_msg);
+      }
+      current_id_ += num_elements;
+    } else {
+      auto idPtr = sampleIds->begin<int64_t>();
+      for (int64_t i = 0; i < num_elements; i++) {
+        *idPtr = current_id_;
+        current_id_++;  // Move the current id to the next one in the sequence
+        ++idPtr;
+      }
+    }
+    id_count_ += num_elements;  // Count the packed ids towards our overall sample count
+    (*out) = {sampleIds};
+  }
+  return Status::OK();
+}
+
 Status SkipFirstEpochSamplerRT::ResetSampler(const bool failover_reset) {
+  // This is a special sampler for Failover Reset, its internal state should
+  // not reset when failover_reset is set to true.
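The core of GetNextSample is the id-packing loop: each call emits at most samples_per_tensor_ ids starting from current_id_, counts them in id_count_, and signals EOE once num_samples_ ids have been produced; since the failover reset deliberately keeps this state, the first batch after recovery starts mid-sequence. A runnable sketch of that loop for the no-child-sampler case:

    def pack_sample_ids(num_samples, samples_per_tensor, current_id=0):
        # current_id may start non-zero after the first-epoch skip / failover
        id_count = 0
        while id_count < num_samples:
            num_elements = min(num_samples - id_count, samples_per_tensor)
            yield list(range(current_id, current_id + num_elements))
            current_id += num_elements
            id_count += num_elements
        # reaching id_count == num_samples corresponds to the EOE row above

    print(list(pack_sample_ids(num_samples=5, samples_per_tensor=2, current_id=3)))
    # [[3, 4], [5, 6], [7]]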
@@ -30,6 +30,8 @@ class SkipFirstEpochSamplerRT : public SequentialSamplerRT {
   // Destructor.
   ~SkipFirstEpochSamplerRT() = default;
 
+  Status GetNextSample(TensorRow *out) override;
+
   /// \brief Reset for next epoch.
   /// \param[in] failover_reset A boolean to show whether we are resetting the pipeline
   /// \return Status The status code returned
@@ -61,6 +61,7 @@ Status TreeAdapter::PrePass(const std::shared_ptr<DatasetNode> &ir) {
   MS_LOG(INFO) << "Running pre pass loops.";
   (void)actions.emplace_back(std::make_unique<InputValidationPass>());
   (void)actions.emplace_back(std::make_unique<CacheValidationPass>());
+  (void)actions.emplace_back(std::make_unique<NodeRemovalPass>());
   (void)actions.emplace_back(std::make_unique<InsertMapPass>());
   if (usage_ == kDeReset) {
     (void)actions.emplace_back(std::make_unique<AddSkipPass>());
@@ -68,7 +69,6 @@ Status TreeAdapter::PrePass(const std::shared_ptr<DatasetNode> &ir) {
       (void)actions.emplace_back(std::make_unique<SkipPushdownPass>());
     }
   }
-  (void)actions.emplace_back(std::make_unique<NodeRemovalPass>());
   (void)actions.emplace_back(std::make_unique<EpochCtrlPass>());
   if (usage_ == kDeGetter) {
     (void)actions.emplace_back(std::make_unique<GetterPass>());
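Read together, the two hunks above move NodeRemovalPass from after the kDeReset block to before it (this reading reconstructs the added/removed lines from the hunk counts), so removable nodes such as Skip(0) are already pruned when AddSkipPass and SkipPushdownPass compute the recovery skip. The resulting ordering constraint, stated as a check:

    # Pass names from the diff; treating this order as the post-commit one
    # is a reconstruction, not something the page states outright.
    passes_after = ["InputValidation", "CacheValidation", "NodeRemoval",
                    "InsertMap", "AddSkip", "SkipPushdown", "EpochCtrl"]
    assert passes_after.index("NodeRemoval") < passes_after.index("AddSkip")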
@@ -4223,6 +4223,12 @@ class _ToDevice:
         """
        return self._to_device.GetDataInfo()
 
+    def get_mbuf_queue_size(self):
+        """
+        Get element numbers inside mbuf.
+        """
+        return self._to_device.GetMbufQueueSize()
+
     def get_send_info(self):
         """
         In sink mode, it returns the send information of dataset at this moment.
@@ -4337,6 +4343,14 @@ class TransferDataset(Dataset):
             return self._to_device.get_data_info()
         raise RuntimeError("Calling get_data_info with bad state.")
 
+    def get_mbuf_queue_size(self):
+        """
+        Get element numbers inside mbuf.
+        """
+        if self._to_device is not None:
+            return self._to_device.get_mbuf_queue_size()
+        raise RuntimeError("Device queue is not init, call get_mbuf_queue_size failed.")
+
     def get_send_info(self):
         """
         In sink mode, it returns the send information of dataset at this moment.
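A hedged usage sketch for the new Python entry point above: before the device queue exists, TransferDataset.get_mbuf_queue_size() raises RuntimeError, so a caller that may run early should guard the call (poll_queue_depth is a hypothetical helper, not part of the API):

    def poll_queue_depth(transfer_dataset):
        try:
            return transfer_dataset.get_mbuf_queue_size()
        except RuntimeError:
            return None  # device queue not initialized yet

    # Returns an int once sending has started, None beforehand.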
@@ -449,6 +449,11 @@ class DatasetHelper:
         # Generally, it works in dynamic shape scenarios.
         return self.iter.get_data_info()
 
+    # pylint: disable=missing-docstring
+    def get_mbuf_queue_size(self):
+        # In sink mode, it returns the element numbers inside mbuf channel.
+        return self.iter.get_mbuf_queue_size()
+
     # pylint: disable=missing-docstring
     def get_send_info(self, run_context):
         # In sink mode, it returns the send information of dataset at this moment.
@@ -531,6 +536,7 @@ class _DatasetIter:
         self.release = dataset.__transfer_dataset__.release
         self.continue_send = dataset.__transfer_dataset__.continue_send
         self.get_data_info = dataset.__transfer_dataset__.get_data_info
+        self.get_mbuf_queue_size = dataset.__transfer_dataset__.get_mbuf_queue_size
         self.get_send_info = dataset.__transfer_dataset__.get_send_info
         if hasattr(dataset.__transfer_dataset__, "_reset"):
             self._reset = dataset.__transfer_dataset__._reset  # pylint: disable=protected-access
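The _DatasetIter change is pure attribute forwarding: the bound method of the transfer dataset is re-exported on the iterator, which is how DatasetHelper.get_mbuf_queue_size reaches it without knowing about __transfer_dataset__. A toy version of the pattern:

    class TransferStub:
        def get_mbuf_queue_size(self):
            return 0

    class IterStub:
        def __init__(self, transfer):
            # bound method re-exported as a plain attribute, as in the diff
            self.get_mbuf_queue_size = transfer.get_mbuf_queue_size

    print(IterStub(TransferStub()).get_mbuf_queue_size())  # 0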
@@ -22,6 +22,7 @@ import os
 import math
 import copy
 import importlib
+import time
 import numpy as np
 
 import mindspore
@@ -523,6 +524,16 @@ class Model:
                                                                      dataset_sink_mode=True,
                                                                      sink_size=sink_size)
         self._warmup_dataset(epoch, train_dataset, sink_size)
+        if train_dataset.get_init_step() > 0:
+            mbuf_size = train_dataset.__transfer_dataset__.get_mbuf_queue_size()
+            while mbuf_size == 0:
+                time.sleep(10)
+                mbuf_size = train_dataset.__transfer_dataset__.get_mbuf_queue_size()
+                if mbuf_size != 0:
+                    break
+                logger.warning(f"Failover mode, waiting for dataset recover to specify step, "
+                               f"current device queue size: {mbuf_size}")
+
         if context.get_auto_parallel_context("pipeline_stages") > 1 and valid_dataset:
             train_network.add_flags_recursive(is_first_iteration=True)
             for inputs in train_dataset_helper:
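The Model change is the synchronization named in the commit title: when training resumes from a non-zero initial step, it blocks before graph compile until the device queue reports data, warning every 10 seconds. The same loop restated as a standalone, testable sketch with injected hooks (sleep and warn are stand-ins for time.sleep and logger.warning):

    def wait_for_recovery(get_size, sleep, warn):
        size = get_size()
        while size == 0:
            sleep(10)
            size = get_size()
            if size != 0:
                break
            warn(f"Failover mode, waiting for dataset recovery, "
                 f"current device queue size: {size}")

    sizes = iter([0, 0, 5])                                  # queue fills on the third poll
    wait_for_recovery(lambda: next(sizes), lambda seconds: None, print)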
@@ -74,6 +74,9 @@ class MindData:
     def get_data_info(self):
         pass
 
+    def get_mbuf_queue_size(self):
+        pass
+
     def get_send_info(self):
         pass
 
@@ -33,7 +33,8 @@ class MindDataSkipPushdownTestOptimizationPass : public UT::DatasetOpTesting {
   /// \param[in] root_target Target dataset for compare
   /// \param[in] step Skip step
   /// \return Status of the function
-  Status prepare_trees(std::shared_ptr<Dataset> root_original, std::shared_ptr<Dataset> root_target, int64_t step = 0) {
+  Status prepare_trees(std::shared_ptr<Dataset> root_original, std::shared_ptr<Dataset> root_target, int64_t step = 0,
+                       bool check_pipeline = true) {
     auto ir_tree = std::make_shared<TreeAdapter>(TreeAdapter::UsageFlag::kDeReset);
 
     // Get the dataset size for calculating the initial epoch
@@ -47,7 +48,7 @@ class MindDataSkipPushdownTestOptimizationPass : public UT::DatasetOpTesting {
     RETURN_IF_NOT_OK(ir_tree_target->Compile(root_target->IRNode(), 1,
                                              0));  // Step is 0 for target node tree
 
-    if (step != 0) {
+    if (step != 0 && check_pipeline) {
       RETURN_IF_NOT_OK(compare_pass(ir_tree_target->RootIRNode(), ir_tree->RootIRNode()));
     }
     RETURN_IF_NOT_OK(compare_pass_row(ir_tree_target, ir_tree));
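The helper change above is a compatibility-preserving extension: adding check_pipeline with a default of true keeps every existing prepare_trees(root, target, step) call site unchanged, while the Skip(0) tests below can opt out of the strict pipeline comparison and only compare rows. The shape of that pattern in brief (a Python stand-in for the C++ helper):

    def prepare_trees(root, target, step=0, check_pipeline=True):
        if step != 0 and check_pipeline:
            pass  # node-by-node IR comparison would run here
        return "OK"  # row comparison always runs

    assert prepare_trees("r", "t", 1) == "OK"         # old call shape still works
    assert prepare_trees("r", "t", 1, False) == "OK"  # new: rows only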
@@ -367,19 +368,19 @@ TEST_F(MindDataSkipPushdownTestOptimizationPass, SkipPushdownSkip0) {
 
   root = ImageFolder(folder_path, false, std::make_shared<SequentialSampler>())->Project({"label", "image"})->Skip(0);
   root_target = ImageFolder(folder_path, false, std::make_shared<SequentialSampler>())->Project({"label", "image"});
-  EXPECT_OK(prepare_trees(root, root_target, 0));
+  EXPECT_OK(prepare_trees(root, root_target, 0, false));
 
   root = ImageFolder(folder_path, false, std::make_shared<SequentialSampler>())
            ->Skip(0)
            ->Project({"label", "image"})
            ->Skip(0);
   root_target = ImageFolder(folder_path, false, std::make_shared<SequentialSampler>())->Project({"label", "image"});
-  EXPECT_OK(prepare_trees(root, root_target, 0));
+  EXPECT_OK(prepare_trees(root, root_target, 0, false));
 
   root = ImageFolder(folder_path, false, std::make_shared<SequentialSampler>())->Skip(0)->Project({"label", "image"});
   root_target =
     ImageFolder(folder_path, false, std::make_shared<SequentialSampler>())->Skip(1)->Project({"label", "image"});
-  EXPECT_OK(prepare_trees(root, root_target, 1));
+  EXPECT_OK(prepare_trees(root, root_target, 1, false));
 
   root = ImageFolder(folder_path, false, std::make_shared<SequentialSampler>())
            ->Skip(2)