forked from OSSInnovation/mindspore
!5551 Fix master ps stuck
Merge pull request !5551 from ZPaC/master-fix-stuck-bug
This commit is contained in:
commit
4dddee575d
|
@ -88,10 +88,6 @@ void SparseApplyFtrlPSKernel::ReInit(const std::vector<AddressPtr> &inputs) {
|
|||
bool SparseApplyFtrlPSKernel::Execute(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs) {
|
||||
ReInit(inputs);
|
||||
int *indices = reinterpret_cast<int *>(inputs[4]->addr);
|
||||
for (size_t i = 0; i < inputs[4]->size / sizeof(int); i++) {
|
||||
indices[i] -= row_offset_;
|
||||
}
|
||||
return Launch(inputs, workspace, outputs);
|
||||
}
|
||||
|
||||
|
|
|
@ -86,10 +86,6 @@ bool SparseApplyLazyAdamPSKernel::Execute(const std::vector<AddressPtr> &inputs,
|
|||
const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs) {
// Execute entry point of SparseApplyLazyAdamPSKernel: calls ReInit on the
// inputs, subtracts row_offset_ from every int in inputs[10], then returns
// whatever Launch returns for the same (inputs, workspace, outputs).
|
||||
ReInit(inputs);
|
||||
// inputs[10] is reinterpreted as a plain int array; inputs must therefore
// contain at least 11 entries.
int *indices = reinterpret_cast<int *>(inputs[10]->addr);
|
||||
// size is divided by sizeof(int), i.e. it is treated as a byte count.
for (size_t i = 0; i < inputs[10]->size / sizeof(int); i++) {
|
||||
// In-place update of the caller's buffer: a second Execute on the same
// buffer would subtract the offset again.
// NOTE(review): presumably rebases global row indices to shard-local rows
// -- confirm against the caller.
indices[i] -= row_offset_;
|
||||
}
|
||||
return Launch(inputs, workspace, outputs);
|
||||
}
|
||||
|
||||
|
|
|
@ -511,9 +511,7 @@ void ParameterServer<T>::UpdateWeights() {
|
|||
MS_EXCEPTION_IF_NULL(optimizer);
|
||||
|
||||
std::shared_ptr<OptimizerInfo> optim_info = optim_infos_[key];
|
||||
if (optim_info == nullptr) {
|
||||
continue;
|
||||
}
|
||||
if (optim_info != nullptr) {
|
||||
const std::vector<kernel::AddressPtr> &inputs = optim_info->inputs();
|
||||
const std::vector<kernel::AddressPtr> &workspaces = optim_info->workspaces();
|
||||
const std::vector<kernel::AddressPtr> &outputs = optim_info->outputs();
|
||||
|
@ -533,6 +531,7 @@ void ParameterServer<T>::UpdateWeights() {
|
|||
optim_info->ComputeMean(shapes, worker_num_, pserver_num_, rank_id_);
|
||||
optimizer->Execute(inputs, workspaces, outputs);
|
||||
optim_info->Reset();
|
||||
}
|
||||
if (!is_embedding_[key]) {
|
||||
tokens_[key] = worker_num_;
|
||||
}
|
||||
|
@ -545,6 +544,8 @@ template <typename T>
|
|||
void ParameterServer<T>::AccumGrad(const Keys &keys, const Values &values, const Lengths &lengths) {
|
||||
std::unique_lock<std::mutex> lock(mutex_);
|
||||
const Key &key = keys[0];
|
||||
bool no_sparse_grad = values.size() == 1 && values[0] == -100;
|
||||
if (!no_sparse_grad) {
|
||||
std::shared_ptr<OptimizerInfo> optim_info = optim_infos_[key];
|
||||
|
||||
// Create or update the optimizer info
|
||||
|
@ -563,6 +564,7 @@ void ParameterServer<T>::AccumGrad(const Keys &keys, const Values &values, const
|
|||
optim_info->Update(values, lengths);
|
||||
optim_info->Accumulate(values, lengths);
|
||||
}
|
||||
}
|
||||
|
||||
grads_accum_counter_[key] += 1;
|
||||
if (grads_accum_counter_[key] == worker_num_) {
|
||||
|
|
|
@ -112,7 +112,7 @@ class WorkerProxy : public ::ps::KVWorker<T> {
|
|||
std::unique_ptr<::ps::Customer> general_customer_;
|
||||
std::unordered_map<::ps::Key, std::shared_ptr<std::vector<::ps::Range>>> embedding_table_ranges_;
|
||||
std::unordered_map<int, std::vector<::ps::KVPairs<T>>> lookup_results_;
|
||||
std::unordered_map<int, ::ps::KVPairs<T>> gathered_response_;
|
||||
std::unordered_map<int, std::map<int, ::ps::KVPairs<T>>> gathered_response_;
|
||||
std::mutex mutex_;
|
||||
Slicer lookup_slicer_;
|
||||
Slicer sparse_slicer_;
|
||||
|
@ -337,12 +337,19 @@ int WorkerProxy<T>::AddGeneralRspCB(const ::ps::SArray<::ps::Key> &keys, ::ps::S
|
|||
int ts = general_customer_->NewRequest(::ps::kServerGroup);
|
||||
const auto &callback = [this, ts, keys, vals, lens, cb]() mutable {
|
||||
mutex_.lock();
|
||||
auto &kvs = gathered_response_[ts];
|
||||
std::map<int, ::ps::KVPairs<T>> server_kvs = gathered_response_[ts];
|
||||
mutex_.unlock();
|
||||
|
||||
*vals = kvs.vals;
|
||||
vals->clear();
|
||||
for (auto kvs : server_kvs) {
|
||||
for (auto val : kvs.second.vals) {
|
||||
vals->push_back(val);
|
||||
}
|
||||
if (lens) {
|
||||
*lens = kvs.lens;
|
||||
for (auto len : kvs.second.lens) {
|
||||
lens->push_back(len);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
mutex_.lock();
|
||||
|
@ -464,6 +471,7 @@ void WorkerProxy<T>::SparseSlicer(int timestamp, const ::ps::KVPairs<T> &send, c
|
|||
}
|
||||
}
|
||||
size_t indices_size = indice_ids.size();
|
||||
if (indices_size > 0) {
|
||||
int slice_segment_size = indices_size * segment_size;
|
||||
T *src_grad_data = new T[slice_segment_size];
|
||||
int *src_indice_data = new int[indices_size];
|
||||
|
@ -494,15 +502,21 @@ void WorkerProxy<T>::SparseSlicer(int timestamp, const ::ps::KVPairs<T> &send, c
|
|||
|
||||
kvs.lens = reduced_lens;
|
||||
kvs.vals = reduced_data;
|
||||
}
|
||||
|
||||
if (indices_size <= 0) {
|
||||
sliced->at(i).first = false;
|
||||
} else {
|
||||
::ps::SArray<T> no_keys;
|
||||
::ps::SArray<T> no_vals;
|
||||
::ps::SArray<T> no_lens;
|
||||
no_keys.push_back(key);
|
||||
no_vals.push_back(-100);
|
||||
kvs.vals = no_vals;
|
||||
kvs.lens = no_lens;
|
||||
}
|
||||
sliced->at(i).first = true;
|
||||
expected_result_count_[timestamp] += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void WorkerProxy<T>::PrepareSparseGradient(const size_t begin, const size_t end,
|
||||
|
@ -554,8 +568,8 @@ void WorkerProxy<T>::BuildSparseValue(const ::ps::SArray<int> &lengths, const si
|
|||
}
|
||||
|
||||
// Fill the reduced indice
|
||||
int indice_offset = grad_offset + lengths[grad_index];
|
||||
data_size = lengths[indice_index] * sizeof(T);
|
||||
int indice_offset = grad_offset + data_size;
|
||||
T *indice_data = reduced_data->data() + indice_offset;
|
||||
T *convert = new T[lengths[indice_index]];
|
||||
for (int i = 0; i < lengths[indice_index]; i++) {
|
||||
|
@ -656,7 +670,7 @@ void WorkerProxy<T>::ProcessLookupResult(const ::ps::Message &msg) {
|
|||
lookup_results_[ts].push_back(kvs);
|
||||
mutex_.unlock();
|
||||
}
|
||||
if (lookup_customer_->NumResponse(ts) == expected_result_count_[ts] - 1) {
|
||||
if (lookup_customer_->NumResponse(ts) + 1 == server_num_) {
|
||||
const auto &cb = lookup_callbacks_[ts];
|
||||
cb();
|
||||
lookup_callbacks_.erase(ts);
|
||||
|
@ -676,15 +690,8 @@ void WorkerProxy<T>::ProcessResponse(const ::ps::Message &msg) {
|
|||
kvs.lens = msg.data[2];
|
||||
}
|
||||
mutex_.lock();
|
||||
for (auto key : kvs.keys) {
|
||||
gathered_response_[ts].keys.push_back(key);
|
||||
}
|
||||
for (auto val : kvs.vals) {
|
||||
gathered_response_[ts].vals.push_back(val);
|
||||
}
|
||||
for (auto len : kvs.lens) {
|
||||
gathered_response_[ts].lens.push_back(len);
|
||||
}
|
||||
int rsp_server_rank = ::ps::Postoffice::Get()->IDtoRank(msg.meta.sender);
|
||||
gathered_response_[ts][rsp_server_rank] = kvs;
|
||||
mutex_.unlock();
|
||||
if (general_customer_->NumResponse(ts) + 1 == server_num_) {
|
||||
const auto &cb = general_callbacks_[ts];
|
||||
|
|
Loading…
Reference in New Issue