optimize mindrecord writer performance

This commit is contained in:
jonwe 2020-04-22 11:09:28 -04:00
parent bfbefa88c7
commit 353b3b7944
16 changed files with 681 additions and 35 deletions

View File

@ -0,0 +1,46 @@
# MindRecord generating guidelines
<!-- TOC -->
- [MindRecord generating guidelines](#mindrecord-generating-guidelines)
- [Create work space](#create-work-space)
- [Implement data generator](#implement-data-generator)
- [Run data generator](#run-data-generator)
<!-- /TOC -->
## Create work space
Assume the dataset name is 'xyz'
* Create work space from template
```shell
cd ${your_mindspore_home}/example/convert_to_mindrecord
cp -r template xyz
```
## Implement data generator
Edit dictionary data generator
* Edit file
```shell
cd ${your_mindspore_home}/example/convert_to_mindrecord
vi xyz/mr_api.py
```
Two API, 'mindrecord_task_number' and 'mindrecord_dict_data', must be implemented
- 'mindrecord_task_number()' returns number of tasks. Return 1 if data row is generated serially. Return N if generator can be split into N parallel-run tasks.
- 'mindrecord_dict_data(task_id)' yields dictionary data row by row. 'task_id' is 0..N-1, if N is return value of mindrecord_task_number()
Tricky for parallel run
- For imagenet, one directory can be a task.
- For TFRecord with multiple files, each file can be a task.
- For TFRecord with 1 file only, it could also be split into N tasks. Task_id=K means: data row is picked only if (count % N == K)
## Run data generator
* run python script
```shell
cd ${your_mindspore_home}/example/convert_to_mindrecord
python writer.py --mindrecord_script imagenet [...]
```

View File

@ -0,0 +1,122 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
User-defined API for MindRecord writer.
Two API must be implemented,
1. mindrecord_task_number()
# Return number of parallel tasks. return 1 if no parallel
2. mindrecord_dict_data(task_id)
# Yield data for one task
# task_id is 0..N-1, if N is return value of mindrecord_task_number()
"""
import argparse
import os
import pickle
######## mindrecord_schema begin ##########
mindrecord_schema = {"label": {"type": "int64"},
"data": {"type": "bytes"},
"file_name": {"type": "string"}}
######## mindrecord_schema end ##########
######## Frozen code begin ##########
with open('mr_argument.pickle', 'rb') as mindrecord_argument_file_handle:
ARG_LIST = pickle.load(mindrecord_argument_file_handle)
######## Frozen code end ##########
parser = argparse.ArgumentParser(description='Mind record imagenet example')
parser.add_argument('--label_file', type=str, default="", help='label file')
parser.add_argument('--image_dir', type=str, default="", help='images directory')
######## Frozen code begin ##########
args = parser.parse_args(ARG_LIST)
print(args)
######## Frozen code end ##########
def _user_defined_private_func():
"""
Internal function for tasks list
Return:
tasks list
"""
if not os.path.exists(args.label_file):
raise IOError("map file {} not exists".format(args.label_file))
label_dict = {}
with open(args.label_file) as file_handle:
line = file_handle.readline()
while line:
labels = line.split(" ")
label_dict[labels[1]] = labels[0]
line = file_handle.readline()
# get all the dir which are n02087046, n02094114, n02109525
dir_paths = {}
for item in label_dict:
real_path = os.path.join(args.image_dir, label_dict[item])
if not os.path.isdir(real_path):
print("{} dir is not exist".format(real_path))
continue
dir_paths[item] = real_path
if not dir_paths:
print("not valid image dir in {}".format(args.image_dir))
return {}, {}
dir_list = []
for label in dir_paths:
dir_list.append(label)
return dir_list, dir_paths
dir_list_global, dir_paths_global = _user_defined_private_func()
def mindrecord_task_number():
"""
Get task size.
Return:
number of tasks
"""
return len(dir_list_global)
def mindrecord_dict_data(task_id):
"""
Get data dict.
Yields:
data (dict): data row which is dict.
"""
# get the filename, label and image binary as a dict
label = dir_list_global[task_id]
for item in os.listdir(dir_paths_global[label]):
file_name = os.path.join(dir_paths_global[label], item)
if not item.endswith("JPEG") and not item.endswith(
"jpg") and not item.endswith("jpeg"):
print("{} file is not suffix with JPEG/jpg, skip it.".format(file_name))
continue
data = {}
data["file_name"] = str(file_name)
data["label"] = int(label)
# get the image data
image_file = open(file_name, "rb")
image_bytes = image_file.read()
image_file.close()
data["data"] = image_bytes
yield data

View File

@ -0,0 +1,8 @@
#!/bin/bash
rm /tmp/imagenet/mr/*
python writer.py --mindrecord_script imagenet \
--mindrecord_file "/tmp/imagenet/mr/m" \
--mindrecord_partitions 16 \
--label_file "/tmp/imagenet/label.txt" \
--image_dir "/tmp/imagenet/jpeg"

View File

@ -0,0 +1,6 @@
#!/bin/bash
rm /tmp/template/*
python writer.py --mindrecord_script template \
--mindrecord_file "/tmp/template/m" \
--mindrecord_partitions 4

View File

@ -0,0 +1,73 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
User-defined API for MindRecord writer.
Two API must be implemented,
1. mindrecord_task_number()
# Return number of parallel tasks. return 1 if no parallel
2. mindrecord_dict_data(task_id)
# Yield data for one task
# task_id is 0..N-1, if N is return value of mindrecord_task_number()
"""
import argparse
import pickle
# ## Parse argument
with open('mr_argument.pickle', 'rb') as mindrecord_argument_file_handle: # Do NOT change this line
ARG_LIST = pickle.load(mindrecord_argument_file_handle) # Do NOT change this line
parser = argparse.ArgumentParser(description='Mind record api template') # Do NOT change this line
# ## Your arguments below
# parser.add_argument(...)
args = parser.parse_args(ARG_LIST) # Do NOT change this line
print(args) # Do NOT change this line
# ## Default mindrecord vars. Comment them unless default value has to be changed.
# mindrecord_index_fields = ['label']
# mindrecord_header_size = 1 << 24
# mindrecord_page_size = 1 << 25
# define global vars here if necessary
# ####### Your code below ##########
mindrecord_schema = {"label": {"type": "int32"}}
def mindrecord_task_number():
"""
Get task size.
Return:
number of tasks
"""
return 1
def mindrecord_dict_data(task_id):
"""
Get data dict.
Yields:
data (dict): data row which is dict.
"""
print("task is {}".format(task_id))
for i in range(256):
data = {}
data['label'] = i
yield data

View File

@ -0,0 +1,152 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
######################## write mindrecord example ########################
Write mindrecord by data dictionary:
python writer.py --mindrecord_script /YourScriptPath ...
"""
import argparse
import os
import pickle
import time
from importlib import import_module
from multiprocessing import Pool
from mindspore.mindrecord import FileWriter
def _exec_task(task_id, parallel_writer=True):
"""
Execute task with specified task id
"""
print("exec task {}, parallel: {} ...".format(task_id, parallel_writer))
imagenet_iter = mindrecord_dict_data(task_id)
batch_size = 2048
transform_count = 0
while True:
data_list = []
try:
for _ in range(batch_size):
data_list.append(imagenet_iter.__next__())
transform_count += 1
writer.write_raw_data(data_list, parallel_writer=parallel_writer)
print("transformed {} record...".format(transform_count))
except StopIteration:
if data_list:
writer.write_raw_data(data_list, parallel_writer=parallel_writer)
print("transformed {} record...".format(transform_count))
break
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Mind record writer')
parser.add_argument('--mindrecord_script', type=str, default="template",
help='path where script is saved')
parser.add_argument('--mindrecord_file', type=str, default="/tmp/mindrecord",
help='written file name prefix')
parser.add_argument('--mindrecord_partitions', type=int, default=1,
help='number of written files')
parser.add_argument('--mindrecord_workers', type=int, default=8,
help='number of parallel workers')
args = parser.parse_known_args()
args, other_args = parser.parse_known_args()
print(args)
print(other_args)
with open('mr_argument.pickle', 'wb') as file_handle:
pickle.dump(other_args, file_handle)
try:
mr_api = import_module(args.mindrecord_script + '.mr_api')
except ModuleNotFoundError:
raise RuntimeError("Unknown module path: {}".format(args.mindrecord_script + '.mr_api'))
num_tasks = mr_api.mindrecord_task_number()
print("Write mindrecord ...")
mindrecord_dict_data = mr_api.mindrecord_dict_data
# get number of files
writer = FileWriter(args.mindrecord_file, args.mindrecord_partitions)
start_time = time.time()
# set the header size
try:
header_size = mr_api.mindrecord_header_size
writer.set_header_size(header_size)
except AttributeError:
print("Default header size: {}".format(1 << 24))
# set the page size
try:
page_size = mr_api.mindrecord_page_size
writer.set_page_size(page_size)
except AttributeError:
print("Default page size: {}".format(1 << 25))
# get schema
try:
mindrecord_schema = mr_api.mindrecord_schema
except AttributeError:
raise RuntimeError("mindrecord_schema is not defined in mr_api.py.")
# create the schema
writer.add_schema(mindrecord_schema, "mindrecord_schema")
# add the index
try:
index_fields = mr_api.mindrecord_index_fields
writer.add_index(index_fields)
except AttributeError:
print("Default index fields: all simple fields are indexes.")
writer.open_and_set_header()
task_list = list(range(num_tasks))
# set number of workers
num_workers = args.mindrecord_workers
if num_tasks < 1:
num_tasks = 1
if num_workers > num_tasks:
num_workers = num_tasks
if os.name == 'nt':
for window_task_id in task_list:
_exec_task(window_task_id, False)
elif num_tasks > 1:
with Pool(num_workers) as p:
p.map(_exec_task, task_list)
else:
_exec_task(0, False)
ret = writer.commit()
os.remove("{}".format("mr_argument.pickle"))
end_time = time.time()
print("--------------------------------------------")
print("END. Total time: {}".format(end_time - start_time))
print("--------------------------------------------")

View File

@ -75,11 +75,8 @@ void BindShardWriter(py::module *m) {
.def("set_header_size", &ShardWriter::set_header_size)
.def("set_page_size", &ShardWriter::set_page_size)
.def("set_shard_header", &ShardWriter::SetShardHeader)
.def("write_raw_data",
(MSRStatus(ShardWriter::*)(std::map<uint64_t, std::vector<py::handle>> &, vector<vector<uint8_t>> &, bool)) &
ShardWriter::WriteRawData)
.def("write_raw_nlp_data", (MSRStatus(ShardWriter::*)(std::map<uint64_t, std::vector<py::handle>> &,
std::map<uint64_t, std::vector<py::handle>> &, bool)) &
.def("write_raw_data", (MSRStatus(ShardWriter::*)(std::map<uint64_t, std::vector<py::handle>> &,
vector<vector<uint8_t>> &, bool, bool)) &
ShardWriter::WriteRawData)
.def("commit", &ShardWriter::Commit);
}

View File

@ -121,6 +121,10 @@ class ShardHeader {
std::vector<std::string> SerializeHeader();
MSRStatus PagesToFile(const std::string dump_file_name);
MSRStatus FileToPages(const std::string dump_file_name);
private:
MSRStatus InitializeHeader(const std::vector<json> &headers);

View File

@ -18,6 +18,7 @@
#define MINDRECORD_INCLUDE_SHARD_WRITER_H_
#include <libgen.h>
#include <sys/file.h>
#include <unistd.h>
#include <algorithm>
#include <array>
@ -87,7 +88,7 @@ class ShardWriter {
/// \param[in] sign validate data or not
/// \return MSRStatus the status of MSRStatus to judge if write successfully
MSRStatus WriteRawData(std::map<uint64_t, std::vector<json>> &raw_data, vector<vector<uint8_t>> &blob_data,
bool sign = true);
bool sign = true, bool parallel_writer = false);
/// \brief write raw data by group size for call from python
/// \param[in] raw_data the vector of raw json data, python-handle format
@ -95,7 +96,7 @@ class ShardWriter {
/// \param[in] sign validate data or not
/// \return MSRStatus the status of MSRStatus to judge if write successfully
MSRStatus WriteRawData(std::map<uint64_t, std::vector<py::handle>> &raw_data, vector<vector<uint8_t>> &blob_data,
bool sign = true);
bool sign = true, bool parallel_writer = false);
/// \brief write raw data by group size for call from python
/// \param[in] raw_data the vector of raw json data, python-handle format
@ -103,7 +104,8 @@ class ShardWriter {
/// \param[in] sign validate data or not
/// \return MSRStatus the status of MSRStatus to judge if write successfully
MSRStatus WriteRawData(std::map<uint64_t, std::vector<py::handle>> &raw_data,
std::map<uint64_t, std::vector<py::handle>> &blob_data, bool sign = true);
std::map<uint64_t, std::vector<py::handle>> &blob_data, bool sign = true,
bool parallel_writer = false);
private:
/// \brief write shard header data to disk
@ -201,7 +203,34 @@ class ShardWriter {
MSRStatus CheckDataTypeAndValue(const std::string &key, const json &value, const json &data, const int &i,
std::map<int, std::string> &err_raw_data);
/// \brief Lock writer and save pages info
int LockWriter(bool parallel_writer = false);
/// \brief Unlock writer and save pages info
MSRStatus UnlockWriter(int fd, bool parallel_writer = false);
/// \brief Check raw data before writing
MSRStatus WriteRawDataPreCheck(std::map<uint64_t, std::vector<json>> &raw_data, vector<vector<uint8_t>> &blob_data,
bool sign, int *schema_count, int *row_count);
/// \brief Get full path from file name
MSRStatus GetFullPathFromFileName(const std::vector<std::string> &paths);
/// \brief Open files
MSRStatus OpenDataFiles(bool append);
/// \brief Remove lock file
MSRStatus RemoveLockFile();
/// \brief Remove lock file
MSRStatus InitLockFile();
private:
const std::string kLockFileSuffix = "_Locker";
const std::string kPageFileSuffix = "_Pages";
std::string lock_file_; // lock file for parallel run
std::string pages_file_; // temporary file of pages info for parallel run
int shard_count_; // number of files
uint64_t header_size_; // header size
uint64_t page_size_; // page size
@ -211,7 +240,7 @@ class ShardWriter {
std::vector<uint64_t> raw_data_size_; // Raw data size
std::vector<uint64_t> blob_data_size_; // Blob data size
std::vector<string> file_paths_; // file paths
std::vector<std::string> file_paths_; // file paths
std::vector<std::shared_ptr<std::fstream>> file_streams_; // file handles
std::shared_ptr<ShardHeader> shard_header_; // shard headers

View File

@ -520,13 +520,16 @@ MSRStatus ShardIndexGenerator::ExecuteTransaction(const int &shard_no, const std
for (int raw_page_id : raw_page_ids) {
auto sql = GenerateRawSQL(fields_);
if (sql.first != SUCCESS) {
MS_LOG(ERROR) << "Generate raw SQL failed";
return FAILED;
}
auto data = GenerateRowData(shard_no, blob_id_to_page_id, raw_page_id, in);
if (data.first != SUCCESS) {
MS_LOG(ERROR) << "Generate raw data failed";
return FAILED;
}
if (BindParameterExecuteSQL(db.second, sql.second, data.second) == FAILED) {
MS_LOG(ERROR) << "Execute SQL failed";
return FAILED;
}
MS_LOG(INFO) << "Insert " << data.second.size() << " rows to index db.";

View File

@ -40,17 +40,7 @@ ShardWriter::~ShardWriter() {
}
}
MSRStatus ShardWriter::Open(const std::vector<std::string> &paths, bool append) {
shard_count_ = paths.size();
if (shard_count_ > kMaxShardCount || shard_count_ == 0) {
MS_LOG(ERROR) << "The Shard Count greater than max value or equal to 0.";
return FAILED;
}
if (schema_count_ > kMaxSchemaCount) {
MS_LOG(ERROR) << "The schema Count greater than max value.";
return FAILED;
}
MSRStatus ShardWriter::GetFullPathFromFileName(const std::vector<std::string> &paths) {
// Get full path from file name
for (const auto &path : paths) {
if (!CheckIsValidUtf8(path)) {
@ -60,7 +50,7 @@ MSRStatus ShardWriter::Open(const std::vector<std::string> &paths, bool append)
char resolved_path[PATH_MAX] = {0};
char buf[PATH_MAX] = {0};
if (strncpy_s(buf, PATH_MAX, common::SafeCStr(path), path.length()) != EOK) {
MS_LOG(ERROR) << "Securec func failed";
MS_LOG(ERROR) << "Secure func failed";
return FAILED;
}
#if defined(_WIN32) || defined(_WIN64)
@ -82,7 +72,10 @@ MSRStatus ShardWriter::Open(const std::vector<std::string> &paths, bool append)
#endif
file_paths_.emplace_back(string(resolved_path));
}
return SUCCESS;
}
MSRStatus ShardWriter::OpenDataFiles(bool append) {
// Open files
for (const auto &file : file_paths_) {
std::shared_ptr<std::fstream> fs = std::make_shared<std::fstream>();
@ -116,6 +109,67 @@ MSRStatus ShardWriter::Open(const std::vector<std::string> &paths, bool append)
return SUCCESS;
}
MSRStatus ShardWriter::RemoveLockFile() {
// Remove temporary file
int ret = std::remove(pages_file_.c_str());
if (ret == 0) {
MS_LOG(DEBUG) << "Remove page file.";
}
ret = std::remove(lock_file_.c_str());
if (ret == 0) {
MS_LOG(DEBUG) << "Remove lock file.";
}
return SUCCESS;
}
MSRStatus ShardWriter::InitLockFile() {
if (file_paths_.size() == 0) {
MS_LOG(ERROR) << "File path not initialized.";
return FAILED;
}
lock_file_ = file_paths_[0] + kLockFileSuffix;
pages_file_ = file_paths_[0] + kPageFileSuffix;
if (RemoveLockFile() == FAILED) {
MS_LOG(ERROR) << "Remove file failed.";
return FAILED;
}
return SUCCESS;
}
MSRStatus ShardWriter::Open(const std::vector<std::string> &paths, bool append) {
shard_count_ = paths.size();
if (shard_count_ > kMaxShardCount || shard_count_ == 0) {
MS_LOG(ERROR) << "The Shard Count greater than max value or equal to 0.";
return FAILED;
}
if (schema_count_ > kMaxSchemaCount) {
MS_LOG(ERROR) << "The schema Count greater than max value.";
return FAILED;
}
// Get full path from file name
if (GetFullPathFromFileName(paths) == FAILED) {
MS_LOG(ERROR) << "Get full path from file name failed.";
return FAILED;
}
// Open files
if (OpenDataFiles(append) == FAILED) {
MS_LOG(ERROR) << "Open data files failed.";
return FAILED;
}
// Init lock file
if (InitLockFile() == FAILED) {
MS_LOG(ERROR) << "Init lock file failed.";
return FAILED;
}
return SUCCESS;
}
MSRStatus ShardWriter::OpenForAppend(const std::string &path) {
if (!IsLegalFile(path)) {
return FAILED;
@ -143,11 +197,28 @@ MSRStatus ShardWriter::OpenForAppend(const std::string &path) {
}
MSRStatus ShardWriter::Commit() {
// Read pages file
std::ifstream page_file(pages_file_.c_str());
if (page_file.good()) {
page_file.close();
if (shard_header_->FileToPages(pages_file_) == FAILED) {
MS_LOG(ERROR) << "Read pages from file failed";
return FAILED;
}
}
if (WriteShardHeader() == FAILED) {
MS_LOG(ERROR) << "Write metadata failed";
return FAILED;
}
MS_LOG(INFO) << "Write metadata successfully.";
// Remove lock file
if (RemoveLockFile() == FAILED) {
MS_LOG(ERROR) << "Remove lock file failed.";
return FAILED;
}
return SUCCESS;
}
@ -455,15 +526,75 @@ void ShardWriter::FillArray(int start, int end, std::map<uint64_t, vector<json>>
}
}
MSRStatus ShardWriter::WriteRawData(std::map<uint64_t, std::vector<json>> &raw_data,
std::vector<std::vector<uint8_t>> &blob_data, bool sign) {
int ShardWriter::LockWriter(bool parallel_writer) {
if (!parallel_writer) {
return 0;
}
#if defined(_WIN32) || defined(_WIN64)
MS_LOG(DEBUG) << "Lock file done by python.";
const int fd = 0;
#else
const int fd = open(lock_file_.c_str(), O_WRONLY | O_CREAT, 0666);
if (fd >= 0) {
flock(fd, LOCK_EX);
} else {
MS_LOG(ERROR) << "Shard writer failed when locking file";
return -1;
}
#endif
// Open files
file_streams_.clear();
for (const auto &file : file_paths_) {
std::shared_ptr<std::fstream> fs = std::make_shared<std::fstream>();
fs->open(common::SafeCStr(file), std::ios::in | std::ios::out | std::ios::binary);
if (fs->fail()) {
MS_LOG(ERROR) << "File could not opened";
return -1;
}
file_streams_.push_back(fs);
}
if (shard_header_->FileToPages(pages_file_) == FAILED) {
MS_LOG(ERROR) << "Read pages from file failed";
return -1;
}
return fd;
}
MSRStatus ShardWriter::UnlockWriter(int fd, bool parallel_writer) {
if (!parallel_writer) {
return SUCCESS;
}
if (shard_header_->PagesToFile(pages_file_) == FAILED) {
MS_LOG(ERROR) << "Write pages to file failed";
return FAILED;
}
for (int i = static_cast<int>(file_streams_.size()) - 1; i >= 0; i--) {
file_streams_[i]->close();
}
#if defined(_WIN32) || defined(_WIN64)
MS_LOG(DEBUG) << "Unlock file done by python.";
#else
flock(fd, LOCK_UN);
close(fd);
#endif
return SUCCESS;
}
MSRStatus ShardWriter::WriteRawDataPreCheck(std::map<uint64_t, std::vector<json>> &raw_data,
std::vector<std::vector<uint8_t>> &blob_data, bool sign, int *schema_count,
int *row_count) {
// check the free disk size
auto st_space = GetDiskSize(file_paths_[0], kFreeSize);
if (st_space.first != SUCCESS || st_space.second < kMinFreeDiskSize) {
MS_LOG(ERROR) << "IO error / there is no free disk to be used";
return FAILED;
}
// Add 4-bytes dummy blob data if no any blob fields
if (blob_data.size() == 0 && raw_data.size() > 0) {
blob_data = std::vector<std::vector<uint8_t>>(raw_data[0].size(), std::vector<uint8_t>(kUnsignedInt4, 0));
@ -479,10 +610,29 @@ MSRStatus ShardWriter::WriteRawData(std::map<uint64_t, std::vector<json>> &raw_d
MS_LOG(ERROR) << "Validate raw data failed";
return FAILED;
}
*schema_count = std::get<1>(v);
*row_count = std::get<2>(v);
return SUCCESS;
}
MSRStatus ShardWriter::WriteRawData(std::map<uint64_t, std::vector<json>> &raw_data,
std::vector<std::vector<uint8_t>> &blob_data, bool sign, bool parallel_writer) {
// Lock Writer if loading data parallel
int fd = LockWriter(parallel_writer);
if (fd < 0) {
MS_LOG(ERROR) << "Lock writer failed";
return FAILED;
}
// Get the count of schemas and rows
int schema_count = std::get<1>(v);
int row_count = std::get<2>(v);
int schema_count = 0;
int row_count = 0;
// Serialize raw data
if (WriteRawDataPreCheck(raw_data, blob_data, sign, &schema_count, &row_count) == FAILED) {
MS_LOG(ERROR) << "Check raw data failed";
return FAILED;
}
if (row_count == kInt0) {
MS_LOG(INFO) << "Raw data size is 0.";
@ -516,11 +666,17 @@ MSRStatus ShardWriter::WriteRawData(std::map<uint64_t, std::vector<json>> &raw_d
}
MS_LOG(INFO) << "Write " << bin_raw_data.size() << " records successfully.";
if (UnlockWriter(fd, parallel_writer) == FAILED) {
MS_LOG(ERROR) << "Unlock writer failed";
return FAILED;
}
return SUCCESS;
}
MSRStatus ShardWriter::WriteRawData(std::map<uint64_t, std::vector<py::handle>> &raw_data,
std::map<uint64_t, std::vector<py::handle>> &blob_data, bool sign) {
std::map<uint64_t, std::vector<py::handle>> &blob_data, bool sign,
bool parallel_writer) {
std::map<uint64_t, std::vector<json>> raw_data_json;
std::map<uint64_t, std::vector<json>> blob_data_json;
@ -554,11 +710,11 @@ MSRStatus ShardWriter::WriteRawData(std::map<uint64_t, std::vector<py::handle>>
MS_LOG(ERROR) << "Serialize raw data failed in write raw data";
return FAILED;
}
return WriteRawData(raw_data_json, bin_blob_data, sign);
return WriteRawData(raw_data_json, bin_blob_data, sign, parallel_writer);
}
MSRStatus ShardWriter::WriteRawData(std::map<uint64_t, std::vector<py::handle>> &raw_data,
vector<vector<uint8_t>> &blob_data, bool sign) {
vector<vector<uint8_t>> &blob_data, bool sign, bool parallel_writer) {
std::map<uint64_t, std::vector<json>> raw_data_json;
(void)std::transform(raw_data.begin(), raw_data.end(), std::inserter(raw_data_json, raw_data_json.end()),
[](const std::pair<uint64_t, std::vector<py::handle>> &pair) {
@ -568,7 +724,7 @@ MSRStatus ShardWriter::WriteRawData(std::map<uint64_t, std::vector<py::handle>>
[](const py::handle &obj) { return nlohmann::detail::ToJsonImpl(obj); });
return std::make_pair(pair.first, std::move(json_raw_data));
});
return WriteRawData(raw_data_json, blob_data, sign);
return WriteRawData(raw_data_json, blob_data, sign, parallel_writer);
}
MSRStatus ShardWriter::ParallelWriteData(const std::vector<std::vector<uint8_t>> &blob_data,

View File

@ -677,5 +677,43 @@ std::pair<std::shared_ptr<Statistics>, MSRStatus> ShardHeader::GetStatisticByID(
}
return std::make_pair(statistics_.at(statistic_id), SUCCESS);
}
MSRStatus ShardHeader::PagesToFile(const std::string dump_file_name) {
// write header content to file, dump whatever is in the file before
std::ofstream page_out_handle(dump_file_name.c_str(), std::ios_base::trunc | std::ios_base::out);
if (page_out_handle.fail()) {
MS_LOG(ERROR) << "Failed in opening page file";
return FAILED;
}
auto pages = SerializePage();
for (const auto &shard_pages : pages) {
page_out_handle << shard_pages << "\n";
}
page_out_handle.close();
return SUCCESS;
}
MSRStatus ShardHeader::FileToPages(const std::string dump_file_name) {
for (auto &v : pages_) { // clean pages
v.clear();
}
// attempt to open the file contains the page in json
std::ifstream page_in_handle(dump_file_name.c_str());
if (!page_in_handle.good()) {
MS_LOG(INFO) << "No page file exists.";
return SUCCESS;
}
std::string line;
while (std::getline(page_in_handle, line)) {
ParsePage(json::parse(line));
}
page_in_handle.close();
return SUCCESS;
}
} // namespace mindrecord
} // namespace mindspore

View File

@ -200,13 +200,24 @@ class FileWriter:
raw_data.pop(i)
logger.warning(v)
def write_raw_data(self, raw_data):
def open_and_set_header(self):
"""
Open writer and set header
"""
if not self._writer.is_open:
self._writer.open(self._paths)
if not self._writer.get_shard_header():
self._writer.set_shard_header(self._header)
def write_raw_data(self, raw_data, parallel_writer=False):
"""
Write raw data and generate sequential pair of MindRecord File and \
validate data based on predefined schema by default.
Args:
raw_data (list[dict]): List of raw data.
parallel_writer (bool, optional): Load data parallel if it equals to True (default=False).
Raises:
ParamTypeError: If index field is invalid.
@ -225,7 +236,7 @@ class FileWriter:
if not isinstance(each_raw, dict):
raise ParamTypeError('raw_data item', 'dict')
self._verify_based_on_schema(raw_data)
return self._writer.write_raw_data(raw_data, True)
return self._writer.write_raw_data(raw_data, True, parallel_writer)
def set_header_size(self, header_size):
"""

View File

@ -135,7 +135,7 @@ class ShardWriter:
def get_shard_header(self):
return self._header
def write_raw_data(self, data, validate=True):
def write_raw_data(self, data, validate=True, parallel_writer=False):
"""
Write raw data of cv dataset.
@ -145,6 +145,7 @@ class ShardWriter:
Args:
data (list[dict]): List of raw data.
validate (bool, optional): verify data according schema if it equals to True.
parallel_writer (bool, optional): Load data parallel if it equals to True.
Returns:
MSRStatus, SUCCESS or FAILED.
@ -165,7 +166,7 @@ class ShardWriter:
if row_raw:
raw_data.append(row_raw)
raw_data = {0: raw_data} if raw_data else {}
ret = self._writer.write_raw_data(raw_data, blob_data, validate)
ret = self._writer.write_raw_data(raw_data, blob_data, validate, parallel_writer)
if ret != ms.MSRStatus.SUCCESS:
logger.error("Failed to write dataset.")
raise MRMWriteDatasetError