Support large files on Windows

This commit is contained in:
Xiao Tianci 2022-11-21 21:30:25 +08:00
parent 8bf5fadde5
commit f0f4a0b50d
8 changed files with 30 additions and 28 deletions

View File

@ -105,7 +105,7 @@ Status ValidateDatasetFilesParam(const std::string &dataset_name, const std::vec
LOG_AND_RETURN_STATUS_SYNTAX_ERROR(err_msg);
}
for (auto f : dataset_files) {
for (const auto &f : dataset_files) {
Path dataset_file(f);
if (!dataset_file.Exists()) {
std::string err_msg = dataset_name + ": " + file_name + ": [" + f + "] is invalid or does not exist.";

View File

@ -174,6 +174,7 @@ Status TFRecordNode::Build(std::vector<std::shared_ptr<DatasetOp>> *const node_o
// Create Schema Object
std::unique_ptr<DataSchema> data_schema = std::make_unique<DataSchema>();
if (!schema_path_.empty()) {
RETURN_IF_NOT_OK(ValidateDatasetFilesParam("TFRecordDataset", {schema_path_}));
RETURN_IF_NOT_OK(data_schema->LoadSchemaFile(schema_path_, columns_list_));
} else if (schema_obj_ != nullptr) {
std::string schema_json_string = schema_obj_->to_json();

View File

@ -5535,15 +5535,7 @@ TFRecord(const std::vector<std::string> &dataset_files, const T &schema = nullpt
VectorStringToChar(columns_list), num_samples, shuffle, num_shards, shard_id,
shard_equal_rows, cache, StringToChar(compression_type));
} else {
std::string schema_path = schema;
if (!schema_path.empty()) {
struct stat sb {};
int rc = stat(schema_path.c_str(), &sb);
if (rc != 0) {
return nullptr;
}
}
ds = std::make_shared<TFRecordDataset>(VectorStringToChar(dataset_files), StringToChar(schema_path),
ds = std::make_shared<TFRecordDataset>(VectorStringToChar(dataset_files), StringToChar(schema),
VectorStringToChar(columns_list), num_samples, shuffle, num_shards, shard_id,
shard_equal_rows, cache, StringToChar(compression_type));
}

View File

@ -2332,9 +2332,7 @@ Status ReadFile(const std::string &filename, std::shared_ptr<Tensor> *output) {
if (!realpath.has_value()) {
RETURN_STATUS_UNEXPECTED("ReadFile: Invalid file path, " + filename + " does not exist.");
}
struct stat sb;
stat(realpath.value().c_str(), &sb);
if (S_ISREG(sb.st_mode) == 0) {
if (!Path(realpath.value()).IsFile()) {
RETURN_STATUS_UNEXPECTED("ReadFile: Invalid file path, " + filename + " is not a regular file.");
}
@ -2350,9 +2348,7 @@ Status ReadImage(const std::string &filename, std::shared_ptr<Tensor> *output, I
std::string err_msg = "ReadImage: Invalid file path, " + filename + " does not exist.";
RETURN_STATUS_UNEXPECTED(err_msg);
}
struct stat sb;
stat(realpath.value().c_str(), &sb);
if (S_ISREG(sb.st_mode) == 0) {
if (!Path(realpath.value()).IsFile()) {
RETURN_STATUS_UNEXPECTED("ReadImage: Invalid file path, " + filename + " is not a regular file.");
}
@ -2416,9 +2412,7 @@ Status WriteFile(const std::string &filename, const std::shared_ptr<Tensor> &dat
if (!realpath.has_value()) {
RETURN_STATUS_UNEXPECTED("WriteFile: Invalid file path, " + filename + " failed to get the real path.");
}
struct stat sb;
stat(realpath.value().c_str(), &sb);
if (S_ISREG(sb.st_mode) == 0) {
if (!Path(realpath.value()).IsFile()) {
RETURN_STATUS_UNEXPECTED("WriteFile: Invalid file path, " + filename + " is not a regular file.");
}
@ -2501,9 +2495,7 @@ Status WriteJpeg(const std::string &filename, const std::shared_ptr<Tensor> &ima
if (!realpath.has_value()) {
RETURN_STATUS_UNEXPECTED("WriteJpeg: Invalid file path, " + filename + " failed to get the real path.");
}
struct stat sb;
stat(realpath.value().c_str(), &sb);
if (S_ISREG(sb.st_mode) == 0) {
if (!Path(realpath.value()).IsFile()) {
RETURN_STATUS_UNEXPECTED("WriteJpeg: Invalid file path, " + filename + " is not a regular file.");
}

View File

@ -21,7 +21,8 @@
#include <sstream>
#ifdef _MSC_VER
#include <direct.h> // for _mkdir
#include <direct.h> // for _mkdir
#define stat _stat64 // for file size exceeds (1<<31)-1 bytes
#endif
#include "./securec.h"
@ -57,7 +58,7 @@ Path::Path(const char *p) {
#endif
}
Path::Path(const Path &p) : path_(p.path_) {}
Path::Path(const Path &p) = default;
Path &Path::operator=(const Path &p) {
if (&p != this) {
@ -138,7 +139,7 @@ std::string Path::Extension() const {
}
bool Path::Exists() {
struct stat sb;
struct stat sb {};
int rc = stat(common::SafeCStr(path_), &sb);
if (rc == -1 && errno != ENOENT) {
MS_LOG(WARNING) << "Unable to query the status of " << path_ << ". Errno = " << errno << ".";
@ -147,7 +148,7 @@ bool Path::Exists() {
}
bool Path::IsDirectory() {
struct stat sb;
struct stat sb {};
int rc = stat(common::SafeCStr(path_), &sb);
if (rc == 0) {
return S_ISDIR(sb.st_mode);
@ -156,6 +157,16 @@ bool Path::IsDirectory() {
}
}
// Reports whether path_ names a regular file.
// Returns false when stat() fails for any reason; otherwise returns the
// result of S_ISREG on the reported mode.
bool Path::IsFile() {
  struct stat file_info {};
  // Guard clause: a non-zero return code from stat() means no usable file info.
  if (stat(common::SafeCStr(path_), &file_info) != 0) {
    return false;
  }
  return S_ISREG(file_info.st_mode);
}
Status Path::CreateDirectory(bool is_common_dir) {
if (!Exists()) {
#if defined(_WIN32) || defined(_WIN64)
@ -188,7 +199,7 @@ Status Path::CreateDirectory(bool is_common_dir) {
}
std::string Path::ParentPath() {
std::string r("");
std::string r;
std::size_t found = path_.find_last_of(separator_);
if (found != std::string::npos) {
if (found == 0) {

View File

@ -93,6 +93,8 @@ class Path {
bool IsDirectory();
bool IsFile();
Status CreateDirectory(bool is_common_dir = false);
Status CreateDirectories(bool is_common_dir = false);

View File

@ -19,6 +19,10 @@
#include "utils/ms_utils.h"
#include "./securec.h"
#ifdef _MSC_VER
#define stat _stat64 // for file size exceeds (1<<31)-1 bytes
#endif
namespace mindspore {
namespace mindrecord {
// split a string using a character

View File

@ -489,7 +489,7 @@ TEST_F(MindDataTestPipeline, TestTFRecordDatasetExeception) {
EXPECT_EQ(ds2->CreateIterator(), nullptr);
// This case is expected to fail because the schema file does not exist.
std::shared_ptr<Dataset> ds4 = TFRecord({file_path, "notexist.json"});
std::shared_ptr<Dataset> ds4 = TFRecord({file_path}, "notexist.json");
EXPECT_EQ(ds4->CreateIterator(), nullptr);
// This case expected to fail because num_samples is negative.