forked from mindspore-Ecosystem/mindspore
!8703 GPU fix padding and dataset_helper
From: @VectorSL Reviewed-by: Signed-off-by:
commit afde772128
@@ -76,7 +76,7 @@ class Conv2dGpuFwdKernel : public GpuKernel {
     const float alpha = 1;
     const float beta = 0;
-    if ((pad_mode_ == kSamePadModeUpperCase || pad_mode_ == kSamePadModeLowerCase) && use_pad_) {
+    if (use_pad_) {
       T *padded_addr = GetDeviceAddress<T>(workspace, 1);
       if (data_format_ == kOpFormat_NHWC) {
         CalPadNHWC(padded_size_ / sizeof(T), input_addr, n_, old_height_, old_width_, c_, old_height_ + pad_height_,
@@ -133,23 +133,18 @@ class Conv2dGpuFwdKernel : public GpuKernel {
                    [](const int64_t &value) { return static_cast<int>(value); });
     pad_height_ = pad_list[0];
     pad_width_ = pad_list[2];
-    auto symmetry_pad = (pad_height_ == pad_list[1]) && (pad_width_ == pad_list[3]);
+    use_pad_ = !((pad_height_ == pad_list[1]) && (pad_width_ == pad_list[3]));
     pad_mode_ = GetAttr<std::string>(kernel_node, "pad_mode");
     SetStrideAndDilation(kernel_node);
     cudnnTensorDescriptor_t input_descriptor_real = nullptr;
     int padA[2];
     int strideA[2] = {stride_[2], stride_[3]};
     int dilaA[2] = {dilation_[2], dilation_[3]};
-    if (pad_mode_ == kSamePadModeUpperCase || pad_mode_ == kSamePadModeLowerCase || !symmetry_pad) {
+    if (use_pad_) {
       pad_height_ = pad_list[0] + pad_list[1];
       pad_width_ = pad_list[2] + pad_list[3];
       pad_top_ = pad_list[0];
       pad_left_ = pad_list[2];
-
-      // if use_pad_ == true, using zero padding in advance, else using the default cudnn pad.
-      if (pad_height_ % 2 == 0 && pad_width_ % 2 == 0) {
-        use_pad_ = false;
-      }
       int dimA[4];
       int strideApadded[4];
       if (data_format_ == kOpFormat_NCHW || data_format_ == kOpFormat_DEFAULT) {
@@ -165,18 +160,12 @@ class Conv2dGpuFwdKernel : public GpuKernel {
       }
       CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptor(padded_desc_, cudnn_data_type_, 4, dimA, strideApadded),
                                   "cudnnSetTensor4dDescriptor failed");
-
-      if (use_pad_) {
-        padA[0] = 0;
-        padA[1] = 0;
-      } else {
-        padA[0] = pad_top_;
-        padA[1] = pad_left_;
-      }
+      padA[0] = 0;
+      padA[1] = 0;
       CHECK_CUDNN_RET_WITH_EXCEPT(
         cudnnSetConvolutionNdDescriptor(conv_desc_, 2, padA, strideA, dilaA, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT),
         "cudnnSetConvolutionNdDescriptor failed");
-      input_descriptor_real = use_pad_ ? padded_desc_ : input_desc_;
+      input_descriptor_real = padded_desc_;
     } else {
       if (pad_mode_ == kValidPadModeUpperCase || pad_mode_ == kValidPadModeLowerCase) {
         pad_height_ = 0;
@@ -232,7 +221,7 @@ class Conv2dGpuFwdKernel : public GpuKernel {
     input_size_list_.push_back(input_size_);
     input_size_list_.push_back(filter_size_);
     output_size_list_.push_back(output_size_);
-    if ((pad_mode_ == kSamePadModeUpperCase || pad_mode_ == kSamePadModeLowerCase) && use_pad_ && !is_null_input_) {
+    if (use_pad_ && !is_null_input_) {
       CHECK_CUDNN_RET_WITH_EXCEPT(
         cudnnGetConvolutionForwardWorkspaceSize(cudnn_handle_, padded_desc_, filter_desc_, conv_desc_, output_desc_,
                                                 conv_algorithm_, &workspace_size_),
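Note: the same simplification is repeated in the filter-gradient and input-gradient kernels below. use_pad_ now records only whether the requested padding is asymmetric; in that case the input is zero-padded in advance and cuDNN is given a zero pad, otherwise the symmetric pad is handed to cuDNN directly. A minimal NumPy sketch of that decision (illustrative names, not MindSpore code):

```python
import numpy as np

def decide_padding(pad_list):
    """pad_list = [top, bottom, left, right], as in the kernel's pad_list attribute."""
    top, bottom, left, right = pad_list
    use_pad = not (top == bottom and left == right)   # use_pad_ = !symmetry_pad
    if use_pad:
        # asymmetric: pre-pad the input, cuDNN then sees padA = {0, 0}
        return {"use_pad": True, "pad_top": top, "pad_left": left,
                "pad_height": top + bottom, "pad_width": left + right,
                "cudnn_pad": (0, 0)}
    # symmetric: let cuDNN apply the pad itself
    return {"use_pad": False, "cudnn_pad": (top, left)}

def pre_pad_nchw(x, top, bottom, left, right):
    # counterpart of the CalPad/CalPadNHWC pre-padding step (NCHW layout)
    return np.pad(x, ((0, 0), (0, 0), (top, bottom), (left, right)))

print(decide_padding([1, 2, 1, 1]))   # asymmetric -> pre-pad, cudnn_pad (0, 0)
print(decide_padding([1, 1, 2, 2]))   # symmetric  -> cudnn_pad (1, 2)
```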
@@ -78,7 +78,7 @@ class ConvGradFilterGpuBkwKernel : public GpuKernel {
     const float alpha = 1;
     const float beta = 0;

-    if ((pad_mode_ == kSamePadModeUpperCase || pad_mode_ == kSamePadModeLowerCase) && use_pad_) {
+    if (use_pad_) {
       T *padded = GetDeviceAddress<T>(workspace, 1);
       if (data_format_ == kOpFormat_NHWC) {
         CalPadNHWC(padded_size_ / sizeof(T), x, n_, old_height_, old_width_, c_, old_height_ + pad_height_,
@@ -136,14 +136,14 @@ class ConvGradFilterGpuBkwKernel : public GpuKernel {
                    [](const int64_t &value) { return static_cast<int>(value); });
     pad_height_ = pad_list[0];
     pad_width_ = pad_list[2];
-    auto symmetry_pad = (pad_height_ == pad_list[1]) && (pad_width_ == pad_list[3]);
+    use_pad_ = !((pad_height_ == pad_list[1]) && (pad_width_ == pad_list[3]));
     pad_mode_ = GetAttr<std::string>(kernel_node, "pad_mode");
     SetStrideAndDilation(kernel_node);
     cudnnTensorDescriptor_t x_desc_real = nullptr;
     int padA[2];
     int strideA[2] = {stride_[0], stride_[1]};
     int dilaA[2] = {dilation_[2], dilation_[3]};
-    if (pad_mode_ == kSamePadModeUpperCase || pad_mode_ == kSamePadModeLowerCase || !symmetry_pad) {
+    if (use_pad_) {
       pad_height_ = pad_list[0] + pad_list[1];
       pad_width_ = pad_list[2] + pad_list[3];
       pad_top_ = pad_list[0];
@ -167,17 +167,12 @@ class ConvGradFilterGpuBkwKernel : public GpuKernel {
|
|||
CHECK_CUDNN_RET_WITH_EXCEPT(
|
||||
cudnnSetTensorNdDescriptor(padded_descriptor_, cudnn_data_type_, 4, dimA, strideApadded),
|
||||
"cudnnSetTensor4dDescriptor failed");
|
||||
if (use_pad_) {
|
||||
padA[0] = 0;
|
||||
padA[1] = 0;
|
||||
} else {
|
||||
padA[0] = pad_top_;
|
||||
padA[1] = pad_left_;
|
||||
}
|
||||
padA[0] = 0;
|
||||
padA[1] = 0;
|
||||
CHECK_CUDNN_RET_WITH_EXCEPT(
|
||||
cudnnSetConvolutionNdDescriptor(conv_desc_, 2, padA, strideA, dilaA, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT),
|
||||
"cudnnSetConvolutionNdDescriptor failed");
|
||||
x_desc_real = use_pad_ ? padded_descriptor_ : x_desc_;
|
||||
x_desc_real = padded_descriptor_;
|
||||
} else {
|
||||
if (pad_mode_ == kValidPadModeUpperCase || pad_mode_ == kValidPadModeLowerCase) {
|
||||
pad_height_ = 0;
|
||||
|
@@ -231,7 +226,7 @@ class ConvGradFilterGpuBkwKernel : public GpuKernel {
     input_size_list_.push_back(input_size_);
     output_size_list_.push_back(output_size_);

-    if ((pad_mode_ == kSamePadModeUpperCase || pad_mode_ == kSamePadModeLowerCase) && use_pad_ && !is_null_input_) {
+    if (use_pad_ && !is_null_input_) {
       CHECK_CUDNN_RET_WITH_EXCEPT(
         cudnnGetTensorSizeInBytes(padded_descriptor_, reinterpret_cast<size_t *>(&padded_size_)),
         "cudnnGetTensorSizeInBytes failed");
@@ -77,7 +77,7 @@ class ConvGradInputGpuBkwKernel : public GpuKernel {
     }

     const float alpha = 1;
-    if ((pad_mode_ == kSamePadModeUpperCase || pad_mode_ == kSamePadModeLowerCase) && use_pad_) {
+    if (use_pad_) {
       T *padded = GetDeviceAddress<T>(workspace, 1);

       CHECK_CUDNN_RET_WITH_EXCEPT(
@@ -139,14 +139,14 @@ class ConvGradInputGpuBkwKernel : public GpuKernel {
                    [](const int64_t &value) { return static_cast<int>(value); });
     pad_height_ = pad_list[0];
     pad_width_ = pad_list[2];
-    auto symmetry_pad = (pad_height_ == pad_list[1]) && (pad_width_ == pad_list[3]);
+    use_pad_ = !((pad_height_ == pad_list[1]) && (pad_width_ == pad_list[3]));
     pad_mode_ = GetAttr<std::string>(kernel_node, "pad_mode");
     SetStrideAndDilation(kernel_node);
     cudnnTensorDescriptor_t dx_desc_real = nullptr;
     int padA[2];
     int strideA[2] = {stride_[0], stride_[1]};
     int dilaA[2] = {dilation_[2], dilation_[3]};
-    if (pad_mode_ == kSamePadModeUpperCase || pad_mode_ == kSamePadModeLowerCase || !symmetry_pad) {
+    if (use_pad_) {
       pad_height_ = pad_list[0] + pad_list[1];
       pad_width_ = pad_list[2] + pad_list[3];
       pad_top_ = pad_list[0];
@@ -170,17 +170,12 @@ class ConvGradInputGpuBkwKernel : public GpuKernel {
       CHECK_CUDNN_RET_WITH_EXCEPT(
         cudnnSetTensorNdDescriptor(padded_descriptor_, cudnn_data_type_, 4, dimA, strideApadded),
         "cudnnSetTensor4dDescriptor failed");
-      if (use_pad_) {
-        padA[0] = 0;
-        padA[1] = 0;
-      } else {
-        padA[0] = pad_top_;
-        padA[1] = pad_left_;
-      }
+      padA[0] = 0;
+      padA[1] = 0;
       CHECK_CUDNN_RET_WITH_EXCEPT(
         cudnnSetConvolutionNdDescriptor(conv_desc_, 2, padA, strideA, dilaA, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT),
         "cudnnSetConvolutionNdDescriptor failed");
-      dx_desc_real = use_pad_ ? padded_descriptor_ : dx_desc_;
+      dx_desc_real = padded_descriptor_;
     } else {
       if (pad_mode_ == kValidPadModeUpperCase || pad_mode_ == kValidPadModeLowerCase) {
         pad_height_ = 0;
@@ -233,7 +228,7 @@ class ConvGradInputGpuBkwKernel : public GpuKernel {
     input_size_list_.push_back(w_size_);
     output_size_list_.push_back(output_size_);

-    if ((pad_mode_ == kSamePadModeUpperCase || pad_mode_ == kSamePadModeLowerCase) && use_pad_ && !is_null_input_) {
+    if (use_pad_ && !is_null_input_) {
       CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(padded_descriptor_, &padded_size_),
                                   "cudnnGetTensorSizeInBytes failed");
@@ -662,8 +662,9 @@ void Pipeline::Run() {
         auto manager = func_graph->manager();
         size_t graph_nums = manager->func_graphs().size();
         if (graph_nums == 1) {
-          resource_->set_gpu_loopsink_flag(true);
-          MS_LOG(INFO) << "Change gpu_loopsink_flag_ to true.";
+          int64_t sinksize = ConfigManager::GetInstance().iter_num();
+          resource_->set_gpu_loopsink(true, sinksize);
+          MS_LOG(INFO) << "Change gpu_loopsink_flag_ to true,set loopsink size to " << sinksize;
         }
       }
     }
@@ -834,7 +835,7 @@ py::object ExecutorPy::Run(const py::tuple &args, const py::object &phase) {
   }
   // Set loopsink size for each phase.
   bool is_loopsink = info_[phase_s]->resource->gpu_loopsink_flag();
-  int64_t sinksize = ConfigManager::GetInstance().iter_num();
+  int64_t sinksize = info_[phase_s]->resource->gpu_loopsink_size();
   ConfigManager::GetInstance().set_gpu_loopsink_size(is_loopsink ? sinksize : 1);
   // If target is not gpu or is loopsink, keep vmloop 1.
   bool g = (MsContext::GetInstance()->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kGPUDevice);
@@ -74,8 +74,12 @@ class Resource : public ResourceBase {
   const abstract::AbstractBasePtrList &args_spec() const { return args_spec_; }
   void set_args_spec(const abstract::AbstractBasePtrList &args_spec) { args_spec_ = args_spec; }

-  void set_gpu_loopsink_flag(const bool &flag) { gpu_loopsink_flag_ = flag; }
+  void set_gpu_loopsink(const bool &flag, const int64_t size) {
+    gpu_loopsink_flag_ = flag;
+    gpu_loopsink_size_ = size;
+  }
   bool gpu_loopsink_flag() { return gpu_loopsink_flag_; }
+  int64_t gpu_loopsink_size() { return gpu_loopsink_size_; }

   // Reclaim resource and clear the cache.
   // ExecutorPy::Compile() can be called multiple times, so cache
@@ -89,6 +93,7 @@ class Resource : public ResourceBase {
   py::object input_;
   bool is_cleaned_;
   bool gpu_loopsink_flag_{false};
+  int64_t gpu_loopsink_size_{1};
 };

 using ResourcePtr = std::shared_ptr<pipeline::Resource>;
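Note: taken together, the pipeline and Resource changes record the loopsink size next to the flag at compile time and read it back per phase at run time, rather than re-querying ConfigManager's iter_num when Run() is called (presumably so the size captured for each phase is the one that gets used). A hedged Python sketch of that bookkeeping (toy stand-ins, not the real API):

```python
class Resource:
    # toy stand-in for pipeline::Resource; member names mirror the C++ ones
    def __init__(self):
        self.gpu_loopsink_flag = False
        self.gpu_loopsink_size = 1

    def set_gpu_loopsink(self, flag, size):
        self.gpu_loopsink_flag = flag
        self.gpu_loopsink_size = size

# compile time (Pipeline::Run): store the sink size together with the flag
res = Resource()
res.set_gpu_loopsink(True, 1875)   # 1875 stands in for iter_num at compile time

# run time (ExecutorPy::Run): use the size recorded for this phase,
# not whatever the global iter_num happens to be at that moment
loopsink_size = res.gpu_loopsink_size if res.gpu_loopsink_flag else 1
print(loopsink_size)   # -> 1875
```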
@@ -129,6 +129,8 @@ class DatasetHelper:
         Validator.check_is_int(sink_size)
+        if sink_size < -1 or sink_size == 0:
+            raise ValueError("The sink_size must be -1 or positive, but got sink_size {}.".format(sink_size))
         if sink_size == -1:
             sink_size = dataset.get_dataset_size()

         if dataset_sink_mode:
             if context.get_context("enable_ge"):
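Note: the added check rejects zero and any negative sink_size other than -1 before the -1 case is expanded to the full dataset size. A small standalone sketch of the resulting behaviour (helper name is illustrative):

```python
def resolve_sink_size(sink_size, dataset_size):
    # mirrors the validation added to DatasetHelper.__init__
    if sink_size < -1 or sink_size == 0:
        raise ValueError("The sink_size must be -1 or positive, but got sink_size {}.".format(sink_size))
    return dataset_size if sink_size == -1 else sink_size

print(resolve_sink_size(-1, 1875))    # -> 1875 (sink the whole dataset)
print(resolve_sink_size(100, 1875))   # -> 100
# resolve_sink_size(0, 1875) raises ValueError
```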
@@ -14,15 +14,14 @@
 # limitations under the License.
 # ============================================================================
 # an simple tutorial as follows, more parameters can be setting
-if [ $# != 3 ]
+if [ $# != 2 ]
 then
-  echo "Usage: sh run_standalone_train_gpu.sh [cifar10|imagenet] [DATA_PATH] [DEVICE_ID]"
+  echo "Usage: sh run_standalone_train_gpu.sh [cifar10|imagenet] [DATA_PATH]"
   exit 1
 fi

 export DATASET_NAME=$1
 export DATA_PATH=$2
-export DEVICE_ID=$3

 python train.py --dataset_name=$DATASET_NAME --data_path=$DATA_PATH \
-                --device_id=$DEVICE_ID --device_target="GPU" > log 2>&1 &
+                --device_target="GPU" > log 2>&1 &
@@ -345,11 +345,11 @@ epoch: 5 step: 5004, loss is 3.3501816

 ```
 # ========START RESNET50 GPU BENCHMARK========
-step time: 12416.098 ms, fps: 412 img/sec. epoch: 1 step: 20, loss is 6.940182
-step time: 3472.037 ms, fps: 1474 img/sec. epoch: 2 step: 20, loss is 7.078993
-step time: 3469.523 ms, fps: 1475 img/sec. epoch: 3 step: 20, loss is 7.559594
-step time: 3460.311 ms, fps: 1479 img/sec. epoch: 4 step: 20, loss is 6.920937
-step time: 3460.543 ms, fps: 1479 img/sec. epoch: 5 step: 20, loss is 6.814013
+Epoch time: 12416.098 ms, fps: 412 img/sec. epoch: 1 step: 20, loss is 6.940182
+Epoch time: 3472.037 ms, fps: 1474 img/sec. epoch: 2 step: 20, loss is 7.078993
+Epoch time: 3469.523 ms, fps: 1475 img/sec. epoch: 3 step: 20, loss is 7.559594
+Epoch time: 3460.311 ms, fps: 1479 img/sec. epoch: 4 step: 20, loss is 6.920937
+Epoch time: 3460.543 ms, fps: 1479 img/sec. epoch: 5 step: 20, loss is 6.814013
 ...
 ```
 ## [Evaluation Process](#contents)
@@ -53,7 +53,7 @@ class MyTimeMonitor(Callback):
     def step_end(self, run_context):
         step_mseconds = (time.time() - self.step_time) * 1000
         fps = self.batch_size / step_mseconds *1000 * self.size
-        print("step time: {:5.3f} ms, fps: {:d} img/sec.".format(step_mseconds, int(fps)), flush=True, end=" ")
+        print("Epoch time: {:5.3f} ms, fps: {:d} img/sec.".format(step_mseconds, int(fps)), flush=True, end=" ")

 def pad(image):
     zeros = np.zeros([224, 224, 1], dtype=np.uint8)
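Note: renaming the printout from "step time" to "Epoch time" is consistent with the fps formula already in place: the elapsed interval is multiplied by self.size, i.e. one step_end call covers the self.size steps sunk per epoch, so what it measures is effectively an epoch time. A hedged standalone sketch of the computation (not the full MindSpore Callback API):

```python
import time

class MyTimeMonitor:
    def __init__(self, batch_size, size):
        self.batch_size = batch_size   # images per step
        self.size = size               # steps executed per sink (one epoch here)
        self.step_time = time.time()

    def step_end(self):
        # one callback covers self.size real steps, so this is an epoch time
        step_mseconds = (time.time() - self.step_time) * 1000
        fps = self.batch_size / step_mseconds * 1000 * self.size
        print("Epoch time: {:5.3f} ms, fps: {:d} img/sec.".format(step_mseconds, int(fps)),
              flush=True, end=" ")

monitor = MyTimeMonitor(batch_size=256, size=20)
time.sleep(0.01)
monitor.step_end()
```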