!8703 GPU fix padding and dataset_helper

From: @VectorSL
Reviewed-by: 
Signed-off-by:
commit afde772128 by mindspore-ci-bot, 2020-11-19 15:22:19 +08:00 (committed by Gitee)
9 changed files with 42 additions and 56 deletions


@@ -76,7 +76,7 @@ class Conv2dGpuFwdKernel : public GpuKernel {
     const float alpha = 1;
     const float beta = 0;
-    if ((pad_mode_ == kSamePadModeUpperCase || pad_mode_ == kSamePadModeLowerCase) && use_pad_) {
+    if (use_pad_) {
      T *padded_addr = GetDeviceAddress<T>(workspace, 1);
      if (data_format_ == kOpFormat_NHWC) {
        CalPadNHWC(padded_size_ / sizeof(T), input_addr, n_, old_height_, old_width_, c_, old_height_ + pad_height_,
@@ -133,23 +133,18 @@ class Conv2dGpuFwdKernel : public GpuKernel {
                         [](const int64_t &value) { return static_cast<int>(value); });
    pad_height_ = pad_list[0];
    pad_width_ = pad_list[2];
-    auto symmetry_pad = (pad_height_ == pad_list[1]) && (pad_width_ == pad_list[3]);
+    use_pad_ = !((pad_height_ == pad_list[1]) && (pad_width_ == pad_list[3]));
    pad_mode_ = GetAttr<std::string>(kernel_node, "pad_mode");
    SetStrideAndDilation(kernel_node);
    cudnnTensorDescriptor_t input_descriptor_real = nullptr;
    int padA[2];
    int strideA[2] = {stride_[2], stride_[3]};
    int dilaA[2] = {dilation_[2], dilation_[3]};
-    if (pad_mode_ == kSamePadModeUpperCase || pad_mode_ == kSamePadModeLowerCase || !symmetry_pad) {
+    if (use_pad_) {
      pad_height_ = pad_list[0] + pad_list[1];
      pad_width_ = pad_list[2] + pad_list[3];
      pad_top_ = pad_list[0];
      pad_left_ = pad_list[2];
-      // if use_pad_ == true, using zero padding in advance, else using the default cudnn pad.
-      if (pad_height_ % 2 == 0 && pad_width_ % 2 == 0) {
-        use_pad_ = false;
-      }
      int dimA[4];
      int strideApadded[4];
      if (data_format_ == kOpFormat_NCHW || data_format_ == kOpFormat_DEFAULT) {
@@ -165,18 +160,12 @@ class Conv2dGpuFwdKernel : public GpuKernel {
      }
      CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptor(padded_desc_, cudnn_data_type_, 4, dimA, strideApadded),
                                  "cudnnSetTensor4dDescriptor failed");
-      if (use_pad_) {
-        padA[0] = 0;
-        padA[1] = 0;
-      } else {
-        padA[0] = pad_top_;
-        padA[1] = pad_left_;
-      }
+      padA[0] = 0;
+      padA[1] = 0;
      CHECK_CUDNN_RET_WITH_EXCEPT(
        cudnnSetConvolutionNdDescriptor(conv_desc_, 2, padA, strideA, dilaA, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT),
        "cudnnSetConvolutionNdDescriptor failed");
-      input_descriptor_real = use_pad_ ? padded_desc_ : input_desc_;
+      input_descriptor_real = padded_desc_;
    } else {
      if (pad_mode_ == kValidPadModeUpperCase || pad_mode_ == kValidPadModeLowerCase) {
        pad_height_ = 0;
@@ -232,7 +221,7 @@ class Conv2dGpuFwdKernel : public GpuKernel {
    input_size_list_.push_back(input_size_);
    input_size_list_.push_back(filter_size_);
    output_size_list_.push_back(output_size_);
-    if ((pad_mode_ == kSamePadModeUpperCase || pad_mode_ == kSamePadModeLowerCase) && use_pad_ && !is_null_input_) {
+    if (use_pad_ && !is_null_input_) {
      CHECK_CUDNN_RET_WITH_EXCEPT(
        cudnnGetConvolutionForwardWorkspaceSize(cudnn_handle_, padded_desc_, filter_desc_, conv_desc_, output_desc_,
                                                conv_algorithm_, &workspace_size_),
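The same rewrite appears in all three convolution kernels in this patch (forward above, filter- and input-gradient below): `use_pad_` now means "the requested padding is asymmetric", in which case the kernel zero-pads the input in advance and hands cuDNN a zero pad. A minimal Python sketch of that rule, assuming `pad_list` is laid out as (top, bottom, left, right); the helper name is hypothetical, not MindSpore code:

```python
# Hypothetical sketch (not MindSpore source) of the padding rule this patch
# converges on, for pad_list = (top, bottom, left, right).
def choose_conv_padding(pad_list):
    top, bottom, left, right = pad_list
    # New rule: pad explicitly only when cuDNN's symmetric padA cannot
    # express the requested padding.
    use_pad = (top != bottom) or (left != right)
    if use_pad:
        # Zero-pad the input in advance and hand cuDNN a zero pad.
        return use_pad, (0, 0), (top + bottom, left + right)
    # Symmetric case: let cuDNN pad; no pre-padded buffer is needed.
    return use_pad, (top, left), (0, 0)

# The removed code also cleared use_pad_ whenever the total pad per axis was
# even, which mislabels e.g. (0, 2, 0, 2): even totals, but asymmetric.
print(choose_conv_padding((0, 2, 0, 2)))  # (True, (0, 0), (2, 2))
print(choose_conv_padding((1, 1, 1, 1)))  # (False, (1, 1), (0, 0))
```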


@@ -78,7 +78,7 @@ class ConvGradFilterGpuBkwKernel : public GpuKernel {
     const float alpha = 1;
     const float beta = 0;
-    if ((pad_mode_ == kSamePadModeUpperCase || pad_mode_ == kSamePadModeLowerCase) && use_pad_) {
+    if (use_pad_) {
      T *padded = GetDeviceAddress<T>(workspace, 1);
      if (data_format_ == kOpFormat_NHWC) {
        CalPadNHWC(padded_size_ / sizeof(T), x, n_, old_height_, old_width_, c_, old_height_ + pad_height_,
@@ -136,14 +136,14 @@ class ConvGradFilterGpuBkwKernel : public GpuKernel {
                         [](const int64_t &value) { return static_cast<int>(value); });
    pad_height_ = pad_list[0];
    pad_width_ = pad_list[2];
-    auto symmetry_pad = (pad_height_ == pad_list[1]) && (pad_width_ == pad_list[3]);
+    use_pad_ = !((pad_height_ == pad_list[1]) && (pad_width_ == pad_list[3]));
    pad_mode_ = GetAttr<std::string>(kernel_node, "pad_mode");
    SetStrideAndDilation(kernel_node);
    cudnnTensorDescriptor_t x_desc_real = nullptr;
    int padA[2];
    int strideA[2] = {stride_[0], stride_[1]};
    int dilaA[2] = {dilation_[2], dilation_[3]};
-    if (pad_mode_ == kSamePadModeUpperCase || pad_mode_ == kSamePadModeLowerCase || !symmetry_pad) {
+    if (use_pad_) {
      pad_height_ = pad_list[0] + pad_list[1];
      pad_width_ = pad_list[2] + pad_list[3];
      pad_top_ = pad_list[0];
@@ -167,17 +167,12 @@ class ConvGradFilterGpuBkwKernel : public GpuKernel {
      CHECK_CUDNN_RET_WITH_EXCEPT(
        cudnnSetTensorNdDescriptor(padded_descriptor_, cudnn_data_type_, 4, dimA, strideApadded),
        "cudnnSetTensor4dDescriptor failed");
-      if (use_pad_) {
-        padA[0] = 0;
-        padA[1] = 0;
-      } else {
-        padA[0] = pad_top_;
-        padA[1] = pad_left_;
-      }
+      padA[0] = 0;
+      padA[1] = 0;
      CHECK_CUDNN_RET_WITH_EXCEPT(
        cudnnSetConvolutionNdDescriptor(conv_desc_, 2, padA, strideA, dilaA, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT),
        "cudnnSetConvolutionNdDescriptor failed");
-      x_desc_real = use_pad_ ? padded_descriptor_ : x_desc_;
+      x_desc_real = padded_descriptor_;
    } else {
      if (pad_mode_ == kValidPadModeUpperCase || pad_mode_ == kValidPadModeLowerCase) {
        pad_height_ = 0;
@@ -231,7 +226,7 @@ class ConvGradFilterGpuBkwKernel : public GpuKernel {
    input_size_list_.push_back(input_size_);
    output_size_list_.push_back(output_size_);
-    if ((pad_mode_ == kSamePadModeUpperCase || pad_mode_ == kSamePadModeLowerCase) && use_pad_ && !is_null_input_) {
+    if (use_pad_ && !is_null_input_) {
      CHECK_CUDNN_RET_WITH_EXCEPT(
        cudnnGetTensorSizeInBytes(padded_descriptor_, reinterpret_cast<size_t *>(&padded_size_)),
        "cudnnGetTensorSizeInBytes failed");


@@ -77,7 +77,7 @@ class ConvGradInputGpuBkwKernel : public GpuKernel {
    }
    const float alpha = 1;
-    if ((pad_mode_ == kSamePadModeUpperCase || pad_mode_ == kSamePadModeLowerCase) && use_pad_) {
+    if (use_pad_) {
      T *padded = GetDeviceAddress<T>(workspace, 1);
      CHECK_CUDNN_RET_WITH_EXCEPT(
@@ -139,14 +139,14 @@ class ConvGradInputGpuBkwKernel : public GpuKernel {
                         [](const int64_t &value) { return static_cast<int>(value); });
    pad_height_ = pad_list[0];
    pad_width_ = pad_list[2];
-    auto symmetry_pad = (pad_height_ == pad_list[1]) && (pad_width_ == pad_list[3]);
+    use_pad_ = !((pad_height_ == pad_list[1]) && (pad_width_ == pad_list[3]));
    pad_mode_ = GetAttr<std::string>(kernel_node, "pad_mode");
    SetStrideAndDilation(kernel_node);
    cudnnTensorDescriptor_t dx_desc_real = nullptr;
    int padA[2];
    int strideA[2] = {stride_[0], stride_[1]};
    int dilaA[2] = {dilation_[2], dilation_[3]};
-    if (pad_mode_ == kSamePadModeUpperCase || pad_mode_ == kSamePadModeLowerCase || !symmetry_pad) {
+    if (use_pad_) {
      pad_height_ = pad_list[0] + pad_list[1];
      pad_width_ = pad_list[2] + pad_list[3];
      pad_top_ = pad_list[0];
@@ -170,17 +170,12 @@ class ConvGradInputGpuBkwKernel : public GpuKernel {
      CHECK_CUDNN_RET_WITH_EXCEPT(
        cudnnSetTensorNdDescriptor(padded_descriptor_, cudnn_data_type_, 4, dimA, strideApadded),
        "cudnnSetTensor4dDescriptor failed");
-      if (use_pad_) {
-        padA[0] = 0;
-        padA[1] = 0;
-      } else {
-        padA[0] = pad_top_;
-        padA[1] = pad_left_;
-      }
+      padA[0] = 0;
+      padA[1] = 0;
      CHECK_CUDNN_RET_WITH_EXCEPT(
        cudnnSetConvolutionNdDescriptor(conv_desc_, 2, padA, strideA, dilaA, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT),
        "cudnnSetConvolutionNdDescriptor failed");
-      dx_desc_real = use_pad_ ? padded_descriptor_ : dx_desc_;
+      dx_desc_real = padded_descriptor_;
    } else {
      if (pad_mode_ == kValidPadModeUpperCase || pad_mode_ == kValidPadModeLowerCase) {
        pad_height_ = 0;
@@ -233,7 +228,7 @@ class ConvGradInputGpuBkwKernel : public GpuKernel {
    input_size_list_.push_back(w_size_);
    output_size_list_.push_back(output_size_);
-    if ((pad_mode_ == kSamePadModeUpperCase || pad_mode_ == kSamePadModeLowerCase) && use_pad_ && !is_null_input_) {
+    if (use_pad_ && !is_null_input_) {
      CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(padded_descriptor_, &padded_size_),
                                  "cudnnGetTensorSizeInBytes failed");


@@ -662,8 +662,9 @@ void Pipeline::Run() {
      auto manager = func_graph->manager();
      size_t graph_nums = manager->func_graphs().size();
      if (graph_nums == 1) {
-        resource_->set_gpu_loopsink_flag(true);
-        MS_LOG(INFO) << "Change gpu_loopsink_flag_ to true.";
+        int64_t sinksize = ConfigManager::GetInstance().iter_num();
+        resource_->set_gpu_loopsink(true, sinksize);
+        MS_LOG(INFO) << "Change gpu_loopsink_flag_ to true, set loopsink size to " << sinksize;
      }
    }
  }
@@ -834,7 +835,7 @@ py::object ExecutorPy::Run(const py::tuple &args, const py::object &phase) {
  }
  // Set loopsink size for each phase.
  bool is_loopsink = info_[phase_s]->resource->gpu_loopsink_flag();
-  int64_t sinksize = ConfigManager::GetInstance().iter_num();
+  int64_t sinksize = info_[phase_s]->resource->gpu_loopsink_size();
  ConfigManager::GetInstance().set_gpu_loopsink_size(is_loopsink ? sinksize : 1);
  // If target is not gpu or is loopsink, keep vmloop 1.
  bool g = (MsContext::GetInstance()->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kGPUDevice);


@@ -74,8 +74,12 @@ class Resource : public ResourceBase {
  const abstract::AbstractBasePtrList &args_spec() const { return args_spec_; }
  void set_args_spec(const abstract::AbstractBasePtrList &args_spec) { args_spec_ = args_spec; }
-  void set_gpu_loopsink_flag(const bool &flag) { gpu_loopsink_flag_ = flag; }
+  void set_gpu_loopsink(const bool &flag, const int64_t size) {
+    gpu_loopsink_flag_ = flag;
+    gpu_loopsink_size_ = size;
+  }
  bool gpu_loopsink_flag() { return gpu_loopsink_flag_; }
+  int64_t gpu_loopsink_size() { return gpu_loopsink_size_; }
  // Reclaim resource and clear the cache.
  // ExecutorPy::Compile() can be called multiple times, so cache
@@ -89,6 +93,7 @@ class Resource : public ResourceBase {
  py::object input_;
  bool is_cleaned_;
  bool gpu_loopsink_flag_{false};
+  int64_t gpu_loopsink_size_{1};
 };
 using ResourcePtr = std::shared_ptr<pipeline::Resource>;
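Read together, the pipeline.cc and resource.h changes cache the loopsink size on each phase's Resource at compile time rather than re-reading the global iter_num when the phase runs. A rough Python paraphrase of that flow (class and function names here are hypothetical, not MindSpore API):

```python
# Hypothetical paraphrase of the C++ control flow above: capture the sink
# size per phase at compile time, read it back at run time.
class Resource:
    def __init__(self):
        self.gpu_loopsink_flag = False
        self.gpu_loopsink_size = 1

    def set_gpu_loopsink(self, flag, size):
        self.gpu_loopsink_flag = flag
        self.gpu_loopsink_size = size

def compile_phase(resource, iter_num, graph_nums):
    # Pipeline::Run (compile side): record the sink size for this phase now;
    # the global iter_num may belong to a different phase by run time.
    if graph_nums == 1:
        resource.set_gpu_loopsink(True, iter_num)

def run_phase(resource):
    # ExecutorPy::Run: the old code re-read the global iter_num here, which
    # reflects the most recently compiled phase, not necessarily this one.
    return resource.gpu_loopsink_size if resource.gpu_loopsink_flag else 1

r = Resource()
compile_phase(r, iter_num=100, graph_nums=1)
print(run_phase(r))  # 100, even if another phase later changes the global iter_num
```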


@@ -129,6 +129,8 @@ class DatasetHelper:
         Validator.check_is_int(sink_size)
         if sink_size < -1 or sink_size == 0:
             raise ValueError("The sink_size must be -1 or positive, but got sink_size {}.".format(sink_size))
+        if sink_size == -1:
+            sink_size = dataset.get_dataset_size()
         if dataset_sink_mode:
             if context.get_context("enable_ge"):
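After this change, sink_size == -1 is resolved to the dataset's full size up front rather than being passed along for later interpretation. A small sketch of the resulting validation path (a hypothetical standalone function; in the patch this logic lives inside DatasetHelper's initializer):

```python
# Hypothetical standalone version of the sink_size check after this patch.
def resolve_sink_size(sink_size, dataset_size):
    if sink_size < -1 or sink_size == 0:
        raise ValueError(
            "The sink_size must be -1 or positive, but got sink_size {}.".format(sink_size))
    if sink_size == -1:
        # -1 now means "sink the whole dataset each epoch", resolved eagerly.
        sink_size = dataset_size
    return sink_size

assert resolve_sink_size(-1, 1875) == 1875
assert resolve_sink_size(100, 1875) == 100
```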


@@ -14,15 +14,14 @@
 # limitations under the License.
 # ============================================================================
 # an simple tutorial as follows, more parameters can be setting
-if [ $# != 3 ]
+if [ $# != 2 ]
 then
-    echo "Usage: sh run_standalone_train_gpu.sh [cifar10|imagenet] [DATA_PATH] [DEVICE_ID]"
+    echo "Usage: sh run_standalone_train_gpu.sh [cifar10|imagenet] [DATA_PATH]"
     exit 1
 fi
 export DATASET_NAME=$1
 export DATA_PATH=$2
-export DEVICE_ID=$3
 python train.py --dataset_name=$DATASET_NAME --data_path=$DATA_PATH \
-    --device_id=$DEVICE_ID --device_target="GPU" > log 2>&1 &
+    --device_target="GPU" > log 2>&1 &


@@ -345,11 +345,11 @@ epoch: 5 step: 5004, loss is 3.3501816
 ```
 # ========START RESNET50 GPU BENCHMARK========
-step time: 12416.098 ms, fps: 412 img/sec. epoch: 1 step: 20, loss is 6.940182
-step time: 3472.037 ms, fps: 1474 img/sec. epoch: 2 step: 20, loss is 7.078993
-step time: 3469.523 ms, fps: 1475 img/sec. epoch: 3 step: 20, loss is 7.559594
-step time: 3460.311 ms, fps: 1479 img/sec. epoch: 4 step: 20, loss is 6.920937
-step time: 3460.543 ms, fps: 1479 img/sec. epoch: 5 step: 20, loss is 6.814013
+Epoch time: 12416.098 ms, fps: 412 img/sec. epoch: 1 step: 20, loss is 6.940182
+Epoch time: 3472.037 ms, fps: 1474 img/sec. epoch: 2 step: 20, loss is 7.078993
+Epoch time: 3469.523 ms, fps: 1475 img/sec. epoch: 3 step: 20, loss is 7.559594
+Epoch time: 3460.311 ms, fps: 1479 img/sec. epoch: 4 step: 20, loss is 6.920937
+Epoch time: 3460.543 ms, fps: 1479 img/sec. epoch: 5 step: 20, loss is 6.814013
 ...
 ```
 ## [Evaluation Process](#contents)


@@ -53,7 +53,7 @@ class MyTimeMonitor(Callback):
     def step_end(self, run_context):
         step_mseconds = (time.time() - self.step_time) * 1000
         fps = self.batch_size / step_mseconds * 1000 * self.size
-        print("step time: {:5.3f} ms, fps: {:d} img/sec.".format(step_mseconds, int(fps)), flush=True, end=" ")
+        print("Epoch time: {:5.3f} ms, fps: {:d} img/sec.".format(step_mseconds, int(fps)), flush=True, end=" ")
 def pad(image):
     zeros = np.zeros([224, 224, 1], dtype=np.uint8)
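For reference, the printed fps is batch_size / step_mseconds * 1000 * size, where size is the number of steps covered by one sink, so the "Epoch time" label matches what the timer actually spans. A quick sanity check, assuming batch_size = 256 and size = 20 (values chosen to match the benchmark log above, not stated in this diff):

```python
# Sanity check of the fps formula; batch_size and size are assumptions
# consistent with the log's 3472.037 ms -> 1474 img/sec line.
batch_size, size = 256, 20   # images per step, steps per sink (assumed)
step_mseconds = 3472.037     # one sink's wall time, from the log
fps = batch_size / step_mseconds * 1000 * size
print(int(fps))  # 1474
```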