Increase the default level of parallelism in the minddata pipeline to 8.

Adjust memory usage so that it does not increase as parallelism increases; it
will stay at the same level it would be with a parallelism of 4.
RobinGrosman 2021-03-10 18:38:11 -08:00
parent 9add98350e
commit a0e75b845a
7 changed files with 37 additions and 10 deletions

View File

@@ -87,7 +87,7 @@ constexpr int64_t kDeMaxFreq = std::numeric_limits<int64_t>::max(); // 92233720
 constexpr int64_t kDeMaxTopk = std::numeric_limits<int64_t>::max();
 constexpr uint32_t kCfgRowsPerBuffer = 1;
-constexpr uint32_t kCfgParallelWorkers = 4;
+constexpr uint32_t kCfgParallelWorkers = 8;
 constexpr uint32_t kCfgWorkerConnectorSize = 16;
 constexpr uint32_t kCfgOpConnectorSize = 16;
 constexpr int32_t kCfgDefaultRankId = -1;
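
This constant backs the global default that the Python-side config reports. As a quick illustration (a sketch using the public mindspore.dataset config API, which also appears in the tests below):

import mindspore.dataset as ds

# With this commit, the compiled-in default becomes 8 parallel workers.
print(ds.config.get_num_parallel_workers())  # -> 8

# Pipelines tuned for the previous default can opt back in explicitly.
ds.config.set_num_parallel_workers(4)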

View File

@@ -77,7 +77,15 @@ BatchOp::BatchOp(int32_t batch_size, bool drop, bool pad, int32_t op_queue_size,
       pad_info_(pad_map),
       batch_num_(0),
       batch_cnt_(0) {
-  worker_queues_.Init(num_workers, op_queue_size);
+  // Adjust connector queue size. After batch, each row is batch_size times larger
+  int32_t queue_size;
+  queue_size = std::max(1, op_queue_size / start_batch_size_);
+  if (num_workers == 1) {
+    // ensure there are at least 2 queue slots for the whole operation; if there is only 1 worker, increase it to 2
+    queue_size = std::max(2, queue_size);
+  }
+  worker_queues_.Init(num_workers, queue_size);
 }
 // if PYTHON is disabled. per_batch_map can't be used
 #else
@@ -88,8 +96,16 @@ BatchOp::BatchOp(int32_t batch_size, bool drop, bool pad, int32_t op_queue_size,
       drop_(drop),
       pad_(pad),
       in_col_names_(cols_to_map),
-      pad_info_(pad_map) {
-  worker_queues_.Init(num_workers, op_queue_size);
+      pad_info_(pad_map),
+      batch_num_(0),
+      batch_cnt_(0) {
+  int32_t queue_size;
+  queue_size = std::max(1, op_queue_size / start_batch_size_);
+  if (num_workers == 1) {
+    // ensure there are at least 2 queue slots for the whole operation; if there is only 1 worker, increase it to 2
+    queue_size = std::max(2, queue_size);
+  }
+  worker_queues_.Init(num_workers, queue_size);
 }
 
 #endif
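
To make the intent of the queue-size adjustment concrete, here is the same computation traced in Python with illustrative numbers (a sketch; the op_queue_size and start_batch_size values are made up, but the logic mirrors the constructor above):

def batch_queue_size(op_queue_size, start_batch_size, num_workers):
    # Each queue slot now holds a full batch (start_batch_size rows),
    # so shrink the per-worker queue to keep total buffered rows comparable.
    queue_size = max(1, op_queue_size // start_batch_size)
    if num_workers == 1:
        # With a single worker, keep at least 2 slots so producer and consumer can overlap.
        queue_size = max(2, queue_size)
    return queue_size

print(batch_queue_size(16, 4, 4))   # -> 4: 4 slots x 4-row batches ~ 16 buffered rows, as before
print(batch_queue_size(16, 32, 1))  # -> 2: integer division gives 0, clamped to 1, then raised to 2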

View File

@@ -33,7 +33,13 @@ ParallelOp::ParallelOp(int32_t num_workers, int32_t op_connector_size, std::shar
       worker_connector_size_(1),
       worker_connector_(nullptr),
       num_workers_paused_(0),
-      epoch_sync_flag_(false) {}
+      epoch_sync_flag_(false) {
+  // Reduce excessive memory usage with high parallelism: when num_workers > 4,
+  // reduce op_connector_size so the total size stays similar to what it would be with only 4 workers
+  if (num_workers_ > 4) {
+    oc_queue_size_ = std::max(1, op_connector_size * 4 / num_workers_);
+  }
+}
 
 // Creates the internal worker connector for the parallel op if the derived class wants to use it
 Status ParallelOp::CreateWorkerConnector(int32_t worker_connector_size) {
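
With the new default of 8 workers and the default op_connector_size of 16 from the first file, this cap keeps total connector capacity at the 4-worker level. A small sketch of the arithmetic (illustrative only; it assumes total capacity scales as num_workers times the per-worker queue size):

op_connector_size = 16  # default kCfgOpConnectorSize from this commit

for num_workers in (2, 4, 8, 16):
    # Mirror of the ParallelOp constructor: only shrink beyond 4 workers.
    queue_size = op_connector_size
    if num_workers > 4:
        queue_size = max(1, op_connector_size * 4 // num_workers)
    print(num_workers, queue_size, num_workers * queue_size)
# -> 2 16 32 / 4 16 64 / 8 8 64 / 16 4 64: total slots plateau at the 4-worker level.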

View File

@@ -112,11 +112,16 @@ def set_prefetch_size(size):
     Set the number of rows to be prefetched.
 
     Args:
-        size (int): Total number of rows to be prefetched.
+        size (int): Total number of rows to be prefetched per operator per parallel worker.
 
     Raises:
         ValueError: If prefetch_size is invalid (<= 0 or > MAX_INT_32).
 
+    Note:
+        Since the total memory used for prefetch can grow very large with a high number of workers,
+        when the number of workers is > 4, the per-worker prefetch size will be reduced. The actual
+        prefetch size at runtime per worker will be prefetch_size * (4 / num_parallel_workers).
+
     Examples:
         >>> # Set a new global configuration value for the prefetch size.
         >>> ds.config.set_prefetch_size(1000)

View File

@@ -42,7 +42,7 @@ TEST_F(MindDataTestPipeline, TestConfigSetting) {
   EXPECT_EQ(load_status, true);
 
   // Test configuration loaded
-  EXPECT_EQ(config::get_num_parallel_workers(), 4);
+  EXPECT_EQ(config::get_num_parallel_workers(), 8);
   EXPECT_EQ(config::get_prefetch_size(), 16);
   EXPECT_EQ(config::get_seed(), 5489);
   EXPECT_EQ(config::get_monitor_sampling_interval(), 15);

View File

@@ -1,7 +1,7 @@
 {
     "logFilePath": "/tmp",
     "rowsPerBuffer": 1,
-    "numParallelWorkers": 4,
+    "numParallelWorkers": 8,
     "workerConnectorSize": 16,
     "opConnectorSize": 16,
     "seed": 5489,

View File

@@ -44,7 +44,7 @@ def test_basic():
     ds.config.load('../data/dataset/declient.cfg')
 
     # assert ds.config.get_rows_per_buffer() == 32
-    assert ds.config.get_num_parallel_workers() == 4
+    assert ds.config.get_num_parallel_workers() == 8
     # assert ds.config.get_worker_connector_size() == 16
     assert ds.config.get_prefetch_size() == 16
     assert ds.config.get_seed() == 5489
@@ -348,7 +348,7 @@ def test_deterministic_python_seed_multi_thread():
     try:
         np.testing.assert_equal(data1_output, data2_output)
     except Exception as e:
-        # expect output to not match during multi-threaded excution
+        # expect output to not match during multi-threaded execution
        logger.info("Got an exception in DE: {}".format(str(e)))
         assert "Array" in str(e)