Merge pull request #2144 from rbberger/tokenizer-performance-fixes

Performance optimization of Tokenizer
2020-06-12 04:20:05 -04:00 · 2020-06-12 04:20:05 -04:00 · 5f86bac419
parent fc9bbd4d06 3c99471df8
commit 5f86bac419
13 changed files with 252 additions and 110 deletions
--- a/src/MANYBODY/pair_eim.cpp
+++ b/src/MANYBODY/pair_eim.cpp
@ -1123,7 +1123,7 @@ char * EIMPotentialFileReader::next_line(FILE * fp) {
      concat = false;
    }
-    nwords = utils::count_words(line);
+    nwords += utils::count_words(&line[n]);
    // skip line if blank
    if (nwords > 0) {
--- a/src/atom_vec.cpp
+++ b/src/atom_vec.cpp
@ -2448,11 +2448,11 @@ int AtomVec::process_fields(char *str, const char *default_str, Method *method)
  }
  // tokenize words in both strings
-  Tokenizer words(str, " ");
+  std::vector<std::string> words = Tokenizer(str, " ").as_vector();
-  Tokenizer def_words(default_str, " ");
+  std::vector<std::string> def_words = Tokenizer(default_str, " ").as_vector();
-  int nfield = words.count();
+  int nfield = words.size();
-  int ndef   = def_words.count();
+  int ndef   = def_words.size();
  // process fields one by one, add to index vector
--- a/src/atom_vec_hybrid.cpp
+++ b/src/atom_vec_hybrid.cpp
@ -514,8 +514,8 @@ char *AtomVecHybrid::merge_fields(int inum, char *root,
  // identify unique words in concatenated string
-  Tokenizer words(concat, " ");
+  std::vector<std::string> words = Tokenizer(concat, " ").as_vector();
-  int nwords = words.count();
+  int nwords = words.size();
  int *unique = new int[nwords];
--- a/src/potential_file_reader.cpp
+++ b/src/potential_file_reader.cpp
@ -83,9 +83,9 @@ void PotentialFileReader::next_dvector(double * list, int n) {
  }
 }
-ValueTokenizer PotentialFileReader::next_values(int nparams, const std::string & seperators) {
+ValueTokenizer PotentialFileReader::next_values(int nparams, const std::string & separators) {
  try {
-    return reader->next_values(nparams, seperators);
+    return reader->next_values(nparams, separators);
  } catch (FileReaderException & e) {
    error->one(FLERR, e.what());
  }
--- a/src/potential_file_reader.h
+++ b/src/potential_file_reader.h
@ -43,7 +43,7 @@ namespace LAMMPS_NS
    void skip_line();
    char * next_line(int nparams = 0);
    void next_dvector(double * list, int n);
-    ValueTokenizer next_values(int nparams, const std::string & seperators = TOKENIZER_DEFAULT_SEPERATORS);
+    ValueTokenizer next_values(int nparams, const std::string & separators = TOKENIZER_DEFAULT_SEPARATORS);
    // convenience functions
    double next_double();
--- a/src/text_file_reader.cpp
+++ b/src/text_file_reader.cpp
@ -86,7 +86,7 @@ char *TextFileReader::next_line(int nparams) {
    // strip comment
    if (ignore_comments && (ptr = strchr(line, '#'))) *ptr = '\0';
-    nwords = utils::count_words(line);
+    nwords += utils::count_words(&line[n]);
    // skip line if blank
    if (nwords > 0) {
@ -116,6 +116,6 @@ void TextFileReader::next_dvector(double * list, int n) {
  }
 }
-ValueTokenizer TextFileReader::next_values(int nparams, const std::string & seperators) {
+ValueTokenizer TextFileReader::next_values(int nparams, const std::string & separators) {
-  return ValueTokenizer(next_line(nparams), seperators);
+  return ValueTokenizer(next_line(nparams), separators);
 }
--- a/src/text_file_reader.h
+++ b/src/text_file_reader.h
@ -42,7 +42,7 @@ namespace LAMMPS_NS
    char * next_line(int nparams = 0);
    void next_dvector(double * list, int n);
-    ValueTokenizer next_values(int nparams, const std::string & seperators = TOKENIZER_DEFAULT_SEPERATORS);
+    ValueTokenizer next_values(int nparams, const std::string & separators = TOKENIZER_DEFAULT_SEPARATORS);
  };
  class FileReaderException : public std::exception {
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@ -17,77 +17,118 @@
 #include "tokenizer.h"
 #include "utils.h"
 #include "fmt/format.h"
 using namespace LAMMPS_NS;
-Tokenizer::Tokenizer(const std::string & str, const std::string & seperators) {
+TokenizerException::TokenizerException(const std::string & msg, const std::string & token){
-    size_t end = -1;
+    if(token.empty()) {
        message = msg;
    } else {
        message = fmt::format("{}: '{}'", msg, token);
    }
 }
-    do {
+Tokenizer::Tokenizer(const std::string & str, const std::string & separators) :
-        size_t start = str.find_first_not_of(seperators, end + 1);
+    text(str), separators(separators), start(0), ntokens(std::string::npos)
-        if(start == std::string::npos) break;
+{
    reset();
 }
-        end = str.find_first_of(seperators, start);
+Tokenizer::Tokenizer(const Tokenizer & rhs) :
    text(rhs.text), separators(rhs.separators), ntokens(rhs.ntokens)
 {
    reset();
 }
 // Move constructor: takes over the text and separator strings of rhs and
 // copies its cached token count.
 Tokenizer::Tokenizer(Tokenizer && rhs) :
    text(std::move(rhs.text)),
    separators(std::move(rhs.separators)),
    ntokens(rhs.ntokens)
 {
    // the read position is not taken from rhs; reset() recomputes it from text
    reset();
 }
 // Rewind the tokenizer: the next token begins at the first character of text
 // that is not a separator (std::string::npos when there is no such character).
 void Tokenizer::reset() {
    const size_t first_token = text.find_first_not_of(separators);
    start = first_token;
 }
 void Tokenizer::skip(int n) {
    for(int i = 0; i < n; ++i) {
        if(!has_next()) throw TokenizerException("No more tokens", "");
        size_t end = text.find_first_of(separators, start);
        if(end == std::string::npos) {
-            tokens.push_back(str.substr(start));
+            start = end;
        } else {
-            tokens.push_back(str.substr(start, end-start));
+            start = text.find_first_not_of(separators, end+1);
        }
-    } while(end != std::string::npos);
+    }
 }
-Tokenizer::Tokenizer(const Tokenizer & rhs) : tokens(rhs.tokens) {
+bool Tokenizer::has_next() const {
    return start != std::string::npos;
 }
-Tokenizer::Tokenizer(Tokenizer && rhs) : tokens(std::move(rhs.tokens)) {
+std::string Tokenizer::next() {
    if(!has_next()) throw TokenizerException("No more tokens", "");
    size_t end = text.find_first_of(separators, start);
    if(end == std::string::npos) {
        std::string token = text.substr(start);
        start = end;
        return token;
    }
    std::string token = text.substr(start, end-start);
    start = text.find_first_not_of(separators, end+1);
    return token;
 }
-Tokenizer::iterator Tokenizer::begin() {
+size_t Tokenizer::count() {
-    return tokens.begin();
+    // lazy evaluation
    if (ntokens == std::string::npos) {
      ntokens = utils::count_words(text, separators);
    }
    return ntokens;
 }
-Tokenizer::iterator Tokenizer::end() {
+std::vector<std::string> Tokenizer::as_vector() {
-    return tokens.end();
+  // store current state
-}
+  size_t current = start;
-Tokenizer::const_iterator Tokenizer::cbegin() const {
+  reset();
    return tokens.cbegin();
 }
-Tokenizer::const_iterator Tokenizer::cend() const {
+  // generate vector
-    return tokens.cend();
+  std::vector<std::string> tokens;
 }
-std::string & Tokenizer::operator[](size_t index) {
+  while(has_next()) {
-    return tokens[index];
+    tokens.emplace_back(next());
-}
+  }
-size_t Tokenizer::count() const {
+  // restore state
-    return tokens.size();
+  start = current;
  return tokens;
 }
-ValueTokenizer::ValueTokenizer(const std::string & str, const std::string & seperators) : tokens(str, seperators) {
+ValueTokenizer::ValueTokenizer(const std::string & str, const std::string & separators) : tokens(str, separators) {
    current  = tokens.begin();
 }
 ValueTokenizer::ValueTokenizer(const ValueTokenizer & rhs) : tokens(rhs.tokens) {
    current  = tokens.begin();
 }
 ValueTokenizer::ValueTokenizer(ValueTokenizer && rhs) : tokens(std::move(rhs.tokens)) {
    current  = tokens.begin();
 }
 bool ValueTokenizer::has_next() const {
-    return current != tokens.cend();
+    return tokens.has_next();
 }
 std::string ValueTokenizer::next_string() {
    if (has_next()) {
-        std::string value = *current;
+        std::string value = tokens.next();
        ++current;
        return value;
    }
    return "";
@ -95,11 +136,11 @@ std::string ValueTokenizer::next_string() {
 int ValueTokenizer::next_int() {
    if (has_next()) {
-        if(!utils::is_integer(*current)) {
+        std::string current = tokens.next();
-            throw InvalidIntegerException(*current);
+        if(!utils::is_integer(current)) {
            throw InvalidIntegerException(current);
        }
-        int value = atoi(current->c_str());
+        int value = atoi(current.c_str());
        ++current;
        return value;
    }
    return 0;
@ -107,45 +148,44 @@ int ValueTokenizer::next_int() {
 bigint ValueTokenizer::next_bigint() {
    if (has_next()) {
-        if(!utils::is_integer(*current)) {
+        std::string current = tokens.next();
-            throw InvalidIntegerException(*current);
+        if(!utils::is_integer(current)) {
            throw InvalidIntegerException(current);
        }
-        bigint value = ATOBIGINT(current->c_str());
+        bigint value = ATOBIGINT(current.c_str());
        ++current;
        return value;
    }
    return 0;
 }
 tagint ValueTokenizer::next_tagint() {
-    if (current != tokens.end()) {
+    if (has_next()) {
-        if(!utils::is_integer(*current)) {
+        std::string current = tokens.next();
-            throw InvalidIntegerException(*current);
+        if(!utils::is_integer(current)) {
            throw InvalidIntegerException(current);
        }
-        tagint value = ATOTAGINT(current->c_str());
+        tagint value = ATOTAGINT(current.c_str());
        ++current;
        return value;
    }
    return 0;
 }
 double ValueTokenizer::next_double() {
-    if (current != tokens.end()) {
+    if (has_next()) {
-        if(!utils::is_double(*current)) {
+        std::string current = tokens.next();
-            throw InvalidFloatException(*current);
+        if(!utils::is_double(current)) {
            throw InvalidFloatException(current);
        }
-
+        double value = atof(current.c_str());
        double value = atof(current->c_str());
        ++current;
        return value;
    }
    return 0.0;
 }
-void ValueTokenizer::skip(int ntokens) {
+void ValueTokenizer::skip(int n) {
-    current = std::next(current, ntokens);
+    tokens.skip(n);
 }
-size_t ValueTokenizer::count() const {
+size_t ValueTokenizer::count() {
    return tokens.count();
 }
--- a/src/tokenizer.h
+++ b/src/tokenizer.h
@ -25,34 +25,33 @@
 namespace LAMMPS_NS {
-#define TOKENIZER_DEFAULT_SEPERATORS " \t\r\n\f"
+#define TOKENIZER_DEFAULT_SEPARATORS " \t\r\n\f"
 class Tokenizer {
-    std::vector<std::string> tokens;
+    std::string text;
    std::string separators;
    size_t start;
    size_t ntokens;
 public:
-    typedef std::vector<std::string>::iterator iterator;
+    Tokenizer(const std::string & str, const std::string & separators = TOKENIZER_DEFAULT_SEPARATORS);
    typedef std::vector<std::string>::const_iterator const_iterator;
    Tokenizer(const std::string & str, const std::string & seperators = TOKENIZER_DEFAULT_SEPERATORS);
    Tokenizer(Tokenizer &&);
    Tokenizer(const Tokenizer &);
    Tokenizer& operator=(const Tokenizer&) = default;
    Tokenizer& operator=(Tokenizer&&) = default;
-    iterator begin();
+    void reset();
-    iterator end();
+    void skip(int n);
-    const_iterator cbegin() const;
+    bool has_next() const;
-    const_iterator cend() const;
+    std::string next();
-    std::string & operator[](size_t index);
+    size_t count();
-    size_t count() const;
+    std::vector<std::string> as_vector();
 };
 class TokenizerException : public std::exception {
  std::string message;
 public:
-  TokenizerException(const std::string & msg, const std::string & token) : message(msg + ": '" + token + "'") {
+  TokenizerException(const std::string & msg, const std::string & token);
  }
  ~TokenizerException() throw() {
  }
@ -76,9 +75,8 @@ public:
 class ValueTokenizer {
    Tokenizer tokens;
    Tokenizer::const_iterator current;
 public:
-    ValueTokenizer(const std::string & str, const std::string & seperators = TOKENIZER_DEFAULT_SEPERATORS);
+    ValueTokenizer(const std::string & str, const std::string & separators = TOKENIZER_DEFAULT_SEPARATORS);
    ValueTokenizer(const ValueTokenizer &);
    ValueTokenizer(ValueTokenizer &&);
    ValueTokenizer& operator=(const ValueTokenizer&) = default;
@ -91,9 +89,9 @@ public:
    double next_double();
    bool has_next() const;
-    void skip(int ntokens);
+    void skip(int n);
-    size_t count() const;
+    size_t count();
 };
--- a/src/utils.cpp
+++ b/src/utils.cpp
@ -364,21 +364,70 @@ std::string utils::trim_comment(const std::string & line) {
  return std::string(line);
 }
 /* ----------------------------------------------------------------------
    return number of words in a NUL-terminated C-string
    a word is a maximal run of characters other than " \t\r\n\f"
 ------------------------------------------------------------------------- */
 size_t utils::count_words(const char * text) {
   const auto is_blank = [](char ch) {
     return ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n' || ch == '\f';
   };
   size_t nwords = 0;
   const char * pos = text;
   while (*pos) {
     // skip separators between words
     if (is_blank(*pos)) {
       ++pos;
       continue;
     }
     // first character of a new word: count it, then run to the end of the word
     ++nwords;
     while (*pos && !is_blank(*pos)) ++pos;
   }
   return nwords;
 }
 /* ----------------------------------------------------------------------
    return number of words in a std::string
    forwards to the C-string overload, so counting stops at the first
    embedded NUL character
 ------------------------------------------------------------------------- */
 size_t utils::count_words(const std::string & text) {
   const char * const buf = text.c_str();
   return utils::count_words(buf);
 }
 /* ----------------------------------------------------------------------
   Return number of words
 ------------------------------------------------------------------------- */
-size_t utils::count_words(const std::string & text, const std::string & seperators) {
+size_t utils::count_words(const std::string & text, const std::string & separators) {
-  ValueTokenizer words(text, seperators);
+  size_t count = 0;
-  return words.count();
+  size_t start = text.find_first_not_of(separators);
  while (start != std::string::npos) {
    size_t end = text.find_first_of(separators, start);
    ++count;
    if(end == std::string::npos) {
      return count;
    } else {
      start = text.find_first_not_of(separators, end + 1);
    }
  }
  return count;
 }
 /* ----------------------------------------------------------------------
   Trim comment from string and return number of words
 ------------------------------------------------------------------------- */
-size_t utils::trim_and_count_words(const std::string & text, const std::string & seperators) {
+size_t utils::trim_and_count_words(const std::string & text, const std::string & separators) {
-  return utils::count_words(utils::trim_comment(text), seperators);
+  return utils::count_words(utils::trim_comment(text), separators);
 }
 /* ----------------------------------------------------------------------
--- a/src/utils.h
+++ b/src/utils.h
@ -153,18 +153,34 @@ namespace LAMMPS_NS {
    /**
     * \brief Count words in string
     * \param text string that should be searched
-     * \param seperators string containing characters that will be treated as whitespace
+     * \param separators string containing characters that will be treated as whitespace
     * \return number of words found
     */
-    size_t count_words(const std::string & text, const std::string & seperators = " \t\r\n\f");
+    size_t count_words(const std::string & text, const std::string & separators);
    /**
     * \brief Count words in string; words are separated by any of the
     *        whitespace characters " \t\r\n\f"
     * \param text string that should be searched
     * \return number of words found
     */
    size_t count_words(const std::string & text);
    /**
     * \brief Count words in C-string; words are separated by any of the
     *        whitespace characters " \t\r\n\f"
     * \param text NUL-terminated string that should be searched
     * \return number of words found
     */
    size_t count_words(const char * text);
    /**
     * \brief Count words in a single line, trim anything from '#' onward
     * \param text string that should be trimmed and searched
-     * \param seperators string containing characters that will be treated as whitespace
+     * \param separators string containing characters that will be treated as whitespace
     * \return number of words found
     */
-    size_t trim_and_count_words(const std::string & text, const std::string & seperators = " \t\r\n\f");
+    size_t trim_and_count_words(const std::string & text, const std::string & separators = " \t\r\n\f");
    /**
     * \brief Check if string can be converted to valid integer
--- a/unittest/utils/test_tokenizer.cpp
+++ b/unittest/utils/test_tokenizer.cpp
@ -38,37 +38,33 @@ TEST(Tokenizer, two_words) {
    ASSERT_EQ(t.count(), 2);
 }
-TEST(Tokenizer, prefix_seperators) {
+TEST(Tokenizer, prefix_separators) {
    Tokenizer t("  test word", " ");
    ASSERT_EQ(t.count(), 2);
 }
-TEST(Tokenizer, postfix_seperators) {
+TEST(Tokenizer, postfix_separators) {
    Tokenizer t("test word   ", " ");
    ASSERT_EQ(t.count(), 2);
 }
 TEST(Tokenizer, iterate_words) {
    Tokenizer t("  test word   ", " ");
-    ASSERT_THAT(t[0], Eq("test"));
+    ASSERT_THAT(t.next(), Eq("test"));
-    ASSERT_THAT(t[1], Eq("word"));
+    ASSERT_THAT(t.next(), Eq("word"));
    ASSERT_EQ(t.count(), 2);
 }
-TEST(Tokenizer, default_seperators) {
+TEST(Tokenizer, default_separators) {
    Tokenizer t(" \r\n test \t word \f");
-    ASSERT_THAT(t[0], Eq("test"));
+    ASSERT_THAT(t.next(), Eq("test"));
-    ASSERT_THAT(t[1], Eq("word"));
+    ASSERT_THAT(t.next(), Eq("word"));
    ASSERT_EQ(t.count(), 2);
 }
-TEST(Tokenizer, for_loop) {
+TEST(Tokenizer, as_vector) {
    Tokenizer t(" \r\n test \t word \f");
-    std::vector<std::string> list;
+    std::vector<std::string> list = t.as_vector();
    for(auto word : t) {
        list.push_back(word);
    }
    ASSERT_THAT(list[0], Eq("test"));
    ASSERT_THAT(list[1], Eq("word"));
 }
--- a/unittest/utils/test_utils.cpp
+++ b/unittest/utils/test_utils.cpp
@ -15,6 +15,9 @@
 #include "gmock/gmock.h"
 #include "utils.h"
 #include <string>
 #include <cerrno>
 #include <cstdio>
 #include <cstdlib>
 using namespace LAMMPS_NS;
 using ::testing::Eq;
@ -28,10 +31,18 @@ TEST(Utils, count_words) {
    ASSERT_EQ(utils::count_words("some text # comment"), 4);
 }
 // With both ' ' and '#' passed as separators, only "some", "text" and
 // "comment" remain as words.
 TEST(Utils, count_words_non_default) {
    ASSERT_EQ(utils::count_words("some text # comment", " #"), 3);
 }
 // The text from '#' onward is expected to be trimmed before counting,
 // leaving the two words "some" and "text".
 TEST(Utils, trim_and_count_words) {
    ASSERT_EQ(utils::trim_and_count_words("some text # comment"), 2);
 }
 // Leading and trailing blanks must not add words; '#' is not whitespace and
 // therefore counts as a word of its own.
 TEST(Utils, count_words_with_extra_spaces) {
    ASSERT_EQ(utils::count_words("   some text # comment   "), 4);
 }
 // A plain string of decimal digits is expected to be accepted as an integer.
 TEST(Utils, valid_integer1) {
    ASSERT_TRUE(utils::is_integer("10"));
 }
@ -215,3 +226,35 @@ TEST(Utils, path_basename) {
    ASSERT_THAT(utils::path_basename("/parent/folder/filename"), Eq("filename"));
 #endif
 }
 // Sets errno to ENOENT and expects utils::getsyserror() to return the
 // matching message text. The expected string is only checked on Linux;
 // on every other platform the test is skipped.
 TEST(Utils, getsyserror) {
 #if defined(__linux__)
    errno = ENOENT;
    std::string errmesg = utils::getsyserror();
    ASSERT_THAT(errmesg, Eq("No such file or directory"));
 #else
    GTEST_SKIP();
 #endif
 }
 // Exercises the potential-file helpers against a scratch file "ctest.txt"
 // that the test creates with a relative path and removes again at the end.
 TEST(Utils, potential_file) {
    FILE *fp;
    fp = fopen("ctest.txt","w");
    ASSERT_NE(fp,nullptr);
    // single header line carrying the DATE: tag queried further below
    fputs("# DATE: 2020-02-20 CONTRIBUTOR: Nessuno\n",fp);
    fclose(fp);

    // the file just written is readable, a file that was never created is not
    EXPECT_TRUE(utils::file_is_readable("ctest.txt"));
    EXPECT_FALSE(utils::file_is_readable("no_such_file.txt"));

    // the name of the scratch file is expected to come back unchanged
    EXPECT_THAT(utils::get_potential_file_path("ctest.txt"),Eq("ctest.txt"));
    // only checked when the LAMMPS_POTENTIALS environment variable is set:
    // "Cu_u3.eam" is then expected to resolve to a path inside that folder
    const char *folder = getenv("LAMMPS_POTENTIALS");
    if (folder != nullptr) {
      std::string path=utils::path_join(folder,"Cu_u3.eam");
      EXPECT_THAT(utils::get_potential_file_path("Cu_u3.eam"),Eq(path));
    }

    // the date is expected to be taken from the DATE: tag written above
    EXPECT_THAT(utils::get_potential_date("ctest.txt","Test"),Eq("2020-02-20"));

    // clean up the scratch file
    remove("ctest.txt");
 }