From 81d937ee97ba27ee22dd10965950bbd909224aa3 Mon Sep 17 00:00:00 2001
From: Richard Berger <richard.berger@temple.edu>
Date: Thu, 11 Jun 2020 00:16:46 -0400
Subject: [PATCH 1/8] Performance optimization of Tokenizer

Reduces string allocations and removes std::vector from Tokenizer.
Most processing now happens on demand.
---
 src/atom_vec.cpp                  |   8 +-
 src/atom_vec_hybrid.cpp           |   4 +-
 src/potential_file_reader.h       |   2 +-
 src/text_file_reader.h            |   2 +-
 src/tokenizer.cpp                 | 154 +++++++++++++++++++-----------
 src/tokenizer.h                   |  34 ++++---
 src/utils.cpp                     |  16 +++-
 unittest/utils/test_tokenizer.cpp |  16 ++--
 unittest/utils/test_utils.cpp     |   4 +
 9 files changed, 145 insertions(+), 95 deletions(-)

diff --git a/src/atom_vec.cpp b/src/atom_vec.cpp
index 8cd7db4fd4..353ab27c99 100644
--- a/src/atom_vec.cpp
+++ b/src/atom_vec.cpp
@@ -2448,11 +2448,11 @@ int AtomVec::process_fields(char *str, const char *default_str, Method *method)
   }
 
   // tokenize words in both strings
-  Tokenizer words(str, " ");
-  Tokenizer def_words(default_str, " ");
+  std::vector<std::string> words = Tokenizer(str, " ").as_vector();
+  std::vector<std::string> def_words = Tokenizer(default_str, " ").as_vector();
 
-  int nfield = words.count();
-  int ndef   = def_words.count();
+  int nfield = words.size();
+  int ndef   = def_words.size();
 
   // process fields one by one, add to index vector
 
diff --git a/src/atom_vec_hybrid.cpp b/src/atom_vec_hybrid.cpp
index 7e599863c0..9ba2b6f468 100644
--- a/src/atom_vec_hybrid.cpp
+++ b/src/atom_vec_hybrid.cpp
@@ -514,8 +514,8 @@ char *AtomVecHybrid::merge_fields(int inum, char *root,
 
   // identify unique words in concatenated string
 
-  Tokenizer words(concat, " ");
-  int nwords = words.count();
+  std::vector<std::string> words = Tokenizer(concat, " ").as_vector();
+  int nwords = words.size();
 
   int *unique = new int[nwords];
 
diff --git a/src/potential_file_reader.h b/src/potential_file_reader.h
index c512e7886d..a73f5fdbaa 100644
--- a/src/potential_file_reader.h
+++ b/src/potential_file_reader.h
@@ -43,7 +43,7 @@ namespace LAMMPS_NS
     void skip_line();
     char * next_line(int nparams = 0);
     void next_dvector(double * list, int n);
-    ValueTokenizer next_values(int nparams, const std::string & seperators = TOKENIZER_DEFAULT_SEPERATORS);
+    ValueTokenizer next_values(int nparams, const std::string & separators = TOKENIZER_DEFAULT_SEPARATORS);
 
     // convenience functions
     double next_double();
diff --git a/src/text_file_reader.h b/src/text_file_reader.h
index b162bfb23c..80a5d756ea 100644
--- a/src/text_file_reader.h
+++ b/src/text_file_reader.h
@@ -42,7 +42,7 @@ namespace LAMMPS_NS
     char * next_line(int nparams = 0);
 
     void next_dvector(double * list, int n);
-    ValueTokenizer next_values(int nparams, const std::string & seperators = TOKENIZER_DEFAULT_SEPERATORS);
+    ValueTokenizer next_values(int nparams, const std::string & separators = TOKENIZER_DEFAULT_SEPARATORS);
   };
 
   class FileReaderException : public std::exception {
diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
index 7d048d9e02..f041c79baa 100644
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@@ -17,77 +17,118 @@
 
 #include "tokenizer.h"
 #include "utils.h"
+#include "fmt/format.h"
 
 using namespace LAMMPS_NS;
 
-Tokenizer::Tokenizer(const std::string & str, const std::string & seperators) {
-    size_t end = -1;
+TokenizerException::TokenizerException(const std::string & msg, const std::string & token){
+    if(token.empty()) {
+        message = msg;
+    } else {
+        message = fmt::format("{}: '{}'", msg, token);
+    }
+}
 
-    do {
-        size_t start = str.find_first_not_of(seperators, end + 1);
-        if(start == std::string::npos) break;
+Tokenizer::Tokenizer(const std::string & str, const std::string & separators) :
+    text(str), separators(separators), start(0), ntokens(std::string::npos)
+{
+    reset();
+}
 
-        end = str.find_first_of(seperators, start);
+Tokenizer::Tokenizer(const Tokenizer & rhs) : 
+    text(rhs.text), separators(rhs.separators), ntokens(rhs.ntokens)
+{
+    reset();
+}
+
+Tokenizer::Tokenizer(Tokenizer && rhs) : 
+    text(std::move(rhs.text)), separators(std::move(rhs.separators)), ntokens(rhs.ntokens)
+{
+    reset();
+}
+
+void Tokenizer::reset() {
+    start = text.find_first_not_of(separators);
+}
+
+void Tokenizer::skip(int n) {
+    for(int i = 0; i < n; ++i) {
+        if(!has_next()) throw TokenizerException("No more tokens", "");
+
+        size_t end = text.find_first_of(separators, start);      
 
         if(end == std::string::npos) {
-            tokens.push_back(str.substr(start));
+            start = end;
         } else {
-            tokens.push_back(str.substr(start, end-start));
-        }
-    } while(end != std::string::npos);
+            start = text.find_first_not_of(separators, end+1);
+        } 
+    }
 }
 
-Tokenizer::Tokenizer(const Tokenizer & rhs) : tokens(rhs.tokens) {
+bool Tokenizer::has_next() const {
+    return start != std::string::npos;
 }
 
-Tokenizer::Tokenizer(Tokenizer && rhs) : tokens(std::move(rhs.tokens)) {
+std::string Tokenizer::next() {
+    if(!has_next()) throw TokenizerException("No more tokens", "");
+
+    size_t end = text.find_first_of(separators, start);
+
+    if(end == std::string::npos) {
+        std::string token = text.substr(start);
+        start = end;
+        return token;
+    } 
+
+    std::string token = text.substr(start, end-start);
+    start = text.find_first_not_of(separators, end+1);
+    return token;
 }
 
-Tokenizer::iterator Tokenizer::begin() {
-    return tokens.begin();
+size_t Tokenizer::count() {
+    // lazy evaluation
+    if (ntokens == std::string::npos) {
+      ntokens = utils::count_words(text, separators);
+    }
+    return ntokens;
 }
 
-Tokenizer::iterator Tokenizer::end() {
-    return tokens.end();
-}
+std::vector<std::string> Tokenizer::as_vector() {
+  // store current state
+  size_t current = start;
 
-Tokenizer::const_iterator Tokenizer::cbegin() const {
-    return tokens.cbegin();
-}
+  reset();
 
-Tokenizer::const_iterator Tokenizer::cend() const {
-    return tokens.cend();
-}
+  // generate vector
+  std::vector<std::string> tokens;
 
-std::string & Tokenizer::operator[](size_t index) {
-    return tokens[index];
-}
+  while(has_next()) {
+    tokens.emplace_back(next());
+  }
 
-size_t Tokenizer::count() const {
-    return tokens.size();
+  // restore state
+  start = current;
+
+  return tokens;
 }
 
 
-ValueTokenizer::ValueTokenizer(const std::string & str, const std::string & seperators) : tokens(str, seperators) {
-    current  = tokens.begin();
+ValueTokenizer::ValueTokenizer(const std::string & str, const std::string & separators) : tokens(str, separators) {
 }
 
 ValueTokenizer::ValueTokenizer(const ValueTokenizer & rhs) : tokens(rhs.tokens) {
-    current  = tokens.begin();
 }
 
 ValueTokenizer::ValueTokenizer(ValueTokenizer && rhs) : tokens(std::move(rhs.tokens)) {
-    current  = tokens.begin();
 }
 
 bool ValueTokenizer::has_next() const {
-    return current != tokens.cend();
+    return tokens.has_next();
 }
 
 std::string ValueTokenizer::next_string() {
     if (has_next()) {
-        std::string value = *current;
-        ++current;
+        std::string value = tokens.next();
         return value;
     }
     return "";
@@ -95,11 +136,11 @@ std::string ValueTokenizer::next_string() {
 
 int ValueTokenizer::next_int() {
     if (has_next()) {
-        if(!utils::is_integer(*current)) {
-            throw InvalidIntegerException(*current);
+        std::string current = tokens.next();
+        if(!utils::is_integer(current)) {
+            throw InvalidIntegerException(current);
         }
-        int value = atoi(current->c_str());
-        ++current;
+        int value = atoi(current.c_str());
         return value;
     }
     return 0;
@@ -107,45 +148,44 @@ int ValueTokenizer::next_int() {
 
 bigint ValueTokenizer::next_bigint() {
     if (has_next()) {
-        if(!utils::is_integer(*current)) {
-            throw InvalidIntegerException(*current);
+        std::string current = tokens.next();
+        if(!utils::is_integer(current)) {
+            throw InvalidIntegerException(current);
         }
-        bigint value = ATOBIGINT(current->c_str());
-        ++current;
+        bigint value = ATOBIGINT(current.c_str());
         return value;
     }
     return 0;
 }
 
 tagint ValueTokenizer::next_tagint() {
-    if (current != tokens.end()) {
-        if(!utils::is_integer(*current)) {
-            throw InvalidIntegerException(*current);
+    if (has_next()) {
+        std::string current = tokens.next();
+        if(!utils::is_integer(current)) {
+            throw InvalidIntegerException(current);
         }
-        tagint value = ATOTAGINT(current->c_str());
-        ++current;
+        tagint value = ATOTAGINT(current.c_str());
         return value;
     }
     return 0;
 }
 
 double ValueTokenizer::next_double() {
-    if (current != tokens.end()) {
-        if(!utils::is_double(*current)) {
-            throw InvalidFloatException(*current);
+    if (has_next()) {
+        std::string current = tokens.next();
+        if(!utils::is_double(current)) {
+            throw InvalidFloatException(current);
         }
-
-        double value = atof(current->c_str());
-        ++current;
+        double value = atof(current.c_str());
         return value;
     }
     return 0.0;
 }
 
-void ValueTokenizer::skip(int ntokens) {
-    current = std::next(current, ntokens);
+void ValueTokenizer::skip(int n) {
+    tokens.skip(n);
 }
 
-size_t ValueTokenizer::count() const {
+size_t ValueTokenizer::count() {
     return tokens.count();
 }
diff --git a/src/tokenizer.h b/src/tokenizer.h
index 89cb57b301..8ad19ce960 100644
--- a/src/tokenizer.h
+++ b/src/tokenizer.h
@@ -25,34 +25,33 @@
 
 namespace LAMMPS_NS {
 
-#define TOKENIZER_DEFAULT_SEPERATORS " \t\r\n\f"
+#define TOKENIZER_DEFAULT_SEPARATORS " \t\r\n\f"
 
 class Tokenizer {
-    std::vector<std::string> tokens;
+    std::string text;
+    std::string separators;
+    size_t start;
+    size_t ntokens;
 public:
-    typedef std::vector<std::string>::iterator iterator;
-    typedef std::vector<std::string>::const_iterator const_iterator;
-
-    Tokenizer(const std::string & str, const std::string & seperators = TOKENIZER_DEFAULT_SEPERATORS);
+    Tokenizer(const std::string & str, const std::string & separators = TOKENIZER_DEFAULT_SEPARATORS);
     Tokenizer(Tokenizer &&);
     Tokenizer(const Tokenizer &);
     Tokenizer& operator=(const Tokenizer&) = default;
     Tokenizer& operator=(Tokenizer&&) = default;
 
-    iterator begin();
-    iterator end();
-    const_iterator cbegin() const;
-    const_iterator cend() const;
+    void reset();
+    void skip(int n);
+    bool has_next() const;
+    std::string next();
 
-    std::string & operator[](size_t index);
-    size_t count() const;
+    size_t count();
+    std::vector<std::string> as_vector();
 };
 
 class TokenizerException : public std::exception {
   std::string message;
 public:
-  TokenizerException(const std::string & msg, const std::string & token) : message(msg + ": '" + token + "'") {
-  }
+  TokenizerException(const std::string & msg, const std::string & token);
 
   ~TokenizerException() throw() {
   }
@@ -76,9 +75,8 @@ public:
 
 class ValueTokenizer {
     Tokenizer tokens;
-    Tokenizer::const_iterator current;
 public:
-    ValueTokenizer(const std::string & str, const std::string & seperators = TOKENIZER_DEFAULT_SEPERATORS);
+    ValueTokenizer(const std::string & str, const std::string & separators = TOKENIZER_DEFAULT_SEPARATORS);
     ValueTokenizer(const ValueTokenizer &);
     ValueTokenizer(ValueTokenizer &&);
     ValueTokenizer& operator=(const ValueTokenizer&) = default;
@@ -91,9 +89,9 @@ public:
     double next_double();
 
     bool has_next() const;
-    void skip(int ntokens);
+    void skip(int n);
 
-    size_t count() const;
+    size_t count();
 };
 
 
diff --git a/src/utils.cpp b/src/utils.cpp
index a8dc4e308e..72193bb2c8 100644
--- a/src/utils.cpp
+++ b/src/utils.cpp
@@ -369,8 +369,20 @@ std::string utils::trim_comment(const std::string & line) {
 ------------------------------------------------------------------------- */
 
 size_t utils::count_words(const std::string & text, const std::string & seperators) {
-  ValueTokenizer words(text, seperators);
-  return words.count();
+  size_t count = 0;
+  size_t start = text.find_first_not_of(seperators);
+
+  while (start != std::string::npos) {
+    size_t end = text.find_first_of(seperators, start);
+    ++count;
+
+    if(end == std::string::npos) {
+      return count;
+    } else {
+      start = text.find_first_not_of(seperators, end + 1);
+    }
+  }
+  return count;
 }
 
 /* ----------------------------------------------------------------------
diff --git a/unittest/utils/test_tokenizer.cpp b/unittest/utils/test_tokenizer.cpp
index 08c71338be..09487aabff 100644
--- a/unittest/utils/test_tokenizer.cpp
+++ b/unittest/utils/test_tokenizer.cpp
@@ -50,25 +50,21 @@ TEST(Tokenizer, postfix_seperators) {
 
 TEST(Tokenizer, iterate_words) {
     Tokenizer t("  test word   ", " ");
-    ASSERT_THAT(t[0], Eq("test"));
-    ASSERT_THAT(t[1], Eq("word"));
+    ASSERT_THAT(t.next(), Eq("test"));
+    ASSERT_THAT(t.next(), Eq("word"));
     ASSERT_EQ(t.count(), 2);
 }
 
 TEST(Tokenizer, default_seperators) {
     Tokenizer t(" \r\n test \t word \f");
-    ASSERT_THAT(t[0], Eq("test"));
-    ASSERT_THAT(t[1], Eq("word"));
+    ASSERT_THAT(t.next(), Eq("test"));
+    ASSERT_THAT(t.next(), Eq("word"));
     ASSERT_EQ(t.count(), 2);
 }
 
-TEST(Tokenizer, for_loop) {
+TEST(Tokenizer, as_vector) {
     Tokenizer t(" \r\n test \t word \f");
-    std::vector<std::string> list;
-
-    for(auto word : t) {
-        list.push_back(word);
-    }
+    std::vector<std::string> list = t.as_vector();
     ASSERT_THAT(list[0], Eq("test"));
     ASSERT_THAT(list[1], Eq("word"));
 }
diff --git a/unittest/utils/test_utils.cpp b/unittest/utils/test_utils.cpp
index e1c458a173..9830207c3e 100644
--- a/unittest/utils/test_utils.cpp
+++ b/unittest/utils/test_utils.cpp
@@ -32,6 +32,10 @@ TEST(Utils, trim_and_count_words) {
     ASSERT_EQ(utils::trim_and_count_words("some text # comment"), 2);
 }
 
+TEST(Utils, count_words_with_extra_spaces) {
+    ASSERT_EQ(utils::count_words("   some text # comment   "), 4);
+}
+
 TEST(Utils, valid_integer1) {
     ASSERT_TRUE(utils::is_integer("10"));
 }

From 9945f737438288eca9564cd4650e43dc23c27476 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Thu, 11 Jun 2020 01:05:58 -0400
Subject: [PATCH 2/8] fix spelling in a few more files

---
 src/potential_file_reader.cpp     |  4 ++--
 src/text_file_reader.cpp          |  4 ++--
 src/utils.cpp                     | 12 ++++++------
 src/utils.h                       |  8 ++++----
 unittest/utils/test_tokenizer.cpp |  6 +++---
 5 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/src/potential_file_reader.cpp b/src/potential_file_reader.cpp
index 2ec7908ae3..5fd361eff5 100644
--- a/src/potential_file_reader.cpp
+++ b/src/potential_file_reader.cpp
@@ -83,9 +83,9 @@ void PotentialFileReader::next_dvector(double * list, int n) {
   }
 }
 
-ValueTokenizer PotentialFileReader::next_values(int nparams, const std::string & seperators) {
+ValueTokenizer PotentialFileReader::next_values(int nparams, const std::string & separators) {
   try {
-    return reader->next_values(nparams, seperators);
+    return reader->next_values(nparams, separators);
   } catch (FileReaderException & e) {
     error->one(FLERR, e.what());
   }
diff --git a/src/text_file_reader.cpp b/src/text_file_reader.cpp
index 8063bba87f..9015ddecee 100644
--- a/src/text_file_reader.cpp
+++ b/src/text_file_reader.cpp
@@ -116,6 +116,6 @@ void TextFileReader::next_dvector(double * list, int n) {
   }
 }
 
-ValueTokenizer TextFileReader::next_values(int nparams, const std::string & seperators) {
-  return ValueTokenizer(next_line(nparams), seperators);
+ValueTokenizer TextFileReader::next_values(int nparams, const std::string & separators) {
+  return ValueTokenizer(next_line(nparams), separators);
 }
diff --git a/src/utils.cpp b/src/utils.cpp
index 72193bb2c8..928a84883c 100644
--- a/src/utils.cpp
+++ b/src/utils.cpp
@@ -368,18 +368,18 @@ std::string utils::trim_comment(const std::string & line) {
    Return number of words
 ------------------------------------------------------------------------- */
 
-size_t utils::count_words(const std::string & text, const std::string & seperators) {
+size_t utils::count_words(const std::string & text, const std::string & separators) {
   size_t count = 0;
-  size_t start = text.find_first_not_of(seperators);
+  size_t start = text.find_first_not_of(separators);
 
   while (start != std::string::npos) {
-    size_t end = text.find_first_of(seperators, start);
+    size_t end = text.find_first_of(separators, start);
     ++count;
 
     if(end == std::string::npos) {
       return count;
     } else {
-      start = text.find_first_not_of(seperators, end + 1);
+      start = text.find_first_not_of(separators, end + 1);
     }
   }
   return count;
@@ -389,8 +389,8 @@ size_t utils::count_words(const std::string & text, const std::string & seperato
    Trim comment from string and return number of words
 ------------------------------------------------------------------------- */
 
-size_t utils::trim_and_count_words(const std::string & text, const std::string & seperators) {
-  return utils::count_words(utils::trim_comment(text), seperators);
+size_t utils::trim_and_count_words(const std::string & text, const std::string & separators) {
+  return utils::count_words(utils::trim_comment(text), separators);
 }
 
 /* ----------------------------------------------------------------------
diff --git a/src/utils.h b/src/utils.h
index 79fb2349d3..562293f2f3 100644
--- a/src/utils.h
+++ b/src/utils.h
@@ -153,18 +153,18 @@ namespace LAMMPS_NS {
     /**
      * \brief Count words in string
      * \param text string that should be searched
-     * \param seperators string containing characters that will be treated as whitespace
+     * \param separators string containing characters that will be treated as whitespace
      * \return number of words found
      */
-    size_t count_words(const std::string & text, const std::string & seperators = " \t\r\n\f");
+    size_t count_words(const std::string & text, const std::string & separators = " \t\r\n\f");
 
     /**
      * \brief Count words in a single line, trim anything from '#' onward
      * \param text string that should be trimmed and searched
-     * \param seperators string containing characters that will be treated as whitespace
+     * \param separators string containing characters that will be treated as whitespace
      * \return number of words found
      */
-    size_t trim_and_count_words(const std::string & text, const std::string & seperators = " \t\r\n\f");
+    size_t trim_and_count_words(const std::string & text, const std::string & separators = " \t\r\n\f");
 
     /**
      * \brief Check if string can be converted to valid integer
diff --git a/unittest/utils/test_tokenizer.cpp b/unittest/utils/test_tokenizer.cpp
index 09487aabff..903f660959 100644
--- a/unittest/utils/test_tokenizer.cpp
+++ b/unittest/utils/test_tokenizer.cpp
@@ -38,12 +38,12 @@ TEST(Tokenizer, two_words) {
     ASSERT_EQ(t.count(), 2);
 }
 
-TEST(Tokenizer, prefix_seperators) {
+TEST(Tokenizer, prefix_separators) {
     Tokenizer t("  test word", " ");
     ASSERT_EQ(t.count(), 2);
 }
 
-TEST(Tokenizer, postfix_seperators) {
+TEST(Tokenizer, postfix_separators) {
     Tokenizer t("test word   ", " ");
     ASSERT_EQ(t.count(), 2);
 }
@@ -55,7 +55,7 @@ TEST(Tokenizer, iterate_words) {
     ASSERT_EQ(t.count(), 2);
 }
 
-TEST(Tokenizer, default_seperators) {
+TEST(Tokenizer, default_separators) {
     Tokenizer t(" \r\n test \t word \f");
     ASSERT_THAT(t.next(), Eq("test"));
     ASSERT_THAT(t.next(), Eq("word"));

From 6cb5345cd086a29a975b7fe99490e5fd1cdb1251 Mon Sep 17 00:00:00 2001
From: Richard Berger <richard.berger@temple.edu>
Date: Thu, 11 Jun 2020 13:37:22 -0400
Subject: [PATCH 3/8] Add optimized version of count_words for default
 whitespace chars

---
 src/utils.cpp                 | 29 +++++++++++++++++++++++++++++
 src/utils.h                   | 10 +++++++++-
 unittest/utils/test_utils.cpp |  4 ++++
 3 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/src/utils.cpp b/src/utils.cpp
index 928a84883c..86f56e7b2a 100644
--- a/src/utils.cpp
+++ b/src/utils.cpp
@@ -368,6 +368,35 @@ std::string utils::trim_comment(const std::string & line) {
    Return number of words
 ------------------------------------------------------------------------- */
 
+size_t utils::count_words(const std::string & text) {
+  size_t count = 0;
+  const char * buf = text.c_str();
+  char c = *buf;
+
+  while (c) {
+    if (c == ' ' || c == '\t' || c == '\r' ||  c == '\n' || c == '\f') {
+      c = *++buf;
+      continue;
+    };
+
+    ++count;
+    c = *++buf;
+
+    while (c) {
+      if (c == ' ' || c == '\t' || c == '\r' || c == '\n' || c == '\f') {
+        break;
+      }
+      c = *++buf;
+    }
+  }
+
+  return count;
+}
+
+/* ----------------------------------------------------------------------
+   Return number of words
+------------------------------------------------------------------------- */
+
 size_t utils::count_words(const std::string & text, const std::string & separators) {
   size_t count = 0;
   size_t start = text.find_first_not_of(separators);
diff --git a/src/utils.h b/src/utils.h
index 562293f2f3..bce9ff3e66 100644
--- a/src/utils.h
+++ b/src/utils.h
@@ -156,7 +156,15 @@ namespace LAMMPS_NS {
      * \param separators string containing characters that will be treated as whitespace
      * \return number of words found
      */
-    size_t count_words(const std::string & text, const std::string & separators = " \t\r\n\f");
+    size_t count_words(const std::string & text, const std::string & separators);
+
+    /**
+     * \brief Count words in string, ignore any whitespace matching " \t\r\n\f"
+     * \param text string that should be searched
+     * \note separators are fixed to " \t\r\n\f"; use count_words(text, separators) for a custom set
+     * \return number of words found
+     */
+    size_t count_words(const std::string & text);
 
     /**
      * \brief Count words in a single line, trim anything from '#' onward
diff --git a/unittest/utils/test_utils.cpp b/unittest/utils/test_utils.cpp
index 9830207c3e..5660c097f1 100644
--- a/unittest/utils/test_utils.cpp
+++ b/unittest/utils/test_utils.cpp
@@ -28,6 +28,10 @@ TEST(Utils, count_words) {
     ASSERT_EQ(utils::count_words("some text # comment"), 4);
 }
 
+TEST(Utils, count_words_non_default) {
+    ASSERT_EQ(utils::count_words("some text # comment", " #"), 3);
+}
+
 TEST(Utils, trim_and_count_words) {
     ASSERT_EQ(utils::trim_and_count_words("some text # comment"), 2);
 }

From 645d3b61baf0e36e605012f291521215aa65a64f Mon Sep 17 00:00:00 2001
From: Richard Berger <richard.berger@temple.edu>
Date: Thu, 11 Jun 2020 13:49:58 -0400
Subject: [PATCH 4/8] Only count new words

---
 src/text_file_reader.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/text_file_reader.cpp b/src/text_file_reader.cpp
index 9015ddecee..8abe1b001f 100644
--- a/src/text_file_reader.cpp
+++ b/src/text_file_reader.cpp
@@ -86,7 +86,7 @@ char *TextFileReader::next_line(int nparams) {
     // strip comment
     if (ignore_comments && (ptr = strchr(line, '#'))) *ptr = '\0';
 
-    nwords = utils::count_words(line);
+    nwords += utils::count_words(&line[n]);
 
     // skip line if blank
     if (nwords > 0) {

From 6a9073a0cbcbb5b8519988f693d9387d5c9febeb Mon Sep 17 00:00:00 2001
From: Richard Berger <richard.berger@temple.edu>
Date: Thu, 11 Jun 2020 13:50:26 -0400
Subject: [PATCH 5/8] Add count_words for C-Strings

---
 src/utils.cpp | 14 +++++++++++---
 src/utils.h   |  8 ++++++++
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/src/utils.cpp b/src/utils.cpp
index 86f56e7b2a..88677bf542 100644
--- a/src/utils.cpp
+++ b/src/utils.cpp
@@ -365,12 +365,12 @@ std::string utils::trim_comment(const std::string & line) {
 }
 
 /* ----------------------------------------------------------------------
-   Return number of words
+   return number of words
 ------------------------------------------------------------------------- */
 
-size_t utils::count_words(const std::string & text) {
+size_t utils::count_words(const char * text) {
   size_t count = 0;
-  const char * buf = text.c_str();
+  const char * buf = text;
   char c = *buf;
 
   while (c) {
@@ -393,6 +393,14 @@ size_t utils::count_words(const std::string & text) {
   return count;
 }
 
+/* ----------------------------------------------------------------------
+   return number of words
+------------------------------------------------------------------------- */
+
+size_t utils::count_words(const std::string & text) {
+  return utils::count_words(text.c_str());
+}
+
 /* ----------------------------------------------------------------------
    Return number of words
 ------------------------------------------------------------------------- */
diff --git a/src/utils.h b/src/utils.h
index bce9ff3e66..ef272087b7 100644
--- a/src/utils.h
+++ b/src/utils.h
@@ -166,6 +166,14 @@ namespace LAMMPS_NS {
      */
     size_t count_words(const std::string & text);
 
+    /**
+     * \brief Count words in C-string, ignore any whitespace matching " \t\r\n\f"
+     * \param text string that should be searched
+     * \note separators are fixed to " \t\r\n\f"; use count_words(text, separators) for a custom set
+     * \return number of words found
+     */
+    size_t count_words(const char * text);
+
     /**
      * \brief Count words in a single line, trim anything from '#' onward
      * \param text string that should be trimmed and searched

From f25f7fee8d60a511ee01ac5d42ed18b24e518f83 Mon Sep 17 00:00:00 2001
From: Richard Berger <richard.berger@temple.edu>
Date: Thu, 11 Jun 2020 13:56:47 -0400
Subject: [PATCH 6/8] Only count new words

---
 src/MANYBODY/pair_eim.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/MANYBODY/pair_eim.cpp b/src/MANYBODY/pair_eim.cpp
index 0d552caebc..f3f965a231 100644
--- a/src/MANYBODY/pair_eim.cpp
+++ b/src/MANYBODY/pair_eim.cpp
@@ -1123,7 +1123,7 @@ char * EIMPotentialFileReader::next_line(FILE * fp) {
       concat = false;
     }
 
-    nwords = utils::count_words(line);
+    nwords += utils::count_words(&line[n]);
 
     // skip line if blank
     if (nwords > 0) {

From 43d3133583fa39c285e0ef3896315b086bb7f1ae Mon Sep 17 00:00:00 2001
From: Richard Berger <richard.berger@temple.edu>
Date: Thu, 11 Jun 2020 14:00:13 -0400
Subject: [PATCH 7/8] Whitespace

---
 src/tokenizer.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
index f041c79baa..7ea7c88e5e 100644
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@@ -35,13 +35,13 @@ Tokenizer::Tokenizer(const std::string & str, const std::string & separators) :
     reset();
 }
 
-Tokenizer::Tokenizer(const Tokenizer & rhs) : 
+Tokenizer::Tokenizer(const Tokenizer & rhs) :
     text(rhs.text), separators(rhs.separators), ntokens(rhs.ntokens)
 {
     reset();
 }
 
-Tokenizer::Tokenizer(Tokenizer && rhs) : 
+Tokenizer::Tokenizer(Tokenizer && rhs) :
     text(std::move(rhs.text)), separators(std::move(rhs.separators)), ntokens(rhs.ntokens)
 {
     reset();
@@ -55,13 +55,13 @@ void Tokenizer::skip(int n) {
     for(int i = 0; i < n; ++i) {
         if(!has_next()) throw TokenizerException("No more tokens", "");
 
-        size_t end = text.find_first_of(separators, start);      
+        size_t end = text.find_first_of(separators, start);
 
         if(end == std::string::npos) {
             start = end;
         } else {
             start = text.find_first_not_of(separators, end+1);
-        } 
+        }
     }
 }
 
@@ -78,7 +78,7 @@ std::string Tokenizer::next() {
         std::string token = text.substr(start);
         start = end;
         return token;
-    } 
+    }
 
     std::string token = text.substr(start, end-start);
     start = text.find_first_not_of(separators, end+1);

From 3c99471df8da67dc34056485e7e7a18128fc269a Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Fri, 12 Jun 2020 01:29:42 -0400
Subject: [PATCH 8/8] add a few more unit tests for functions in utils

---
 unittest/utils/test_utils.cpp | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/unittest/utils/test_utils.cpp b/unittest/utils/test_utils.cpp
index 5660c097f1..317c77cf3f 100644
--- a/unittest/utils/test_utils.cpp
+++ b/unittest/utils/test_utils.cpp
@@ -15,6 +15,9 @@
 #include "gmock/gmock.h"
 #include "utils.h"
 #include <string>
+#include <cerrno>
+#include <cstdio>
+#include <cstdlib>
 
 using namespace LAMMPS_NS;
 using ::testing::Eq;
@@ -223,3 +226,35 @@ TEST(Utils, path_basename) {
     ASSERT_THAT(utils::path_basename("/parent/folder/filename"), Eq("filename"));
 #endif
 }
+
+TEST(Utils, getsyserror) {
+#if defined(__linux__)
+    errno = ENOENT;
+    std::string errmesg = utils::getsyserror();
+    ASSERT_THAT(errmesg, Eq("No such file or directory"));
+#else
+    GTEST_SKIP();
+#endif
+}
+
+TEST(Utils, potential_file) {
+    FILE *fp;
+    fp = fopen("ctest.txt","w");
+    ASSERT_NE(fp,nullptr);
+    fputs("# DATE: 2020-02-20 CONTRIBUTOR: Nessuno\n",fp);
+    fclose(fp);
+
+    EXPECT_TRUE(utils::file_is_readable("ctest.txt"));
+    EXPECT_FALSE(utils::file_is_readable("no_such_file.txt"));
+
+    EXPECT_THAT(utils::get_potential_file_path("ctest.txt"),Eq("ctest.txt"));
+    const char *folder = getenv("LAMMPS_POTENTIALS");
+    if (folder != nullptr) {
+      std::string path=utils::path_join(folder,"Cu_u3.eam");
+      EXPECT_THAT(utils::get_potential_file_path("Cu_u3.eam"),Eq(path));
+    }
+
+    EXPECT_THAT(utils::get_potential_date("ctest.txt","Test"),Eq("2020-02-20"));
+
+    remove("ctest.txt");
+}