Merge pull request #2144 from rbberger/tokenizer-performance-fixes

Performance optimization of Tokenizer
This commit is contained in:
Axel Kohlmeyer 2020-06-12 04:20:05 -04:00 committed by GitHub
commit 5f86bac419
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 252 additions and 110 deletions

View File

@ -1123,7 +1123,7 @@ char * EIMPotentialFileReader::next_line(FILE * fp) {
concat = false; concat = false;
} }
nwords = utils::count_words(line); nwords += utils::count_words(&line[n]);
// skip line if blank // skip line if blank
if (nwords > 0) { if (nwords > 0) {

View File

@ -2448,11 +2448,11 @@ int AtomVec::process_fields(char *str, const char *default_str, Method *method)
} }
// tokenize words in both strings // tokenize words in both strings
Tokenizer words(str, " "); std::vector<std::string> words = Tokenizer(str, " ").as_vector();
Tokenizer def_words(default_str, " "); std::vector<std::string> def_words = Tokenizer(default_str, " ").as_vector();
int nfield = words.count(); int nfield = words.size();
int ndef = def_words.count(); int ndef = def_words.size();
// process fields one by one, add to index vector // process fields one by one, add to index vector

View File

@ -514,8 +514,8 @@ char *AtomVecHybrid::merge_fields(int inum, char *root,
// identify unique words in concatenated string // identify unique words in concatenated string
Tokenizer words(concat, " "); std::vector<std::string> words = Tokenizer(concat, " ").as_vector();
int nwords = words.count(); int nwords = words.size();
int *unique = new int[nwords]; int *unique = new int[nwords];

View File

@ -83,9 +83,9 @@ void PotentialFileReader::next_dvector(double * list, int n) {
} }
} }
ValueTokenizer PotentialFileReader::next_values(int nparams, const std::string & seperators) { ValueTokenizer PotentialFileReader::next_values(int nparams, const std::string & separators) {
try { try {
return reader->next_values(nparams, seperators); return reader->next_values(nparams, separators);
} catch (FileReaderException & e) { } catch (FileReaderException & e) {
error->one(FLERR, e.what()); error->one(FLERR, e.what());
} }

View File

@ -43,7 +43,7 @@ namespace LAMMPS_NS
void skip_line(); void skip_line();
char * next_line(int nparams = 0); char * next_line(int nparams = 0);
void next_dvector(double * list, int n); void next_dvector(double * list, int n);
ValueTokenizer next_values(int nparams, const std::string & seperators = TOKENIZER_DEFAULT_SEPERATORS); ValueTokenizer next_values(int nparams, const std::string & separators = TOKENIZER_DEFAULT_SEPARATORS);
// convenience functions // convenience functions
double next_double(); double next_double();

View File

@ -86,7 +86,7 @@ char *TextFileReader::next_line(int nparams) {
// strip comment // strip comment
if (ignore_comments && (ptr = strchr(line, '#'))) *ptr = '\0'; if (ignore_comments && (ptr = strchr(line, '#'))) *ptr = '\0';
nwords = utils::count_words(line); nwords += utils::count_words(&line[n]);
// skip line if blank // skip line if blank
if (nwords > 0) { if (nwords > 0) {
@ -116,6 +116,6 @@ void TextFileReader::next_dvector(double * list, int n) {
} }
} }
ValueTokenizer TextFileReader::next_values(int nparams, const std::string & seperators) { ValueTokenizer TextFileReader::next_values(int nparams, const std::string & separators) {
return ValueTokenizer(next_line(nparams), seperators); return ValueTokenizer(next_line(nparams), separators);
} }

View File

@ -42,7 +42,7 @@ namespace LAMMPS_NS
char * next_line(int nparams = 0); char * next_line(int nparams = 0);
void next_dvector(double * list, int n); void next_dvector(double * list, int n);
ValueTokenizer next_values(int nparams, const std::string & seperators = TOKENIZER_DEFAULT_SEPERATORS); ValueTokenizer next_values(int nparams, const std::string & separators = TOKENIZER_DEFAULT_SEPARATORS);
}; };
class FileReaderException : public std::exception { class FileReaderException : public std::exception {

View File

@ -17,77 +17,118 @@
#include "tokenizer.h" #include "tokenizer.h"
#include "utils.h" #include "utils.h"
#include "fmt/format.h"
using namespace LAMMPS_NS; using namespace LAMMPS_NS;
Tokenizer::Tokenizer(const std::string & str, const std::string & seperators) { TokenizerException::TokenizerException(const std::string & msg, const std::string & token){
size_t end = -1; if(token.empty()) {
message = msg;
} else {
message = fmt::format("{}: '{}'", msg, token);
}
}
do { Tokenizer::Tokenizer(const std::string & str, const std::string & separators) :
size_t start = str.find_first_not_of(seperators, end + 1); text(str), separators(separators), start(0), ntokens(std::string::npos)
if(start == std::string::npos) break; {
reset();
}
end = str.find_first_of(seperators, start); Tokenizer::Tokenizer(const Tokenizer & rhs) :
text(rhs.text), separators(rhs.separators), ntokens(rhs.ntokens)
{
reset();
}
Tokenizer::Tokenizer(Tokenizer && rhs) :
text(std::move(rhs.text)), separators(std::move(rhs.separators)), ntokens(rhs.ntokens)
{
reset();
}
void Tokenizer::reset() {
start = text.find_first_not_of(separators);
}
void Tokenizer::skip(int n) {
for(int i = 0; i < n; ++i) {
if(!has_next()) throw TokenizerException("No more tokens", "");
size_t end = text.find_first_of(separators, start);
if(end == std::string::npos) { if(end == std::string::npos) {
tokens.push_back(str.substr(start)); start = end;
} else { } else {
tokens.push_back(str.substr(start, end-start)); start = text.find_first_not_of(separators, end+1);
} }
} while(end != std::string::npos); }
} }
Tokenizer::Tokenizer(const Tokenizer & rhs) : tokens(rhs.tokens) { bool Tokenizer::has_next() const {
return start != std::string::npos;
} }
Tokenizer::Tokenizer(Tokenizer && rhs) : tokens(std::move(rhs.tokens)) { std::string Tokenizer::next() {
if(!has_next()) throw TokenizerException("No more tokens", "");
size_t end = text.find_first_of(separators, start);
if(end == std::string::npos) {
std::string token = text.substr(start);
start = end;
return token;
}
std::string token = text.substr(start, end-start);
start = text.find_first_not_of(separators, end+1);
return token;
} }
Tokenizer::iterator Tokenizer::begin() { size_t Tokenizer::count() {
return tokens.begin(); // lazy evaluation
if (ntokens == std::string::npos) {
ntokens = utils::count_words(text, separators);
}
return ntokens;
} }
Tokenizer::iterator Tokenizer::end() { std::vector<std::string> Tokenizer::as_vector() {
return tokens.end(); // store current state
} size_t current = start;
Tokenizer::const_iterator Tokenizer::cbegin() const { reset();
return tokens.cbegin();
}
Tokenizer::const_iterator Tokenizer::cend() const { // generate vector
return tokens.cend(); std::vector<std::string> tokens;
}
std::string & Tokenizer::operator[](size_t index) { while(has_next()) {
return tokens[index]; tokens.emplace_back(next());
} }
size_t Tokenizer::count() const { // restore state
return tokens.size(); start = current;
return tokens;
} }
ValueTokenizer::ValueTokenizer(const std::string & str, const std::string & seperators) : tokens(str, seperators) { ValueTokenizer::ValueTokenizer(const std::string & str, const std::string & separators) : tokens(str, separators) {
current = tokens.begin();
} }
ValueTokenizer::ValueTokenizer(const ValueTokenizer & rhs) : tokens(rhs.tokens) { ValueTokenizer::ValueTokenizer(const ValueTokenizer & rhs) : tokens(rhs.tokens) {
current = tokens.begin();
} }
ValueTokenizer::ValueTokenizer(ValueTokenizer && rhs) : tokens(std::move(rhs.tokens)) { ValueTokenizer::ValueTokenizer(ValueTokenizer && rhs) : tokens(std::move(rhs.tokens)) {
current = tokens.begin();
} }
bool ValueTokenizer::has_next() const { bool ValueTokenizer::has_next() const {
return current != tokens.cend(); return tokens.has_next();
} }
std::string ValueTokenizer::next_string() { std::string ValueTokenizer::next_string() {
if (has_next()) { if (has_next()) {
std::string value = *current; std::string value = tokens.next();
++current;
return value; return value;
} }
return ""; return "";
@ -95,11 +136,11 @@ std::string ValueTokenizer::next_string() {
int ValueTokenizer::next_int() { int ValueTokenizer::next_int() {
if (has_next()) { if (has_next()) {
if(!utils::is_integer(*current)) { std::string current = tokens.next();
throw InvalidIntegerException(*current); if(!utils::is_integer(current)) {
throw InvalidIntegerException(current);
} }
int value = atoi(current->c_str()); int value = atoi(current.c_str());
++current;
return value; return value;
} }
return 0; return 0;
@ -107,45 +148,44 @@ int ValueTokenizer::next_int() {
bigint ValueTokenizer::next_bigint() { bigint ValueTokenizer::next_bigint() {
if (has_next()) { if (has_next()) {
if(!utils::is_integer(*current)) { std::string current = tokens.next();
throw InvalidIntegerException(*current); if(!utils::is_integer(current)) {
throw InvalidIntegerException(current);
} }
bigint value = ATOBIGINT(current->c_str()); bigint value = ATOBIGINT(current.c_str());
++current;
return value; return value;
} }
return 0; return 0;
} }
tagint ValueTokenizer::next_tagint() { tagint ValueTokenizer::next_tagint() {
if (current != tokens.end()) { if (has_next()) {
if(!utils::is_integer(*current)) { std::string current = tokens.next();
throw InvalidIntegerException(*current); if(!utils::is_integer(current)) {
throw InvalidIntegerException(current);
} }
tagint value = ATOTAGINT(current->c_str()); tagint value = ATOTAGINT(current.c_str());
++current;
return value; return value;
} }
return 0; return 0;
} }
double ValueTokenizer::next_double() { double ValueTokenizer::next_double() {
if (current != tokens.end()) { if (has_next()) {
if(!utils::is_double(*current)) { std::string current = tokens.next();
throw InvalidFloatException(*current); if(!utils::is_double(current)) {
throw InvalidFloatException(current);
} }
double value = atof(current.c_str());
double value = atof(current->c_str());
++current;
return value; return value;
} }
return 0.0; return 0.0;
} }
void ValueTokenizer::skip(int ntokens) { void ValueTokenizer::skip(int n) {
current = std::next(current, ntokens); tokens.skip(n);
} }
size_t ValueTokenizer::count() const { size_t ValueTokenizer::count() {
return tokens.count(); return tokens.count();
} }

View File

@ -25,34 +25,33 @@
namespace LAMMPS_NS { namespace LAMMPS_NS {
#define TOKENIZER_DEFAULT_SEPERATORS " \t\r\n\f" #define TOKENIZER_DEFAULT_SEPARATORS " \t\r\n\f"
class Tokenizer { class Tokenizer {
std::vector<std::string> tokens; std::string text;
std::string separators;
size_t start;
size_t ntokens;
public: public:
typedef std::vector<std::string>::iterator iterator; Tokenizer(const std::string & str, const std::string & separators = TOKENIZER_DEFAULT_SEPARATORS);
typedef std::vector<std::string>::const_iterator const_iterator;
Tokenizer(const std::string & str, const std::string & seperators = TOKENIZER_DEFAULT_SEPERATORS);
Tokenizer(Tokenizer &&); Tokenizer(Tokenizer &&);
Tokenizer(const Tokenizer &); Tokenizer(const Tokenizer &);
Tokenizer& operator=(const Tokenizer&) = default; Tokenizer& operator=(const Tokenizer&) = default;
Tokenizer& operator=(Tokenizer&&) = default; Tokenizer& operator=(Tokenizer&&) = default;
iterator begin(); void reset();
iterator end(); void skip(int n);
const_iterator cbegin() const; bool has_next() const;
const_iterator cend() const; std::string next();
std::string & operator[](size_t index); size_t count();
size_t count() const; std::vector<std::string> as_vector();
}; };
class TokenizerException : public std::exception { class TokenizerException : public std::exception {
std::string message; std::string message;
public: public:
TokenizerException(const std::string & msg, const std::string & token) : message(msg + ": '" + token + "'") { TokenizerException(const std::string & msg, const std::string & token);
}
~TokenizerException() throw() { ~TokenizerException() throw() {
} }
@ -76,9 +75,8 @@ public:
class ValueTokenizer { class ValueTokenizer {
Tokenizer tokens; Tokenizer tokens;
Tokenizer::const_iterator current;
public: public:
ValueTokenizer(const std::string & str, const std::string & seperators = TOKENIZER_DEFAULT_SEPERATORS); ValueTokenizer(const std::string & str, const std::string & separators = TOKENIZER_DEFAULT_SEPARATORS);
ValueTokenizer(const ValueTokenizer &); ValueTokenizer(const ValueTokenizer &);
ValueTokenizer(ValueTokenizer &&); ValueTokenizer(ValueTokenizer &&);
ValueTokenizer& operator=(const ValueTokenizer&) = default; ValueTokenizer& operator=(const ValueTokenizer&) = default;
@ -91,9 +89,9 @@ public:
double next_double(); double next_double();
bool has_next() const; bool has_next() const;
void skip(int ntokens); void skip(int n);
size_t count() const; size_t count();
}; };

View File

@ -364,21 +364,70 @@ std::string utils::trim_comment(const std::string & line) {
return std::string(line); return std::string(line);
} }
/* ----------------------------------------------------------------------
   count the words in a NUL-terminated C-string
   a word is a maximal run of characters other than " \t\r\n\f"
------------------------------------------------------------------------- */

size_t utils::count_words(const char * text) {
  // predicate for the fixed set of whitespace characters that delimit words
  const auto is_blank = [](char ch) {
    return ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n' || ch == '\f';
  };

  size_t nwords = 0;
  const char * pos = text;

  while (*pos) {
    if (is_blank(*pos)) {
      // still between words: step over the separator
      ++pos;
      continue;
    }

    // first character of a new word
    ++nwords;

    // advance to the separator (or the terminating NUL) that ends this word
    do {
      ++pos;
    } while (*pos && !is_blank(*pos));
  }
  return nwords;
}
/* ----------------------------------------------------------------------
   count the words in a std::string by handing its character data
   to the C-string overload of count_words()
------------------------------------------------------------------------- */

size_t utils::count_words(const std::string & text) {
  const char * const buf = text.c_str();
  return utils::count_words(buf);
}
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------
Return number of words Return number of words
------------------------------------------------------------------------- */ ------------------------------------------------------------------------- */
size_t utils::count_words(const std::string & text, const std::string & seperators) { size_t utils::count_words(const std::string & text, const std::string & separators) {
ValueTokenizer words(text, seperators); size_t count = 0;
return words.count(); size_t start = text.find_first_not_of(separators);
while (start != std::string::npos) {
size_t end = text.find_first_of(separators, start);
++count;
if(end == std::string::npos) {
return count;
} else {
start = text.find_first_not_of(separators, end + 1);
}
}
return count;
} }
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------
Trim comment from string and return number of words Trim comment from string and return number of words
------------------------------------------------------------------------- */ ------------------------------------------------------------------------- */
size_t utils::trim_and_count_words(const std::string & text, const std::string & seperators) { size_t utils::trim_and_count_words(const std::string & text, const std::string & separators) {
return utils::count_words(utils::trim_comment(text), seperators); return utils::count_words(utils::trim_comment(text), separators);
} }
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------

View File

@ -153,18 +153,34 @@ namespace LAMMPS_NS {
/** /**
* \brief Count words in string * \brief Count words in string
* \param text string that should be searched * \param text string that should be searched
* \param seperators string containing characters that will be treated as whitespace * \param separators string containing characters that will be treated as whitespace
* \return number of words found * \return number of words found
*/ */
size_t count_words(const std::string & text, const std::string & seperators = " \t\r\n\f"); size_t count_words(const std::string & text, const std::string & separators);
/**
* \brief Count words in string, ignore any whitespace matching " \t\r\n\f"
* \param text string that should be searched
* \note this overload takes no separators argument; the whitespace set is fixed to " \t\r\n\f"
* \return number of words found
*/
size_t count_words(const std::string & text);
/**
* \brief Count words in C-string, ignore any whitespace matching " \t\r\n\f"
* \param text string that should be searched
* \note this overload takes no separators argument; the whitespace set is fixed to " \t\r\n\f"
* \return number of words found
*/
size_t count_words(const char * text);
/** /**
* \brief Count words in a single line, trim anything from '#' onward * \brief Count words in a single line, trim anything from '#' onward
* \param text string that should be trimmed and searched * \param text string that should be trimmed and searched
* \param seperators string containing characters that will be treated as whitespace * \param separators string containing characters that will be treated as whitespace
* \return number of words found * \return number of words found
*/ */
size_t trim_and_count_words(const std::string & text, const std::string & seperators = " \t\r\n\f"); size_t trim_and_count_words(const std::string & text, const std::string & separators = " \t\r\n\f");
/** /**
* \brief Check if string can be converted to valid integer * \brief Check if string can be converted to valid integer

View File

@ -38,37 +38,33 @@ TEST(Tokenizer, two_words) {
ASSERT_EQ(t.count(), 2); ASSERT_EQ(t.count(), 2);
} }
TEST(Tokenizer, prefix_seperators) { TEST(Tokenizer, prefix_separators) {
Tokenizer t(" test word", " "); Tokenizer t(" test word", " ");
ASSERT_EQ(t.count(), 2); ASSERT_EQ(t.count(), 2);
} }
TEST(Tokenizer, postfix_seperators) { TEST(Tokenizer, postfix_separators) {
Tokenizer t("test word ", " "); Tokenizer t("test word ", " ");
ASSERT_EQ(t.count(), 2); ASSERT_EQ(t.count(), 2);
} }
TEST(Tokenizer, iterate_words) { TEST(Tokenizer, iterate_words) {
Tokenizer t(" test word ", " "); Tokenizer t(" test word ", " ");
ASSERT_THAT(t[0], Eq("test")); ASSERT_THAT(t.next(), Eq("test"));
ASSERT_THAT(t[1], Eq("word")); ASSERT_THAT(t.next(), Eq("word"));
ASSERT_EQ(t.count(), 2); ASSERT_EQ(t.count(), 2);
} }
TEST(Tokenizer, default_seperators) { TEST(Tokenizer, default_separators) {
Tokenizer t(" \r\n test \t word \f"); Tokenizer t(" \r\n test \t word \f");
ASSERT_THAT(t[0], Eq("test")); ASSERT_THAT(t.next(), Eq("test"));
ASSERT_THAT(t[1], Eq("word")); ASSERT_THAT(t.next(), Eq("word"));
ASSERT_EQ(t.count(), 2); ASSERT_EQ(t.count(), 2);
} }
TEST(Tokenizer, for_loop) { TEST(Tokenizer, as_vector) {
Tokenizer t(" \r\n test \t word \f"); Tokenizer t(" \r\n test \t word \f");
std::vector<std::string> list; std::vector<std::string> list = t.as_vector();
for(auto word : t) {
list.push_back(word);
}
ASSERT_THAT(list[0], Eq("test")); ASSERT_THAT(list[0], Eq("test"));
ASSERT_THAT(list[1], Eq("word")); ASSERT_THAT(list[1], Eq("word"));
} }

View File

@ -15,6 +15,9 @@
#include "gmock/gmock.h" #include "gmock/gmock.h"
#include "utils.h" #include "utils.h"
#include <string> #include <string>
#include <cerrno>
#include <cstdio>
#include <cstdlib>
using namespace LAMMPS_NS; using namespace LAMMPS_NS;
using ::testing::Eq; using ::testing::Eq;
@ -28,10 +31,18 @@ TEST(Utils, count_words) {
ASSERT_EQ(utils::count_words("some text # comment"), 4); ASSERT_EQ(utils::count_words("some text # comment"), 4);
} }
// Custom separator set " #": both the blank and '#' split words, so only
// "some", "text" and "comment" are counted.
TEST(Utils, count_words_non_default) {
ASSERT_EQ(utils::count_words("some text # comment", " #"), 3);
}
TEST(Utils, trim_and_count_words) { TEST(Utils, trim_and_count_words) {
ASSERT_EQ(utils::trim_and_count_words("some text # comment"), 2); ASSERT_EQ(utils::trim_and_count_words("some text # comment"), 2);
} }
// Leading and trailing blanks must not add to the count; with the default
// whitespace set the lone '#' is still counted as a word, giving 4.
TEST(Utils, count_words_with_extra_spaces) {
ASSERT_EQ(utils::count_words(" some text # comment "), 4);
}
TEST(Utils, valid_integer1) { TEST(Utils, valid_integer1) {
ASSERT_TRUE(utils::is_integer("10")); ASSERT_TRUE(utils::is_integer("10"));
} }
@ -215,3 +226,35 @@ TEST(Utils, path_basename) {
ASSERT_THAT(utils::path_basename("/parent/folder/filename"), Eq("filename")); ASSERT_THAT(utils::path_basename("/parent/folder/filename"), Eq("filename"));
#endif #endif
} }
// Set errno to ENOENT and check the text utils::getsyserror() reports for it.
// The literal message is only compared on Linux; the test is skipped elsewhere.
TEST(Utils, getsyserror) {
#if !defined(__linux__)
  GTEST_SKIP();
#else
  errno = ENOENT;
  const std::string message = utils::getsyserror();
  ASSERT_THAT(message, Eq("No such file or directory"));
#endif
}
// Exercise the potential file helpers of utils against a small scratch file
// that is created at the start of the test and removed at the end.
TEST(Utils, potential_file) {
  // write a scratch file whose only line is a comment with a DATE: tag
  FILE *scratch = fopen("ctest.txt","w");
  ASSERT_NE(scratch,nullptr);
  fputs("# DATE: 2020-02-20 CONTRIBUTOR: Nessuno\n",scratch);
  fclose(scratch);

  EXPECT_TRUE(utils::file_is_readable("ctest.txt"));
  EXPECT_FALSE(utils::file_is_readable("no_such_file.txt"));
  EXPECT_THAT(utils::get_potential_file_path("ctest.txt"),Eq("ctest.txt"));

  // this lookup is only checked when LAMMPS_POTENTIALS is set in the environment
  const char *potentials_dir = getenv("LAMMPS_POTENTIALS");
  if (potentials_dir != nullptr) {
    const std::string expected = utils::path_join(potentials_dir,"Cu_u3.eam");
    EXPECT_THAT(utils::get_potential_file_path("Cu_u3.eam"),Eq(expected));
  }

  EXPECT_THAT(utils::get_potential_date("ctest.txt","Test"),Eq("2020-02-20"));

  // clean up the scratch file
  remove("ctest.txt");
}