Add Tokenizer class

Richard Berger 2020-05-15 15:36:13 -04:00
parent 8691579def
commit d41927b056
5 changed files with 161 additions and 0 deletions

src/tokenizer.cpp (new file, 53 lines)

@@ -0,0 +1,53 @@
/* -*- c++ -*- ----------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: Richard Berger (Temple U)
------------------------------------------------------------------------- */
#include "tokenizer.h"
using namespace LAMMPS_NS;
Tokenizer::Tokenizer(const std::string & str, const std::string & seperators) {
size_t end = -1;
do {
size_t start = str.find_first_not_of(seperators, end + 1);
if(start == std::string::npos) break;
end = str.find_first_of(seperators, start);
if(end == std::string::npos) {
tokens.push_back(str.substr(start));
} else {
tokens.push_back(str.substr(start, end-start));
}
} while(end != std::string::npos);
}
Tokenizer::iterator Tokenizer::begin() {
return tokens.begin();
}
Tokenizer::iterator Tokenizer::end() {
return tokens.end();
}
const std::string & Tokenizer::operator[](size_t index) {
return tokens[index];
}
const size_t Tokenizer::count() const {
return tokens.size();
}
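
The constructor alternates find_first_not_of (skip separators, locate a token's start) and find_first_of (locate its end). A minimal standalone sketch of that same scan, hypothetical and not part of this commit (the sample string and separator set are made up):

#include <iostream>
#include <string>
#include <vector>

int main() {
    // the Tokenizer constructor's loop, restated as a free-standing scan
    std::string str = "  test \t word ";
    std::string separators = " \t";
    std::vector<std::string> tokens;

    size_t end = std::string::npos;              // npos + 1 wraps to 0
    do {
        size_t start = str.find_first_not_of(separators, end + 1);
        if (start == std::string::npos) break;   // only separators remain
        end = str.find_first_of(separators, start);
        // substr clamps the count, so end == npos takes the rest of the string
        tokens.push_back(str.substr(start, end - start));
    } while (end != std::string::npos);

    for (const auto & t : tokens) std::cout << t << "\n";  // prints "test", "word"
}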

src/tokenizer.h (new file, 42 lines)

@@ -0,0 +1,42 @@
/* -*- c++ -*- ----------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: Richard Berger (Temple U)
------------------------------------------------------------------------- */
#ifndef LMP_TOKENIZER_H
#define LMP_TOKENIZER_H

#include <cstddef>
#include <string>
#include <vector>

namespace LAMMPS_NS {

class Tokenizer {
    std::vector<std::string> tokens;
public:
    typedef std::vector<std::string>::iterator iterator;

    Tokenizer(const std::string & str, const std::string & separators = " \t\r\n\f");

    iterator begin();
    iterator end();

    const std::string & operator[](size_t index);
    size_t count() const;
};

}

#endif
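
The header exposes count(), operator[], and begin()/end(), so a tokenizer can be consumed by index or by range-for. A sketch of a hypothetical caller, assuming tokenizer.h is on the include path (the input string is made up):

#include "tokenizer.h"
#include <cstdio>

using namespace LAMMPS_NS;

int main() {
    Tokenizer words("fix 1 all nve");        // default whitespace separators
    printf("%zu tokens\n", words.count());   // 4 tokens
    printf("first: %s\n", words[0].c_str()); // first: fix
    for (auto & w : words) printf("%s\n", w.c_str());
    return 0;
}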

CMakeLists.txt (test tree; full path not shown in this view)

@@ -1,3 +1,5 @@
include(GTest)
add_subdirectory(force-styles)
add_subdirectory(utils)

CMakeLists.txt (utils test directory; full path not shown in this view)

@@ -0,0 +1,3 @@
add_executable(test_tokenizer test_tokenizer.cpp)
target_link_libraries(test_tokenizer PRIVATE lammps GTest::GMockMain GTest::GMock GTest::GTest)
add_test(Tokenizer test_tokenizer)

test_tokenizer.cpp (new file, 61 lines)

@@ -0,0 +1,61 @@
#include <gtest/gtest.h>
#include <gmock/gmock.h>
#include "tokenizer.h"
using namespace LAMMPS_NS;
using ::testing::Eq;
TEST(Tokenizer, empty_string) {
Tokenizer t("", " ");
ASSERT_EQ(t.count(), 0);
}
TEST(Tokenizer, whitespace_only) {
Tokenizer t(" ", " ");
ASSERT_EQ(t.count(), 0);
}
TEST(Tokenizer, single_word) {
Tokenizer t("test", " ");
ASSERT_EQ(t.count(), 1);
}
TEST(Tokenizer, two_words) {
Tokenizer t("test word", " ");
ASSERT_EQ(t.count(), 2);
}
TEST(Tokenizer, prefix_seperators) {
Tokenizer t(" test word", " ");
ASSERT_EQ(t.count(), 2);
}
TEST(Tokenizer, postfix_seperators) {
Tokenizer t("test word ", " ");
ASSERT_EQ(t.count(), 2);
}
TEST(Tokenizer, iterate_words) {
Tokenizer t(" test word ", " ");
ASSERT_THAT(t[0], Eq("test"));
ASSERT_THAT(t[1], Eq("word"));
ASSERT_EQ(t.count(), 2);
}
TEST(Tokenizer, default_seperators) {
Tokenizer t(" \r\n test \t word \f");
ASSERT_THAT(t[0], Eq("test"));
ASSERT_THAT(t[1], Eq("word"));
ASSERT_EQ(t.count(), 2);
}
TEST(Tokenizer, for_loop) {
Tokenizer t(" \r\n test \t word \f");
std::vector<std::string> list;
for(auto word : t) {
list.push_back(word);
}
ASSERT_THAT(list[0], Eq("test"));
ASSERT_THAT(list[1], Eq("word"));
}
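
The cases above use a single-character separator set apart from the default one; every character in the separator string acts independently. A sketch of one more case along those lines, hypothetical and not part of this commit:

// Illustrative extra case: both ',' and ' ' split tokens, and runs of either are skipped.
TEST(Tokenizer, multiple_separator_characters) {
    Tokenizer t("a, b,c", ", ");
    ASSERT_EQ(t.count(), 3);
    ASSERT_THAT(t[0], Eq("a"));
    ASSERT_THAT(t[1], Eq("b"));
    ASSERT_THAT(t[2], Eq("c"));
}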