start documenting tokenizer classes

2020-08-30 01:50:37 -04:00 · 2020-08-30 01:50:37 -04:00 · 96ee132e85
parent 31c91a8928
commit 96ee132e85
3 changed files with 74 additions and 0 deletions
--- a/doc/doxygen/Doxyfile.in
+++ b/doc/doxygen/Doxyfile.in
@ -422,6 +422,8 @@ INPUT                  = @LAMMPS_SOURCE_DIR@/utils.cpp      \
                         @LAMMPS_SOURCE_DIR@/atom.h         \
                         @LAMMPS_SOURCE_DIR@/input.cpp      \
                         @LAMMPS_SOURCE_DIR@/input.h        \
+                         @LAMMPS_SOURCE_DIR@/tokenizer.cpp  \
+                         @LAMMPS_SOURCE_DIR@/tokenizer.h    \

 # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
 # directories that are symbolic links (a Unix file system feature) are excluded
--- a/doc/src/pg_developer.rst
+++ b/doc/src/pg_developer.rst
@ -880,3 +880,32 @@ Convenience functions

 .. doxygenfunction:: timespec2seconds
   :project: progguide
+
+---------------------------
+
+Tokenizer classes
+=================
+
+The purpose of the tokenizer classes is to simplify the recurring task
+of breaking lines of text down into words and/or numbers.
+Traditionally, LAMMPS code has used the ``strtok()`` function from
+the C library for that purpose, but that function has two significant
+disadvantages: 1) it cannot be used concurrently from different LAMMPS
+instances since it stores its state in a global variable and 2) it
+modifies the string that it is processing.  These classes were
+implemented to avoid both of these issues and also to reduce the amount
+of code that needs to be written.
+
+The basic procedure is to create an instance of the class with the
+string to be processed as an argument and then loop until all
+available tokens have been read.  The constructor has a default set of
+separator characters, but it can be overridden. The default separators
+are all "whitespace" characters, i.e. the space character, the tab
+character, the carriage return character, the linefeed character, and
+the form feed character.
+
+.. doxygenclass:: LAMMPS_NS::Tokenizer
+   :project: progguide
+
+.. doxygenclass:: LAMMPS_NS::ValueTokenizer
+   :project: progguide
--- a/src/tokenizer.h
+++ b/src/tokenizer.h
@ -27,6 +27,11 @@ namespace LAMMPS_NS {

 #define TOKENIZER_DEFAULT_SEPARATORS " \t\r\n\f"

+/*! Class for splitting text into words
+ *
+ * \sa ValueTokenizer
+ */
+
 class Tokenizer {
    std::string text;
    std::string separators;
@ -39,13 +44,46 @@ public:
    Tokenizer& operator=(const Tokenizer&) = default;
    Tokenizer& operator=(Tokenizer&&) = default;

+    /*! Reposition the tokenizer state to the first word,
+     * i.e. the first non-separator character
+     */
    void reset();
+
+    /*! Skip over a given number of tokens
+     *
+     * \param  n  number of tokens to skip over
+     */
    void skip(int n);
+
+    /*! Indicate whether more tokens are available
+     *
+     * \return   true if there are more tokens, false if not
+     */
    bool has_next() const;
+
+    /*! Search the text to be processed for a substring.
+     *
+     * \param  str  string to be searched for
+     * \return      true if the string was found, false if not
+     */
    bool contains(const std::string & str) const;
+
+    /*! Retrieve next token.
+     *
+     * \return   string with the next token
+     */
    std::string next();

+    /*! Count number of tokens in text.
+     *
+     * \return   number of counted tokens
+     */
    size_t count();
+
+    /*! Retrieve the entire text converted to an STL vector of tokens.
+     *
+     * \return   STL vector of strings, one element per token
+     */
    std::vector<std::string> as_vector();
 };

@ -74,6 +112,11 @@ public:
    }
 };

+/*! Class for reading text with numbers
+ *
+ * \sa Tokenizer
+ */
+
 class ValueTokenizer {
    Tokenizer tokens;
 public: