initial version
171
mrw/tokenizer.hpp
Normal file
@@ -0,0 +1,171 @@
/** @file

    $Id$

    $Date$
    $Author$

    @copy © Marc Wäckerlin
    @license LGPL, see file <a href="license.html">COPYING</a>

    $Log$
    Revision 1.1  2004/12/17 16:26:58  marc
    initial version

*/
#include <string>
#include <stdexcept>

namespace mrw {

  /** @addtogroup regexp

      There is a Tokenizer which splits strings according to a list of
      delimiters and lets you iterate over the individual tokens.

      This code:

      @code
        mrw::Tokenizer token("first,second,third,,fifth", false, ",");
        while (token) std::cout<<"TOKEN: '"<<token()<<"'"<<std::endl;
      @endcode

      writes:

      @verbatim
        TOKEN: 'first'
        TOKEN: 'second'
        TOKEN: 'third'
        TOKEN: ''
        TOKEN: 'fifth'
      @endverbatim
  */
  //@{

  /** @brief split strings into parts separated by delimiters

      Splits a string into individual parts according to a list of
      delimiters. If a delimiter is immediately followed by another
      delimiter, the whole run of delimiters is consumed when the
      @c greedy flag is set; otherwise an empty token is produced
      between them.

      This code:

      @code
        mrw::Tokenizer token("Hello world, here I am!", true, " \t\n,.?!");
        while (token) std::cout<<"TOKEN: '"<<token()<<"'"<<std::endl;
      @endcode

      writes:

      @verbatim
        TOKEN: 'Hello'
        TOKEN: 'world'
        TOKEN: 'here'
        TOKEN: 'I'
        TOKEN: 'am'
      @endverbatim
  */

  class Tokenizer {

    public:

      /** @brief initialize a Tokenizer

          @param text the text that has to be split into tokens

          @param greedy
            - @c true don't generate empty tokens: if a delimiter is
              followed by another delimiter, the whole run of delimiters
              is skipped
            - @c false generate an empty token between two consecutive
              delimiters

          @param delim a list of delimiters; each char in the string is a
                       delimiter
      */
      Tokenizer(const std::string& text, bool greedy = true,
                const std::string& delim = " \t\n")
        throw(std::bad_exception):
          _text(text), _delim(delim), _pos(0), _oldpos(0), _greedy(greedy) {
      }
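
      /* A minimal usage sketch of the default arguments (greedy splitting
         on blanks, tabs and newlines); it assumes <iostream> is included,
         and the variable name is purely illustrative:

         @code
           mrw::Tokenizer words("one two\tthree");
           while (words) std::cout<<words()<<std::endl; // one, two, three
         @endcode
      */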

      /** @brief advance to the next token

          Advances to the next token. Call this before you access a token:

          @code
            while (token) [...]
          @endcode

          @return
            - @c true if there is a token
            - @c false if the end was reached

          @see @ref mrw::Tokenizer for an example
      */
      operator bool() throw(std::bad_exception) {
        _oldpos = _greedy
          ? _text.find_first_not_of(_delim, _pos)
          : _pos>0 ? _pos+1 : _pos;
        if (_pos==std::string::npos || _oldpos==std::string::npos) return false;
        _pos = _text.find_first_of(_delim, _oldpos);
        return true;
      }
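
      /* A short sketch of the advance-then-access pattern; the container
         and the <vector> include are assumptions, not part of this header:

         @code
           std::vector<std::string> parts;
           mrw::Tokenizer token("a b c");
           while (token)               // advance to the next token first ...
             parts.push_back(token()); // ... then read it: "a", "b", "c"
         @endcode
      */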

      /** @brief access the current token

          Get the current token. You must first call @c
          Tokenizer::operator bool() before you can access a
          token. This method does not change the state of the object, so
          it can be called several times and always returns the same
          token.

          @code
            std::string actualToken = token();
          @endcode

          @return the current token

          @see @ref mrw::Tokenizer for an example
      */
      std::string operator()() const throw(std::bad_exception) {
        return _text.substr(_oldpos, _pos-_oldpos);
      }

      /** @brief reset the tokenizer

          Resets the tokenizer so that you can restart your iteration.
      */
      void reset() throw() {
        _pos = _oldpos = 0;
      }
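
      /* A minimal sketch of restarting an iteration over the same text;
         the counters are illustrative only:

         @code
           mrw::Tokenizer token("one two");
           int first = 0, second = 0;
           while (token) ++first;   // consumes both tokens
           token.reset();           // rewind to the start of the same text
           while (token) ++second;  // visits "one" and "two" again
           // first == 2 && second == 2
         @endcode
      */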

      /** @brief reset the tokenizer and initialize it with a new text

          Resets the tokenizer so that you can restart your iteration
          on a new text.

          @code
            mrw::Tokenizer token("Hello World");
            while (token) std::cout<<"-> "<<token();
            token.reset("Another text to split");
            while (token) std::cout<<"-> "<<token();
          @endcode

          @param text the new text; the text given in the constructor is replaced
      */
      void reset(const std::string& text) throw(std::bad_exception) {
        _text = text;
        reset();
      }

    private:

      std::string _text;
      std::string _delim;
      std::string::size_type _pos;
      std::string::size_type _oldpos;
      bool _greedy;

  };
  //@}
}
72
mrw/tokenizer_test.cpp
Normal file
@@ -0,0 +1,72 @@
/** @file

    $Id$

    $Date$
    $Author$

    @copy © Marc Wäckerlin
    @license LGPL, see file <a href="license.html">COPYING</a>

    $Log$
    Revision 1.1  2004/12/17 16:26:58  marc
    initial version

*/

#include <mrw/tokenizer.hpp>
#include <mrw/list.hpp>
#include <algorithm>

#include <cppunit/TestFixture.h>
#include <cppunit/ui/text/TestRunner.h>
#include <cppunit/extensions/HelperMacros.h>
#include <cppunit/extensions/TestFactoryRegistry.h>

#include <iostream>

class TokenizerTest: public CppUnit::TestFixture {
  public:
    void CheckNonGreedy() {
      const std::string aux[] = {"first", "second", "third", "", "fifth"};
      std::list<std::string> a(aux, aux+sizeof(aux)/sizeof(std::string)), b;
      mrw::Tokenizer token("first,second,third,,fifth", false, ",");
      while (token) b<<token(); // the shove operator<< from <mrw/list.hpp> collects the tokens
      CPPUNIT_ASSERT(equal(a.begin(), a.end(), b.begin()));
    }
    void CheckGreedy() {
      const std::string aux[] = {"Hello", "world", "here", "I", "am"};
      std::list<std::string> a(aux, aux+sizeof(aux)/sizeof(std::string)), b;
      mrw::Tokenizer token("Hello world, here I am!", true, " \t\n,.?!");
      while (token) b<<token();
      CPPUNIT_ASSERT(equal(a.begin(), a.end(), b.begin()));
    }
    void CheckReset() {
      const std::string aux[] = {"first", "second", "third", "", "fifth"};
      std::list<std::string> a(aux, aux+sizeof(aux)/sizeof(std::string)), b;
      mrw::Tokenizer token("first,second,third,,fifth", false, ",");
      while (token) b<<token();
      CPPUNIT_ASSERT(equal(a.begin(), a.end(), b.begin()));
      const std::string aux2[] = {"a", "b", "c", "d", "e"};
      std::list<std::string> a2(aux2, aux2+sizeof(aux2)/sizeof(std::string)),
        b2, b3;
      token.reset("a,b,c,d,e");
      while (token) b2<<token();
      CPPUNIT_ASSERT(equal(a2.begin(), a2.end(), b2.begin()));
      token.reset();
      while (token) b3<<token();
      CPPUNIT_ASSERT(equal(a2.begin(), a2.end(), b3.begin()));
    }
    CPPUNIT_TEST_SUITE(TokenizerTest);
    CPPUNIT_TEST(CheckNonGreedy);
    CPPUNIT_TEST(CheckGreedy);
    CPPUNIT_TEST(CheckReset);
    CPPUNIT_TEST_SUITE_END();
};
CPPUNIT_TEST_SUITE_REGISTRATION(TokenizerTest);

int main() {
  CppUnit::TextUi::TestRunner runner;
  runner.addTest(CppUnit::TestFactoryRegistry::getRegistry().makeTest());
  return runner.run() ? 0 : 1;
}