initial version

mrw/tokenizer.hpp (new file, 171 lines)
@@ -0,0 +1,171 @@
/** @file

    $Id$

    $Date$
    $Author$

    @copy © Marc Wäckerlin
    @license LGPL, see file <a href="license.html">COPYING</a>

    $Log$
    Revision 1.1  2004/12/17 16:26:58  marc
    initial version


*/

#include <string>
#include <stdexcept>

namespace mrw {

  /** @addtogroup regexp

      There is a Tokenizer which splits strings according to a list of
      delimiters and lets you iterate over the individual tokens:

      This code:

      @code
      mrw::Tokenizer token("first,second,third,,fifth", false, ",");
      while (token) std::cout<<"TOKEN: '"<<token()<<"'"<<std::endl;
      @endcode
      Writes:

      @verbatim
      TOKEN: 'first'
      TOKEN: 'second'
      TOKEN: 'third'
      TOKEN: ''
      TOKEN: 'fifth'
      @endverbatim
  */
  //@{

  /** @brief split strings into parts separated by delimiters

      Splits a string into individual parts according to a list of
      delimiters. If flag @c greedy is set, consecutive delimiters are
      eaten as a single separator and no empty tokens are produced.

      This code:

      @code
      mrw::Tokenizer token("Hello world, here I am!", true, " \t\n,.?!");
      while (token) std::cout<<"TOKEN: '"<<token()<<"'"<<std::endl;
      @endcode

      Writes:

      @verbatim
      TOKEN: 'Hello'
      TOKEN: 'world'
      TOKEN: 'here'
      TOKEN: 'I'
      TOKEN: 'am'
      @endverbatim
  */
  class Tokenizer {

  public:

    /** @brief initialize a Tokenizer

        @param text the text that has to be split into tokens

        @param greedy
          - @c true  don't generate empty tokens; a run of consecutive
                     delimiters is treated as a single separator
          - @c false every delimiter ends a token, so consecutive
                     delimiters produce empty tokens

        @param delim a list of delimiters, each char in the string is a
                     delimiter
    */
    Tokenizer(const std::string& text, bool greedy = true,
              const std::string& delim = " \t\n")
      throw(std::bad_exception):
      _text(text), _delim(delim), _pos(0), _oldpos(0), _greedy(greedy) {
    }

    /** @brief advance to the next token

        Advances to the next token. Call this before you access the tokens.

        @code
        while (token) [...]
        @endcode

        @return
          - @c true  if there is a token
          - @c false if the end was reached

        @see @ref mrw::Tokenizer for an example
    */
    operator bool() throw(std::bad_exception) {
      _oldpos = _greedy
        ? _text.find_first_not_of(_delim, _pos)
        : _pos>0 ? _pos+1 : _pos;
      if (_pos==std::string::npos || _oldpos==std::string::npos) return false;
      _pos = _text.find_first_of(_delim, _oldpos);
      return true;
    }

    /** @brief access the token

        Get the current token. You must first call @c
        Tokenizer::operator bool() before you can access a
        token. This method does not change the state of the object, so
        it can be called several times and always returns the same
        token.

        @code
        std::string currentToken = token();
        @endcode

        @return the current token

        @see @ref mrw::Tokenizer for an example
    */
    std::string operator()() const throw(std::bad_exception) {
      return _text.substr(_oldpos, _pos-_oldpos);
    }

    /** @brief reset the tokenizer

        Resets the tokenizer so that you can restart your iteration.
    */
    void reset() throw() {
      _pos = _oldpos = 0;
    }

    /** @brief reset the tokenizer and initialize it with a new text

        Resets the tokenizer so that you can restart your iteration
        on a new text.

        @code
        mrw::Tokenizer token("Hello World");
        while (token) std::cout<<"-> "<<token();
        token.reset("Another text to split");
        while (token) std::cout<<"-> "<<token();
        @endcode

        @param text the new text; it replaces the text given in the constructor
    */
    void reset(const std::string& text) throw(std::bad_exception) {
      _text = text;
      reset();
    }

  private:
    std::string _text;
    std::string _delim;
    std::string::size_type _pos;
    std::string::size_type _oldpos;
    bool _greedy;
  };
  //@}
}

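A minimal sketch contrasting the two modes on the same input (assuming the header above is installed as <mrw/tokenizer.hpp>; the expected output follows from the documentation above):

    #include <mrw/tokenizer.hpp>
    #include <iostream>

    int main() {
      // Non-greedy: every comma ends a token, so the double comma yields ''.
      mrw::Tokenizer lax("a,,b", false, ",");
      while (lax) std::cout << "non-greedy: '" << lax() << "'\n";   // 'a', '', 'b'

      // Greedy: a run of delimiters is skipped as a whole, no empty tokens.
      mrw::Tokenizer strict("a,,b", true, ",");
      while (strict) std::cout << "greedy: '" << strict() << "'\n"; // 'a', 'b'
    }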
							
								
								
									
mrw/tokenizer_test.cpp (new file, 72 lines)
@@ -0,0 +1,72 @@
/** @file

    $Id$

    $Date$
    $Author$

    @copy © Marc Wäckerlin
    @license LGPL, see file <a href="license.html">COPYING</a>

    $Log$
    Revision 1.1  2004/12/17 16:26:58  marc
    initial version


*/

#include <mrw/tokenizer.hpp>
#include <mrw/list.hpp>
#include <algorithm>

#include <cppunit/TestFixture.h>
#include <cppunit/ui/text/TestRunner.h>
#include <cppunit/extensions/HelperMacros.h>
#include <cppunit/extensions/TestFactoryRegistry.h>

#include <iostream>
class TokenizerTest: public CppUnit::TestFixture {
public:
  void CheckNonGreedy() {
    const std::string aux[] = {"first", "second", "third", "", "fifth"};
    std::list<std::string> a(aux, aux+sizeof(aux)/sizeof(std::string)), b;
    mrw::Tokenizer token("first,second,third,,fifth", false, ",");
    while (token) b<<token();
    CPPUNIT_ASSERT(equal(a.begin(), a.end(), b.begin()));
  }
  void CheckGreedy() {
    const std::string aux[] = {"Hello", "world", "here", "I", "am"};
    std::list<std::string> a(aux, aux+sizeof(aux)/sizeof(std::string)), b;
    mrw::Tokenizer token("Hello world, here I am!", true, " \t\n,.?!");
    while (token) b<<token();
    CPPUNIT_ASSERT(equal(a.begin(), a.end(), b.begin()));
  }
  void CheckReset() {
    const std::string aux[] = {"first", "second", "third", "", "fifth"};
    std::list<std::string> a(aux, aux+sizeof(aux)/sizeof(std::string)), b;
    mrw::Tokenizer token("first,second,third,,fifth", false, ",");
    while (token) b<<token();
    CPPUNIT_ASSERT(equal(a.begin(), a.end(), b.begin()));
    const std::string aux2[] = {"a", "b", "c", "d", "e"};
    std::list<std::string> a2(aux2, aux2+sizeof(aux2)/sizeof(std::string)),
      b2, b3;
    token.reset("a,b,c,d,e");
    while (token) b2<<token();
    CPPUNIT_ASSERT(equal(a2.begin(), a2.end(), b2.begin()));
    token.reset();
    while (token) b3<<token();
    CPPUNIT_ASSERT(equal(a2.begin(), a2.end(), b3.begin()));
  }
  CPPUNIT_TEST_SUITE(TokenizerTest);
  CPPUNIT_TEST(CheckNonGreedy);
  CPPUNIT_TEST(CheckGreedy);
  CPPUNIT_TEST(CheckReset);
  CPPUNIT_TEST_SUITE_END();
};
CPPUNIT_TEST_SUITE_REGISTRATION(TokenizerTest);

int main() {
  CppUnit::TextUi::TestRunner runner;
  runner.addTest(CppUnit::TestFactoryRegistry::getRegistry().makeTest());
  return runner.run() ? 0 : 1;
}
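In the tests above, b<<token() appends each token to a std::list; that append operator presumably comes from <mrw/list.hpp>, which is not part of this commit. The same accumulation can be written with plain std::list::push_back, as in this minimal sketch:

    #include <mrw/tokenizer.hpp>
    #include <list>
    #include <string>

    int main() {
      std::list<std::string> tokens;
      mrw::Tokenizer token("first,second,third,,fifth", false, ",");
      while (token) tokens.push_back(token());   // same loop as CheckNonGreedy
      return tokens.size() == 5 ? 0 : 1;         // "first", "second", "third", "", "fifth"
    }

Building the test runner itself needs CppUnit on the link line, typically something like g++ tokenizer_test.cpp -lcppunit, with the mrw headers on the include path.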