mrw/tokenizer.hpp

/** @file

    $Id$

    $Date$
    $Author$

    @copy &copy; Marc W&auml;ckerlin
    @license LGPL, see file <a href="license.html">COPYING</a>

    $Log$
    Revision 1.2  2004/12/20 07:40:36  marc
    documentation improved, new grouping

    Revision 1.1  2004/12/17 16:26:58  marc
    initial version


*/

#include <string>
#include <stdexcept>

namespace mrw {

  /** @addtogroup regexp
   */
  //@{
  /** @defgroup regexptokenizer Tokenizer

      The Tokenizer splits a string according to a list of delimiter
      characters and lets you iterate over the individual tokens:

      This code:
      
      @code
      mrw::Tokenizer token("first,second,third,,fifth", false, ",");
      while (token) std::cout<<"TOKEN: '"<<token()<<"'"<<std::endl;
      @endcode
      Writes:

      @verbatim
      TOKEN: 'first'
      TOKEN: 'second'
      TOKEN: 'third'
      TOKEN: ''
      TOKEN: 'fifth'
      @endverbatim
  */
  //@{

  /** @brief split strings into parts separated by delimiters

      Splits a string into individual parts according to a list of
      delimiter characters. If flag @c greedy is set, a run of
      consecutive delimiters is treated as one single separator, so no
      empty tokens are produced. If it is not set, every single
      delimiter separates two tokens, so adjacent delimiters produce
      empty tokens.

      This code:
      
      @code
      mrw::Tokenizer token("Hello world, here I am!", true, " \t\n,.?!");
      while (token) std::cout<<"TOKEN: '"<<token()<<"'"<<std::endl;
      @endcode

      Writes:

      @verbatim
      TOKEN: 'Hello'
      TOKEN: 'world'
      TOKEN: 'here'
      TOKEN: 'I'
      TOKEN: 'am'
      @endverbatim

      @note The former dynamic exception specifications
            (<code>throw(...)</code>) were dropped: they are no longer
            valid C++ since C++17. Exceptions thrown by @c std::string
            (e.g. on allocation failure) now propagate unchanged.
  */
  class Tokenizer {
    
  public:
    
    /** @brief initialize a Tokenizer

        The text and the delimiter list are copied into the object.

        @param text the text that has to be split into tokens
        
        @param greedy
          - @c true  several delimiters that follow each other are
                     eaten all together; no empty tokens are
                     generated, leading and trailing delimiters are
                     skipped
          - @c false every single delimiter terminates a token; two
                     adjacent delimiters (or a delimiter at the very
                     beginning or end of the text) produce an empty
                     token

        @param delim a list of delimiters, each char in the string is a
                     delimiter
    */
    Tokenizer(const std::string& text, bool greedy = true,
              const std::string& delim = " \t\n"):
      _text(text), _delim(delim), _pos(0), _oldpos(0), _greedy(greedy),
      _started(false) {
    }
    
    /** @brief advance to the next token

        Advances to the next token. Call this before you access the tokens.

        @code
        while (token) [...]
        @endcode

        If there is no further token, the state of the object is left
        untouched, so @c operator()() keeps returning the last token.

        @return
          - @c true  if there is a token
          - @c false if the end was reached

        @see @ref mrw::Tokenizer for an example
    */
    operator bool() {
      // the previous token already extended to the end of the text
      if (_pos==std::string::npos) return false;
      std::string::size_type start(0);
      if (_greedy) {
        // skip the whole run of delimiters in front of the next token
        start = _text.find_first_not_of(_delim, _pos);
        if (start==std::string::npos) return false; // only delimiters left
      } else if (_started) {
        // _pos is the delimiter that ended the previous token; step
        // over exactly this one. The explicit flag is required because
        // _pos==0 is ambiguous: it is the initial state, but also the
        // state after an empty first token in front of a leading
        // delimiter.
        start = _pos+1;
      }
      _started = true;
      _oldpos = start;
      _pos = _text.find_first_of(_delim, _oldpos);
      return true;
    }
    
    /** @brief access the token

        Get the actual token. You must first call @c
        Tokenizer::operator bool(), before you can access a
        token. This method does not change the state of the object, so
        it can be accessed several times and always returns the same
        token.

        @code
        std::string actualToken = token();
        @endcode

        @return returns the actual token, that is the text from the
                start of the token up to, but not including, the next
                delimiter or the end of the text

        @see @ref mrw::Tokenizer for an example
    */
    std::string operator()() const {
      // no delimiter behind the token: it reaches to the end of the text
      if (_pos==std::string::npos) return _text.substr(_oldpos);
      return _text.substr(_oldpos, _pos-_oldpos);
    }
    
    /** @brief reset the tokenizer

        Resets the tokenizer so, that you can restart your iteration.
        Only assigns built-in values, so it does not throw.
    */
    void reset() {
      _pos = _oldpos = 0;
      _started = false;
    }

    /** @brief reset the tokenizer and initializes it with a new text

        Resets the tokenizer so, that you can restart your iteration
        on new text. The greedy flag and the delimiters are kept.

        @code
        mrw::Tokenizer token("Hello World");
        while (token) std::cout<<"-> "<<token();
        token.reset("Another text to split");
        while (token) std::cout<<"-> "<<token();        
        @endcode

        @param text the text given in the constructor is replaced
    */
    void reset(const std::string& text) {
      _text = text;
      reset();
    }
    
  private:
    std::string _text;              ///< the text to split (own copy)
    std::string _delim;             ///< every char is a delimiter
    std::string::size_type _pos;    ///< delimiter behind the actual token, or npos
    std::string::size_type _oldpos; ///< start of the actual token
    bool _greedy;                   ///< eat runs of delimiters as one
    bool _started;                  ///< a token has been delivered since reset
  };
  //@}
  //@}
}
initial version 2004-12-17 16:26:58 +00:00			`/** @file`

			$Id$

			$Date$
			$Author$

			`@copy © Marc Wäckerlin`
			`@license LGPL, see file <a href="license.html">COPYING</a>`

			$Log$
documentation improved, new grouping 2004-12-20 07:40:36 +00:00			`Revision 1.2 2004/12/20 07:40:36 marc`
			`documentation improved, new grouping`

initial version 2004-12-17 16:26:58 +00:00			`Revision 1.1 2004/12/17 16:26:58 marc`
			`initial version`


			`*/`

			`#include <string>`
			`#include <stdexcept>`

			`namespace mrw {`

			`/** @addtogroup regexp`
documentation improved, new grouping 2004-12-20 07:40:36 +00:00			`*/`
			`//@{`
			`/** @defgroup regexptokenizer Tokenizer`
initial version 2004-12-17 16:26:58 +00:00
			`There is a Tokenizer which splits strings according to a list of`
			`delimiters and allows to iterate over the individual tokens:`

			`This code:`

			`@code`
			`mrw::Tokenizer token("first,second,third,,fifth", false, ",");`
			`while (token) std::cout<<"TOKEN: '"<<token()<<"'"<<std::endl;`
			`@endcode`
			`Writes:`

			`@verbatim`
			`TOKEN: 'first'`
			`TOKEN: 'second'`
			`TOKEN: 'third'`
			`TOKEN: ''`
			`TOKEN: 'fifth'`
			`@endverbatim`
			`*/`
			`//@{`

			`/** @brief split strings into parts separated by delimiters`

			`Splits a string into individual parts according to a list of`
			`delimiters. If a delimiter is followed by another delimiter, all`
			`delimiters are eaten if flag @c greedy is set.`

			`This code:`

			`@code`
			`mrw::Tokenizer token("Hello world, here I am!", true, " \t\n,.?!");`
			`while (token) std::cout<<"TOKEN: '"<<token()<<"'"<<std::endl;`
			`@endcode`

			`Writes:`

			`@verbatim`
			`TOKEN: 'Hello'`
			`TOKEN: 'world'`
			`TOKEN: 'here'`
			`TOKEN: 'I'`
			`TOKEN: 'am'`
			`@endverbatim`
			`*/`
			`class Tokenizer {`

			`public:`

			`/** @brief initialize a Tokenizer`

			`@param text the text that has to be split into tokens`

			`@param greedy`
			`- @c true don't generate empty tokens, if a delimiter is followed`
			`by another delimiter, both are removed`
			`- @c false if several delimiters follow each other in the text,`
			`eat them all and don't produce empty tokens`

			`@param delim a list of delimiters, each char in the string is a`
			`delimiter`
			`*/`
			`Tokenizer(const std::string& text, bool greedy = true,`
			`const std::string& delim = " \t\n")`
			`throw(std::bad_exception):`
			`_text(text), _delim(delim), _pos(0), _oldpos(0), _greedy(greedy) {`
			`}`

			`/** @brief advance to the next token`

			`Advances to the next token. Call this before you access the tokens.`

			`@code`
			`while (token) [...]`
			`@endcode`

			`@return`
			`- @c true if there is a token`
			`- @c false if the end was reaced`

			`@see @ref mrw::Tokenizer for an example`
			`*/`
			`operator bool() throw(std::bad_exception) {`
			`_oldpos = _greedy`
			`? _text.find_first_not_of(_delim, _pos)`
			`: _pos>0 ? _pos+1 : _pos;`
			`if (_pos==std::string::npos \|\| _oldpos==std::string::npos) return false;`
			`_pos = _text.find_first_of(_delim, _oldpos);`
			`return true;`
			`}`

			`/** @brief access the token`

			`Get the actual token. You must first call @c`
			`Tokenizer::operator bool(), before you can access a`
			`token. This method does not change the state of the object, so`
			`it can be accessed several times and always returns the same`
			`token.`

			`@code`
			`std::string actualToken = token();`
			`@endcode`

			`@return returns the actual token`

			`@see @ref mrw::Tokenizer for an example`
			`*/`
			`std::string operator()() const throw(std::bad_exception) {`
			`return _text.substr(_oldpos, _pos-_oldpos);`
			`}`

			`/** @brief reset the tokenizer`

			`Resets the tokenizer so, that you can restart your iteration.`
			`*/`
			`void reset() throw() {`
			`_pos = _oldpos = 0;`
			`}`

			`/** @brief reset the tokenizer and initializes it with a new text`

			`Resets the tokenizer so, that you can restart your iteration`
			`on new text.`

			`@code`
			`mrw::Tokenizer token("Hello World");`
			`while (token) std::cout<<"-> "<<token();`
			`token.reset("Another text to split");`
			`while (token) std::cout<<"-> "<<token();`
			`@endcode`

			`@param text the text given in the constructor is replaced`
			`*/`
			`void reset(const std::string& text) throw(std::bad_exception) {`
			`_text = text;`
			`reset();`
			`}`

			`private:`
			`std::string _text;`
			`std::string _delim;`
			`std::string::size_type _pos;`
			`std::string::size_type _oldpos;`
			`bool _greedy;`
			`};`
			`//@}`
documentation improved, new grouping 2004-12-20 07:40:36 +00:00			`//@}`
initial version 2004-12-17 16:26:58 +00:00			`}`