/** @file

    $Id$

    $Date$
    $Author$

    @copy © Marc Wäckerlin
    @license LGPL, see file <a href="license.html">COPYING</a>

    $Log$
    Revision 1.2 2004/12/20 07:40:36 marc
    documentation improved, new grouping

    Revision 1.1 2004/12/17 16:26:58 marc
    initial version

*/

#include <string>
#include <stdexcept>

namespace mrw {

  /** @addtogroup regexp
   */
  //@{

  /** @defgroup regexptokenizer Tokenizer

      There is a Tokenizer that splits strings according to a list of
      delimiters and allows iterating over the individual tokens.

      This code:

      @code
      mrw::Tokenizer token("first,second,third,,fifth", false, ",");
      while (token) std::cout<<"TOKEN: '"<<token()<<"'"<<std::endl;
      @endcode

      Writes:

      @verbatim
      TOKEN: 'first'
      TOKEN: 'second'
      TOKEN: 'third'
      TOKEN: ''
      TOKEN: 'fifth'
      @endverbatim
  */
  //@{

  /** @brief split strings into parts separated by delimiters

      Splits a string into individual parts according to a list of
      delimiters. If the @c greedy flag is set, consecutive delimiters
      are treated as a single delimiter, so no empty tokens are
      produced.

      This code:

      @code
      mrw::Tokenizer token("Hello world, here I am!", true, " \t\n,.?!");
      while (token) std::cout<<"TOKEN: '"<<token()<<"'"<<std::endl;
      @endcode

      Writes:

      @verbatim
      TOKEN: 'Hello'
      TOKEN: 'world'
      TOKEN: 'here'
      TOKEN: 'I'
      TOKEN: 'am'
      @endverbatim
  */
  class Tokenizer {

  public:

    /** @brief initialize a Tokenizer

        @param text the text that has to be split into tokens

        @param greedy
          - @c true consecutive delimiters are collapsed into one, so
            no empty tokens are produced
          - @c false every delimiter ends a token, so two adjacent
            delimiters produce an empty token

        @param delim a list of delimiters; each char in the string is
          a delimiter
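
        For illustration, a minimal sketch contrasting the two modes
        on the same input (the variable names are only examples):

        @code
        mrw::Tokenizer collapse("a,,b", true, ",");    // yields "a", "b"
        mrw::Tokenizer keepEmpty("a,,b", false, ",");  // yields "a", "", "b"
        @endcode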
    */
    Tokenizer(const std::string& text, bool greedy = true,
              const std::string& delim = " \t\n")
      throw(std::bad_exception):
      _text(text), _delim(delim), _pos(0), _oldpos(0), _greedy(greedy) {
    }

    /** @brief advance to the next token

        Advances to the next token. Call this before you access the
        tokens:

        @code
        while (token) [...]
        @endcode

        @return
          - @c true if there is a token
          - @c false if the end was reached

        @see @ref mrw::Tokenizer for an example
    */
    operator bool() throw(std::bad_exception) {
      _oldpos = _greedy
                ? _text.find_first_not_of(_delim, _pos)
                : _pos>0 ? _pos+1 : _pos;
      if (_pos==std::string::npos || _oldpos==std::string::npos) return false;
      _pos = _text.find_first_of(_delim, _oldpos);
      return true;
    }

    /** @brief access the token

        Get the current token. You must first call
        @c Tokenizer::operator bool() before you can access a
        token. This method does not change the state of the object,
        so it can be called several times and always returns the same
        token.

        @code
        std::string actualToken = token();
        @endcode

        @return the current token

        @see @ref mrw::Tokenizer for an example
    */
    std::string operator()() const throw(std::bad_exception) {
      return _text.substr(_oldpos, _pos-_oldpos);
    }

    /** @brief reset the tokenizer

        Resets the tokenizer so that you can restart the iteration.
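
        For illustration, a minimal sketch that iterates over the same
        text twice:

        @code
        mrw::Tokenizer token("one two three");
        while (token) std::cout<<"-> "<<token();  // first pass
        token.reset();                            // start over
        while (token) std::cout<<"-> "<<token();  // second pass
        @endcode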
    */
    void reset() throw() {
      _pos = _oldpos = 0;
    }

    /** @brief reset the tokenizer and initialize it with a new text

        Resets the tokenizer so that you can restart the iteration on
        a new text.

        @code
        mrw::Tokenizer token("Hello World");
        while (token) std::cout<<"-> "<<token();
        token.reset("Another text to split");
        while (token) std::cout<<"-> "<<token();
        @endcode

        @param text the text given in the constructor is replaced
    */
    void reset(const std::string& text) throw(std::bad_exception) {
      _text = text;
      reset();
    }

  private:

    std::string _text;
    std::string _delim;
    std::string::size_type _pos;
    std::string::size_type _oldpos;
    bool _greedy;

  };

  //@}

  //@}

}
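
/* Usage sketch: a complete program that exercises the Tokenizer, based on
   the examples above. The include path is an assumption; adjust it to
   wherever this header is installed.

     #include "tokenizer.hxx"  // this header (path assumed)
     #include <iostream>

     int main() {
       mrw::Tokenizer words("Hello world, here I am!", true, " \t\n,.?!");
       while (words) std::cout<<"word: '"<<words()<<"'"<<std::endl;

       mrw::Tokenizer fields("first,second,third,,fifth", false, ",");
       while (fields) std::cout<<"field: '"<<fields()<<"'"<<std::endl;

       fields.reset("a,b,c");  // restart on a new text
       while (fields) std::cout<<"field: '"<<fields()<<"'"<<std::endl;
       return 0;
     }
*/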