middle of porting; unstable, don't checkout; refs #1

2011-12-10 11:24:11 +00:00
parent 4357fe3d9f
commit 599af2dbbf
129 changed files with 9474 additions and 0 deletions
--- a/src/mrw/tokenizer.hpp
+++ b/src/mrw/tokenizer.hpp
@@ -0,0 +1,174 @@
+/** @file
+
+    $Id$
+
+    $Date$
+    $Author$
+
+    @copy &copy; Marc W&auml;ckerlin
+    @license LGPL, see file <a href="license.html">COPYING</a>
+
+*/
+#ifndef __MRW__TOKENIZER_HPP__
+#define __MRW__TOKENIZER_HPP__
+
+#include <string>
+#include <stdexcept>
+
+namespace mrw {
+
+  /** @addtogroup regexp
+   */
+  //@{
+  /** @defgroup regexptokenizer Tokenizer
+
+      @pre \#include <mrw/tokenizer.hpp>
+
+      The Tokenizer splits a string according to a list of delimiters
+      and lets you iterate over the individual tokens:
+
+      This code:
+      
+      @code
+      mrw::Tokenizer token("first,second,third,,fifth", false, ",");
+      while (token) std::cout<<"TOKEN: '"<<token()<<"'"<<std::endl;
+      @endcode
+      Writes:
+
+      @verbatim
+      TOKEN: 'first'
+      TOKEN: 'second'
+      TOKEN: 'third'
+      TOKEN: ''
+      TOKEN: 'fifth'
+      @endverbatim
+  */
+  //@{
+
+  /** @brief split strings into parts separated by delimiters
+
+      Splits a string into individual parts according to a list of
+      delimiters. If a delimiter is followed by another delimiter, all
+      delimiters are eaten if flag @c greedy is set.
+
+      This code:
+      
+      @code
+      mrw::Tokenizer token("Hello world, here I am!", true, " \t\n,.?!");
+      while (token) std::cout<<"TOKEN: '"<<token()<<"'"<<std::endl;
+      @endcode
+
+      Writes:
+
+      @verbatim
+      TOKEN: 'Hello'
+      TOKEN: 'world'
+      TOKEN: 'here'
+      TOKEN: 'I'
+      TOKEN: 'am'
+      @endverbatim
+  */      
+  class Tokenizer {
+
+  public:
+
+    /** @brief initialize a Tokenizer
+
+        The text and the delimiter list are copied, so the arguments
+        need not outlive the Tokenizer.
+
+        @param text the text that has to be split into tokens
+
+        @param greedy
+          - @c true  a run of adjacent delimiters counts as one single
+                     separator, so no empty tokens are generated
+          - @c false every single delimiter separates two tokens, so
+                     leading, trailing or adjacent delimiters yield
+                     empty tokens
+
+        @param delim a list of delimiters, each char in the string is a
+                     delimiter
+    */
+    Tokenizer(const std::string& text, bool greedy = true,
+              const std::string& delim = " \t\n"):
+      _text(text), _delim(delim), _pos(0), _oldpos(0),
+      _greedy(greedy), _started(false) {
+    }
+
+    /** @brief advance to the next token
+
+        Advances to the next token. Call this before you access the
+        tokens. Every evaluation moves on by one token, so evaluate it
+        exactly once per token.
+
+        @code
+        while (token) [...]
+        @endcode
+
+        Once the end is reached, every further call returns @c false
+        until @c reset() is called.
+
+        @return
+          - @c true  if there is a token
+          - @c false if the end was reached
+
+        @see @ref mrw::Tokenizer for an example
+    */
+    operator bool() {
+      std::string::size_type begin(std::string::npos);
+      // _pos==npos means that the previous token ran up to the end of
+      // the text, so there is nothing left to search
+      if (_pos!=std::string::npos)
+        begin = _greedy
+          ? _text.find_first_not_of(_delim, _pos)
+          // step over the delimiter at _pos, but only if a token has
+          // been produced: _pos==0 alone cannot tell the initial state
+          // from an empty first token in front of a leading delimiter
+          : _started ? _pos+1 : 0;
+      if (begin==std::string::npos) {
+        // exhausted: park on an empty token behind the text, so that
+        // operator() stays well defined and returns ""
+        _oldpos = _text.size();
+        _pos = std::string::npos;
+        return false;
+      }
+      _started = true;
+      _oldpos = begin;
+      _pos = _text.find_first_of(_delim, begin);
+      return true;
+    }
+
+    /** @brief access the token
+
+        Get the actual token. You must first call @c
+        Tokenizer::operator bool(), before you can access a
+        token. This method does not change the state of the object, so
+        it can be accessed several times and always returns the same
+        token.
+
+        Before the first call to @c Tokenizer::operator bool() and
+        after the end was reached, an empty string is returned.
+
+        @code
+        std::string actualToken = token();
+        @endcode
+
+        @return returns the actual token
+
+        @see @ref mrw::Tokenizer for an example
+    */
+    std::string operator()() const {
+      // if the token runs up to the end of the text, _pos is npos and
+      // the huge length is clamped by substr
+      return _text.substr(_oldpos, _pos-_oldpos);
+    }
+
+    /** @brief reset the tokenizer
+
+        Resets the tokenizer so that you can restart your iteration.
+        Text, delimiters and the greedy flag are kept. Does not throw.
+    */
+    void reset() {
+      _pos = _oldpos = 0;
+      _started = false;
+    }
+
+    /** @brief reset the tokenizer and initializes it with a new text
+
+        Resets the tokenizer so that you can restart your iteration
+        on new text.
+
+        @code
+        mrw::Tokenizer token("Hello World");
+        while (token) std::cout<<"-> "<<token();
+        token.reset("Another text to split");
+        while (token) std::cout<<"-> "<<token();
+        @endcode
+
+        @param text the text given in the constructor is replaced
+    */
+    void reset(const std::string& text) {
+      _text = text;
+      reset();
+    }
+
+  private:
+    std::string _text;              // copy of the text to be split
+    std::string _delim;             // each char is a delimiter
+    std::string::size_type _pos;    // delimiter behind the token or npos
+    std::string::size_type _oldpos; // first character of the token
+    bool _greedy;                   // treat delimiter runs as one
+    bool _started;                  // a token was produced since reset
+  };
+  //@}
+  //@}
+}
+#endif