initial version

mrw/tokenizer.hpp (new file, 171 lines)
@@ -0,0 +1,171 @@
/** @file

    $Id$

    $Date$
    $Author$

    @copy © Marc Wäckerlin
    @license LGPL, see file <a href="license.html">COPYING</a>

    $Log$
    Revision 1.1  2004/12/17 16:26:58  marc
    initial version


*/

#include <string>
#include <stdexcept>

namespace mrw {

  /** @addtogroup regexp

      There is a Tokenizer which splits strings according to a list of
      delimiters and lets you iterate over the individual tokens:

      This code:

      @code
      mrw::Tokenizer token("first,second,third,,fifth", false, ",");
      while (token) std::cout<<"TOKEN: '"<<token()<<"'"<<std::endl;
      @endcode
      Writes:

      @verbatim
      TOKEN: 'first'
      TOKEN: 'second'
      TOKEN: 'third'
      TOKEN: ''
      TOKEN: 'fifth'
      @endverbatim
  */
  //@{

  /** @brief split strings into parts separated by delimiters

      Splits a string into individual parts according to a list of
      delimiters. If flag @c greedy is set, consecutive delimiters are
      eaten as a single separator and no empty tokens are produced.

      This code:

      @code
      mrw::Tokenizer token("Hello world, here I am!", true, " \t\n,.?!");
      while (token) std::cout<<"TOKEN: '"<<token()<<"'"<<std::endl;
      @endcode

      Writes:

      @verbatim
      TOKEN: 'Hello'
      TOKEN: 'world'
      TOKEN: 'here'
      TOKEN: 'I'
      TOKEN: 'am'
      @endverbatim
  */
  class Tokenizer {

  public:

    /** @brief initialize a Tokenizer

        @param text the text that has to be split into tokens

        @param greedy
          - @c true  don't generate empty tokens; a run of consecutive
                     delimiters is treated as a single separator
          - @c false every delimiter ends a token, so consecutive
                     delimiters produce empty tokens

        @param delim a list of delimiters, each char in the string is a
                     delimiter
    */
    Tokenizer(const std::string& text, bool greedy = true,
              const std::string& delim = " \t\n")
      throw(std::bad_exception):
      _text(text), _delim(delim), _pos(0), _oldpos(0), _greedy(greedy) {
    }

    /** @brief advance to the next token

        Advances to the next token. Call this before you access the tokens.

        @code
        while (token) [...]
        @endcode

        @return
          - @c true  if there is a token
          - @c false if the end was reached

        @see @ref mrw::Tokenizer for an example
    */
    operator bool() throw(std::bad_exception) {
      _oldpos = _greedy
        ? _text.find_first_not_of(_delim, _pos)
        : _pos>0 ? _pos+1 : _pos;
      if (_pos==std::string::npos || _oldpos==std::string::npos) return false;
      _pos = _text.find_first_of(_delim, _oldpos);
      return true;
    }

    /** @brief access the token

        Get the current token. You must first call @c
        Tokenizer::operator bool() before you can access a
        token. This method does not change the state of the object, so
        it can be called several times and always returns the same
        token.

        @code
        std::string currentToken = token();
        @endcode

        @return the current token

        @see @ref mrw::Tokenizer for an example
    */
    std::string operator()() const throw(std::bad_exception) {
      return _text.substr(_oldpos, _pos-_oldpos);
    }

    /** @brief reset the tokenizer

        Resets the tokenizer so that you can restart your iteration.
    */
    void reset() throw() {
      _pos = _oldpos = 0;
    }

    /** @brief reset the tokenizer and initialize it with a new text

        Resets the tokenizer so that you can restart your iteration
        on a new text.

        @code
        mrw::Tokenizer token("Hello World");
        while (token) std::cout<<"-> "<<token();
        token.reset("Another text to split");
        while (token) std::cout<<"-> "<<token();
        @endcode

        @param text the new text; it replaces the text given in the constructor
    */
    void reset(const std::string& text) throw(std::bad_exception) {
      _text = text;
      reset();
    }

  private:
    std::string _text;
    std::string _delim;
    std::string::size_type _pos;
    std::string::size_type _oldpos;
    bool _greedy;
  };
  //@}
}

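A minimal sketch contrasting the two modes on the same input (assuming the header above is installed as <mrw/tokenizer.hpp>; the expected output follows from the documentation above):

    #include <mrw/tokenizer.hpp>
    #include <iostream>

    int main() {
      // Non-greedy: every comma ends a token, so the double comma yields ''.
      mrw::Tokenizer lax("a,,b", false, ",");
      while (lax) std::cout << "non-greedy: '" << lax() << "'\n";   // 'a', '', 'b'

      // Greedy: a run of delimiters is skipped as a whole, no empty tokens.
      mrw::Tokenizer strict("a,,b", true, ",");
      while (strict) std::cout << "greedy: '" << strict() << "'\n"; // 'a', 'b'
    }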
							
								
								
									
mrw/tokenizer_test.cpp (new file, 72 lines)
@@ -0,0 +1,72 @@
/** @file

    $Id$

    $Date$
    $Author$

    @copy © Marc Wäckerlin
    @license LGPL, see file <a href="license.html">COPYING</a>

    $Log$
    Revision 1.1  2004/12/17 16:26:58  marc
    initial version


*/

#include <mrw/tokenizer.hpp>
#include <mrw/list.hpp>
#include <algorithm>

#include <cppunit/TestFixture.h>
#include <cppunit/ui/text/TestRunner.h>
#include <cppunit/extensions/HelperMacros.h>
#include <cppunit/extensions/TestFactoryRegistry.h>

#include <iostream>
class TokenizerTest: public CppUnit::TestFixture {
public:
  void CheckNonGreedy() {
    const std::string aux[] = {"first", "second", "third", "", "fifth"};
    std::list<std::string> a(aux, aux+sizeof(aux)/sizeof(std::string)), b;
    mrw::Tokenizer token("first,second,third,,fifth", false, ",");
    while (token) b<<token();
    CPPUNIT_ASSERT(equal(a.begin(), a.end(), b.begin()));
  }
  void CheckGreedy() {
    const std::string aux[] = {"Hello", "world", "here", "I", "am"};
    std::list<std::string> a(aux, aux+sizeof(aux)/sizeof(std::string)), b;
    mrw::Tokenizer token("Hello world, here I am!", true, " \t\n,.?!");
    while (token) b<<token();
    CPPUNIT_ASSERT(equal(a.begin(), a.end(), b.begin()));
  }
  void CheckReset() {
    const std::string aux[] = {"first", "second", "third", "", "fifth"};
    std::list<std::string> a(aux, aux+sizeof(aux)/sizeof(std::string)), b;
    mrw::Tokenizer token("first,second,third,,fifth", false, ",");
    while (token) b<<token();
    CPPUNIT_ASSERT(equal(a.begin(), a.end(), b.begin()));
    const std::string aux2[] = {"a", "b", "c", "d", "e"};
    std::list<std::string> a2(aux2, aux2+sizeof(aux2)/sizeof(std::string)),
      b2, b3;
    token.reset("a,b,c,d,e");
    while (token) b2<<token();
    CPPUNIT_ASSERT(equal(a2.begin(), a2.end(), b2.begin()));
    token.reset();
    while (token) b3<<token();
    CPPUNIT_ASSERT(equal(a2.begin(), a2.end(), b3.begin()));
  }
  CPPUNIT_TEST_SUITE(TokenizerTest);
  CPPUNIT_TEST(CheckNonGreedy);
  CPPUNIT_TEST(CheckGreedy);
  CPPUNIT_TEST(CheckReset);
  CPPUNIT_TEST_SUITE_END();
};
CPPUNIT_TEST_SUITE_REGISTRATION(TokenizerTest);

int main() {
  CppUnit::TextUi::TestRunner runner;
  runner.addTest(CppUnit::TestFactoryRegistry::getRegistry().makeTest());
  return runner.run() ? 0 : 1;
}
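In the tests above, b<<token() appends each token to a std::list; that append operator presumably comes from <mrw/list.hpp>, which is not part of this commit. The same accumulation can be written with plain std::list::push_back, as in this minimal sketch:

    #include <mrw/tokenizer.hpp>
    #include <list>
    #include <string>

    int main() {
      std::list<std::string> tokens;
      mrw::Tokenizer token("first,second,third,,fifth", false, ",");
      while (token) tokens.push_back(token());   // same loop as CheckNonGreedy
      return tokens.size() == 5 ? 0 : 1;         // "first", "second", "third", "", "fifth"
    }

Building the test runner itself needs CppUnit on the link line, typically something like g++ tokenizer_test.cpp -lcppunit, with the mrw headers on the include path.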