From adad69abd1e9cc4686a2db49c510354a9b0cf153 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20W=C3=A4ckerlin?= Date: Fri, 17 Dec 2004 16:26:58 +0000 Subject: [PATCH] initial version --- mrw/tokenizer.hpp | 171 +++++++++++++++++++++++++++++++++++++++++ mrw/tokenizer_test.cpp | 72 +++++++++++++++++ 2 files changed, 243 insertions(+) create mode 100644 mrw/tokenizer.hpp create mode 100644 mrw/tokenizer_test.cpp diff --git a/mrw/tokenizer.hpp b/mrw/tokenizer.hpp new file mode 100644 index 0000000..7f54839 --- /dev/null +++ b/mrw/tokenizer.hpp @@ -0,0 +1,171 @@ +/** @file + + $Id$ + + $Date$ + $Author$ + + @copy © Marc Wäckerlin + @license LGPL, see file COPYING + + $Log$ + Revision 1.1 2004/12/17 16:26:58 marc + initial version + + +*/ + +#include +#include + +namespace mrw { + + /** @addtogroup regexp + + There is a Tokenizer which splits strings according to a list of + delimiters and allows to iterate over the individual tokens: + + This code: + + @code + mrw::Tokenizer token("first,second,third,,fifth", false, ","); + while (token) std::cout<<"TOKEN: '"<0 ? _pos+1 : _pos; + if (_pos==std::string::npos || _oldpos==std::string::npos) return false; + _pos = _text.find_first_of(_delim, _oldpos); + return true; + } + + /** @brief access the token + + Get the actual token. You must first call @c + Tokenizer::operator bool(), before you can access a + token. This method does not change the state of the object, so + it can be accessed several times and always returns the same + token. + + @code + std::string actualToken = token(); + @endcode + + @return returns the actual token + + @see @ref mrw::Tokenizer for an example + */ + std::string operator()() const throw(std::bad_exception) { + return _text.substr(_oldpos, _pos-_oldpos); + } + + /** @brief reset the tokenizer + + Resets the tokenizer so, that you can restart your iteration. + */ + void reset() throw() { + _pos = _oldpos = 0; + } + + /** @brief reset the tokenizer and initializes it with a new text + + Resets the tokenizer so, that you can restart your iteration + on new text. + + @code + mrw::Tokenizer token("Hello World"); + while (token) std::cout<<"-> "< "<COPYING + + $Log$ + Revision 1.1 2004/12/17 16:26:58 marc + initial version + + +*/ + +#include +#include +#include + +#include +#include +#include +#include + +#include +class TokenizerTest: public CppUnit::TestFixture { +public: + void CheckNonGreedy() { + const std::string aux[] = {"first", "second", "third", "", "fifth"}; + std::list a(aux, aux+sizeof(aux)/sizeof(std::string)), b; + mrw::Tokenizer token("first,second,third,,fifth", false, ","); + while (token) b< a(aux, aux+sizeof(aux)/sizeof(std::string)), b; + mrw::Tokenizer token("Hello world, here I am!", true, " \t\n,.?!"); + while (token) b< a(aux, aux+sizeof(aux)/sizeof(std::string)), b; + mrw::Tokenizer token("first,second,third,,fifth", false, ","); + while (token) b< a2(aux2, aux2+sizeof(aux2)/sizeof(std::string)), + b2, b3; + token.reset("a,b,c,d,e"); + while (token) b2<