parent
b3d92f3be9
commit
adad69abd1
2 changed files with 243 additions and 0 deletions
@ -0,0 +1,171 @@ |
||||
/** @file
|
||||
|
||||
$Id$ |
||||
|
||||
$Date$ |
||||
$Author$ |
||||
|
||||
@copy © Marc Wäckerlin |
||||
@license LGPL, see file <a href="license.html">COPYING</a> |
||||
|
||||
$Log$ |
||||
Revision 1.1 2004/12/17 16:26:58 marc |
||||
initial version |
||||
|
||||
|
||||
*/ |
||||
|
||||
#include <string> |
||||
#include <stdexcept> |
||||
|
||||
namespace mrw { |
||||
|
||||
/** @addtogroup regexp
|
||||
|
||||
There is a Tokenizer which splits strings according to a list of
delimiters and lets you iterate over the individual tokens:
||||
|
||||
This code: |
||||
|
||||
@code |
||||
mrw::Tokenizer token("first,second,third,,fifth", false, ","); |
||||
while (token) std::cout<<"TOKEN: '"<<token()<<"'"<<std::endl; |
||||
@endcode |
||||
Writes: |
||||
|
||||
@verbatim |
||||
TOKEN: 'first' |
||||
TOKEN: 'second' |
||||
TOKEN: 'third' |
||||
TOKEN: '' |
||||
TOKEN: 'fifth' |
||||
@endverbatim |
||||
*/ |
||||
//@{
|
||||
|
||||
/** @brief split strings into parts separated by delimiters

    Splits a string into individual parts according to a list of
    delimiters. If flag @c greedy is set, a run of consecutive
    delimiters is eaten as a whole, so no empty tokens are produced.
    If it is not set, every single delimiter separates two tokens,
    so consecutive delimiters produce empty tokens.

    This code:

    @code
    mrw::Tokenizer token("Hello world, here I am!", true, " \t\n,.?!");
    while (token) std::cout<<"TOKEN: '"<<token()<<"'"<<std::endl;
    @endcode

    Writes:

    @verbatim
    TOKEN: 'Hello'
    TOKEN: 'world'
    TOKEN: 'here'
    TOKEN: 'I'
    TOKEN: 'am'
    @endverbatim

    @note The former dynamic exception specifications
          (<code>throw(std::bad_exception)</code>, <code>throw()</code>)
          were dropped, because they are no longer valid in current
          C++ standards.
*/
class Tokenizer {

  public:

    /** @brief initialize a Tokenizer

        @param text the text that has to be split into tokens

        @param greedy
        - @c true don't generate empty tokens, if a delimiter is followed
                  by another delimiter, all of them are eaten; delimiters
                  at the beginning and at the end of the text are
                  skipped, too
        - @c false every single delimiter terminates a token, so if
                   several delimiters follow each other in the text
                   (or the text starts or ends with a delimiter),
                   empty tokens are produced

        @param delim a list of delimiters, each char in the string is a
                     delimiter
    */
    Tokenizer(const std::string& text, bool greedy = true,
              const std::string& delim = " \t\n"):
        _text(text), _delim(delim), _pos(0), _oldpos(0),
        _greedy(greedy), _started(false) {
    }

    /** @brief advance to the next token

        Advances to the next token. Call this before you access the tokens.

        @code
        while (token) [...]
        @endcode

        If there is no further token, the state of the object is not
        changed, so mrw::Tokenizer::operator()() still returns the
        last token that was found.

        @return
        - @c true  if there is a token
        - @c false if the end was reached

        @see @ref mrw::Tokenizer for an example
    */
    operator bool() {
      // the previous token already reached the end of the text
      if (_pos==std::string::npos) return false;
      std::string::size_type start;
      if (_greedy) {
        // skip the whole run of delimiters (including the one at _pos)
        start = _text.find_first_not_of(_delim, _pos);
        if (start==std::string::npos) return false; // only delimiters left
      } else {
        // Step over exactly one delimiter, but only if a token was
        // already read: before the first token, _pos is the start of
        // the text and not the position of a delimiter. Testing
        // "_pos>0" instead is wrong, because a delimiter at index 0
        // would never be stepped over (endless empty tokens).
        start = _started ? _pos+1 : _pos;
      }
      _started = true;
      _oldpos = start;
      _pos = _text.find_first_of(_delim, _oldpos); // npos: token is the rest
      return true;
    }

    /** @brief access the token

        Get the actual token. You must first call @c
        Tokenizer::operator bool(), before you can access a
        token. This method does not change the state of the object, so
        it can be accessed several times and always returns the same
        token.

        @code
        std::string actualToken = token();
        @endcode

        @return returns the actual token (an empty string, if
                Tokenizer::operator bool() has not yet been called)

        @see @ref mrw::Tokenizer for an example
    */
    std::string operator()() const {
      // if _pos is npos, the length is clipped to the rest of the text
      return _text.substr(_oldpos, _pos-_oldpos);
    }

    /** @brief reset the tokenizer

        Resets the tokenizer so, that you can restart your iteration.
    */
    void reset() {
      _pos = _oldpos = 0;
      _started = false;
    }

    /** @brief reset the tokenizer and initializes it with a new text

        Resets the tokenizer so, that you can restart your iteration
        on new text.

        @code
        mrw::Tokenizer token("Hello World");
        while (token) std::cout<<"-> "<<token();
        token.reset("Another text to split");
        while (token) std::cout<<"-> "<<token();
        @endcode

        @param text the text given in the constructor is replaced
    */
    void reset(const std::string& text) {
      _text = text;
      reset();
    }

  private:
    std::string _text;              ///< the text to split
    std::string _delim;             ///< every char is a delimiter
    std::string::size_type _pos;    ///< end of the actual token (delimiter or npos)
    std::string::size_type _oldpos; ///< start of the actual token
    bool _greedy;                   ///< eat runs of delimiters as a whole
    bool _started;                  ///< at least one token has been read
};
||||
//@}
|
||||
} |
||||
|
@ -0,0 +1,72 @@ |
||||
/** @file
|
||||
|
||||
$Id$ |
||||
|
||||
$Date$ |
||||
$Author$ |
||||
|
||||
@copy © Marc Wäckerlin |
||||
@license LGPL, see file <a href="license.html">COPYING</a> |
||||
|
||||
$Log$ |
||||
Revision 1.1 2004/12/17 16:26:58 marc |
||||
initial version |
||||
|
||||
|
||||
*/ |
||||
|
||||
#include <mrw/tokenizer.hpp> |
||||
#include <mrw/list.hpp> |
||||
#include <algorithm> |
||||
|
||||
#include <cppunit/TestFixture.h> |
||||
#include <cppunit/ui/text/TestRunner.h> |
||||
#include <cppunit/extensions/HelperMacros.h> |
||||
#include <cppunit/extensions/TestFactoryRegistry.h> |
||||
|
||||
#include <iostream> |
||||
class TokenizerTest: public CppUnit::TestFixture {
|
||||
public: |
||||
void CheckNonGreedy() { |
||||
const std::string aux[] = {"first", "second", "third", "", "fifth"}; |
||||
std::list<std::string> a(aux, aux+sizeof(aux)/sizeof(std::string)), b; |
||||
mrw::Tokenizer token("first,second,third,,fifth", false, ","); |
||||
while (token) b<<token(); |
||||
CPPUNIT_ASSERT(equal(a.begin(), a.end(), b.begin())); |
||||
} |
||||
void CheckGreedy() { |
||||
const std::string aux[] = {"Hello", "world", "here", "I", "am"}; |
||||
std::list<std::string> a(aux, aux+sizeof(aux)/sizeof(std::string)), b; |
||||
mrw::Tokenizer token("Hello world, here I am!", true, " \t\n,.?!"); |
||||
while (token) b<<token(); |
||||
CPPUNIT_ASSERT(equal(a.begin(), a.end(), b.begin())); |
||||
} |
||||
void CheckReset() { |
||||
const std::string aux[] = {"first", "second", "third", "", "fifth"}; |
||||
std::list<std::string> a(aux, aux+sizeof(aux)/sizeof(std::string)), b; |
||||
mrw::Tokenizer token("first,second,third,,fifth", false, ","); |
||||
while (token) b<<token(); |
||||
CPPUNIT_ASSERT(equal(a.begin(), a.end(), b.begin())); |
||||
const std::string aux2[] = {"a", "b", "c", "d", "e"}; |
||||
std::list<std::string> a2(aux2, aux2+sizeof(aux2)/sizeof(std::string)), |
||||
b2, b3; |
||||
token.reset("a,b,c,d,e"); |
||||
while (token) b2<<token(); |
||||
CPPUNIT_ASSERT(equal(a2.begin(), a2.end(), b2.begin())); |
||||
token.reset(); |
||||
while (token) b3<<token(); |
||||
CPPUNIT_ASSERT(equal(a2.begin(), a2.end(), b3.begin())); |
||||
} |
||||
CPPUNIT_TEST_SUITE(TokenizerTest); |
||||
CPPUNIT_TEST(CheckNonGreedy); |
||||
CPPUNIT_TEST(CheckGreedy); |
||||
CPPUNIT_TEST(CheckReset); |
||||
CPPUNIT_TEST_SUITE_END(); |
||||
}; |
||||
CPPUNIT_TEST_SUITE_REGISTRATION(TokenizerTest); |
||||
|
||||
int main() { |
||||
CppUnit::TextUi::TestRunner runner; |
||||
runner.addTest(CppUnit::TestFactoryRegistry::getRegistry().makeTest()); |
||||
return runner.run() ? 0 : 1; |
||||
} |
Loading…
Reference in new issue