initial version
This commit is contained in:
171
mrw/tokenizer.hpp
Normal file
171
mrw/tokenizer.hpp
Normal file
@@ -0,0 +1,171 @@
|
|||||||
|
/** @file
|
||||||
|
|
||||||
|
$Id$
|
||||||
|
|
||||||
|
$Date$
|
||||||
|
$Author$
|
||||||
|
|
||||||
|
@copy © Marc Wäckerlin
|
||||||
|
@license LGPL, see file <a href="license.html">COPYING</a>
|
||||||
|
|
||||||
|
$Log$
|
||||||
|
Revision 1.1 2004/12/17 16:26:58 marc
|
||||||
|
initial version
|
||||||
|
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <stdexcept>
|
||||||
|
|
||||||
|
namespace mrw {
|
||||||
|
|
||||||
|
/** @addtogroup regexp
|
||||||
|
|
||||||
|
There is a Tokenizer which splits strings according to a list of
|
||||||
|
delimiters and lets you iterate over the individual tokens:
|
||||||
|
|
||||||
|
This code:
|
||||||
|
|
||||||
|
@code
|
||||||
|
mrw::Tokenizer token("first,second,third,,fifth", false, ",");
|
||||||
|
while (token) std::cout<<"TOKEN: '"<<token()<<"'"<<std::endl;
|
||||||
|
@endcode
|
||||||
|
Writes:
|
||||||
|
|
||||||
|
@verbatim
|
||||||
|
TOKEN: 'first'
|
||||||
|
TOKEN: 'second'
|
||||||
|
TOKEN: 'third'
|
||||||
|
TOKEN: ''
|
||||||
|
TOKEN: 'fifth'
|
||||||
|
@endverbatim
|
||||||
|
*/
|
||||||
|
//@{
|
||||||
|
|
||||||
|
/** @brief split strings into parts separated by delimiters
|
||||||
|
|
||||||
|
Splits a string into individual parts according to a list of
|
||||||
|
delimiters. If a delimiter is followed by another delimiter, all
|
||||||
|
delimiters are eaten if flag @c greedy is set.
|
||||||
|
|
||||||
|
This code:
|
||||||
|
|
||||||
|
@code
|
||||||
|
mrw::Tokenizer token("Hello world, here I am!", true, " \t\n,.?!");
|
||||||
|
while (token) std::cout<<"TOKEN: '"<<token()<<"'"<<std::endl;
|
||||||
|
@endcode
|
||||||
|
|
||||||
|
Writes:
|
||||||
|
|
||||||
|
@verbatim
|
||||||
|
TOKEN: 'Hello'
|
||||||
|
TOKEN: 'world'
|
||||||
|
TOKEN: 'here'
|
||||||
|
TOKEN: 'I'
|
||||||
|
TOKEN: 'am'
|
||||||
|
@endverbatim
|
||||||
|
*/
|
||||||
|
class Tokenizer {

  public:

    /** @brief initialize a Tokenizer

        The text and the delimiter list are copied into the
        Tokenizer, so the arguments need not outlive it.

        @param text the text that has to be split into tokens

        @param greedy
        - @c true if several delimiters follow each other in the
                  text, eat them all and don't produce empty
                  tokens; delimiters at the beginning and at the
                  end of the text are skipped as well
        - @c false every single delimiter separates two tokens, so
                   a delimiter that is directly followed by another
                   delimiter (or that stands at the very beginning
                   or end of the text) produces an empty token

        @param delim a list of delimiters, each char in the string is a
                     delimiter

        @note No dynamic exception specification is declared:
              @c throw(type) is no longer valid as of ISO C++17.
    */
    Tokenizer(const std::string& text, bool greedy = true,
              const std::string& delim = " \t\n"):
        _text(text), _delim(delim), _pos(0), _oldpos(0),
        _greedy(greedy), _started(false) {
    }

    /** @brief advance to the next token

        Advances to the next token. Call this before you access the tokens.

        @code
        while (token) [...]
        @endcode

        @return
        - @c true  if there is a token
        - @c false if the end was reached

        @see @ref mrw::Tokenizer for an example
    */
    operator bool() {
      if (_pos==std::string::npos) {
        // the previous token already ran up to the end of the text
        _oldpos = std::string::npos;
        return false;
      }
      if (_greedy) {
        // skip the whole run of delimiters in front of the next token
        _oldpos = _text.find_first_not_of(_delim, _pos);
        if (_oldpos==std::string::npos) {
          // only delimiters were left
          _pos = std::string::npos;
          return false;
        }
      } else {
        // Step over exactly one delimiter, but not before the very
        // first token. A separate flag is required here: _pos==0 is
        // also the state after an empty first token (text starting
        // with a delimiter), and testing _pos>0 never got past it.
        _oldpos = _started ? _pos+1 : 0;
      }
      _started = true;
      _pos = _text.find_first_of(_delim, _oldpos);
      return true;
    }

    /** @brief access the token

        Get the actual token. You must first call @c
        Tokenizer::operator bool(), before you can access a
        token. This method does not change the state of the object, so
        it can be accessed several times and always returns the same
        token.

        @code
        std::string actualToken = token();
        @endcode

        @return returns the actual token, or an empty string if the
                end of the text was already reached

        @see @ref mrw::Tokenizer for an example
    */
    std::string operator()() const {
      if (_oldpos==std::string::npos) return std::string();
      // the last token has no terminating delimiter: take the rest
      if (_pos==std::string::npos) return _text.substr(_oldpos);
      return _text.substr(_oldpos, _pos-_oldpos);
    }

    /** @brief reset the tokenizer

        Resets the tokenizer so, that you can restart your iteration.
        Text, delimiters and greedy flag are kept.
    */
    void reset() {
      _pos = _oldpos = 0;
      _started = false;
    }

    /** @brief reset the tokenizer and initializes it with a new text

        Resets the tokenizer so, that you can restart your iteration
        on new text.

        @code
        mrw::Tokenizer token("Hello World");
        while (token) std::cout<<"-> "<<token();
        token.reset("Another text to split");
        while (token) std::cout<<"-> "<<token();
        @endcode

        @param text the text given in the constructor is replaced
    */
    void reset(const std::string& text) {
      _text = text;
      reset();
    }

  private:

    std::string _text;              ///< copy of the text to split
    std::string _delim;             ///< every char is a delimiter
    std::string::size_type _pos;    ///< end of the actual token, npos at end
    std::string::size_type _oldpos; ///< begin of the actual token
    bool _greedy;                   ///< merge consecutive delimiters
    bool _started;                  ///< operator bool() has delivered a token

};
|
||||||
|
//@}
|
||||||
|
}
|
||||||
|
|
72
mrw/tokenizer_test.cpp
Normal file
72
mrw/tokenizer_test.cpp
Normal file
@@ -0,0 +1,72 @@
|
|||||||
|
/** @file
|
||||||
|
|
||||||
|
$Id$
|
||||||
|
|
||||||
|
$Date$
|
||||||
|
$Author$
|
||||||
|
|
||||||
|
@copy © Marc Wäckerlin
|
||||||
|
@license LGPL, see file <a href="license.html">COPYING</a>
|
||||||
|
|
||||||
|
$Log$
|
||||||
|
Revision 1.1 2004/12/17 16:26:58 marc
|
||||||
|
initial version
|
||||||
|
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <mrw/tokenizer.hpp>
|
||||||
|
#include <mrw/list.hpp>
|
||||||
|
#include <algorithm>
|
||||||
|
|
||||||
|
#include <cppunit/TestFixture.h>
|
||||||
|
#include <cppunit/ui/text/TestRunner.h>
|
||||||
|
#include <cppunit/extensions/HelperMacros.h>
|
||||||
|
#include <cppunit/extensions/TestFactoryRegistry.h>
|
||||||
|
|
||||||
|
#include <iostream>
|
||||||
|
class TokenizerTest: public CppUnit::TestFixture {
|
||||||
|
public:
|
||||||
|
void CheckNonGreedy() {
|
||||||
|
const std::string aux[] = {"first", "second", "third", "", "fifth"};
|
||||||
|
std::list<std::string> a(aux, aux+sizeof(aux)/sizeof(std::string)), b;
|
||||||
|
mrw::Tokenizer token("first,second,third,,fifth", false, ",");
|
||||||
|
while (token) b<<token();
|
||||||
|
CPPUNIT_ASSERT(equal(a.begin(), a.end(), b.begin()));
|
||||||
|
}
|
||||||
|
void CheckGreedy() {
|
||||||
|
const std::string aux[] = {"Hello", "world", "here", "I", "am"};
|
||||||
|
std::list<std::string> a(aux, aux+sizeof(aux)/sizeof(std::string)), b;
|
||||||
|
mrw::Tokenizer token("Hello world, here I am!", true, " \t\n,.?!");
|
||||||
|
while (token) b<<token();
|
||||||
|
CPPUNIT_ASSERT(equal(a.begin(), a.end(), b.begin()));
|
||||||
|
}
|
||||||
|
void CheckReset() {
|
||||||
|
const std::string aux[] = {"first", "second", "third", "", "fifth"};
|
||||||
|
std::list<std::string> a(aux, aux+sizeof(aux)/sizeof(std::string)), b;
|
||||||
|
mrw::Tokenizer token("first,second,third,,fifth", false, ",");
|
||||||
|
while (token) b<<token();
|
||||||
|
CPPUNIT_ASSERT(equal(a.begin(), a.end(), b.begin()));
|
||||||
|
const std::string aux2[] = {"a", "b", "c", "d", "e"};
|
||||||
|
std::list<std::string> a2(aux2, aux2+sizeof(aux2)/sizeof(std::string)),
|
||||||
|
b2, b3;
|
||||||
|
token.reset("a,b,c,d,e");
|
||||||
|
while (token) b2<<token();
|
||||||
|
CPPUNIT_ASSERT(equal(a2.begin(), a2.end(), b2.begin()));
|
||||||
|
token.reset();
|
||||||
|
while (token) b3<<token();
|
||||||
|
CPPUNIT_ASSERT(equal(a2.begin(), a2.end(), b3.begin()));
|
||||||
|
}
|
||||||
|
CPPUNIT_TEST_SUITE(TokenizerTest);
|
||||||
|
CPPUNIT_TEST(CheckNonGreedy);
|
||||||
|
CPPUNIT_TEST(CheckGreedy);
|
||||||
|
CPPUNIT_TEST(CheckReset);
|
||||||
|
CPPUNIT_TEST_SUITE_END();
|
||||||
|
};
|
||||||
|
CPPUNIT_TEST_SUITE_REGISTRATION(TokenizerTest);
|
||||||
|
|
||||||
|
int main() {
|
||||||
|
CppUnit::TextUi::TestRunner runner;
|
||||||
|
runner.addTest(CppUnit::TestFactoryRegistry::getRegistry().makeTest());
|
||||||
|
return runner.run() ? 0 : 1;
|
||||||
|
}
|
Reference in New Issue
Block a user