/** @file
$ Id $
$ Date $
$ Author $
@copy &copy; Marc W&auml;ckerlin
@license LGPL, see file <a href="license.html">COPYING</a>
$ Log $
Revision 1.3 2004 / 12 / 20 13 : 24 : 26 marc
# ifndef forgotten
Revision 1.2 2004 / 12 / 20 07 : 40 : 36 marc
documentation improved , new grouping
Revision 1.1 2004 / 12 / 17 16 : 26 : 58 marc
initial version
*/
# ifndef __MRW__TOKENIZER_HPP__
# define __MRW__TOKENIZER_HPP__
# include <string>
# include <stdexcept>
namespace mrw {
/** @addtogroup regexp
*/
//@{
/** @defgroup regexptokenizer Tokenizer
There is a Tokenizer which splits strings according to a list of
delimiters and allows you to iterate over the individual tokens.
This code:
@code
mrw::Tokenizer token("first,second,third,,fifth", false, ",");
while (token) std::cout<<"TOKEN: '"<<token()<<"'"<<std::endl;
@endcode
Writes:
@verbatim
TOKEN: 'first'
TOKEN: 'second'
TOKEN: 'third'
TOKEN: ''
TOKEN: 'fifth'
@endverbatim
*/
//@{
/** @brief split strings into parts separated by delimiters

    Splits a string into individual parts according to a list of
    delimiters. Every character of the delimiter string is a
    delimiter on its own.

    If flag @c greedy is set, a run of consecutive delimiters is
    eaten as a whole, so no empty tokens are produced. If it is not
    set, every single delimiter separates two tokens, so two
    adjacent delimiters (or a delimiter at the very beginning or end
    of the text) yield an empty token.

    This code:
    @code
    mrw::Tokenizer token("Hello world, here I am!", true, " \t\n,.?!");
    while (token) std::cout<<"TOKEN: '"<<token()<<"'"<<std::endl;
    @endcode
    Writes:
    @verbatim
    TOKEN: 'Hello'
    TOKEN: 'world'
    TOKEN: 'here'
    TOKEN: 'I'
    TOKEN: 'am'
    @endverbatim

    @note The former dynamic exception specifications
          (<code>throw(std::bad_exception)</code>) were removed,
          because they are no longer valid since C++17. Methods
          that copy strings may throw @c std::bad_alloc.
*/
class Tokenizer {

  public:

    /** @brief initialize a Tokenizer
        @param text the text that has to be split into tokens
        @param greedy
          - @c true if several delimiters follow each other in the
                    text, eat them all and don't produce empty tokens
          - @c false every delimiter terminates a token, so a
                     delimiter that is directly followed by another
                     delimiter generates an empty token
        @param delim a list of delimiters, each char in the string
                     is a delimiter; the default consists of space,
                     tab and newline
    */
    Tokenizer(const std::string& text, bool greedy = true,
              const std::string& delim = " \t \n ")
      : _text(text), _delim(delim), _pos(0), _oldpos(0),
        _greedy(greedy), _started(false) {
    }

    /** @brief advance to the next token

        Advances to the next token. Call this before you access the
        tokens.
        @code
        while (token) [...]
        @endcode
        @return
          - @c true if there is a token
          - @c false if the end was reached; further calls keep
                     returning @c false until reset() is called
        @see @ref mrw::Tokenizer for an example
    */
    operator bool() {
      const std::string::size_type npos = std::string::npos;
      if (_pos == npos) {
        // the last token ended at the end of the text
        _oldpos = npos;
      } else if (_greedy) {
        // skip the whole run of delimiters in front of the next token
        _oldpos = _text.find_first_not_of(_delim, _pos);
      } else {
        // Step over exactly one delimiter, but only if a token has
        // already been delivered. Testing "_pos > 0" instead is not
        // sufficient: if the text starts with a delimiter, _pos is
        // still 0 after the first (empty) token and the iteration
        // would never advance.
        _oldpos = _started ? _pos + 1 : 0;
      }
      if (_oldpos == npos) {
        _pos = npos; // remember that the iteration is finished
        return false;
      }
      _started = true;
      _pos = _text.find_first_of(_delim, _oldpos);
      return true;
    }

    /** @brief access the token

        Get the actual token. You must first call @c
        Tokenizer::operator bool(), before you can access a
        token. This method does not change the state of the object,
        so it can be accessed several times and always returns the
        same token.
        @code
        std::string actualToken = token();
        @endcode
        @return returns the actual token; an empty string if
                Tokenizer::operator bool() has not yet been called
                or has returned @c false
        @see @ref mrw::Tokenizer for an example
    */
    std::string operator()() const {
      // no current token once the iteration ran past the end
      if (_oldpos == std::string::npos) return std::string();
      // if _pos is npos, substr clamps the length to the end of _text
      return _text.substr(_oldpos, _pos - _oldpos);
    }

    /** @brief reset the tokenizer

        Resets the tokenizer so, that you can restart your iteration.
    */
    void reset() noexcept {
      _pos = _oldpos = 0;
      _started = false;
    }

    /** @brief reset the tokenizer and initializes it with a new text

        Resets the tokenizer so, that you can restart your iteration
        on new text.
        @code
        mrw::Tokenizer token("Hello World");
        while (token) std::cout<<"-> "<<token();
        token.reset("Another text to split");
        while (token) std::cout<<"-> "<<token();
        @endcode
        @param text the text given in the constructor is replaced
    */
    void reset(const std::string& text) {
      _text = text;
      reset();
    }

  private:

    std::string _text;              ///< the text that is split
    std::string _delim;             ///< every char is a delimiter
    std::string::size_type _pos;    ///< end of the actual token (delimiter position or npos)
    std::string::size_type _oldpos; ///< start of the actual token, npos when finished
    bool _greedy;                   ///< eat runs of delimiters as a whole
    bool _started;                  ///< at least one token has been delivered since construction or reset

};
//@}
//@}
}
# endif