C++ Library containing a lot of needful things: Stack Trace, Command Line Parser, Resource Handling, Configuration Files, Unix Command Execution, Directories, Regular Expressions, Tokenizer, Function Trace, Standard Extensions.
 
 
 
 
 

174 lines
4.3 KiB

/** @file
$Id$
$Date$
$Author$
@copy © Marc Wäckerlin
@license LGPL, see file <a href="license.html">COPYING</a>
*/
#ifndef __MRW__TOKENIZER_HPP__
#define __MRW__TOKENIZER_HPP__
#include <string>
#include <stdexcept>
namespace mrw {
/** @addtogroup regexp
*/
//@{
/** @defgroup regexptokenizer Tokenizer
@pre \#include <mrw/tokenizer.hpp>
The Tokenizer splits a string at any of a given set of delimiter
characters and lets you iterate over the individual tokens.
This code:
@code
mrw::Tokenizer token("first,second,third,,fifth", false, ",");
while (token) std::cout<<"TOKEN: '"<<token()<<"'"<<std::endl;
@endcode
Writes:
@verbatim
TOKEN: 'first'
TOKEN: 'second'
TOKEN: 'third'
TOKEN: ''
TOKEN: 'fifth'
@endverbatim
*/
//@{
/** @brief split strings into parts separated by delimiters
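Splits a string into individual parts according to a list of
delimiters. If the flag @c greedy is set, a run of consecutive
delimiters is consumed as a single separator, so no empty tokens
are produced; otherwise every delimiter ends a token and adjacent
delimiters yield empty tokens.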
This code:
@code
mrw::Tokenizer token("Hello world, here I am!", true, " \t\n,.?!");
while (token) std::cout<<"TOKEN: '"<<token()<<"'"<<std::endl;
@endcode
Writes:
@verbatim
TOKEN: 'Hello'
TOKEN: 'world'
TOKEN: 'here'
TOKEN: 'I'
TOKEN: 'am'
@endverbatim
*/
class Tokenizer {
  public:
    /** @brief initialize a Tokenizer
        @param text the text that has to be split into tokens
        @param greedy
        - @c true a run of consecutive delimiters is treated as one
                  separator; leading and trailing delimiters are
                  skipped, so no empty tokens are produced
        - @c false every single delimiter terminates a token, so
                   adjacent, leading or trailing delimiters produce
                   empty tokens (an empty text yields exactly one
                   empty token)
        @param delim a list of delimiters, each char in the string is a
                     delimiter
        @note Copies @c text and @c delim, so it may throw if memory
              allocation fails.
    */
    Tokenizer(const std::string& text, bool greedy = true,
              const std::string& delim = " \t\n"):
        _text(text), _delim(delim), _pos(0), _oldpos(0),
        _greedy(greedy), _started(false) {
    }
    /** @brief advance to the next token
        Advances to the next token. Call this before you access the tokens.
        @code
        while (token) [...]
        @endcode
        Once the end has been reached, every further call returns
        @c false and leaves the last token in place until
        reset() is called.
        @return
        - @c true if there is a token
        - @c false if the end was reached
        @see @ref mrw::Tokenizer for an example
    */
    operator bool() noexcept {
      std::string::size_type start;
      if (_greedy) {
        // Skip the whole run of delimiters; searching from npos yields
        // npos again, so an exhausted tokenizer stays exhausted.
        start = _text.find_first_not_of(_delim, _pos);
        if (start==std::string::npos) return false;
      } else {
        // _pos==npos means the previous token ran to the end of text.
        if (_pos==std::string::npos) return false;
        // After the first token, _pos sits on the delimiter that ended
        // it, so the next token starts one past it. The explicit
        // _started flag is required: testing _pos>0 cannot tell "not
        // started" from "delimiter at index 0" and would loop forever
        // on a text that begins with a delimiter.
        start = _started ? _pos+1 : 0;
      }
      // Commit the new start only now, so a failed advance never leaves
      // _oldpos out of range for operator()().
      _oldpos = start;
      _started = true;
      _pos = _text.find_first_of(_delim, _oldpos);
      return true;
    }
    /** @brief access the token
        Get the actual token. You must first call @c
        Tokenizer::operator bool(), before you can access a
        token. This method does not change the state of the object, so
        it can be accessed several times and always returns the same
        token.
        @code
        std::string actualToken = token();
        @endcode
        @return returns the actual token; an empty string if
                operator bool() has not been called yet
        @note Returns a copy, so it may throw if memory allocation
              fails.
        @see @ref mrw::Tokenizer for an example
    */
    std::string operator()() const {
      // If _pos is npos the length wraps to a huge value and substr
      // clamps it to the end of the text, which is what is wanted for
      // the last token.
      return _text.substr(_oldpos, _pos-_oldpos);
    }
    /** @brief reset the tokenizer
        Resets the tokenizer so, that you can restart your iteration.
    */
    void reset() noexcept {
      _pos = _oldpos = 0;
      _started = false;
    }
    /** @brief reset the tokenizer and initializes it with a new text
        Resets the tokenizer so, that you can restart your iteration
        on new text.
        @code
        mrw::Tokenizer token("Hello World");
        while (token) std::cout<<"-> "<<token();
        token.reset("Another text to split");
        while (token) std::cout<<"-> "<<token();
        @endcode
        @param text the text given in the constructor is replaced
        @note Copies @c text, so it may throw if memory allocation
              fails.
    */
    void reset(const std::string& text) {
      _text = text;
      reset();
    }
  private:
    std::string _text;              ///< the text that is split
    std::string _delim;             ///< every char is one delimiter
    std::string::size_type _pos;    ///< end of current token (delimiter index or npos)
    std::string::size_type _oldpos; ///< start of current token
    bool _greedy;                   ///< collapse runs of delimiters
    bool _started;                  ///< true once the first token was fetched
};
//@}
//@}
}
#endif