// C++ Library containing a lot of needful things: Stack Trace, Command Line
// Parser, Resource Handling, Configuration Files, Unix Command Execution,
// Directories, Regular Expressions, Tokenizer, Function Trace, Standard
// Extensions.

/** @file
$Id$
$Date$
$Author$
@copy © Marc Wäckerlin
@license LGPL, see file <a href="license.html">COPYING</a>
*/
#ifndef __MRW__TOKENIZER_HPP__
#define __MRW__TOKENIZER_HPP__
#include <string>
#include <stdexcept>
namespace mrw {
/** @addtogroup regexp
*/
//@{
/** @defgroup regexptokenizer Tokenizer
@pre \#include <mrw/tokenizer.hxx>
There is a Tokenizer which splits strings according to a list of
delimiters and allows to iterate over the individual tokens:
This code:
@code
mrw::Tokenizer token("first,second,third,,fifth", false, ",");
while (token) std::cout<<"TOKEN: '"<<token()<<"'"<<std::endl;
@endcode
Writes:
@verbatim
TOKEN: 'first'
TOKEN: 'second'
TOKEN: 'third'
TOKEN: ''
TOKEN: 'fifth'
@endverbatim
*/
//@{
/** @brief split strings into parts separated by delimiters
Splits a string into individual parts according to a list of
delimiters. If a delimiter is followed by another delimiter, all
delimiters are eaten if flag @c greedy is set.
This code:
@code
mrw::Tokenizer token("Hello world, here I am!", true, " \t\n,.?!");
while (token) std::cout<<"TOKEN: '"<<token()<<"'"<<std::endl;
@endcode
Writes:
@verbatim
TOKEN: 'Hello'
TOKEN: 'world'
TOKEN: 'here'
TOKEN: 'I'
TOKEN: 'am'
@endverbatim
*/
class Tokenizer {
  public:
    /** @brief initialize a Tokenizer
        @param text the text that has to be split into tokens
        @param greedy
          - @c true if several delimiters follow each other in the
            text, eat them all and don't produce empty tokens
          - @c false every single delimiter terminates a token, so two
            adjacent delimiters produce an empty token in between
        @param delim a list of delimiters, each char in the string is a
          delimiter */
    Tokenizer(const std::string& text, bool greedy = true,
              const std::string& delim = " \t\n")
      : _text(text), _delim(delim), _pos(0), _oldpos(0),
        _started(false), _greedy(greedy) {
    }
    /** @brief advance to the next token
        Advances to the next token. Call this before you access the tokens.
        @code
        while (token) [...]
        @endcode
        @return
          - @c true if there is a token
          - @c false if the end was reached
        @see @ref mrw::Tokenizer for an example */
    operator bool() {
      // Once the text is exhausted, stay exhausted until reset().
      // Checking first also avoids computing _pos+1 on npos below.
      if (_pos == std::string::npos) return false;
      _oldpos = _greedy
        // greedy: skip over any run of adjacent delimiters
        ? _text.find_first_not_of(_delim, _pos)
        // non-greedy: step over exactly the one delimiter found last
        // time; _started distinguishes the very first call from a
        // delimiter sitting at position 0 (the old "_pos>0" test made
        // e.g. Tokenizer(",a", false, ",") loop forever on "")
        : (_started ? _pos + 1 : _pos);
      _started = true;
      if (_oldpos == std::string::npos) {
        // only delimiters (or nothing) remained: mark as exhausted
        _pos = std::string::npos;
        return false;
      }
      _pos = _text.find_first_of(_delim, _oldpos);
      return true;
    }
    /** @brief access the token
        Get the actual token. You must first call @c
        Tokenizer::operator bool(), before you can access a
        token. This method does not change the state of the object, so
        it can be accessed several times and always returns the same
        token.
        @code
        std::string actualToken = token();
        @endcode
        @return returns the actual token
        @see @ref mrw::Tokenizer for an example */
    std::string operator()() const {
      // _pos-_oldpos is the token length; when _pos is npos, substr
      // takes everything up to the end of the text
      return _text.substr(_oldpos, _pos-_oldpos);
    }
    /** @brief reset the tokenizer
        Resets the tokenizer so, that you can restart your iteration. */
    void reset() noexcept {
      _pos = _oldpos = 0;
      _started = false; // next operator bool() is a "first call" again
    }
    /** @brief reset the tokenizer and initializes it with a new text
        Resets the tokenizer so, that you can restart your iteration
        on new text.
        @code
        mrw::Tokenizer token("Hello World");
        while (token) std::cout<<"-> "<<token();
        token.reset("Another text to split");
        while (token) std::cout<<"-> "<<token();
        @endcode
        @param text the text given in the constructor is replaced */
    void reset(const std::string& text) {
      _text = text;
      reset();
    }
  private:
    std::string _text;              //!< the text to split
    std::string _delim;             //!< each char is a delimiter
    std::string::size_type _pos;    //!< delimiter ending current token
    std::string::size_type _oldpos; //!< start of current token
    bool _started; //!< operator bool() called since construction/reset?
    bool _greedy;  //!< collapse adjacent delimiters, no empty tokens?
};
//@}
//@}
}
#endif