C++ Library containing a lot of needful things: Stack Trace, Command Line Parser, Resource Handling, Configuration Files, Unix Command Execution, Directories, Regular Expressions, Tokenizer, Function Trace, Standard Extensions.
174 lines
4.3 KiB
174 lines
4.3 KiB
/** @file |
|
|
|
$Id$ |
|
|
|
$Date$ |
|
$Author$ |
|
|
|
@copy © Marc Wäckerlin |
|
@license LGPL, see file <a href="license.html">COPYING</a> |
|
|
|
*/ |
|
#ifndef __MRW__TOKENIZER_HPP__ |
|
#define __MRW__TOKENIZER_HPP__ |
|
|
|
#include <string> |
|
#include <stdexcept> |
|
|
|
namespace mrw { |
|
|
|
/** @addtogroup regexp |
|
*/ |
|
//@{ |
|
/** @defgroup regexptokenizer Tokenizer |
|
|
|
@pre \#include <mrw/tokenizer.hpp> |
|
|
|
The Tokenizer splits a string according to a list of
delimiters and lets you iterate over the individual tokens.
|
|
|
This code: |
|
|
|
@code |
|
mrw::Tokenizer token("first,second,third,,fifth", false, ","); |
|
while (token) std::cout<<"TOKEN: '"<<token()<<"'"<<std::endl; |
|
@endcode |
|
Writes: |
|
|
|
@verbatim |
|
TOKEN: 'first' |
|
TOKEN: 'second' |
|
TOKEN: 'third' |
|
TOKEN: '' |
|
TOKEN: 'fifth' |
|
@endverbatim |
|
*/ |
|
//@{ |
|
|
|
/** @brief split strings into parts separated by delimiters

    Splits a string into individual tokens according to a list of
    delimiter characters. The flag @c greedy decides how adjacent
    delimiters are handled:

     - greedy: a run of delimiters counts as one separator, so no
       empty tokens are produced; leading and trailing delimiters
       are skipped
     - non-greedy: every single delimiter ends a token, so adjacent,
       leading or trailing delimiters produce empty tokens

    This code:

    @code
    mrw::Tokenizer token("Hello world, here I am!", true, " \t\n,.?!");
    while (token) std::cout<<"TOKEN: '"<<token()<<"'"<<std::endl;
    @endcode

    Writes:

    @verbatim
    TOKEN: 'Hello'
    TOKEN: 'world'
    TOKEN: 'here'
    TOKEN: 'I'
    TOKEN: 'am'
    @endverbatim
*/
class Tokenizer {

  public:

    /** @brief initialize a Tokenizer

        @param text the text that has to be split into tokens

        @param greedy
        - @c true if several delimiters follow each other in the
                  text, eat them all and don't produce empty tokens
        - @c false every delimiter ends a token; a delimiter that is
                   directly followed by another delimiter produces
                   an empty token

        @param delim a list of delimiters, each char in the string is a
                     delimiter
    */
    Tokenizer(const std::string& text, bool greedy = true,
              const std::string& delim = " \t\n"):
        _text(text), _delim(delim), _pos(0), _oldpos(0),
        _greedy(greedy), _started(false) {
    }

    /** @brief advance to the next token

        Advances to the next token. Call this before you access the
        tokens. Note that every conversion to @c bool advances the
        tokenizer by one token.

        @code
        while (token) [...]
        @endcode

        @return
        - @c true if there is a token
        - @c false if the end was reached; the state is left
                   untouched in that case

        @see @ref mrw::Tokenizer for an example
    */
    operator bool() {
      // the previous token already extended to the end of the text
      if (_pos==std::string::npos) return false;
      std::string::size_type start(0);
      if (_greedy) {
        // skip the whole run of delimiters in front of the next token
        start = _text.find_first_not_of(_delim, _pos);
        if (start==std::string::npos) return false; // only delimiters left
      } else if (_started) {
        // _pos is the delimiter that ended the previous token; step
        // over exactly this one delimiter. The explicit flag is
        // required, because _pos==0 is also a valid delimiter position
        // (text starting with a delimiter).
        start = _pos+1;
      }
      _started = true;
      _oldpos = start;
      _pos = _text.find_first_of(_delim, _oldpos);
      return true;
    }

    /** @brief access the token

        Get the actual token. You must first call @c
        Tokenizer::operator bool(), before you can access a
        token. This method does not change the state of the object, so
        it can be accessed several times and always returns the same
        token.

        @code
        std::string actualToken = token();
        @endcode

        @return returns the actual token

        @see @ref mrw::Tokenizer for an example
    */
    std::string operator()() const {
      // if _pos is npos, substr clamps the length to the end of the text
      return _text.substr(_oldpos, _pos-_oldpos);
    }

    /** @brief reset the tokenizer

        Resets the tokenizer so, that you can restart your iteration.
    */
    void reset() throw() {
      _pos = _oldpos = 0;
      _started = false;
    }

    /** @brief reset the tokenizer and initializes it with a new text

        Resets the tokenizer so, that you can restart your iteration
        on new text. Delimiters and the greedy flag are kept.

        @code
        mrw::Tokenizer token("Hello World");
        while (token) std::cout<<"-> "<<token();
        token.reset("Another text to split");
        while (token) std::cout<<"-> "<<token();
        @endcode

        @param text the text given in the constructor is replaced
    */
    void reset(const std::string& text) {
      _text = text;
      reset();
    }

  private:

    std::string _text;              ///< the text that is split
    std::string _delim;             ///< every char is a delimiter
    std::string::size_type _pos;    ///< end of the actual token (or npos)
    std::string::size_type _oldpos; ///< start of the actual token
    bool _greedy;                   ///< merge adjacent delimiters
    bool _started;                  ///< a token has been found since reset

};
|
//@} |
|
//@} |
|
} |
|
#endif
|
|
|