/** @file

    $Id$

    $Date$
    $Author$

    @copy © Marc Wäckerlin
    @license LGPL, see file <a href="license.html">COPYING</a>

    $Log$
    Revision 1.2 2004/12/20 07:40:36 marc
    documentation improved, new grouping

    Revision 1.1 2004/12/17 16:26:58 marc
    initial version

*/

#include <string>
#include <stdexcept>

namespace mrw {

  /** @addtogroup regexp
   */
  //@{

  /** @defgroup regexptokenizer Tokenizer

      There is a Tokenizer that splits strings according to a list of
      delimiters and allows iterating over the individual tokens.

      This code:

      @code
      mrw::Tokenizer token("first,second,third,,fifth", false, ",");
      while (token) std::cout<<"TOKEN: '"<<token()<<"'"<<std::endl;
      @endcode

      Writes:

      @verbatim
      TOKEN: 'first'
      TOKEN: 'second'
      TOKEN: 'third'
      TOKEN: ''
      TOKEN: 'fifth'
      @endverbatim
  */
  //@{

  /** @brief split strings into parts separated by delimiters

      Splits a string into individual parts according to a list of
      delimiters. If the @c greedy flag is set, consecutive delimiters
      are treated as a single delimiter, so no empty tokens are
      produced.

      This code:

      @code
      mrw::Tokenizer token("Hello world, here I am!", true, " \t\n,.?!");
      while (token) std::cout<<"TOKEN: '"<<token()<<"'"<<std::endl;
      @endcode

      Writes:

      @verbatim
      TOKEN: 'Hello'
      TOKEN: 'world'
      TOKEN: 'here'
      TOKEN: 'I'
      TOKEN: 'am'
      @endverbatim
  */
  class Tokenizer {

  public:

    /** @brief initialize a Tokenizer

        @param text the text that has to be split into tokens

        @param greedy
          - @c true consecutive delimiters are collapsed into one, so
            no empty tokens are produced
          - @c false every delimiter ends a token, so two adjacent
            delimiters produce an empty token

        @param delim a list of delimiters; each char in the string is
          a delimiter
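
        For illustration, a minimal sketch contrasting the two modes
        on the same input (the variable names are only examples):

        @code
        mrw::Tokenizer collapse("a,,b", true, ",");    // yields "a", "b"
        mrw::Tokenizer keepEmpty("a,,b", false, ",");  // yields "a", "", "b"
        @endcode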
    */
    Tokenizer(const std::string& text, bool greedy = true,
              const std::string& delim = " \t\n")
      throw(std::bad_exception):
      _text(text), _delim(delim), _pos(0), _oldpos(0), _greedy(greedy) {
    }

    /** @brief advance to the next token

        Advances to the next token. Call this before you access the
        tokens:

        @code
        while (token) [...]
        @endcode

        @return
          - @c true if there is a token
          - @c false if the end was reached

        @see @ref mrw::Tokenizer for an example
    */
    operator bool() throw(std::bad_exception) {
      _oldpos = _greedy
                ? _text.find_first_not_of(_delim, _pos)
                : _pos>0 ? _pos+1 : _pos;
      if (_pos==std::string::npos || _oldpos==std::string::npos) return false;
      _pos = _text.find_first_of(_delim, _oldpos);
      return true;
    }

    /** @brief access the token

        Get the current token. You must first call
        @c Tokenizer::operator bool() before you can access a
        token. This method does not change the state of the object,
        so it can be called several times and always returns the same
        token.

        @code
        std::string actualToken = token();
        @endcode

        @return the current token

        @see @ref mrw::Tokenizer for an example
    */
    std::string operator()() const throw(std::bad_exception) {
      return _text.substr(_oldpos, _pos-_oldpos);
    }

    /** @brief reset the tokenizer

        Resets the tokenizer so that you can restart the iteration.
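
        For illustration, a minimal sketch that iterates over the same
        text twice:

        @code
        mrw::Tokenizer token("one two three");
        while (token) std::cout<<"-> "<<token();  // first pass
        token.reset();                            // start over
        while (token) std::cout<<"-> "<<token();  // second pass
        @endcode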
    */
    void reset() throw() {
      _pos = _oldpos = 0;
    }

    /** @brief reset the tokenizer and initialize it with a new text

        Resets the tokenizer so that you can restart the iteration on
        a new text.

        @code
        mrw::Tokenizer token("Hello World");
        while (token) std::cout<<"-> "<<token();
        token.reset("Another text to split");
        while (token) std::cout<<"-> "<<token();
        @endcode

        @param text the text given in the constructor is replaced
    */
    void reset(const std::string& text) throw(std::bad_exception) {
      _text = text;
      reset();
    }

  private:

    std::string _text;
    std::string _delim;
    std::string::size_type _pos;
    std::string::size_type _oldpos;
    bool _greedy;

  };

  //@}

  //@}

}
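
/* Usage sketch: a complete program that exercises the Tokenizer, based on
   the examples above. The include path is an assumption; adjust it to
   wherever this header is installed.

     #include "tokenizer.hxx"  // this header (path assumed)
     #include <iostream>

     int main() {
       mrw::Tokenizer words("Hello world, here I am!", true, " \t\n,.?!");
       while (words) std::cout<<"word: '"<<words()<<"'"<<std::endl;

       mrw::Tokenizer fields("first,second,third,,fifth", false, ",");
       while (fields) std::cout<<"field: '"<<fields()<<"'"<<std::endl;

       fields.reset("a,b,c");  // restart on a new text
       while (fields) std::cout<<"field: '"<<fields()<<"'"<<std::endl;
       return 0;
     }
*/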