Code: Select all
/*
(c) 2012 +++ Filip Stoklas, aka FipS, http://www.4FipS.com +++
THIS CODE IS FREE - LICENSED UNDER THE MIT LICENSE
ARTICLE URL: http://forums.4fips.com/viewtopic.php?f=3&t=810
*/
#include <vector>
#include <string>
#include <cstdio>
#include <cassert>
/// A very lightweight in-place string Tokenizer that can handle quoted tokens.
class Tokenizer
{
public:
explicit Tokenizer(const char *text, const char *delims = " \t\n", char quote = '\0');
const char * next() const;
private:
// these are only refs to the external data!
const char *_text;
const char *_delims;
char _quote;
mutable size_t _offset;
mutable std::vector<char> _token; // a copy of the current token
};
Tokenizer::Tokenizer(const char *text, const char *delims, char quote):
_text(text),
_delims(delims),
_quote(quote),
_offset(0),
_token()
{
assert(text && delims);
}
const char * Tokenizer::next() const
{
static const size_t end_of_string = size_t(-1); // offset terminator: 0xffff...
if(_offset == end_of_string) // nothing more to consume (from the previous run)
return nullptr;
// eat up left delimiter(s), stop at the first token char, quote, or '\0'
assert(_offset <= strlen(_text)); // it's allowed to point to '\0' as well...
const char *from = _text + _offset;
while(*from && strchr(_delims, *from)) ++from;
const bool quoted_token = *from == _quote;
const bool double_quotes = quoted_token && from[1] == _quote; // can't overflow here...
from += quoted_token ? 1 : 0; // if quoted, skip the quote char
if(!*from) // end-of-string reached
{
// F F
// .0 | ."0
_offset = end_of_string; // finalize & exit
return nullptr;
}
const char *to = double_quotes ? from : from + 1; // optionally collapse quotes
// FT FT F~~~T
// .A? | ."A? | .""?
if(quoted_token)
{
if(!double_quotes)
{
to = strchr(to, _quote); // stop at the next quote
if(!to) // missing right quote, discard the whole token
{
_offset = end_of_string; // finalize & exit
return nullptr;
}
}
}
else
{
// stop at the next delim, '\0' or on an unexpected quote
while(*to != _quote && !strchr(_delims, *to)) ++to;
}
// F T F T F T FxxT ~ unexpected
// .AAA0 | .AAA. | ."AAA" | .AAA"
// copy the token
const size_t size = to - from; // doesn't include trailing '\0'
_token.resize(size + 1); // + trailing char
std::copy(from, from + size, &_token[0]); // without trailing '\0'
_token[size] = '\0'; // put trailing '\0' (needed when shrinking)
// advance the offset
if(*to)
{
const bool unexpected_quote = !quoted_token && (*to == _quote);
const size_t step = unexpected_quote ? 0 : 1; // reuse the ending quote
_offset = to - _text + step;
}
else // end-of-string reached
{
_offset = end_of_string;
}
assert(!_token.empty());
return &_token[0];
}
int main()
{
{
// tokenize a string using <comma> and <space> as delimiters,
// and <apostrophe> as a quote character:
const Tokenizer tk("John, Jane, 'John Doe', 'Jane Roe'", ", ", '\'');
for(const char *token = tk.next(); token; token = tk.next())
{
printf("<%s>\n", token);
}
}
{
// tokenize key/value pairs (with optional '=' separator),
// note that multiple consecutive quoted tokens are treated individually:
const Tokenizer tk("key1 = value1 | 'multi key2' = 'multi value2' | 'key3''value3'", "|", '\0');
for(const char *token = tk.next(); token; token = tk.next())
{
const Tokenizer tk2(token, "= ", '\''); // we need both here: '=' and <space>
std::string key = tk2.next(); // keep a copy
const char *value = tk2.next();
printf("<%s> = <%s>\n", key.c_str(), value);
}
}
// also, run a bunch of tests:
auto test_tokenizer = [](const std::string &input, const std::string &expected_result)->bool
{
std::string result;
const Tokenizer tk(input.c_str(), ".", '\'');
for(const char *token = tk.next(); token; token = tk.next())
{
result += std::string("<") + token + ">";
}
return result == expected_result;
};
// well formatted
assert(test_tokenizer(".tok.tok.'.tok.'.'.tok.''.tok.'", "<tok><tok><.tok.><.tok.><.tok.>"));
assert(test_tokenizer("", ""));
assert(test_tokenizer(".", ""));
assert(test_tokenizer("''", "<>"));
assert(test_tokenizer("'.'", "<.>"));
assert(test_tokenizer("''''''", "<><><>"));
assert(test_tokenizer(".''.''.''.", "<><><>"));
assert(test_tokenizer("A", "<A>"));
assert(test_tokenizer("ABC", "<ABC>"));
assert(test_tokenizer("'A''B''C'", "<A><B><C>"));
assert(test_tokenizer(".A.", "<A>"));
assert(test_tokenizer(".'.A.'.", "<.A.>"));
assert(test_tokenizer(".A.'.B.'.C.", "<A><.B.><C>"));
assert(test_tokenizer(".'.ABC.'.'.DEF.'.", "<.ABC.><.DEF.>"));
// mismatching right quotes
assert(test_tokenizer(".tok.tok.'.tok.'.'.tok.''.tok.''.ignored.", "<tok><tok><.tok.><.tok.><.tok.>"));
assert(test_tokenizer("'", ""));
assert(test_tokenizer(".'.", ""));
assert(test_tokenizer("'''", "<>"));
assert(test_tokenizer(".'''.", "<>"));
assert(test_tokenizer(".''.''.'.", "<><>"));
assert(test_tokenizer("'A'B'", "<A><B>"));
assert(test_tokenizer(".A.'.B.'.C.'.D.", "<A><.B.><C>"));
assert(test_tokenizer("A.''B'C.D", "<A><><B>"));
assert(test_tokenizer(".A.''.B.'''''.C.", "<A><><B><><>"));
return 0;
}
// output:
// <John>
// <Jane>
// <John Doe>
// <Jane Roe>
// <key1> = <value1>
// <multi key2> = <multi value2>
// <key3> = <value3>