Yet another string Tokenizer (C++)

Post by **fips** » Sun Sep 30, 2012 4:52 pm

There's certainly room for yet another C++ string Tokenizer, something lightweight and efficient for those who don't need the full power of the Boost.Tokenizer with all its pros and cons, something that can be easily plugged into an existing project without bringing too much dependencies. As I needed exactly one like that recently, I dusted of my older one, clean it up a bit by removing all the bloated template code (mostly related to the Unicode support), and voila! here it is:

VIEW THE CODE BELOW IN FULL-SCREEN (tokenizer_sample.cpp)

Code: Select all

/*
(c) 2012 +++ Filip Stoklas, aka FipS, http://www.4FipS.com +++
THIS CODE IS FREE - LICENSED UNDER THE MIT LICENSE
ARTICLE URL: http://forums.4fips.com/viewtopic.php?f=3&t=810
*/

#include <vector>
#include <string>
#include <cstdio>
#include <cassert>

/// A very lightweight in-place string Tokenizer that can handle quoted tokens.
class Tokenizer
{
 public:

    explicit Tokenizer(const char *text, const char *delims = " \t\n", char quote = '\0');
    const char * next() const;

 private:

    // these are only refs to the external data!
    const char *_text;
    const char *_delims;
    char _quote;

    mutable size_t _offset;
    mutable std::vector<char> _token; // a copy of the current token
};

Tokenizer::Tokenizer(const char *text, const char *delims, char quote):
_text(text),
_delims(delims),
_quote(quote),
_offset(0),
_token()
{
    assert(text && delims);
}

const char * Tokenizer::next() const
{
    static const size_t end_of_string = size_t(-1); // offset terminator: 0xffff...

    if(_offset == end_of_string) // nothing more to consume (from the previous run)
        return nullptr;

    // eat up left delimiter(s), stop at the first token char, quote, or '\0'
    assert(_offset <= strlen(_text)); // it's allowed to point to '\0' as well...
    const char *from = _text + _offset;
    while(*from && strchr(_delims, *from)) ++from;

    const bool quoted_token = *from == _quote;
    const bool double_quotes = quoted_token && from[1] == _quote; // can't overflow here...

    from += quoted_token ? 1 : 0; // if quoted, skip the quote char

    if(!*from) // end-of-string reached
    {
        //  F     F
        // .0 | ."0
        _offset = end_of_string; // finalize & exit
        return nullptr;
    }

    const char *to = double_quotes ? from : from + 1; // optionally collapse quotes
    //  FT     FT     F~~~T
    // .A? | ."A? | .""?

    if(quoted_token)
    {
        if(!double_quotes)
        {
            to = strchr(to, _quote); // stop at the next quote
            if(!to) // missing right quote, discard the whole token
            {
                _offset = end_of_string; // finalize & exit
                return nullptr;
            }
        }
    }
    else
    {
        // stop at the next delim, '\0' or on an unexpected quote
        while(*to != _quote && !strchr(_delims, *to)) ++to;
    }

    //  F  T    F  T     F  T    FxxT ~ unexpected
    // .AAA0 | .AAA. | ."AAA" | .AAA"

    // copy the token
    const size_t size = to - from; // doesn't include trailing '\0'
    _token.resize(size + 1); // + trailing char
    std::copy(from, from + size, &_token[0]); // without trailing '\0'
    _token[size] = '\0'; // put trailing '\0' (needed when shrinking)

    // advance the offset
    if(*to)
    {
        const bool unexpected_quote = !quoted_token && (*to == _quote);
        const size_t step = unexpected_quote ? 0 : 1; // reuse the ending quote
        _offset = to - _text + step;
    }
    else // end-of-string reached
    {
        _offset = end_of_string;
    }

    assert(!_token.empty());
    return &_token[0];
}

int main()
{
    {
        // tokenize a string using <comma> and <space> as delimiters,
        // and <apostrophe> as a quote character:

        const Tokenizer tk("John, Jane, 'John Doe', 'Jane Roe'", ", ", '\'');
        for(const char *token = tk.next(); token; token = tk.next())
        {
            printf("<%s>\n", token);
        }
    }

    {
        // tokenize key/value pairs (with optional '=' separator),
        // note that multiple consecutive quoted tokens are treated individually:

        const Tokenizer tk("key1 = value1 | 'multi key2' = 'multi value2' | 'key3''value3'", "|", '\0');
        for(const char *token = tk.next(); token; token = tk.next())
        {
            const Tokenizer tk2(token, "= ", '\''); // we need both here: '=' and <space>
            std::string key = tk2.next(); // keep a copy
            const char *value = tk2.next();
            printf("<%s> = <%s>\n", key.c_str(), value);
        }
    }

    // also, run a bunch of tests:

    auto test_tokenizer = [](const std::string &input, const std::string &expected_result)->bool
    {
        std::string result;
        const Tokenizer tk(input.c_str(), ".", '\'');
        for(const char *token = tk.next(); token; token = tk.next())
        {
            result += std::string("<") + token + ">";
        }
        return result == expected_result;
    };

    // well formatted
    assert(test_tokenizer(".tok.tok.'.tok.'.'.tok.''.tok.'", "<tok><tok><.tok.><.tok.><.tok.>"));
    assert(test_tokenizer("", ""));
    assert(test_tokenizer(".", ""));
    assert(test_tokenizer("''", "<>"));
    assert(test_tokenizer("'.'", "<.>"));
    assert(test_tokenizer("''''''", "<><><>"));
    assert(test_tokenizer(".''.''.''.", "<><><>"));
    assert(test_tokenizer("A", "<A>"));
    assert(test_tokenizer("ABC", "<ABC>"));
    assert(test_tokenizer("'A''B''C'", "<A><B><C>"));
    assert(test_tokenizer(".A.", "<A>"));
    assert(test_tokenizer(".'.A.'.", "<.A.>"));
    assert(test_tokenizer(".A.'.B.'.C.", "<A><.B.><C>"));
    assert(test_tokenizer(".'.ABC.'.'.DEF.'.", "<.ABC.><.DEF.>"));

    // mismatching right quotes
    assert(test_tokenizer(".tok.tok.'.tok.'.'.tok.''.tok.''.ignored.", "<tok><tok><.tok.><.tok.><.tok.>"));
    assert(test_tokenizer("'", ""));
    assert(test_tokenizer(".'.", ""));
    assert(test_tokenizer("'''", "<>"));
    assert(test_tokenizer(".'''.", "<>"));
    assert(test_tokenizer(".''.''.'.", "<><>"));
    assert(test_tokenizer("'A'B'", "<A><B>"));
    assert(test_tokenizer(".A.'.B.'.C.'.D.", "<A><.B.><C>"));
    assert(test_tokenizer("A.''B'C.D", "<A><><B>"));
    assert(test_tokenizer(".A.''.B.'''''.C.", "<A><><B><><>"));

    return 0;
}

// output:
// <John>
// <Jane>
// <John Doe>
// <Jane Roe>
// <key1> = <value1>
// <multi key2> = <multi value2>
// <key3> = <value3>

It's worth mentioning a few things: The Tokenizer doesn't allocate any memory except for the current token that is kept inside a resizeable std::vector. There's also no need for pre-processing of the whole input text, we even don't need to know its length, so we can cheaply operate on terabytes of data, which is especially useful if we are interested only in a few initial tokens!