reactos/sdk/tools/asmpp/tokenizer.hpp

/*
 * PROJECT:     ReactOS host tools
 * LICENSE:     MIT (https://spdx.org/licenses/MIT)
 * PURPOSE:     Tokenizer class implementation
 * COPYRIGHT:   Copyright 2021 Timo Kreuzer <timo.kreuzer@reactos.org>
 */

#include <string>
#include <vector>
#include <fstream>
#include <regex>
#include <ctime>

// Change the 0 below to 1 to break into the debugger before every throw
#if 0
#define throw __debugbreak(); throw
#endif

extern time_t search_time;

// Describes one kind of token: a caller-chosen type id paired with the
// regular expression that recognizes it.
struct TOKEN_DEF
{
    // Value reported as the token type when this definition's expression matches
    int Type;
    // Regular expression source. Tokenizer::CompileMultiRegex rejects the
    // definition unless it contains exactly one capture group.
    std::string RegExString;
};

// A typed slice of a string owned by someone else.
//
// Only a reference to the text is kept, together with the offset and length
// of the slice, so the referenced string has to stay alive for as long as the
// Token is used.
class Token
{
public:

    // text: string the token refers to (stored by reference, not copied)
    // pos:  offset of the token's first character within text
    // len:  number of characters in the token
    // type: token type id
    // pos and len are stored as unsigned int.
    Token(const std::string& text, size_t pos, size_t len, int type)
        : m_text(text)
        , m_pos(static_cast<unsigned int>(pos))
        , m_len(static_cast<unsigned int>(len))
        , m_type(type)
    {
#if _DEBUG
        // Keep a readable copy around so the token shows up in the debugger
        m_dbgstr = str();
#endif
    }

    // Returns a copy of the token's characters. The length is clipped at the
    // end of the text; an offset beyond the end throws std::out_of_range.
    std::string str() const
    {
        return std::string(m_text, m_pos, m_len);
    }

    // Returns the type id passed to the constructor.
    int type() const
    {
        return m_type;
    }

private:

    const std::string& m_text;
    unsigned int m_pos;
    unsigned int m_len;
#if _DEBUG
    std::string m_dbgstr;
#endif
    int m_type;
};

struct Tokenizer
{
    const std::vector<TOKEN_DEF> &m_tokendefs;
    const std::regex m_re;

    typedef int myint;

    static
    unsigned int
    count_captures(const std::string& exp)
    {
        bool in_char_group = false;
        unsigned int count = 0;

        for (size_t i = 0; i < exp.size(); i++)
        {
            char c = exp[i];

            // Skip escaped characters
            if (c == '\\')
            {
                i++;
                continue;
            }

            if (in_char_group)
            {
                if (c == ']')
                {
                    in_char_group = false;
                }
                continue;
            }

            if (c == '[')
            {
                in_char_group = true;
                continue;
            }

            if (c == '(')
            {
                if (exp[i + 1] != '?')
                {
                    count++;
                }
            }
        }

        return count;
    }

    static
    std::regex
    CompileMultiRegex(const std::vector<TOKEN_DEF> &tokendefs)
    {
        std::string combinedString;

        if (tokendefs.size() == 0)
        {
            return std::regex();
        }

        // Validate all token definitions
        for (auto def : tokendefs)
        {
            size_t found = -1;

            // Count capture groups
            unsigned int count = count_captures(def.RegExString);
            if (count != 1)
            {
                throw "invalid count!\n";
            }
        }

        // Combine all expressions into one (one capture group for each)
        combinedString = "(?:" + tokendefs[0].RegExString + ")";
        for (size_t i = 1; i < tokendefs.size(); i++)
        {
            combinedString += "|(?:" + tokendefs[i].RegExString + ")";
        }

        return std::regex(combinedString, std::regex_constants::icase);
    }

public:

    struct TOKEN_REF
    {
        unsigned int pos;
        unsigned int len;
        int type;
    };

    Tokenizer(std::vector<TOKEN_DEF> &tokendefs)
        : m_tokendefs(tokendefs),
          m_re(CompileMultiRegex(tokendefs))
    {
    }

    TOKEN_REF match(std::smatch &matches, const std::string& str) const
    {
        return match(matches, str, 0);
    }

    TOKEN_REF match(std::smatch &matches, const std::string &str, size_t startpos) const
    {
        const std::string::const_iterator first = str.cbegin() + startpos;
        const std::string::const_iterator last = str.cend();

        // If we reached the end, there is nothing more to do
        if (first == last)
        {
            return TOKEN_REF{ static_cast<unsigned int>(startpos), 0, -1 };
        }

        time_t start_time = time(NULL);

        // Try to find a match
        if (!std::regex_search(first, last, matches, m_re))
        {
            throw "Failed to match\n";
        }

        search_time += time(NULL) - start_time;

        // Validate that it's at the start of the string
        if (matches.prefix().matched)
        {
            throw "Failed to match at current position!\n";
        }
        
        // We have a match, check which one it is
        for (size_t i = 1; i < matches.size(); i++)
        {
            if (matches[i].matched)
            {
                unsigned int len = static_cast<unsigned int>(matches.length(i));
                int type = m_tokendefs[i - 1].Type;
                return TOKEN_REF{ static_cast<unsigned int>(startpos), len, type};
            }
        }

        // We should never get here
        throw "Something went wrong!\n";
    }
};


class TokenList
{
    using TOKEN_REF = typename Tokenizer::TOKEN_REF;

    const Tokenizer& m_tokenizer;
    const std::string& m_text;
    std::vector<TOKEN_REF> m_tokens;

public:

    TokenList(const Tokenizer& tokenizer, const std::string& text)
        : m_tokenizer(tokenizer),
          m_text(text)
    {
        size_t startpos = 0;
        size_t len = m_text.size();
        std::smatch matches;

        m_tokens.reserve(len / 5);

        while (startpos < len)
        {
            TOKEN_REF tref = m_tokenizer.match(matches, m_text, startpos);
            m_tokens.push_back(tref);
            startpos += tref.len;
        };
    }

    size_t size() const
    {
        return m_tokens.size();
    }

    Token operator[](size_t n) const
    {
        return Token(m_text, m_tokens[n].pos, m_tokens[n].len, m_tokens[n].type);
    }

};
[ASMPP] Implement asm preprocessor This converts ML style assembly to GAS compatible syntax 2022-06-20 15:29:01 +00:00			`/*`
			`* PROJECT: ReactOS host tools`
			`* LICENSE: MIT (https://spdx.org/licenses/MIT)`
			`* PURPOSE: Tokenizer class implementation`
			`* COPYRIGHT: Copyright 2021 Timo Kreuzer <timo.kreuzer@reactos.org>`
			`*/`

			`#include <string>`
			`#include <vector>`
			`#include <fstream>`
			`#include <regex>`
			`#include <ctime>`

			`// Uncomment this for easier debugging`
			`#if 0`
			`#define throw __debugbreak(); throw`
			`#endif`

			`extern time_t search_time;`

			`struct TOKEN_DEF`
			`{`
			`int Type;`
			`std::string RegExString;`
			`};`

			`class Token`
			`{`
			`const std::string& m_text;`
			`unsigned int m_pos;`
			`unsigned int m_len;`
			`#if _DEBUG`
			`std::string m_dbgstr;`
			`#endif`
			`int m_type;`

			`public:`

			`Token(const std::string& text, size_t pos, size_t len, int type)`
			`: m_text(text),`
			`m_pos(static_cast<unsigned int>(pos)),`
			`m_len(static_cast<unsigned int>(len)),`
			`m_type(type)`
			`{`
			`#if _DEBUG`
			`m_dbgstr = str();`
			`#endif`
			`}`

			`std::string str() const`
			`{`
			`return m_text.substr(m_pos, m_len);`
			`}`

			`int type() const`
			`{`
			`return m_type;`
			`}`
			`};`

			`struct Tokenizer`
			`{`
			`const std::vector<TOKEN_DEF> &m_tokendefs;`
			`const std::regex m_re;`

			`typedef int myint;`

			`static`
			`unsigned int`
			`count_captures(const std::string& exp)`
			`{`
			`bool in_char_group = false;`
			`unsigned int count = 0;`

			`for (size_t i = 0; i < exp.size(); i++)`
			`{`
			`char c = exp[i];`

			`// Skip escaped characters`
			`if (c == '\\')`
			`{`
			`i++;`
			`continue;`
			`}`

			`if (in_char_group)`
			`{`
			`if (c == ']')`
			`{`
			`in_char_group = false;`
			`}`
			`continue;`
			`}`

			`if (c == '[')`
			`{`
			`in_char_group = true;`
			`continue;`
			`}`

			`if (c == '(')`
			`{`
			`if (exp[i + 1] != '?')`
			`{`
			`count++;`
			`}`
			`}`
			`}`

			`return count;`
			`}`

			`static`
			`std::regex`
			`CompileMultiRegex(const std::vector<TOKEN_DEF> &tokendefs)`
			`{`
			`std::string combinedString;`

			`if (tokendefs.size() == 0)`
			`{`
			`return std::regex();`
			`}`

			`// Validate all token definitions`
			`for (auto def : tokendefs)`
			`{`
			`size_t found = -1;`

			`// Count capture groups`
			`unsigned int count = count_captures(def.RegExString);`
			`if (count != 1)`
			`{`
			`throw "invalid count!\n";`
			`}`
			`}`

			`// Combine all expressions into one (one capture group for each)`
			`combinedString = "(?:" + tokendefs[0].RegExString + ")";`
			`for (size_t i = 1; i < tokendefs.size(); i++)`
			`{`
			`combinedString += "\|(?:" + tokendefs[i].RegExString + ")";`
			`}`

			`return std::regex(combinedString, std::regex_constants::icase);`
			`}`

			`public:`

			`struct TOKEN_REF`
			`{`
			`unsigned int pos;`
			`unsigned int len;`
			`int type;`
			`};`

			`Tokenizer(std::vector<TOKEN_DEF> &tokendefs)`
			`: m_tokendefs(tokendefs),`
			`m_re(CompileMultiRegex(tokendefs))`
			`{`
			`}`

			`TOKEN_REF match(std::smatch &matches, const std::string& str) const`
			`{`
			`return match(matches, str, 0);`
			`}`

			`TOKEN_REF match(std::smatch &matches, const std::string &str, size_t startpos) const`
			`{`
			`const std::string::const_iterator first = str.cbegin() + startpos;`
			`const std::string::const_iterator last = str.cend();`

			`// If we reached the end, there is nothing more to do`
			`if (first == last)`
			`{`
			`return TOKEN_REF{ static_cast<unsigned int>(startpos), 0, -1 };`
			`}`

			`time_t start_time = time(NULL);`

			`// Try to find a match`
			`if (!std::regex_search(first, last, matches, m_re))`
			`{`
			`throw "Failed to match\n";`
			`}`

			`search_time += time(NULL) - start_time;`

			`// Validate that it's at the start of the string`
			`if (matches.prefix().matched)`
			`{`
			`throw "Failed to match at current position!\n";`
			`}`

			`// We have a match, check which one it is`
			`for (size_t i = 1; i < matches.size(); i++)`
			`{`
			`if (matches[i].matched)`
			`{`
			`unsigned int len = static_cast<unsigned int>(matches.length(i));`
			`int type = m_tokendefs[i - 1].Type;`
			`return TOKEN_REF{ static_cast<unsigned int>(startpos), len, type};`
			`}`
			`}`

			`// We should never get here`
			`throw "Something went wrong!\n";`
			`}`
			`};`


			`class TokenList`
			`{`
			`using TOKEN_REF = typename Tokenizer::TOKEN_REF;`

			`const Tokenizer& m_tokenizer;`
			`const std::string& m_text;`
			`std::vector<TOKEN_REF> m_tokens;`

			`public:`

			`TokenList(const Tokenizer& tokenizer, const std::string& text)`
			`: m_tokenizer(tokenizer),`
			`m_text(text)`
			`{`
			`size_t startpos = 0;`
			`size_t len = m_text.size();`
			`std::smatch matches;`

			`m_tokens.reserve(len / 5);`

			`while (startpos < len)`
			`{`
			`TOKEN_REF tref = m_tokenizer.match(matches, m_text, startpos);`
			`m_tokens.push_back(tref);`
			`startpos += tref.len;`
			`};`
			`}`

			`size_t size() const`
			`{`
			`return m_tokens.size();`
			`}`

			`Token operator[](size_t n) const`
			`{`
			`return Token(m_text, m_tokens[n].pos, m_tokens[n].len, m_tokens[n].type);`
			`}`

			`};`