/*
 * PROJECT:     ReactOS host tools
 * LICENSE:     MIT (https://spdx.org/licenses/MIT)
 * PURPOSE:     Tokenizer class implementation
 * COPYRIGHT:   Copyright 2021 Timo Kreuzer
 */

// NOTE(review): the header names were missing from the reviewed copy of this
// file. The list below is what the code in this file needs; confirm it against
// the original include list.
#include <cstddef>
#include <ctime>
#include <regex>
#include <string>
#include <vector>

// Uncomment this for easier debugging
#if 0
#define throw __debugbreak(); throw
#endif

// Accumulated time() delta spent inside std::regex_search (see
// Tokenizer::match). Defined in another translation unit.
extern time_t search_time;

// Definition of a single token class: a numeric type identifier and the
// regular expression that recognizes it. The expression must contain exactly
// one capture group (validated by Tokenizer::CompileMultiRegex).
struct TOKEN_DEF
{
    int Type;
    std::string RegExString;
};

// A view of one token inside a text. Only a reference to the text is stored,
// so the text must outlive the Token.
class Token
{
    const std::string& m_text;
    unsigned int m_pos;
    unsigned int m_len;
#if _DEBUG
    // Copy of the token text, so it can be inspected in a debugger
    std::string m_dbgstr;
#endif
    int m_type;

public:

    // text: the string the token refers to (not copied)
    // pos:  offset of the first character of the token in text
    // len:  length of the token in characters
    // type: token type
    // pos and len are stored as unsigned int.
    Token(const std::string& text, size_t pos, size_t len, int type)
        : m_text(text),
        m_pos(static_cast<unsigned int>(pos)),
        m_len(static_cast<unsigned int>(len)),
        m_type(type)
    {
#if _DEBUG
        m_dbgstr = str();
#endif
    }

    // Returns a copy of the token's characters
    std::string str() const
    {
        return m_text.substr(m_pos, m_len);
    }

    // Returns the token type passed to the constructor
    int type() const
    {
        return m_type;
    }
};

// Matches the start of a string against a list of token definitions, using a
// single regular expression that is the alternation of all token expressions.
// Only a reference to the token definitions is stored, so they must outlive
// the Tokenizer.
struct Tokenizer
{
    const std::vector<TOKEN_DEF> &m_tokendefs;
    const std::regex m_re;

    // Not used within this file
    typedef int myint;

    // Counts the capture groups in a regular expression string.
    // An opening parenthesis counts unless it is escaped with a backslash,
    // located inside a [...] character group, or followed by '?'.
    static
    unsigned int
    count_captures(const std::string& exp)
    {
        bool in_char_group = false;
        unsigned int count = 0;

        for (size_t i = 0; i < exp.size(); i++)
        {
            const char c = exp[i];

            // Skip escaped characters (the backslash and what follows it)
            if (c == '\\')
            {
                i++;
                continue;
            }

            // Inside a character group only the closing bracket is relevant
            if (in_char_group)
            {
                if (c == ']')
                {
                    in_char_group = false;
                }
                continue;
            }

            if (c == '[')
            {
                in_char_group = true;
                continue;
            }

            if (c == '(')
            {
                // A parenthesis at the very end of the string counts as well
                const bool has_next = (i + 1) < exp.size();
                if (!has_next || (exp[i + 1] != '?'))
                {
                    count++;
                }
            }
        }

        return count;
    }

    // Builds one case-insensitive regular expression from all token
    // definitions. Returns a default-constructed regex when tokendefs is
    // empty. Throws a const char* when a definition does not contain exactly
    // one capture group.
    static
    std::regex
    CompileMultiRegex(const std::vector<TOKEN_DEF> &tokendefs)
    {
        if (tokendefs.empty())
        {
            return std::regex();
        }

        // Validate all token definitions
        for (const TOKEN_DEF& def : tokendefs)
        {
            // match() maps capture group N to token definition N - 1, which
            // only works when every definition has exactly one capture group
            if (count_captures(def.RegExString) != 1)
            {
                throw "invalid count!\n";
            }
        }

        // Combine all expressions into one (one capture group for each)
        std::string combinedString;
        for (const TOKEN_DEF& def : tokendefs)
        {
            if (!combinedString.empty())
            {
                combinedString += "|";
            }
            combinedString += "(?:" + def.RegExString + ")";
        }

        return std::regex(combinedString, std::regex_constants::icase);
    }

public:

    // Location and type of a token within a string
    struct TOKEN_REF
    {
        unsigned int pos;
        unsigned int len;
        int type;
    };

    // Keeps a reference to tokendefs and compiles the combined expression.
    // Throws a const char* when a definition is invalid
    // (see CompileMultiRegex).
    Tokenizer(std::vector<TOKEN_DEF> &tokendefs)
        : m_tokendefs(tokendefs),
          m_re(CompileMultiRegex(tokendefs))
    {
    }

    // Matches a token at the beginning of str (see the overload below)
    TOKEN_REF match(std::smatch &matches, const std::string& str) const
    {
        return match(matches, str, 0);
    }

    // Matches a token at offset startpos of str.
    // matches:  receives the regex match results; its sub-matches refer to str
    // str:      the text to tokenize
    // startpos: offset in str at which the token has to start
    // Returns the position, length and type of the token. When startpos is
    // the end of str, a TOKEN_REF with length 0 and type -1 is returned.
    // Throws a const char* when startpos lies beyond the end of str, or when
    // no token definition matches exactly at startpos.
    TOKEN_REF match(std::smatch &matches, const std::string &str, size_t startpos) const
    {
        // An iterator past the end of the string must not be formed
        if (startpos > str.size())
        {
            throw "Start position out of range!\n";
        }

        const std::string::const_iterator first = str.cbegin() + startpos;
        const std::string::const_iterator last = str.cend();

        // If we reached the end, there is nothing more to do
        if (first == last)
        {
            return TOKEN_REF{ static_cast<unsigned int>(startpos), 0, -1 };
        }

        const time_t start_time = time(nullptr);

        // Try to find a match
        if (!std::regex_search(first, last, matches, m_re))
        {
            throw "Failed to match\n";
        }

        search_time += time(nullptr) - start_time;

        // Validate that it's at the start of the string: a non-empty prefix
        // means that characters were skipped before the match
        if (matches.prefix().matched)
        {
            throw "Failed to match at current position!\n";
        }

        // We have a match, check which one it is. Capture group i belongs
        // to token definition i - 1.
        for (size_t i = 1; i < matches.size(); i++)
        {
            if (matches[i].matched)
            {
                const unsigned int len = static_cast<unsigned int>(matches.length(i));
                const int type = m_tokendefs[i - 1].Type;
                return TOKEN_REF{ static_cast<unsigned int>(startpos), len, type };
            }
        }

        // We should never get here
        throw "Something went wrong!\n";
    }
};

// The complete list of tokens of a text. Only references to the tokenizer
// and to the text are stored, so both must outlive the TokenList.
class TokenList
{
    using TOKEN_REF = Tokenizer::TOKEN_REF;

    const Tokenizer& m_tokenizer;
    const std::string& m_text;
    std::vector<TOKEN_REF> m_tokens;

public:

    // Tokenizes the whole text.
    // Throws a const char* when the text cannot be tokenized
    // (see Tokenizer::match) or when a token of length 0 is matched.
    TokenList(const Tokenizer& tokenizer, const std::string& text)
        : m_tokenizer(tokenizer),
          m_text(text)
    {
        const size_t len = m_text.size();
        size_t startpos = 0;
        std::smatch matches;

        // Initial capacity: one token per 5 characters of text
        m_tokens.reserve(len / 5);

        while (startpos < len)
        {
            const TOKEN_REF tref = m_tokenizer.match(matches, m_text, startpos);

            // A token expression that matches the empty string would never
            // advance startpos and make this loop run forever
            if (tref.len == 0)
            {
                throw "Matched a token of length 0!\n";
            }

            m_tokens.push_back(tref);
            startpos += tref.len;
        }
    }

    // Returns the number of tokens
    size_t size() const
    {
        return m_tokens.size();
    }

    // Returns the n-th token. n is not range checked.
    Token operator[](size_t n) const
    {
        const TOKEN_REF& ref = m_tokens[n];
        return Token(m_text, ref.pos, ref.len, ref.type);
    }
};