[ASMPP] Implement asm preprocessor
This converts ML-style (MASM) assembly to GAS-compatible syntax.
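To give a sense of the rewriting involved (an illustrative fragment, not taken from this commit; the actual directive mappings are defined by the converter itself), ML/MASM input such as

    PUBLIC _MyFunc
    _MyFunc PROC
        mov eax, dword ptr [esp + 4]
        ret
    _MyFunc ENDP

corresponds to GAS-compatible output along the lines of

    .intel_syntax noprefix
    .global _MyFunc
    _MyFunc:
        mov eax, dword ptr [esp + 4]
        ret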
parent 7277e26944
commit 61cc62d1b2
10 changed files with 1700 additions and 18 deletions
sdk/tools/asmpp/tokenizer.hpp (new file, 249 lines)
@@ -0,0 +1,249 @@
/*
 * PROJECT:     ReactOS host tools
 * LICENSE:     MIT (https://spdx.org/licenses/MIT)
 * PURPOSE:     Tokenizer class implementation
 * COPYRIGHT:   Copyright 2021 Timo Kreuzer <timo.kreuzer@reactos.org>
 */

#include <string>
#include <vector>
#include <fstream>
#include <regex>
#include <ctime>

// Change this to '#if 1' for easier debugging
#if 0
#define throw __debugbreak(); throw
#endif

extern time_t search_time;

struct TOKEN_DEF
{
    int Type;
    std::string RegExString;
};

class Token
{
    const std::string& m_text;
    unsigned int m_pos;
    unsigned int m_len;
#if _DEBUG
    std::string m_dbgstr;
#endif
    int m_type;

public:

    Token(const std::string& text, size_t pos, size_t len, int type)
        : m_text(text),
          m_pos(static_cast<unsigned int>(pos)),
          m_len(static_cast<unsigned int>(len)),
          m_type(type)
    {
#if _DEBUG
        m_dbgstr = str();
#endif
    }

    std::string str() const
    {
        return m_text.substr(m_pos, m_len);
    }

    int type() const
    {
        return m_type;
    }
};

struct Tokenizer
{
    // Note: only references are stored, so the token definitions
    // passed to the constructor must outlive the Tokenizer.
    const std::vector<TOKEN_DEF> &m_tokendefs;
    const std::regex m_re;

    static
    unsigned int
    count_captures(const std::string& exp)
    {
        bool in_char_group = false;
        unsigned int count = 0;

        for (size_t i = 0; i < exp.size(); i++)
        {
            char c = exp[i];

            // Skip escaped characters
            if (c == '\\')
            {
                i++;
                continue;
            }

            if (in_char_group)
            {
                if (c == ']')
                {
                    in_char_group = false;
                }
                continue;
            }

            if (c == '[')
            {
                in_char_group = true;
                continue;
            }

            if (c == '(')
            {
                // Count only capturing groups, i.e. '(' not followed by '?'
                if (((i + 1) < exp.size()) && (exp[i + 1] != '?'))
                {
                    count++;
                }
            }
        }

        return count;
    }

    static
    std::regex
    CompileMultiRegex(const std::vector<TOKEN_DEF> &tokendefs)
    {
        std::string combinedString;

        if (tokendefs.size() == 0)
        {
            return std::regex();
        }

        // Validate all token definitions
        for (const auto& def : tokendefs)
        {
            // Each definition must have exactly one capture group
            unsigned int count = count_captures(def.RegExString);
            if (count != 1)
            {
                throw "invalid count!\n";
            }
        }

        // Combine all expressions into one (one capture group for each)
        combinedString = "(?:" + tokendefs[0].RegExString + ")";
        for (size_t i = 1; i < tokendefs.size(); i++)
        {
            combinedString += "|(?:" + tokendefs[i].RegExString + ")";
        }

        return std::regex(combinedString, std::regex_constants::icase);
    }

public:

    struct TOKEN_REF
    {
        unsigned int pos;
        unsigned int len;
        int type;
    };

    Tokenizer(const std::vector<TOKEN_DEF> &tokendefs)
        : m_tokendefs(tokendefs),
          m_re(CompileMultiRegex(tokendefs))
    {
    }

    TOKEN_REF match(std::smatch &matches, const std::string& str) const
    {
        return match(matches, str, 0);
    }

    TOKEN_REF match(std::smatch &matches, const std::string &str, size_t startpos) const
    {
        const std::string::const_iterator first = str.cbegin() + startpos;
        const std::string::const_iterator last = str.cend();

        // If we reached the end, there is nothing more to do
        if (first == last)
        {
            return TOKEN_REF{ static_cast<unsigned int>(startpos), 0, -1 };
        }

        time_t start_time = time(NULL);

        // Try to find a match
        if (!std::regex_search(first, last, matches, m_re))
        {
            throw "Failed to match\n";
        }

        search_time += time(NULL) - start_time;

        // Validate that it's at the start of the string
        if (matches.prefix().matched)
        {
            throw "Failed to match at current position!\n";
        }

        // We have a match, check which one it is
        for (size_t i = 1; i < matches.size(); i++)
        {
            if (matches[i].matched)
            {
                unsigned int len = static_cast<unsigned int>(matches.length(i));
                int type = m_tokendefs[i - 1].Type;
                return TOKEN_REF{ static_cast<unsigned int>(startpos), len, type };
            }
        }

        // We should never get here
        throw "Something went wrong!\n";
    }
};


class TokenList
{
    using TOKEN_REF = typename Tokenizer::TOKEN_REF;

    const Tokenizer& m_tokenizer;
    const std::string& m_text;
    std::vector<TOKEN_REF> m_tokens;

public:

    TokenList(const Tokenizer& tokenizer, const std::string& text)
        : m_tokenizer(tokenizer),
          m_text(text)
    {
        size_t startpos = 0;
        size_t len = m_text.size();
        std::smatch matches;

        // Rough heuristic to reduce reallocations: assume ~5 chars per token
        m_tokens.reserve(len / 5);

        while (startpos < len)
        {
            TOKEN_REF tref = m_tokenizer.match(matches, m_text, startpos);
            m_tokens.push_back(tref);
            startpos += tref.len;
        }
    }

    size_t size() const
    {
        return m_tokens.size();
    }

    Token operator[](size_t n) const
    {
        return Token(m_text, m_tokens[n].pos, m_tokens[n].len, m_tokens[n].type);
    }
};
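For reference, a minimal sketch of how the classes above fit together. The token types and regular expressions here are invented for illustration (the real definitions live elsewhere in the asmpp sources); note that each expression wraps its whole pattern in exactly one capture group, as CompileMultiRegex requires:

    #include <cstdio>
    // ... assumes the contents of tokenizer.hpp above ...

    time_t search_time = 0; // definition for the extern in tokenizer.hpp

    enum
    {
        TOK_IDENTIFIER, TOK_NUMBER, TOK_COMMA, TOK_PLUS,
        TOK_BRACKET_OPEN, TOK_BRACKET_CLOSE, TOK_WHITESPACE
    };

    int main()
    {
        // One capture group per definition; the index of the matching
        // sub-group in the combined regex selects the token type.
        std::vector<TOKEN_DEF> defs = {
            { TOK_IDENTIFIER,    "([a-z_][a-z0-9_]*)" },
            { TOK_NUMBER,        "([0-9]+)" },
            { TOK_COMMA,         "(,)" },
            { TOK_PLUS,          "(\\+)" },
            { TOK_BRACKET_OPEN,  "(\\[)" },
            { TOK_BRACKET_CLOSE, "(\\])" },
            { TOK_WHITESPACE,    "([ \\t]+)" },
        };

        Tokenizer tokenizer(defs); // defs must outlive the tokenizer
        std::string line = "mov eax, dword ptr [esp + 4]";
        TokenList tokens(tokenizer, line);

        for (size_t i = 0; i < tokens.size(); i++)
        {
            Token t = tokens[i];
            printf("%d: '%s'\n", t.type(), t.str().c_str());
        }
        return 0;
    }

Because the combined expression is compiled with std::regex_constants::icase, the same definitions also tokenize input like "MOV EAX, DWORD PTR [ESP + 4]".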