[ASMPP] Implement asm preprocessor
This converts ML-style (MASM) assembly to GAS-compatible syntax.
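To give a sense of the rewriting involved (an illustrative fragment, not taken from this commit; the actual directive mappings are defined by the converter itself), ML/MASM input such as

    PUBLIC _MyFunc
    _MyFunc PROC
        mov eax, dword ptr [esp + 4]
        ret
    _MyFunc ENDP

corresponds to GAS-compatible output along the lines of

    .intel_syntax noprefix
    .global _MyFunc
    _MyFunc:
        mov eax, dword ptr [esp + 4]
        ret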
parent 7277e26944
commit 61cc62d1b2
10 changed files with 1700 additions and 18 deletions
sdk/tools/asmpp/tokenizer.hpp (new file, 249 lines)
@@ -0,0 +1,249 @@
/*
 * PROJECT:     ReactOS host tools
 * LICENSE:     MIT (https://spdx.org/licenses/MIT)
 * PURPOSE:     Tokenizer class implementation
 * COPYRIGHT:   Copyright 2021 Timo Kreuzer <timo.kreuzer@reactos.org>
 */

#include <string>
#include <vector>
#include <fstream>
#include <regex>
#include <ctime>

// Change this to '#if 1' for easier debugging
#if 0
#define throw __debugbreak(); throw
#endif

extern time_t search_time;

struct TOKEN_DEF
{
    int Type;
    std::string RegExString;
};

class Token
{
    const std::string& m_text;
    unsigned int m_pos;
    unsigned int m_len;
#if _DEBUG
    std::string m_dbgstr;
#endif
    int m_type;

public:

    Token(const std::string& text, size_t pos, size_t len, int type)
        : m_text(text),
          m_pos(static_cast<unsigned int>(pos)),
          m_len(static_cast<unsigned int>(len)),
          m_type(type)
    {
#if _DEBUG
        m_dbgstr = str();
#endif
    }

    std::string str() const
    {
        return m_text.substr(m_pos, m_len);
    }

    int type() const
    {
        return m_type;
    }
};

struct Tokenizer
{
    // Note: only references are stored, so the token definitions
    // passed to the constructor must outlive the Tokenizer.
    const std::vector<TOKEN_DEF> &m_tokendefs;
    const std::regex m_re;

    static
    unsigned int
    count_captures(const std::string& exp)
    {
        bool in_char_group = false;
        unsigned int count = 0;

        for (size_t i = 0; i < exp.size(); i++)
        {
            char c = exp[i];

            // Skip escaped characters
            if (c == '\\')
            {
                i++;
                continue;
            }

            if (in_char_group)
            {
                if (c == ']')
                {
                    in_char_group = false;
                }
                continue;
            }

            if (c == '[')
            {
                in_char_group = true;
                continue;
            }

            if (c == '(')
            {
                // Count only capturing groups, i.e. '(' not followed by '?'
                if (((i + 1) < exp.size()) && (exp[i + 1] != '?'))
                {
                    count++;
                }
            }
        }

        return count;
    }

    static
    std::regex
    CompileMultiRegex(const std::vector<TOKEN_DEF> &tokendefs)
    {
        std::string combinedString;

        if (tokendefs.size() == 0)
        {
            return std::regex();
        }

        // Validate all token definitions
        for (const auto& def : tokendefs)
        {
            // Each definition must have exactly one capture group
            unsigned int count = count_captures(def.RegExString);
            if (count != 1)
            {
                throw "invalid count!\n";
            }
        }

        // Combine all expressions into one (one capture group for each)
        combinedString = "(?:" + tokendefs[0].RegExString + ")";
        for (size_t i = 1; i < tokendefs.size(); i++)
        {
            combinedString += "|(?:" + tokendefs[i].RegExString + ")";
        }

        return std::regex(combinedString, std::regex_constants::icase);
    }

public:

    struct TOKEN_REF
    {
        unsigned int pos;
        unsigned int len;
        int type;
    };

    Tokenizer(const std::vector<TOKEN_DEF> &tokendefs)
        : m_tokendefs(tokendefs),
          m_re(CompileMultiRegex(tokendefs))
    {
    }

    TOKEN_REF match(std::smatch &matches, const std::string& str) const
    {
        return match(matches, str, 0);
    }

    TOKEN_REF match(std::smatch &matches, const std::string &str, size_t startpos) const
    {
        const std::string::const_iterator first = str.cbegin() + startpos;
        const std::string::const_iterator last = str.cend();

        // If we reached the end, there is nothing more to do
        if (first == last)
        {
            return TOKEN_REF{ static_cast<unsigned int>(startpos), 0, -1 };
        }

        time_t start_time = time(NULL);

        // Try to find a match
        if (!std::regex_search(first, last, matches, m_re))
        {
            throw "Failed to match\n";
        }

        search_time += time(NULL) - start_time;

        // Validate that it's at the start of the string
        if (matches.prefix().matched)
        {
            throw "Failed to match at current position!\n";
        }

        // We have a match, check which one it is
        for (size_t i = 1; i < matches.size(); i++)
        {
            if (matches[i].matched)
            {
                unsigned int len = static_cast<unsigned int>(matches.length(i));
                int type = m_tokendefs[i - 1].Type;
                return TOKEN_REF{ static_cast<unsigned int>(startpos), len, type };
            }
        }

        // We should never get here
        throw "Something went wrong!\n";
    }
};


class TokenList
{
    using TOKEN_REF = typename Tokenizer::TOKEN_REF;

    const Tokenizer& m_tokenizer;
    const std::string& m_text;
    std::vector<TOKEN_REF> m_tokens;

public:

    TokenList(const Tokenizer& tokenizer, const std::string& text)
        : m_tokenizer(tokenizer),
          m_text(text)
    {
        size_t startpos = 0;
        size_t len = m_text.size();
        std::smatch matches;

        // Rough heuristic to reduce reallocations: assume ~5 chars per token
        m_tokens.reserve(len / 5);

        while (startpos < len)
        {
            TOKEN_REF tref = m_tokenizer.match(matches, m_text, startpos);
            m_tokens.push_back(tref);
            startpos += tref.len;
        }
    }

    size_t size() const
    {
        return m_tokens.size();
    }

    Token operator[](size_t n) const
    {
        return Token(m_text, m_tokens[n].pos, m_tokens[n].len, m_tokens[n].type);
    }
};
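For reference, a minimal sketch of how the classes above fit together. The token types and regular expressions here are invented for illustration (the real definitions live elsewhere in the asmpp sources); note that each expression wraps its whole pattern in exactly one capture group, as CompileMultiRegex requires:

    #include <cstdio>
    // ... assumes the contents of tokenizer.hpp above ...

    time_t search_time = 0; // definition for the extern in tokenizer.hpp

    enum
    {
        TOK_IDENTIFIER, TOK_NUMBER, TOK_COMMA, TOK_PLUS,
        TOK_BRACKET_OPEN, TOK_BRACKET_CLOSE, TOK_WHITESPACE
    };

    int main()
    {
        // One capture group per definition; the index of the matching
        // sub-group in the combined regex selects the token type.
        std::vector<TOKEN_DEF> defs = {
            { TOK_IDENTIFIER,    "([a-z_][a-z0-9_]*)" },
            { TOK_NUMBER,        "([0-9]+)" },
            { TOK_COMMA,         "(,)" },
            { TOK_PLUS,          "(\\+)" },
            { TOK_BRACKET_OPEN,  "(\\[)" },
            { TOK_BRACKET_CLOSE, "(\\])" },
            { TOK_WHITESPACE,    "([ \\t]+)" },
        };

        Tokenizer tokenizer(defs); // defs must outlive the tokenizer
        std::string line = "mov eax, dword ptr [esp + 4]";
        TokenList tokens(tokenizer, line);

        for (size_t i = 0; i < tokens.size(); i++)
        {
            Token t = tokens[i];
            printf("%d: '%s'\n", t.type(), t.str().c_str());
        }
        return 0;
    }

Because the combined expression is compiled with std::regex_constants::icase, the same definitions also tokenize input like "MOV EAX, DWORD PTR [ESP + 4]".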