summaryrefslogtreecommitdiff
path: root/src/gen_tokens.cc
diff options
context:
space:
mode:
author Joel Klinghed <the_jk@spawned.biz> 2025-09-28 22:53:30 +0200
committer Joel Klinghed <the_jk@spawned.biz> 2025-09-29 09:39:17 +0200
commit 1e9e51dae1c01bab7562911b958c47528b8011c8 (patch)
tree 73e0c97545d1cf833a4205c8ced41c822b4bb348 /src/gen_tokens.cc
parent 0ca22c7d6d650c80906bd1217fccf32066cc2502 (diff)
java: Add tokens
Only parses Java 8 tokens for now.
Diffstat (limited to 'src/gen_tokens.cc')
-rw-r--r--src/gen_tokens.cc1062
1 files changed, 1062 insertions, 0 deletions
diff --git a/src/gen_tokens.cc b/src/gen_tokens.cc
new file mode 100644
index 0000000..ef0fce7
--- /dev/null
+++ b/src/gen_tokens.cc
@@ -0,0 +1,1062 @@
+#include "args.hh"
+#include "errors.hh"
+#include "grammar.hh"
+#include "io.hh"
+#include "prefix_tree.hh"
+
+#include <algorithm>
+#include <cassert>
+#include <charconv>
+#include <cstddef>
+#include <cstdint>
+#include <fstream>
+#include <iostream>
+#include <set>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+namespace {
+
// Character classes a grammar can refer to by name. The enumerators are listed
// in the same order as the entries of kCharacterClassNames.
enum class CharacterClass : uint8_t {
  kWhiteSpace,         // 0
  kLineTerminator,     // 1
  kInputCharacter,     // 2
  kJavaLetter,         // 3
  kJavaLetterOrDigit,  // 4
};
+
// Names of the character classes as they are spelled in a grammar. Symbols of
// character-class type index into this vector through their char_class value,
// and the names are also used to build the generated match<Name> functions.
std::vector<std::string> const kCharacterClassNames{
    "WhiteSpace",
    "LineTerminator",
    "InputCharacter",
    "JavaLetter",
    "JavaLetterOrDigit",
};
+
// Shape of the value a generated matcher function returns.
enum class ReturnType : uint8_t {
  kTokenAndSize = 0,     // std::optional<std::pair<Token, size_t>>
  kInternalAndSize = 1,  // std::optional<std::pair<Internal, size_t>>
  kSize = 2,             // std::optional<size_t>
};
+
// Turns a file name into an include-guard style macro name: ASCII lower-case
// letters are upper-cased, upper-case letters, digits and '_' are kept, and
// every other character is replaced by '_'.
std::string make_define(std::string_view filename) {
  std::string define(filename);
  for (char& ch : define) {
    bool const is_lower = ch >= 'a' && ch <= 'z';
    bool const is_upper = ch >= 'A' && ch <= 'Z';
    bool const is_digit = ch >= '0' && ch <= '9';
    if (is_lower) {
      ch = static_cast<char>(ch - ('a' - 'A'));
    } else if (!is_upper && !is_digit && ch != '_') {
      ch = '_';
    }
  }
  return define;
}
+
+class Generator {
+ public:
+ bool generate(std::string_view header_name, std::string_view source_name,
+ std::string const& ns, std::string const& unicode_version,
+ grammar::Grammar& grammar);
+
+ private:
+ void find_specific_elements(grammar::Element const& root);
+ void find_all_elements(grammar::Element const& root);
+
+ void check_need_last(grammar::Element const& element);
+ bool find_report_last(grammar::Element const& root,
+ grammar::Element const& match);
+
+ [[nodiscard]]
+ ReturnType get_return_type(grammar::Element const& element) const;
+ [[nodiscard]]
+ ReturnType get_return_type(uint8_t character_class) const;
+
+ void write_matcher(std::ostream& out, grammar::Symbol const& symbol,
+ ReturnType return_type, std::string_view str_arg);
+ bool write_matcher(std::ostream& out, grammar::Definition const& definition,
+ ReturnType return_type, std::string_view indent);
+ bool write_matcher(std::ostream& out, grammar::Element const& element,
+ ReturnType return_type);
+
+ std::set<std::string_view> above_specific_tokens_;
+ std::set<std::string_view> specific_tokens_;
+ std::set<grammar::Element const*> all_elements_;
+ std::set<std::string_view> copy_last_;
+ std::set<std::string_view> report_last_;
+};
+
+// Find the Elements that has at least one terminal or character class as symbol
+// These will be the different tokens the tokenizer can return
+void Generator::find_specific_elements(grammar::Element const& root) {
+ if (std::ranges::any_of(root.definitions, [](auto const& definition) {
+ return definition.symbols.size() > 1 ||
+ definition.symbols[0].type == grammar::Symbol::Type::kTerminal;
+ })) {
+ specific_tokens_.insert(root.name);
+ return;
+ }
+
+ above_specific_tokens_.insert(root.name);
+
+ for (auto const& definition : root.definitions) {
+ for (auto const& symbol : definition.symbols) {
+ switch (symbol.type) {
+ case grammar::Symbol::Type::kNonTerminal:
+ find_specific_elements(*symbol.element);
+ break;
+ case grammar::Symbol::Type::kCharacterClass:
+ specific_tokens_.insert(kCharacterClassNames[symbol.char_class]);
+ break;
+ case grammar::Symbol::Type::kTerminal:
+ std::unreachable();
+ }
+ }
+ }
+}
+
+// Find elements that have definitions that has ZeroOrMore matches with a final condition
+void Generator::check_need_last(grammar::Element const& element) {
+ for (auto const& definition : element.definitions) {
+ if (definition.symbols.size() < 2)
+ continue;
+ if (definition.symbols[definition.symbols.size() - 1].optional ==
+ grammar::Symbol::Optional::kRequired &&
+ definition.symbols[definition.symbols.size() - 1].type ==
+ grammar::Symbol::Type::kNonTerminal &&
+ (definition.symbols[definition.symbols.size() - 2].optional ==
+ grammar::Symbol::Optional::kZeroOrOne ||
+ definition.symbols[definition.symbols.size() - 2].optional ==
+ grammar::Symbol::Optional::kZeroOrMore) &&
+ definition.symbols[definition.symbols.size() - 2].type ==
+ grammar::Symbol::Type::kNonTerminal) {
+ if (!copy_last_.contains(definition.symbols[definition.symbols.size() - 2]
+ .element->name)) {
+ find_report_last(
+ *definition.symbols[definition.symbols.size() - 2].element,
+ *definition.symbols[definition.symbols.size() - 1].element);
+ }
+ }
+ }
+}
+
+// Find element that has match as a single definition, if so, return true and insert into
+// report_last_.
+bool Generator::find_report_last(grammar::Element const& root,
+ grammar::Element const& match) {
+ if (std::ranges::any_of(root.definitions, [&match](auto const& definition) {
+ return definition.symbols.size() == 1 &&
+ definition.symbols[0].optional ==
+ grammar::Symbol::Optional::kRequired &&
+ definition.symbols[0].type ==
+ grammar::Symbol::Type::kNonTerminal &&
+ definition.symbols[0].element == &match;
+ })) {
+ report_last_.insert(root.name);
+ return true;
+ }
+
+ for (auto const& definition : root.definitions) {
+ for (auto const& symbol : definition.symbols) {
+ if (symbol.type == grammar::Symbol::Type::kNonTerminal) {
+ if (!copy_last_.contains(symbol.element->name)) {
+ if (find_report_last(*symbol.element, match)) {
+ copy_last_.insert(root.name);
+ return true;
+ }
+ }
+ }
+ }
+ }
+
+ return false;
+}
+
+ReturnType Generator::get_return_type(grammar::Element const& element) const {
+ if (above_specific_tokens_.contains(element.name) ||
+ specific_tokens_.contains(element.name))
+ return ReturnType::kTokenAndSize;
+ if (copy_last_.contains(element.name) || report_last_.contains(element.name))
+ return ReturnType::kInternalAndSize;
+ return ReturnType::kSize;
+}
+
+ReturnType Generator::get_return_type(uint8_t character_class) const {
+ auto const& name = kCharacterClassNames[character_class];
+ if (above_specific_tokens_.contains(name) || specific_tokens_.contains(name))
+ return ReturnType::kTokenAndSize;
+ return ReturnType::kSize;
+}
+
+void Generator::find_all_elements(grammar::Element const& root) {
+ auto pair = all_elements_.insert(&root);
+ if (!pair.second)
+ return;
+
+ for (auto const& definition : root.definitions) {
+ for (auto const& symbol : definition.symbols) {
+ switch (symbol.type) {
+ case grammar::Symbol::Type::kNonTerminal:
+ find_all_elements(*symbol.element);
+ break;
+ case grammar::Symbol::Type::kCharacterClass:
+ case grammar::Symbol::Type::kTerminal:
+ break;
+ }
+ }
+ }
+}
+
// Writes the in-class declarations of the five character-class matchers to
// `out`. Each declaration is preceded by [[nodiscard]] and takes a single
// std::string_view parameter named str.
void declare_character_class_matchers(std::ostream& out) {
  // Everything in front of the parameter list, in emission order.
  static constexpr std::string_view kDeclarations[] = {
      "std::optional<size_t> matchLineTerminator",
      "static std::optional<size_t> matchInputCharacter",
      "std::optional<std::pair<Token, size_t>> matchWhiteSpace",
      "static std::optional<size_t> matchJavaLetter",
      "static std::optional<size_t> matchJavaLetterOrDigit",
  };

  for (std::string_view const declaration : kDeclarations) {
    out << "[[nodiscard]]\n" << declaration << "(std::string_view str);\n";
  }
}
+
// Writes the definitions of the five TokenMatcher character-class matchers to
// `out`. `unicode_version` is inserted verbatim as the u::Version enumerator
// used for general-category lookups in the generated code.
void write_character_class_matchers(std::ostream& out,
                                    std::string_view unicode_version) {
  // matchLineTerminator
  // The tokenizer normally reads one line at a time; only one construct
  // (traditional comment) needs to cross lines. So the generated code matches
  // a synthetic '\n', or records in line_end_reached_ that one was needed when
  // the input is exhausted.
  out << "[[nodiscard]]\n"
      << "std::optional<size_t> TokenMatcher::matchLineTerminator"
      << "(std::string_view str) {\n"
      << " if (str.empty()) {\n"
      << " line_end_reached_ = true;\n"
      << " return std::nullopt;\n"
      << " }\n"
      << " return str.front() == '\\n' ? std::make_optional<size_t>(1)"
      << " : std::nullopt;\n"
      << "}\n"
      << "\n";

  // matchInputCharacter
  // UnicodeInputCharacter but not a line terminator; the generated code only
  // tests for '\n'.
  out << "[[nodiscard]]\n"
      << "std::optional<size_t> TokenMatcher::matchInputCharacter"
      << "(std::string_view str) {\n"
      << " if (str.empty() || str.front() == '\\n')\n"
      << " return std::nullopt;\n"
      << " auto* const start = reinterpret_cast<uint8_t const*>(str.data());\n"
      << " auto* ptr = start;\n"
      << " u8::skip(ptr, start + str.size());\n"
      << " return ptr - start;\n"
      << "}\n"
      << "\n";

  // matchWhiteSpace
  // Accepts SP ("space"), HT ("horizontal tab"), FF ("form feed") and the
  // line terminator. matchLineTerminator is deliberately not called because
  // of its side effect on line_end_reached_.
  out << "[[nodiscard]]\n"
      << "std::optional<std::pair<Token, size_t>> TokenMatcher::matchWhiteSpace"
      << "(std::string_view str) {\n"
      << " if (!str.empty()) {\n"
      << " switch (str.front()) {\n"
      << " case ' ':\n"
      << " case '\\t':\n"
      << " case '\\f':\n"
      << " case '\\n':\n"
      << " return std::make_pair(Token::kWhiteSpace, 1);\n"
      << " default:\n"
      << " break;\n"
      << " }\n"
      << " }\n"
      << " return std::nullopt;\n"
      << "}\n"
      << "\n";

  // matchJavaLetter
  // A "Java letter" is a character for which
  // Character.isJavaIdentifierStart(int) returns true, i.e. one of:
  //   - isLetter(codePoint) returns true
  //   - getType(codePoint) returns LETTER_NUMBER
  //   - a currency symbol (such as '$')
  //   - a connecting punctuation character (such as '_')
  out << "[[nodiscard]]\n"
      << "std::optional<size_t> TokenMatcher::matchJavaLetter"
      << "(std::string_view str) {\n"
      << " auto* const start = reinterpret_cast<uint8_t const*>(str.data());\n"
      << " auto* ptr = start;\n"
      << " auto code = u8::read(ptr, ptr + str.size());\n"
      << " if (!code.has_value())\n"
      << " return std::nullopt;\n"
      << " switch (u::lookup_gc(code.value(), u::Version::" << unicode_version
      << ")) {\n"
      << " case u::GeneralCategory::LETTER_UPPERCASE:\n"
      << " case u::GeneralCategory::LETTER_LOWERCASE:\n"
      << " case u::GeneralCategory::LETTER_TITLECASE:\n"
      << " case u::GeneralCategory::LETTER_MODIFIER:\n"
      << " case u::GeneralCategory::LETTER_OTHER:\n"
      << " case u::GeneralCategory::NUMBER_LETTER:\n"
      << " case u::GeneralCategory::SYMBOL_CURRENCY:\n"
      << " case u::GeneralCategory::PUNCTUATION_CONNECTOR:\n"
      << " return ptr - start;\n"
      << " default:\n"
      << " return std::nullopt;\n"
      << " }\n"
      << "}\n"
      << "\n";

  // matchJavaLetterOrDigit
  // A "Java letter-or-digit" is a character for which
  // Character.isJavaIdentifierPart(int) returns true, i.e. any of:
  //   - a letter
  //   - a currency symbol (such as '$')
  //   - a connecting punctuation character (such as '_')
  //   - a digit
  //   - a numeric letter (such as a Roman numeral character)
  //   - a combining mark
  //   - a non-spacing mark
  //   - a character for which isIdentifierIgnorable returns true
  out << "[[nodiscard]]\n"
      << "std::optional<size_t> TokenMatcher::matchJavaLetterOrDigit"
      << "(std::string_view str) {\n"
      << " auto* const start = reinterpret_cast<uint8_t const*>(str.data());\n"
      << " auto* ptr = start;\n"
      << " auto code = u8::read(ptr, ptr + str.size());\n"
      << " if (!code.has_value())\n"
      << " return std::nullopt;\n"
      << " switch (u::lookup_gc(code.value(), u::Version::" << unicode_version
      << ")) {\n"
      << " case u::GeneralCategory::LETTER_UPPERCASE:\n"
      << " case u::GeneralCategory::LETTER_LOWERCASE:\n"
      << " case u::GeneralCategory::LETTER_TITLECASE:\n"
      << " case u::GeneralCategory::LETTER_MODIFIER:\n"
      << " case u::GeneralCategory::LETTER_OTHER:\n"
      << " case u::GeneralCategory::SYMBOL_CURRENCY:\n"
      << " case u::GeneralCategory::PUNCTUATION_CONNECTOR:\n"
      << " case u::GeneralCategory::NUMBER_DIGIT:\n"
      << " case u::GeneralCategory::NUMBER_LETTER:\n"
      << " case u::GeneralCategory::MARK_SPACING_COMBINING:\n"
      << " case u::GeneralCategory::MARK_NONSPACING:\n"
      << " case u::GeneralCategory::OTHER_FORMAT:\n"
      << " return ptr - start;\n"
      << " case u::GeneralCategory::OTHER_CONTROL:\n"
      << " if ((/* code.value() >= 0 && */ code.value() <= 8) ||\n"
      << " (code.value() >= 0xe && code.value() <= 0x1b) ||\n"
      << " (code.value() >= 0x7f && code.value() <= 0x9f))\n"
      // The accepted range includes U+0080..U+009F, which occupy two bytes in
      // UTF-8, so the consumed length has to come from the decoder position
      // rather than being a constant 1.
      << " return ptr - start;\n"
      << " break;\n"
      << " default:\n"
      << " break;\n"
      << " }\n"
      << " return std::nullopt;\n"
      << "}\n"
      << "\n";
}
+
// Writes `in` to `out` as a C++ string literal and returns `out`.
// '"' and '\\' are backslash-escaped; bytes below ' ' or with the high bit set
// are written as octal escapes. When an octal digit directly follows such an
// escape the literal is closed and reopened (`" "`) so the digit cannot be
// read as part of the escape.
std::ostream& quote(std::ostream& out, std::string_view in) {
  out << '"';
  bool after_octal_escape = false;
  for (char const ch : in) {
    bool const is_plain_escape = ch == '"' || ch == '\\';
    bool const needs_octal = ch < ' ' || (ch & 0x80) != 0;

    if (!is_plain_escape && needs_octal) {
      // At most three octal digits plus the terminating NUL.
      char digits[4];
      auto const result =
          std::to_chars(digits, digits + sizeof(digits), ch & 0xff, 8);
      *result.ptr = '\0';
      out << "\\" << digits;
      after_octal_escape = true;
      continue;
    }

    if (is_plain_escape) {
      out << '\\';
    } else if (after_octal_escape && ch >= '0' && ch <= '7') {
      out << "\" \"";
    }
    after_octal_escape = false;
    out << ch;
  }
  out << '"';
  return out;
}
+
+void match_return_type(std::ostream& out, ReturnType in_return_type,
+ std::string_view in_name, ReturnType out_return_type) {
+ switch (out_return_type) {
+ case ReturnType::kTokenAndSize:
+ switch (in_return_type) {
+ case ReturnType::kTokenAndSize:
+ break;
+ case ReturnType::kInternalAndSize:
+ out << ".transform([](auto pair) { return std::make_pair(Token::k"
+ << in_name << ", pair.second); }";
+ break;
+ case ReturnType::kSize:
+ out << ".transform([](auto size) { return std::make_pair(Token::k"
+ << in_name << ", size); })";
+ break;
+ }
+ break;
+ case ReturnType::kInternalAndSize:
+ switch (in_return_type) {
+ case ReturnType::kTokenAndSize:
+ out << ".transform([](auto pair) { return std::make_pair(Internal::k"
+ << in_name << ", pair.second); }";
+ break;
+ case ReturnType::kInternalAndSize:
+ break;
+ case ReturnType::kSize:
+ if (in_name.empty()) {
+ out << ".transform([](auto size) { return "
+ "std::make_pair(Internal::UNDEFINED, size); })";
+ } else {
+ out << ".transform([](auto size) { return "
+ "std::make_pair(Internal::k"
+ << in_name << ", size); })";
+ }
+ break;
+ }
+ break;
+ case ReturnType::kSize:
+ if (in_return_type != ReturnType::kSize) {
+ out << ".transform([](auto pair) { return pair.second; })";
+ }
+ break;
+ }
+}
+
+void Generator::write_matcher(std::ostream& out, grammar::Symbol const& symbol,
+ ReturnType return_type,
+ std::string_view str_arg) {
+ std::string_view in_name;
+ ReturnType in_return_type;
+
+ switch (symbol.type) {
+ case grammar::Symbol::Type::kTerminal:
+ in_return_type = ReturnType::kSize;
+ out << "(" << str_arg << ".starts_with(";
+ quote(out, symbol.value);
+ out << ") ? std::make_optional<size_t>(" << symbol.value.size()
+ << ") : " << "std::nullopt)";
+ break;
+ case grammar::Symbol::Type::kNonTerminal:
+ out << "match" << symbol.element->name << "(" << str_arg << ")";
+ in_return_type = get_return_type(*symbol.element);
+ in_name = symbol.element->name;
+ break;
+ case grammar::Symbol::Type::kCharacterClass:
+ out << "match" << kCharacterClassNames[symbol.char_class] << "("
+ << str_arg << ")";
+ in_return_type = get_return_type(symbol.char_class);
+ in_name = kCharacterClassNames[symbol.char_class];
+ break;
+ }
+
+ match_return_type(out, in_return_type, in_name, return_type);
+}
+
+bool Generator::write_matcher(std::ostream& out,
+ grammar::Definition const& definition,
+ ReturnType return_type, std::string_view indent) {
+ if (definition.symbols.size() == 1 &&
+ definition.symbols[0].optional == grammar::Symbol::Optional::kRequired) {
+ out << indent << "return ";
+ write_matcher(out, definition.symbols[0], return_type, "str");
+ out << ";\n";
+ return true;
+ }
+
+ std::string_view size_suffix;
+ switch (return_type) {
+ case ReturnType::kTokenAndSize:
+ case ReturnType::kInternalAndSize:
+ size_suffix = "->second";
+ break;
+ case ReturnType::kSize:
+ size_suffix = ".value()";
+ break;
+ }
+
+ if (definition.symbols.size() > 1 &&
+ definition.symbols[0].optional == grammar::Symbol::Optional::kRequired &&
+ definition.symbols[1].optional == grammar::Symbol::Optional::kExcluded) {
+ bool first = true;
+ for (auto const& symbol : definition.symbols) {
+ if (first) {
+ out << indent << "auto first_ret = ";
+ write_matcher(out, symbol, return_type, "str");
+ out << ";\n"
+ << indent << "if (!first_ret.has_value())\n"
+ << indent << " return first_ret;\n"
+ << indent << "std::optional<size_t> ret;\n"
+ << indent << "auto tmp = str.substr(0, first_ret" << size_suffix
+ << ");\n";
+ first = false;
+ } else {
+ if (symbol.optional != grammar::Symbol::Optional::kExcluded) {
+ std::cerr << "Non-excluded after at least one excluded\n";
+ return false;
+ }
+ out << indent << "ret = ";
+ write_matcher(out, symbol, ReturnType::kSize, "tmp");
+ out << ";\n"
+ << indent << "if (ret.has_value() && ret.value() == tmp.size())\n"
+ << indent << " return std::nullopt;\n";
+ }
+ }
+ out << indent << "return first_ret;\n";
+ return true;
+ }
+
+ if (std::ranges::all_of(definition.symbols, [](auto const& symbol) {
+ return symbol.optional == grammar::Symbol::Optional::kRequired;
+ })) {
+ out << indent << "size_t tot = 0;\n";
+ bool first = true;
+ for (auto const& symbol : definition.symbols) {
+ std::string indent2(indent);
+ if (first) {
+ out << indent2 << "auto ret = ";
+ write_matcher(out, symbol, return_type, "str");
+ out << ";\n";
+ first = false;
+ } else {
+ out << indent2 << "ret = ";
+ write_matcher(out, symbol, return_type, "str.substr(tot)");
+ out << ";\n";
+ }
+ out << indent2 << "if (!ret.has_value())\n"
+ << indent2 << " return ret;\n";
+ out << indent2 << "tot += ret" << size_suffix << ";\n";
+ }
+ switch (return_type) {
+ case ReturnType::kInternalAndSize:
+ // Return last internal
+ out << indent << "return std::make_pair(ret->first, tot);\n";
+ break;
+ case ReturnType::kTokenAndSize:
+ std::cerr << "Unable to return token and size\n";
+ return false;
+ case ReturnType::kSize:
+ out << indent << "return tot;\n";
+ break;
+ }
+ return true;
+ }
+
+ out << indent << "size_t tot = 0;\n";
+ bool last_internal = false;
+ switch (return_type) {
+ case ReturnType::kInternalAndSize:
+ last_internal = true;
+ out << indent << "std::optional<Internal> last_internal;\n";
+ break;
+ case ReturnType::kTokenAndSize:
+ case ReturnType::kSize:
+ break;
+ }
+ bool at_least_one_required = false;
+ bool first = true;
+ bool first_internal = true;
+ bool next_internal = false;
+ for (size_t i = 0; i < definition.symbols.size(); ++i) {
+ auto const& symbol = definition.symbols[i];
+ std::string indent2(indent);
+ bool have_internal = next_internal;
+ next_internal = false;
+ ReturnType symbol_return_type = return_type;
+
+ if (symbol.optional != grammar::Symbol::Optional::kRequired &&
+ i + 1 < definition.symbols.size() &&
+ definition.symbols[i + 1].optional ==
+ grammar::Symbol::Optional::kRequired &&
+ definition.symbols[i + 1].type == grammar::Symbol::Type::kNonTerminal) {
+ symbol_return_type = ReturnType::kInternalAndSize;
+ next_internal = true;
+ }
+
+ switch (symbol_return_type) {
+ case ReturnType::kTokenAndSize:
+ case ReturnType::kInternalAndSize:
+ size_suffix = "->second";
+ break;
+ case ReturnType::kSize:
+ size_suffix = ".value()";
+ break;
+ }
+
+ switch (symbol.optional) {
+ case grammar::Symbol::Optional::kRequired:
+ at_least_one_required = true;
+ break;
+ case grammar::Symbol::Optional::kZeroOrOne:
+ break;
+ case grammar::Symbol::Optional::kZeroOrMore:
+ if (first) {
+ switch (symbol_return_type) {
+ case ReturnType::kTokenAndSize:
+ out << indent << "std::optional<std::pair<Token, size_t>> ret;\n";
+ break;
+ case ReturnType::kInternalAndSize:
+ out << indent
+ << "std::optional<std::pair<Internal, size_t>> ret;\n";
+ break;
+ case ReturnType::kSize:
+ out << indent << "std::optional<size_t> ret;\n";
+ break;
+ }
+ first = false;
+ }
+ out << indent << "while (true) {\n";
+ indent2 += " ";
+ break;
+ case grammar::Symbol::Optional::kExcluded:
+ std::cerr << "Excluded mixed with conditional\n";
+ return false;
+ }
+ if (symbol_return_type == return_type) {
+ if (first) {
+ out << indent2 << "auto ret = ";
+ write_matcher(out, symbol, symbol_return_type, "str");
+ first = false;
+ } else {
+ out << indent2 << "ret = ";
+ write_matcher(out, symbol, symbol_return_type, "str.substr(tot)");
+ }
+ out << ";\n";
+ } else {
+ if (first_internal) {
+ out << indent2 << "auto ret_internal = ";
+ write_matcher(out, symbol, symbol_return_type,
+ first ? "str" : "str.substr(tot)");
+ first_internal = false;
+ } else {
+ out << indent2 << "ret_internal = ";
+ write_matcher(out, symbol, symbol_return_type, "str.substr(tot)");
+ }
+ out << ";\n";
+ if (first) {
+ out << indent2 << "auto ret = ret_internal";
+ first = false;
+ } else {
+ out << indent2 << "ret = ret_internal";
+ }
+ match_return_type(out, symbol_return_type, "", return_type);
+ out << ";\n";
+ }
+ switch (symbol.optional) {
+ case grammar::Symbol::Optional::kRequired:
+ out << indent2 << "if (!ret.has_value()) {\n";
+ if (have_internal &&
+ symbol.type == grammar::Symbol::Type::kNonTerminal) {
+ out << indent2
+ << " if (!ret_internal.has_value() || ret_internal->first != "
+ "Internal::k"
+ << symbol.element->name << ")\n"
+ << indent2 << " return ret;\n";
+ } else {
+ out << indent2 << " return ret;\n";
+ }
+ out << indent2 << "} else {\n"
+ << indent2 << " tot += ret" << size_suffix << ";\n";
+ if (last_internal)
+ out << indent2 << " last_internal = ret->first;\n";
+ out << indent2 << "}\n";
+ break;
+ case grammar::Symbol::Optional::kZeroOrOne:
+ if (symbol_return_type == ReturnType::kTokenAndSize) {
+ out << indent2 << "tot += ret.has_value() ? ret->second : 0;\n";
+ } else {
+ out << indent2 << "tot += ret.value_or(0);\n";
+ }
+ if (last_internal)
+ out << indent2 << "if (ret.has_value())\n"
+ << indent2 << " last_internal = ret->first;\n";
+ break;
+ case grammar::Symbol::Optional::kZeroOrMore:
+ out << indent2 << "if (!ret.has_value())\n"
+ << indent2 << " break;\n"
+ << indent2 << "tot += ret" << size_suffix << ";\n";
+ if (last_internal)
+ out << indent2 << "last_internal = ret->first;\n";
+ out << indent << "}\n";
+ break;
+ case grammar::Symbol::Optional::kExcluded:
+ assert(false);
+ break;
+ }
+ }
+ switch (return_type) {
+ case ReturnType::kInternalAndSize:
+ // Return last internal
+ if (at_least_one_required) {
+ out << indent << "return std::make_pair(last_internal.value(), tot);\n";
+ } else {
+ out << indent << "if (last_internal.has_value())\n"
+ << indent
+ << " return std::make_pair(last_internal.value(), tot);\n"
+ << indent << "return std::make_pair(Internal::UNDEFINED, tot);\n";
+ }
+ break;
+ case ReturnType::kTokenAndSize:
+ std::cerr << "Unable to return token and size\n";
+ return false;
+ case ReturnType::kSize:
+ out << indent << "return tot;\n";
+ break;
+ }
+ return true;
+}
+
+void declare_matcher(std::ostream& out, grammar::Element const& element,
+ ReturnType return_type) {
+ switch (return_type) {
+ case ReturnType::kSize:
+ out << "[[nodiscard]]\n"
+ << "std::optional<size_t> match" << element.name
+ << "(std::string_view str);\n";
+ break;
+ case ReturnType::kTokenAndSize:
+ out << "[[nodiscard]]\n"
+ << "std::optional<std::pair<Token, size_t>> match" << element.name
+ << "(std::string_view str);\n";
+ break;
+ case ReturnType::kInternalAndSize:
+ out << "[[nodiscard]]\n"
+ << "std::optional<std::pair<Internal, size_t>> match" << element.name
+ << "(std::string_view str);\n";
+ break;
+ }
+}
+
+bool Generator::write_matcher(std::ostream& out,
+ grammar::Element const& element,
+ ReturnType return_type) {
+ ReturnType sub_return_type = return_type;
+ bool make_token = false;
+
+ switch (return_type) {
+ case ReturnType::kSize:
+ out << "[[nodiscard]]\n"
+ << "std::optional<size_t> TokenMatcher::match" << element.name
+ << "(std::string_view str) {\n";
+ break;
+ case ReturnType::kTokenAndSize:
+ out << "[[nodiscard]]\n"
+ << "std::optional<std::pair<Token, size_t>> TokenMatcher::match"
+ << element.name << "(std::string_view str) {\n";
+
+ if (specific_tokens_.contains(element.name)) {
+ sub_return_type = ReturnType::kSize;
+ make_token = true;
+ }
+ break;
+ case ReturnType::kInternalAndSize:
+ out << "[[nodiscard]]\n"
+ << "std::optional<std::pair<TokenMatcher::Internal, size_t>> "
+ "TokenMatcher::match"
+ << element.name << "(std::string_view str) {\n";
+ break;
+ }
+
+ if (element.definitions.size() == 1) {
+ if (make_token) {
+ out << " auto ret = [this, str]() -> std::optional<size_t> {\n";
+ if (!write_matcher(out, element.definitions[0], sub_return_type,
+ " ")) {
+ std::cerr << "Error in " << element.name << "\n";
+ return false;
+ }
+ out << " }();\n"
+ << " return ret.transform([](auto size) {\n"
+ << " return std::make_pair(Token::k" << element.name
+ << ", size); });\n";
+ } else {
+ if (!write_matcher(out, element.definitions[0], sub_return_type, " ")) {
+ std::cerr << "Error in " << element.name << "\n";
+ return false;
+ }
+ }
+ } else if (std::ranges::all_of(
+ element.definitions, [](auto const& definition) {
+ return definition.symbols.size() == 1 &&
+ definition.symbols[0].optional ==
+ grammar::Symbol::Optional::kRequired &&
+ definition.symbols[0].type ==
+ grammar::Symbol::Type::kTerminal;
+ })) {
+ if (std::ranges::all_of(element.definitions, [](auto const& definition) {
+ return definition.symbols[0].value.size() == 1;
+ })) {
+ out << " if (!str.empty()) {\n"
+ << " switch (str.front()) {\n";
+ for (auto const& definition : element.definitions) {
+ out << " case '" << definition.symbols[0].value[0] << "':\n";
+ }
+ out << " return 1;\n"
+ << " default:\n"
+ << " break;\n"
+ << " }\n"
+ << " }\n"
+ << " return std::nullopt;\n";
+ } else {
+ auto builder = prefix_tree::builder();
+ for (auto const& definition : element.definitions) {
+ builder->add(definition.symbols[0].value);
+ }
+ auto tree = builder->build();
+ if (!tree.has_value()) {
+ std::cerr << "To large prefix tree\n";
+ return false;
+ }
+ out << " static const auto tree = ";
+ quote(out, tree.value()) << "sv;\n";
+ out << " return prefix_tree::lookup(tree, str)";
+ if (make_token) {
+ out << ".transform([](auto size) {\n"
+ << " return std::make_pair(Token::k" << element.name
+ << ", size); })";
+ }
+ out << ";\n";
+ }
+ } else {
+ bool first = true;
+ std::string_view ret_type;
+ switch (sub_return_type) {
+ case ReturnType::kTokenAndSize:
+ ret_type = "std::optional<std::pair<Token, size_t>>";
+ break;
+ case ReturnType::kInternalAndSize:
+ ret_type = "std::optional<std::pair<Internal, size_t>>";
+ break;
+ case ReturnType::kSize:
+ ret_type = "std::optional<size_t>";
+ break;
+ }
+ for (auto const& definition : element.definitions) {
+ if (first) {
+ first = false;
+ out << " auto tmp = [this, str]() -> " << ret_type << " {\n";
+ if (!write_matcher(out, definition, sub_return_type, " ")) {
+ std::cerr << "Error in " << element.name << "\n";
+ return false;
+ }
+ out << " }();\n";
+ out << " auto ret = tmp;\n";
+ } else {
+ out << " tmp = [this, str]() -> " << ret_type << " {\n";
+ if (!write_matcher(out, definition, sub_return_type, " ")) {
+ std::cerr << "Error in " << element.name << "\n";
+ return false;
+ }
+ out << " }();\n"
+ << " if (tmp.has_value()) {\n";
+ if (sub_return_type == ReturnType::kTokenAndSize) {
+ out << " if (!ret.has_value() || ret.value().second < "
+ "tmp.value().second) {\n";
+ } else {
+ out << " if (!ret.has_value() || ret.value() < tmp.value()) {\n";
+ }
+ out << " ret = tmp;\n"
+ << " }\n"
+ << " }\n";
+ }
+ }
+ if (make_token) {
+ out << " return ret.transform([](auto size) {\n"
+ << " return std::make_pair(Token::k" << element.name
+ << ", size); });\n";
+ } else {
+ out << " return ret;\n";
+ }
+ }
+
+ out << "}\n"
+ << "\n";
+ return true;
+}
+
+bool Generator::generate(std::string_view header_name,
+ std::string_view source_name, std::string const& ns,
+ std::string const& unicode_version,
+ grammar::Grammar& grammar) {
+ std::fstream header{std::string(header_name),
+ std::fstream::trunc | std::fstream::out};
+ std::fstream source{std::string(source_name),
+ std::fstream::trunc | std::fstream::out};
+
+ auto header_guard = make_define(header_name);
+
+ header << "#ifndef " << header_guard << "\n"
+ << "#define " << header_guard << "\n"
+ << "\n"
+ << "#include \"prefix_tree.hh\"\n"
+ << "\n"
+ << "#include <cstddef>\n"
+ << "#include <cstdint>\n"
+ << "#include <optional>\n"
+ << "#include <string_view>\n"
+ << "#include <utility>\n"
+ << "\n"
+ << "namespace " << ns << " {\n"
+ << "\n";
+
+ find_specific_elements(grammar.root());
+
+ find_all_elements(grammar.root());
+
+ for (auto& element : all_elements_) {
+ check_need_last(*element);
+ }
+
+ header << "enum class Token : "
+ << (specific_tokens_.size() < 256 ? "uint8_t" : "uint16_t") << " {\n";
+ for (auto const& token : specific_tokens_) {
+ header << " k" << token << ",\n";
+ }
+ header << "};\n";
+
+ header << "\n"
+ << "class TokenMatcher {\n"
+ << " public:\n"
+ << " [[nodiscard]]\n"
+ << " std::optional<std::pair<Token, size_t>>"
+ << " matchNext(std::string_view str);\n"
+ << "\n"
+ << " bool line_end_reached_{false};\n"
+ << "\n"
+ << " private:\n";
+
+ declare_character_class_matchers(header);
+
+ header << "\n"
+ << "enum class Internal : "
+ << (all_elements_.size() < 256 ? "uint8_t" : "uint16_t") << " {\n"
+ << " UNDEFINED,\n";
+ for (auto* element : all_elements_) {
+ header << " k" << element->name << ",\n";
+ }
+ header << "};\n"
+ << "\n";
+
+ for (auto* element : all_elements_) {
+ declare_matcher(header, *element, get_return_type(*element));
+ }
+
+ header << "\n"
+ << "};\n"
+ << "\n"
+ << "} // " << ns << "\n"
+ << "\n"
+ << "#endif // " << header_guard << "\n";
+
+ source << "#include \"" << header_name << "\"\n"
+ << "\n"
+ << "#include \"prefix_tree.hh\"\n"
+ << "#include \"u.hh\"\n"
+ << "#include \"u8.hh\"\n"
+ << "\n"
+ << "#include <cstddef>\n"
+ << "#include <optional>\n"
+ << "#include <string_view>\n"
+ << "#include <utility>\n"
+ << "\n"
+ << "using namespace std::literals::string_view_literals;\n"
+ << "\n"
+ << "// NOLINTBEGIN(readability-else-after-return, "
+ "readability-convert-member-functions-to-static)\n"
+ << "#pragma GCC diagnostic push\n"
+ << "#pragma GCC diagnostic ignored \"-Wunused-lambda-capture\"\n"
+ << "\n"
+ << "namespace " << ns << " {\n"
+ << "\n";
+
+ write_character_class_matchers(source, unicode_version);
+
+ source << "\n";
+
+ if (std::ranges::any_of(all_elements_, [this, &source](auto* element) {
+ auto sts = get_return_type(*element);
+ return !write_matcher(source, *element, sts);
+ })) {
+ return false;
+ }
+
+ source << "\n"
+ << "std::optional<std::pair<Token, size_t>>"
+ << "TokenMatcher::matchNext(std::string_view str) {\n"
+ << " return match" << grammar.root().name << "(str);\n"
+ << "}"
+ << "\n"
+ << "} // namespace " << ns << "\n"
+ << "\n"
+ << "// NOLINTEND(readability-else-after-return, "
+ "readability-convert-member-functions-to-static)\n"
+ << "#pragma GCC diagnostic pop\n"
+ << "\n";
+
+ return true;
+}
+
+} // namespace
+
+int main(int argc, char** argv) {
+ auto args = Args::create();
+ auto opt_help = args->option('h', "help", "display this text and exit");
+ auto opt_ns = args->option_argument('\0', "namespace", "ARG",
+ "Namespace for tokenizer");
+ auto opt_unicode =
+ args->option_argument('u', "unicode", "ARG", "Unicode version");
+ std::vector<std::string_view> arguments;
+ if (!args->run(argc, argv, &arguments)) {
+ args->print_error(std::cerr);
+ std::cerr << "Try `gen_tokens --help` for usage\n";
+ return 1;
+ }
+ if (opt_help->is_set()) {
+ std::cout << "Usage: `gen_tokens [OPTIONS...] tokens.grammar"
+ << " OUTPUT.hh OUTPUT.cc`\n"
+ << "Generates a tokenizer for grammar.\n"
+ << "\n";
+ args->print_help(std::cout);
+ return 0;
+ }
+ if (!opt_ns->is_set()) {
+ std::cerr << "No namespace given.\n"
+ << "Try `gen_tokens --help` for usage\n";
+ return 1;
+ }
+ if (!opt_unicode->is_set()) {
+ std::cerr << "No unicode version given.\n"
+ << "Try `gen_tokens --help` for usage\n";
+ return 1;
+ }
+ auto ns = opt_ns->argument();
+ auto unicode = opt_unicode->argument();
+ if (arguments.size() != 3) {
+ std::cerr << "Expecting three arguments. No more, no less.\n"
+ << "Try `gen_tokens --help` for usage\n";
+ return 1;
+ }
+
+ auto filename = std::string(arguments[0]);
+ auto reader = io::open(filename);
+ if (!reader.has_value()) {
+ std::cerr << "Unable to open " << filename << '\n';
+ return 1;
+ }
+ auto errors = src::file_errors(std::move(filename));
+ auto grammar =
+ grammar::load(std::move(reader.value()), kCharacterClassNames, *errors);
+ if (!grammar || errors->errors() > 0)
+ return 1;
+
+ Generator generator;
+ if (!generator.generate(arguments[1], arguments[2], ns, unicode, *grammar))
+ return 1;
+ return 0;
+}