diff options
Diffstat (limited to 'src/gen_tokens.cc')
| -rw-r--r-- | src/gen_tokens.cc | 1019 |
1 files changed, 1019 insertions, 0 deletions
diff --git a/src/gen_tokens.cc b/src/gen_tokens.cc new file mode 100644 index 0000000..2442c4f --- /dev/null +++ b/src/gen_tokens.cc @@ -0,0 +1,1019 @@ +#include "args.hh" +#include "errors.hh" +#include "grammar.hh" +#include "io.hh" +#include "prefix_tree.hh" + +#include <algorithm> +#include <cassert> +#include <charconv> +#include <cstddef> +#include <cstdint> +#include <fstream> +#include <iostream> +#include <set> +#include <string> +#include <string_view> +#include <utility> +#include <vector> + +namespace { + +enum class CharacterClass : uint8_t { + kWhiteSpace = 0, + kLineTerminator = 1, + kInputCharacter = 2, + kJavaLetter = 3, + kJavaLetterOrDigit = 4, +}; + +std::vector<std::string> const kCharacterClassNames( + {"WhiteSpace", "LineTerminator", "InputCharacter", "JavaLetter", + "JavaLetterOrDigit"}); + +enum class ReturnType : uint8_t { + kTokenAndSize, + kInternalAndSize, + kSize, +}; + +std::string make_define(std::string_view filename) { + std::string ret; + ret.reserve(filename.size()); + for (char c : filename) { + if ((c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_') { + ret.push_back(c); + } else if (c >= 'a' && c <= 'z') { + ret.push_back(static_cast<char>(c & ~0x20)); + } else { + ret.push_back('_'); + } + } + return ret; +} + +class Generator { + public: + bool generate(std::string_view header_name, std::string_view source_name, + std::string const& ns, std::string const& unicode_version, + grammar::Grammar& grammar); + + private: + void find_specific_elements(grammar::Element const& root); + void find_all_elements(grammar::Element const& root); + + void check_need_last(grammar::Element const& element); + bool find_report_last(grammar::Element const& root, + grammar::Element const& match); + + [[nodiscard]] + ReturnType get_return_type(grammar::Element const& element) const; + [[nodiscard]] + ReturnType get_return_type(uint8_t character_class) const; + + void write_matcher(std::ostream& out, grammar::Symbol const& symbol, + ReturnType return_type, std::string_view str_arg); + bool write_matcher(std::ostream& out, grammar::Definition const& definition, + ReturnType return_type, std::string_view indent); + bool write_matcher(std::ostream& out, grammar::Element const& element, + ReturnType return_type); + + std::set<std::string_view> above_specific_tokens_; + std::set<std::string_view> specific_tokens_; + std::set<grammar::Element const*> all_elements_; + std::set<std::string_view> copy_last_; + std::set<std::string_view> report_last_; +}; + +// Find the Elements that has at least one terminal or character class as symbol +// These will be the different tokens the tokenizer can return +void Generator::find_specific_elements(grammar::Element const& root) { + if (std::ranges::any_of(root.definitions, [](auto const& definition) { + return definition.symbols.size() > 1 || + definition.symbols[0].type == grammar::Symbol::Type::kTerminal; + })) { + specific_tokens_.insert(root.name); + return; + } + + above_specific_tokens_.insert(root.name); + + for (auto const& definition : root.definitions) { + for (auto const& symbol : definition.symbols) { + switch (symbol.type) { + case grammar::Symbol::Type::kNonTerminal: + find_specific_elements(*symbol.element); + break; + case grammar::Symbol::Type::kCharacterClass: + specific_tokens_.insert(kCharacterClassNames[symbol.char_class]); + break; + case grammar::Symbol::Type::kTerminal: + std::unreachable(); + } + } + } +} + +// Find elements that have definitions that has ZeroOrMore matches with a final condition +void Generator::check_need_last(grammar::Element const& element) { + for (auto const& definition : element.definitions) { + if (definition.symbols.size() < 2) + continue; + if (definition.symbols[definition.symbols.size() - 1].optional == + grammar::Symbol::Optional::kRequired && + definition.symbols[definition.symbols.size() - 1].type == + grammar::Symbol::Type::kNonTerminal && + (definition.symbols[definition.symbols.size() - 2].optional == + grammar::Symbol::Optional::kZeroOrOne || + definition.symbols[definition.symbols.size() - 2].optional == + grammar::Symbol::Optional::kZeroOrMore) && + definition.symbols[definition.symbols.size() - 2].type == + grammar::Symbol::Type::kNonTerminal) { + if (!copy_last_.contains(definition.symbols[definition.symbols.size() - 2] + .element->name)) { + find_report_last( + *definition.symbols[definition.symbols.size() - 2].element, + *definition.symbols[definition.symbols.size() - 1].element); + } + } + } +} + +// Find element that has match as a single definition, if so, return true and insert into +// report_last_. +bool Generator::find_report_last(grammar::Element const& root, + grammar::Element const& match) { + if (std::ranges::any_of(root.definitions, [&match](auto const& definition) { + return definition.symbols.size() == 1 && + definition.symbols[0].optional == + grammar::Symbol::Optional::kRequired && + definition.symbols[0].type == + grammar::Symbol::Type::kNonTerminal && + definition.symbols[0].element == &match; + })) { + report_last_.insert(root.name); + return true; + } + + for (auto const& definition : root.definitions) { + for (auto const& symbol : definition.symbols) { + if (symbol.type == grammar::Symbol::Type::kNonTerminal) { + if (!copy_last_.contains(symbol.element->name)) { + if (find_report_last(*symbol.element, match)) { + copy_last_.insert(root.name); + return true; + } + } + } + } + } + + return false; +} + +ReturnType Generator::get_return_type(grammar::Element const& element) const { + if (above_specific_tokens_.contains(element.name) || + specific_tokens_.contains(element.name)) + return ReturnType::kTokenAndSize; + if (copy_last_.contains(element.name) || report_last_.contains(element.name)) + return ReturnType::kInternalAndSize; + return ReturnType::kSize; +} + +ReturnType Generator::get_return_type(uint8_t character_class) const { + auto const& name = kCharacterClassNames[character_class]; + if (above_specific_tokens_.contains(name) || specific_tokens_.contains(name)) + return ReturnType::kTokenAndSize; + return ReturnType::kSize; +} + +void Generator::find_all_elements(grammar::Element const& root) { + auto pair = all_elements_.insert(&root); + if (!pair.second) + return; + + for (auto const& definition : root.definitions) { + for (auto const& symbol : definition.symbols) { + switch (symbol.type) { + case grammar::Symbol::Type::kNonTerminal: + find_all_elements(*symbol.element); + break; + case grammar::Symbol::Type::kCharacterClass: + case grammar::Symbol::Type::kTerminal: + break; + } + } + } +} + +void write_character_class_matchers(std::ostream& out, + std::string_view unicode_version) { + out << "[[nodiscard]]\n" + << "inline std::optional<size_t> matchLineTerminator" + << "(std::string_view str) {\n" + // Tokenizer reads one line at a time, + // so line terminator matches end of line. + << " return str.empty() ? std::make_optional<size_t>(0)" + << " : std::nullopt;\n" + << "}\n" + << "\n"; + + out << "[[nodiscard]]\n" + << "inline std::optional<size_t> matchInputCharacter" + << "(std::string_view str) {\n" + << " if (str.empty())\n" + << " return std::nullopt;\n" + // UnicodeInputCharacter but not CR or LF + << " auto* const start = reinterpret_cast<uint8_t const*>(str.data());\n" + << " auto* ptr = start;\n" + << " u8::skip(ptr, start + str.size());\n" + << " return ptr - start;\n" + << "}\n" + << "\n"; + + out << "[[nodiscard]]\n" + << "inline std::optional<std::pair<Token, size_t>> matchWhiteSpace" + << "(std::string_view str) {\n" + // LineTerminator + << " if (auto ret = matchLineTerminator(str); ret.has_value())\n" + << " return std::make_pair(Token::kWhiteSpace, ret.value());\n" + << " switch (str.front()) {\n" + // the ASCII SP character, also known as "space" + << " case ' ':\n" + // the ASCII HT character, also known as "horizontal tab" + << " case '\\t':\n" + // the ASCII FF character, also known as "form feed" + << " case '\\f':\n" + << " return std::make_pair(Token::kWhiteSpace, 1);\n" + << " default:\n" + << " return std::nullopt;\n" + << " }\n" + << "}\n" + << "\n"; + + out << "[[nodiscard]]\n" + << "inline std::optional<size_t> matchJavaLetter" + << "(std::string_view str) {\n" + << " auto* const start = reinterpret_cast<uint8_t const*>(str.data());\n" + << " auto* ptr = start;\n" + << " auto code = u8::read(ptr, ptr + str.size());\n" + << " if (!code.has_value())\n" + << " return std::nullopt;\n" + // any Unicode character that is a "Java letter" + // A "Java letter" is a character for which the method Character.isJavaIdentifierStart(int) returns true. + // A character may start a Java identifier if and only if one of the following conditions is true: + // isLetter(codePoint) returns true + // getType(codePoint) returns LETTER_NUMBER + // the referenced character is a currency symbol (such as '$') + // the referenced character is a connecting punctuation character (such as '_'). + << " switch (u::lookup_gc(code.value(), u::Version::" << unicode_version + << ")) {\n" + << " case u::GeneralCategory::LETTER_UPPERCASE:\n" + << " case u::GeneralCategory::LETTER_LOWERCASE:\n" + << " case u::GeneralCategory::LETTER_TITLECASE:\n" + << " case u::GeneralCategory::LETTER_MODIFIER:\n" + << " case u::GeneralCategory::LETTER_OTHER:\n" + << " case u::GeneralCategory::NUMBER_LETTER:\n" + << " case u::GeneralCategory::SYMBOL_CURRENCY:\n" + << " case u::GeneralCategory::PUNCTUATION_CONNECTOR:\n" + << " return ptr - start;\n" + << " default:\n" + << " return std::nullopt;\n" + << " }\n" + << "}\n" + << "\n"; + + out << "[[nodiscard]]\n" + << "inline std::optional<size_t> matchJavaLetterOrDigit" + << "(std::string_view str) {\n" + << " auto* const start = reinterpret_cast<uint8_t const*>(str.data());\n" + << " auto* ptr = start;\n" + << " auto code = u8::read(ptr, ptr + str.size());\n" + << " if (!code.has_value())\n" + << " return std::nullopt;\n" + // any Unicode character that is a "Java letter-or-digit" + // A "Java letter-or-digit" is a character for which the method Character.isJavaIdentifierPart(int) returns true. + // A character may be part of a Java identifier if any of the following conditions are true: + // it is a letter + // it is a currency symbol (such as '$') + // it is a connecting punctuation character (such as '_') + // it is a digit + // it is a numeric letter (such as a Roman numeral character) + // it is a combining mark + // it is a non-spacing mark + // isIdentifierIgnorable returns true for the character + << " switch (u::lookup_gc(code.value(), u::Version::" << unicode_version + << ")) {\n" + << " case u::GeneralCategory::LETTER_UPPERCASE:\n" + << " case u::GeneralCategory::LETTER_LOWERCASE:\n" + << " case u::GeneralCategory::LETTER_TITLECASE:\n" + << " case u::GeneralCategory::LETTER_MODIFIER:\n" + << " case u::GeneralCategory::LETTER_OTHER:\n" + << " case u::GeneralCategory::SYMBOL_CURRENCY:\n" + << " case u::GeneralCategory::PUNCTUATION_CONNECTOR:\n" + << " case u::GeneralCategory::NUMBER_DIGIT:\n" + << " case u::GeneralCategory::NUMBER_LETTER:\n" + << " case u::GeneralCategory::MARK_SPACING_COMBINING:\n" + << " case u::GeneralCategory::MARK_NONSPACING:\n" + << " case u::GeneralCategory::OTHER_FORMAT:\n" + << " return ptr - start;\n" + << " case u::GeneralCategory::OTHER_CONTROL:\n" + << " if ((/* code.value() >= 0 && */ code.value() <= 8) ||\n" + << " (code.value() >= 0xe && code.value() <= 0x1b) ||\n" + << " (code.value() >= 0x7f && code.value() <= 0x9f))\n" + << " return 1;\n" + << " break;\n" + << " default:\n" + << " break;\n" + << " }\n" + << " return std::nullopt;\n" + << "}\n" + << "\n"; +} + +std::ostream& quote(std::ostream& out, std::string_view in) { + out << '"'; + bool avoid_digit = false; + for (auto c : in) { + if (c == '"' || c == '\\') { + out << '\\'; + } else if (c < ' ' || (c & 0x80)) { + char tmp[4]; + std::to_chars(tmp, tmp + sizeof(tmp), c & 0xff, 8).ptr[0] = 0; + out << "\\" << tmp; + avoid_digit = true; + continue; + } else if (avoid_digit) { + if (c >= '0' && c <= '7') { + out << "\" \""; + } + } + avoid_digit = false; + out << c; + } + out << '"'; + return out; +} + +void match_return_type(std::ostream& out, ReturnType in_return_type, + std::string_view in_name, ReturnType out_return_type) { + switch (out_return_type) { + case ReturnType::kTokenAndSize: + switch (in_return_type) { + case ReturnType::kTokenAndSize: + break; + case ReturnType::kInternalAndSize: + out << ".transform([](auto pair) { return std::make_pair(Token::k" + << in_name << ", pair.second); }"; + break; + case ReturnType::kSize: + out << ".transform([](auto size) { return std::make_pair(Token::k" + << in_name << ", size); })"; + break; + } + break; + case ReturnType::kInternalAndSize: + switch (in_return_type) { + case ReturnType::kTokenAndSize: + out << ".transform([](auto pair) { return std::make_pair(Internal::k" + << in_name << ", pair.second); }"; + break; + case ReturnType::kInternalAndSize: + break; + case ReturnType::kSize: + if (in_name.empty()) { + out << ".transform([](auto size) { return " + "std::make_pair(Internal::UNDEFINED, size); })"; + } else { + out << ".transform([](auto size) { return " + "std::make_pair(Internal::k" + << in_name << ", size); })"; + } + break; + } + break; + case ReturnType::kSize: + if (in_return_type != ReturnType::kSize) { + out << ".transform([](auto pair) { return pair.second; })"; + } + break; + } +} + +void Generator::write_matcher(std::ostream& out, grammar::Symbol const& symbol, + ReturnType return_type, + std::string_view str_arg) { + std::string_view in_name; + ReturnType in_return_type; + + switch (symbol.type) { + case grammar::Symbol::Type::kTerminal: + in_return_type = ReturnType::kSize; + out << "(" << str_arg << ".starts_with("; + quote(out, symbol.value); + out << ") ? std::make_optional<size_t>(" << symbol.value.size() + << ") : " << "std::nullopt)"; + break; + case grammar::Symbol::Type::kNonTerminal: + out << "match" << symbol.element->name << "(" << str_arg << ")"; + in_return_type = get_return_type(*symbol.element); + in_name = symbol.element->name; + break; + case grammar::Symbol::Type::kCharacterClass: + out << "match" << kCharacterClassNames[symbol.char_class] << "(" + << str_arg << ")"; + in_return_type = get_return_type(symbol.char_class); + in_name = kCharacterClassNames[symbol.char_class]; + break; + } + + match_return_type(out, in_return_type, in_name, return_type); +} + +bool Generator::write_matcher(std::ostream& out, + grammar::Definition const& definition, + ReturnType return_type, std::string_view indent) { + if (definition.symbols.size() == 1 && + definition.symbols[0].optional == grammar::Symbol::Optional::kRequired) { + out << indent << "return "; + write_matcher(out, definition.symbols[0], return_type, "str"); + out << ";\n"; + return true; + } + + std::string_view size_suffix; + switch (return_type) { + case ReturnType::kTokenAndSize: + case ReturnType::kInternalAndSize: + size_suffix = "->second"; + break; + case ReturnType::kSize: + size_suffix = ".value()"; + break; + } + + if (definition.symbols.size() > 1 && + definition.symbols[0].optional == grammar::Symbol::Optional::kRequired && + definition.symbols[1].optional == grammar::Symbol::Optional::kExcluded) { + bool first = true; + for (auto const& symbol : definition.symbols) { + if (first) { + out << indent << "auto first_ret = "; + write_matcher(out, symbol, return_type, "str"); + out << ";\n" + << indent << "if (!first_ret.has_value())\n" + << indent << " return first_ret;\n" + << indent << "std::optional<size_t> ret;\n" + << indent << "auto tmp = str.substr(0, first_ret" << size_suffix + << ");\n"; + first = false; + } else { + if (symbol.optional != grammar::Symbol::Optional::kExcluded) { + std::cerr << "Non-excluded after at least one excluded\n"; + return false; + } + out << indent << "ret = "; + write_matcher(out, symbol, ReturnType::kSize, "tmp"); + out << ";\n" + << indent << "if (ret.has_value() && ret.value() == tmp.size())\n" + << indent << " return std::nullopt;\n"; + } + } + out << indent << "return first_ret;\n"; + return true; + } + + if (std::ranges::all_of(definition.symbols, [](auto const& symbol) { + return symbol.optional == grammar::Symbol::Optional::kRequired; + })) { + out << indent << "size_t tot = 0;\n"; + bool first = true; + for (auto const& symbol : definition.symbols) { + std::string indent2(indent); + if (first) { + out << indent2 << "auto ret = "; + write_matcher(out, symbol, return_type, "str"); + out << ";\n"; + first = false; + } else { + out << indent2 << "ret = "; + write_matcher(out, symbol, return_type, "str.substr(tot)"); + out << ";\n"; + } + out << indent2 << "if (!ret.has_value())\n" + << indent2 << " return ret;\n"; + out << indent2 << "tot += ret" << size_suffix << ";\n"; + } + switch (return_type) { + case ReturnType::kInternalAndSize: + // Return last internal + out << indent << "return std::make_pair(ret->first, tot);\n"; + break; + case ReturnType::kTokenAndSize: + std::cerr << "Unable to return token and size\n"; + return false; + case ReturnType::kSize: + out << indent << "return tot;\n"; + break; + } + return true; + } + + out << indent << "size_t tot = 0;\n"; + bool last_internal = false; + switch (return_type) { + case ReturnType::kInternalAndSize: + last_internal = true; + out << indent << "std::optional<Internal> last_internal;\n"; + break; + case ReturnType::kTokenAndSize: + case ReturnType::kSize: + break; + } + bool at_least_one_required = false; + bool first = true; + bool first_internal = true; + bool next_internal = false; + for (size_t i = 0; i < definition.symbols.size(); ++i) { + auto const& symbol = definition.symbols[i]; + std::string indent2(indent); + bool have_internal = next_internal; + next_internal = false; + ReturnType symbol_return_type = return_type; + + if (symbol.optional != grammar::Symbol::Optional::kRequired && + i + 1 < definition.symbols.size() && + definition.symbols[i + 1].optional == + grammar::Symbol::Optional::kRequired && + definition.symbols[i + 1].type == grammar::Symbol::Type::kNonTerminal) { + symbol_return_type = ReturnType::kInternalAndSize; + next_internal = true; + } + + switch (symbol_return_type) { + case ReturnType::kTokenAndSize: + case ReturnType::kInternalAndSize: + size_suffix = "->second"; + break; + case ReturnType::kSize: + size_suffix = ".value()"; + break; + } + + switch (symbol.optional) { + case grammar::Symbol::Optional::kRequired: + at_least_one_required = true; + break; + case grammar::Symbol::Optional::kZeroOrOne: + break; + case grammar::Symbol::Optional::kZeroOrMore: + if (first) { + switch (symbol_return_type) { + case ReturnType::kTokenAndSize: + out << indent << "std::optional<std::pair<Token, size_t>> ret;\n"; + break; + case ReturnType::kInternalAndSize: + out << indent + << "std::optional<std::pair<Internal, size_t>> ret;\n"; + break; + case ReturnType::kSize: + out << indent << "std::optional<size_t> ret;\n"; + break; + } + first = false; + } + out << indent << "while (true) {\n"; + indent2 += " "; + break; + case grammar::Symbol::Optional::kExcluded: + std::cerr << "Excluded mixed with conditional\n"; + return false; + } + if (symbol_return_type == return_type) { + if (first) { + out << indent2 << "auto ret = "; + write_matcher(out, symbol, symbol_return_type, "str"); + first = false; + } else { + out << indent2 << "ret = "; + write_matcher(out, symbol, symbol_return_type, "str.substr(tot)"); + } + out << ";\n"; + } else { + if (first_internal) { + out << indent2 << "auto ret_internal = "; + write_matcher(out, symbol, symbol_return_type, + first ? "str" : "str.substr(tot)"); + first_internal = false; + } else { + out << indent2 << "ret_internal = "; + write_matcher(out, symbol, symbol_return_type, "str.substr(tot)"); + } + out << ";\n"; + if (first) { + out << indent2 << "auto ret = ret_internal"; + first = false; + } else { + out << indent2 << "ret = ret_internal"; + } + match_return_type(out, symbol_return_type, "", return_type); + out << ";\n"; + } + switch (symbol.optional) { + case grammar::Symbol::Optional::kRequired: + out << indent2 << "if (!ret.has_value()) {\n"; + if (have_internal && + symbol.type == grammar::Symbol::Type::kNonTerminal) { + out << indent2 + << " if (!ret_internal.has_value() || ret_internal->first != " + "Internal::k" + << symbol.element->name << ")\n" + << indent2 << " return ret;\n"; + } else { + out << indent2 << " return ret;\n"; + } + out << indent2 << "} else {\n" + << indent2 << " tot += ret" << size_suffix << ";\n"; + if (last_internal) + out << indent2 << " last_internal = ret->first;\n"; + out << indent2 << "}\n"; + break; + case grammar::Symbol::Optional::kZeroOrOne: + if (symbol_return_type == ReturnType::kTokenAndSize) { + out << indent2 << "tot += ret.has_value() ? ret->second : 0;\n"; + } else { + out << indent2 << "tot += ret.value_or(0);\n"; + } + if (last_internal) + out << indent2 << "if (ret.has_value())\n" + << indent2 << " last_internal = ret->first;\n"; + break; + case grammar::Symbol::Optional::kZeroOrMore: + out << indent2 << "if (!ret.has_value())\n" + << indent2 << " break;\n" + << indent2 << "tot += ret" << size_suffix << ";\n"; + if (last_internal) + out << indent2 << "last_internal = ret->first;\n"; + out << indent << "}\n"; + break; + case grammar::Symbol::Optional::kExcluded: + assert(false); + break; + } + } + switch (return_type) { + case ReturnType::kInternalAndSize: + // Return last internal + if (at_least_one_required) { + out << indent << "return std::make_pair(last_internal.value(), tot);\n"; + } else { + out << indent << "if (last_internal.has_value())\n" + << indent + << " return std::make_pair(last_internal.value(), tot);\n" + << indent << "return std::make_pair(Internal::UNDEFINED, tot);\n"; + } + break; + case ReturnType::kTokenAndSize: + std::cerr << "Unable to return token and size\n"; + return false; + case ReturnType::kSize: + out << indent << "return tot;\n"; + break; + } + return true; +} + +void declare_matcher(std::ostream& out, grammar::Element const& element, + ReturnType return_type) { + switch (return_type) { + case ReturnType::kSize: + out << "[[nodiscard]]\n" + << "inline std::optional<size_t> match" << element.name + << "(std::string_view str);\n"; + break; + case ReturnType::kTokenAndSize: + out << "[[nodiscard]]\n" + << "inline std::optional<std::pair<Token, size_t>> match" + << element.name << "(std::string_view str);\n"; + break; + case ReturnType::kInternalAndSize: + out << "[[nodiscard]]\n" + << "inline std::optional<std::pair<Internal, size_t>> match" + << element.name << "(std::string_view str);\n"; + break; + } +} + +bool Generator::write_matcher(std::ostream& out, + grammar::Element const& element, + ReturnType return_type) { + ReturnType sub_return_type = return_type; + bool make_token = false; + + switch (return_type) { + case ReturnType::kSize: + out << "[[nodiscard]]\n" + << "inline std::optional<size_t> match" << element.name + << "(std::string_view str) {\n"; + break; + case ReturnType::kTokenAndSize: + out << "[[nodiscard]]\n" + << "inline std::optional<std::pair<Token, size_t>> match" + << element.name << "(std::string_view str) {\n"; + + if (specific_tokens_.contains(element.name)) { + sub_return_type = ReturnType::kSize; + make_token = true; + } + break; + case ReturnType::kInternalAndSize: + out << "[[nodiscard]]\n" + << "inline std::optional<std::pair<Internal, size_t>> match" + << element.name << "(std::string_view str) {\n"; + break; + } + + if (element.definitions.size() == 1) { + if (make_token) { + out << " auto ret = [str]() -> std::optional<size_t> {\n"; + if (!write_matcher(out, element.definitions[0], sub_return_type, + " ")) { + std::cerr << "Error in " << element.name << "\n"; + return false; + } + out << " }();\n" + << " return ret.transform([](auto size) {\n" + << " return std::make_pair(Token::k" << element.name + << ", size); });\n"; + } else { + if (!write_matcher(out, element.definitions[0], sub_return_type, " ")) { + std::cerr << "Error in " << element.name << "\n"; + return false; + } + } + } else if (std::ranges::all_of( + element.definitions, [](auto const& definition) { + return definition.symbols.size() == 1 && + definition.symbols[0].optional == + grammar::Symbol::Optional::kRequired && + definition.symbols[0].type == + grammar::Symbol::Type::kTerminal; + })) { + if (std::ranges::all_of(element.definitions, [](auto const& definition) { + return definition.symbols[0].value.size() == 1; + })) { + out << " if (!str.empty()) {\n" + << " switch (str.front()) {\n"; + for (auto const& definition : element.definitions) { + out << " case '" << definition.symbols[0].value[0] << "':\n"; + } + out << " return 1;\n" + << " default:\n" + << " break;\n" + << " }\n" + << " }\n" + << " return std::nullopt;\n"; + } else { + auto builder = prefix_tree::builder(); + for (auto const& definition : element.definitions) { + builder->add(definition.symbols[0].value); + } + auto tree = builder->build(); + if (!tree.has_value()) { + std::cerr << "To large prefix tree\n"; + return false; + } + out << " static const auto tree = "; + quote(out, tree.value()) << "sv;\n"; + out << " return prefix_tree::lookup(tree, str)"; + if (make_token) { + out << ".transform([](auto size) {\n" + << " return std::make_pair(Token::k" << element.name + << ", size); })"; + } + out << ";\n"; + } + } else { + bool first = true; + std::string_view ret_type; + switch (sub_return_type) { + case ReturnType::kTokenAndSize: + ret_type = "std::optional<std::pair<Token, size_t>>"; + break; + case ReturnType::kInternalAndSize: + ret_type = "std::optional<std::pair<Internal, size_t>>"; + break; + case ReturnType::kSize: + ret_type = "std::optional<size_t>"; + break; + } + for (auto const& definition : element.definitions) { + if (first) { + first = false; + out << " auto tmp = [str]() -> " << ret_type << " {\n"; + if (!write_matcher(out, definition, sub_return_type, " ")) { + std::cerr << "Error in " << element.name << "\n"; + return false; + } + out << " }();\n"; + out << " auto ret = tmp;\n"; + } else { + out << " tmp = [str]() -> " << ret_type << " {\n"; + if (!write_matcher(out, definition, sub_return_type, " ")) { + std::cerr << "Error in " << element.name << "\n"; + return false; + } + out << " }();\n" + << " if (tmp.has_value()) {\n"; + if (sub_return_type == ReturnType::kTokenAndSize) { + out << " if (!ret.has_value() || ret.value().second < " + "tmp.value().second) {\n"; + } else { + out << " if (!ret.has_value() || ret.value() < tmp.value()) {\n"; + } + out << " ret = tmp;\n" + << " }\n" + << " }\n"; + } + } + if (make_token) { + out << " return ret.transform([](auto size) {\n" + << " return std::make_pair(Token::k" << element.name + << ", size); });\n"; + } else { + out << " return ret;\n"; + } + } + + out << "}\n" + << "\n"; + return true; +} + +bool Generator::generate(std::string_view header_name, + std::string_view source_name, std::string const& ns, + std::string const& unicode_version, + grammar::Grammar& grammar) { + std::fstream header{std::string(header_name), + std::fstream::trunc | std::fstream::out}; + std::fstream source{std::string(source_name), + std::fstream::trunc | std::fstream::out}; + + auto header_guard = make_define(header_name); + + header << "#ifndef " << header_guard << "\n" + << "#define " << header_guard << "\n" + << "\n" + << "#include \"prefix_tree.hh\"\n" + << "\n" + << "#include <cstddef>\n" + << "#include <cstdint>\n" + << "#include <optional>\n" + << "#include <string_view>\n" + << "#include <utility>\n" + << "\n" + << "namespace " << ns << " {\n" + << "\n"; + + find_specific_elements(grammar.root()); + + find_all_elements(grammar.root()); + + for (auto& element : all_elements_) { + check_need_last(*element); + } + + header << "enum class Token : " + << (specific_tokens_.size() < 256 ? "uint8_t" : "uint16_t") << " {\n"; + for (auto const& token : specific_tokens_) { + header << " k" << token << ",\n"; + } + header << "};\n"; + + header << "\n" + << "[[nodiscard]]\n" + << "std::optional<std::pair<Token, size_t>>" + << " matchNext(std::string_view str);\n" + << "\n" + << "} // " << ns << "\n" + << "\n" + << "#endif // " << header_guard << "\n"; + + source << "#include \"" << header_name << "\"\n" + << "\n" + << "#include \"prefix_tree.hh\"\n" + << "#include \"u.hh\"\n" + << "#include \"u8.hh\"\n" + << "\n" + << "#include <cstddef>\n" + << "#include <optional>\n" + << "#include <string_view>\n" + << "#include <utility>\n" + << "\n" + << "using namespace std::literals::string_view_literals;\n" + << "\n" + << "// NOLINTBEGIN(readability-else-after-return)\n" + << "\n" + << "namespace " << ns << " {\n"; + + source << "namespace {\n" + << "\n"; + + write_character_class_matchers(source, unicode_version); + + source << "\n" + << "enum class Internal : " + << (all_elements_.size() < 256 ? "uint8_t" : "uint16_t") << " {\n" + << " UNDEFINED,\n"; + for (auto* element : all_elements_) { + source << " k" << element->name << ",\n"; + } + source << "};\n" + << "\n"; + + for (auto* element : all_elements_) { + declare_matcher(source, *element, get_return_type(*element)); + } + + if (std::ranges::any_of(all_elements_, [this, &source](auto* element) { + auto sts = get_return_type(*element); + return !write_matcher(source, *element, sts); + })) { + return false; + } + + source << "\n" + << "} // namespace\n"; + + source << "\n" + << "std::optional<std::pair<Token, size_t>>" + << "matchNext(std::string_view str) {\n" + << " return match" << grammar.root().name << "(str);\n" + << "}" + << "\n" + << "} // namespace " << ns << "\n" + << "\n" + << "// NOLINTEND(readability-else-after-return)\n" + << "\n"; + + return true; +} + +} // namespace + +int main(int argc, char** argv) { + auto args = Args::create(); + auto opt_help = args->option('h', "help", "display this text and exit"); + auto opt_ns = args->option_argument('\0', "namespace", "ARG", + "Namespace for tokenizer"); + auto opt_unicode = + args->option_argument('u', "unicode", "ARG", "Unicode version"); + std::vector<std::string_view> arguments; + if (!args->run(argc, argv, &arguments)) { + args->print_error(std::cerr); + std::cerr << "Try `gen_tokens --help` for usage\n"; + return 1; + } + if (opt_help->is_set()) { + std::cout << "Usage: `gen_tokens [OPTIONS...] tokens.grammar" + << " OUTPUT.hh OUTPUT.cc`\n" + << "Generates a tokenizer for grammar.\n" + << "\n"; + args->print_help(std::cout); + return 0; + } + if (!opt_ns->is_set()) { + std::cerr << "No namespace given.\n" + << "Try `gen_tokens --help` for usage\n"; + return 1; + } + if (!opt_unicode->is_set()) { + std::cerr << "No unicode version given.\n" + << "Try `gen_tokens --help` for usage\n"; + return 1; + } + auto ns = opt_ns->argument(); + auto unicode = opt_unicode->argument(); + if (arguments.size() != 3) { + std::cerr << "Expecting three arguments. No more, no less.\n" + << "Try `gen_tokens --help` for usage\n"; + return 1; + } + + auto filename = std::string(arguments[0]); + auto reader = io::open(filename); + if (!reader.has_value()) { + std::cerr << "Unable to open " << filename << '\n'; + return 1; + } + auto errors = src::file_errors(std::move(filename)); + auto grammar = + grammar::load(std::move(reader.value()), kCharacterClassNames, *errors); + if (!grammar || errors->errors() > 0) + return 1; + + Generator generator; + if (!generator.generate(arguments[1], arguments[2], ns, unicode, *grammar)) + return 1; + return 0; +} |
