#include "args.hh"
#include "errors.hh"
#include "grammar.hh"
#include "io.hh"
#include "prefix_tree.hh"

// NOTE(review): the names of the standard headers were unreadable in the
// reviewed text. The list below covers the standard names this file uses;
// confirm it against the original include list.
#include <algorithm>
#include <cassert>
#include <charconv>
#include <cstddef>
#include <cstdint>
#include <fstream>
#include <iostream>
#include <ostream>
#include <set>
#include <string>
#include <string_view>
#include <utility>
#include <vector>

namespace {

// Character classes a grammar symbol can refer to. The numeric value is the
// index of the class name in kCharacterClassNames.
enum class CharacterClass : uint8_t {
  kWhiteSpace = 0,
  kLineTerminator = 1,
  kInputCharacter = 2,
  kJavaLetter = 3,
  kJavaLetterOrDigit = 4,
};

// Names of the character classes, indexed by Symbol::char_class.
// NOTE(review): element type restored as std::string (the names are inserted
// into string sets and handed to grammar::load) — confirm.
std::vector<std::string> const kCharacterClassNames(
    {"WhiteSpace", "LineTerminator", "InputCharacter", "JavaLetter",
     "JavaLetterOrDigit"});

// Shape of the value a generated matcher returns.
enum class ReturnType : uint8_t {
  kTokenAndSize,     // pair of Token and matched size
  kInternalAndSize,  // pair of Internal and matched size
  kSize,             // matched size only
};

// Turns a file name into an identifier usable as an include guard:
// lower-case ASCII letters are upper-cased, [A-Z0-9_] are kept and every
// other character becomes '_'.
std::string make_define(std::string_view filename) {
  std::string guard;
  guard.reserve(filename.size());
  for (char const c : filename) {
    bool const keep =
        (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_';
    if (keep) {
      guard.push_back(c);
    } else if (c >= 'a' && c <= 'z') {
      // Clearing bit 0x20 maps an ASCII lower-case letter to upper case.
      guard.push_back(static_cast<char>(c & ~0x20));
    } else {
      guard.push_back('_');
    }
  }
  return guard;
}

// Walks a grammar and writes the tokenizer header and source for it.
class Generator {
 public:
  // Writes the header to header_name and the source to source_name.
  // Returns false if generation failed.
  bool generate(std::string_view header_name, std::string_view source_name,
                std::string const& ns, std::string const& unicode_version,
                grammar::Grammar& grammar);

 private:
  void find_specific_elements(grammar::Element const& root);
  void find_all_elements(grammar::Element const& root);
  void check_need_last(grammar::Element const& element);
  bool find_report_last(grammar::Element const& root,
                        grammar::Element const& match);

  [[nodiscard]]
  ReturnType get_return_type(grammar::Element const& element) const;
  [[nodiscard]]
  ReturnType get_return_type(uint8_t character_class) const;

  void write_matcher(std::ostream& out, grammar::Symbol const& symbol,
                     ReturnType return_type, std::string_view str_arg);
  bool write_matcher(std::ostream& out, grammar::Definition const& definition,
                     ReturnType return_type, std::string_view indent);
  bool write_matcher(std::ostream& out, grammar::Element const& element,
                     ReturnType return_type);

  // NOTE(review): the element types of these sets were unreadable in the
  // reviewed text; restored from how the members are used — confirm.
  // Elements visited on the way down to the specific tokens.
  std::set<std::string> above_specific_tokens_;
  // Names that become enumerators of the generated Token enum.
  std::set<std::string> specific_tokens_;
  // Every element reachable from the grammar root.
  std::set<grammar::Element const*> all_elements_;
  // Elements that pass on the Internal value reported further down.
  std::set<std::string> copy_last_;
  // Elements that report an Internal value themselves.
  std::set<std::string> report_last_;
};

// Find the elements that have at least one definition which either has more
// than one symbol or starts with a terminal. These, together with character
// classes referenced directly from the elements above them, are the tokens
// the tokenizer can return.
void Generator::find_specific_elements(grammar::Element const& root) {
  bool const is_specific =
      std::ranges::any_of(root.definitions, [](auto const& definition) {
        // An empty definition has no first symbol to inspect.
        return definition.symbols.size() > 1 ||
               (!definition.symbols.empty() &&
                definition.symbols[0].type ==
                    grammar::Symbol::Type::kTerminal);
      });
  if (is_specific) {
    specific_tokens_.insert(root.name);
    return;
  }

  above_specific_tokens_.insert(root.name);
  for (auto const& definition : root.definitions) {
    for (auto const& symbol : definition.symbols) {
      switch (symbol.type) {
        case grammar::Symbol::Type::kNonTerminal:
          find_specific_elements(*symbol.element);
          break;
        case grammar::Symbol::Type::kCharacterClass:
          specific_tokens_.insert(kCharacterClassNames[symbol.char_class]);
          break;
        case grammar::Symbol::Type::kTerminal:
          // A definition with a terminal makes the element specific, which
          // was handled above.
          std::unreachable();
      }
    }
  }
}

// Find elements with a definition that ends in an optional (ZeroOrOne or
// ZeroOrMore) non-terminal followed by a required non-terminal.
void Generator::check_need_last(grammar::Element const& element) {
  for (auto const& definition : element.definitions) {
    auto const count = definition.symbols.size();
    if (count < 2) continue;

    auto const& last = definition.symbols[count - 1];
    auto const& before_last = definition.symbols[count - 2];

    bool const last_is_required_element =
        last.optional == grammar::Symbol::Optional::kRequired &&
        last.type == grammar::Symbol::Type::kNonTerminal;
    bool const before_is_optional_element =
        (before_last.optional == grammar::Symbol::Optional::kZeroOrOne ||
         before_last.optional == grammar::Symbol::Optional::kZeroOrMore) &&
        before_last.type == grammar::Symbol::Type::kNonTerminal;
    if (!last_is_required_element || !before_is_optional_element) continue;

    // Elements already in copy_last_ are not searched again.
    if (!copy_last_.contains(before_last.element->name)) {
      find_report_last(*before_last.element, *last.element);
    }
  }
}

// Find an element that has match as a single-symbol definition; if found,
// return true and insert it into report_last_.
bool Generator::find_report_last(grammar::Element const& root, grammar::Element const& match) { if (std::ranges::any_of(root.definitions, [&match](auto const& definition) { return definition.symbols.size() == 1 && definition.symbols[0].optional == grammar::Symbol::Optional::kRequired && definition.symbols[0].type == grammar::Symbol::Type::kNonTerminal && definition.symbols[0].element == &match; })) { report_last_.insert(root.name); return true; } for (auto const& definition : root.definitions) { for (auto const& symbol : definition.symbols) { if (symbol.type == grammar::Symbol::Type::kNonTerminal) { if (!copy_last_.contains(symbol.element->name)) { if (find_report_last(*symbol.element, match)) { copy_last_.insert(root.name); return true; } } } } } return false; } ReturnType Generator::get_return_type(grammar::Element const& element) const { if (above_specific_tokens_.contains(element.name) || specific_tokens_.contains(element.name)) return ReturnType::kTokenAndSize; if (copy_last_.contains(element.name) || report_last_.contains(element.name)) return ReturnType::kInternalAndSize; return ReturnType::kSize; } ReturnType Generator::get_return_type(uint8_t character_class) const { auto const& name = kCharacterClassNames[character_class]; if (above_specific_tokens_.contains(name) || specific_tokens_.contains(name)) return ReturnType::kTokenAndSize; return ReturnType::kSize; } void Generator::find_all_elements(grammar::Element const& root) { auto pair = all_elements_.insert(&root); if (!pair.second) return; for (auto const& definition : root.definitions) { for (auto const& symbol : definition.symbols) { switch (symbol.type) { case grammar::Symbol::Type::kNonTerminal: find_all_elements(*symbol.element); break; case grammar::Symbol::Type::kCharacterClass: case grammar::Symbol::Type::kTerminal: break; } } } } void write_character_class_matchers(std::ostream& out, std::string_view unicode_version) { out << "[[nodiscard]]\n" << "inline std::optional matchLineTerminator" << 
"(std::string_view str) {\n" // Tokenizer reads one line at a time, // so line terminator matches end of line. << " return str.empty() ? std::make_optional(0)" << " : std::nullopt;\n" << "}\n" << "\n"; out << "[[nodiscard]]\n" << "inline std::optional matchInputCharacter" << "(std::string_view str) {\n" << " if (str.empty())\n" << " return std::nullopt;\n" // UnicodeInputCharacter but not CR or LF << " auto* const start = reinterpret_cast(str.data());\n" << " auto* ptr = start;\n" << " u8::skip(ptr, start + str.size());\n" << " return ptr - start;\n" << "}\n" << "\n"; out << "[[nodiscard]]\n" << "inline std::optional> matchWhiteSpace" << "(std::string_view str) {\n" // LineTerminator << " if (auto ret = matchLineTerminator(str); ret.has_value())\n" << " return std::make_pair(Token::kWhiteSpace, ret.value());\n" << " switch (str.front()) {\n" // the ASCII SP character, also known as "space" << " case ' ':\n" // the ASCII HT character, also known as "horizontal tab" << " case '\\t':\n" // the ASCII FF character, also known as "form feed" << " case '\\f':\n" << " return std::make_pair(Token::kWhiteSpace, 1);\n" << " default:\n" << " return std::nullopt;\n" << " }\n" << "}\n" << "\n"; out << "[[nodiscard]]\n" << "inline std::optional matchJavaLetter" << "(std::string_view str) {\n" << " auto* const start = reinterpret_cast(str.data());\n" << " auto* ptr = start;\n" << " auto code = u8::read(ptr, ptr + str.size());\n" << " if (!code.has_value())\n" << " return std::nullopt;\n" // any Unicode character that is a "Java letter" // A "Java letter" is a character for which the method Character.isJavaIdentifierStart(int) returns true. // A character may start a Java identifier if and only if one of the following conditions is true: // isLetter(codePoint) returns true // getType(codePoint) returns LETTER_NUMBER // the referenced character is a currency symbol (such as '$') // the referenced character is a connecting punctuation character (such as '_'). 
<< " switch (u::lookup_gc(code.value(), u::Version::" << unicode_version << ")) {\n" << " case u::GeneralCategory::LETTER_UPPERCASE:\n" << " case u::GeneralCategory::LETTER_LOWERCASE:\n" << " case u::GeneralCategory::LETTER_TITLECASE:\n" << " case u::GeneralCategory::LETTER_MODIFIER:\n" << " case u::GeneralCategory::LETTER_OTHER:\n" << " case u::GeneralCategory::NUMBER_LETTER:\n" << " case u::GeneralCategory::SYMBOL_CURRENCY:\n" << " case u::GeneralCategory::PUNCTUATION_CONNECTOR:\n" << " return ptr - start;\n" << " default:\n" << " return std::nullopt;\n" << " }\n" << "}\n" << "\n"; out << "[[nodiscard]]\n" << "inline std::optional matchJavaLetterOrDigit" << "(std::string_view str) {\n" << " auto* const start = reinterpret_cast(str.data());\n" << " auto* ptr = start;\n" << " auto code = u8::read(ptr, ptr + str.size());\n" << " if (!code.has_value())\n" << " return std::nullopt;\n" // any Unicode character that is a "Java letter-or-digit" // A "Java letter-or-digit" is a character for which the method Character.isJavaIdentifierPart(int) returns true. 
// A character may be part of a Java identifier if any of the following conditions are true: // it is a letter // it is a currency symbol (such as '$') // it is a connecting punctuation character (such as '_') // it is a digit // it is a numeric letter (such as a Roman numeral character) // it is a combining mark // it is a non-spacing mark // isIdentifierIgnorable returns true for the character << " switch (u::lookup_gc(code.value(), u::Version::" << unicode_version << ")) {\n" << " case u::GeneralCategory::LETTER_UPPERCASE:\n" << " case u::GeneralCategory::LETTER_LOWERCASE:\n" << " case u::GeneralCategory::LETTER_TITLECASE:\n" << " case u::GeneralCategory::LETTER_MODIFIER:\n" << " case u::GeneralCategory::LETTER_OTHER:\n" << " case u::GeneralCategory::SYMBOL_CURRENCY:\n" << " case u::GeneralCategory::PUNCTUATION_CONNECTOR:\n" << " case u::GeneralCategory::NUMBER_DIGIT:\n" << " case u::GeneralCategory::NUMBER_LETTER:\n" << " case u::GeneralCategory::MARK_SPACING_COMBINING:\n" << " case u::GeneralCategory::MARK_NONSPACING:\n" << " case u::GeneralCategory::OTHER_FORMAT:\n" << " return ptr - start;\n" << " case u::GeneralCategory::OTHER_CONTROL:\n" << " if ((/* code.value() >= 0 && */ code.value() <= 8) ||\n" << " (code.value() >= 0xe && code.value() <= 0x1b) ||\n" << " (code.value() >= 0x7f && code.value() <= 0x9f))\n" << " return 1;\n" << " break;\n" << " default:\n" << " break;\n" << " }\n" << " return std::nullopt;\n" << "}\n" << "\n"; } std::ostream& quote(std::ostream& out, std::string_view in) { out << '"'; bool avoid_digit = false; for (auto c : in) { if (c == '"' || c == '\\') { out << '\\'; } else if (c < ' ' || (c & 0x80)) { char tmp[4]; std::to_chars(tmp, tmp + sizeof(tmp), c & 0xff, 8).ptr[0] = 0; out << "\\" << tmp; avoid_digit = true; continue; } else if (avoid_digit) { if (c >= '0' && c <= '7') { out << "\" \""; } } avoid_digit = false; out << c; } out << '"'; return out; } void match_return_type(std::ostream& out, ReturnType in_return_type, 
std::string_view in_name, ReturnType out_return_type) { switch (out_return_type) { case ReturnType::kTokenAndSize: switch (in_return_type) { case ReturnType::kTokenAndSize: break; case ReturnType::kInternalAndSize: out << ".transform([](auto pair) { return std::make_pair(Token::k" << in_name << ", pair.second); }"; break; case ReturnType::kSize: out << ".transform([](auto size) { return std::make_pair(Token::k" << in_name << ", size); })"; break; } break; case ReturnType::kInternalAndSize: switch (in_return_type) { case ReturnType::kTokenAndSize: out << ".transform([](auto pair) { return std::make_pair(Internal::k" << in_name << ", pair.second); }"; break; case ReturnType::kInternalAndSize: break; case ReturnType::kSize: if (in_name.empty()) { out << ".transform([](auto size) { return " "std::make_pair(Internal::UNDEFINED, size); })"; } else { out << ".transform([](auto size) { return " "std::make_pair(Internal::k" << in_name << ", size); })"; } break; } break; case ReturnType::kSize: if (in_return_type != ReturnType::kSize) { out << ".transform([](auto pair) { return pair.second; })"; } break; } } void Generator::write_matcher(std::ostream& out, grammar::Symbol const& symbol, ReturnType return_type, std::string_view str_arg) { std::string_view in_name; ReturnType in_return_type; switch (symbol.type) { case grammar::Symbol::Type::kTerminal: in_return_type = ReturnType::kSize; out << "(" << str_arg << ".starts_with("; quote(out, symbol.value); out << ") ? 
std::make_optional(" << symbol.value.size() << ") : " << "std::nullopt)"; break; case grammar::Symbol::Type::kNonTerminal: out << "match" << symbol.element->name << "(" << str_arg << ")"; in_return_type = get_return_type(*symbol.element); in_name = symbol.element->name; break; case grammar::Symbol::Type::kCharacterClass: out << "match" << kCharacterClassNames[symbol.char_class] << "(" << str_arg << ")"; in_return_type = get_return_type(symbol.char_class); in_name = kCharacterClassNames[symbol.char_class]; break; } match_return_type(out, in_return_type, in_name, return_type); } bool Generator::write_matcher(std::ostream& out, grammar::Definition const& definition, ReturnType return_type, std::string_view indent) { if (definition.symbols.size() == 1 && definition.symbols[0].optional == grammar::Symbol::Optional::kRequired) { out << indent << "return "; write_matcher(out, definition.symbols[0], return_type, "str"); out << ";\n"; return true; } std::string_view size_suffix; switch (return_type) { case ReturnType::kTokenAndSize: case ReturnType::kInternalAndSize: size_suffix = "->second"; break; case ReturnType::kSize: size_suffix = ".value()"; break; } if (definition.symbols.size() > 1 && definition.symbols[0].optional == grammar::Symbol::Optional::kRequired && definition.symbols[1].optional == grammar::Symbol::Optional::kExcluded) { bool first = true; for (auto const& symbol : definition.symbols) { if (first) { out << indent << "auto first_ret = "; write_matcher(out, symbol, return_type, "str"); out << ";\n" << indent << "if (!first_ret.has_value())\n" << indent << " return first_ret;\n" << indent << "std::optional ret;\n" << indent << "auto tmp = str.substr(0, first_ret" << size_suffix << ");\n"; first = false; } else { if (symbol.optional != grammar::Symbol::Optional::kExcluded) { std::cerr << "Non-excluded after at least one excluded\n"; return false; } out << indent << "ret = "; write_matcher(out, symbol, ReturnType::kSize, "tmp"); out << ";\n" << indent << "if 
(ret.has_value() && ret.value() == tmp.size())\n" << indent << " return std::nullopt;\n"; } } out << indent << "return first_ret;\n"; return true; } if (std::ranges::all_of(definition.symbols, [](auto const& symbol) { return symbol.optional == grammar::Symbol::Optional::kRequired; })) { out << indent << "size_t tot = 0;\n"; bool first = true; for (auto const& symbol : definition.symbols) { std::string indent2(indent); if (first) { out << indent2 << "auto ret = "; write_matcher(out, symbol, return_type, "str"); out << ";\n"; first = false; } else { out << indent2 << "ret = "; write_matcher(out, symbol, return_type, "str.substr(tot)"); out << ";\n"; } out << indent2 << "if (!ret.has_value())\n" << indent2 << " return ret;\n"; out << indent2 << "tot += ret" << size_suffix << ";\n"; } switch (return_type) { case ReturnType::kInternalAndSize: // Return last internal out << indent << "return std::make_pair(ret->first, tot);\n"; break; case ReturnType::kTokenAndSize: std::cerr << "Unable to return token and size\n"; return false; case ReturnType::kSize: out << indent << "return tot;\n"; break; } return true; } out << indent << "size_t tot = 0;\n"; bool last_internal = false; switch (return_type) { case ReturnType::kInternalAndSize: last_internal = true; out << indent << "std::optional last_internal;\n"; break; case ReturnType::kTokenAndSize: case ReturnType::kSize: break; } bool at_least_one_required = false; bool first = true; bool first_internal = true; bool next_internal = false; for (size_t i = 0; i < definition.symbols.size(); ++i) { auto const& symbol = definition.symbols[i]; std::string indent2(indent); bool have_internal = next_internal; next_internal = false; ReturnType symbol_return_type = return_type; if (symbol.optional != grammar::Symbol::Optional::kRequired && i + 1 < definition.symbols.size() && definition.symbols[i + 1].optional == grammar::Symbol::Optional::kRequired && definition.symbols[i + 1].type == grammar::Symbol::Type::kNonTerminal) { 
symbol_return_type = ReturnType::kInternalAndSize; next_internal = true; } switch (symbol_return_type) { case ReturnType::kTokenAndSize: case ReturnType::kInternalAndSize: size_suffix = "->second"; break; case ReturnType::kSize: size_suffix = ".value()"; break; } switch (symbol.optional) { case grammar::Symbol::Optional::kRequired: at_least_one_required = true; break; case grammar::Symbol::Optional::kZeroOrOne: break; case grammar::Symbol::Optional::kZeroOrMore: if (first) { switch (symbol_return_type) { case ReturnType::kTokenAndSize: out << indent << "std::optional> ret;\n"; break; case ReturnType::kInternalAndSize: out << indent << "std::optional> ret;\n"; break; case ReturnType::kSize: out << indent << "std::optional ret;\n"; break; } first = false; } out << indent << "while (true) {\n"; indent2 += " "; break; case grammar::Symbol::Optional::kExcluded: std::cerr << "Excluded mixed with conditional\n"; return false; } if (symbol_return_type == return_type) { if (first) { out << indent2 << "auto ret = "; write_matcher(out, symbol, symbol_return_type, "str"); first = false; } else { out << indent2 << "ret = "; write_matcher(out, symbol, symbol_return_type, "str.substr(tot)"); } out << ";\n"; } else { if (first_internal) { out << indent2 << "auto ret_internal = "; write_matcher(out, symbol, symbol_return_type, first ? 
"str" : "str.substr(tot)"); first_internal = false; } else { out << indent2 << "ret_internal = "; write_matcher(out, symbol, symbol_return_type, "str.substr(tot)"); } out << ";\n"; if (first) { out << indent2 << "auto ret = ret_internal"; first = false; } else { out << indent2 << "ret = ret_internal"; } match_return_type(out, symbol_return_type, "", return_type); out << ";\n"; } switch (symbol.optional) { case grammar::Symbol::Optional::kRequired: out << indent2 << "if (!ret.has_value()) {\n"; if (have_internal && symbol.type == grammar::Symbol::Type::kNonTerminal) { out << indent2 << " if (!ret_internal.has_value() || ret_internal->first != " "Internal::k" << symbol.element->name << ")\n" << indent2 << " return ret;\n"; } else { out << indent2 << " return ret;\n"; } out << indent2 << "} else {\n" << indent2 << " tot += ret" << size_suffix << ";\n"; if (last_internal) out << indent2 << " last_internal = ret->first;\n"; out << indent2 << "}\n"; break; case grammar::Symbol::Optional::kZeroOrOne: if (symbol_return_type == ReturnType::kTokenAndSize) { out << indent2 << "tot += ret.has_value() ? 
ret->second : 0;\n"; } else { out << indent2 << "tot += ret.value_or(0);\n"; } if (last_internal) out << indent2 << "if (ret.has_value())\n" << indent2 << " last_internal = ret->first;\n"; break; case grammar::Symbol::Optional::kZeroOrMore: out << indent2 << "if (!ret.has_value())\n" << indent2 << " break;\n" << indent2 << "tot += ret" << size_suffix << ";\n"; if (last_internal) out << indent2 << "last_internal = ret->first;\n"; out << indent << "}\n"; break; case grammar::Symbol::Optional::kExcluded: assert(false); break; } } switch (return_type) { case ReturnType::kInternalAndSize: // Return last internal if (at_least_one_required) { out << indent << "return std::make_pair(last_internal.value(), tot);\n"; } else { out << indent << "if (last_internal.has_value())\n" << indent << " return std::make_pair(last_internal.value(), tot);\n" << indent << "return std::make_pair(Internal::UNDEFINED, tot);\n"; } break; case ReturnType::kTokenAndSize: std::cerr << "Unable to return token and size\n"; return false; case ReturnType::kSize: out << indent << "return tot;\n"; break; } return true; } void declare_matcher(std::ostream& out, grammar::Element const& element, ReturnType return_type) { switch (return_type) { case ReturnType::kSize: out << "[[nodiscard]]\n" << "inline std::optional match" << element.name << "(std::string_view str);\n"; break; case ReturnType::kTokenAndSize: out << "[[nodiscard]]\n" << "inline std::optional> match" << element.name << "(std::string_view str);\n"; break; case ReturnType::kInternalAndSize: out << "[[nodiscard]]\n" << "inline std::optional> match" << element.name << "(std::string_view str);\n"; break; } } bool Generator::write_matcher(std::ostream& out, grammar::Element const& element, ReturnType return_type) { ReturnType sub_return_type = return_type; bool make_token = false; switch (return_type) { case ReturnType::kSize: out << "[[nodiscard]]\n" << "inline std::optional match" << element.name << "(std::string_view str) {\n"; break; case 
ReturnType::kTokenAndSize: out << "[[nodiscard]]\n" << "inline std::optional> match" << element.name << "(std::string_view str) {\n"; if (specific_tokens_.contains(element.name)) { sub_return_type = ReturnType::kSize; make_token = true; } break; case ReturnType::kInternalAndSize: out << "[[nodiscard]]\n" << "inline std::optional> match" << element.name << "(std::string_view str) {\n"; break; } if (element.definitions.size() == 1) { if (make_token) { out << " auto ret = [str]() -> std::optional {\n"; if (!write_matcher(out, element.definitions[0], sub_return_type, " ")) { std::cerr << "Error in " << element.name << "\n"; return false; } out << " }();\n" << " return ret.transform([](auto size) {\n" << " return std::make_pair(Token::k" << element.name << ", size); });\n"; } else { if (!write_matcher(out, element.definitions[0], sub_return_type, " ")) { std::cerr << "Error in " << element.name << "\n"; return false; } } } else if (std::ranges::all_of( element.definitions, [](auto const& definition) { return definition.symbols.size() == 1 && definition.symbols[0].optional == grammar::Symbol::Optional::kRequired && definition.symbols[0].type == grammar::Symbol::Type::kTerminal; })) { if (std::ranges::all_of(element.definitions, [](auto const& definition) { return definition.symbols[0].value.size() == 1; })) { out << " if (!str.empty()) {\n" << " switch (str.front()) {\n"; for (auto const& definition : element.definitions) { out << " case '" << definition.symbols[0].value[0] << "':\n"; } out << " return 1;\n" << " default:\n" << " break;\n" << " }\n" << " }\n" << " return std::nullopt;\n"; } else { auto builder = prefix_tree::builder(); for (auto const& definition : element.definitions) { builder->add(definition.symbols[0].value); } auto tree = builder->build(); if (!tree.has_value()) { std::cerr << "To large prefix tree\n"; return false; } out << " static const auto tree = "; quote(out, tree.value()) << "sv;\n"; out << " return prefix_tree::lookup(tree, str)"; if 
(make_token) { out << ".transform([](auto size) {\n" << " return std::make_pair(Token::k" << element.name << ", size); })"; } out << ";\n"; } } else { bool first = true; std::string_view ret_type; switch (sub_return_type) { case ReturnType::kTokenAndSize: ret_type = "std::optional>"; break; case ReturnType::kInternalAndSize: ret_type = "std::optional>"; break; case ReturnType::kSize: ret_type = "std::optional"; break; } for (auto const& definition : element.definitions) { if (first) { first = false; out << " auto tmp = [str]() -> " << ret_type << " {\n"; if (!write_matcher(out, definition, sub_return_type, " ")) { std::cerr << "Error in " << element.name << "\n"; return false; } out << " }();\n"; out << " auto ret = tmp;\n"; } else { out << " tmp = [str]() -> " << ret_type << " {\n"; if (!write_matcher(out, definition, sub_return_type, " ")) { std::cerr << "Error in " << element.name << "\n"; return false; } out << " }();\n" << " if (tmp.has_value()) {\n"; if (sub_return_type == ReturnType::kTokenAndSize) { out << " if (!ret.has_value() || ret.value().second < " "tmp.value().second) {\n"; } else { out << " if (!ret.has_value() || ret.value() < tmp.value()) {\n"; } out << " ret = tmp;\n" << " }\n" << " }\n"; } } if (make_token) { out << " return ret.transform([](auto size) {\n" << " return std::make_pair(Token::k" << element.name << ", size); });\n"; } else { out << " return ret;\n"; } } out << "}\n" << "\n"; return true; } bool Generator::generate(std::string_view header_name, std::string_view source_name, std::string const& ns, std::string const& unicode_version, grammar::Grammar& grammar) { std::fstream header{std::string(header_name), std::fstream::trunc | std::fstream::out}; std::fstream source{std::string(source_name), std::fstream::trunc | std::fstream::out}; auto header_guard = make_define(header_name); header << "#ifndef " << header_guard << "\n" << "#define " << header_guard << "\n" << "\n" << "#include \"prefix_tree.hh\"\n" << "\n" << "#include \n" << 
"#include \n" << "#include \n" << "#include \n" << "#include \n" << "\n" << "namespace " << ns << " {\n" << "\n"; find_specific_elements(grammar.root()); find_all_elements(grammar.root()); for (auto& element : all_elements_) { check_need_last(*element); } header << "enum class Token : " << (specific_tokens_.size() < 256 ? "uint8_t" : "uint16_t") << " {\n"; for (auto const& token : specific_tokens_) { header << " k" << token << ",\n"; } header << "};\n"; header << "\n" << "[[nodiscard]]\n" << "std::optional>" << " matchNext(std::string_view str);\n" << "\n" << "} // " << ns << "\n" << "\n" << "#endif // " << header_guard << "\n"; source << "#include \"" << header_name << "\"\n" << "\n" << "#include \"prefix_tree.hh\"\n" << "#include \"u.hh\"\n" << "#include \"u8.hh\"\n" << "\n" << "#include \n" << "#include \n" << "#include \n" << "#include \n" << "\n" << "using namespace std::literals::string_view_literals;\n" << "\n" << "// NOLINTBEGIN(readability-else-after-return)\n" << "\n" << "namespace " << ns << " {\n"; source << "namespace {\n" << "\n"; write_character_class_matchers(source, unicode_version); source << "\n" << "enum class Internal : " << (all_elements_.size() < 256 ? 
"uint8_t" : "uint16_t") << " {\n" << " UNDEFINED,\n"; for (auto* element : all_elements_) { source << " k" << element->name << ",\n"; } source << "};\n" << "\n"; for (auto* element : all_elements_) { declare_matcher(source, *element, get_return_type(*element)); } if (std::ranges::any_of(all_elements_, [this, &source](auto* element) { auto sts = get_return_type(*element); return !write_matcher(source, *element, sts); })) { return false; } source << "\n" << "} // namespace\n"; source << "\n" << "std::optional>" << "matchNext(std::string_view str) {\n" << " return match" << grammar.root().name << "(str);\n" << "}" << "\n" << "} // namespace " << ns << "\n" << "\n" << "// NOLINTEND(readability-else-after-return)\n" << "\n"; return true; } } // namespace int main(int argc, char** argv) { auto args = Args::create(); auto opt_help = args->option('h', "help", "display this text and exit"); auto opt_ns = args->option_argument('\0', "namespace", "ARG", "Namespace for tokenizer"); auto opt_unicode = args->option_argument('u', "unicode", "ARG", "Unicode version"); std::vector arguments; if (!args->run(argc, argv, &arguments)) { args->print_error(std::cerr); std::cerr << "Try `gen_tokens --help` for usage\n"; return 1; } if (opt_help->is_set()) { std::cout << "Usage: `gen_tokens [OPTIONS...] tokens.grammar" << " OUTPUT.hh OUTPUT.cc`\n" << "Generates a tokenizer for grammar.\n" << "\n"; args->print_help(std::cout); return 0; } if (!opt_ns->is_set()) { std::cerr << "No namespace given.\n" << "Try `gen_tokens --help` for usage\n"; return 1; } if (!opt_unicode->is_set()) { std::cerr << "No unicode version given.\n" << "Try `gen_tokens --help` for usage\n"; return 1; } auto ns = opt_ns->argument(); auto unicode = opt_unicode->argument(); if (arguments.size() != 3) { std::cerr << "Expecting three arguments. 
No more, no less.\n" << "Try `gen_tokens --help` for usage\n"; return 1; } auto filename = std::string(arguments[0]); auto reader = io::open(filename); if (!reader.has_value()) { std::cerr << "Unable to open " << filename << '\n'; return 1; } auto errors = src::file_errors(std::move(filename)); auto grammar = grammar::load(std::move(reader.value()), kCharacterClassNames, *errors); if (!grammar || errors->errors() > 0) return 1; Generator generator; if (!generator.generate(arguments[1], arguments[2], ns, unicode, *grammar)) return 1; return 0; }