From d196d51e07f50f3510c43ad375c5559b58860023 Mon Sep 17 00:00:00 2001 From: Joel Klinghed Date: Mon, 29 Sep 2025 09:39:49 +0200 Subject: java: Add tokens support for Java 21 There are some new keywords; I opted to modify the java-8 grammar to use the new names, even if they are not going to match anything. This makes the tokenizer easier to write. --- src/gen_tokens.cc | 78 ++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 68 insertions(+), 10 deletions(-) (limited to 'src/gen_tokens.cc') diff --git a/src/gen_tokens.cc b/src/gen_tokens.cc index ef0fce7..cc8c06d 100644 --- a/src/gen_tokens.cc +++ b/src/gen_tokens.cc @@ -88,7 +88,8 @@ class Generator { // Find the Elements that has at least one terminal or character class as symbol // These will be the different tokens the tokenizer can return void Generator::find_specific_elements(grammar::Element const& root) { - if (std::ranges::any_of(root.definitions, [](auto const& definition) { + if (root.definitions.empty() || + std::ranges::any_of(root.definitions, [](auto const& definition) { return definition.symbols.size() > 1 || definition.symbols[0].type == grammar::Symbol::Type::kTerminal; })) { @@ -233,7 +234,7 @@ void write_character_class_matchers(std::ostream& out, << "std::optional TokenMatcher::matchLineTerminator" << "(std::string_view str) {\n" // Tokenizer normally reads one line at a time, there is only - // one construct (traditional comment) that needs it. + // a handful of constructs (traditional comment, textblock) that need it. // So match synthetic '\n' or report that it was needed if we are at // end of string. 
<< " if (str.empty()) {\n" @@ -361,6 +362,22 @@ void write_character_class_matchers(std::ostream& out, } std::ostream& quote(std::ostream& out, std::string_view in) { + int use_raw_string = 1; + for (auto c : in) { + if (c == '"' || c == '\\' || c == '\n') { + use_raw_string = 2; + } else if (c < ' ' || (c & 0x80)) { + use_raw_string = 0; + break; + } + } + if (use_raw_string == 2) { + out << "R\"("; + out << in; + out << ")\""; + return out; + } + out << '"'; bool avoid_digit = false; for (auto c : in) { @@ -569,6 +586,7 @@ bool Generator::write_matcher(std::ostream& out, bool have_internal = next_internal; next_internal = false; ReturnType symbol_return_type = return_type; + bool zero_or_more_with_terminal = false; if (symbol.optional != grammar::Symbol::Optional::kRequired && i + 1 < definition.symbols.size() && @@ -613,6 +631,33 @@ bool Generator::write_matcher(std::ostream& out, } out << indent << "while (true) {\n"; indent2 += " "; + + if (i + 1 < definition.symbols.size() && + definition.symbols[i + 1].optional == + grammar::Symbol::Optional::kRequired && + definition.symbols[i + 1].type == + grammar::Symbol::Type::kTerminal) { + if (symbol_return_type == return_type) { + out << indent2 << "ret = "; + } else { + out << indent2 << "ret_internal = "; + } + write_matcher(out, definition.symbols[i + 1], symbol_return_type, + "str.substr(tot)"); + out << ";\n"; + if (symbol_return_type != return_type) { + out << indent2 << "ret = ret_internal"; + match_return_type(out, symbol_return_type, "", return_type); + out << ";\n"; + } + out << indent2 << "if (ret.has_value()) {\n" + << indent2 << " tot += ret" << size_suffix << ";\n"; + if (last_internal) + out << indent2 << " last_internal = ret->first;\n"; + out << indent2 << " break;\n" << indent2 << "}\n"; + + zero_or_more_with_terminal = true; + } break; case grammar::Symbol::Optional::kExcluded: std::cerr << "Excluded mixed with conditional\n"; @@ -678,9 +723,15 @@ bool Generator::write_matcher(std::ostream& out, << 
indent2 << " last_internal = ret->first;\n"; break; case grammar::Symbol::Optional::kZeroOrMore: - out << indent2 << "if (!ret.has_value())\n" - << indent2 << " break;\n" - << indent2 << "tot += ret" << size_suffix << ";\n"; + out << indent2 << "if (!ret.has_value())\n"; + if (zero_or_more_with_terminal) { + out << indent2 << " return ret;\n"; + // Skip next symbol as it was already used to terminate the loop + ++i; + } else { + out << indent2 << " break;\n"; + } + out << indent2 << "tot += ret" << size_suffix << ";\n"; if (last_internal) out << indent2 << "last_internal = ret->first;\n"; out << indent << "}\n"; @@ -742,13 +793,12 @@ bool Generator::write_matcher(std::ostream& out, switch (return_type) { case ReturnType::kSize: out << "[[nodiscard]]\n" - << "std::optional TokenMatcher::match" << element.name - << "(std::string_view str) {\n"; + << "std::optional TokenMatcher::match" << element.name; break; case ReturnType::kTokenAndSize: out << "[[nodiscard]]\n" << "std::optional> TokenMatcher::match" - << element.name << "(std::string_view str) {\n"; + << element.name; if (specific_tokens_.contains(element.name)) { sub_return_type = ReturnType::kSize; @@ -759,11 +809,19 @@ bool Generator::write_matcher(std::ostream& out, out << "[[nodiscard]]\n" << "std::optional> " "TokenMatcher::match" - << element.name << "(std::string_view str) {\n"; + << element.name; break; } - if (element.definitions.size() == 1) { + if (element.definitions.empty()) { + out << "(std::string_view /* str */) {\n"; + } else { + out << "(std::string_view str) {\n"; + } + + if (element.definitions.empty()) { + out << " return std::nullopt;\n"; + } else if (element.definitions.size() == 1) { if (make_token) { out << " auto ret = [this, str]() -> std::optional {\n"; if (!write_matcher(out, element.definitions[0], sub_return_type, -- cgit v1.3