diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/gen_tokens.cc | 78 | ||||
| -rw-r--r-- | src/grammar.cc | 37 | ||||
| -rw-r--r-- | src/java_tokens.cc | 61 | ||||
| -rw-r--r-- | src/java_tokens.hh | 5 | ||||
| -rw-r--r-- | src/java_version.hh | 3 |
5 files changed, 163 insertions, 21 deletions
diff --git a/src/gen_tokens.cc b/src/gen_tokens.cc index ef0fce7..cc8c06d 100644 --- a/src/gen_tokens.cc +++ b/src/gen_tokens.cc @@ -88,7 +88,8 @@ class Generator { // Find the Elements that has at least one terminal or character class as symbol // These will be the different tokens the tokenizer can return void Generator::find_specific_elements(grammar::Element const& root) { - if (std::ranges::any_of(root.definitions, [](auto const& definition) { + if (root.definitions.empty() || + std::ranges::any_of(root.definitions, [](auto const& definition) { return definition.symbols.size() > 1 || definition.symbols[0].type == grammar::Symbol::Type::kTerminal; })) { @@ -233,7 +234,7 @@ void write_character_class_matchers(std::ostream& out, << "std::optional<size_t> TokenMatcher::matchLineTerminator" << "(std::string_view str) {\n" // Tokenizer normally reads one line at a time, there is only - // one construct (traditional comment) that needs it. + // a few constructs (traditional comment, textblock) that needs it. // So match synthetic '\n' or report that it was needed if we are at // end of string. << " if (str.empty()) {\n" @@ -361,6 +362,22 @@ void write_character_class_matchers(std::ostream& out, } std::ostream& quote(std::ostream& out, std::string_view in) { + int use_raw_string = 1; + for (auto c : in) { + if (c == '"' || c == '\\' || c == '\n') { + use_raw_string = 2; + } else if (c < ' ' || (c & 0x80)) { + use_raw_string = 0; + break; + } + } + if (use_raw_string == 2) { + out << "R\"("; + out << in; + out << ")\""; + return out; + } + out << '"'; bool avoid_digit = false; for (auto c : in) { @@ -569,6 +586,7 @@ bool Generator::write_matcher(std::ostream& out, bool have_internal = next_internal; next_internal = false; ReturnType symbol_return_type = return_type; + bool zero_or_more_with_terminal = false; if (symbol.optional != grammar::Symbol::Optional::kRequired && i + 1 < definition.symbols.size() && @@ -613,6 +631,33 @@ bool Generator::write_matcher(std::ostream& out, } out << indent << "while (true) {\n"; indent2 += " "; + + if (i + 1 < definition.symbols.size() && + definition.symbols[i + 1].optional == + grammar::Symbol::Optional::kRequired && + definition.symbols[i + 1].type == + grammar::Symbol::Type::kTerminal) { + if (symbol_return_type == return_type) { + out << indent2 << "ret = "; + } else { + out << indent2 << "ret_internal = "; + } + write_matcher(out, definition.symbols[i + 1], symbol_return_type, + "str.substr(tot)"); + out << ";\n"; + if (symbol_return_type != return_type) { + out << indent2 << "ret = ret_internal"; + match_return_type(out, symbol_return_type, "", return_type); + out << ";\n"; + } + out << indent2 << "if (ret.has_value()) {\n" + << indent2 << " tot += ret" << size_suffix << ";\n"; + if (last_internal) + out << indent2 << " last_internal = ret->first;\n"; + out << indent2 << " break;\n" << indent2 << "}\n"; + + zero_or_more_with_terminal = true; + } break; case grammar::Symbol::Optional::kExcluded: std::cerr << "Excluded mixed with conditional\n"; @@ -678,9 +723,15 @@ bool Generator::write_matcher(std::ostream& out, << indent2 << " last_internal = ret->first;\n"; break; case grammar::Symbol::Optional::kZeroOrMore: - out << indent2 << "if (!ret.has_value())\n" - << indent2 << " break;\n" - << indent2 << "tot += ret" << size_suffix << ";\n"; + out << indent2 << "if (!ret.has_value())\n"; + if (zero_or_more_with_terminal) { + out << indent2 << " return ret;\n"; + // Skip next symbol as it was already used to terminate the loop + ++i; + } else { + out << indent2 << " break;\n"; + } + out << indent2 << "tot += ret" << size_suffix << ";\n"; if (last_internal) out << indent2 << "last_internal = ret->first;\n"; out << indent << "}\n"; @@ -742,13 +793,12 @@ bool Generator::write_matcher(std::ostream& out, switch (return_type) { case ReturnType::kSize: out << "[[nodiscard]]\n" - << "std::optional<size_t> TokenMatcher::match" << element.name - << "(std::string_view str) {\n"; + << "std::optional<size_t> TokenMatcher::match" << element.name; break; case ReturnType::kTokenAndSize: out << "[[nodiscard]]\n" << "std::optional<std::pair<Token, size_t>> TokenMatcher::match" - << element.name << "(std::string_view str) {\n"; + << element.name; if (specific_tokens_.contains(element.name)) { sub_return_type = ReturnType::kSize; @@ -759,11 +809,19 @@ bool Generator::write_matcher(std::ostream& out, out << "[[nodiscard]]\n" << "std::optional<std::pair<TokenMatcher::Internal, size_t>> " "TokenMatcher::match" - << element.name << "(std::string_view str) {\n"; + << element.name; break; } - if (element.definitions.size() == 1) { + if (element.definitions.empty()) { + out << "(std::string_view /* str */) {\n"; + } else { + out << "(std::string_view str) {\n"; + } + + if (element.definitions.empty()) { + out << " return std::nullopt;\n"; + } else if (element.definitions.size() == 1) { if (make_token) { out << " auto ret = [this, str]() -> std::optional<size_t> {\n"; if (!write_matcher(out, element.definitions[0], sub_return_type, diff --git a/src/grammar.cc b/src/grammar.cc index 25c4d64..6ed2766 100644 --- a/src/grammar.cc +++ b/src/grammar.cc @@ -127,11 +127,6 @@ class GrammarLoader { auto it = second_pass_elements.begin(); for (auto const& pair : first_pass_elements) { auto const& element = *it++; - if (pair.second.definitions.empty()) { - errors_.err(pair.second.loc, - std::format("No definitions for {}", pair.first)); - continue; - } std::vector<std::string_view> in_symbols; for (auto const& in_definition : pair.second.definitions) { str::split(in_definition, in_symbols); @@ -247,10 +242,42 @@ class GrammarLoader { "No root element found"); } + optimize(second_pass_elements); + return std::make_unique<GrammarImpl>(std::move(second_pass_elements)); } private: + static void optimize(std::vector<std::unique_ptr<Element>> const& elements) { + merge_terminals(elements); + } + + static void merge_terminals(std::vector<std::unique_ptr<Element>> const& elements) { + for (auto const& element : elements) { + for (auto& definition : element->definitions) { + auto it = definition.symbols.begin(); + while (it != definition.symbols.end()) { + if (it->type != Symbol::Type::kTerminal) { + ++it; + continue; + } + + auto it2 = it + 1; + if (it2 == definition.symbols.end()) + break; + if (it2->type != Symbol::Type::kTerminal || + it->optional != it2->optional) { + ++it; + continue; + } + + it->value += it2->value; + definition.symbols.erase(it2); + } + } + } + } + std::unique_ptr<line::Reader> reader_; std::vector<std::string> const& character_classes_; src::Errors& errors_; diff --git a/src/java_tokens.cc b/src/java_tokens.cc index 1ba40a3..42c310b 100644 --- a/src/java_tokens.cc +++ b/src/java_tokens.cc @@ -1,6 +1,7 @@ #include "java_tokens.hh" #include "errors.hh" +#include "java_tokens_java-21.hh" #include "java_tokens_java-8.hh" #include "java_uescape.hh" #include "str.hh" @@ -62,9 +63,13 @@ class TokensImpl : public Tokens { break; line_tmp_.append(maybe_line.value()); got_any = true; - // Simple check, it might not actually end the comment but if so tokenizer will complain + // Simple check, it might not actually be true but if so tokenizer will complain // about reaching line_end again. - if (maybe_line->contains("*/")) + auto stop = (maybe_token_pair.has_value() && + maybe_token_pair->first == MatchToken::kStringLiteral) + ? R"(""")" + : "*/"; + if (maybe_line->contains(stop)) break; line_tmp_.push_back('\n'); } @@ -118,8 +123,11 @@ class TokensImpl : public Tokens { case MatchToken::kIdentifier: token.type = Token::Type::kIdentifier; break; - case MatchToken::kKeyword: - token.type = Token::Type::kKeyword; + case MatchToken::kReservedKeyword: + token.type = Token::Type::kReservedKeyword; + break; + case MatchToken::kContextualKeyword: + token.type = Token::Type::kContextualKeyword; break; case MatchToken::kNullLiteral: token.type = Token::Type::kLiteralNull; @@ -138,6 +146,13 @@ class TokensImpl : public Tokens { token.str = unescape_if_needed(token.str.substr(1, token.str.size() - 2)); break; + case MatchToken::kTextBlock: { + token.type = Token::Type::kLiteralString; + auto start = token.str.find('\n', 3) + 1; + token.str = unescape_if_needed(trim_indent( + token.str.substr(start, token.str.size() - 3 - start))); + break; + } case MatchToken::kTraditionalComment: { token.type = Token::Type::kComment; size_t s = 2; @@ -368,6 +383,39 @@ class TokensImpl : public Tokens { return count; } + static size_t indent(std::string_view str) { + size_t i = 0; + while (i < str.size() && + (str[i] == ' ' || str[i] == '\t' || str[i] == '\f')) + ++i; + return i; + } + + std::string_view trim_indent(std::string_view str) { + auto lines = str::split(str, '\n', /* keep_empty */ true); + auto it = lines.begin(); + auto min_indent = indent(*it); + if (min_indent == 0) + return str; + for (++it; it != lines.end(); ++it) { + auto i = indent(*it); + if (i < min_indent) { + if (i == 0) + return str; + min_indent = i; + } + } + trim_tmp_.clear(); + trim_tmp_.reserve(str.size()); + for (auto line : lines) { + trim_tmp_.append(line, min_indent); + trim_tmp_.push_back('\n'); + } + // remove last '\n' + trim_tmp_.resize(trim_tmp_.size() - 1); + return trim_tmp_; + } + std::unique_ptr<u8::line::Reader> reader_; std::unique_ptr<src::Errors> errors_; TokensConfig const config_; @@ -376,6 +424,7 @@ class TokensImpl : public Tokens { std::string line_tmp_; Location location_; std::string unescape_tmp_; + std::string trim_tmp_; }; } // namespace @@ -387,6 +436,10 @@ std::unique_ptr<Tokens> open(std::unique_ptr<io::Reader> reader, case Version::kJava8: return std::make_unique<TokensImpl<java_8::TokenMatcher, java_8::Token>>( std::move(reader), std::move(errors), config); + case Version::kJava21: + return std::make_unique< + TokensImpl<java_21::TokenMatcher, java_21::Token>>( + std::move(reader), std::move(errors), config); } std::unreachable(); } diff --git a/src/java_tokens.hh b/src/java_tokens.hh index 6fbefcb..c4e27c0 100644 --- a/src/java_tokens.hh +++ b/src/java_tokens.hh @@ -25,7 +25,10 @@ struct Token { kIdentifier, // str is keyword, int_value is Keyword index - kKeyword, + kReservedKeyword, + + // str is keyword, int_value is Keyword index + kContextualKeyword, // str is separator, int_value is Separator index kSeparator, diff --git a/src/java_version.hh b/src/java_version.hh index 444ae36..4877263 100644 --- a/src/java_version.hh +++ b/src/java_version.hh @@ -7,8 +7,9 @@ namespace java { enum class Version : uint8_t { kJava8 = 8, + kJava21 = 21, - kMax = kJava8, + kMax = kJava21, }; } // namespace java |
