From d196d51e07f50f3510c43ad375c5559b58860023 Mon Sep 17 00:00:00 2001 From: Joel Klinghed Date: Mon, 29 Sep 2025 09:39:49 +0200 Subject: java: Add tokens support for Java 21 Some new keywords, I opted to modify java-8 grammar to use the new names, even if they are not going to match anything. Makes the tokenizer easier to write. --- src/java_tokens.cc | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 57 insertions(+), 4 deletions(-) (limited to 'src/java_tokens.cc') diff --git a/src/java_tokens.cc b/src/java_tokens.cc index 1ba40a3..42c310b 100644 --- a/src/java_tokens.cc +++ b/src/java_tokens.cc @@ -1,6 +1,7 @@ #include "java_tokens.hh" #include "errors.hh" +#include "java_tokens_java-21.hh" #include "java_tokens_java-8.hh" #include "java_uescape.hh" #include "str.hh" @@ -62,9 +63,13 @@ class TokensImpl : public Tokens { break; line_tmp_.append(maybe_line.value()); got_any = true; - // Simple check, it might not actually end the comment but if so tokenizer will complain + // Simple check, it might not actually be true but if so tokenizer will complain // about reaching line_end again. - if (maybe_line->contains("*/")) + auto stop = (maybe_token_pair.has_value() && + maybe_token_pair->first == MatchToken::kStringLiteral) + ? R"(""")" + : "*/"; + if (maybe_line->contains(stop)) break; line_tmp_.push_back('\n'); } @@ -118,8 +123,11 @@ class TokensImpl : public Tokens { case MatchToken::kIdentifier: token.type = Token::Type::kIdentifier; break; - case MatchToken::kKeyword: - token.type = Token::Type::kKeyword; + case MatchToken::kReservedKeyword: + token.type = Token::Type::kReservedKeyword; + break; + case MatchToken::kContextualKeyword: + token.type = Token::Type::kContextualKeyword; break; case MatchToken::kNullLiteral: token.type = Token::Type::kLiteralNull; @@ -138,6 +146,13 @@ class TokensImpl : public Tokens { token.str = unescape_if_needed(token.str.substr(1, token.str.size() - 2)); break; + case MatchToken::kTextBlock: { + token.type = Token::Type::kLiteralString; + auto start = token.str.find('\n', 3) + 1; + token.str = unescape_if_needed(trim_indent( + token.str.substr(start, token.str.size() - 3 - start))); + break; + } case MatchToken::kTraditionalComment: { token.type = Token::Type::kComment; size_t s = 2; @@ -368,6 +383,39 @@ class TokensImpl : public Tokens { return count; } + static size_t indent(std::string_view str) { + size_t i = 0; + while (i < str.size() && + (str[i] == ' ' || str[i] == '\t' || str[i] == '\f')) + ++i; + return i; + } + + std::string_view trim_indent(std::string_view str) { + auto lines = str::split(str, '\n', /* keep_empty */ true); + auto it = lines.begin(); + auto min_indent = indent(*it); + if (min_indent == 0) + return str; + for (++it; it != lines.end(); ++it) { + auto i = indent(*it); + if (i < min_indent) { + if (i == 0) + return str; + min_indent = i; + } + } + trim_tmp_.clear(); + trim_tmp_.reserve(str.size()); + for (auto line : lines) { + trim_tmp_.append(line, min_indent); + trim_tmp_.push_back('\n'); + } + // remove last '\n' + trim_tmp_.resize(trim_tmp_.size() - 1); + return trim_tmp_; + } + std::unique_ptr reader_; std::unique_ptr errors_; TokensConfig const config_; @@ -376,6 +424,7 @@ class TokensImpl : public Tokens { std::string line_tmp_; Location location_; std::string unescape_tmp_; + std::string trim_tmp_; }; } // namespace @@ -387,6 +436,10 @@ std::unique_ptr open(std::unique_ptr reader, case Version::kJava8: return std::make_unique>( std::move(reader), std::move(errors), config); + case Version::kJava21: + return std::make_unique< + TokensImpl>( + std::move(reader), std::move(errors), config); } std::unreachable(); } -- cgit v1.3