java: Add tokens support for Java 21

There are some new keywords; I opted to modify the java-8 grammar to use the new names as well, even though they are not going to match anything there. This makes the tokenizer easier to write.
author: Joel Klinghed <the_jk@spawned.biz> 2025-09-29 09:39:49 +0200
committer: Joel Klinghed <the_jk@spawned.biz> 2025-09-29 09:50:47 +0200
commit: d196d51e07f50f3510c43ad375c5559b58860023 (patch)
tree: 3432b8e99e306d0ece9f29ddad1e2945f88a1481 /src/java_tokens.cc
parent: 1e9e51dae1c01bab7562911b958c47528b8011c8 (diff)
1 files changed, 57 insertions, 4 deletions
diff --git a/src/java_tokens.cc b/src/java_tokens.cc
index 1ba40a3..42c310b 100644
--- a/src/java_tokens.cc
+++ b/src/java_tokens.cc
@@ -1,6 +1,7 @@
 #include "java_tokens.hh"
 
 #include "errors.hh"
+#include "java_tokens_java-21.hh"
 #include "java_tokens_java-8.hh"
 #include "java_uescape.hh"
 #include "str.hh"
@@ -62,9 +63,13 @@ class TokensImpl : public Tokens {
             break;
           line_tmp_.append(maybe_line.value());
           got_any = true;
-          // Simple check, it might not actually end the comment but if so tokenizer will complain
+          // Simple check, it might not actually be true but if so tokenizer will complain
           // about reaching line_end again.
-          if (maybe_line->contains("*/"))
+          auto stop = (maybe_token_pair.has_value() &&
+                       maybe_token_pair->first == MatchToken::kStringLiteral)
+                          ? R"(""")"
+                          : "*/";
+          if (maybe_line->contains(stop))
             break;
           line_tmp_.push_back('\n');
         }
@@ -118,8 +123,11 @@ class TokensImpl : public Tokens {
           case MatchToken::kIdentifier:
             token.type = Token::Type::kIdentifier;
             break;
-          case MatchToken::kKeyword:
-            token.type = Token::Type::kKeyword;
+          case MatchToken::kReservedKeyword:
+            token.type = Token::Type::kReservedKeyword;
+            break;
+          case MatchToken::kContextualKeyword:
+            token.type = Token::Type::kContextualKeyword;
             break;
           case MatchToken::kNullLiteral:
             token.type = Token::Type::kLiteralNull;
@@ -138,6 +146,13 @@ class TokensImpl : public Tokens {
             token.str =
                 unescape_if_needed(token.str.substr(1, token.str.size() - 2));
             break;
+          case MatchToken::kTextBlock: {
+            token.type = Token::Type::kLiteralString;
+            auto start = token.str.find('\n', 3) + 1;
+            token.str = unescape_if_needed(trim_indent(
+                token.str.substr(start, token.str.size() - 3 - start)));
+            break;
+          }
           case MatchToken::kTraditionalComment: {
             token.type = Token::Type::kComment;
             size_t s = 2;
@@ -368,6 +383,39 @@ class TokensImpl : public Tokens {
     return count;
   }
 
+  static size_t indent(std::string_view str) {  // Returns the number of leading ' ', '\t' or '\f' characters in str.
+    size_t i = 0;
+    while (i < str.size() &&
+           (str[i] == ' ' || str[i] == '\t' || str[i] == '\f'))
+      ++i;
+    return i;  // equals str.size() when str is empty or made up only of those characters
+  }
+
+  std::string_view trim_indent(std::string_view str) {  // Strips the smallest common leading indent from every '\n'-separated line of str.
+    auto lines = str::split(str, '\n', /* keep_empty */ true);
+    auto it = lines.begin();
+    auto min_indent = indent(*it);  // NOTE(review): dereferences begin() unchecked; assumes split() always yields >= 1 element -- confirm
+    if (min_indent == 0)
+      return str;  // first line has no indent, so there is nothing to strip; input is returned as-is
+    for (++it; it != lines.end(); ++it) {
+      auto i = indent(*it);
+      if (i < min_indent) {
+        if (i == 0)
+          return str;  // NOTE(review): an empty line also has indent 0 and disables stripping; Java text blocks ignore blank lines here -- confirm intent
+        min_indent = i;
+      }
+    }
+    trim_tmp_.clear();
+    trim_tmp_.reserve(str.size());
+    for (auto line : lines) {
+      trim_tmp_.append(line, min_indent);  // copy the line starting at offset min_indent (every line has at least that many indent chars)
+      trim_tmp_.push_back('\n');
+    }
+    // the loop appended a '\n' after every line; drop the one after the last line
+    trim_tmp_.resize(trim_tmp_.size() - 1);
+    return trim_tmp_;  // view into the trim_tmp_ member, so it is only valid until trim_tmp_ is next modified
+  }
+
   std::unique_ptr<u8::line::Reader> reader_;
   std::unique_ptr<src::Errors> errors_;
   TokensConfig const config_;
@@ -376,6 +424,7 @@ class TokensImpl : public Tokens {
   std::string line_tmp_;
   Location location_;
   std::string unescape_tmp_;
+  std::string trim_tmp_;
 };
 
 }  // namespace
@@ -387,6 +436,10 @@ std::unique_ptr<Tokens> open(std::unique_ptr<io::Reader> reader,
     case Version::kJava8:
       return std::make_unique<TokensImpl<java_8::TokenMatcher, java_8::Token>>(
           std::move(reader), std::move(errors), config);
+    case Version::kJava21:
+      return std::make_unique<
+          TokensImpl<java_21::TokenMatcher, java_21::Token>>(
+          std::move(reader), std::move(errors), config);
   }
   std::unreachable();
 }
author	Joel Klinghed <the_jk@spawned.biz>	2025-09-29 09:39:49 +0200
committer	Joel Klinghed <the_jk@spawned.biz>	2025-09-29 09:50:47 +0200
commit	d196d51e07f50f3510c43ad375c5559b58860023 (patch)
tree	3432b8e99e306d0ece9f29ddad1e2945f88a1481 /src/java_tokens.cc
parent	1e9e51dae1c01bab7562911b958c47528b8011c8 (diff)