java: Add tokens support for Java 21

Java 21 adds some new keywords. I opted to modify the java-8 grammar to use the new names as well, even though they are not going to match anything there; this makes the shared tokenizer easier to write.
author: Joel Klinghed <the_jk@spawned.biz> 2025-09-29 09:39:49 +0200
committer: Joel Klinghed <the_jk@spawned.biz> 2025-09-29 09:50:47 +0200
commit: d196d51e07f50f3510c43ad375c5559b58860023 (patch)
tree: 3432b8e99e306d0ece9f29ddad1e2945f88a1481
parent: 1e9e51dae1c01bab7562911b958c47528b8011c8 (diff)
9 files changed, 654 insertions, 23 deletions
diff --git a/data/java-21/tokens.grammar b/data/java-21/tokens.grammar
new file mode 100644
index 0000000..db935b2
--- /dev/null
+++ b/data/java-21/tokens.grammar
@@ -0,0 +1,423 @@
+InputElement:
+  WhiteSpace
+  Comment
+  Token
+
+Token:
+  Identifier
+  Keyword
+  Literal
+  Separator
+  Operator
+
+Comment:
+  TraditionalComment
+  EndOfLineComment
+
+TraditionalComment:
+  / * CommentTail
+
+CommentTail:
+  * CommentTailStar
+  NotStar CommentTail
+
+CommentTailStar:
+  /
+  * CommentTailStar
+  NotStarNotSlash CommentTail
+
+NotStar:
+  InputCharacter but not *
+  LineTerminator
+
+NotStarNotSlash:
+  InputCharacter but not * or /
+  LineTerminator
+
+EndOfLineComment:
+  / / {InputCharacter}
+
+Identifier:
+  IdentifierChars but not a ReservedKeyword or BooleanLiteral or NullLiteral
+
+IdentifierChars:
+  JavaLetter {JavaLetterOrDigit}
+
+Keyword:
+  ReservedKeyword
+  ContextualKeyword
+
+ReservedKeyword:
+  abstract
+  assert
+  boolean
+  break
+  byte
+  case
+  catch
+  char
+  class
+  const
+  continue
+  default
+  do
+  double
+  else
+  enum
+  extends
+  final
+  finally
+  float
+  for
+  goto
+  if
+  implements
+  import
+  instanceof
+  int
+  interface
+  long
+  native
+  new
+  package
+  private
+  protected
+  public
+  return
+  short
+  static
+  strictfp
+  super
+  switch
+  synchronized
+  this
+  throw
+  throws
+  transient
+  try
+  void
+  volatile
+  while
+  _
+
+ContextualKeyword:
+  exports
+  module
+  non-sealed
+  open
+  opens
+  permits
+  provides
+  record
+  requires
+  sealed
+  to
+  transitive
+  uses
+  var
+  when
+  with
+  yield
+
+Literal:
+  IntegerLiteral
+  FloatingPointLiteral
+  BooleanLiteral
+  CharacterLiteral
+  StringLiteral
+  TextBlock
+  NullLiteral
+
+IntegerLiteral:
+  DecimalIntegerLiteral
+  HexIntegerLiteral
+  OctalIntegerLiteral
+  BinaryIntegerLiteral
+
+DecimalIntegerLiteral:
+  DecimalNumeral [IntegerTypeSuffix]
+
+HexIntegerLiteral:
+  HexNumeral [IntegerTypeSuffix]
+
+OctalIntegerLiteral:
+  OctalNumeral [IntegerTypeSuffix]
+
+BinaryIntegerLiteral:
+  BinaryNumeral [IntegerTypeSuffix]
+
+IntegerTypeSuffix:
+  l
+  L
+
+DecimalNumeral:
+  0
+  NonZeroDigit [Digits]
+  NonZeroDigit Underscores Digits
+
+NonZeroDigit:
+  1
+  2
+  3
+  4
+  5
+  6
+  7
+  8
+  9
+
+Digits:
+  Digit
+  Digit [DigitsAndUnderscores] Digit
+
+Digit:
+  0
+  NonZeroDigit
+
+DigitsAndUnderscores:
+  DigitOrUnderscore {DigitOrUnderscore}
+
+DigitOrUnderscore:
+  Digit
+  _
+
+Underscores:
+  _ {_}
+
+HexNumeral:
+  0 x HexDigits
+  0 X HexDigits
+
+HexDigits:
+  HexDigit
+  HexDigit [HexDigitsAndUnderscores] HexDigit
+
+HexDigit:
+  0
+  1
+  2
+  3
+  4
+  5
+  6
+  7
+  8
+  9
+  a
+  b
+  c
+  d
+  e
+  f
+  A
+  B
+  C
+  D
+  E
+  F
+
+HexDigitsAndUnderscores:
+  HexDigitOrUnderscore {HexDigitOrUnderscore}
+
+HexDigitOrUnderscore:
+  HexDigit
+  _
+
+OctalNumeral:
+  0 OctalDigits
+  0 Underscores OctalDigits
+
+OctalDigits:
+  OctalDigit
+  OctalDigit [OctalDigitsAndUnderscores] OctalDigit
+
+OctalDigit:
+  0
+  1
+  2
+  3
+  4
+  5
+  6
+  7
+
+OctalDigitsAndUnderscores:
+  OctalDigitOrUnderscore {OctalDigitOrUnderscore}
+
+OctalDigitOrUnderscore:
+  OctalDigit
+  _
+
+BinaryNumeral:
+  0 b BinaryDigits
+  0 B BinaryDigits
+
+BinaryDigits:
+  BinaryDigit
+  BinaryDigit [BinaryDigitsAndUnderscores] BinaryDigit
+
+BinaryDigit:
+  0
+  1
+
+BinaryDigitsAndUnderscores:
+  BinaryDigitOrUnderscore {BinaryDigitOrUnderscore}
+
+BinaryDigitOrUnderscore:
+  BinaryDigit
+  _
+
+FloatingPointLiteral:
+  DecimalFloatingPointLiteral
+  HexadecimalFloatingPointLiteral
+
+DecimalFloatingPointLiteral:
+  Digits . [Digits] [ExponentPart] [FloatTypeSuffix]
+  . Digits [ExponentPart] [FloatTypeSuffix]
+  Digits ExponentPart [FloatTypeSuffix]
+  Digits [ExponentPart] FloatTypeSuffix
+
+ExponentPart:
+  ExponentIndicator SignedInteger
+
+ExponentIndicator:
+  e
+  E
+
+SignedInteger:
+  [Sign] Digits
+
+Sign:
+  +
+  -
+
+FloatTypeSuffix:
+  f
+  F
+  d
+  D
+
+HexadecimalFloatingPointLiteral:
+  HexSignificand BinaryExponent [FloatTypeSuffix]
+
+HexSignificand:
+  HexNumeral [.]
+  0 x [HexDigits] . HexDigits
+  0 X [HexDigits] . HexDigits
+
+BinaryExponent:
+  BinaryExponentIndicator SignedInteger
+
+BinaryExponentIndicator:
+  p
+  P
+
+BooleanLiteral:
+  true
+  false
+
+CharacterLiteral:
+  ' SingleCharacter '
+  ' EscapeSequence '
+
+SingleCharacter:
+  InputCharacter but not ' or \
+
+StringLiteral:
+  " {StringCharacter} "
+
+StringCharacter:
+  InputCharacter but not " or \
+  EscapeSequence
+
+TextBlock:
+  " " " {TextBlockWhiteSpace} LineTerminator {TextBlockCharacter} " " "
+
+TextBlockWhiteSpace:
+  WhiteSpace but not LineTerminator
+
+TextBlockCharacter:
+  InputCharacter but not \
+  EscapeSequence
+  LineTerminator
+
+EscapeSequence:
+  \ b
+  \ s
+  \ t
+  \ n
+  \ f
+  \ r
+  \ LineTerminator
+  \ "
+  \ '
+  \ \
+  OctalEscape
+
+OctalEscape:
+  \ OctalDigit
+  \ OctalDigit OctalDigit
+  \ ZeroToThree OctalDigit OctalDigit
+
+ZeroToThree:
+  0
+  1
+  2
+  3
+
+NullLiteral:
+  null
+
+Separator:
+  (
+  )
+  {
+  }
+  [
+  ]
+  ;
+  ,
+  .
+  ...
+  @
+  ::
+
+Operator:
+  =
+  >
+  <
+  !
+  ~
+  ?
+  :
+  ->
+  ==
+  >=
+  <=
+  !=
+  &&
+  ||
+  ++
+  --
+  +
+  -
+  *
+  /
+  &
+  |
+  ^
+  %
+  <<
+  >>
+  >>>
+  +=
+  -=
+  *=
+  /=
+  &=
+  |=
+  ^=
+  %=
+  <<=
+  >>=
+  >>>=
diff --git a/data/java-8/tokens.grammar b/data/java-8/tokens.grammar
index 3521ac0..3941b94 100644
--- a/data/java-8/tokens.grammar
+++ b/data/java-8/tokens.grammar
@@ -43,7 +43,13 @@ Identifier:
 IdentifierChars:
   JavaLetter {JavaLetterOrDigit}
 
+# Java 8 only has reserved keywords, but use modern names
+# here to make a shared tokenizer simpler.
 Keyword:
+  ReservedKeyword
+  ContextualKeyword
+
+ReservedKeyword:
   abstract
   continue
   for
@@ -95,14 +101,20 @@ Keyword:
   super
   while
 
+ContextualKeyword:
+
 Literal:
   IntegerLiteral
   FloatingPointLiteral
   BooleanLiteral
   CharacterLiteral
   StringLiteral
+  TextBlock
   NullLiteral
 
+# Java 8 doesn't have TextBlock, but add it as newer grammars have it
+TextBlock:
+
 IntegerLiteral:
   DecimalIntegerLiteral
   HexIntegerLiteral
diff --git a/meson.build b/meson.build
index 7782285..a8f4b97 100644
--- a/meson.build
+++ b/meson.build
@@ -263,10 +263,12 @@ gen_tokens = executable(
 
 java_versions = [
   'java-8',
+  'java-21',
 ]
 
 java_unicode_versions = {
   'java-8': '6.2.0',
+  'java-21': '15.0.0',
 }
 
 java_tokens_sources = []
diff --git a/src/gen_tokens.cc b/src/gen_tokens.cc
index ef0fce7..cc8c06d 100644
--- a/src/gen_tokens.cc
+++ b/src/gen_tokens.cc
@@ -88,7 +88,8 @@ class Generator {
 // Find the Elements that has at least one terminal or character class as symbol
 // These will be the different tokens the tokenizer can return
 void Generator::find_specific_elements(grammar::Element const& root) {
-  if (std::ranges::any_of(root.definitions, [](auto const& definition) {
+  if (root.definitions.empty() ||
+      std::ranges::any_of(root.definitions, [](auto const& definition) {
         return definition.symbols.size() > 1 ||
                definition.symbols[0].type == grammar::Symbol::Type::kTerminal;
       })) {
@@ -233,7 +234,7 @@ void write_character_class_matchers(std::ostream& out,
       << "std::optional<size_t> TokenMatcher::matchLineTerminator"
       << "(std::string_view str) {\n"
       // Tokenizer normally reads one line at a time, there is only
-      // one construct (traditional comment) that needs it.
+      // a few constructs (traditional comment, textblock) that need it.
       // So match synthetic '\n' or report that it was needed if we are at
       // end of string.
       << "  if (str.empty()) {\n"
@@ -361,6 +362,22 @@ void write_character_class_matchers(std::ostream& out,
 }
 
 std::ostream& quote(std::ostream& out, std::string_view in) {
+  int use_raw_string = 1;
+  for (auto c : in) {
+    if (c == '"' || c == '\\' || c == '\n') {
+      use_raw_string = 2;
+    } else if (c < ' ' || (c & 0x80)) {
+      use_raw_string = 0;
+      break;
+    }
+  }
+  if (use_raw_string == 2) {
+    out << "R\"(";
+    out << in;
+    out << ")\"";
+    return out;
+  }
+
   out << '"';
   bool avoid_digit = false;
   for (auto c : in) {
@@ -569,6 +586,7 @@ bool Generator::write_matcher(std::ostream& out,
     bool have_internal = next_internal;
     next_internal = false;
     ReturnType symbol_return_type = return_type;
+    bool zero_or_more_with_terminal = false;
 
     if (symbol.optional != grammar::Symbol::Optional::kRequired &&
         i + 1 < definition.symbols.size() &&
@@ -613,6 +631,33 @@ bool Generator::write_matcher(std::ostream& out,
         }
         out << indent << "while (true) {\n";
         indent2 += "  ";
+
+        if (i + 1 < definition.symbols.size() &&
+            definition.symbols[i + 1].optional ==
+                grammar::Symbol::Optional::kRequired &&
+            definition.symbols[i + 1].type ==
+                grammar::Symbol::Type::kTerminal) {
+          if (symbol_return_type == return_type) {
+            out << indent2 << "ret = ";
+          } else {
+            out << indent2 << "ret_internal = ";
+          }
+          write_matcher(out, definition.symbols[i + 1], symbol_return_type,
+                        "str.substr(tot)");
+          out << ";\n";
+          if (symbol_return_type != return_type) {
+            out << indent2 << "ret = ret_internal";
+            match_return_type(out, symbol_return_type, "", return_type);
+            out << ";\n";
+          }
+          out << indent2 << "if (ret.has_value()) {\n"
+              << indent2 << "  tot += ret" << size_suffix << ";\n";
+          if (last_internal)
+            out << indent2 << "  last_internal = ret->first;\n";
+          out << indent2 << "  break;\n" << indent2 << "}\n";
+
+          zero_or_more_with_terminal = true;
+        }
         break;
       case grammar::Symbol::Optional::kExcluded:
         std::cerr << "Excluded mixed with conditional\n";
@@ -678,9 +723,15 @@ bool Generator::write_matcher(std::ostream& out,
               << indent2 << "  last_internal = ret->first;\n";
         break;
       case grammar::Symbol::Optional::kZeroOrMore:
-        out << indent2 << "if (!ret.has_value())\n"
-            << indent2 << "  break;\n"
-            << indent2 << "tot += ret" << size_suffix << ";\n";
+        out << indent2 << "if (!ret.has_value())\n";
+        if (zero_or_more_with_terminal) {
+          out << indent2 << "  return ret;\n";
+          // Skip next symbol as it was already used to terminate the loop
+          ++i;
+        } else {
+          out << indent2 << "  break;\n";
+        }
+        out << indent2 << "tot += ret" << size_suffix << ";\n";
         if (last_internal)
           out << indent2 << "last_internal = ret->first;\n";
         out << indent << "}\n";
@@ -742,13 +793,12 @@ bool Generator::write_matcher(std::ostream& out,
   switch (return_type) {
     case ReturnType::kSize:
       out << "[[nodiscard]]\n"
-          << "std::optional<size_t> TokenMatcher::match" << element.name
-          << "(std::string_view str) {\n";
+          << "std::optional<size_t> TokenMatcher::match" << element.name;
       break;
     case ReturnType::kTokenAndSize:
       out << "[[nodiscard]]\n"
           << "std::optional<std::pair<Token, size_t>> TokenMatcher::match"
-          << element.name << "(std::string_view str) {\n";
+          << element.name;
 
       if (specific_tokens_.contains(element.name)) {
         sub_return_type = ReturnType::kSize;
@@ -759,11 +809,19 @@ bool Generator::write_matcher(std::ostream& out,
       out << "[[nodiscard]]\n"
           << "std::optional<std::pair<TokenMatcher::Internal, size_t>> "
              "TokenMatcher::match"
-          << element.name << "(std::string_view str) {\n";
+          << element.name;
       break;
   }
 
-  if (element.definitions.size() == 1) {
+  if (element.definitions.empty()) {
+    out << "(std::string_view /* str */) {\n";
+  } else {
+    out << "(std::string_view str) {\n";
+  }
+
+  if (element.definitions.empty()) {
+    out << "  return std::nullopt;\n";
+  } else if (element.definitions.size() == 1) {
     if (make_token) {
       out << "  auto ret = [this, str]() -> std::optional<size_t> {\n";
       if (!write_matcher(out, element.definitions[0], sub_return_type,
diff --git a/src/grammar.cc b/src/grammar.cc
index 25c4d64..6ed2766 100644
--- a/src/grammar.cc
+++ b/src/grammar.cc
@@ -127,11 +127,6 @@ class GrammarLoader {
     auto it = second_pass_elements.begin();
     for (auto const& pair : first_pass_elements) {
       auto const& element = *it++;
-      if (pair.second.definitions.empty()) {
-        errors_.err(pair.second.loc,
-                    std::format("No definitions for {}", pair.first));
-        continue;
-      }
       std::vector<std::string_view> in_symbols;
       for (auto const& in_definition : pair.second.definitions) {
         str::split(in_definition, in_symbols);
@@ -247,10 +242,42 @@ class GrammarLoader {
           "No root element found");
     }
 
+    optimize(second_pass_elements);
+
     return std::make_unique<GrammarImpl>(std::move(second_pass_elements));
   }
 
  private:
+  // Post-processing applied to the fully loaded elements before they are
+  // handed over to GrammarImpl. Currently the only pass is merging of
+  // adjacent terminals.
+  static void optimize(
+      std::vector<std::unique_ptr<Element>> const& elements) {
+    merge_terminals(elements);
+  }
+
+  // Collapses runs of adjacent required terminal symbols in every definition
+  // into a single terminal by concatenating their values, so that e.g. a
+  // definition written as `" " "` ends up as the one terminal `"""`.
+  //
+  // Only terminals marked kRequired are merged. Merging terminals with any
+  // other Optional kind would change what the definition accepts: `[a] [b]`
+  // matches "a" or "b" on their own while a merged `[ab]` does not, and the
+  // same goes for repeated or excluded terminals.
+  //
+  // The vector itself is const, but the elements are modified in place
+  // through the owning pointers.
+  static void merge_terminals(
+      std::vector<std::unique_ptr<Element>> const& elements) {
+    for (auto const& element : elements) {
+      for (auto& definition : element->definitions) {
+        auto it = definition.symbols.begin();
+        while (it != definition.symbols.end()) {
+          auto const next = it + 1;
+          if (next == definition.symbols.end())
+            break;
+
+          bool const mergeable =
+              it->type == Symbol::Type::kTerminal &&
+              it->optional == Symbol::Optional::kRequired &&
+              next->type == Symbol::Type::kTerminal &&
+              next->optional == Symbol::Optional::kRequired;
+          if (!mergeable) {
+            ++it;
+            continue;
+          }
+
+          // Do not advance after a merge: `it` stays valid as it is in front
+          // of the erased position, and the symbol that now follows it may
+          // be yet another terminal to fold in.
+          it->value += next->value;
+          definition.symbols.erase(next);
+        }
+      }
+    }
+  }
+
   std::unique_ptr<line::Reader> reader_;
   std::vector<std::string> const& character_classes_;
   src::Errors& errors_;
diff --git a/src/java_tokens.cc b/src/java_tokens.cc
index 1ba40a3..42c310b 100644
--- a/src/java_tokens.cc
+++ b/src/java_tokens.cc
@@ -1,6 +1,7 @@
 #include "java_tokens.hh"
 
 #include "errors.hh"
+#include "java_tokens_java-21.hh"
 #include "java_tokens_java-8.hh"
 #include "java_uescape.hh"
 #include "str.hh"
@@ -62,9 +63,13 @@ class TokensImpl : public Tokens {
             break;
           line_tmp_.append(maybe_line.value());
           got_any = true;
-          // Simple check, it might not actually end the comment but if so tokenizer will complain
+          // Simple check, it might not actually be true but if so tokenizer will complain
           // about reaching line_end again.
-          if (maybe_line->contains("*/"))
+          auto stop = (maybe_token_pair.has_value() &&
+                       maybe_token_pair->first == MatchToken::kStringLiteral)
+                          ? R"(""")"
+                          : "*/";
+          if (maybe_line->contains(stop))
             break;
           line_tmp_.push_back('\n');
         }
@@ -118,8 +123,11 @@ class TokensImpl : public Tokens {
           case MatchToken::kIdentifier:
             token.type = Token::Type::kIdentifier;
             break;
-          case MatchToken::kKeyword:
-            token.type = Token::Type::kKeyword;
+          case MatchToken::kReservedKeyword:
+            token.type = Token::Type::kReservedKeyword;
+            break;
+          case MatchToken::kContextualKeyword:
+            token.type = Token::Type::kContextualKeyword;
             break;
           case MatchToken::kNullLiteral:
             token.type = Token::Type::kLiteralNull;
@@ -138,6 +146,13 @@ class TokensImpl : public Tokens {
             token.str =
                 unescape_if_needed(token.str.substr(1, token.str.size() - 2));
             break;
+          case MatchToken::kTextBlock: {
+            token.type = Token::Type::kLiteralString;
+            auto start = token.str.find('\n', 3) + 1;
+            token.str = unescape_if_needed(trim_indent(
+                token.str.substr(start, token.str.size() - 3 - start)));
+            break;
+          }
           case MatchToken::kTraditionalComment: {
             token.type = Token::Type::kComment;
             size_t s = 2;
@@ -368,6 +383,39 @@ class TokensImpl : public Tokens {
     return count;
   }
 
+  // Counts the blank characters (space, tab or form feed) that str starts
+  // with. Returns str.size() when str holds nothing but such characters.
+  static size_t indent(std::string_view str) {
+    size_t count = 0;
+    for (char const ch : str) {
+      if (ch != ' ' && ch != '\t' && ch != '\f')
+        break;
+      ++count;
+    }
+    return count;
+  }
+
+  // Removes the common leading indentation from the lines of str.
+  //
+  // The amount removed is the smallest indent() among the lines that are not
+  // blank, plus the last line, which always takes part even when it is
+  // blank. Blank lines in between are ignored when looking for the minimum,
+  // so an empty line in the middle of the text does not turn trimming off.
+  // Every line consisting only of blanks is emitted as an empty line.
+  //
+  // Returns str itself when there is no common indentation to remove.
+  // Otherwise the result is a view of trim_tmp_, which is overwritten by the
+  // next call.
+  std::string_view trim_indent(std::string_view str) {
+    auto lines = str::split(str, '\n', /* keep_empty */ true);
+    if (lines.begin() == lines.end())
+      return str;
+
+    size_t min_indent = std::string_view::npos;
+    for (auto it = lines.begin(); it != lines.end();) {
+      std::string_view const line = *it;
+      ++it;
+      bool const is_last = it == lines.end();
+      auto const line_indent = indent(line);
+      // A blank line only counts when it is the last one.
+      if (line_indent == line.size() && !is_last)
+        continue;
+      if (line_indent == 0)
+        return str;
+      if (line_indent < min_indent)
+        min_indent = line_indent;
+    }
+
+    // The last line always takes part above, so min_indent is a real,
+    // non-zero value here.
+    trim_tmp_.clear();
+    trim_tmp_.reserve(str.size());
+    bool first = true;
+    for (std::string_view const line : lines) {
+      if (!first)
+        trim_tmp_.push_back('\n');
+      first = false;
+      // Non-blank lines are at least min_indent long, blank ones are
+      // dropped to empty.
+      if (indent(line) < line.size())
+        trim_tmp_.append(line.substr(min_indent));
+    }
+    return trim_tmp_;
+  }
+
   std::unique_ptr<u8::line::Reader> reader_;
   std::unique_ptr<src::Errors> errors_;
   TokensConfig const config_;
@@ -376,6 +424,7 @@ class TokensImpl : public Tokens {
   std::string line_tmp_;
   Location location_;
   std::string unescape_tmp_;
+  std::string trim_tmp_;
 };
 
 }  // namespace
@@ -387,6 +436,10 @@ std::unique_ptr<Tokens> open(std::unique_ptr<io::Reader> reader,
     case Version::kJava8:
       return std::make_unique<TokensImpl<java_8::TokenMatcher, java_8::Token>>(
           std::move(reader), std::move(errors), config);
+    case Version::kJava21:
+      return std::make_unique<
+          TokensImpl<java_21::TokenMatcher, java_21::Token>>(
+          std::move(reader), std::move(errors), config);
   }
   std::unreachable();
 }
diff --git a/src/java_tokens.hh b/src/java_tokens.hh
index 6fbefcb..c4e27c0 100644
--- a/src/java_tokens.hh
+++ b/src/java_tokens.hh
@@ -25,7 +25,10 @@ struct Token {
     kIdentifier,
 
     // str is keyword, int_value is Keyword index
-    kKeyword,
+    kReservedKeyword,
+
+    // str is keyword, int_value is Keyword index
+    kContextualKeyword,
 
     // str is separator, int_value is Separator index
     kSeparator,
diff --git a/src/java_version.hh b/src/java_version.hh
index 444ae36..4877263 100644
--- a/src/java_version.hh
+++ b/src/java_version.hh
@@ -7,8 +7,9 @@ namespace java {
 
 enum class Version : uint8_t {
   kJava8 = 8,
+  kJava21 = 21,
 
-  kMax = kJava8,
+  kMax = kJava21,
 };
 
 }  // namespace java
diff --git a/test/java_tokens.cc b/test/java_tokens.cc
index 1c69196..cb1ae73 100644
--- a/test/java_tokens.cc
+++ b/test/java_tokens.cc
@@ -29,7 +29,7 @@ TEST_P(JavaTokens, empty_class) {
                            java::TokensConfig{.version = GetParam()});
   auto ret = tokens->read();
   ASSERT_TRUE(ret.has_value());
-  EXPECT_EQ(java::Token::Type::kKeyword, ret->type);
+  EXPECT_EQ(java::Token::Type::kReservedKeyword, ret->type);
   EXPECT_EQ("class", ret->str);
   EXPECT_EQ(1, ret->loc.line);
   EXPECT_EQ(0, ret->loc.column);
@@ -602,5 +602,57 @@ TEST_P(JavaTokens, null) {
   EXPECT_EQ(io::ReadError::Eof, ret.error());
 }
 
+// Tokenizes an assignment of a text block. Versions from 15 and up are
+// expected to return the block as one string literal with the indentation
+// removed; older versions are expected to return an empty string literal
+// followed by an error token.
+TEST_P(JavaTokens, textblock) {
+  auto const version = GetParam();
+  bool const expect_text_block = std::to_underlying(version) >= 15;
+
+  auto input = io::memory(R"(String html = """
+              <html>
+                  <body>
+                      <p>Hello, world</p>
+                  </body>
+              </html>
+              """;)");
+  auto tokens = java::open(std::move(input), make_errors(),
+                           java::TokensConfig{.version = version});
+
+  // Tokens in front of the text block are the same for all versions.
+  struct Expected {
+    java::Token::Type type;
+    char const* str;
+  };
+  Expected const prefix[] = {
+      {java::Token::Type::kIdentifier, "String"},
+      {java::Token::Type::kIdentifier, "html"},
+      {java::Token::Type::kOperator, "="},
+  };
+  for (auto const& expected : prefix) {
+    auto ret = tokens->read();
+    ASSERT_TRUE(ret.has_value());
+    EXPECT_EQ(expected.type, ret->type);
+    EXPECT_EQ(expected.str, ret->str);
+  }
+
+  auto ret = tokens->read();
+  if (!expect_text_block) {
+    ASSERT_TRUE(ret.has_value());
+    EXPECT_EQ(java::Token::Type::kLiteralString, ret->type);
+    EXPECT_EQ("", ret->str);
+    ret = tokens->read();
+    ASSERT_TRUE(ret.has_value());
+    EXPECT_EQ(java::Token::Type::kError, ret->type);
+    return;
+  }
+
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralString, ret->type);
+  EXPECT_EQ(R"(<html>
+    <body>
+        <p>Hello, world</p>
+    </body>
+</html>
+)",
+            ret->str);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kSeparator, ret->type);
+  EXPECT_EQ(";", ret->str);
+  ret = tokens->read();
+  ASSERT_FALSE(ret.has_value());
+  EXPECT_EQ(io::ReadError::Eof, ret.error());
+}
+
 INSTANTIATE_TEST_SUITE_P(AllVersions, JavaTokens,
-                         testing::Values(java::Version::kJava8));
+                         testing::Values(java::Version::kJava8,
+                                         java::Version::kJava21));
author	Joel Klinghed <the_jk@spawned.biz>	2025-09-29 09:39:49 +0200
committer	Joel Klinghed <the_jk@spawned.biz>	2025-09-29 09:50:47 +0200
commit	d196d51e07f50f3510c43ad375c5559b58860023 (patch)
tree	3432b8e99e306d0ece9f29ddad1e2945f88a1481
parent	1e9e51dae1c01bab7562911b958c47528b8011c8 (diff)