5 files changed, 163 insertions, 21 deletions
diff --git a/src/gen_tokens.cc b/src/gen_tokens.cc
index ef0fce7..cc8c06d 100644
--- a/src/gen_tokens.cc
+++ b/src/gen_tokens.cc
@@ -88,7 +88,8 @@ class Generator {
 // Find the Elements that has at least one terminal or character class as symbol
 // These will be the different tokens the tokenizer can return
 void Generator::find_specific_elements(grammar::Element const& root) {
-  if (std::ranges::any_of(root.definitions, [](auto const& definition) {
+  if (root.definitions.empty() ||
+      std::ranges::any_of(root.definitions, [](auto const& definition) {
         return definition.symbols.size() > 1 ||
                definition.symbols[0].type == grammar::Symbol::Type::kTerminal;
       })) {
@@ -233,7 +234,7 @@ void write_character_class_matchers(std::ostream& out,
       << "std::optional<size_t> TokenMatcher::matchLineTerminator"
       << "(std::string_view str) {\n"
       // Tokenizer normally reads one line at a time, there is only
-      // one construct (traditional comment) that needs it.
+      // a pair of constructs (traditional comment, text block) that need it.
       // So match synthetic '\n' or report that it was needed if we are at
       // end of string.
       << "  if (str.empty()) {\n"
@@ -361,6 +362,22 @@ void write_character_class_matchers(std::ostream& out,
 }
 
 std::ostream& quote(std::ostream& out, std::string_view in) {
+  int use_raw_string = 1;
+  for (auto c : in) {
+    if (c == '"' || c == '\\' || c == '\n') {
+      use_raw_string = 2;
+    } else if (c < ' ' || (c & 0x80)) {
+      use_raw_string = 0;
+      break;
+    }
+  }
+  if (use_raw_string == 2) {
+    out << "R\"(";
+    out << in;
+    out << ")\"";
+    return out;
+  }
+
   out << '"';
   bool avoid_digit = false;
   for (auto c : in) {
@@ -569,6 +586,7 @@ bool Generator::write_matcher(std::ostream& out,
     bool have_internal = next_internal;
     next_internal = false;
     ReturnType symbol_return_type = return_type;
+    bool zero_or_more_with_terminal = false;
 
     if (symbol.optional != grammar::Symbol::Optional::kRequired &&
         i + 1 < definition.symbols.size() &&
@@ -613,6 +631,33 @@ bool Generator::write_matcher(std::ostream& out,
         }
         out << indent << "while (true) {\n";
         indent2 += "  ";
+
+        if (i + 1 < definition.symbols.size() &&
+            definition.symbols[i + 1].optional ==
+                grammar::Symbol::Optional::kRequired &&
+            definition.symbols[i + 1].type ==
+                grammar::Symbol::Type::kTerminal) {
+          if (symbol_return_type == return_type) {
+            out << indent2 << "ret = ";
+          } else {
+            out << indent2 << "ret_internal = ";
+          }
+          write_matcher(out, definition.symbols[i + 1], symbol_return_type,
+                        "str.substr(tot)");
+          out << ";\n";
+          if (symbol_return_type != return_type) {
+            out << indent2 << "ret = ret_internal";
+            match_return_type(out, symbol_return_type, "", return_type);
+            out << ";\n";
+          }
+          out << indent2 << "if (ret.has_value()) {\n"
+              << indent2 << "  tot += ret" << size_suffix << ";\n";
+          if (last_internal)
+            out << indent2 << "  last_internal = ret->first;\n";
+          out << indent2 << "  break;\n" << indent2 << "}\n";
+
+          zero_or_more_with_terminal = true;
+        }
         break;
       case grammar::Symbol::Optional::kExcluded:
         std::cerr << "Excluded mixed with conditional\n";
@@ -678,9 +723,15 @@ bool Generator::write_matcher(std::ostream& out,
               << indent2 << "  last_internal = ret->first;\n";
         break;
       case grammar::Symbol::Optional::kZeroOrMore:
-        out << indent2 << "if (!ret.has_value())\n"
-            << indent2 << "  break;\n"
-            << indent2 << "tot += ret" << size_suffix << ";\n";
+        out << indent2 << "if (!ret.has_value())\n";
+        if (zero_or_more_with_terminal) {
+          out << indent2 << "  return ret;\n";
+          // Skip next symbol as it was already used to terminate the loop
+          ++i;
+        } else {
+          out << indent2 << "  break;\n";
+        }
+        out << indent2 << "tot += ret" << size_suffix << ";\n";
         if (last_internal)
           out << indent2 << "last_internal = ret->first;\n";
         out << indent << "}\n";
@@ -742,13 +793,12 @@ bool Generator::write_matcher(std::ostream& out,
   switch (return_type) {
     case ReturnType::kSize:
       out << "[[nodiscard]]\n"
-          << "std::optional<size_t> TokenMatcher::match" << element.name
-          << "(std::string_view str) {\n";
+          << "std::optional<size_t> TokenMatcher::match" << element.name;
       break;
     case ReturnType::kTokenAndSize:
       out << "[[nodiscard]]\n"
           << "std::optional<std::pair<Token, size_t>> TokenMatcher::match"
-          << element.name << "(std::string_view str) {\n";
+          << element.name;
 
       if (specific_tokens_.contains(element.name)) {
         sub_return_type = ReturnType::kSize;
@@ -759,11 +809,19 @@ bool Generator::write_matcher(std::ostream& out,
       out << "[[nodiscard]]\n"
           << "std::optional<std::pair<TokenMatcher::Internal, size_t>> "
              "TokenMatcher::match"
-          << element.name << "(std::string_view str) {\n";
+          << element.name;
       break;
   }
 
-  if (element.definitions.size() == 1) {
+  if (element.definitions.empty()) {
+    out << "(std::string_view /* str */) {\n";
+  } else {
+    out << "(std::string_view str) {\n";
+  }
+
+  if (element.definitions.empty()) {
+    out << "  return std::nullopt;\n";
+  } else if (element.definitions.size() == 1) {
     if (make_token) {
       out << "  auto ret = [this, str]() -> std::optional<size_t> {\n";
       if (!write_matcher(out, element.definitions[0], sub_return_type,
diff --git a/src/grammar.cc b/src/grammar.cc
index 25c4d64..6ed2766 100644
--- a/src/grammar.cc
+++ b/src/grammar.cc
@@ -127,11 +127,6 @@ class GrammarLoader {
     auto it = second_pass_elements.begin();
     for (auto const& pair : first_pass_elements) {
       auto const& element = *it++;
-      if (pair.second.definitions.empty()) {
-        errors_.err(pair.second.loc,
-                    std::format("No definitions for {}", pair.first));
-        continue;
-      }
       std::vector<std::string_view> in_symbols;
       for (auto const& in_definition : pair.second.definitions) {
         str::split(in_definition, in_symbols);
@@ -247,10 +242,42 @@ class GrammarLoader {
           "No root element found");
     }
 
+    optimize(second_pass_elements);
+
     return std::make_unique<GrammarImpl>(std::move(second_pass_elements));
   }
 
  private:
+  static void optimize(std::vector<std::unique_ptr<Element>> const& elements) {
+    merge_terminals(elements);  // currently the only optimization pass
+  }
+
+  // Folds every run of adjacent terminal symbols into one terminal.
+  // Only kRequired symbols are folded: concatenating e.g. two kZeroOrMore
+  // terminals would change what is matched (repeated "a" then repeated
+  // "b" is not the same as repeated "ab").
+  static void merge_terminals(std::vector<std::unique_ptr<Element>> const& elements) {
+    auto const mergeable = [](auto const& symbol) {
+      return symbol.type == Symbol::Type::kTerminal &&
+             symbol.optional == Symbol::Optional::kRequired;
+    };
+    for (auto const& element : elements) {
+      for (auto& definition : element->definitions) {
+        auto& symbols = definition.symbols;
+        auto it = symbols.begin();
+        while (it != symbols.end() && it + 1 != symbols.end()) {
+          if (mergeable(*it) && mergeable(*(it + 1))) {
+            // Stay on `it` so that a longer run keeps folding into it.
+            it->value += (it + 1)->value;
+            symbols.erase(it + 1);
+          } else {
+            ++it;
+          }
+        }
+      }
+    }
+  }
+
   std::unique_ptr<line::Reader> reader_;
   std::vector<std::string> const& character_classes_;
   src::Errors& errors_;
diff --git a/src/java_tokens.cc b/src/java_tokens.cc
index 1ba40a3..42c310b 100644
--- a/src/java_tokens.cc
+++ b/src/java_tokens.cc
@@ -1,6 +1,7 @@
 #include "java_tokens.hh"
 
 #include "errors.hh"
+#include "java_tokens_java-21.hh"
 #include "java_tokens_java-8.hh"
 #include "java_uescape.hh"
 #include "str.hh"
@@ -62,9 +63,13 @@ class TokensImpl : public Tokens {
             break;
           line_tmp_.append(maybe_line.value());
           got_any = true;
-          // Simple check, it might not actually end the comment but if so tokenizer will complain
+          // Simple check; it can stop too early, but if so the tokenizer will complain
           // about reaching line_end again.
-          if (maybe_line->contains("*/"))
+          auto stop = (maybe_token_pair.has_value() &&
+                       maybe_token_pair->first == MatchToken::kStringLiteral)
+                          ? R"(""")"
+                          : "*/";
+          if (maybe_line->contains(stop))
             break;
           line_tmp_.push_back('\n');
         }
@@ -118,8 +123,11 @@ class TokensImpl : public Tokens {
           case MatchToken::kIdentifier:
             token.type = Token::Type::kIdentifier;
             break;
-          case MatchToken::kKeyword:
-            token.type = Token::Type::kKeyword;
+          case MatchToken::kReservedKeyword:
+            token.type = Token::Type::kReservedKeyword;
+            break;
+          case MatchToken::kContextualKeyword:
+            token.type = Token::Type::kContextualKeyword;
             break;
           case MatchToken::kNullLiteral:
             token.type = Token::Type::kLiteralNull;
@@ -138,6 +146,13 @@ class TokensImpl : public Tokens {
             token.str =
                 unescape_if_needed(token.str.substr(1, token.str.size() - 2));
             break;
+          case MatchToken::kTextBlock: {
+            token.type = Token::Type::kLiteralString;
+            auto start = token.str.find('\n', 3) + 1;
+            token.str = unescape_if_needed(trim_indent(
+                token.str.substr(start, token.str.size() - 3 - start)));
+            break;
+          }
           case MatchToken::kTraditionalComment: {
             token.type = Token::Type::kComment;
             size_t s = 2;
@@ -368,6 +383,39 @@ class TokensImpl : public Tokens {
     return count;
   }
 
+  // Count the leading space, tab and form feed characters of str.
+  static size_t indent(std::string_view str) {
+    auto const first_other = str.find_first_not_of(" \t\f");
+    // No other character at all: the whole of str is indentation.
+    return first_other == std::string_view::npos ? str.size()
+                                                 : first_other;
+  }
+
+  // Strips the common indentation from the lines of a text block body.
+  // Blank lines are ignored when finding the minimum, except the last one
+  // (JLS 3.10.6), so an empty line no longer disables stripping.
+  std::string_view trim_indent(std::string_view str) {
+    auto lines = str::split(str, '\n', /* keep_empty */ true);
+    auto min_indent = indent(str.substr(str.rfind('\n') + 1));
+    for (std::string_view line : lines) {
+      // indent(line) == line.size() means the line is blank.
+      if (auto i = indent(line); i < line.size() && i < min_indent)
+        min_indent = i;
+    }
+    if (min_indent == 0)
+      return str;
+    trim_tmp_.clear();
+    trim_tmp_.reserve(str.size());
+    for (std::string_view line : lines) {
+      // Blank lines become empty; the rest lose min_indent characters.
+      auto const blank = indent(line) == line.size();
+      trim_tmp_.append(line, blank ? line.size() : min_indent);
+      trim_tmp_.push_back('\n');
+    }
+    trim_tmp_.resize(trim_tmp_.size() - 1);  // remove last '\n'
+    return trim_tmp_;
+  }
+
   std::unique_ptr<u8::line::Reader> reader_;
   std::unique_ptr<src::Errors> errors_;
   TokensConfig const config_;
@@ -376,6 +424,7 @@ class TokensImpl : public Tokens {
   std::string line_tmp_;
   Location location_;
   std::string unescape_tmp_;
+  std::string trim_tmp_;
 };
 
 }  // namespace
@@ -387,6 +436,10 @@ std::unique_ptr<Tokens> open(std::unique_ptr<io::Reader> reader,
     case Version::kJava8:
       return std::make_unique<TokensImpl<java_8::TokenMatcher, java_8::Token>>(
           std::move(reader), std::move(errors), config);
+    case Version::kJava21:
+      return std::make_unique<
+          TokensImpl<java_21::TokenMatcher, java_21::Token>>(
+          std::move(reader), std::move(errors), config);
   }
   std::unreachable();
 }
diff --git a/src/java_tokens.hh b/src/java_tokens.hh
index 6fbefcb..c4e27c0 100644
--- a/src/java_tokens.hh
+++ b/src/java_tokens.hh
@@ -25,7 +25,10 @@ struct Token {
     kIdentifier,
 
     // str is keyword, int_value is Keyword index
-    kKeyword,
+    kReservedKeyword,
+
+    // str is keyword, int_value is Keyword index
+    kContextualKeyword,
 
     // str is separator, int_value is Separator index
     kSeparator,
diff --git a/src/java_version.hh b/src/java_version.hh
index 444ae36..4877263 100644
--- a/src/java_version.hh
+++ b/src/java_version.hh
@@ -7,8 +7,9 @@ namespace java {
 
 enum class Version : uint8_t {
   kJava8 = 8,
+  kJava21 = 21,
 
-  kMax = kJava8,
+  kMax = kJava21,  // highest version listed above
 };
 
 }  // namespace java