WIPWIP

author: Joel Klinghed <the_jk@spawned.biz> 2025-09-27 20:11:32 +0200
committer: Joel Klinghed <the_jk@spawned.biz> 2025-09-28 22:48:24 +0200
commit: c1ae5d53fb0fa7ceb9d6fc7a60c87df958ce37fe (patch)
tree: f028a04619aa1b69f8b0aa72a5154f6ba1c09775
parent: 2f13baa843bd1fb5db6630a2823681ffaff9fb11 (diff)
21 files changed, 3285 insertions, 18 deletions
diff --git a/data/java-8/tokens.grammar b/data/java-8/tokens.grammar
new file mode 100644
index 0000000..3521ac0
--- /dev/null
+++ b/data/java-8/tokens.grammar
@@ -0,0 +1,385 @@
+InputElement:
+  WhiteSpace
+  Comment
+  Token
+
+Token:
+  Identifier
+  Keyword
+  Literal
+  Separator
+  Operator
+
+Comment:
+  TraditionalComment
+  EndOfLineComment
+
+TraditionalComment:
+  / * CommentTail
+
+CommentTail:
+  * CommentTailStar
+  NotStar CommentTail
+
+CommentTailStar:
+  /
+  * CommentTailStar
+  NotStarNotSlash CommentTail
+
+NotStar:
+  InputCharacter but not *
+  LineTerminator
+
+NotStarNotSlash:
+  InputCharacter but not * or /
+  LineTerminator
+
+EndOfLineComment:
+  / / {InputCharacter}
+
+Identifier:
+  IdentifierChars but not Keyword or BooleanLiteral or NullLiteral
+
+IdentifierChars:
+  JavaLetter {JavaLetterOrDigit}
+
+Keyword:
+  abstract
+  continue
+  for
+  new
+  switch
+  assert
+  default
+  if
+  package
+  synchronized
+  boolean
+  do
+  goto
+  private
+  this
+  break
+  double
+  implements
+  protected
+  throw
+  byte
+  else
+  import
+  public
+  throws
+  case
+  enum
+  instanceof
+  return
+  transient
+  catch
+  extends
+  int
+  short
+  try
+  char
+  final
+  interface
+  static
+  void
+  class
+  finally
+  long
+  strictfp
+  volatile
+  const
+  float
+  native
+  super
+  while
+
+Literal:
+  IntegerLiteral
+  FloatingPointLiteral
+  BooleanLiteral
+  CharacterLiteral
+  StringLiteral
+  NullLiteral
+
+IntegerLiteral:
+  DecimalIntegerLiteral
+  HexIntegerLiteral
+  OctalIntegerLiteral
+  BinaryIntegerLiteral
+
+DecimalIntegerLiteral:
+  DecimalNumeral [IntegerTypeSuffix]
+
+HexIntegerLiteral:
+  HexNumeral [IntegerTypeSuffix]
+
+OctalIntegerLiteral:
+  OctalNumeral [IntegerTypeSuffix]
+
+BinaryIntegerLiteral:
+  BinaryNumeral [IntegerTypeSuffix]
+
+IntegerTypeSuffix:
+  l
+  L
+
+DecimalNumeral:
+  0
+  NonZeroDigit [Digits]
+  NonZeroDigit Underscores Digits
+
+NonZeroDigit:
+  1
+  2
+  3
+  4
+  5
+  6
+  7
+  8
+  9
+
+Digits:
+  Digit
+  Digit [DigitsAndUnderscores] Digit
+
+Digit:
+  0
+  NonZeroDigit
+
+DigitsAndUnderscores:
+  DigitOrUnderscore {DigitOrUnderscore}
+
+DigitOrUnderscore:
+  Digit
+  _
+
+Underscores:
+  _ {_}
+
+HexNumeral:
+  0 x HexDigits
+  0 X HexDigits
+
+HexDigits:
+  HexDigit
+  HexDigit [HexDigitsAndUnderscores] HexDigit
+
+HexDigit:
+  0
+  1
+  2
+  3
+  4
+  5
+  6
+  7
+  8
+  9
+  a
+  b
+  c
+  d
+  e
+  f
+  A
+  B
+  C
+  D
+  E
+  F
+
+HexDigitsAndUnderscores:
+  HexDigitOrUnderscore {HexDigitOrUnderscore}
+
+HexDigitOrUnderscore:
+  HexDigit
+  _
+
+OctalNumeral:
+  0 OctalDigits
+  0 Underscores OctalDigits
+
+OctalDigits:
+  OctalDigit
+  OctalDigit [OctalDigitsAndUnderscores] OctalDigit
+
+OctalDigit:
+  0
+  1
+  2
+  3
+  4
+  5
+  6
+  7
+
+OctalDigitsAndUnderscores:
+  OctalDigitOrUnderscore {OctalDigitOrUnderscore}
+
+OctalDigitOrUnderscore:
+  OctalDigit
+  _
+
+BinaryNumeral:
+  0 b BinaryDigits
+  0 B BinaryDigits
+
+BinaryDigits:
+  BinaryDigit
+  BinaryDigit [BinaryDigitsAndUnderscores] BinaryDigit
+
+BinaryDigit:
+  0
+  1
+
+BinaryDigitsAndUnderscores:
+  BinaryDigitOrUnderscore {BinaryDigitOrUnderscore}
+
+BinaryDigitOrUnderscore:
+  BinaryDigit
+  _
+
+FloatingPointLiteral:
+  DecimalFloatingPointLiteral
+  HexadecimalFloatingPointLiteral
+
+DecimalFloatingPointLiteral:
+  Digits . [Digits] [ExponentPart] [FloatTypeSuffix]
+  . Digits [ExponentPart] [FloatTypeSuffix]
+  Digits ExponentPart [FloatTypeSuffix]
+  Digits [ExponentPart] FloatTypeSuffix
+
+ExponentPart:
+  ExponentIndicator SignedInteger
+
+ExponentIndicator:
+  e
+  E
+
+SignedInteger:
+  [Sign] Digits
+
+Sign:
+  +
+  -
+
+FloatTypeSuffix:
+  f
+  F
+  d
+  D
+
+HexadecimalFloatingPointLiteral:
+  HexSignificand BinaryExponent [FloatTypeSuffix]
+
+HexSignificand:
+  HexNumeral [.]
+  0 x [HexDigits] . HexDigits
+  0 X [HexDigits] . HexDigits
+
+BinaryExponent:
+  BinaryExponentIndicator SignedInteger
+
+BinaryExponentIndicator:
+  p
+  P
+
+BooleanLiteral:
+  true
+  false
+
+CharacterLiteral:
+  ' SingleCharacter '
+  ' EscapeSequence '
+
+SingleCharacter:
+  InputCharacter but not ' or \
+
+StringLiteral:
+  " {StringCharacter} "
+
+StringCharacter:
+  InputCharacter but not " or \
+  EscapeSequence
+
+EscapeSequence:
+  \ b
+  \ t
+  \ n
+  \ f
+  \ r
+  \ "
+  \ '
+  \ \
+  OctalEscape
+
+OctalEscape:
+  \ OctalDigit
+  \ OctalDigit OctalDigit
+  \ ZeroToThree OctalDigit OctalDigit
+
+ZeroToThree:
+  0
+  1
+  2
+  3
+
+NullLiteral:
+  null
+
+Separator:
+  (
+  )
+  {
+  }
+  [
+  ]
+  ;
+  ,
+  .
+  ...
+  @
+  ::
+
+Operator:
+  =
+  >
+  <
+  !
+  ~
+  ?
+  :
+  ->
+  ==
+  >=
+  <=
+  !=
+  &&
+  ||
+  ++
+  --
+  +
+  -
+  *
+  /
+  &
+  |
+  ^
+  %
+  <<
+  >>
+  >>>
+  +=
+  -=
+  *=
+  /=
+  &=
+  |=
+  ^=
+  %=
+  <<=
+  >>=
+  >>>=
diff --git a/meson.build b/meson.build
index 1607088..d1f6908 100644
--- a/meson.build
+++ b/meson.build
@@ -12,6 +12,23 @@ project(
   ],
 )
 
+cpp_flags = []
+cpp_optional_flags = []
+if get_option('buildtype') == 'release'
+  # If asserts are disabled parameters and variables used for only that
+  # end up causing warnings
+  cpp_optional_flags += ['-Wno-unused-parameter', '-Wno-unused-variable',
+                         '-Wno-unused-but-set-variable']
+  cpp_flags += '-DNDEBUG'
+endif
+cpp = meson.get_compiler('cpp')
+foreach flag : cpp_optional_flags
+  if cpp.has_argument(flag)
+    cpp_flags += flag
+  endif
+endforeach
+add_project_arguments([cpp_flags], language: 'cpp')
+
 conf_data = configuration_data()
 conf_data.set('version', meson.project_version())
 configure_file(input: 'src/config.h.in',
@@ -191,6 +208,34 @@ java_uescape_dep = declare_dependency(
   dependencies: [buffer_dep, uio_dep],
 )
 
+src_lib = library(
+  'src',
+  sources: [
+    'src/errors.cc',
+    'src/errors.hh',
+    'src/location.cc',
+    'src/location.hh',
+  ],
+  include_directories: inc,
+)
+src_dep = declare_dependency(
+  link_with: src_lib,
+)
+
+grammar_lib = library(
+  'grammar',
+  sources: [
+    'src/grammar.cc',
+    'src/grammar.hh',
+  ],
+  include_directories: inc,
+  dependencies: [io_dep, src_dep, str_dep],
+)
+grammar_dep = declare_dependency(
+  link_with: grammar_lib,
+  dependencies: [io_dep, src_dep, str_dep],
+)
+
 prefix_tree_lib = library(
   'prefix_tree',
   sources: [
@@ -201,6 +246,69 @@ prefix_tree_lib = library(
 )
 prefix_tree_dep = declare_dependency(link_with: prefix_tree_lib)
 
+gen_tokens = executable(
+  'gen_tokens',
+  sources: [
+    'src/gen_tokens.cc',
+  ],
+  include_directories: inc,
+  install : false,
+  dependencies : [
+    args_dep,
+    grammar_dep,
+    prefix_tree_dep,
+    src_dep,
+  ],
+)
+
+java_versions = [
+  'java-8',
+]
+
+java_unicode_versions = {
+  'java-8': '6.2.0',
+}
+
+java_tokens_sources = []
+foreach java_version : java_versions
+  java_unicode_version = 'u' + java_unicode_versions[java_version].replace('.', '_')
+  java_tokens_sources += custom_target(
+    'gen-tokens-' + java_version,
+    input: ['data/' + java_version + '/tokens.grammar'],
+    output: ['java_tokens_' + java_version + '.hh',
+             'java_tokens_' + java_version + '.cc'],
+    command : [gen_tokens, '--namespace',
+               java_version.replace('.', '_').replace('-', '_'),
+               '--unicode', java_unicode_version,
+               '@INPUT@', '@OUTPUT@'])
+endforeach
+
+java_lib = library(
+  'java',
+  sources: [
+    'src/java_tokens.hh',
+    'src/java_tokens.cc',
+    'src/java_version.hh',
+    java_tokens_sources,
+  ],
+  include_directories: inc,
+  dependencies: [
+    java_uescape_dep,
+    prefix_tree_dep,
+    src_dep,
+    str_dep,
+    uline_dep,
+  ]
+)
+java_dep = declare_dependency(link_with: java_lib,
+                              dependencies: [
+                                java_uescape_dep,
+                                prefix_tree_dep,
+                                src_dep,
+                                str_dep,
+                                uline_dep,
+                              ])
+
 jkc = executable(
   'jkc',
   sources: [
@@ -351,12 +459,23 @@ test('java_uescape', executable(
   ],
 ))
 
+test('java_tokens', executable(
+  'test_java_tokens',
+  sources: ['test/java_tokens.cc'],
+  include_directories: inc,
+  dependencies: [
+    java_dep,
+    test_dependencies,
+  ],
+))
+
 test('prefix_tree', executable(
   'test_prefix_tree',
   sources: ['test/prefix_tree.cc'],
   include_directories: inc,
   dependencies: [
     prefix_tree_dep,
+    str_dep,
     test_dependencies,
   ],
 ))
diff --git a/src/errors.cc b/src/errors.cc
new file mode 100644
index 0000000..bf92c3b
--- /dev/null
+++ b/src/errors.cc
@@ -0,0 +1,116 @@
+#include "errors.hh"
+
+#include "location.hh"
+
+#include <cstdint>
+#include <format>
+#include <iostream>
+#include <memory>
+#include <ostream>
+#include <string>
+#include <string_view>
+#include <utility>
+
+namespace src {
+
+namespace {
+
+class FileErrors : public Errors {
+ public:
+  FileErrors(std::string filename, std::shared_ptr<ErrorsOutput> output)
+      : filename_(std::move(filename)), output_(std::move(output)) {}
+
+  void err(Location loc, std::string_view msg) override {
+    ++errors_;
+    output_->println(std::format("{}:{}:{}: Error {}", filename_, loc.line,
+                                 loc.column, msg));
+  }
+
+  void warn(Location loc, std::string_view msg) override {
+    ++warnings_;
+    output_->println(std::format("{}:{}:{}: Warning {}", filename_, loc.line,
+                                 loc.column, msg));
+  }
+
+#if !defined(NDEBUG)
+  void dbg(Location loc, std::string_view msg) override {
+    output_->println(std::format("{}:{}:{}: Debug {}", filename_, loc.line,
+                                 loc.column, msg));
+  }
+#endif
+
+  [[nodiscard]]
+  uint64_t errors() const override {
+    return errors_;
+  }
+
+  [[nodiscard]]
+  uint64_t warnings() const override {
+    return warnings_;
+  }
+
+ private:
+  std::string const filename_;
+  std::shared_ptr<ErrorsOutput> output_;
+  uint64_t errors_{0};
+  uint64_t warnings_{0};
+};
+
+class IgnoreErrors : public Errors {
+ public:
+  IgnoreErrors() = default;
+
+  void err(Location /* loc */, std::string_view /* msg */) override {}
+  void warn(Location /* loc */, std::string_view /* msg */) override {}
+
+#if !defined(NDEBUG)
+  void dbg(Location /* loc */, std::string_view /* msg */) override {}
+#endif
+
+  [[nodiscard]]
+  uint64_t errors() const override {
+    return 0;
+  }
+
+  [[nodiscard]]
+  uint64_t warnings() const override {
+    return 0;
+  }
+};
+
+class OutputStreamErrorsOutput : public ErrorsOutput {
+ public:
+  explicit OutputStreamErrorsOutput(std::ostream& out) : out_(out) {}
+
+  void println(std::string_view line) override { out_ << line << '\n'; }
+
+ private:
+  std::ostream& out_;
+};
+
+}  // namespace
+
+[[nodiscard]]
+std::unique_ptr<Errors> file_errors(std::string filename,
+                                    std::shared_ptr<ErrorsOutput> output) {
+  if (!output) {
+    static std::shared_ptr<ErrorsOutput> g_stderr_output;
+    // TODO: Make thread-safe when needed
+    if (!g_stderr_output)
+      g_stderr_output = std::make_shared<OutputStreamErrorsOutput>(std::cerr);
+    output = g_stderr_output;
+  }
+  return std::make_unique<FileErrors>(std::move(filename), std::move(output));
+}
+
+[[nodiscard]]
+std::unique_ptr<Errors> ignore_errors() {
+  return std::make_unique<IgnoreErrors>();
+}
+
+[[nodiscard]]
+std::unique_ptr<ErrorsOutput> errors_output_ios(std::ostream& out) {
+  return std::make_unique<OutputStreamErrorsOutput>(out);
+}
+
+}  // namespace src
diff --git a/src/errors.hh b/src/errors.hh
new file mode 100644
index 0000000..d8b3d60
--- /dev/null
+++ b/src/errors.hh
@@ -0,0 +1,64 @@
+#ifndef ERRORS_HH
+#define ERRORS_HH
+
+#include "location.hh"
+
+#include <iosfwd>
+#include <memory>
+#include <string>
+#include <string_view>
+
+namespace src {
+
+class Errors {
+ public:
+  virtual ~Errors() = default;
+
+  virtual void err(Location loc, std::string_view msg) = 0;
+
+  virtual void warn(Location loc, std::string_view msg) = 0;
+
+#if !defined(NDEBUG)
+  virtual void dbg(Location loc, std::string_view msg) = 0;
+#else
+  void dbg(Location, std::string_view) {}
+#endif
+
+  [[nodiscard]]
+  virtual uint64_t errors() const = 0;
+  [[nodiscard]]
+  virtual uint64_t warnings() const = 0;
+
+ protected:
+  Errors() = default;
+
+  Errors(Errors const&) = delete;
+  Errors& operator=(Errors const&) = delete;
+};
+
+class ErrorsOutput {
+ public:
+  virtual ~ErrorsOutput() = default;
+
+  virtual void println(std::string_view line) = 0;
+
+ protected:
+  ErrorsOutput() = default;
+
+  ErrorsOutput(ErrorsOutput const&) = delete;
+  ErrorsOutput& operator=(ErrorsOutput const&) = delete;
+};
+
+[[nodiscard]]
+std::unique_ptr<Errors> file_errors(
+    std::string filename, std::shared_ptr<ErrorsOutput> output = nullptr);
+
+[[nodiscard]]
+std::unique_ptr<Errors> ignore_errors();
+
+[[nodiscard]]
+std::unique_ptr<ErrorsOutput> errors_output_ios(std::ostream& out);
+
+}  // namespace src
+
+#endif  // ERRORS_HH
diff --git a/src/gen_tokens.cc b/src/gen_tokens.cc
new file mode 100644
index 0000000..2442c4f
--- /dev/null
+++ b/src/gen_tokens.cc
@@ -0,0 +1,1019 @@
+#include "args.hh"
+#include "errors.hh"
+#include "grammar.hh"
+#include "io.hh"
+#include "prefix_tree.hh"
+
+#include <algorithm>
+#include <cassert>
+#include <charconv>
+#include <cstddef>
+#include <cstdint>
+#include <fstream>
+#include <iostream>
+#include <set>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+namespace {
+
+enum class CharacterClass : uint8_t {
+  kWhiteSpace = 0,
+  kLineTerminator = 1,
+  kInputCharacter = 2,
+  kJavaLetter = 3,
+  kJavaLetterOrDigit = 4,
+};
+
+std::vector<std::string> const kCharacterClassNames(
+    {"WhiteSpace", "LineTerminator", "InputCharacter", "JavaLetter",
+     "JavaLetterOrDigit"});
+
+enum class ReturnType : uint8_t {
+  kTokenAndSize,
+  kInternalAndSize,
+  kSize,
+};
+
+std::string make_define(std::string_view filename) {
+  std::string ret;
+  ret.reserve(filename.size());
+  for (char c : filename) {
+    if ((c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_') {
+      ret.push_back(c);
+    } else if (c >= 'a' && c <= 'z') {
+      ret.push_back(static_cast<char>(c & ~0x20));
+    } else {
+      ret.push_back('_');
+    }
+  }
+  return ret;
+}
+
+class Generator {
+ public:
+  bool generate(std::string_view header_name, std::string_view source_name,
+                std::string const& ns, std::string const& unicode_version,
+                grammar::Grammar& grammar);
+
+ private:
+  void find_specific_elements(grammar::Element const& root);
+  void find_all_elements(grammar::Element const& root);
+
+  void check_need_last(grammar::Element const& element);
+  bool find_report_last(grammar::Element const& root,
+                        grammar::Element const& match);
+
+  [[nodiscard]]
+  ReturnType get_return_type(grammar::Element const& element) const;
+  [[nodiscard]]
+  ReturnType get_return_type(uint8_t character_class) const;
+
+  void write_matcher(std::ostream& out, grammar::Symbol const& symbol,
+                     ReturnType return_type, std::string_view str_arg);
+  bool write_matcher(std::ostream& out, grammar::Definition const& definition,
+                     ReturnType return_type, std::string_view indent);
+  bool write_matcher(std::ostream& out, grammar::Element const& element,
+                     ReturnType return_type);
+
+  std::set<std::string_view> above_specific_tokens_;
+  std::set<std::string_view> specific_tokens_;
+  std::set<grammar::Element const*> all_elements_;
+  std::set<std::string_view> copy_last_;
+  std::set<std::string_view> report_last_;
+};
+
+// Find the Elements that has at least one terminal or character class as symbol
+// These will be the different tokens the tokenizer can return
+void Generator::find_specific_elements(grammar::Element const& root) {
+  if (std::ranges::any_of(root.definitions, [](auto const& definition) {
+        return definition.symbols.size() > 1 ||
+               definition.symbols[0].type == grammar::Symbol::Type::kTerminal;
+      })) {
+    specific_tokens_.insert(root.name);
+    return;
+  }
+
+  above_specific_tokens_.insert(root.name);
+
+  for (auto const& definition : root.definitions) {
+    for (auto const& symbol : definition.symbols) {
+      switch (symbol.type) {
+        case grammar::Symbol::Type::kNonTerminal:
+          find_specific_elements(*symbol.element);
+          break;
+        case grammar::Symbol::Type::kCharacterClass:
+          specific_tokens_.insert(kCharacterClassNames[symbol.char_class]);
+          break;
+        case grammar::Symbol::Type::kTerminal:
+          std::unreachable();
+      }
+    }
+  }
+}
+
+// Find elements that have definitions that has ZeroOrMore matches with a final condition
+void Generator::check_need_last(grammar::Element const& element) {
+  for (auto const& definition : element.definitions) {
+    if (definition.symbols.size() < 2)
+      continue;
+    if (definition.symbols[definition.symbols.size() - 1].optional ==
+            grammar::Symbol::Optional::kRequired &&
+        definition.symbols[definition.symbols.size() - 1].type ==
+            grammar::Symbol::Type::kNonTerminal &&
+        (definition.symbols[definition.symbols.size() - 2].optional ==
+             grammar::Symbol::Optional::kZeroOrOne ||
+         definition.symbols[definition.symbols.size() - 2].optional ==
+             grammar::Symbol::Optional::kZeroOrMore) &&
+        definition.symbols[definition.symbols.size() - 2].type ==
+            grammar::Symbol::Type::kNonTerminal) {
+      if (!copy_last_.contains(definition.symbols[definition.symbols.size() - 2]
+                                   .element->name)) {
+        find_report_last(
+            *definition.symbols[definition.symbols.size() - 2].element,
+            *definition.symbols[definition.symbols.size() - 1].element);
+      }
+    }
+  }
+}
+
+// Find element that has match as a single definition, if so, return true and insert into
+// report_last_.
+bool Generator::find_report_last(grammar::Element const& root,
+                                 grammar::Element const& match) {
+  if (std::ranges::any_of(root.definitions, [&match](auto const& definition) {
+        return definition.symbols.size() == 1 &&
+               definition.symbols[0].optional ==
+                   grammar::Symbol::Optional::kRequired &&
+               definition.symbols[0].type ==
+                   grammar::Symbol::Type::kNonTerminal &&
+               definition.symbols[0].element == &match;
+      })) {
+    report_last_.insert(root.name);
+    return true;
+  }
+
+  for (auto const& definition : root.definitions) {
+    for (auto const& symbol : definition.symbols) {
+      if (symbol.type == grammar::Symbol::Type::kNonTerminal) {
+        if (!copy_last_.contains(symbol.element->name)) {
+          if (find_report_last(*symbol.element, match)) {
+            copy_last_.insert(root.name);
+            return true;
+          }
+        }
+      }
+    }
+  }
+
+  return false;
+}
+
+ReturnType Generator::get_return_type(grammar::Element const& element) const {
+  if (above_specific_tokens_.contains(element.name) ||
+      specific_tokens_.contains(element.name))
+    return ReturnType::kTokenAndSize;
+  if (copy_last_.contains(element.name) || report_last_.contains(element.name))
+    return ReturnType::kInternalAndSize;
+  return ReturnType::kSize;
+}
+
+ReturnType Generator::get_return_type(uint8_t character_class) const {
+  auto const& name = kCharacterClassNames[character_class];
+  if (above_specific_tokens_.contains(name) || specific_tokens_.contains(name))
+    return ReturnType::kTokenAndSize;
+  return ReturnType::kSize;
+}
+
+void Generator::find_all_elements(grammar::Element const& root) {
+  auto pair = all_elements_.insert(&root);
+  if (!pair.second)
+    return;
+
+  for (auto const& definition : root.definitions) {
+    for (auto const& symbol : definition.symbols) {
+      switch (symbol.type) {
+        case grammar::Symbol::Type::kNonTerminal:
+          find_all_elements(*symbol.element);
+          break;
+        case grammar::Symbol::Type::kCharacterClass:
+        case grammar::Symbol::Type::kTerminal:
+          break;
+      }
+    }
+  }
+}
+
+void write_character_class_matchers(std::ostream& out,
+                                    std::string_view unicode_version) {
+  out << "[[nodiscard]]\n"
+      << "inline std::optional<size_t> matchLineTerminator"
+      << "(std::string_view str) {\n"
+      // Tokenizer reads one line at a time,
+      // so line terminator matches end of line.
+      << "  return str.empty() ? std::make_optional<size_t>(0)"
+      << " : std::nullopt;\n"
+      << "}\n"
+      << "\n";
+
+  out << "[[nodiscard]]\n"
+      << "inline std::optional<size_t> matchInputCharacter"
+      << "(std::string_view str) {\n"
+      << "  if (str.empty())\n"
+      << "    return std::nullopt;\n"
+      // UnicodeInputCharacter but not CR or LF
+      << "  auto* const start = reinterpret_cast<uint8_t const*>(str.data());\n"
+      << "  auto* ptr = start;\n"
+      << "  u8::skip(ptr, start + str.size());\n"
+      << "  return ptr - start;\n"
+      << "}\n"
+      << "\n";
+
+  out << "[[nodiscard]]\n"
+      << "inline std::optional<std::pair<Token, size_t>> matchWhiteSpace"
+      << "(std::string_view str) {\n"
+      // LineTerminator
+      << "  if (auto ret = matchLineTerminator(str); ret.has_value())\n"
+      << "    return std::make_pair(Token::kWhiteSpace, ret.value());\n"
+      << "  switch (str.front()) {\n"
+      // the ASCII SP character, also known as "space"
+      << "    case ' ':\n"
+      // the ASCII HT character, also known as "horizontal tab"
+      << "    case '\\t':\n"
+      // the ASCII FF character, also known as "form feed"
+      << "    case '\\f':\n"
+      << "      return std::make_pair(Token::kWhiteSpace, 1);\n"
+      << "    default:\n"
+      << "      return std::nullopt;\n"
+      << "  }\n"
+      << "}\n"
+      << "\n";
+
+  out << "[[nodiscard]]\n"
+      << "inline std::optional<size_t> matchJavaLetter"
+      << "(std::string_view str) {\n"
+      << "  auto* const start = reinterpret_cast<uint8_t const*>(str.data());\n"
+      << "  auto* ptr = start;\n"
+      << "  auto code = u8::read(ptr, ptr + str.size());\n"
+      << "  if (!code.has_value())\n"
+      << "    return std::nullopt;\n"
+      // any Unicode character that is a "Java letter"
+      // A "Java letter" is a character for which the method Character.isJavaIdentifierStart(int) returns true.
+      // A character may start a Java identifier if and only if one of the following conditions is true:
+      //   isLetter(codePoint) returns true
+      //   getType(codePoint) returns LETTER_NUMBER
+      //   the referenced character is a currency symbol (such as '$')
+      //   the referenced character is a connecting punctuation character (such as '_').
+      << "  switch (u::lookup_gc(code.value(), u::Version::" << unicode_version
+      << ")) {\n"
+      << "    case u::GeneralCategory::LETTER_UPPERCASE:\n"
+      << "    case u::GeneralCategory::LETTER_LOWERCASE:\n"
+      << "    case u::GeneralCategory::LETTER_TITLECASE:\n"
+      << "    case u::GeneralCategory::LETTER_MODIFIER:\n"
+      << "    case u::GeneralCategory::LETTER_OTHER:\n"
+      << "    case u::GeneralCategory::NUMBER_LETTER:\n"
+      << "    case u::GeneralCategory::SYMBOL_CURRENCY:\n"
+      << "    case u::GeneralCategory::PUNCTUATION_CONNECTOR:\n"
+      << "      return ptr - start;\n"
+      << "    default:\n"
+      << "      return std::nullopt;\n"
+      << "  }\n"
+      << "}\n"
+      << "\n";
+
+  out << "[[nodiscard]]\n"
+      << "inline std::optional<size_t> matchJavaLetterOrDigit"
+      << "(std::string_view str) {\n"
+      << "  auto* const start = reinterpret_cast<uint8_t const*>(str.data());\n"
+      << "  auto* ptr = start;\n"
+      << "  auto code = u8::read(ptr, ptr + str.size());\n"
+      << "  if (!code.has_value())\n"
+      << "    return std::nullopt;\n"
+      // any Unicode character that is a "Java letter-or-digit"
+      // A "Java letter-or-digit" is a character for which the method Character.isJavaIdentifierPart(int) returns true.
+      // A character may be part of a Java identifier if any of the following conditions are true:
+      //   it is a letter
+      //   it is a currency symbol (such as '$')
+      //   it is a connecting punctuation character (such as '_')
+      //   it is a digit
+      //   it is a numeric letter (such as a Roman numeral character)
+      //   it is a combining mark
+      //   it is a non-spacing mark
+      //   isIdentifierIgnorable returns true for the character
+      << "  switch (u::lookup_gc(code.value(), u::Version::" << unicode_version
+      << ")) {\n"
+      << "    case u::GeneralCategory::LETTER_UPPERCASE:\n"
+      << "    case u::GeneralCategory::LETTER_LOWERCASE:\n"
+      << "    case u::GeneralCategory::LETTER_TITLECASE:\n"
+      << "    case u::GeneralCategory::LETTER_MODIFIER:\n"
+      << "    case u::GeneralCategory::LETTER_OTHER:\n"
+      << "    case u::GeneralCategory::SYMBOL_CURRENCY:\n"
+      << "    case u::GeneralCategory::PUNCTUATION_CONNECTOR:\n"
+      << "    case u::GeneralCategory::NUMBER_DIGIT:\n"
+      << "    case u::GeneralCategory::NUMBER_LETTER:\n"
+      << "    case u::GeneralCategory::MARK_SPACING_COMBINING:\n"
+      << "    case u::GeneralCategory::MARK_NONSPACING:\n"
+      << "    case u::GeneralCategory::OTHER_FORMAT:\n"
+      << "      return ptr - start;\n"
+      << "    case u::GeneralCategory::OTHER_CONTROL:\n"
+      << "      if ((/* code.value() >= 0 && */ code.value() <= 8) ||\n"
+      << "          (code.value() >= 0xe && code.value() <= 0x1b) ||\n"
+      << "          (code.value() >= 0x7f && code.value() <= 0x9f))\n"
+      << "        return 1;\n"
+      << "      break;\n"
+      << "    default:\n"
+      << "      break;\n"
+      << "  }\n"
+      << "  return std::nullopt;\n"
+      << "}\n"
+      << "\n";
+}
+
+std::ostream& quote(std::ostream& out, std::string_view in) {
+  out << '"';
+  bool avoid_digit = false;
+  for (auto c : in) {
+    if (c == '"' || c == '\\') {
+      out << '\\';
+    } else if (c < ' ' || (c & 0x80)) {
+      char tmp[4];
+      std::to_chars(tmp, tmp + sizeof(tmp), c & 0xff, 8).ptr[0] = 0;
+      out << "\\" << tmp;
+      avoid_digit = true;
+      continue;
+    } else if (avoid_digit) {
+      if (c >= '0' && c <= '7') {
+        out << "\" \"";
+      }
+    }
+    avoid_digit = false;
+    out << c;
+  }
+  out << '"';
+  return out;
+}
+
+void match_return_type(std::ostream& out, ReturnType in_return_type,
+                       std::string_view in_name, ReturnType out_return_type) {
+  switch (out_return_type) {
+    case ReturnType::kTokenAndSize:
+      switch (in_return_type) {
+        case ReturnType::kTokenAndSize:
+          break;
+        case ReturnType::kInternalAndSize:
+          out << ".transform([](auto pair) { return std::make_pair(Token::k"
+              << in_name << ", pair.second); }";
+          break;
+        case ReturnType::kSize:
+          out << ".transform([](auto size) { return std::make_pair(Token::k"
+              << in_name << ", size); })";
+          break;
+      }
+      break;
+    case ReturnType::kInternalAndSize:
+      switch (in_return_type) {
+        case ReturnType::kTokenAndSize:
+          out << ".transform([](auto pair) { return std::make_pair(Internal::k"
+              << in_name << ", pair.second); }";
+          break;
+        case ReturnType::kInternalAndSize:
+          break;
+        case ReturnType::kSize:
+          if (in_name.empty()) {
+            out << ".transform([](auto size) { return "
+                   "std::make_pair(Internal::UNDEFINED, size); })";
+          } else {
+            out << ".transform([](auto size) { return "
+                   "std::make_pair(Internal::k"
+                << in_name << ", size); })";
+          }
+          break;
+      }
+      break;
+    case ReturnType::kSize:
+      if (in_return_type != ReturnType::kSize) {
+        out << ".transform([](auto pair) { return pair.second; })";
+      }
+      break;
+  }
+}
+
+void Generator::write_matcher(std::ostream& out, grammar::Symbol const& symbol,
+                              ReturnType return_type,
+                              std::string_view str_arg) {
+  std::string_view in_name;
+  ReturnType in_return_type;
+
+  switch (symbol.type) {
+    case grammar::Symbol::Type::kTerminal:
+      in_return_type = ReturnType::kSize;
+      out << "(" << str_arg << ".starts_with(";
+      quote(out, symbol.value);
+      out << ") ? std::make_optional<size_t>(" << symbol.value.size()
+          << ") : " << "std::nullopt)";
+      break;
+    case grammar::Symbol::Type::kNonTerminal:
+      out << "match" << symbol.element->name << "(" << str_arg << ")";
+      in_return_type = get_return_type(*symbol.element);
+      in_name = symbol.element->name;
+      break;
+    case grammar::Symbol::Type::kCharacterClass:
+      out << "match" << kCharacterClassNames[symbol.char_class] << "("
+          << str_arg << ")";
+      in_return_type = get_return_type(symbol.char_class);
+      in_name = kCharacterClassNames[symbol.char_class];
+      break;
+  }
+
+  match_return_type(out, in_return_type, in_name, return_type);
+}
+
+bool Generator::write_matcher(std::ostream& out,
+                              grammar::Definition const& definition,
+                              ReturnType return_type, std::string_view indent) {
+  if (definition.symbols.size() == 1 &&
+      definition.symbols[0].optional == grammar::Symbol::Optional::kRequired) {
+    out << indent << "return ";
+    write_matcher(out, definition.symbols[0], return_type, "str");
+    out << ";\n";
+    return true;
+  }
+
+  std::string_view size_suffix;
+  switch (return_type) {
+    case ReturnType::kTokenAndSize:
+    case ReturnType::kInternalAndSize:
+      size_suffix = "->second";
+      break;
+    case ReturnType::kSize:
+      size_suffix = ".value()";
+      break;
+  }
+
+  if (definition.symbols.size() > 1 &&
+      definition.symbols[0].optional == grammar::Symbol::Optional::kRequired &&
+      definition.symbols[1].optional == grammar::Symbol::Optional::kExcluded) {
+    bool first = true;
+    for (auto const& symbol : definition.symbols) {
+      if (first) {
+        out << indent << "auto first_ret = ";
+        write_matcher(out, symbol, return_type, "str");
+        out << ";\n"
+            << indent << "if (!first_ret.has_value())\n"
+            << indent << "  return first_ret;\n"
+            << indent << "std::optional<size_t> ret;\n"
+            << indent << "auto tmp = str.substr(0, first_ret" << size_suffix
+            << ");\n";
+        first = false;
+      } else {
+        if (symbol.optional != grammar::Symbol::Optional::kExcluded) {
+          std::cerr << "Non-excluded after at least one excluded\n";
+          return false;
+        }
+        out << indent << "ret = ";
+        write_matcher(out, symbol, ReturnType::kSize, "tmp");
+        out << ";\n"
+            << indent << "if (ret.has_value() && ret.value() == tmp.size())\n"
+            << indent << "  return std::nullopt;\n";
+      }
+    }
+    out << indent << "return first_ret;\n";
+    return true;
+  }
+
+  if (std::ranges::all_of(definition.symbols, [](auto const& symbol) {
+        return symbol.optional == grammar::Symbol::Optional::kRequired;
+      })) {
+    out << indent << "size_t tot = 0;\n";
+    bool first = true;
+    for (auto const& symbol : definition.symbols) {
+      std::string indent2(indent);
+      if (first) {
+        out << indent2 << "auto ret = ";
+        write_matcher(out, symbol, return_type, "str");
+        out << ";\n";
+        first = false;
+      } else {
+        out << indent2 << "ret = ";
+        write_matcher(out, symbol, return_type, "str.substr(tot)");
+        out << ";\n";
+      }
+      out << indent2 << "if (!ret.has_value())\n"
+          << indent2 << "  return ret;\n";
+      out << indent2 << "tot += ret" << size_suffix << ";\n";
+    }
+    switch (return_type) {
+      case ReturnType::kInternalAndSize:
+        // Return last internal
+        out << indent << "return std::make_pair(ret->first, tot);\n";
+        break;
+      case ReturnType::kTokenAndSize:
+        std::cerr << "Unable to return token and size\n";
+        return false;
+      case ReturnType::kSize:
+        out << indent << "return tot;\n";
+        break;
+    }
+    return true;
+  }
+
+  out << indent << "size_t tot = 0;\n";
+  bool last_internal = false;
+  switch (return_type) {
+    case ReturnType::kInternalAndSize:
+      last_internal = true;
+      out << indent << "std::optional<Internal> last_internal;\n";
+      break;
+    case ReturnType::kTokenAndSize:
+    case ReturnType::kSize:
+      break;
+  }
+  bool at_least_one_required = false;
+  bool first = true;
+  bool first_internal = true;
+  bool next_internal = false;
+  for (size_t i = 0; i < definition.symbols.size(); ++i) {
+    auto const& symbol = definition.symbols[i];
+    std::string indent2(indent);
+    bool have_internal = next_internal;
+    next_internal = false;
+    ReturnType symbol_return_type = return_type;
+
+    if (symbol.optional != grammar::Symbol::Optional::kRequired &&
+        i + 1 < definition.symbols.size() &&
+        definition.symbols[i + 1].optional ==
+            grammar::Symbol::Optional::kRequired &&
+        definition.symbols[i + 1].type == grammar::Symbol::Type::kNonTerminal) {
+      symbol_return_type = ReturnType::kInternalAndSize;
+      next_internal = true;
+    }
+
+    switch (symbol_return_type) {
+      case ReturnType::kTokenAndSize:
+      case ReturnType::kInternalAndSize:
+        size_suffix = "->second";
+        break;
+      case ReturnType::kSize:
+        size_suffix = ".value()";
+        break;
+    }
+
+    switch (symbol.optional) {
+      case grammar::Symbol::Optional::kRequired:
+        at_least_one_required = true;
+        break;
+      case grammar::Symbol::Optional::kZeroOrOne:
+        break;
+      case grammar::Symbol::Optional::kZeroOrMore:
+        if (first) {
+          switch (symbol_return_type) {
+            case ReturnType::kTokenAndSize:
+              out << indent << "std::optional<std::pair<Token, size_t>> ret;\n";
+              break;
+            case ReturnType::kInternalAndSize:
+              out << indent
+                  << "std::optional<std::pair<Internal, size_t>> ret;\n";
+              break;
+            case ReturnType::kSize:
+              out << indent << "std::optional<size_t> ret;\n";
+              break;
+          }
+          first = false;
+        }
+        out << indent << "while (true) {\n";
+        indent2 += "  ";
+        break;
+      case grammar::Symbol::Optional::kExcluded:
+        std::cerr << "Excluded mixed with conditional\n";
+        return false;
+    }
+    if (symbol_return_type == return_type) {
+      if (first) {
+        out << indent2 << "auto ret = ";
+        write_matcher(out, symbol, symbol_return_type, "str");
+        first = false;
+      } else {
+        out << indent2 << "ret = ";
+        write_matcher(out, symbol, symbol_return_type, "str.substr(tot)");
+      }
+      out << ";\n";
+    } else {
+      if (first_internal) {
+        out << indent2 << "auto ret_internal = ";
+        write_matcher(out, symbol, symbol_return_type,
+                      first ? "str" : "str.substr(tot)");
+        first_internal = false;
+      } else {
+        out << indent2 << "ret_internal = ";
+        write_matcher(out, symbol, symbol_return_type, "str.substr(tot)");
+      }
+      out << ";\n";
+      if (first) {
+        out << indent2 << "auto ret = ret_internal";
+        first = false;
+      } else {
+        out << indent2 << "ret = ret_internal";
+      }
+      match_return_type(out, symbol_return_type, "", return_type);
+      out << ";\n";
+    }
+    switch (symbol.optional) {
+      case grammar::Symbol::Optional::kRequired:
+        out << indent2 << "if (!ret.has_value()) {\n";
+        if (have_internal &&
+            symbol.type == grammar::Symbol::Type::kNonTerminal) {
+          out << indent2
+              << "  if (!ret_internal.has_value() || ret_internal->first != "
+                 "Internal::k"
+              << symbol.element->name << ")\n"
+              << indent2 << "    return ret;\n";
+        } else {
+          out << indent2 << "  return ret;\n";
+        }
+        out << indent2 << "} else {\n"
+            << indent2 << "  tot += ret" << size_suffix << ";\n";
+        if (last_internal)
+          out << indent2 << "  last_internal = ret->first;\n";
+        out << indent2 << "}\n";
+        break;
+      case grammar::Symbol::Optional::kZeroOrOne:
+        if (symbol_return_type == ReturnType::kTokenAndSize) {
+          out << indent2 << "tot += ret.has_value() ? ret->second : 0;\n";
+        } else {
+          out << indent2 << "tot += ret.value_or(0);\n";
+        }
+        if (last_internal)
+          out << indent2 << "if (ret.has_value())\n"
+              << indent2 << "  last_internal = ret->first;\n";
+        break;
+      case grammar::Symbol::Optional::kZeroOrMore:
+        out << indent2 << "if (!ret.has_value())\n"
+            << indent2 << "  break;\n"
+            << indent2 << "tot += ret" << size_suffix << ";\n";
+        if (last_internal)
+          out << indent2 << "last_internal = ret->first;\n";
+        out << indent << "}\n";
+        break;
+      case grammar::Symbol::Optional::kExcluded:
+        assert(false);
+        break;
+    }
+  }
+  switch (return_type) {
+    case ReturnType::kInternalAndSize:
+      // Return last internal
+      if (at_least_one_required) {
+        out << indent << "return std::make_pair(last_internal.value(), tot);\n";
+      } else {
+        out << indent << "if (last_internal.has_value())\n"
+            << indent
+            << "  return std::make_pair(last_internal.value(), tot);\n"
+            << indent << "return std::make_pair(Internal::UNDEFINED, tot);\n";
+      }
+      break;
+    case ReturnType::kTokenAndSize:
+      std::cerr << "Unable to return token and size\n";
+      return false;
+    case ReturnType::kSize:
+      out << indent << "return tot;\n";
+      break;
+  }
+  return true;
+}
+
+void declare_matcher(std::ostream& out, grammar::Element const& element,
+                     ReturnType return_type) {
+  switch (return_type) {
+    case ReturnType::kSize:
+      out << "[[nodiscard]]\n"
+          << "inline std::optional<size_t> match" << element.name
+          << "(std::string_view str);\n";
+      break;
+    case ReturnType::kTokenAndSize:
+      out << "[[nodiscard]]\n"
+          << "inline std::optional<std::pair<Token, size_t>> match"
+          << element.name << "(std::string_view str);\n";
+      break;
+    case ReturnType::kInternalAndSize:
+      out << "[[nodiscard]]\n"
+          << "inline std::optional<std::pair<Internal, size_t>> match"
+          << element.name << "(std::string_view str);\n";
+      break;
+  }
+}
+
+bool Generator::write_matcher(std::ostream& out,
+                              grammar::Element const& element,
+                              ReturnType return_type) {
+  ReturnType sub_return_type = return_type;
+  bool make_token = false;
+
+  switch (return_type) {
+    case ReturnType::kSize:
+      out << "[[nodiscard]]\n"
+          << "inline std::optional<size_t> match" << element.name
+          << "(std::string_view str) {\n";
+      break;
+    case ReturnType::kTokenAndSize:
+      out << "[[nodiscard]]\n"
+          << "inline std::optional<std::pair<Token, size_t>> match"
+          << element.name << "(std::string_view str) {\n";
+
+      if (specific_tokens_.contains(element.name)) {
+        sub_return_type = ReturnType::kSize;
+        make_token = true;
+      }
+      break;
+    case ReturnType::kInternalAndSize:
+      out << "[[nodiscard]]\n"
+          << "inline std::optional<std::pair<Internal, size_t>> match"
+          << element.name << "(std::string_view str) {\n";
+      break;
+  }
+
+  if (element.definitions.size() == 1) {
+    if (make_token) {
+      out << "  auto ret = [str]() -> std::optional<size_t> {\n";
+      if (!write_matcher(out, element.definitions[0], sub_return_type,
+                         "    ")) {
+        std::cerr << "Error in " << element.name << "\n";
+        return false;
+      }
+      out << "  }();\n"
+          << "  return ret.transform([](auto size) {\n"
+          << "    return std::make_pair(Token::k" << element.name
+          << ", size); });\n";
+    } else {
+      if (!write_matcher(out, element.definitions[0], sub_return_type, "  ")) {
+        std::cerr << "Error in " << element.name << "\n";
+        return false;
+      }
+    }
+  } else if (std::ranges::all_of(
+                 element.definitions, [](auto const& definition) {
+                   return definition.symbols.size() == 1 &&
+                          definition.symbols[0].optional ==
+                              grammar::Symbol::Optional::kRequired &&
+                          definition.symbols[0].type ==
+                              grammar::Symbol::Type::kTerminal;
+                 })) {
+    if (std::ranges::all_of(element.definitions, [](auto const& definition) {
+          return definition.symbols[0].value.size() == 1;
+        })) {
+      out << "  if (!str.empty()) {\n"
+          << "    switch (str.front()) {\n";
+      for (auto const& definition : element.definitions) {
+        out << "      case '" << definition.symbols[0].value[0] << "':\n";
+      }
+      out << "        return 1;\n"
+          << "      default:\n"
+          << "        break;\n"
+          << "    }\n"
+          << "  }\n"
+          << "  return std::nullopt;\n";
+    } else {
+      auto builder = prefix_tree::builder();
+      for (auto const& definition : element.definitions) {
+        builder->add(definition.symbols[0].value);
+      }
+      auto tree = builder->build();
+      if (!tree.has_value()) {
+        std::cerr << "To large prefix tree\n";
+        return false;
+      }
+      out << "  static const auto tree = ";
+      quote(out, tree.value()) << "sv;\n";
+      out << "  return prefix_tree::lookup(tree, str)";
+      if (make_token) {
+        out << ".transform([](auto size) {\n"
+            << "  return std::make_pair(Token::k" << element.name
+            << ", size); })";
+      }
+      out << ";\n";
+    }
+  } else {
+    bool first = true;
+    std::string_view ret_type;
+    switch (sub_return_type) {
+      case ReturnType::kTokenAndSize:
+        ret_type = "std::optional<std::pair<Token, size_t>>";
+        break;
+      case ReturnType::kInternalAndSize:
+        ret_type = "std::optional<std::pair<Internal, size_t>>";
+        break;
+      case ReturnType::kSize:
+        ret_type = "std::optional<size_t>";
+        break;
+    }
+    for (auto const& definition : element.definitions) {
+      if (first) {
+        first = false;
+        out << "  auto tmp = [str]() -> " << ret_type << " {\n";
+        if (!write_matcher(out, definition, sub_return_type, "    ")) {
+          std::cerr << "Error in " << element.name << "\n";
+          return false;
+        }
+        out << "  }();\n";
+        out << "  auto ret = tmp;\n";
+      } else {
+        out << "  tmp = [str]() -> " << ret_type << " {\n";
+        if (!write_matcher(out, definition, sub_return_type, "    ")) {
+          std::cerr << "Error in " << element.name << "\n";
+          return false;
+        }
+        out << "  }();\n"
+            << "  if (tmp.has_value()) {\n";
+        if (sub_return_type == ReturnType::kTokenAndSize) {
+          out << "    if (!ret.has_value() || ret.value().second < "
+                 "tmp.value().second) {\n";
+        } else {
+          out << "    if (!ret.has_value() || ret.value() < tmp.value()) {\n";
+        }
+        out << "      ret = tmp;\n"
+            << "    }\n"
+            << "  }\n";
+      }
+    }
+    if (make_token) {
+      out << "  return ret.transform([](auto size) {\n"
+          << "    return std::make_pair(Token::k" << element.name
+          << ", size); });\n";
+    } else {
+      out << "  return ret;\n";
+    }
+  }
+
+  out << "}\n"
+      << "\n";
+  return true;
+}
+
+bool Generator::generate(std::string_view header_name,
+                         std::string_view source_name, std::string const& ns,
+                         std::string const& unicode_version,
+                         grammar::Grammar& grammar) {
+  std::fstream header{std::string(header_name),
+                      std::fstream::trunc | std::fstream::out};
+  std::fstream source{std::string(source_name),
+                      std::fstream::trunc | std::fstream::out};
+
+  auto header_guard = make_define(header_name);
+
+  header << "#ifndef " << header_guard << "\n"
+         << "#define " << header_guard << "\n"
+         << "\n"
+         << "#include \"prefix_tree.hh\"\n"
+         << "\n"
+         << "#include <cstddef>\n"
+         << "#include <cstdint>\n"
+         << "#include <optional>\n"
+         << "#include <string_view>\n"
+         << "#include <utility>\n"
+         << "\n"
+         << "namespace " << ns << " {\n"
+         << "\n";
+
+  find_specific_elements(grammar.root());
+
+  find_all_elements(grammar.root());
+
+  for (auto& element : all_elements_) {
+    check_need_last(*element);
+  }
+
+  header << "enum class Token : "
+         << (specific_tokens_.size() < 256 ? "uint8_t" : "uint16_t") << " {\n";
+  for (auto const& token : specific_tokens_) {
+    header << "  k" << token << ",\n";
+  }
+  header << "};\n";
+
+  header << "\n"
+         << "[[nodiscard]]\n"
+         << "std::optional<std::pair<Token, size_t>>"
+         << " matchNext(std::string_view str);\n"
+         << "\n"
+         << "}  // " << ns << "\n"
+         << "\n"
+         << "#endif  // " << header_guard << "\n";
+
+  source << "#include \"" << header_name << "\"\n"
+         << "\n"
+         << "#include \"prefix_tree.hh\"\n"
+         << "#include \"u.hh\"\n"
+         << "#include \"u8.hh\"\n"
+         << "\n"
+         << "#include <cstddef>\n"
+         << "#include <optional>\n"
+         << "#include <string_view>\n"
+         << "#include <utility>\n"
+         << "\n"
+         << "using namespace std::literals::string_view_literals;\n"
+         << "\n"
+         << "// NOLINTBEGIN(readability-else-after-return)\n"
+         << "\n"
+         << "namespace " << ns << " {\n";
+
+  source << "namespace {\n"
+         << "\n";
+
+  write_character_class_matchers(source, unicode_version);
+
+  source << "\n"
+         << "enum class Internal : "
+         << (all_elements_.size() < 256 ? "uint8_t" : "uint16_t") << " {\n"
+         << "  UNDEFINED,\n";
+  for (auto* element : all_elements_) {
+    source << "  k" << element->name << ",\n";
+  }
+  source << "};\n"
+         << "\n";
+
+  for (auto* element : all_elements_) {
+    declare_matcher(source, *element, get_return_type(*element));
+  }
+
+  if (std::ranges::any_of(all_elements_, [this, &source](auto* element) {
+        auto sts = get_return_type(*element);
+        return !write_matcher(source, *element, sts);
+      })) {
+    return false;
+  }
+
+  source << "\n"
+         << "}  // namespace\n";
+
+  source << "\n"
+         << "std::optional<std::pair<Token, size_t>>"
+         << "matchNext(std::string_view str) {\n"
+         << "  return match" << grammar.root().name << "(str);\n"
+         << "}"
+         << "\n"
+         << "}  // namespace " << ns << "\n"
+         << "\n"
+         << "// NOLINTEND(readability-else-after-return)\n"
+         << "\n";
+
+  return true;
+}
+
+}  // namespace
+
+int main(int argc, char** argv) {
+  auto args = Args::create();
+  auto opt_help = args->option('h', "help", "display this text and exit");
+  auto opt_ns = args->option_argument('\0', "namespace", "ARG",
+                                      "Namespace for tokenizer");
+  auto opt_unicode =
+      args->option_argument('u', "unicode", "ARG", "Unicode version");
+  std::vector<std::string_view> arguments;
+  if (!args->run(argc, argv, &arguments)) {
+    args->print_error(std::cerr);
+    std::cerr << "Try `gen_tokens --help` for usage\n";
+    return 1;
+  }
+  if (opt_help->is_set()) {
+    std::cout << "Usage: `gen_tokens [OPTIONS...] tokens.grammar"
+              << " OUTPUT.hh OUTPUT.cc`\n"
+              << "Generates a tokenizer for grammar.\n"
+              << "\n";
+    args->print_help(std::cout);
+    return 0;
+  }
+  if (!opt_ns->is_set()) {
+    std::cerr << "No namespace given.\n"
+              << "Try `gen_tokens --help` for usage\n";
+    return 1;
+  }
+  if (!opt_unicode->is_set()) {
+    std::cerr << "No unicode version given.\n"
+              << "Try `gen_tokens --help` for usage\n";
+    return 1;
+  }
+  auto ns = opt_ns->argument();
+  auto unicode = opt_unicode->argument();
+  if (arguments.size() != 3) {
+    std::cerr << "Expecting three arguments. No more, no less.\n"
+              << "Try `gen_tokens --help` for usage\n";
+    return 1;
+  }
+
+  auto filename = std::string(arguments[0]);
+  auto reader = io::open(filename);
+  if (!reader.has_value()) {
+    std::cerr << "Unable to open " << filename << '\n';
+    return 1;
+  }
+  auto errors = src::file_errors(std::move(filename));
+  auto grammar =
+      grammar::load(std::move(reader.value()), kCharacterClassNames, *errors);
+  if (!grammar || errors->errors() > 0)
+    return 1;
+
+  Generator generator;
+  if (!generator.generate(arguments[1], arguments[2], ns, unicode, *grammar))
+    return 1;
+  return 0;
+}
diff --git a/src/gen_ugc.cc b/src/gen_ugc.cc
index 2670803..9100946 100644
--- a/src/gen_ugc.cc
+++ b/src/gen_ugc.cc
@@ -28,7 +28,7 @@ std::map<std::string, u::GeneralCategory, std::less<>> str2gc{
   {"Lo", u::GeneralCategory::LETTER_OTHER},
 
   {"Mn", u::GeneralCategory::MARK_NONSPACING},
-  {"Mc", u::GeneralCategory::MARK_SPACING_COMBINDING},
+  {"Mc", u::GeneralCategory::MARK_SPACING_COMBINING},
   {"Me", u::GeneralCategory::MARK_SPACING_ENCLOSING},
 
   {"Nd", u::GeneralCategory::NUMBER_DIGIT},
@@ -277,6 +277,7 @@ int main(int argc, char** argv) {
       args->option_argument('p', "prefix", "ARG", "Prefix for exported method");
   std::vector<std::string_view> arguments;
   if (!args->run(argc, argv, &arguments)) {
+    args->print_error(std::cerr);
     std::cerr << "Try `gen_u --help` for usage\n";
     return 1;
   }
diff --git a/src/grammar.cc b/src/grammar.cc
new file mode 100644
index 0000000..25c4d64
--- /dev/null
+++ b/src/grammar.cc
@@ -0,0 +1,268 @@
+#include "grammar.hh"
+
+#include "errors.hh"
+#include "line.hh"
+#include "location.hh"
+#include "str.hh"
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <format>
+#include <functional>
+#include <map>
+#include <memory>
+#include <optional>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+namespace grammar {
+
+namespace {
+
+class GrammarImpl : public Grammar {
+ public:
+  explicit GrammarImpl(std::vector<std::unique_ptr<Element>> elements)
+      : elements_(std::move(elements)) {}
+
+  [[nodiscard]]
+  Element const& root() const override {
+    return *elements_[0];
+  }
+
+ private:
+  std::vector<std::unique_ptr<Element>> elements_;
+};
+
+struct FirstPassElement {
+  src::Location loc;
+  std::vector<std::string> definitions;
+
+  explicit FirstPassElement(src::Location loc) : loc(loc) {}
+};
+
+class GrammarLoader {
+ public:
+  GrammarLoader(std::unique_ptr<io::Reader> reader,
+                std::vector<std::string> const& character_classes,
+                src::Errors& errors)
+      : reader_(line::open(std::move(reader))),
+        character_classes_(character_classes),
+        errors_(errors) {}
+
+  std::unique_ptr<Grammar> load() {
+    // Read whole file in a first pass, before parsing definitions.
+    std::map<std::string, FirstPassElement> first_pass_elements;
+    FirstPassElement* last_element = nullptr;
+
+    std::map<std::string_view, uint8_t, std::less<>> cc_lookup;
+    for (uint8_t i = 0; i < static_cast<uint8_t>(character_classes_.size());
+         ++i) {
+      cc_lookup.emplace(character_classes_[i], i);
+    }
+
+    while (true) {
+      auto line = reader_->read();
+      auto loc = src::Location(reader_->number(), 0);
+      if (!line.has_value()) {
+        if (line.error() == io::ReadError::Eof)
+          break;
+        errors_.err(loc, "Error reading");
+        return nullptr;
+      }
+      if (line.value().empty() || line.value().front() == '#')
+        continue;
+      if (line.value().front() == ' ') {
+        // Continue on last element
+        size_t i = 1;
+        while (i < line.value().size() && line.value()[i] == ' ')
+          ++i;
+        if (i == line.value().size()) {
+          errors_.err(loc, "Unexpected line, only spaces");
+          continue;
+        }
+        if (last_element == nullptr) {
+          errors_.err(loc, "Expected element before indented lines");
+          continue;
+        }
+        last_element->definitions.emplace_back(line.value().substr(i));
+      } else {
+        // New element
+        if (line.value().back() != ':') {
+          errors_.err(
+              loc,
+              "Unexpected line, not indented but also not ending with a ':'");
+          continue;
+        }
+
+        auto name = line.value().substr(0, line.value().size() - 1);
+        if (cc_lookup.contains(name)) {
+          errors_.warn(
+              loc, std::format("Element {} overrides character class", name));
+        }
+        auto pair = first_pass_elements.emplace(name, FirstPassElement(loc));
+        if (!pair.second) {
+          errors_.err(loc, std::format("Duplicate element {}", name));
+        }
+        last_element = &pair.first->second;
+      }
+    }
+
+    if (first_pass_elements.empty()) {
+      errors_.err(src::Location(reader_->number(), 0), "No elements found");
+      return nullptr;
+    }
+
+    std::vector<std::unique_ptr<Element>> second_pass_elements;
+    std::map<std::string_view, size_t, std::less<>> second_pass_lookup;
+    for (auto const& pair : first_pass_elements) {
+      auto element = std::make_unique<Element>();
+      element->name = pair.first;
+      second_pass_lookup.emplace(element->name, second_pass_elements.size());
+      second_pass_elements.emplace_back(std::move(element));
+    }
+
+    auto it = second_pass_elements.begin();
+    for (auto const& pair : first_pass_elements) {
+      auto const& element = *it++;
+      if (pair.second.definitions.empty()) {
+        errors_.err(pair.second.loc,
+                    std::format("No definitions for {}", pair.first));
+        continue;
+      }
+      std::vector<std::string_view> in_symbols;
+      for (auto const& in_definition : pair.second.definitions) {
+        str::split(in_definition, in_symbols);
+
+        std::vector<Symbol> out_symbols;
+        bool exclude = false;
+        bool expect_not = false;
+        for (auto in_symbol : in_symbols) {
+          Symbol out_symbol;
+          if (exclude) {
+            if (in_symbol == "or")
+              continue;
+            out_symbol.optional = Symbol::Optional::kExcluded;
+          } else if (expect_not) {
+            expect_not = false;
+            if (in_symbol == "not") {
+              exclude = true;
+            } else {
+              errors_.err(pair.second.loc, "but is not followed by not");
+            }
+            continue;
+          }
+          if (in_symbol == "but") {
+            expect_not = true;
+            continue;
+          }
+          if (in_symbol.front() == '{' && in_symbol.back() == '}') {
+            if (exclude) {
+              errors_.err(pair.second.loc,
+                          "Optional and exclude doesn't work together");
+            } else {
+              out_symbol.optional = Symbol::Optional::kZeroOrMore;
+            }
+            in_symbol = in_symbol.substr(1, in_symbol.size() - 2);
+          } else if (in_symbol.front() == '[' && in_symbol.back() == ']') {
+            if (exclude) {
+              errors_.err(pair.second.loc,
+                          "Optional and exclude doesn't work together");
+            } else {
+              out_symbol.optional = Symbol::Optional::kZeroOrOne;
+            }
+            in_symbol = in_symbol.substr(1, in_symbol.size() - 2);
+          }
+          auto it2 = second_pass_lookup.find(in_symbol);
+          if (it2 != second_pass_lookup.end()) {
+            out_symbol.type = Symbol::Type::kNonTerminal;
+            out_symbol.element = second_pass_elements[it2->second].get();
+          } else {
+            auto it3 = cc_lookup.find(in_symbol);
+            if (it3 != cc_lookup.end()) {
+              out_symbol.type = Symbol::Type::kCharacterClass;
+              out_symbol.char_class = it3->second;
+            } else {
+              out_symbol.type = Symbol::Type::kTerminal;
+              out_symbol.value = in_symbol;
+            }
+          }
+          out_symbols.emplace_back(std::move(out_symbol));
+        }
+
+        if (expect_not) {
+          errors_.err(pair.second.loc, "but is not followed by not");
+        }
+
+        if (out_symbols.empty()) {
+          errors_.err(pair.second.loc, "no symbols found in definition");
+          continue;
+        }
+
+        element->definitions.emplace_back(
+            Definition{.symbols = std::move(out_symbols)});
+      }
+    }
+
+    // Find root and move it first (if needed)
+    std::vector<size_t> used(second_pass_elements.size(), 0);
+    for (auto const& element : second_pass_elements) {
+      for (auto const& definition : element->definitions) {
+        for (auto const& symbol : definition.symbols) {
+          switch (symbol.type) {
+            case Symbol::Type::kTerminal:
+            case Symbol::Type::kCharacterClass:
+              break;
+            case Symbol::Type::kNonTerminal:
+              used[second_pass_lookup.find(symbol.element->name)->second]++;
+              break;
+          }
+        }
+      }
+    }
+
+    std::optional<size_t> root_index;
+    for (size_t i = 0; i < used.size(); ++i) {
+      if (used[i] == 0) {
+        if (root_index.has_value()) {
+          errors_.warn(first_pass_elements.find(second_pass_elements[i]->name)
+                           ->second.loc,
+                       "Is not referenced but also not root");
+        } else {
+          root_index = i;
+        }
+      }
+    }
+
+    if (root_index.has_value()) {
+      if (root_index.value() != 0) {
+        std::swap(second_pass_elements[0],
+                  second_pass_elements[root_index.value()]);
+      }
+    } else {
+      errors_.err(
+          first_pass_elements.find(second_pass_elements[0]->name)->second.loc,
+          "No root element found");
+    }
+
+    return std::make_unique<GrammarImpl>(std::move(second_pass_elements));
+  }
+
+ private:
+  std::unique_ptr<line::Reader> reader_;
+  std::vector<std::string> const& character_classes_;
+  src::Errors& errors_;
+};
+
+}  // namespace
+
+std::unique_ptr<Grammar> load(std::unique_ptr<io::Reader> reader,
+                              std::vector<std::string> const& character_classes,
+                              src::Errors& errors) {
+  GrammarLoader loader(std::move(reader), character_classes, errors);
+  return loader.load();
+}
+
+}  // namespace grammar
diff --git a/src/grammar.hh b/src/grammar.hh
new file mode 100644
index 0000000..13beec4
--- /dev/null
+++ b/src/grammar.hh
@@ -0,0 +1,80 @@
+#ifndef GRAMMAR_HH
+#define GRAMMAR_HH
+
+#include "errors.hh"
+#include "io.hh"
+
+#include <cstdint>
+#include <span>
+#include <string>
+#include <vector>
+
+namespace grammar {
+
+struct Element;
+
+struct Symbol {
+  enum class Type : uint8_t {
+    // value == terminal, as UTF-8
+    kTerminal,
+
+    // element != nullptr
+    kNonTerminal,
+
+    // char_class != 0
+    kCharacterClass,
+  };
+
+  Type type;
+
+  enum class Optional : uint8_t {
+    // Symbol is NOT optional.
+    kRequired,
+    // Symbol is optional.
+    kZeroOrOne,
+    // Symbol is optional and can repeat.
+    kZeroOrMore,
+    // Symbol should be excluded from previous symbol.
+    // Example: InputCharacter but not * or /
+    kExcluded,
+  };
+
+  Optional optional{Optional::kRequired};
+
+  uint8_t char_class{0};
+
+  Element const* element{nullptr};
+  std::string value;
+};
+
+struct Definition {
+  std::vector<Symbol> symbols;
+};
+
+struct Element {
+  std::string name;
+
+  std::vector<Definition> definitions;
+};
+
+class Grammar {
+ public:
+  virtual ~Grammar() = default;
+
+  [[nodiscard]]
+  virtual Element const& root() const = 0;
+
+ protected:
+  Grammar() = default;
+
+  Grammar(Grammar const&) = delete;
+  Grammar& operator=(Grammar const&) = delete;
+};
+
+std::unique_ptr<Grammar> load(std::unique_ptr<io::Reader> reader,
+                              std::vector<std::string> const& character_classes,
+                              src::Errors& errors);
+
+}  // namespace grammar
+
+#endif  // GRAMMAR_HH
diff --git a/src/java_tokens.cc b/src/java_tokens.cc
new file mode 100644
index 0000000..59748c1
--- /dev/null
+++ b/src/java_tokens.cc
@@ -0,0 +1,374 @@
+#include "java_tokens.hh"
+
+#include "errors.hh"
+#include "java_tokens_java-8.hh"
+#include "java_uescape.hh"
+#include "str.hh"
+#include "u8.hh"
+#include "uline.hh"
+
+#include <cassert>
+#include <charconv>
+#include <cstddef>
+#include <expected>
+#include <format>
+#include <limits>
+#include <memory>
+#include <optional>
+#include <string>
+#include <string_view>
+#include <system_error>
+#include <utility>
+
+namespace java {
+
+namespace {
+
+template <typename MatchNext, typename MatchToken>
+class TokensImpl : public Tokens {
+ public:
+  TokensImpl(std::unique_ptr<io::Reader> reader,
+             std::unique_ptr<src::Errors> errors, TokensConfig config)
+      : reader_(u8::line::open(u8::java::open(std::move(reader)))),
+        errors_(std::move(errors)),
+        config_(config) {}
+
+  std::expected<Token, io::ReadError> read() override {
+    while (true) {
+      while (line_.empty()) {
+        auto maybe_line = reader_->read();
+        if (!maybe_line.has_value())
+          return std::unexpected(maybe_line.error());
+        line_ = maybe_line.value();
+        location_.line = reader_->number();
+        location_.column = 0;
+      }
+
+      Token token;
+      token.loc = location_;
+
+      auto maybe_token_pair = match_next_(line_);
+      if (maybe_token_pair.has_value()) {
+        token.str = line_.substr(0, maybe_token_pair->second);
+        location_.column += u8len(token.str);
+        line_ = line_.substr(maybe_token_pair->second);
+        switch (maybe_token_pair->first) {
+          case MatchToken::kBinaryIntegerLiteral:
+            handle_int_literal(token, /* base */ 2);
+            break;
+          case MatchToken::kBooleanLiteral:
+            token.type = Token::Type::kLiteralBoolean;
+            token.int_value = token.str != "false";
+            break;
+          case MatchToken::kCharacterLiteral:
+            token.type = Token::Type::kLiteralCharacter;
+            if (token.str[1] == '\\') {
+              token.int_value =
+                  unescape(token.str.substr(1, token.str.size() - 2)).first;
+            } else {
+              auto* ptr =
+                  reinterpret_cast<uint8_t const*>(token.str.data() + 1);
+              auto* end = ptr + token.str.size() - 2;
+              token.int_value = u8::read(ptr, end).value();
+            }
+            break;
+          case MatchToken::kDecimalFloatingPointLiteral:
+            handle_float_literal(token);
+            break;
+          case MatchToken::kDecimalIntegerLiteral:
+            handle_int_literal(token);
+            break;
+          case MatchToken::kEndOfLineComment:
+            token.type = Token::Type::kComment;
+            token.str = str::trim(token.str.substr(2));
+            break;
+          case MatchToken::kHexIntegerLiteral:
+            handle_int_literal(token, /* base */ 16);
+            break;
+          case MatchToken::kHexadecimalFloatingPointLiteral:
+            handle_float_literal(token, /* base */ 16);
+            break;
+          case MatchToken::kIdentifier:
+            token.type = Token::Type::kIdentifier;
+            break;
+          case MatchToken::kKeyword:
+            token.type = Token::Type::kKeyword;
+            break;
+          case MatchToken::kNullLiteral:
+            token.type = Token::Type::kLiteralNull;
+            break;
+          case MatchToken::kOctalIntegerLiteral:
+            handle_int_literal(token, /* base */ 8);
+            break;
+          case MatchToken::kOperator:
+            token.type = Token::Type::kOperator;
+            break;
+          case MatchToken::kSeparator:
+            token.type = Token::Type::kSeparator;
+            break;
+          case MatchToken::kStringLiteral:
+            token.type = Token::Type::kLiteralString;
+            token.str =
+                unescape_if_needed(token.str.substr(1, token.str.size() - 2));
+            break;
+          case MatchToken::kTraditionalComment: {
+            token.type = Token::Type::kComment;
+            size_t s = 2;
+            while (s < token.str.size() && token.str[s] == '*')
+              ++s;
+            token.str =
+                str::trim(token.str.substr(s, token.str.size() - 2 - s));
+            token.int_value = static_cast<int64_t>(s - 1);
+            // TODO: handle multiline
+            break;
+          }
+          case MatchToken::kWhiteSpace:
+            continue;
+        }
+      } else {
+        errors_->err(location_, std::format("Invalid token: {}", line_));
+        token.type = Token::Type::kError;
+        token.str = line_;
+      }
+      return token;
+    }
+  }
+
+ private:
+  void handle_int_literal_error(Token& token, std::string_view str,
+                                std::errc err, int base) {
+    if (err == std::errc::result_out_of_range) {
+      // Java assumes two completent (so 0xffff_ffff is -1) and also, negative literals
+      // are read as positive (because the operator '-' is a separate token)
+      uint64_t tmp;
+      auto ret =
+          std::from_chars(str.data(), str.data() + str.size(), tmp, base);
+      if (ret.ec == std::errc()) {
+        token.type = ret.ptr < str.data() + str.size()
+                         ? Token::Type::kLiteralLong
+                         : Token::Type::kLiteralInt;
+        token.int_value = static_cast<int64_t>(tmp);
+        return;
+      }
+    }
+    errors_->err(location_,
+                 std::format("Invalid integer literal: {}", token.str));
+    token.type = Token::Type::kError;
+  }
+
+  void handle_int_literal(Token& token, int base = 10) {
+    size_t prefix;
+    switch (base) {
+      case 16:  // 0x
+      case 2:   // 0b
+        prefix = 2;
+        break;
+      case 8:  // 0
+        prefix = 1;
+        break;
+      default:
+        prefix = 0;
+        break;
+    }
+    std::optional<char> suffix;
+    if (token.str.find('_') == std::string_view::npos) {
+      auto ret = std::from_chars(token.str.data() + prefix,
+                                 token.str.data() + token.str.size(),
+                                 token.int_value, base);
+      if (ret.ec != std::errc()) {
+        handle_int_literal_error(token, token.str.substr(prefix), ret.ec, base);
+        return;
+      }
+      if (ret.ptr < token.str.data() + token.str.size())
+        suffix = *ret.ptr;
+    } else {
+      std::string tmp;
+      tmp.reserve(token.str.size() - prefix);
+      for (size_t i = prefix; i < token.str.size(); ++i) {
+        if (token.str[i] != '_') {
+          tmp.push_back(token.str[i]);
+        }
+      }
+      auto ret = std::from_chars(tmp.data(), tmp.data() + tmp.size(),
+                                 token.int_value, base);
+      if (ret.ec != std::errc()) {
+        handle_int_literal_error(token, tmp, ret.ec, base);
+        return;
+      }
+      if (ret.ptr < tmp.data() + tmp.size())
+        suffix = *ret.ptr;
+    }
+    if (suffix.has_value() &&
+        (suffix.value() == 'l' || suffix.value() == 'L')) {
+      token.type = Token::Type::kLiteralLong;
+    } else {
+      if (base == 10 &&
+          token.int_value >
+              static_cast<int64_t>(1) + std::numeric_limits<int32_t>::max()) {
+        errors_->err(location_,
+                     std::format("Invalid integer literal: {}", token.str));
+        token.type = Token::Type::kError;
+        return;
+      }
+      if (std::cmp_greater(token.int_value,
+                           std::numeric_limits<uint32_t>::max())) {
+        errors_->err(location_,
+                     std::format("Invalid integer literal: {}", token.str));
+        token.type = Token::Type::kError;
+        return;
+      }
+      token.type = Token::Type::kLiteralInt;
+      token.int_value = static_cast<int32_t>(token.int_value);
+    }
+  }
+
+  void handle_float_literal(Token& token, int base = 10) {
+    size_t prefix;
+    std::chars_format fmt;
+    switch (base) {
+      case 16:  // 0x
+        fmt = std::chars_format::general | std::chars_format::hex;
+        prefix = 2;
+        break;
+      default:
+        fmt = std::chars_format::general;
+        prefix = 0;
+        break;
+    }
+
+    std::from_chars_result ret;
+    if (token.str.ends_with("f") || token.str.ends_with("F")) {
+      // float and double do not parse exactly the same, so use a float parser for float.
+      float tmp;
+      ret = std::from_chars(token.str.data() + prefix,
+                            token.str.data() + token.str.size(), tmp, fmt);
+      token.type = Token::Type::kLiteralFloatingPoint;
+      token.float_value = tmp;
+    } else {
+      ret = std::from_chars(token.str.data() + prefix,
+                            token.str.data() + token.str.size(),
+                            token.float_value, fmt);
+      token.type = Token::Type::kLiteralDoubleFloatingPoint;
+    }
+
+    if (ret.ec != std::errc()) {
+      // Java allows 0 with just a suffix, std::from_chars does not.
+      if (token.str == "0f" || token.str == "0F") {
+        token.type = Token::Type::kLiteralFloatingPoint;
+        token.float_value = 0;
+        return;
+      }
+      if (token.str == "0d" || token.str == "0D") {
+        token.type = Token::Type::kLiteralDoubleFloatingPoint;
+        token.float_value = 0;
+        return;
+      }
+      errors_->err(location_,
+                   std::format("Invalid float literal: {}", token.str));
+      token.type = Token::Type::kError;
+    }
+  }
+
+  std::string_view unescape_if_needed(std::string_view str) {
+    auto back_slash = str.find('\\');
+    if (back_slash == std::string_view::npos)
+      return str;
+    unescape_tmp_.clear();
+    unescape_tmp_.reserve(str.size());
+    size_t last = 0;
+    uint8_t tmp[4];
+    while (true) {
+      unescape_tmp_.append(str, last, back_slash - last);
+      auto ret = unescape(str.substr(back_slash));
+      auto* ptr = tmp;
+      u8::write(ptr, tmp + sizeof(tmp), ret.first);
+      unescape_tmp_.append(
+          std::string_view(reinterpret_cast<char*>(tmp), ptr - tmp));
+      last = back_slash + ret.second;
+      back_slash = str.find('\\', last);
+      if (back_slash == std::string::npos) {
+        unescape_tmp_.append(str, last);
+        break;
+      }
+    }
+    return unescape_tmp_;
+  }
+
+  // Strings coming here have already been validated by the tokenizer
+  static std::pair<uint16_t, size_t> unescape(std::string_view in) {
+    assert(in.front() == '\\');
+    assert(in.size() > 1);
+    switch (in[1]) {
+      case 'b':
+        return std::make_pair(8, 2);
+      case 't':
+        return std::make_pair(9, 2);
+      case 'n':
+        return std::make_pair(10, 2);
+      case 'f':
+        return std::make_pair(12, 2);
+      case 'r':
+        return std::make_pair(13, 2);
+      case '"':
+        return std::make_pair(34, 2);
+      case '\'':
+        return std::make_pair(39, 2);
+      case '\\':
+        return std::make_pair(92, 2);
+      case '0':
+      case '1':
+      case '2':
+      case '3':
+      case '4':
+      case '5':
+      case '6':
+      case '7': {
+        uint8_t tmp;
+        auto ret = std::from_chars(in.data() + 1, in.data() + in.size(), tmp,
+                                   /* base */ 8);
+        return std::make_pair(tmp, ret.ptr - in.data());
+      }
+      default:
+        std::unreachable();
+    }
+  }
+
+  static size_t u8len(std::string_view str) {
+    auto* ptr = reinterpret_cast<uint8_t const*>(str.data());
+    auto* const end = ptr + str.size();
+    size_t count = 0;
+    while (u8::skip(ptr, end))
+      ++count;
+    return count;
+  }
+
+  std::unique_ptr<u8::line::Reader> reader_;
+  std::unique_ptr<src::Errors> errors_;
+  TokensConfig const config_;
+  MatchNext match_next_;
+  std::string_view line_;
+  Location location_;
+  std::string unescape_tmp_;
+};
+
+struct MatchNextJava8 {
+  std::optional<std::pair<java_8::Token, size_t>> operator()(
+      std::string_view str) const {
+    return java_8::matchNext(str);
+  }
+};
+
+}  // namespace
+
+std::unique_ptr<Tokens> open(std::unique_ptr<io::Reader> reader,
+                             std::unique_ptr<src::Errors> errors,
+                             TokensConfig config) {
+  switch (config.version) {
+    case Version::kJava8:
+      return std::make_unique<TokensImpl<MatchNextJava8, java_8::Token>>(
+          std::move(reader), std::move(errors), config);
+  }
+  std::unreachable();
+}
+
+}  // namespace java
diff --git a/src/java_tokens.hh b/src/java_tokens.hh
new file mode 100644
index 0000000..6fbefcb
--- /dev/null
+++ b/src/java_tokens.hh
@@ -0,0 +1,92 @@
+#ifndef JAVA_TOKENS_HH
+#define JAVA_TOKENS_HH
+
+#include "errors.hh"
+#include "io.hh"
+#include "java_version.hh"  // IWYU pragma: export
+#include "location.hh"
+
+#include <expected>
+#include <memory>
+#include <string_view>
+
+namespace java {
+
+using src::Location;
+
+struct Token {
+  enum class Type : uint8_t {
+    // str is content of comment, excluding heading or trailing slash and star.
+    // int_value is number of stars at head, 0 for single line, 1 for /* and
+    // 2 for /** and so on.
+    kComment,
+
+    // str is identifier
+    kIdentifier,
+
+    // str is keyword, int_value is Keyword index
+    kKeyword,
+
+    // str is separator, int_value is Separator index
+    kSeparator,
+
+    // str is operator, int_value is Operator index
+    kOperator,
+
+    // int_value is literal value
+    kLiteralInt,
+
+    // int_value is literal value
+    kLiteralLong,
+
+    // str is literal value
+    kLiteralString,
+
+    // int_value is literal value as unicode code-point
+    kLiteralCharacter,
+
+    kLiteralNull,
+
+    // float_value is literal value
+    kLiteralFloatingPoint,
+
+    // float_value is literal value
+    kLiteralDoubleFloatingPoint,
+
+    // int_value is literal value, 0 = false, anything else = true
+    kLiteralBoolean,
+
+    kError,
+  };
+
+  Type type;
+  Location loc;
+  std::string_view str;
+  int64_t int_value{0};
+  double float_value{0};
+};
+
+struct TokensConfig {
+  // Source version of Java file
+  Version version = Version::kMax;
+};
+
+class Tokens {
+ public:
+  virtual ~Tokens() = default;
+
+  virtual std::expected<Token, io::ReadError> read() = 0;
+
+ protected:
+  Tokens() = default;
+  Tokens(Tokens const&) = delete;
+  Tokens& operator=(Tokens const&) = delete;
+};
+
+[[nodiscard]] std::unique_ptr<Tokens> open(std::unique_ptr<io::Reader> reader,
+                                           std::unique_ptr<src::Errors>,
+                                           TokensConfig config = {});
+
+}  // namespace java
+
+#endif  // JAVA_TOKENS_HH
diff --git a/src/java_version.hh b/src/java_version.hh
new file mode 100644
index 0000000..444ae36
--- /dev/null
+++ b/src/java_version.hh
@@ -0,0 +1,16 @@
+#ifndef JAVA_VERSION_HH
+#define JAVA_VERSION_HH
+
+#include <cstdint>
+
+namespace java {
+
+enum class Version : uint8_t {
+  kJava8 = 8,
+
+  kMax = kJava8,
+};
+
+}  // namespace java
+
+#endif  // JAVA_VERSION_HH
diff --git a/src/location.cc b/src/location.cc
new file mode 100644
index 0000000..3fa1075
--- /dev/null
+++ b/src/location.cc
@@ -0,0 +1,12 @@
+#include "location.hh"
+
+#include <ostream>
+
+namespace src {
+
+std::ostream& operator<<(std::ostream& out, Location const& loc) {
+  out << loc.line << ':' << loc.column;
+  return out;
+}
+
+}  // namespace src
diff --git a/src/location.hh b/src/location.hh
new file mode 100644
index 0000000..1a210cb
--- /dev/null
+++ b/src/location.hh
@@ -0,0 +1,31 @@
+#ifndef LOCATION_HH
+#define LOCATION_HH
+
+#include <compare>
+#include <cstdint>
+#include <iosfwd>
+
+namespace src {
+
+struct Location {
+  uint64_t line;
+  uint16_t column;
+
+  constexpr Location() : line(0), column(0) {}
+
+  Location(uint64_t line, uint16_t column) : line(line), column(column) {}
+
+  [[nodiscard]]
+  std::strong_ordering operator<=>(Location const& loc) const {
+    auto ret = line <=> loc.line;
+    if (ret == std::strong_ordering::equal)
+      return column <=> loc.column;
+    return ret;
+  }
+};
+
+std::ostream& operator<<(std::ostream& out, Location const& loc);
+
+}  // namespace src
+
+#endif  // LOCATION_HH
diff --git a/src/prefix_tree.cc b/src/prefix_tree.cc
index f16df22..56466e8 100644
--- a/src/prefix_tree.cc
+++ b/src/prefix_tree.cc
@@ -42,6 +42,17 @@ class BuilderImpl : public Builder {
 
   [[nodiscard]]
   std::optional<std::string> build() const override {
+    auto ret = build(1);
+    if (!ret.has_value())
+      ret = build(2);
+    return ret;
+  }
+
+ private:
+  [[nodiscard]]
+  std::optional<std::string> build(uint8_t size) const {
+    assert(size > 0);
+
     std::string tree;
     std::string strings;
 
@@ -49,20 +60,19 @@ class BuilderImpl : public Builder {
     for (auto const& str : strings_)
       tmp.emplace(str);
 
-    if (!write_tree(tmp, tree, strings))
+    if (!write_tree(tmp, tree, strings, size))
       return std::nullopt;
 
     std::string header;
-    if (tree.size() > 0xffff)
+    write_u8(header, size);
+    if (!write_size(header, size, tree.size()))
       return std::nullopt;
-    write_u16(header, tree.size());
 
     return header + tree + strings;
   }
 
- private:
   bool write_tree(std::set<std::string_view> const& input, std::string& tree,
-                  std::string& strings) const {
+                  std::string& strings, uint8_t size) const {
     std::map<char, std::set<std::string_view>> buckets;
     bool match = false;
     for (auto& str : input) {
@@ -76,8 +86,8 @@ class BuilderImpl : public Builder {
     write_u8(tree, buckets.size() + (match ? 1 : 0));
     if (match) {
       write_u8(tree, 0);
-      write_u16(tree, 0);
-      write_u16(tree, 0);
+      write_size(tree, size, 0);
+      write_size(tree, size, 0);
     }
     std::string extra;
     for (auto& pair : buckets) {
@@ -95,23 +105,23 @@ class BuilderImpl : public Builder {
       }
 
       write_u8(tree, 1 + str.size());
-      if (strings.size() > 0xffff)
+      if (!write_size(tree, size, strings.size()))
         return false;
-      write_u16(tree, strings.size());
       strings.push_back(pair.first);
       strings.append(str);
       if (extra.size() > 0xffff)
         return false;
-      write_u16(tree, extra.size());
+      if (!write_size(tree, size, extra.size()))
+        return false;
       if (str.empty()) {
-        if (!write_tree(pair.second, extra, strings))
+        if (!write_tree(pair.second, extra, strings, size))
           return false;
       } else {
         std::set<std::string_view> tmp;
         for (auto& str2 : pair.second) {
           tmp.emplace(str2.substr(str.size()));
         }
-        if (!write_tree(tmp, extra, strings))
+        if (!write_tree(tmp, extra, strings, size))
           return false;
       }
     }
@@ -120,12 +130,27 @@ class BuilderImpl : public Builder {
     return true;
   }
 
+  static bool write_size(std::string& str, uint8_t size, size_t value) {
+    if (size == 1) {
+      if (value > 0xff)
+        return false;
+      write_u8(str, value);
+      return true;
+    }
+    if (size == 2) {
+      if (value > 0xffff)
+        return false;
+      write_u16(str, value);
+      return true;
+    }
+    assert(false);
+    return false;
+  }
+
   std::set<std::string> strings_;
 };
 
-}  // namespace
-
-std::optional<size_t> lookup(std::string_view tree, std::string_view str) {
+std::optional<size_t> lookup16(std::string_view tree, std::string_view str) {
   size_t base_str = 2 + get_u16(tree, 0);
   size_t node = 2;
   std::optional<size_t> match;
@@ -168,6 +193,63 @@ std::optional<size_t> lookup(std::string_view tree, std::string_view str) {
   return match;
 }
 
+std::optional<size_t> lookup8(std::string_view tree, std::string_view str) {
+  size_t base_str = 1 + get_u8(tree, 0);
+  size_t node = 1;
+  std::optional<size_t> match;
+  std::optional<size_t> earlier_match;
+
+  while (node < base_str && !str.empty()) {
+    auto children = get_u8(tree, node);
+
+    if (children == 0) {
+      // Leaf
+      return match;
+    }
+
+    size_t child_node = node + 1;
+    size_t child_end = child_node + (static_cast<size_t>(children) * 3);
+    for (; child_node < child_end; child_node += 3) {
+      uint8_t len = get_u8(tree, child_node);
+      uint8_t offset = get_u8(tree, child_node + 1);
+
+      if (str.starts_with(tree.substr(base_str + offset, len))) {
+        // Match but not a leaf, always first in the list of children
+        if (len == 0) {
+          earlier_match = match;
+          continue;
+        }
+        match = match.value_or(0) + len;
+        str = str.substr(len);
+        auto jump = get_u8(tree, child_node + 2);
+        node = child_end + jump;
+        break;
+      }
+    }
+
+    if (child_node == child_end)
+      return earlier_match;
+  }
+
+  if (node == base_str)
+    return earlier_match;
+  return match;
+}
+
+}  // namespace
+
+std::optional<size_t> lookup(std::string_view tree, std::string_view str) {
+  auto size = get_u8(tree, 0);
+  if (size == 1) {
+    return lookup8(tree.substr(1), str);
+  }
+  if (size == 2) {
+    return lookup16(tree.substr(1), str);
+  }
+  assert(false);
+  return std::nullopt;
+}
+
 std::unique_ptr<Builder> builder() { return std::make_unique<BuilderImpl>(); }
 
 }  // namespace prefix_tree
diff --git a/src/str.cc b/src/str.cc
index bd7a654..44db3a6 100644
--- a/src/str.cc
+++ b/src/str.cc
@@ -6,6 +6,15 @@
 
 namespace str {
 
+namespace {
+
+[[nodiscard]]
+inline bool is_space(char c) {
+  return c == ' ' || c == '\t' || c == '\r' || c == '\n';
+}
+
+}  // namespace
+
 void split(std::string_view str, std::vector<std::string_view>& out,
            char separator, bool keep_empty) {
   out.clear();
@@ -31,4 +40,14 @@ std::vector<std::string_view> split(std::string_view str, char separator,
   return vec;
 }
 
+std::string_view trim(std::string_view str) {
+  size_t s = 0;
+  size_t e = str.size();
+  while (s < e && is_space(str[s]))
+    ++s;
+  while (e > s && is_space(str[e - 1]))
+    --e;
+  return str.substr(s, e - s);
+}
+
 }  // namespace str
diff --git a/src/str.hh b/src/str.hh
index 58d5d32..e1ee549 100644
--- a/src/str.hh
+++ b/src/str.hh
@@ -13,6 +13,9 @@ void split(std::string_view str, std::vector<std::string_view>& out,
                                                   char separator = ' ',
                                                   bool keep_empty = false);
 
+[[nodiscard]]
+std::string_view trim(std::string_view str);
+
 }  // namespace str
 
 #endif  // STR_HH
diff --git a/src/ugc.hh b/src/ugc.hh
index c49d50f..206877f 100644
--- a/src/ugc.hh
+++ b/src/ugc.hh
@@ -13,7 +13,7 @@ enum class GeneralCategory : uint8_t {
   LETTER_OTHER,
 
   MARK_NONSPACING,
-  MARK_SPACING_COMBINDING,
+  MARK_SPACING_COMBINING,
   MARK_SPACING_ENCLOSING,
 
   NUMBER_DIGIT,
diff --git a/test/java_tokens.cc b/test/java_tokens.cc
new file mode 100644
index 0000000..df37409
--- /dev/null
+++ b/test/java_tokens.cc
@@ -0,0 +1,551 @@
+#include "java_tokens.hh"
+
+#include "errors.hh"
+#include "io.hh"
+
+#include <gtest/gtest.h>
+#include <memory>
+#include <string_view>
+#include <utility>
+
+using namespace std::literals::string_view_literals;
+
+namespace {
+
+class JavaTokens : public testing::TestWithParam<java::Version> {
+ protected:
+  static std::unique_ptr<src::Errors> make_errors() {
+    return src::file_errors(
+        testing::UnitTest::GetInstance()->current_test_info()->name());
+  }
+};
+
+}  // namespace
+
+TEST_P(JavaTokens, empty_class) {
+  auto input = io::memory(R"(class Empty {
+})");
+  auto tokens = java::open(std::move(input), make_errors(),
+                           java::TokensConfig{.version = GetParam()});
+  auto ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kKeyword, ret->type);
+  EXPECT_EQ("class", ret->str);
+  EXPECT_EQ(1, ret->loc.line);
+  EXPECT_EQ(0, ret->loc.column);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kIdentifier, ret->type);
+  EXPECT_EQ("Empty", ret->str);
+  EXPECT_EQ(1, ret->loc.line);
+  EXPECT_EQ(6, ret->loc.column);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kSeparator, ret->type);
+  EXPECT_EQ("{", ret->str);
+  EXPECT_EQ(1, ret->loc.line);
+  EXPECT_EQ(12, ret->loc.column);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kSeparator, ret->type);
+  EXPECT_EQ("}", ret->str);
+  EXPECT_EQ(2, ret->loc.line);
+  EXPECT_EQ(0, ret->loc.column);
+  ret = tokens->read();
+  ASSERT_FALSE(ret.has_value());
+  EXPECT_EQ(io::ReadError::Eof, ret.error());
+}
+
+TEST_P(JavaTokens, traditional_comment) {
+  auto input = io::memory(R"(/* this comment /* // /** ends here: */)");
+  auto tokens = java::open(std::move(input), make_errors(),
+                           java::TokensConfig{.version = GetParam()});
+  auto ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kComment, ret->type);
+  EXPECT_EQ("this comment /* // /** ends here:", ret->str);
+  EXPECT_EQ(1, ret->int_value);
+  ret = tokens->read();
+  ASSERT_FALSE(ret.has_value());
+  EXPECT_EQ(io::ReadError::Eof, ret.error());
+}
+
+TEST_P(JavaTokens, single_line_comment) {
+  auto input = io::memory(R"(// this is a comment)");
+  auto tokens = java::open(std::move(input), make_errors(),
+                           java::TokensConfig{.version = GetParam()});
+  auto ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kComment, ret->type);
+  EXPECT_EQ("this is a comment", ret->str);
+  EXPECT_EQ(0, ret->int_value);
+  ret = tokens->read();
+  ASSERT_FALSE(ret.has_value());
+  EXPECT_EQ(io::ReadError::Eof, ret.error());
+}
+
+TEST_P(JavaTokens, identifiers) {
+  auto input = io::memory(R"(String i3 αρετη MAX_VALUE isLetterOrDigit)");
+  auto tokens = java::open(std::move(input), make_errors(),
+                           java::TokensConfig{.version = GetParam()});
+  auto ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kIdentifier, ret->type);
+  EXPECT_EQ("String", ret->str);
+  EXPECT_EQ(1, ret->loc.line);
+  EXPECT_EQ(0, ret->loc.column);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kIdentifier, ret->type);
+  EXPECT_EQ("i3", ret->str);
+  EXPECT_EQ(1, ret->loc.line);
+  EXPECT_EQ(7, ret->loc.column);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kIdentifier, ret->type);
+  EXPECT_EQ("αρετη", ret->str);
+  EXPECT_EQ(1, ret->loc.line);
+  EXPECT_EQ(10, ret->loc.column);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kIdentifier, ret->type);
+  EXPECT_EQ("MAX_VALUE", ret->str);
+  EXPECT_EQ(1, ret->loc.line);
+  EXPECT_EQ(16, ret->loc.column);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kIdentifier, ret->type);
+  EXPECT_EQ("isLetterOrDigit", ret->str);
+  EXPECT_EQ(1, ret->loc.line);
+  EXPECT_EQ(26, ret->loc.column);
+  ret = tokens->read();
+  ASSERT_FALSE(ret.has_value());
+  EXPECT_EQ(io::ReadError::Eof, ret.error());
+}
+
+TEST_P(JavaTokens, int_literals) {
+  auto input = io::memory(R"(
+0    2    0372    0xDada_Cafe    1996    0x00_FF__00_FF
+)");
+  auto tokens = java::open(std::move(input), make_errors(),
+                           java::TokensConfig{.version = GetParam()});
+  auto ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralInt, ret->type);
+  EXPECT_EQ(0L, ret->int_value);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralInt, ret->type);
+  EXPECT_EQ(2L, ret->int_value);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralInt, ret->type);
+  EXPECT_EQ(250L, ret->int_value);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralInt, ret->type);
+  EXPECT_EQ(-623195394L, ret->int_value);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralInt, ret->type);
+  EXPECT_EQ(1996L, ret->int_value);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralInt, ret->type);
+  EXPECT_EQ(16711935L, ret->int_value);
+  ret = tokens->read();
+  ASSERT_FALSE(ret.has_value());
+  EXPECT_EQ(io::ReadError::Eof, ret.error());
+}
+
+TEST_P(JavaTokens, long_literals) {
+  auto input = io::memory(R"(
+0l    0777L    0x100000000L    2_147_483_648L    0xC0B0L
+)");
+  auto tokens = java::open(std::move(input), make_errors(),
+                           java::TokensConfig{.version = GetParam()});
+  auto ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralLong, ret->type);
+  EXPECT_EQ(0, ret->int_value);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralLong, ret->type);
+  EXPECT_EQ(511, ret->int_value);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralLong, ret->type);
+  EXPECT_EQ(4294967296LL, ret->int_value);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralLong, ret->type);
+  EXPECT_EQ(2147483648LL, ret->int_value);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralLong, ret->type);
+  EXPECT_EQ(49328, ret->int_value);
+  ret = tokens->read();
+  ASSERT_FALSE(ret.has_value());
+  EXPECT_EQ(io::ReadError::Eof, ret.error());
+}
+
+TEST_P(JavaTokens, int_literal_min_max) {
+  auto input = io::memory(R"(
+2147483647
+-2147483648
+0x7fff_ffff
+0177_7777_7777
+0b0111_1111_1111_1111_1111_1111_1111_1111
+0x8000_0000
+0200_0000_0000
+0b1000_0000_0000_0000_0000_0000_0000_0000
+0xffff_ffff
+0377_7777_7777
+0b1111_1111_1111_1111_1111_1111_1111_1111
+)");
+  auto tokens = java::open(std::move(input), make_errors(),
+                           java::TokensConfig{.version = GetParam()});
+  auto ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralInt, ret->type);
+  EXPECT_EQ(2147483647L, ret->int_value);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kOperator, ret->type);
+  EXPECT_EQ("-", ret->str);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralInt, ret->type);
+  EXPECT_EQ(-2147483647L - 1, ret->int_value);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralInt, ret->type);
+  EXPECT_EQ(2147483647L, ret->int_value);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralInt, ret->type);
+  EXPECT_EQ(2147483647L, ret->int_value);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralInt, ret->type);
+  EXPECT_EQ(2147483647L, ret->int_value);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralInt, ret->type);
+  EXPECT_EQ(-2147483647L - 1, ret->int_value);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralInt, ret->type);
+  EXPECT_EQ(-2147483647L - 1, ret->int_value);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralInt, ret->type);
+  EXPECT_EQ(-2147483647L - 1, ret->int_value);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralInt, ret->type);
+  EXPECT_EQ(-1L, ret->int_value);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralInt, ret->type);
+  EXPECT_EQ(-1L, ret->int_value);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralInt, ret->type);
+  EXPECT_EQ(-1L, ret->int_value);
+  ret = tokens->read();
+  ASSERT_FALSE(ret.has_value());
+  EXPECT_EQ(io::ReadError::Eof, ret.error());
+}
+
+TEST_P(JavaTokens, long_literal_min_max) {
+  auto input = io::memory(R"(
+9223372036854775807L
+-9223372036854775808L
+0x7fff_ffff_ffff_ffffL
+07_7777_7777_7777_7777_7777L
+0b0111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111L
+0x8000_0000_0000_0000L
+010_0000_0000_0000_0000_0000L
+0b1000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000L
+0xffff_ffff_ffff_ffffL
+017_7777_7777_7777_7777_7777L
+0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111L
+)");
+  auto tokens = java::open(std::move(input), make_errors(),
+                           java::TokensConfig{.version = GetParam()});
+  auto ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralLong, ret->type);
+  EXPECT_EQ(9223372036854775807LL, ret->int_value);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kOperator, ret->type);
+  EXPECT_EQ("-", ret->str);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralLong, ret->type);
+  EXPECT_EQ(-9223372036854775807LL - 1, ret->int_value);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralLong, ret->type);
+  EXPECT_EQ(9223372036854775807LL, ret->int_value);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralLong, ret->type);
+  EXPECT_EQ(9223372036854775807LL, ret->int_value);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralLong, ret->type);
+  EXPECT_EQ(9223372036854775807LL, ret->int_value);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralLong, ret->type);
+  EXPECT_EQ(-9223372036854775807LL - 1, ret->int_value);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralLong, ret->type);
+  EXPECT_EQ(-9223372036854775807LL - 1, ret->int_value);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralLong, ret->type);
+  EXPECT_EQ(-9223372036854775807LL - 1, ret->int_value);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralLong, ret->type);
+  EXPECT_EQ(-1LL, ret->int_value);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralLong, ret->type);
+  EXPECT_EQ(-1LL, ret->int_value);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralLong, ret->type);
+  EXPECT_EQ(-1LL, ret->int_value);
+  ret = tokens->read();
+  ASSERT_FALSE(ret.has_value());
+  EXPECT_EQ(io::ReadError::Eof, ret.error());
+}
+
+TEST_P(JavaTokens, float_literals) {
+  auto input = io::memory(R"(
+1e1f    2.f    .3f    0f    3.14f    6.022137e+23f
+)");
+  auto tokens = java::open(std::move(input), make_errors(),
+                           java::TokensConfig{.version = GetParam()});
+  auto ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralFloatingPoint, ret->type);
+  EXPECT_EQ(1e1F, ret->float_value);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralFloatingPoint, ret->type);
+  EXPECT_EQ(2.F, ret->float_value);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralFloatingPoint, ret->type);
+  EXPECT_EQ(.3F, ret->float_value);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralFloatingPoint, ret->type);
+  EXPECT_EQ(0.F, ret->float_value);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralFloatingPoint, ret->type);
+  EXPECT_EQ(3.14F, ret->float_value);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralFloatingPoint, ret->type);
+  EXPECT_EQ(6.022137e+23F, ret->float_value);
+  ret = tokens->read();
+  ASSERT_FALSE(ret.has_value());
+  EXPECT_EQ(io::ReadError::Eof, ret.error());
+}
+
+TEST_P(JavaTokens, double_literals) {
+  auto input = io::memory(R"(
+1e1    2.    .3    0.0    3.14    1e-9d    1e137
+)");
+  auto tokens = java::open(std::move(input), make_errors(),
+                           java::TokensConfig{.version = GetParam()});
+  auto ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralDoubleFloatingPoint, ret->type);
+  EXPECT_EQ(1e1, ret->float_value);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralDoubleFloatingPoint, ret->type);
+  EXPECT_EQ(2., ret->float_value);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralDoubleFloatingPoint, ret->type);
+  EXPECT_EQ(.3, ret->float_value);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralDoubleFloatingPoint, ret->type);
+  EXPECT_EQ(0.0, ret->float_value);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralDoubleFloatingPoint, ret->type);
+  EXPECT_EQ(3.14, ret->float_value);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralDoubleFloatingPoint, ret->type);
+  EXPECT_EQ(1e-9, ret->float_value);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralDoubleFloatingPoint, ret->type);
+  EXPECT_EQ(1e137, ret->float_value);
+  ret = tokens->read();
+  ASSERT_FALSE(ret.has_value());
+  EXPECT_EQ(io::ReadError::Eof, ret.error());
+}
+
+TEST_P(JavaTokens, bool_literals) {
+  auto input = io::memory(R"(
+true false
+)");
+  auto tokens = java::open(std::move(input), make_errors(),
+                           java::TokensConfig{.version = GetParam()});
+  auto ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralBoolean, ret->type);
+  EXPECT_TRUE(ret->int_value);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralBoolean, ret->type);
+  EXPECT_EQ(0, ret->int_value);
+  ret = tokens->read();
+  ASSERT_FALSE(ret.has_value());
+  EXPECT_EQ(io::ReadError::Eof, ret.error());
+}
+
+TEST_P(JavaTokens, char_literals) {
+  auto input = io::memory(R"(
+'a'
+
+'%'
+
+'\t'
+
+'\\'
+
+'\''
+
+'\u03a9'
+
+'\uFFFF'
+
+'\177'
+
+'™'
+)");
+  auto tokens = java::open(std::move(input), make_errors(),
+                           java::TokensConfig{.version = GetParam()});
+  auto ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralCharacter, ret->type);
+  EXPECT_EQ('a', ret->int_value);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralCharacter, ret->type);
+  EXPECT_EQ('%', ret->int_value);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralCharacter, ret->type);
+  EXPECT_EQ('\t', ret->int_value);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralCharacter, ret->type);
+  EXPECT_EQ('\\', ret->int_value);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralCharacter, ret->type);
+  EXPECT_EQ('\'', ret->int_value);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralCharacter, ret->type);
+  EXPECT_EQ(0x3a9, ret->int_value);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralCharacter, ret->type);
+  EXPECT_EQ(0xffff, ret->int_value);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralCharacter, ret->type);
+  EXPECT_EQ(0177, ret->int_value);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralCharacter, ret->type);
+  EXPECT_EQ(0x2122, ret->int_value);
+  ret = tokens->read();
+  ASSERT_FALSE(ret.has_value());
+  EXPECT_EQ(io::ReadError::Eof, ret.error());
+}
+
+TEST_P(JavaTokens, string_literals) {
+  auto input = io::memory(R"(
+""
+"\""
+"This is a string"
+"This is a " +
+    "two-line string"
+)");
+  auto tokens = java::open(std::move(input), make_errors(),
+                           java::TokensConfig{.version = GetParam()});
+  auto ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralString, ret->type);
+  EXPECT_EQ("", ret->str);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralString, ret->type);
+  EXPECT_EQ(R"(")", ret->str);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralString, ret->type);
+  EXPECT_EQ("This is a string", ret->str);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralString, ret->type);
+  EXPECT_EQ("This is a ", ret->str);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kOperator, ret->type);
+  EXPECT_EQ("+", ret->str);
+  ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralString, ret->type);
+  EXPECT_EQ("two-line string", ret->str);
+  ret = tokens->read();
+  ASSERT_FALSE(ret.has_value());
+  EXPECT_EQ(io::ReadError::Eof, ret.error());
+}
+
+TEST_P(JavaTokens, string_escapes) {
+  auto input = io::memory(R"("\b\t\n\f\r\"\'\\\0\1\177")");
+  auto tokens = java::open(std::move(input), make_errors(),
+                           java::TokensConfig{.version = GetParam()});
+  auto ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralString, ret->type);
+  EXPECT_EQ("\b\t\n\f\r\"'\\\0\1\177"sv, ret->str);
+  ret = tokens->read();
+  ASSERT_FALSE(ret.has_value());
+  EXPECT_EQ(io::ReadError::Eof, ret.error());
+}
+
+TEST_P(JavaTokens, null) {
+  auto input = io::memory("null");
+  auto tokens = java::open(std::move(input), make_errors(),
+                           java::TokensConfig{.version = GetParam()});
+  auto ret = tokens->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(java::Token::Type::kLiteralNull, ret->type);
+  ret = tokens->read();
+  ASSERT_FALSE(ret.has_value());
+  EXPECT_EQ(io::ReadError::Eof, ret.error());
+}
+
+INSTANTIATE_TEST_SUITE_P(AllVersions, JavaTokens,
+                         testing::Values(java::Version::kJava8));
diff --git a/test/java_uescape.cc b/test/java_uescape.cc
index a6657d8..a9f6746 100644
--- a/test/java_uescape.cc
+++ b/test/java_uescape.cc
@@ -132,6 +132,15 @@ TEST(java_uescape_u8, bad_surrogate) {
   EXPECT_EQ(io::ReadError::InvalidData, ret.error());
 }
 
+TEST(java_uescape_u8, issue) {
+  auto reader = u8::line::open(u8::java::open(io::memory(R"('\u03a9')")));
+  auto ret = reader->read();
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ("'Ω'", ret.value());
+  ret = reader->read();
+  EXPECT_FALSE(ret.has_value());
+}
+
 TEST(java_uescape_u16, escaped_escape) {
   auto reader = u16::java::open(io::memory(R"(\\u2122=\u2122)"));
   std::u16string tmp;
diff --git a/test/prefix_tree.cc b/test/prefix_tree.cc
index 6c00adb..86c8990 100644
--- a/test/prefix_tree.cc
+++ b/test/prefix_tree.cc
@@ -1,5 +1,7 @@
 #include "prefix_tree.hh"
 
+#include "str.hh"
+
 #include <gtest/gtest.h>
 
 TEST(prefix_tree, empty) {
@@ -45,3 +47,27 @@ TEST(prefix_tree, sanity) {
   ASSERT_TRUE(ret.has_value());
   EXPECT_EQ(3, ret.value());
 }
+
+TEST(prefix_tree, many_and_long) {
+  auto builder = prefix_tree::builder();
+  for (auto str : str::split(
+           "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do "
+           "eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut "
+           "enim ad minim veniam, quis nostrud exercitation ullamco laboris "
+           "nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in "
+           "reprehenderit in voluptate velit esse cillum dolore eu fugiat "
+           "nulla pariatur. Excepteur sint occaecat cupidatat non proident, "
+           "sunt in culpa qui officia deserunt mollit anim id est laborum.")) {
+    builder->add(str);
+  }
+  auto tree = builder->build();
+  ASSERT_TRUE(tree.has_value());
+  auto ret = prefix_tree::lookup(tree.value(), "");
+  EXPECT_FALSE(ret.has_value());
+  ret = prefix_tree::lookup(tree.value(), "Lorem");
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(5, ret.value());
+  ret = prefix_tree::lookup(tree.value(), "cillum");
+  ASSERT_TRUE(ret.has_value());
+  EXPECT_EQ(6, ret.value());
+}
diff --git a/test/u.cc b/test/u.cc
index d43109b..d94f74e 100644
--- a/test/u.cc
+++ b/test/u.cc
@@ -711,7 +711,7 @@ TEST_P(UnicodeVersionTest, lookup_gc) {
   EXPECT_EQ(u::lookup_gc(0x483, GetParam()),
             u::GeneralCategory::MARK_NONSPACING);
   EXPECT_EQ(u::lookup_gc(0x93b, GetParam()),
-            u::GeneralCategory::MARK_SPACING_COMBINDING);
+            u::GeneralCategory::MARK_SPACING_COMBINING);
   EXPECT_EQ(u::lookup_gc(0x20de, GetParam()),
             u::GeneralCategory::MARK_SPACING_ENCLOSING);
author	Joel Klinghed <the_jk@spawned.biz>	2025-09-27 20:11:32 +0200
committer	Joel Klinghed <the_jk@spawned.biz>	2025-09-28 22:48:24 +0200
commit	c1ae5d53fb0fa7ceb9d6fc7a60c87df958ce37fe (patch)
tree	f028a04619aa1b69f8b0aa72a5154f6ba1c09775
parent	2f13baa843bd1fb5db6630a2823681ffaff9fb11 (diff)