10 files changed, 2135 insertions, 0 deletions
diff --git a/src/errors.cc b/src/errors.cc
new file mode 100644
index 0000000..7122a9a
--- /dev/null
+++ b/src/errors.cc
@@ -0,0 +1,116 @@
+#include "errors.hh"
+
+#include "location.hh"
+
+#include <cstdint>
+#include <format>
+#include <iostream>
+#include <memory>
+#include <ostream>
+#include <string>
+#include <string_view>
+#include <utility>
+
+namespace src {
+
+namespace {
+
+class FileErrors : public Errors {
+ public:
+  FileErrors(std::string filename, std::shared_ptr<ErrorsOutput> output)
+      : filename_(std::move(filename)), output_(std::move(output)) {}
+
+  void err(Location loc, std::string_view msg) override {
+    ++errors_;
+    output_->println(std::format("{}:{}:{}: Error: {}", filename_, loc.line,
+                                 loc.column, msg));
+  }
+
+  void warn(Location loc, std::string_view msg) override {
+    ++warnings_;
+    output_->println(std::format("{}:{}:{}: Warning: {}", filename_, loc.line,
+                                 loc.column, msg));
+  }
+
+#if !defined(NDEBUG)
+  void dbg(Location loc, std::string_view msg) override {
+    output_->println(std::format("{}:{}:{}: Debug: {}", filename_, loc.line,
+                                 loc.column, msg));
+  }
+#endif
+
+  [[nodiscard]]
+  uint64_t errors() const override {
+    return errors_;
+  }
+
+  [[nodiscard]]
+  uint64_t warnings() const override {
+    return warnings_;
+  }
+
+ private:
+  std::string const filename_;
+  std::shared_ptr<ErrorsOutput> output_;
+  uint64_t errors_{0};
+  uint64_t warnings_{0};
+};
+
+class IgnoreErrors : public Errors {
+ public:
+  IgnoreErrors() = default;
+
+  void err(Location /* loc */, std::string_view /* msg */) override {}
+  void warn(Location /* loc */, std::string_view /* msg */) override {}
+
+#if !defined(NDEBUG)
+  void dbg(Location /* loc */, std::string_view /* msg */) override {}
+#endif
+
+  [[nodiscard]]
+  uint64_t errors() const override {
+    return 0;
+  }
+
+  [[nodiscard]]
+  uint64_t warnings() const override {
+    return 0;
+  }
+};
+
+class OutputStreamErrorsOutput : public ErrorsOutput {
+ public:
+  explicit OutputStreamErrorsOutput(std::ostream& out) : out_(out) {}
+
+  void println(std::string_view line) override { out_ << line << '\n'; }
+
+ private:
+  std::ostream& out_;
+};
+
+}  // namespace
+
+[[nodiscard]]
+std::unique_ptr<Errors> file_errors(std::string filename,
+                                    std::shared_ptr<ErrorsOutput> output) {
+  if (!output) {
+    static std::shared_ptr<ErrorsOutput> g_stderr_output;
+    // TODO: Make thread-safe when needed
+    if (!g_stderr_output)
+      g_stderr_output = std::make_shared<OutputStreamErrorsOutput>(std::cerr);
+    output = g_stderr_output;
+  }
+  return std::make_unique<FileErrors>(std::move(filename), std::move(output));
+}
+
+[[nodiscard]]
+std::unique_ptr<Errors> ignore_errors() {
+  return std::make_unique<IgnoreErrors>();
+}
+
+[[nodiscard]]
+std::unique_ptr<ErrorsOutput> errors_output_ios(std::ostream& out) {
+  return std::make_unique<OutputStreamErrorsOutput>(out);
+}
+
+}  // namespace src
diff --git a/src/errors.hh b/src/errors.hh
new file mode 100644
index 0000000..d8b3d60
--- /dev/null
+++ b/src/errors.hh
@@ -0,0 +1,64 @@
+#ifndef ERRORS_HH
+#define ERRORS_HH
+
+#include "location.hh"
+
+#include <iosfwd>
+#include <memory>
+#include <string>
+#include <string_view>
+
+namespace src {
+
+class Errors {
+ public:
+  virtual ~Errors() = default;
+
+  virtual void err(Location loc, std::string_view msg) = 0;
+
+  virtual void warn(Location loc, std::string_view msg) = 0;
+
+#if !defined(NDEBUG)
+  virtual void dbg(Location loc, std::string_view msg) = 0;
+#else
+  void dbg(Location, std::string_view) {}
+#endif
+
+  [[nodiscard]]
+  virtual uint64_t errors() const = 0;
+  [[nodiscard]]
+  virtual uint64_t warnings() const = 0;
+
+ protected:
+  Errors() = default;
+
+  Errors(Errors const&) = delete;
+  Errors& operator=(Errors const&) = delete;
+};
+
+class ErrorsOutput {
+ public:
+  virtual ~ErrorsOutput() = default;
+
+  virtual void println(std::string_view line) = 0;
+
+ protected:
+  ErrorsOutput() = default;
+
+  ErrorsOutput(ErrorsOutput const&) = delete;
+  ErrorsOutput& operator=(ErrorsOutput const&) = delete;
+};
+
+[[nodiscard]]
+std::unique_ptr<Errors> file_errors(
+    std::string filename, std::shared_ptr<ErrorsOutput> output = nullptr);
+
+[[nodiscard]]
+std::unique_ptr<Errors> ignore_errors();
+
+[[nodiscard]]
+std::unique_ptr<ErrorsOutput> errors_output_ios(std::ostream& out);
+
+}  // namespace src
+
+#endif  // ERRORS_HH
diff --git a/src/gen_tokens.cc b/src/gen_tokens.cc
new file mode 100644
index 0000000..ef0fce7
--- /dev/null
+++ b/src/gen_tokens.cc
@@ -0,0 +1,1062 @@
+#include "args.hh"
+#include "errors.hh"
+#include "grammar.hh"
+#include "io.hh"
+#include "prefix_tree.hh"
+
+#include <algorithm>
+#include <cassert>
+#include <charconv>
+#include <cstddef>
+#include <cstdint>
+#include <fstream>
+#include <iostream>
+#include <set>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+namespace {
+
+enum class CharacterClass : uint8_t {
+  kWhiteSpace = 0,
+  kLineTerminator = 1,
+  kInputCharacter = 2,
+  kJavaLetter = 3,
+  kJavaLetterOrDigit = 4,
+};
+
+std::vector<std::string> const kCharacterClassNames(
+    {"WhiteSpace", "LineTerminator", "InputCharacter", "JavaLetter",
+     "JavaLetterOrDigit"});
+
+enum class ReturnType : uint8_t {
+  kTokenAndSize,
+  kInternalAndSize,
+  kSize,
+};
+
+std::string make_define(std::string_view filename) {
+  std::string ret;
+  ret.reserve(filename.size());
+  for (char c : filename) {
+    if ((c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_') {
+      ret.push_back(c);
+    } else if (c >= 'a' && c <= 'z') {
+      ret.push_back(static_cast<char>(c & ~0x20));
+    } else {
+      ret.push_back('_');
+    }
+  }
+  return ret;
+}
+
+class Generator {
+ public:
+  bool generate(std::string_view header_name, std::string_view source_name,
+                std::string const& ns, std::string const& unicode_version,
+                grammar::Grammar& grammar);
+
+ private:
+  void find_specific_elements(grammar::Element const& root);
+  void find_all_elements(grammar::Element const& root);
+
+  void check_need_last(grammar::Element const& element);
+  bool find_report_last(grammar::Element const& root,
+                        grammar::Element const& match);
+
+  [[nodiscard]]
+  ReturnType get_return_type(grammar::Element const& element) const;
+  [[nodiscard]]
+  ReturnType get_return_type(uint8_t character_class) const;
+
+  void write_matcher(std::ostream& out, grammar::Symbol const& symbol,
+                     ReturnType return_type, std::string_view str_arg);
+  bool write_matcher(std::ostream& out, grammar::Definition const& definition,
+                     ReturnType return_type, std::string_view indent);
+  bool write_matcher(std::ostream& out, grammar::Element const& element,
+                     ReturnType return_type);
+
+  std::set<std::string_view> above_specific_tokens_;
+  std::set<std::string_view> specific_tokens_;
+  std::set<grammar::Element const*> all_elements_;
+  std::set<std::string_view> copy_last_;
+  std::set<std::string_view> report_last_;
+};
+
+// Find the Elements that has at least one terminal or character class as symbol
+// These will be the different tokens the tokenizer can return
+void Generator::find_specific_elements(grammar::Element const& root) {
+  if (std::ranges::any_of(root.definitions, [](auto const& definition) {
+        return definition.symbols.size() > 1 ||
+               definition.symbols[0].type == grammar::Symbol::Type::kTerminal;
+      })) {
+    specific_tokens_.insert(root.name);
+    return;
+  }
+
+  above_specific_tokens_.insert(root.name);
+
+  for (auto const& definition : root.definitions) {
+    for (auto const& symbol : definition.symbols) {
+      switch (symbol.type) {
+        case grammar::Symbol::Type::kNonTerminal:
+          find_specific_elements(*symbol.element);
+          break;
+        case grammar::Symbol::Type::kCharacterClass:
+          specific_tokens_.insert(kCharacterClassNames[symbol.char_class]);
+          break;
+        case grammar::Symbol::Type::kTerminal:
+          std::unreachable();
+      }
+    }
+  }
+}
+
+// Find elements that have definitions that has ZeroOrMore matches with a final condition
+void Generator::check_need_last(grammar::Element const& element) {
+  for (auto const& definition : element.definitions) {
+    if (definition.symbols.size() < 2)
+      continue;
+    if (definition.symbols[definition.symbols.size() - 1].optional ==
+            grammar::Symbol::Optional::kRequired &&
+        definition.symbols[definition.symbols.size() - 1].type ==
+            grammar::Symbol::Type::kNonTerminal &&
+        (definition.symbols[definition.symbols.size() - 2].optional ==
+             grammar::Symbol::Optional::kZeroOrOne ||
+         definition.symbols[definition.symbols.size() - 2].optional ==
+             grammar::Symbol::Optional::kZeroOrMore) &&
+        definition.symbols[definition.symbols.size() - 2].type ==
+            grammar::Symbol::Type::kNonTerminal) {
+      if (!copy_last_.contains(definition.symbols[definition.symbols.size() - 2]
+                                   .element->name)) {
+        find_report_last(
+            *definition.symbols[definition.symbols.size() - 2].element,
+            *definition.symbols[definition.symbols.size() - 1].element);
+      }
+    }
+  }
+}
+
+// Find element that has match as a single definition, if so, return true and insert into
+// report_last_.
+bool Generator::find_report_last(grammar::Element const& root,
+                                 grammar::Element const& match) {
+  if (std::ranges::any_of(root.definitions, [&match](auto const& definition) {
+        return definition.symbols.size() == 1 &&
+               definition.symbols[0].optional ==
+                   grammar::Symbol::Optional::kRequired &&
+               definition.symbols[0].type ==
+                   grammar::Symbol::Type::kNonTerminal &&
+               definition.symbols[0].element == &match;
+      })) {
+    report_last_.insert(root.name);
+    return true;
+  }
+
+  for (auto const& definition : root.definitions) {
+    for (auto const& symbol : definition.symbols) {
+      if (symbol.type == grammar::Symbol::Type::kNonTerminal) {
+        if (!copy_last_.contains(symbol.element->name)) {
+          if (find_report_last(*symbol.element, match)) {
+            copy_last_.insert(root.name);
+            return true;
+          }
+        }
+      }
+    }
+  }
+
+  return false;
+}
+
+ReturnType Generator::get_return_type(grammar::Element const& element) const {
+  if (above_specific_tokens_.contains(element.name) ||
+      specific_tokens_.contains(element.name))
+    return ReturnType::kTokenAndSize;
+  if (copy_last_.contains(element.name) || report_last_.contains(element.name))
+    return ReturnType::kInternalAndSize;
+  return ReturnType::kSize;
+}
+
+ReturnType Generator::get_return_type(uint8_t character_class) const {
+  auto const& name = kCharacterClassNames[character_class];
+  if (above_specific_tokens_.contains(name) || specific_tokens_.contains(name))
+    return ReturnType::kTokenAndSize;
+  return ReturnType::kSize;
+}
+
+void Generator::find_all_elements(grammar::Element const& root) {
+  auto pair = all_elements_.insert(&root);
+  if (!pair.second)
+    return;
+
+  for (auto const& definition : root.definitions) {
+    for (auto const& symbol : definition.symbols) {
+      switch (symbol.type) {
+        case grammar::Symbol::Type::kNonTerminal:
+          find_all_elements(*symbol.element);
+          break;
+        case grammar::Symbol::Type::kCharacterClass:
+        case grammar::Symbol::Type::kTerminal:
+          break;
+      }
+    }
+  }
+}
+
+void declare_character_class_matchers(std::ostream& out) {
+  out << "[[nodiscard]]\n"
+      << "std::optional<size_t> matchLineTerminator(std::string_view str);\n";
+
+  out << "[[nodiscard]]\n"
+      << "static std::optional<size_t> matchInputCharacter(std::string_view "
+         "str);\n";
+
+  out << "[[nodiscard]]\n"
+      << "std::optional<std::pair<Token, size_t>> "
+         "matchWhiteSpace(std::string_view str);\n";
+
+  out << "[[nodiscard]]\n"
+      << "static std::optional<size_t> matchJavaLetter"
+      << "(std::string_view str);\n";
+
+  out << "[[nodiscard]]\n"
+      << "static std::optional<size_t> matchJavaLetterOrDigit"
+      << "(std::string_view str);\n";
+}
+
+void write_character_class_matchers(std::ostream& out,
+                                    std::string_view unicode_version) {
+  out << "[[nodiscard]]\n"
+      << "std::optional<size_t> TokenMatcher::matchLineTerminator"
+      << "(std::string_view str) {\n"
+      // Tokenizer normally reads one line at a time, there is only
+      // one construct (traditional comment) that needs it.
+      // So match synthetic '\n' or report that it was needed if we are at
+      // end of string.
+      << "  if (str.empty()) {\n"
+      << "    line_end_reached_ = true;\n"
+      << "    return std::nullopt;\n"
+      << "  }\n"
+      << "  return str.front() == '\\n' ? std::make_optional<size_t>(1)"
+      << " : std::nullopt;\n"
+      << "}\n"
+      << "\n";
+
+  out << "[[nodiscard]]\n"
+      << "std::optional<size_t> TokenMatcher::matchInputCharacter"
+      << "(std::string_view str) {\n"
+      // UnicodeInputCharacter but not CR or LF
+      << "  if (str.empty() || str.front() == '\\n')\n"
+      << "    return std::nullopt;\n"
+      << "  auto* const start = reinterpret_cast<uint8_t const*>(str.data());\n"
+      << "  auto* ptr = start;\n"
+      << "  u8::skip(ptr, start + str.size());\n"
+      << "  return ptr - start;\n"
+      << "}\n"
+      << "\n";
+
+  out << "[[nodiscard]]\n"
+      << "std::optional<std::pair<Token, size_t>> TokenMatcher::matchWhiteSpace"
+      << "(std::string_view str) {\n"
+      << "  if (!str.empty()) {\n"
+      << "    switch (str.front()) {\n"
+      // the ASCII SP character, also known as "space"
+      << "      case ' ':\n"
+      // the ASCII HT character, also known as "horizontal tab"
+      << "      case '\\t':\n"
+      // the ASCII FF character, also known as "form feed"
+      << "      case '\\f':\n"
+      // LineTerminator, not calling matchLineTerminator as it does special things
+      << "      case '\\n':\n"
+      << "        return std::make_pair(Token::kWhiteSpace, 1);\n"
+      << "      default:\n"
+      << "        break;\n"
+      << "    }\n"
+      << "  }\n"
+      << "  return std::nullopt;\n"
+      << "}\n"
+      << "\n";
+
+  out << "[[nodiscard]]\n"
+      << "std::optional<size_t> TokenMatcher::matchJavaLetter"
+      << "(std::string_view str) {\n"
+      << "  auto* const start = reinterpret_cast<uint8_t const*>(str.data());\n"
+      << "  auto* ptr = start;\n"
+      << "  auto code = u8::read(ptr, ptr + str.size());\n"
+      << "  if (!code.has_value())\n"
+      << "    return std::nullopt;\n"
+      // any Unicode character that is a "Java letter"
+      // A "Java letter" is a character for which the method Character.isJavaIdentifierStart(int) returns true.
+      // A character may start a Java identifier if and only if one of the following conditions is true:
+      //   isLetter(codePoint) returns true
+      //   getType(codePoint) returns LETTER_NUMBER
+      //   the referenced character is a currency symbol (such as '$')
+      //   the referenced character is a connecting punctuation character (such as '_').
+      << "  switch (u::lookup_gc(code.value(), u::Version::" << unicode_version
+      << ")) {\n"
+      << "    case u::GeneralCategory::LETTER_UPPERCASE:\n"
+      << "    case u::GeneralCategory::LETTER_LOWERCASE:\n"
+      << "    case u::GeneralCategory::LETTER_TITLECASE:\n"
+      << "    case u::GeneralCategory::LETTER_MODIFIER:\n"
+      << "    case u::GeneralCategory::LETTER_OTHER:\n"
+      << "    case u::GeneralCategory::NUMBER_LETTER:\n"
+      << "    case u::GeneralCategory::SYMBOL_CURRENCY:\n"
+      << "    case u::GeneralCategory::PUNCTUATION_CONNECTOR:\n"
+      << "      return ptr - start;\n"
+      << "    default:\n"
+      << "      return std::nullopt;\n"
+      << "  }\n"
+      << "}\n"
+      << "\n";
+
+  out << "[[nodiscard]]\n"
+      << "std::optional<size_t> TokenMatcher::matchJavaLetterOrDigit"
+      << "(std::string_view str) {\n"
+      << "  auto* const start = reinterpret_cast<uint8_t const*>(str.data());\n"
+      << "  auto* ptr = start;\n"
+      << "  auto code = u8::read(ptr, ptr + str.size());\n"
+      << "  if (!code.has_value())\n"
+      << "    return std::nullopt;\n"
+      // any Unicode character that is a "Java letter-or-digit"
+      // A "Java letter-or-digit" is a character for which the method Character.isJavaIdentifierPart(int) returns true.
+      // A character may be part of a Java identifier if any of the following conditions are true:
+      //   it is a letter
+      //   it is a currency symbol (such as '$')
+      //   it is a connecting punctuation character (such as '_')
+      //   it is a digit
+      //   it is a numeric letter (such as a Roman numeral character)
+      //   it is a combining mark
+      //   it is a non-spacing mark
+      //   isIdentifierIgnorable returns true for the character
+      << "  switch (u::lookup_gc(code.value(), u::Version::" << unicode_version
+      << ")) {\n"
+      << "    case u::GeneralCategory::LETTER_UPPERCASE:\n"
+      << "    case u::GeneralCategory::LETTER_LOWERCASE:\n"
+      << "    case u::GeneralCategory::LETTER_TITLECASE:\n"
+      << "    case u::GeneralCategory::LETTER_MODIFIER:\n"
+      << "    case u::GeneralCategory::LETTER_OTHER:\n"
+      << "    case u::GeneralCategory::SYMBOL_CURRENCY:\n"
+      << "    case u::GeneralCategory::PUNCTUATION_CONNECTOR:\n"
+      << "    case u::GeneralCategory::NUMBER_DIGIT:\n"
+      << "    case u::GeneralCategory::NUMBER_LETTER:\n"
+      << "    case u::GeneralCategory::MARK_SPACING_COMBINING:\n"
+      << "    case u::GeneralCategory::MARK_NONSPACING:\n"
+      << "    case u::GeneralCategory::OTHER_FORMAT:\n"
+      << "      return ptr - start;\n"
+      << "    case u::GeneralCategory::OTHER_CONTROL:\n"
+      << "      if ((/* code.value() >= 0 && */ code.value() <= 8) ||\n"
+      << "          (code.value() >= 0xe && code.value() <= 0x1b) ||\n"
+      << "          (code.value() >= 0x7f && code.value() <= 0x9f))\n"
+      << "        return 1;\n"
+      << "      break;\n"
+      << "    default:\n"
+      << "      break;\n"
+      << "  }\n"
+      << "  return std::nullopt;\n"
+      << "}\n"
+      << "\n";
+}
+
+std::ostream& quote(std::ostream& out, std::string_view in) {
+  out << '"';
+  bool avoid_digit = false;
+  for (auto c : in) {
+    if (c == '"' || c == '\\') {
+      out << '\\';
+    } else if (c < ' ' || (c & 0x80)) {
+      char tmp[4];
+      std::to_chars(tmp, tmp + sizeof(tmp), c & 0xff, 8).ptr[0] = 0;
+      out << "\\" << tmp;
+      avoid_digit = true;
+      continue;
+    } else if (avoid_digit) {
+      if (c >= '0' && c <= '7') {
+        out << "\" \"";
+      }
+    }
+    avoid_digit = false;
+    out << c;
+  }
+  out << '"';
+  return out;
+}
+
+void match_return_type(std::ostream& out, ReturnType in_return_type,
+                       std::string_view in_name, ReturnType out_return_type) {
+  switch (out_return_type) {
+    case ReturnType::kTokenAndSize:
+      switch (in_return_type) {
+        case ReturnType::kTokenAndSize:
+          break;
+        case ReturnType::kInternalAndSize:
+          out << ".transform([](auto pair) { return std::make_pair(Token::k"
+              << in_name << ", pair.second); }";
+          break;
+        case ReturnType::kSize:
+          out << ".transform([](auto size) { return std::make_pair(Token::k"
+              << in_name << ", size); })";
+          break;
+      }
+      break;
+    case ReturnType::kInternalAndSize:
+      switch (in_return_type) {
+        case ReturnType::kTokenAndSize:
+          out << ".transform([](auto pair) { return std::make_pair(Internal::k"
+              << in_name << ", pair.second); }";
+          break;
+        case ReturnType::kInternalAndSize:
+          break;
+        case ReturnType::kSize:
+          if (in_name.empty()) {
+            out << ".transform([](auto size) { return "
+                   "std::make_pair(Internal::UNDEFINED, size); })";
+          } else {
+            out << ".transform([](auto size) { return "
+                   "std::make_pair(Internal::k"
+                << in_name << ", size); })";
+          }
+          break;
+      }
+      break;
+    case ReturnType::kSize:
+      if (in_return_type != ReturnType::kSize) {
+        out << ".transform([](auto pair) { return pair.second; })";
+      }
+      break;
+  }
+}
+
+void Generator::write_matcher(std::ostream& out, grammar::Symbol const& symbol,
+                              ReturnType return_type,
+                              std::string_view str_arg) {
+  std::string_view in_name;
+  ReturnType in_return_type;
+
+  switch (symbol.type) {
+    case grammar::Symbol::Type::kTerminal:
+      in_return_type = ReturnType::kSize;
+      out << "(" << str_arg << ".starts_with(";
+      quote(out, symbol.value);
+      out << ") ? std::make_optional<size_t>(" << symbol.value.size()
+          << ") : " << "std::nullopt)";
+      break;
+    case grammar::Symbol::Type::kNonTerminal:
+      out << "match" << symbol.element->name << "(" << str_arg << ")";
+      in_return_type = get_return_type(*symbol.element);
+      in_name = symbol.element->name;
+      break;
+    case grammar::Symbol::Type::kCharacterClass:
+      out << "match" << kCharacterClassNames[symbol.char_class] << "("
+          << str_arg << ")";
+      in_return_type = get_return_type(symbol.char_class);
+      in_name = kCharacterClassNames[symbol.char_class];
+      break;
+  }
+
+  match_return_type(out, in_return_type, in_name, return_type);
+}
+
+bool Generator::write_matcher(std::ostream& out,
+                              grammar::Definition const& definition,
+                              ReturnType return_type, std::string_view indent) {
+  if (definition.symbols.size() == 1 &&
+      definition.symbols[0].optional == grammar::Symbol::Optional::kRequired) {
+    out << indent << "return ";
+    write_matcher(out, definition.symbols[0], return_type, "str");
+    out << ";\n";
+    return true;
+  }
+
+  std::string_view size_suffix;
+  switch (return_type) {
+    case ReturnType::kTokenAndSize:
+    case ReturnType::kInternalAndSize:
+      size_suffix = "->second";
+      break;
+    case ReturnType::kSize:
+      size_suffix = ".value()";
+      break;
+  }
+
+  if (definition.symbols.size() > 1 &&
+      definition.symbols[0].optional == grammar::Symbol::Optional::kRequired &&
+      definition.symbols[1].optional == grammar::Symbol::Optional::kExcluded) {
+    bool first = true;
+    for (auto const& symbol : definition.symbols) {
+      if (first) {
+        out << indent << "auto first_ret = ";
+        write_matcher(out, symbol, return_type, "str");
+        out << ";\n"
+            << indent << "if (!first_ret.has_value())\n"
+            << indent << "  return first_ret;\n"
+            << indent << "std::optional<size_t> ret;\n"
+            << indent << "auto tmp = str.substr(0, first_ret" << size_suffix
+            << ");\n";
+        first = false;
+      } else {
+        if (symbol.optional != grammar::Symbol::Optional::kExcluded) {
+          std::cerr << "Non-excluded after at least one excluded\n";
+          return false;
+        }
+        out << indent << "ret = ";
+        write_matcher(out, symbol, ReturnType::kSize, "tmp");
+        out << ";\n"
+            << indent << "if (ret.has_value() && ret.value() == tmp.size())\n"
+            << indent << "  return std::nullopt;\n";
+      }
+    }
+    out << indent << "return first_ret;\n";
+    return true;
+  }
+
+  if (std::ranges::all_of(definition.symbols, [](auto const& symbol) {
+        return symbol.optional == grammar::Symbol::Optional::kRequired;
+      })) {
+    out << indent << "size_t tot = 0;\n";
+    bool first = true;
+    for (auto const& symbol : definition.symbols) {
+      std::string indent2(indent);
+      if (first) {
+        out << indent2 << "auto ret = ";
+        write_matcher(out, symbol, return_type, "str");
+        out << ";\n";
+        first = false;
+      } else {
+        out << indent2 << "ret = ";
+        write_matcher(out, symbol, return_type, "str.substr(tot)");
+        out << ";\n";
+      }
+      out << indent2 << "if (!ret.has_value())\n"
+          << indent2 << "  return ret;\n";
+      out << indent2 << "tot += ret" << size_suffix << ";\n";
+    }
+    switch (return_type) {
+      case ReturnType::kInternalAndSize:
+        // Return last internal
+        out << indent << "return std::make_pair(ret->first, tot);\n";
+        break;
+      case ReturnType::kTokenAndSize:
+        std::cerr << "Unable to return token and size\n";
+        return false;
+      case ReturnType::kSize:
+        out << indent << "return tot;\n";
+        break;
+    }
+    return true;
+  }
+
+  out << indent << "size_t tot = 0;\n";
+  bool last_internal = false;
+  switch (return_type) {
+    case ReturnType::kInternalAndSize:
+      last_internal = true;
+      out << indent << "std::optional<Internal> last_internal;\n";
+      break;
+    case ReturnType::kTokenAndSize:
+    case ReturnType::kSize:
+      break;
+  }
+  bool at_least_one_required = false;
+  bool first = true;
+  bool first_internal = true;
+  bool next_internal = false;
+  for (size_t i = 0; i < definition.symbols.size(); ++i) {
+    auto const& symbol = definition.symbols[i];
+    std::string indent2(indent);
+    bool have_internal = next_internal;
+    next_internal = false;
+    ReturnType symbol_return_type = return_type;
+
+    if (symbol.optional != grammar::Symbol::Optional::kRequired &&
+        i + 1 < definition.symbols.size() &&
+        definition.symbols[i + 1].optional ==
+            grammar::Symbol::Optional::kRequired &&
+        definition.symbols[i + 1].type == grammar::Symbol::Type::kNonTerminal) {
+      symbol_return_type = ReturnType::kInternalAndSize;
+      next_internal = true;
+    }
+
+    switch (symbol_return_type) {
+      case ReturnType::kTokenAndSize:
+      case ReturnType::kInternalAndSize:
+        size_suffix = "->second";
+        break;
+      case ReturnType::kSize:
+        size_suffix = ".value()";
+        break;
+    }
+
+    switch (symbol.optional) {
+      case grammar::Symbol::Optional::kRequired:
+        at_least_one_required = true;
+        break;
+      case grammar::Symbol::Optional::kZeroOrOne:
+        break;
+      case grammar::Symbol::Optional::kZeroOrMore:
+        if (first) {
+          switch (symbol_return_type) {
+            case ReturnType::kTokenAndSize:
+              out << indent << "std::optional<std::pair<Token, size_t>> ret;\n";
+              break;
+            case ReturnType::kInternalAndSize:
+              out << indent
+                  << "std::optional<std::pair<Internal, size_t>> ret;\n";
+              break;
+            case ReturnType::kSize:
+              out << indent << "std::optional<size_t> ret;\n";
+              break;
+          }
+          first = false;
+        }
+        out << indent << "while (true) {\n";
+        indent2 += "  ";
+        break;
+      case grammar::Symbol::Optional::kExcluded:
+        std::cerr << "Excluded mixed with conditional\n";
+        return false;
+    }
+    if (symbol_return_type == return_type) {
+      if (first) {
+        out << indent2 << "auto ret = ";
+        write_matcher(out, symbol, symbol_return_type, "str");
+        first = false;
+      } else {
+        out << indent2 << "ret = ";
+        write_matcher(out, symbol, symbol_return_type, "str.substr(tot)");
+      }
+      out << ";\n";
+    } else {
+      if (first_internal) {
+        out << indent2 << "auto ret_internal = ";
+        write_matcher(out, symbol, symbol_return_type,
+                      first ? "str" : "str.substr(tot)");
+        first_internal = false;
+      } else {
+        out << indent2 << "ret_internal = ";
+        write_matcher(out, symbol, symbol_return_type, "str.substr(tot)");
+      }
+      out << ";\n";
+      if (first) {
+        out << indent2 << "auto ret = ret_internal";
+        first = false;
+      } else {
+        out << indent2 << "ret = ret_internal";
+      }
+      match_return_type(out, symbol_return_type, "", return_type);
+      out << ";\n";
+    }
+    switch (symbol.optional) {
+      case grammar::Symbol::Optional::kRequired:
+        out << indent2 << "if (!ret.has_value()) {\n";
+        if (have_internal &&
+            symbol.type == grammar::Symbol::Type::kNonTerminal) {
+          out << indent2
+              << "  if (!ret_internal.has_value() || ret_internal->first != "
+                 "Internal::k"
+              << symbol.element->name << ")\n"
+              << indent2 << "    return ret;\n";
+        } else {
+          out << indent2 << "  return ret;\n";
+        }
+        out << indent2 << "} else {\n"
+            << indent2 << "  tot += ret" << size_suffix << ";\n";
+        if (last_internal)
+          out << indent2 << "  last_internal = ret->first;\n";
+        out << indent2 << "}\n";
+        break;
+      case grammar::Symbol::Optional::kZeroOrOne:
+        if (symbol_return_type == ReturnType::kTokenAndSize) {
+          out << indent2 << "tot += ret.has_value() ? ret->second : 0;\n";
+        } else {
+          out << indent2 << "tot += ret.value_or(0);\n";
+        }
+        if (last_internal)
+          out << indent2 << "if (ret.has_value())\n"
+              << indent2 << "  last_internal = ret->first;\n";
+        break;
+      case grammar::Symbol::Optional::kZeroOrMore:
+        out << indent2 << "if (!ret.has_value())\n"
+            << indent2 << "  break;\n"
+            << indent2 << "tot += ret" << size_suffix << ";\n";
+        if (last_internal)
+          out << indent2 << "last_internal = ret->first;\n";
+        out << indent << "}\n";
+        break;
+      case grammar::Symbol::Optional::kExcluded:
+        assert(false);
+        break;
+    }
+  }
+  switch (return_type) {
+    case ReturnType::kInternalAndSize:
+      // Return last internal
+      if (at_least_one_required) {
+        out << indent << "return std::make_pair(last_internal.value(), tot);\n";
+      } else {
+        out << indent << "if (last_internal.has_value())\n"
+            << indent
+            << "  return std::make_pair(last_internal.value(), tot);\n"
+            << indent << "return std::make_pair(Internal::UNDEFINED, tot);\n";
+      }
+      break;
+    case ReturnType::kTokenAndSize:
+      std::cerr << "Unable to return token and size\n";
+      return false;
+    case ReturnType::kSize:
+      out << indent << "return tot;\n";
+      break;
+  }
+  return true;
+}
+
+void declare_matcher(std::ostream& out, grammar::Element const& element,
+                     ReturnType return_type) {
+  switch (return_type) {
+    case ReturnType::kSize:
+      out << "[[nodiscard]]\n"
+          << "std::optional<size_t> match" << element.name
+          << "(std::string_view str);\n";
+      break;
+    case ReturnType::kTokenAndSize:
+      out << "[[nodiscard]]\n"
+          << "std::optional<std::pair<Token, size_t>> match" << element.name
+          << "(std::string_view str);\n";
+      break;
+    case ReturnType::kInternalAndSize:
+      out << "[[nodiscard]]\n"
+          << "std::optional<std::pair<Internal, size_t>> match" << element.name
+          << "(std::string_view str);\n";
+      break;
+  }
+}
+
+bool Generator::write_matcher(std::ostream& out,
+                              grammar::Element const& element,
+                              ReturnType return_type) {
+  ReturnType sub_return_type = return_type;
+  bool make_token = false;
+
+  switch (return_type) {
+    case ReturnType::kSize:
+      out << "[[nodiscard]]\n"
+          << "std::optional<size_t> TokenMatcher::match" << element.name
+          << "(std::string_view str) {\n";
+      break;
+    case ReturnType::kTokenAndSize:
+      out << "[[nodiscard]]\n"
+          << "std::optional<std::pair<Token, size_t>> TokenMatcher::match"
+          << element.name << "(std::string_view str) {\n";
+
+      if (specific_tokens_.contains(element.name)) {
+        sub_return_type = ReturnType::kSize;
+        make_token = true;
+      }
+      break;
+    case ReturnType::kInternalAndSize:
+      out << "[[nodiscard]]\n"
+          << "std::optional<std::pair<TokenMatcher::Internal, size_t>> "
+             "TokenMatcher::match"
+          << element.name << "(std::string_view str) {\n";
+      break;
+  }
+
+  if (element.definitions.size() == 1) {
+    if (make_token) {
+      out << "  auto ret = [this, str]() -> std::optional<size_t> {\n";
+      if (!write_matcher(out, element.definitions[0], sub_return_type,
+                         "    ")) {
+        std::cerr << "Error in " << element.name << "\n";
+        return false;
+      }
+      out << "  }();\n"
+          << "  return ret.transform([](auto size) {\n"
+          << "    return std::make_pair(Token::k" << element.name
+          << ", size); });\n";
+    } else {
+      if (!write_matcher(out, element.definitions[0], sub_return_type, "  ")) {
+        std::cerr << "Error in " << element.name << "\n";
+        return false;
+      }
+    }
+  } else if (std::ranges::all_of(
+                 element.definitions, [](auto const& definition) {
+                   return definition.symbols.size() == 1 &&
+                          definition.symbols[0].optional ==
+                              grammar::Symbol::Optional::kRequired &&
+                          definition.symbols[0].type ==
+                              grammar::Symbol::Type::kTerminal;
+                 })) {
+    if (std::ranges::all_of(element.definitions, [](auto const& definition) {
+          return definition.symbols[0].value.size() == 1;
+        })) {
+      out << "  if (!str.empty()) {\n"
+          << "    switch (str.front()) {\n";
+      for (auto const& definition : element.definitions) {
+        out << "      case '" << definition.symbols[0].value[0] << "':\n";
+      }
+      out << "        return 1;\n"
+          << "      default:\n"
+          << "        break;\n"
+          << "    }\n"
+          << "  }\n"
+          << "  return std::nullopt;\n";
+    } else {
+      auto builder = prefix_tree::builder();
+      for (auto const& definition : element.definitions) {
+        builder->add(definition.symbols[0].value);
+      }
+      auto tree = builder->build();
+      if (!tree.has_value()) {
+        std::cerr << "To large prefix tree\n";
+        return false;
+      }
+      out << "  static const auto tree = ";
+      quote(out, tree.value()) << "sv;\n";
+      out << "  return prefix_tree::lookup(tree, str)";
+      if (make_token) {
+        out << ".transform([](auto size) {\n"
+            << "  return std::make_pair(Token::k" << element.name
+            << ", size); })";
+      }
+      out << ";\n";
+    }
+  } else {
+    bool first = true;
+    std::string_view ret_type;
+    switch (sub_return_type) {
+      case ReturnType::kTokenAndSize:
+        ret_type = "std::optional<std::pair<Token, size_t>>";
+        break;
+      case ReturnType::kInternalAndSize:
+        ret_type = "std::optional<std::pair<Internal, size_t>>";
+        break;
+      case ReturnType::kSize:
+        ret_type = "std::optional<size_t>";
+        break;
+    }
+    for (auto const& definition : element.definitions) {
+      if (first) {
+        first = false;
+        out << "  auto tmp = [this, str]() -> " << ret_type << " {\n";
+        if (!write_matcher(out, definition, sub_return_type, "    ")) {
+          std::cerr << "Error in " << element.name << "\n";
+          return false;
+        }
+        out << "  }();\n";
+        out << "  auto ret = tmp;\n";
+      } else {
+        out << "  tmp = [this, str]() -> " << ret_type << " {\n";
+        if (!write_matcher(out, definition, sub_return_type, "    ")) {
+          std::cerr << "Error in " << element.name << "\n";
+          return false;
+        }
+        out << "  }();\n"
+            << "  if (tmp.has_value()) {\n";
+        if (sub_return_type == ReturnType::kTokenAndSize) {
+          out << "    if (!ret.has_value() || ret.value().second < "
+                 "tmp.value().second) {\n";
+        } else {
+          out << "    if (!ret.has_value() || ret.value() < tmp.value()) {\n";
+        }
+        out << "      ret = tmp;\n"
+            << "    }\n"
+            << "  }\n";
+      }
+    }
+    if (make_token) {
+      out << "  return ret.transform([](auto size) {\n"
+          << "    return std::make_pair(Token::k" << element.name
+          << ", size); });\n";
+    } else {
+      out << "  return ret;\n";
+    }
+  }
+
+  out << "}\n"
+      << "\n";
+  return true;
+}
+
+bool Generator::generate(std::string_view header_name,
+                         std::string_view source_name, std::string const& ns,
+                         std::string const& unicode_version,
+                         grammar::Grammar& grammar) {
+  std::fstream header{std::string(header_name),
+                      std::fstream::trunc | std::fstream::out};
+  std::fstream source{std::string(source_name),
+                      std::fstream::trunc | std::fstream::out};
+
+  auto header_guard = make_define(header_name);
+
+  header << "#ifndef " << header_guard << "\n"
+         << "#define " << header_guard << "\n"
+         << "\n"
+         << "#include \"prefix_tree.hh\"\n"
+         << "\n"
+         << "#include <cstddef>\n"
+         << "#include <cstdint>\n"
+         << "#include <optional>\n"
+         << "#include <string_view>\n"
+         << "#include <utility>\n"
+         << "\n"
+         << "namespace " << ns << " {\n"
+         << "\n";
+
+  find_specific_elements(grammar.root());
+
+  find_all_elements(grammar.root());
+
+  for (auto& element : all_elements_) {
+    check_need_last(*element);
+  }
+
+  header << "enum class Token : "
+         << (specific_tokens_.size() < 256 ? "uint8_t" : "uint16_t") << " {\n";
+  for (auto const& token : specific_tokens_) {
+    header << "  k" << token << ",\n";
+  }
+  header << "};\n";
+
+  header << "\n"
+         << "class TokenMatcher {\n"
+         << " public:\n"
+         << "  [[nodiscard]]\n"
+         << "  std::optional<std::pair<Token, size_t>>"
+         << "   matchNext(std::string_view str);\n"
+         << "\n"
+         << "  bool line_end_reached_{false};\n"
+         << "\n"
+         << " private:\n";
+
+  declare_character_class_matchers(header);
+
+  header << "\n"
+         << "enum class Internal : "
+         << (all_elements_.size() < 256 ? "uint8_t" : "uint16_t") << " {\n"
+         << "  UNDEFINED,\n";
+  for (auto* element : all_elements_) {
+    header << "  k" << element->name << ",\n";
+  }
+  header << "};\n"
+         << "\n";
+
+  for (auto* element : all_elements_) {
+    declare_matcher(header, *element, get_return_type(*element));
+  }
+
+  header << "\n"
+         << "};\n"
+         << "\n"
+         << "}  // " << ns << "\n"
+         << "\n"
+         << "#endif  // " << header_guard << "\n";
+
+  source << "#include \"" << header_name << "\"\n"
+         << "\n"
+         << "#include \"prefix_tree.hh\"\n"
+         << "#include \"u.hh\"\n"
+         << "#include \"u8.hh\"\n"
+         << "\n"
+         << "#include <cstddef>\n"
+         << "#include <optional>\n"
+         << "#include <string_view>\n"
+         << "#include <utility>\n"
+         << "\n"
+         << "using namespace std::literals::string_view_literals;\n"
+         << "\n"
+         << "// NOLINTBEGIN(readability-else-after-return, "
+            "readability-convert-member-functions-to-static)\n"
+         << "#pragma GCC diagnostic push\n"
+         << "#pragma GCC diagnostic ignored \"-Wunused-lambda-capture\"\n"
+         << "\n"
+         << "namespace " << ns << " {\n"
+         << "\n";
+
+  write_character_class_matchers(source, unicode_version);
+
+  source << "\n";
+
+  if (std::ranges::any_of(all_elements_, [this, &source](auto* element) {
+        auto sts = get_return_type(*element);
+        return !write_matcher(source, *element, sts);
+      })) {
+    return false;
+  }
+
+  source << "\n"
+         << "std::optional<std::pair<Token, size_t>>"
+         << "TokenMatcher::matchNext(std::string_view str) {\n"
+         << "  return match" << grammar.root().name << "(str);\n"
+         << "}"
+         << "\n"
+         << "}  // namespace " << ns << "\n"
+         << "\n"
+         << "// NOLINTEND(readability-else-after-return, "
+            "readability-convert-member-functions-to-static)\n"
+         << "#pragma GCC diagnostic pop\n"
+         << "\n";
+
+  return true;
+}
+
+}  // namespace
+
+int main(int argc, char** argv) {
+  auto args = Args::create();
+  auto opt_help = args->option('h', "help", "display this text and exit");
+  auto opt_ns = args->option_argument('\0', "namespace", "ARG",
+                                      "Namespace for tokenizer");
+  auto opt_unicode =
+      args->option_argument('u', "unicode", "ARG", "Unicode version");
+  std::vector<std::string_view> arguments;
+  if (!args->run(argc, argv, &arguments)) {
+    args->print_error(std::cerr);
+    std::cerr << "Try `gen_tokens --help` for usage\n";
+    return 1;
+  }
+  if (opt_help->is_set()) {
+    std::cout << "Usage: `gen_tokens [OPTIONS...] tokens.grammar"
+              << " OUTPUT.hh OUTPUT.cc`\n"
+              << "Generates a tokenizer for grammar.\n"
+              << "\n";
+    args->print_help(std::cout);
+    return 0;
+  }
+  if (!opt_ns->is_set()) {
+    std::cerr << "No namespace given.\n"
+              << "Try `gen_tokens --help` for usage\n";
+    return 1;
+  }
+  if (!opt_unicode->is_set()) {
+    std::cerr << "No unicode version given.\n"
+              << "Try `gen_tokens --help` for usage\n";
+    return 1;
+  }
+  auto ns = opt_ns->argument();
+  auto unicode = opt_unicode->argument();
+  if (arguments.size() != 3) {
+    std::cerr << "Expecting three arguments. No more, no less.\n"
+              << "Try `gen_tokens --help` for usage\n";
+    return 1;
+  }
+
+  auto filename = std::string(arguments[0]);
+  auto reader = io::open(filename);
+  if (!reader.has_value()) {
+    std::cerr << "Unable to open " << filename << '\n';
+    return 1;
+  }
+  auto errors = src::file_errors(std::move(filename));
+  auto grammar =
+      grammar::load(std::move(reader.value()), kCharacterClassNames, *errors);
+  if (!grammar || errors->errors() > 0)
+    return 1;
+
+  Generator generator;
+  if (!generator.generate(arguments[1], arguments[2], ns, unicode, *grammar))
+    return 1;
+  return 0;
+}
diff --git a/src/grammar.cc b/src/grammar.cc
new file mode 100644
index 0000000..25c4d64
--- /dev/null
+++ b/src/grammar.cc
@@ -0,0 +1,268 @@
+#include "grammar.hh"
+
+#include "errors.hh"
+#include "line.hh"
+#include "location.hh"
+#include "str.hh"
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <format>
+#include <functional>
+#include <map>
+#include <memory>
+#include <optional>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+namespace grammar {
+
+namespace {
+
+class GrammarImpl : public Grammar {
+ public:
+  explicit GrammarImpl(std::vector<std::unique_ptr<Element>> elements)
+      : elements_(std::move(elements)) {}
+
+  [[nodiscard]]
+  Element const& root() const override {
+    return *elements_[0];
+  }
+
+ private:
+  std::vector<std::unique_ptr<Element>> elements_;
+};
+
+struct FirstPassElement {
+  src::Location loc;
+  std::vector<std::string> definitions;
+
+  explicit FirstPassElement(src::Location loc) : loc(loc) {}
+};
+
+class GrammarLoader {
+ public:
+  GrammarLoader(std::unique_ptr<io::Reader> reader,
+                std::vector<std::string> const& character_classes,
+                src::Errors& errors)
+      : reader_(line::open(std::move(reader))),
+        character_classes_(character_classes),
+        errors_(errors) {}
+
+  std::unique_ptr<Grammar> load() {
+    // Read whole file in a first pass, before parsing definitions.
+    std::map<std::string, FirstPassElement> first_pass_elements;
+    FirstPassElement* last_element = nullptr;
+
+    std::map<std::string_view, uint8_t, std::less<>> cc_lookup;
+    for (uint8_t i = 0; i < static_cast<uint8_t>(character_classes_.size());
+         ++i) {
+      cc_lookup.emplace(character_classes_[i], i);
+    }
+
+    while (true) {
+      auto line = reader_->read();
+      auto loc = src::Location(reader_->number(), 0);
+      if (!line.has_value()) {
+        if (line.error() == io::ReadError::Eof)
+          break;
+        errors_.err(loc, "Error reading");
+        return nullptr;
+      }
+      if (line.value().empty() || line.value().front() == '#')
+        continue;
+      if (line.value().front() == ' ') {
+        // Continue on last element
+        size_t i = 1;
+        while (i < line.value().size() && line.value()[i] == ' ')
+          ++i;
+        if (i == line.value().size()) {
+          errors_.err(loc, "Unexpected line, only spaces");
+          continue;
+        }
+        if (last_element == nullptr) {
+          errors_.err(loc, "Expected element before indented lines");
+          continue;
+        }
+        last_element->definitions.emplace_back(line.value().substr(i));
+      } else {
+        // New element
+        if (line.value().back() != ':') {
+          errors_.err(
+              loc,
+              "Unexpected line, not indented but also not ending with a ':'");
+          continue;
+        }
+
+        auto name = line.value().substr(0, line.value().size() - 1);
+        if (cc_lookup.contains(name)) {
+          errors_.warn(
+              loc, std::format("Element {} overrides character class", name));
+        }
+        auto pair = first_pass_elements.emplace(name, FirstPassElement(loc));
+        if (!pair.second) {
+          errors_.err(loc, std::format("Duplicate element {}", name));
+        }
+        last_element = &pair.first->second;
+      }
+    }
+
+    if (first_pass_elements.empty()) {
+      errors_.err(src::Location(reader_->number(), 0), "No elements found");
+      return nullptr;
+    }
+
+    std::vector<std::unique_ptr<Element>> second_pass_elements;
+    std::map<std::string_view, size_t, std::less<>> second_pass_lookup;
+    for (auto const& pair : first_pass_elements) {
+      auto element = std::make_unique<Element>();
+      element->name = pair.first;
+      second_pass_lookup.emplace(element->name, second_pass_elements.size());
+      second_pass_elements.emplace_back(std::move(element));
+    }
+
+    auto it = second_pass_elements.begin();
+    for (auto const& pair : first_pass_elements) {
+      auto const& element = *it++;
+      if (pair.second.definitions.empty()) {
+        errors_.err(pair.second.loc,
+                    std::format("No definitions for {}", pair.first));
+        continue;
+      }
+      std::vector<std::string_view> in_symbols;
+      for (auto const& in_definition : pair.second.definitions) {
+        str::split(in_definition, in_symbols);
+
+        std::vector<Symbol> out_symbols;
+        bool exclude = false;
+        bool expect_not = false;
+        for (auto in_symbol : in_symbols) {
+          Symbol out_symbol;
+          if (exclude) {
+            if (in_symbol == "or")
+              continue;
+            out_symbol.optional = Symbol::Optional::kExcluded;
+          } else if (expect_not) {
+            expect_not = false;
+            if (in_symbol == "not") {
+              exclude = true;
+            } else {
+              errors_.err(pair.second.loc, "but is not followed by not");
+            }
+            continue;
+          }
+          if (in_symbol == "but") {
+            expect_not = true;
+            continue;
+          }
+          if (in_symbol.front() == '{' && in_symbol.back() == '}') {
+            if (exclude) {
+              errors_.err(pair.second.loc,
+                          "Optional and exclude doesn't work together");
+            } else {
+              out_symbol.optional = Symbol::Optional::kZeroOrMore;
+            }
+            in_symbol = in_symbol.substr(1, in_symbol.size() - 2);
+          } else if (in_symbol.front() == '[' && in_symbol.back() == ']') {
+            if (exclude) {
+              errors_.err(pair.second.loc,
+                          "Optional and exclude doesn't work together");
+            } else {
+              out_symbol.optional = Symbol::Optional::kZeroOrOne;
+            }
+            in_symbol = in_symbol.substr(1, in_symbol.size() - 2);
+          }
+          auto it2 = second_pass_lookup.find(in_symbol);
+          if (it2 != second_pass_lookup.end()) {
+            out_symbol.type = Symbol::Type::kNonTerminal;
+            out_symbol.element = second_pass_elements[it2->second].get();
+          } else {
+            auto it3 = cc_lookup.find(in_symbol);
+            if (it3 != cc_lookup.end()) {
+              out_symbol.type = Symbol::Type::kCharacterClass;
+              out_symbol.char_class = it3->second;
+            } else {
+              out_symbol.type = Symbol::Type::kTerminal;
+              out_symbol.value = in_symbol;
+            }
+          }
+          out_symbols.emplace_back(std::move(out_symbol));
+        }
+
+        if (expect_not) {
+          errors_.err(pair.second.loc, "but is not followed by not");
+        }
+
+        if (out_symbols.empty()) {
+          errors_.err(pair.second.loc, "no symbols found in definition");
+          continue;
+        }
+
+        element->definitions.emplace_back(
+            Definition{.symbols = std::move(out_symbols)});
+      }
+    }
+
+    // Find root and move it first (if needed)
+    std::vector<size_t> used(second_pass_elements.size(), 0);
+    for (auto const& element : second_pass_elements) {
+      for (auto const& definition : element->definitions) {
+        for (auto const& symbol : definition.symbols) {
+          switch (symbol.type) {
+            case Symbol::Type::kTerminal:
+            case Symbol::Type::kCharacterClass:
+              break;
+            case Symbol::Type::kNonTerminal:
+              used[second_pass_lookup.find(symbol.element->name)->second]++;
+              break;
+          }
+        }
+      }
+    }
+
+    std::optional<size_t> root_index;
+    for (size_t i = 0; i < used.size(); ++i) {
+      if (used[i] == 0) {
+        if (root_index.has_value()) {
+          errors_.warn(first_pass_elements.find(second_pass_elements[i]->name)
+                           ->second.loc,
+                       "Is not referenced but also not root");
+        } else {
+          root_index = i;
+        }
+      }
+    }
+
+    if (root_index.has_value()) {
+      if (root_index.value() != 0) {
+        std::swap(second_pass_elements[0],
+                  second_pass_elements[root_index.value()]);
+      }
+    } else {
+      errors_.err(
+          first_pass_elements.find(second_pass_elements[0]->name)->second.loc,
+          "No root element found");
+    }
+
+    return std::make_unique<GrammarImpl>(std::move(second_pass_elements));
+  }
+
+ private:
+  std::unique_ptr<line::Reader> reader_;
+  std::vector<std::string> const& character_classes_;
+  src::Errors& errors_;
+};
+
+}  // namespace
+
+std::unique_ptr<Grammar> load(std::unique_ptr<io::Reader> reader,
+                              std::vector<std::string> const& character_classes,
+                              src::Errors& errors) {
+  GrammarLoader loader(std::move(reader), character_classes, errors);
+  return loader.load();
+}
+
+}  // namespace grammar
diff --git a/src/grammar.hh b/src/grammar.hh
new file mode 100644
index 0000000..13beec4
--- /dev/null
+++ b/src/grammar.hh
@@ -0,0 +1,80 @@
+#ifndef GRAMMAR_HH
+#define GRAMMAR_HH
+
+#include "errors.hh"
+#include "io.hh"
+
+#include <cstdint>
+#include <span>
+#include <string>
+#include <vector>
+
+namespace grammar {
+
+struct Element;
+
+struct Symbol {
+  enum class Type : uint8_t {
+    // value == terminal, as UTF-8
+    kTerminal,
+
+    // element != nullptr
+    kNonTerminal,
+
+    // char_class != 0
+    kCharacterClass,
+  };
+
+  Type type;
+
+  enum class Optional : uint8_t {
+    // Symbol is NOT optional.
+    kRequired,
+    // Symbol is optional.
+    kZeroOrOne,
+    // Symbol is optional and can repeat.
+    kZeroOrMore,
+    // Symbol should be excluded from previous symbol.
+    // Example: InputCharacter but not * or /
+    kExcluded,
+  };
+
+  Optional optional{Optional::kRequired};
+
+  uint8_t char_class{0};
+
+  Element const* element{nullptr};
+  std::string value;
+};
+
+struct Definition {
+  std::vector<Symbol> symbols;
+};
+
+struct Element {
+  std::string name;
+
+  std::vector<Definition> definitions;
+};
+
+class Grammar {
+ public:
+  virtual ~Grammar() = default;
+
+  [[nodiscard]]
+  virtual Element const& root() const = 0;
+
+ protected:
+  Grammar() = default;
+
+  Grammar(Grammar const&) = delete;
+  Grammar& operator=(Grammar const&) = delete;
+};
+
+std::unique_ptr<Grammar> load(std::unique_ptr<io::Reader> reader,
+                              std::vector<std::string> const& character_classes,
+                              src::Errors& errors);
+
+}  // namespace grammar
+
+#endif  // GRAMMAR_HH
diff --git a/src/java_tokens.cc b/src/java_tokens.cc
new file mode 100644
index 0000000..1ba40a3
--- /dev/null
+++ b/src/java_tokens.cc
@@ -0,0 +1,394 @@
+#include "java_tokens.hh"
+
+#include "errors.hh"
+#include "java_tokens_java-8.hh"
+#include "java_uescape.hh"
+#include "str.hh"
+#include "u8.hh"
+#include "uline.hh"
+
+#include <cassert>
+#include <charconv>
+#include <cstddef>
+#include <expected>
+#include <format>
+#include <limits>
+#include <memory>
+#include <optional>
+#include <string>
+#include <string_view>
+#include <system_error>
+#include <utility>
+
+namespace java {
+
+namespace {
+
+template <typename TokenMatcher, typename MatchToken>
+class TokensImpl : public Tokens {
+ public:
+  TokensImpl(std::unique_ptr<io::Reader> reader,
+             std::unique_ptr<src::Errors> errors, TokensConfig config)
+      : reader_(u8::line::open(u8::java::open(std::move(reader)))),
+        errors_(std::move(errors)),
+        config_(config) {}
+
+  std::expected<Token, io::ReadError> read() override {
+    while (true) {
+      while (line_.empty()) {
+        auto maybe_line = reader_->read();
+        if (!maybe_line.has_value())
+          return std::unexpected(maybe_line.error());
+        line_ = maybe_line.value();
+        location_.line = reader_->number();
+        location_.column = 0;
+      }
+
+      Token token;
+      token.loc = location_;
+
+      auto maybe_token_pair = matcher_.matchNext(line_);
+
+      while (matcher_.line_end_reached_) {
+        // Matcher wants more lines.
+        matcher_.line_end_reached_ = false;
+
+        bool got_any = false;
+        line_tmp_ = line_;
+        line_tmp_.push_back('\n');
+        while (true) {
+          auto maybe_line = reader_->read();
+          if (!maybe_line.has_value())
+            break;
+          line_tmp_.append(maybe_line.value());
+          got_any = true;
+          // Simple check, it might not actually end the comment but if so tokenizer will complain
+          // about reaching line_end again.
+          if (maybe_line->contains("*/"))
+            break;
+          line_tmp_.push_back('\n');
+        }
+        line_ = line_tmp_;
+        maybe_token_pair = matcher_.matchNext(line_);
+
+        if (!got_any)
+          break;
+      }
+
+      if (maybe_token_pair.has_value()) {
+        token.str = line_.substr(0, maybe_token_pair->second);
+        location_.column += u8len(token.str);
+        line_ = line_.substr(maybe_token_pair->second);
+        switch (maybe_token_pair->first) {
+          case MatchToken::kBinaryIntegerLiteral:
+            handle_int_literal(token, /* base */ 2);
+            break;
+          case MatchToken::kBooleanLiteral:
+            token.type = Token::Type::kLiteralBoolean;
+            token.int_value = token.str != "false";
+            break;
+          case MatchToken::kCharacterLiteral:
+            token.type = Token::Type::kLiteralCharacter;
+            if (token.str[1] == '\\') {
+              token.int_value =
+                  unescape(token.str.substr(1, token.str.size() - 2)).first;
+            } else {
+              auto* ptr =
+                  reinterpret_cast<uint8_t const*>(token.str.data() + 1);
+              auto* end = ptr + token.str.size() - 2;
+              token.int_value = u8::read(ptr, end).value();
+            }
+            break;
+          case MatchToken::kDecimalFloatingPointLiteral:
+            handle_float_literal(token);
+            break;
+          case MatchToken::kDecimalIntegerLiteral:
+            handle_int_literal(token);
+            break;
+          case MatchToken::kEndOfLineComment:
+            token.type = Token::Type::kComment;
+            token.str = str::trim(token.str.substr(2));
+            break;
+          case MatchToken::kHexIntegerLiteral:
+            handle_int_literal(token, /* base */ 16);
+            break;
+          case MatchToken::kHexadecimalFloatingPointLiteral:
+            handle_float_literal(token, /* base */ 16);
+            break;
+          case MatchToken::kIdentifier:
+            token.type = Token::Type::kIdentifier;
+            break;
+          case MatchToken::kKeyword:
+            token.type = Token::Type::kKeyword;
+            break;
+          case MatchToken::kNullLiteral:
+            token.type = Token::Type::kLiteralNull;
+            break;
+          case MatchToken::kOctalIntegerLiteral:
+            handle_int_literal(token, /* base */ 8);
+            break;
+          case MatchToken::kOperator:
+            token.type = Token::Type::kOperator;
+            break;
+          case MatchToken::kSeparator:
+            token.type = Token::Type::kSeparator;
+            break;
+          case MatchToken::kStringLiteral:
+            token.type = Token::Type::kLiteralString;
+            token.str =
+                unescape_if_needed(token.str.substr(1, token.str.size() - 2));
+            break;
+          case MatchToken::kTraditionalComment: {
+            token.type = Token::Type::kComment;
+            size_t s = 2;
+            while (s < token.str.size() && token.str[s] == '*')
+              ++s;
+            token.str =
+                str::trim(token.str.substr(s, token.str.size() - 2 - s));
+            token.int_value = static_cast<int64_t>(s - 1);
+            break;
+          }
+          case MatchToken::kWhiteSpace:
+            continue;
+        }
+      } else {
+        errors_->err(location_, std::format("Invalid token: {}", line_));
+        token.type = Token::Type::kError;
+        token.str = line_;
+      }
+      return token;
+    }
+  }
+
+ private:
+  void handle_int_literal_error(Token& token, std::string_view str,
+                                std::errc err, int base) {
+    if (err == std::errc::result_out_of_range) {
+      // Java assumes two completent (so 0xffff_ffff is -1) and also, negative literals
+      // are read as positive (because the operator '-' is a separate token)
+      uint64_t tmp;
+      auto ret =
+          std::from_chars(str.data(), str.data() + str.size(), tmp, base);
+      if (ret.ec == std::errc()) {
+        token.type = ret.ptr < str.data() + str.size()
+                         ? Token::Type::kLiteralLong
+                         : Token::Type::kLiteralInt;
+        token.int_value = static_cast<int64_t>(tmp);
+        return;
+      }
+    }
+    errors_->err(location_,
+                 std::format("Invalid integer literal: {}", token.str));
+    token.type = Token::Type::kError;
+  }
+
+  void handle_int_literal(Token& token, int base = 10) {
+    size_t prefix;
+    switch (base) {
+      case 16:  // 0x
+      case 2:   // 0b
+        prefix = 2;
+        break;
+      case 8:  // 0
+        prefix = 1;
+        break;
+      default:
+        prefix = 0;
+        break;
+    }
+    std::optional<char> suffix;
+    if (token.str.find('_') == std::string_view::npos) {
+      auto ret = std::from_chars(token.str.data() + prefix,
+                                 token.str.data() + token.str.size(),
+                                 token.int_value, base);
+      if (ret.ec != std::errc()) {
+        handle_int_literal_error(token, token.str.substr(prefix), ret.ec, base);
+        return;
+      }
+      if (ret.ptr < token.str.data() + token.str.size())
+        suffix = *ret.ptr;
+    } else {
+      std::string tmp;
+      tmp.reserve(token.str.size() - prefix);
+      for (size_t i = prefix; i < token.str.size(); ++i) {
+        if (token.str[i] != '_') {
+          tmp.push_back(token.str[i]);
+        }
+      }
+      auto ret = std::from_chars(tmp.data(), tmp.data() + tmp.size(),
+                                 token.int_value, base);
+      if (ret.ec != std::errc()) {
+        handle_int_literal_error(token, tmp, ret.ec, base);
+        return;
+      }
+      if (ret.ptr < tmp.data() + tmp.size())
+        suffix = *ret.ptr;
+    }
+    if (suffix.has_value() &&
+        (suffix.value() == 'l' || suffix.value() == 'L')) {
+      token.type = Token::Type::kLiteralLong;
+    } else {
+      if (base == 10 &&
+          token.int_value >
+              static_cast<int64_t>(1) + std::numeric_limits<int32_t>::max()) {
+        errors_->err(location_,
+                     std::format("Invalid integer literal: {}", token.str));
+        token.type = Token::Type::kError;
+        return;
+      }
+      if (std::cmp_greater(token.int_value,
+                           std::numeric_limits<uint32_t>::max())) {
+        errors_->err(location_,
+                     std::format("Invalid integer literal: {}", token.str));
+        token.type = Token::Type::kError;
+        return;
+      }
+      token.type = Token::Type::kLiteralInt;
+      token.int_value = static_cast<int32_t>(token.int_value);
+    }
+  }
+
+  void handle_float_literal(Token& token, int base = 10) {
+    size_t prefix;
+    std::chars_format fmt;
+    switch (base) {
+      case 16:  // 0x
+        fmt = std::chars_format::general | std::chars_format::hex;
+        prefix = 2;
+        break;
+      default:
+        fmt = std::chars_format::general;
+        prefix = 0;
+        break;
+    }
+
+    std::from_chars_result ret;
+    if (token.str.ends_with("f") || token.str.ends_with("F")) {
+      // float and double do not parse exactly the same, so use a float parser for float.
+      float tmp;
+      ret = std::from_chars(token.str.data() + prefix,
+                            token.str.data() + token.str.size(), tmp, fmt);
+      token.type = Token::Type::kLiteralFloatingPoint;
+      token.float_value = tmp;
+    } else {
+      ret = std::from_chars(token.str.data() + prefix,
+                            token.str.data() + token.str.size(),
+                            token.float_value, fmt);
+      token.type = Token::Type::kLiteralDoubleFloatingPoint;
+    }
+
+    if (ret.ec != std::errc()) {
+      // Java allows 0 with just a suffix, std::from_chars does not.
+      if (token.str == "0f" || token.str == "0F") {
+        token.type = Token::Type::kLiteralFloatingPoint;
+        token.float_value = 0;
+        return;
+      }
+      if (token.str == "0d" || token.str == "0D") {
+        token.type = Token::Type::kLiteralDoubleFloatingPoint;
+        token.float_value = 0;
+        return;
+      }
+      errors_->err(location_,
+                   std::format("Invalid float literal: {}", token.str));
+      token.type = Token::Type::kError;
+    }
+  }
+
+  std::string_view unescape_if_needed(std::string_view str) {
+    auto back_slash = str.find('\\');
+    if (back_slash == std::string_view::npos)
+      return str;
+    unescape_tmp_.clear();
+    unescape_tmp_.reserve(str.size());
+    size_t last = 0;
+    uint8_t tmp[4];
+    while (true) {
+      unescape_tmp_.append(str, last, back_slash - last);
+      auto ret = unescape(str.substr(back_slash));
+      auto* ptr = tmp;
+      u8::write(ptr, tmp + sizeof(tmp), ret.first);
+      unescape_tmp_.append(
+          std::string_view(reinterpret_cast<char*>(tmp), ptr - tmp));
+      last = back_slash + ret.second;
+      back_slash = str.find('\\', last);
+      if (back_slash == std::string::npos) {
+        unescape_tmp_.append(str, last);
+        break;
+      }
+    }
+    return unescape_tmp_;
+  }
+
+  // Strings coming here have already been validated by the tokenizer
+  static std::pair<uint16_t, size_t> unescape(std::string_view in) {
+    assert(in.front() == '\\');
+    assert(in.size() > 1);
+    switch (in[1]) {
+      case 'b':
+        return std::make_pair(8, 2);
+      case 't':
+        return std::make_pair(9, 2);
+      case 'n':
+        return std::make_pair(10, 2);
+      case 'f':
+        return std::make_pair(12, 2);
+      case 'r':
+        return std::make_pair(13, 2);
+      case '"':
+        return std::make_pair(34, 2);
+      case '\'':
+        return std::make_pair(39, 2);
+      case '\\':
+        return std::make_pair(92, 2);
+      case '0':
+      case '1':
+      case '2':
+      case '3':
+      case '4':
+      case '5':
+      case '6':
+      case '7': {
+        uint8_t tmp;
+        auto ret = std::from_chars(in.data() + 1, in.data() + in.size(), tmp,
+                                   /* base */ 8);
+        return std::make_pair(tmp, ret.ptr - in.data());
+      }
+      default:
+        std::unreachable();
+    }
+  }
+
+  static size_t u8len(std::string_view str) {
+    auto* ptr = reinterpret_cast<uint8_t const*>(str.data());
+    auto* const end = ptr + str.size();
+    size_t count = 0;
+    while (u8::skip(ptr, end))
+      ++count;
+    return count;
+  }
+
+  std::unique_ptr<u8::line::Reader> reader_;
+  std::unique_ptr<src::Errors> errors_;
+  TokensConfig const config_;
+  TokenMatcher matcher_;
+  std::string_view line_;
+  std::string line_tmp_;
+  Location location_;
+  std::string unescape_tmp_;
+};
+
+}  // namespace
+
+std::unique_ptr<Tokens> open(std::unique_ptr<io::Reader> reader,
+                             std::unique_ptr<src::Errors> errors,
+                             TokensConfig config) {
+  switch (config.version) {
+    case Version::kJava8:
+      return std::make_unique<TokensImpl<java_8::TokenMatcher, java_8::Token>>(
+          std::move(reader), std::move(errors), config);
+  }
+  std::unreachable();
+}
+
+}  // namespace java
diff --git a/src/java_tokens.hh b/src/java_tokens.hh
new file mode 100644
index 0000000..6fbefcb
--- /dev/null
+++ b/src/java_tokens.hh
@@ -0,0 +1,92 @@
+#ifndef JAVA_TOKENS_HH
+#define JAVA_TOKENS_HH
+
+#include "errors.hh"
+#include "io.hh"
+#include "java_version.hh"  // IWYU pragma: export
+#include "location.hh"
+
+#include <expected>
+#include <memory>
+#include <string_view>
+
+namespace java {
+
+using src::Location;
+
+struct Token {
+  enum class Type : uint8_t {
+    // str is content of comment, excluding heading or trailing slash and star.
+    // int_value is number of stars at head, 0 for single line, 1 for /* and
+    // 2 for /** and so on.
+    kComment,
+
+    // str is identifier
+    kIdentifier,
+
+    // str is keyword, int_value is Keyword index
+    kKeyword,
+
+    // str is separator, int_value is Separator index
+    kSeparator,
+
+    // str is operator, int_value is Operator index
+    kOperator,
+
+    // int_value is literal value
+    kLiteralInt,
+
+    // int_value is literal value
+    kLiteralLong,
+
+    // str is literal value
+    kLiteralString,
+
+    // int_value is literal value as unicode code-point
+    kLiteralCharacter,
+
+    kLiteralNull,
+
+    // float_value is literal value
+    kLiteralFloatingPoint,
+
+    // float_value is literal value
+    kLiteralDoubleFloatingPoint,
+
+    // int_value is literal value, 0 = false, anything else = true
+    kLiteralBoolean,
+
+    kError,
+  };
+
+  Type type;
+  Location loc;
+  std::string_view str;
+  int64_t int_value{0};
+  double float_value{0};
+};
+
+struct TokensConfig {
+  // Source version of Java file
+  Version version = Version::kMax;
+};
+
+class Tokens {
+ public:
+  virtual ~Tokens() = default;
+
+  virtual std::expected<Token, io::ReadError> read() = 0;
+
+ protected:
+  Tokens() = default;
+  Tokens(Tokens const&) = delete;
+  Tokens& operator=(Tokens const&) = delete;
+};
+
+[[nodiscard]] std::unique_ptr<Tokens> open(std::unique_ptr<io::Reader> reader,
+                                           std::unique_ptr<src::Errors>,
+                                           TokensConfig config = {});
+
+}  // namespace java
+
+#endif  // JAVA_TOKENS_HH
diff --git a/src/java_version.hh b/src/java_version.hh
new file mode 100644
index 0000000..444ae36
--- /dev/null
+++ b/src/java_version.hh
@@ -0,0 +1,16 @@
+#ifndef JAVA_VERSION_HH
+#define JAVA_VERSION_HH
+
+#include <cstdint>
+
+namespace java {
+
+enum class Version : uint8_t {
+  kJava8 = 8,
+
+  kMax = kJava8,
+};
+
+}  // namespace java
+
+#endif  // JAVA_VERSION_HH
diff --git a/src/location.cc b/src/location.cc
new file mode 100644
index 0000000..3fa1075
--- /dev/null
+++ b/src/location.cc
@@ -0,0 +1,12 @@
+#include "location.hh"
+
+#include <ostream>
+
+namespace src {
+
+std::ostream& operator<<(std::ostream& out, Location const& loc) {
+  out << loc.line << ':' << loc.column;
+  return out;
+}
+
+}  // namespace src
diff --git a/src/location.hh b/src/location.hh
new file mode 100644
index 0000000..1a210cb
--- /dev/null
+++ b/src/location.hh
@@ -0,0 +1,31 @@
+#ifndef LOCATION_HH
+#define LOCATION_HH
+
+#include <compare>
+#include <cstdint>
+#include <iosfwd>
+
+namespace src {
+
+struct Location {
+  uint64_t line;
+  uint16_t column;
+
+  constexpr Location() : line(0), column(0) {}
+
+  Location(uint64_t line, uint16_t column) : line(line), column(column) {}
+
+  [[nodiscard]]
+  std::strong_ordering operator<=>(Location const& loc) const {
+    auto ret = line <=> loc.line;
+    if (ret == std::strong_ordering::equal)
+      return column <=> loc.column;
+    return ret;
+  }
+};
+
+std::ostream& operator<<(std::ostream& out, Location const& loc);
+
+}  // namespace src
+
+#endif  // LOCATION_HH