summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/errors.cc116
-rw-r--r--src/errors.hh64
-rw-r--r--src/gen_tokens.cc1062
-rw-r--r--src/grammar.cc268
-rw-r--r--src/grammar.hh80
-rw-r--r--src/java_tokens.cc394
-rw-r--r--src/java_tokens.hh92
-rw-r--r--src/java_version.hh16
-rw-r--r--src/location.cc12
-rw-r--r--src/location.hh31
10 files changed, 2135 insertions, 0 deletions
diff --git a/src/errors.cc b/src/errors.cc
new file mode 100644
index 0000000..7122a9a
--- /dev/null
+++ b/src/errors.cc
@@ -0,0 +1,116 @@
+#include "errors.hh"
+
+#include "location.hh"
+
+#include <cstdint>
+#include <format>
+#include <iostream>
+#include <memory>
+#include <ostream>
+#include <string>
+#include <string_view>
+#include <utility>
+
+namespace src {
+
+namespace {
+
+class FileErrors : public Errors {
+ public:
+ FileErrors(std::string filename, std::shared_ptr<ErrorsOutput> output)
+ : filename_(std::move(filename)), output_(std::move(output)) {}
+
+ void err(Location loc, std::string_view msg) override {
+ ++errors_;
+ output_->println(std::format("{}:{}:{}: Error: {}", filename_, loc.line,
+ loc.column, msg));
+ }
+
+ void warn(Location loc, std::string_view msg) override {
+ ++warnings_;
+ output_->println(std::format("{}:{}:{}: Warning: {}", filename_, loc.line,
+ loc.column, msg));
+ }
+
+#if !defined(NDEBUG)
+ void dbg(Location loc, std::string_view msg) override {
+ output_->println(std::format("{}:{}:{}: Debug: {}", filename_, loc.line,
+ loc.column, msg));
+ }
+#endif
+
+ [[nodiscard]]
+ uint64_t errors() const override {
+ return errors_;
+ }
+
+ [[nodiscard]]
+ uint64_t warnings() const override {
+ return warnings_;
+ }
+
+ private:
+ std::string const filename_;
+ std::shared_ptr<ErrorsOutput> output_;
+ uint64_t errors_{0};
+ uint64_t warnings_{0};
+};
+
+class IgnoreErrors : public Errors {
+ public:
+ IgnoreErrors() = default;
+
+ void err(Location /* loc */, std::string_view /* msg */) override {}
+ void warn(Location /* loc */, std::string_view /* msg */) override {}
+
+#if !defined(NDEBUG)
+ void dbg(Location /* loc */, std::string_view /* msg */) override {}
+#endif
+
+ [[nodiscard]]
+ uint64_t errors() const override {
+ return 0;
+ }
+
+ [[nodiscard]]
+ uint64_t warnings() const override {
+ return 0;
+ }
+};
+
+class OutputStreamErrorsOutput : public ErrorsOutput {
+ public:
+ explicit OutputStreamErrorsOutput(std::ostream& out) : out_(out) {}
+
+ void println(std::string_view line) override { out_ << line << '\n'; }
+
+ private:
+ std::ostream& out_;
+};
+
+} // namespace
+
+[[nodiscard]]
+std::unique_ptr<Errors> file_errors(std::string filename,
+ std::shared_ptr<ErrorsOutput> output) {
+ if (!output) {
+ static std::shared_ptr<ErrorsOutput> g_stderr_output;
+ // TODO: Make thread-safe when needed
+ if (!g_stderr_output)
+ g_stderr_output = std::make_shared<OutputStreamErrorsOutput>(std::cerr);
+ output = g_stderr_output;
+ }
+ return std::make_unique<FileErrors>(std::move(filename), std::move(output));
+}
+
+[[nodiscard]]
+std::unique_ptr<Errors> ignore_errors() {
+ return std::make_unique<IgnoreErrors>();
+}
+
+[[nodiscard]]
+std::unique_ptr<ErrorsOutput> errors_output_ios(std::ostream& out) {
+ return std::make_unique<OutputStreamErrorsOutput>(out);
+}
+
+} // namespace src
diff --git a/src/errors.hh b/src/errors.hh
new file mode 100644
index 0000000..d8b3d60
--- /dev/null
+++ b/src/errors.hh
@@ -0,0 +1,64 @@
+#ifndef ERRORS_HH
+#define ERRORS_HH
+
+#include "location.hh"
+
+#include <iosfwd>
+#include <memory>
+#include <string>
+#include <string_view>
+
+namespace src {
+
+class Errors {
+ public:
+ virtual ~Errors() = default;
+
+ virtual void err(Location loc, std::string_view msg) = 0;
+
+ virtual void warn(Location loc, std::string_view msg) = 0;
+
+#if !defined(NDEBUG)
+ virtual void dbg(Location loc, std::string_view msg) = 0;
+#else
+ void dbg(Location, std::string_view) {}
+#endif
+
+ [[nodiscard]]
+ virtual uint64_t errors() const = 0;
+ [[nodiscard]]
+ virtual uint64_t warnings() const = 0;
+
+ protected:
+ Errors() = default;
+
+ Errors(Errors const&) = delete;
+ Errors& operator=(Errors const&) = delete;
+};
+
+class ErrorsOutput {
+ public:
+ virtual ~ErrorsOutput() = default;
+
+ virtual void println(std::string_view line) = 0;
+
+ protected:
+ ErrorsOutput() = default;
+
+ ErrorsOutput(ErrorsOutput const&) = delete;
+ ErrorsOutput& operator=(ErrorsOutput const&) = delete;
+};
+
+[[nodiscard]]
+std::unique_ptr<Errors> file_errors(
+ std::string filename, std::shared_ptr<ErrorsOutput> output = nullptr);
+
+[[nodiscard]]
+std::unique_ptr<Errors> ignore_errors();
+
+[[nodiscard]]
+std::unique_ptr<ErrorsOutput> errors_output_ios(std::ostream& out);
+
+} // namespace src
+
+#endif // ERRORS_HH
diff --git a/src/gen_tokens.cc b/src/gen_tokens.cc
new file mode 100644
index 0000000..ef0fce7
--- /dev/null
+++ b/src/gen_tokens.cc
@@ -0,0 +1,1062 @@
+#include "args.hh"
+#include "errors.hh"
+#include "grammar.hh"
+#include "io.hh"
+#include "prefix_tree.hh"
+
+#include <algorithm>
+#include <cassert>
+#include <charconv>
+#include <cstddef>
+#include <cstdint>
+#include <fstream>
+#include <iostream>
+#include <set>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+namespace {
+
+enum class CharacterClass : uint8_t {
+ kWhiteSpace = 0,
+ kLineTerminator = 1,
+ kInputCharacter = 2,
+ kJavaLetter = 3,
+ kJavaLetterOrDigit = 4,
+};
+
+std::vector<std::string> const kCharacterClassNames(
+ {"WhiteSpace", "LineTerminator", "InputCharacter", "JavaLetter",
+ "JavaLetterOrDigit"});
+
+enum class ReturnType : uint8_t {
+ kTokenAndSize,
+ kInternalAndSize,
+ kSize,
+};
+
+std::string make_define(std::string_view filename) {
+ std::string ret;
+ ret.reserve(filename.size());
+ for (char c : filename) {
+ if ((c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_') {
+ ret.push_back(c);
+ } else if (c >= 'a' && c <= 'z') {
+ ret.push_back(static_cast<char>(c & ~0x20));
+ } else {
+ ret.push_back('_');
+ }
+ }
+ return ret;
+}
+
+class Generator {
+ public:
+ bool generate(std::string_view header_name, std::string_view source_name,
+ std::string const& ns, std::string const& unicode_version,
+ grammar::Grammar& grammar);
+
+ private:
+ void find_specific_elements(grammar::Element const& root);
+ void find_all_elements(grammar::Element const& root);
+
+ void check_need_last(grammar::Element const& element);
+ bool find_report_last(grammar::Element const& root,
+ grammar::Element const& match);
+
+ [[nodiscard]]
+ ReturnType get_return_type(grammar::Element const& element) const;
+ [[nodiscard]]
+ ReturnType get_return_type(uint8_t character_class) const;
+
+ void write_matcher(std::ostream& out, grammar::Symbol const& symbol,
+ ReturnType return_type, std::string_view str_arg);
+ bool write_matcher(std::ostream& out, grammar::Definition const& definition,
+ ReturnType return_type, std::string_view indent);
+ bool write_matcher(std::ostream& out, grammar::Element const& element,
+ ReturnType return_type);
+
+ std::set<std::string_view> above_specific_tokens_;
+ std::set<std::string_view> specific_tokens_;
+ std::set<grammar::Element const*> all_elements_;
+ std::set<std::string_view> copy_last_;
+ std::set<std::string_view> report_last_;
+};
+
+// Find the Elements that has at least one terminal or character class as symbol
+// These will be the different tokens the tokenizer can return
+void Generator::find_specific_elements(grammar::Element const& root) {
+ if (std::ranges::any_of(root.definitions, [](auto const& definition) {
+ return definition.symbols.size() > 1 ||
+ definition.symbols[0].type == grammar::Symbol::Type::kTerminal;
+ })) {
+ specific_tokens_.insert(root.name);
+ return;
+ }
+
+ above_specific_tokens_.insert(root.name);
+
+ for (auto const& definition : root.definitions) {
+ for (auto const& symbol : definition.symbols) {
+ switch (symbol.type) {
+ case grammar::Symbol::Type::kNonTerminal:
+ find_specific_elements(*symbol.element);
+ break;
+ case grammar::Symbol::Type::kCharacterClass:
+ specific_tokens_.insert(kCharacterClassNames[symbol.char_class]);
+ break;
+ case grammar::Symbol::Type::kTerminal:
+ std::unreachable();
+ }
+ }
+ }
+}
+
+// Find elements that have definitions that has ZeroOrMore matches with a final condition
+void Generator::check_need_last(grammar::Element const& element) {
+ for (auto const& definition : element.definitions) {
+ if (definition.symbols.size() < 2)
+ continue;
+ if (definition.symbols[definition.symbols.size() - 1].optional ==
+ grammar::Symbol::Optional::kRequired &&
+ definition.symbols[definition.symbols.size() - 1].type ==
+ grammar::Symbol::Type::kNonTerminal &&
+ (definition.symbols[definition.symbols.size() - 2].optional ==
+ grammar::Symbol::Optional::kZeroOrOne ||
+ definition.symbols[definition.symbols.size() - 2].optional ==
+ grammar::Symbol::Optional::kZeroOrMore) &&
+ definition.symbols[definition.symbols.size() - 2].type ==
+ grammar::Symbol::Type::kNonTerminal) {
+ if (!copy_last_.contains(definition.symbols[definition.symbols.size() - 2]
+ .element->name)) {
+ find_report_last(
+ *definition.symbols[definition.symbols.size() - 2].element,
+ *definition.symbols[definition.symbols.size() - 1].element);
+ }
+ }
+ }
+}
+
+// Find element that has match as a single definition, if so, return true and insert into
+// report_last_.
+bool Generator::find_report_last(grammar::Element const& root,
+ grammar::Element const& match) {
+ if (std::ranges::any_of(root.definitions, [&match](auto const& definition) {
+ return definition.symbols.size() == 1 &&
+ definition.symbols[0].optional ==
+ grammar::Symbol::Optional::kRequired &&
+ definition.symbols[0].type ==
+ grammar::Symbol::Type::kNonTerminal &&
+ definition.symbols[0].element == &match;
+ })) {
+ report_last_.insert(root.name);
+ return true;
+ }
+
+ for (auto const& definition : root.definitions) {
+ for (auto const& symbol : definition.symbols) {
+ if (symbol.type == grammar::Symbol::Type::kNonTerminal) {
+ if (!copy_last_.contains(symbol.element->name)) {
+ if (find_report_last(*symbol.element, match)) {
+ copy_last_.insert(root.name);
+ return true;
+ }
+ }
+ }
+ }
+ }
+
+ return false;
+}
+
+ReturnType Generator::get_return_type(grammar::Element const& element) const {
+ if (above_specific_tokens_.contains(element.name) ||
+ specific_tokens_.contains(element.name))
+ return ReturnType::kTokenAndSize;
+ if (copy_last_.contains(element.name) || report_last_.contains(element.name))
+ return ReturnType::kInternalAndSize;
+ return ReturnType::kSize;
+}
+
+ReturnType Generator::get_return_type(uint8_t character_class) const {
+ auto const& name = kCharacterClassNames[character_class];
+ if (above_specific_tokens_.contains(name) || specific_tokens_.contains(name))
+ return ReturnType::kTokenAndSize;
+ return ReturnType::kSize;
+}
+
+void Generator::find_all_elements(grammar::Element const& root) {
+ auto pair = all_elements_.insert(&root);
+ if (!pair.second)
+ return;
+
+ for (auto const& definition : root.definitions) {
+ for (auto const& symbol : definition.symbols) {
+ switch (symbol.type) {
+ case grammar::Symbol::Type::kNonTerminal:
+ find_all_elements(*symbol.element);
+ break;
+ case grammar::Symbol::Type::kCharacterClass:
+ case grammar::Symbol::Type::kTerminal:
+ break;
+ }
+ }
+ }
+}
+
+void declare_character_class_matchers(std::ostream& out) {
+ out << "[[nodiscard]]\n"
+ << "std::optional<size_t> matchLineTerminator(std::string_view str);\n";
+
+ out << "[[nodiscard]]\n"
+ << "static std::optional<size_t> matchInputCharacter(std::string_view "
+ "str);\n";
+
+ out << "[[nodiscard]]\n"
+ << "std::optional<std::pair<Token, size_t>> "
+ "matchWhiteSpace(std::string_view str);\n";
+
+ out << "[[nodiscard]]\n"
+ << "static std::optional<size_t> matchJavaLetter"
+ << "(std::string_view str);\n";
+
+ out << "[[nodiscard]]\n"
+ << "static std::optional<size_t> matchJavaLetterOrDigit"
+ << "(std::string_view str);\n";
+}
+
+void write_character_class_matchers(std::ostream& out,
+ std::string_view unicode_version) {
+ out << "[[nodiscard]]\n"
+ << "std::optional<size_t> TokenMatcher::matchLineTerminator"
+ << "(std::string_view str) {\n"
+ // Tokenizer normally reads one line at a time, there is only
+ // one construct (traditional comment) that needs it.
+ // So match synthetic '\n' or report that it was needed if we are at
+ // end of string.
+ << " if (str.empty()) {\n"
+ << " line_end_reached_ = true;\n"
+ << " return std::nullopt;\n"
+ << " }\n"
+ << " return str.front() == '\\n' ? std::make_optional<size_t>(1)"
+ << " : std::nullopt;\n"
+ << "}\n"
+ << "\n";
+
+ out << "[[nodiscard]]\n"
+ << "std::optional<size_t> TokenMatcher::matchInputCharacter"
+ << "(std::string_view str) {\n"
+ // UnicodeInputCharacter but not CR or LF
+ << " if (str.empty() || str.front() == '\\n')\n"
+ << " return std::nullopt;\n"
+ << " auto* const start = reinterpret_cast<uint8_t const*>(str.data());\n"
+ << " auto* ptr = start;\n"
+ << " u8::skip(ptr, start + str.size());\n"
+ << " return ptr - start;\n"
+ << "}\n"
+ << "\n";
+
+ out << "[[nodiscard]]\n"
+ << "std::optional<std::pair<Token, size_t>> TokenMatcher::matchWhiteSpace"
+ << "(std::string_view str) {\n"
+ << " if (!str.empty()) {\n"
+ << " switch (str.front()) {\n"
+ // the ASCII SP character, also known as "space"
+ << " case ' ':\n"
+ // the ASCII HT character, also known as "horizontal tab"
+ << " case '\\t':\n"
+ // the ASCII FF character, also known as "form feed"
+ << " case '\\f':\n"
+ // LineTerminator, not calling matchLineTerminator as it does special things
+ << " case '\\n':\n"
+ << " return std::make_pair(Token::kWhiteSpace, 1);\n"
+ << " default:\n"
+ << " break;\n"
+ << " }\n"
+ << " }\n"
+ << " return std::nullopt;\n"
+ << "}\n"
+ << "\n";
+
+ out << "[[nodiscard]]\n"
+ << "std::optional<size_t> TokenMatcher::matchJavaLetter"
+ << "(std::string_view str) {\n"
+ << " auto* const start = reinterpret_cast<uint8_t const*>(str.data());\n"
+ << " auto* ptr = start;\n"
+ << " auto code = u8::read(ptr, ptr + str.size());\n"
+ << " if (!code.has_value())\n"
+ << " return std::nullopt;\n"
+ // any Unicode character that is a "Java letter"
+ // A "Java letter" is a character for which the method Character.isJavaIdentifierStart(int) returns true.
+ // A character may start a Java identifier if and only if one of the following conditions is true:
+ // isLetter(codePoint) returns true
+ // getType(codePoint) returns LETTER_NUMBER
+ // the referenced character is a currency symbol (such as '$')
+ // the referenced character is a connecting punctuation character (such as '_').
+ << " switch (u::lookup_gc(code.value(), u::Version::" << unicode_version
+ << ")) {\n"
+ << " case u::GeneralCategory::LETTER_UPPERCASE:\n"
+ << " case u::GeneralCategory::LETTER_LOWERCASE:\n"
+ << " case u::GeneralCategory::LETTER_TITLECASE:\n"
+ << " case u::GeneralCategory::LETTER_MODIFIER:\n"
+ << " case u::GeneralCategory::LETTER_OTHER:\n"
+ << " case u::GeneralCategory::NUMBER_LETTER:\n"
+ << " case u::GeneralCategory::SYMBOL_CURRENCY:\n"
+ << " case u::GeneralCategory::PUNCTUATION_CONNECTOR:\n"
+ << " return ptr - start;\n"
+ << " default:\n"
+ << " return std::nullopt;\n"
+ << " }\n"
+ << "}\n"
+ << "\n";
+
+ out << "[[nodiscard]]\n"
+ << "std::optional<size_t> TokenMatcher::matchJavaLetterOrDigit"
+ << "(std::string_view str) {\n"
+ << " auto* const start = reinterpret_cast<uint8_t const*>(str.data());\n"
+ << " auto* ptr = start;\n"
+ << " auto code = u8::read(ptr, ptr + str.size());\n"
+ << " if (!code.has_value())\n"
+ << " return std::nullopt;\n"
+ // any Unicode character that is a "Java letter-or-digit"
+ // A "Java letter-or-digit" is a character for which the method Character.isJavaIdentifierPart(int) returns true.
+ // A character may be part of a Java identifier if any of the following conditions are true:
+ // it is a letter
+ // it is a currency symbol (such as '$')
+ // it is a connecting punctuation character (such as '_')
+ // it is a digit
+ // it is a numeric letter (such as a Roman numeral character)
+ // it is a combining mark
+ // it is a non-spacing mark
+ // isIdentifierIgnorable returns true for the character
+ << " switch (u::lookup_gc(code.value(), u::Version::" << unicode_version
+ << ")) {\n"
+ << " case u::GeneralCategory::LETTER_UPPERCASE:\n"
+ << " case u::GeneralCategory::LETTER_LOWERCASE:\n"
+ << " case u::GeneralCategory::LETTER_TITLECASE:\n"
+ << " case u::GeneralCategory::LETTER_MODIFIER:\n"
+ << " case u::GeneralCategory::LETTER_OTHER:\n"
+ << " case u::GeneralCategory::SYMBOL_CURRENCY:\n"
+ << " case u::GeneralCategory::PUNCTUATION_CONNECTOR:\n"
+ << " case u::GeneralCategory::NUMBER_DIGIT:\n"
+ << " case u::GeneralCategory::NUMBER_LETTER:\n"
+ << " case u::GeneralCategory::MARK_SPACING_COMBINING:\n"
+ << " case u::GeneralCategory::MARK_NONSPACING:\n"
+ << " case u::GeneralCategory::OTHER_FORMAT:\n"
+ << " return ptr - start;\n"
+ << " case u::GeneralCategory::OTHER_CONTROL:\n"
+ << " if ((/* code.value() >= 0 && */ code.value() <= 8) ||\n"
+ << " (code.value() >= 0xe && code.value() <= 0x1b) ||\n"
+ << " (code.value() >= 0x7f && code.value() <= 0x9f))\n"
+ << " return 1;\n"
+ << " break;\n"
+ << " default:\n"
+ << " break;\n"
+ << " }\n"
+ << " return std::nullopt;\n"
+ << "}\n"
+ << "\n";
+}
+
+std::ostream& quote(std::ostream& out, std::string_view in) {
+ out << '"';
+ bool avoid_digit = false;
+ for (auto c : in) {
+ if (c == '"' || c == '\\') {
+ out << '\\';
+ } else if (c < ' ' || (c & 0x80)) {
+ char tmp[4];
+ std::to_chars(tmp, tmp + sizeof(tmp), c & 0xff, 8).ptr[0] = 0;
+ out << "\\" << tmp;
+ avoid_digit = true;
+ continue;
+ } else if (avoid_digit) {
+ if (c >= '0' && c <= '7') {
+ out << "\" \"";
+ }
+ }
+ avoid_digit = false;
+ out << c;
+ }
+ out << '"';
+ return out;
+}
+
+void match_return_type(std::ostream& out, ReturnType in_return_type,
+ std::string_view in_name, ReturnType out_return_type) {
+ switch (out_return_type) {
+ case ReturnType::kTokenAndSize:
+ switch (in_return_type) {
+ case ReturnType::kTokenAndSize:
+ break;
+ case ReturnType::kInternalAndSize:
+ out << ".transform([](auto pair) { return std::make_pair(Token::k"
+ << in_name << ", pair.second); }";
+ break;
+ case ReturnType::kSize:
+ out << ".transform([](auto size) { return std::make_pair(Token::k"
+ << in_name << ", size); })";
+ break;
+ }
+ break;
+ case ReturnType::kInternalAndSize:
+ switch (in_return_type) {
+ case ReturnType::kTokenAndSize:
+ out << ".transform([](auto pair) { return std::make_pair(Internal::k"
+ << in_name << ", pair.second); }";
+ break;
+ case ReturnType::kInternalAndSize:
+ break;
+ case ReturnType::kSize:
+ if (in_name.empty()) {
+ out << ".transform([](auto size) { return "
+ "std::make_pair(Internal::UNDEFINED, size); })";
+ } else {
+ out << ".transform([](auto size) { return "
+ "std::make_pair(Internal::k"
+ << in_name << ", size); })";
+ }
+ break;
+ }
+ break;
+ case ReturnType::kSize:
+ if (in_return_type != ReturnType::kSize) {
+ out << ".transform([](auto pair) { return pair.second; })";
+ }
+ break;
+ }
+}
+
+void Generator::write_matcher(std::ostream& out, grammar::Symbol const& symbol,
+ ReturnType return_type,
+ std::string_view str_arg) {
+ std::string_view in_name;
+ ReturnType in_return_type;
+
+ switch (symbol.type) {
+ case grammar::Symbol::Type::kTerminal:
+ in_return_type = ReturnType::kSize;
+ out << "(" << str_arg << ".starts_with(";
+ quote(out, symbol.value);
+ out << ") ? std::make_optional<size_t>(" << symbol.value.size()
+ << ") : " << "std::nullopt)";
+ break;
+ case grammar::Symbol::Type::kNonTerminal:
+ out << "match" << symbol.element->name << "(" << str_arg << ")";
+ in_return_type = get_return_type(*symbol.element);
+ in_name = symbol.element->name;
+ break;
+ case grammar::Symbol::Type::kCharacterClass:
+ out << "match" << kCharacterClassNames[symbol.char_class] << "("
+ << str_arg << ")";
+ in_return_type = get_return_type(symbol.char_class);
+ in_name = kCharacterClassNames[symbol.char_class];
+ break;
+ }
+
+ match_return_type(out, in_return_type, in_name, return_type);
+}
+
+bool Generator::write_matcher(std::ostream& out,
+ grammar::Definition const& definition,
+ ReturnType return_type, std::string_view indent) {
+ if (definition.symbols.size() == 1 &&
+ definition.symbols[0].optional == grammar::Symbol::Optional::kRequired) {
+ out << indent << "return ";
+ write_matcher(out, definition.symbols[0], return_type, "str");
+ out << ";\n";
+ return true;
+ }
+
+ std::string_view size_suffix;
+ switch (return_type) {
+ case ReturnType::kTokenAndSize:
+ case ReturnType::kInternalAndSize:
+ size_suffix = "->second";
+ break;
+ case ReturnType::kSize:
+ size_suffix = ".value()";
+ break;
+ }
+
+ if (definition.symbols.size() > 1 &&
+ definition.symbols[0].optional == grammar::Symbol::Optional::kRequired &&
+ definition.symbols[1].optional == grammar::Symbol::Optional::kExcluded) {
+ bool first = true;
+ for (auto const& symbol : definition.symbols) {
+ if (first) {
+ out << indent << "auto first_ret = ";
+ write_matcher(out, symbol, return_type, "str");
+ out << ";\n"
+ << indent << "if (!first_ret.has_value())\n"
+ << indent << " return first_ret;\n"
+ << indent << "std::optional<size_t> ret;\n"
+ << indent << "auto tmp = str.substr(0, first_ret" << size_suffix
+ << ");\n";
+ first = false;
+ } else {
+ if (symbol.optional != grammar::Symbol::Optional::kExcluded) {
+ std::cerr << "Non-excluded after at least one excluded\n";
+ return false;
+ }
+ out << indent << "ret = ";
+ write_matcher(out, symbol, ReturnType::kSize, "tmp");
+ out << ";\n"
+ << indent << "if (ret.has_value() && ret.value() == tmp.size())\n"
+ << indent << " return std::nullopt;\n";
+ }
+ }
+ out << indent << "return first_ret;\n";
+ return true;
+ }
+
+ if (std::ranges::all_of(definition.symbols, [](auto const& symbol) {
+ return symbol.optional == grammar::Symbol::Optional::kRequired;
+ })) {
+ out << indent << "size_t tot = 0;\n";
+ bool first = true;
+ for (auto const& symbol : definition.symbols) {
+ std::string indent2(indent);
+ if (first) {
+ out << indent2 << "auto ret = ";
+ write_matcher(out, symbol, return_type, "str");
+ out << ";\n";
+ first = false;
+ } else {
+ out << indent2 << "ret = ";
+ write_matcher(out, symbol, return_type, "str.substr(tot)");
+ out << ";\n";
+ }
+ out << indent2 << "if (!ret.has_value())\n"
+ << indent2 << " return ret;\n";
+ out << indent2 << "tot += ret" << size_suffix << ";\n";
+ }
+ switch (return_type) {
+ case ReturnType::kInternalAndSize:
+ // Return last internal
+ out << indent << "return std::make_pair(ret->first, tot);\n";
+ break;
+ case ReturnType::kTokenAndSize:
+ std::cerr << "Unable to return token and size\n";
+ return false;
+ case ReturnType::kSize:
+ out << indent << "return tot;\n";
+ break;
+ }
+ return true;
+ }
+
+ out << indent << "size_t tot = 0;\n";
+ bool last_internal = false;
+ switch (return_type) {
+ case ReturnType::kInternalAndSize:
+ last_internal = true;
+ out << indent << "std::optional<Internal> last_internal;\n";
+ break;
+ case ReturnType::kTokenAndSize:
+ case ReturnType::kSize:
+ break;
+ }
+ bool at_least_one_required = false;
+ bool first = true;
+ bool first_internal = true;
+ bool next_internal = false;
+ for (size_t i = 0; i < definition.symbols.size(); ++i) {
+ auto const& symbol = definition.symbols[i];
+ std::string indent2(indent);
+ bool have_internal = next_internal;
+ next_internal = false;
+ ReturnType symbol_return_type = return_type;
+
+ if (symbol.optional != grammar::Symbol::Optional::kRequired &&
+ i + 1 < definition.symbols.size() &&
+ definition.symbols[i + 1].optional ==
+ grammar::Symbol::Optional::kRequired &&
+ definition.symbols[i + 1].type == grammar::Symbol::Type::kNonTerminal) {
+ symbol_return_type = ReturnType::kInternalAndSize;
+ next_internal = true;
+ }
+
+ switch (symbol_return_type) {
+ case ReturnType::kTokenAndSize:
+ case ReturnType::kInternalAndSize:
+ size_suffix = "->second";
+ break;
+ case ReturnType::kSize:
+ size_suffix = ".value()";
+ break;
+ }
+
+ switch (symbol.optional) {
+ case grammar::Symbol::Optional::kRequired:
+ at_least_one_required = true;
+ break;
+ case grammar::Symbol::Optional::kZeroOrOne:
+ break;
+ case grammar::Symbol::Optional::kZeroOrMore:
+ if (first) {
+ switch (symbol_return_type) {
+ case ReturnType::kTokenAndSize:
+ out << indent << "std::optional<std::pair<Token, size_t>> ret;\n";
+ break;
+ case ReturnType::kInternalAndSize:
+ out << indent
+ << "std::optional<std::pair<Internal, size_t>> ret;\n";
+ break;
+ case ReturnType::kSize:
+ out << indent << "std::optional<size_t> ret;\n";
+ break;
+ }
+ first = false;
+ }
+ out << indent << "while (true) {\n";
+ indent2 += " ";
+ break;
+ case grammar::Symbol::Optional::kExcluded:
+ std::cerr << "Excluded mixed with conditional\n";
+ return false;
+ }
+ if (symbol_return_type == return_type) {
+ if (first) {
+ out << indent2 << "auto ret = ";
+ write_matcher(out, symbol, symbol_return_type, "str");
+ first = false;
+ } else {
+ out << indent2 << "ret = ";
+ write_matcher(out, symbol, symbol_return_type, "str.substr(tot)");
+ }
+ out << ";\n";
+ } else {
+ if (first_internal) {
+ out << indent2 << "auto ret_internal = ";
+ write_matcher(out, symbol, symbol_return_type,
+ first ? "str" : "str.substr(tot)");
+ first_internal = false;
+ } else {
+ out << indent2 << "ret_internal = ";
+ write_matcher(out, symbol, symbol_return_type, "str.substr(tot)");
+ }
+ out << ";\n";
+ if (first) {
+ out << indent2 << "auto ret = ret_internal";
+ first = false;
+ } else {
+ out << indent2 << "ret = ret_internal";
+ }
+ match_return_type(out, symbol_return_type, "", return_type);
+ out << ";\n";
+ }
+ switch (symbol.optional) {
+ case grammar::Symbol::Optional::kRequired:
+ out << indent2 << "if (!ret.has_value()) {\n";
+ if (have_internal &&
+ symbol.type == grammar::Symbol::Type::kNonTerminal) {
+ out << indent2
+ << " if (!ret_internal.has_value() || ret_internal->first != "
+ "Internal::k"
+ << symbol.element->name << ")\n"
+ << indent2 << " return ret;\n";
+ } else {
+ out << indent2 << " return ret;\n";
+ }
+ out << indent2 << "} else {\n"
+ << indent2 << " tot += ret" << size_suffix << ";\n";
+ if (last_internal)
+ out << indent2 << " last_internal = ret->first;\n";
+ out << indent2 << "}\n";
+ break;
+ case grammar::Symbol::Optional::kZeroOrOne:
+ if (symbol_return_type == ReturnType::kTokenAndSize) {
+ out << indent2 << "tot += ret.has_value() ? ret->second : 0;\n";
+ } else {
+ out << indent2 << "tot += ret.value_or(0);\n";
+ }
+ if (last_internal)
+ out << indent2 << "if (ret.has_value())\n"
+ << indent2 << " last_internal = ret->first;\n";
+ break;
+ case grammar::Symbol::Optional::kZeroOrMore:
+ out << indent2 << "if (!ret.has_value())\n"
+ << indent2 << " break;\n"
+ << indent2 << "tot += ret" << size_suffix << ";\n";
+ if (last_internal)
+ out << indent2 << "last_internal = ret->first;\n";
+ out << indent << "}\n";
+ break;
+ case grammar::Symbol::Optional::kExcluded:
+ assert(false);
+ break;
+ }
+ }
+ switch (return_type) {
+ case ReturnType::kInternalAndSize:
+ // Return last internal
+ if (at_least_one_required) {
+ out << indent << "return std::make_pair(last_internal.value(), tot);\n";
+ } else {
+ out << indent << "if (last_internal.has_value())\n"
+ << indent
+ << " return std::make_pair(last_internal.value(), tot);\n"
+ << indent << "return std::make_pair(Internal::UNDEFINED, tot);\n";
+ }
+ break;
+ case ReturnType::kTokenAndSize:
+ std::cerr << "Unable to return token and size\n";
+ return false;
+ case ReturnType::kSize:
+ out << indent << "return tot;\n";
+ break;
+ }
+ return true;
+}
+
+void declare_matcher(std::ostream& out, grammar::Element const& element,
+ ReturnType return_type) {
+ switch (return_type) {
+ case ReturnType::kSize:
+ out << "[[nodiscard]]\n"
+ << "std::optional<size_t> match" << element.name
+ << "(std::string_view str);\n";
+ break;
+ case ReturnType::kTokenAndSize:
+ out << "[[nodiscard]]\n"
+ << "std::optional<std::pair<Token, size_t>> match" << element.name
+ << "(std::string_view str);\n";
+ break;
+ case ReturnType::kInternalAndSize:
+ out << "[[nodiscard]]\n"
+ << "std::optional<std::pair<Internal, size_t>> match" << element.name
+ << "(std::string_view str);\n";
+ break;
+ }
+}
+
+bool Generator::write_matcher(std::ostream& out,
+ grammar::Element const& element,
+ ReturnType return_type) {
+ ReturnType sub_return_type = return_type;
+ bool make_token = false;
+
+ switch (return_type) {
+ case ReturnType::kSize:
+ out << "[[nodiscard]]\n"
+ << "std::optional<size_t> TokenMatcher::match" << element.name
+ << "(std::string_view str) {\n";
+ break;
+ case ReturnType::kTokenAndSize:
+ out << "[[nodiscard]]\n"
+ << "std::optional<std::pair<Token, size_t>> TokenMatcher::match"
+ << element.name << "(std::string_view str) {\n";
+
+ if (specific_tokens_.contains(element.name)) {
+ sub_return_type = ReturnType::kSize;
+ make_token = true;
+ }
+ break;
+ case ReturnType::kInternalAndSize:
+ out << "[[nodiscard]]\n"
+ << "std::optional<std::pair<TokenMatcher::Internal, size_t>> "
+ "TokenMatcher::match"
+ << element.name << "(std::string_view str) {\n";
+ break;
+ }
+
+ if (element.definitions.size() == 1) {
+ if (make_token) {
+ out << " auto ret = [this, str]() -> std::optional<size_t> {\n";
+ if (!write_matcher(out, element.definitions[0], sub_return_type,
+ " ")) {
+ std::cerr << "Error in " << element.name << "\n";
+ return false;
+ }
+ out << " }();\n"
+ << " return ret.transform([](auto size) {\n"
+ << " return std::make_pair(Token::k" << element.name
+ << ", size); });\n";
+ } else {
+ if (!write_matcher(out, element.definitions[0], sub_return_type, " ")) {
+ std::cerr << "Error in " << element.name << "\n";
+ return false;
+ }
+ }
+ } else if (std::ranges::all_of(
+ element.definitions, [](auto const& definition) {
+ return definition.symbols.size() == 1 &&
+ definition.symbols[0].optional ==
+ grammar::Symbol::Optional::kRequired &&
+ definition.symbols[0].type ==
+ grammar::Symbol::Type::kTerminal;
+ })) {
+ if (std::ranges::all_of(element.definitions, [](auto const& definition) {
+ return definition.symbols[0].value.size() == 1;
+ })) {
+ out << " if (!str.empty()) {\n"
+ << " switch (str.front()) {\n";
+ for (auto const& definition : element.definitions) {
+ out << " case '" << definition.symbols[0].value[0] << "':\n";
+ }
+ out << " return 1;\n"
+ << " default:\n"
+ << " break;\n"
+ << " }\n"
+ << " }\n"
+ << " return std::nullopt;\n";
+ } else {
+ auto builder = prefix_tree::builder();
+ for (auto const& definition : element.definitions) {
+ builder->add(definition.symbols[0].value);
+ }
+ auto tree = builder->build();
+ if (!tree.has_value()) {
+ std::cerr << "To large prefix tree\n";
+ return false;
+ }
+ out << " static const auto tree = ";
+ quote(out, tree.value()) << "sv;\n";
+ out << " return prefix_tree::lookup(tree, str)";
+ if (make_token) {
+ out << ".transform([](auto size) {\n"
+ << " return std::make_pair(Token::k" << element.name
+ << ", size); })";
+ }
+ out << ";\n";
+ }
+ } else {
+ bool first = true;
+ std::string_view ret_type;
+ switch (sub_return_type) {
+ case ReturnType::kTokenAndSize:
+ ret_type = "std::optional<std::pair<Token, size_t>>";
+ break;
+ case ReturnType::kInternalAndSize:
+ ret_type = "std::optional<std::pair<Internal, size_t>>";
+ break;
+ case ReturnType::kSize:
+ ret_type = "std::optional<size_t>";
+ break;
+ }
+ for (auto const& definition : element.definitions) {
+ if (first) {
+ first = false;
+ out << " auto tmp = [this, str]() -> " << ret_type << " {\n";
+ if (!write_matcher(out, definition, sub_return_type, " ")) {
+ std::cerr << "Error in " << element.name << "\n";
+ return false;
+ }
+ out << " }();\n";
+ out << " auto ret = tmp;\n";
+ } else {
+ out << " tmp = [this, str]() -> " << ret_type << " {\n";
+ if (!write_matcher(out, definition, sub_return_type, " ")) {
+ std::cerr << "Error in " << element.name << "\n";
+ return false;
+ }
+ out << " }();\n"
+ << " if (tmp.has_value()) {\n";
+ if (sub_return_type == ReturnType::kTokenAndSize) {
+ out << " if (!ret.has_value() || ret.value().second < "
+ "tmp.value().second) {\n";
+ } else {
+ out << " if (!ret.has_value() || ret.value() < tmp.value()) {\n";
+ }
+ out << " ret = tmp;\n"
+ << " }\n"
+ << " }\n";
+ }
+ }
+ if (make_token) {
+ out << " return ret.transform([](auto size) {\n"
+ << " return std::make_pair(Token::k" << element.name
+ << ", size); });\n";
+ } else {
+ out << " return ret;\n";
+ }
+ }
+
+ out << "}\n"
+ << "\n";
+ return true;
+}
+
+bool Generator::generate(std::string_view header_name,
+ std::string_view source_name, std::string const& ns,
+ std::string const& unicode_version,
+ grammar::Grammar& grammar) {
+ std::fstream header{std::string(header_name),
+ std::fstream::trunc | std::fstream::out};
+ std::fstream source{std::string(source_name),
+ std::fstream::trunc | std::fstream::out};
+
+ auto header_guard = make_define(header_name);
+
+ header << "#ifndef " << header_guard << "\n"
+ << "#define " << header_guard << "\n"
+ << "\n"
+ << "#include \"prefix_tree.hh\"\n"
+ << "\n"
+ << "#include <cstddef>\n"
+ << "#include <cstdint>\n"
+ << "#include <optional>\n"
+ << "#include <string_view>\n"
+ << "#include <utility>\n"
+ << "\n"
+ << "namespace " << ns << " {\n"
+ << "\n";
+
+ find_specific_elements(grammar.root());
+
+ find_all_elements(grammar.root());
+
+ for (auto& element : all_elements_) {
+ check_need_last(*element);
+ }
+
+ header << "enum class Token : "
+ << (specific_tokens_.size() < 256 ? "uint8_t" : "uint16_t") << " {\n";
+ for (auto const& token : specific_tokens_) {
+ header << " k" << token << ",\n";
+ }
+ header << "};\n";
+
+ header << "\n"
+ << "class TokenMatcher {\n"
+ << " public:\n"
+ << " [[nodiscard]]\n"
+ << " std::optional<std::pair<Token, size_t>>"
+ << " matchNext(std::string_view str);\n"
+ << "\n"
+ << " bool line_end_reached_{false};\n"
+ << "\n"
+ << " private:\n";
+
+ declare_character_class_matchers(header);
+
+ header << "\n"
+ << "enum class Internal : "
+ << (all_elements_.size() < 256 ? "uint8_t" : "uint16_t") << " {\n"
+ << " UNDEFINED,\n";
+ for (auto* element : all_elements_) {
+ header << " k" << element->name << ",\n";
+ }
+ header << "};\n"
+ << "\n";
+
+ for (auto* element : all_elements_) {
+ declare_matcher(header, *element, get_return_type(*element));
+ }
+
+ header << "\n"
+ << "};\n"
+ << "\n"
+ << "} // " << ns << "\n"
+ << "\n"
+ << "#endif // " << header_guard << "\n";
+
+ source << "#include \"" << header_name << "\"\n"
+ << "\n"
+ << "#include \"prefix_tree.hh\"\n"
+ << "#include \"u.hh\"\n"
+ << "#include \"u8.hh\"\n"
+ << "\n"
+ << "#include <cstddef>\n"
+ << "#include <optional>\n"
+ << "#include <string_view>\n"
+ << "#include <utility>\n"
+ << "\n"
+ << "using namespace std::literals::string_view_literals;\n"
+ << "\n"
+ << "// NOLINTBEGIN(readability-else-after-return, "
+ "readability-convert-member-functions-to-static)\n"
+ << "#pragma GCC diagnostic push\n"
+ << "#pragma GCC diagnostic ignored \"-Wunused-lambda-capture\"\n"
+ << "\n"
+ << "namespace " << ns << " {\n"
+ << "\n";
+
+ write_character_class_matchers(source, unicode_version);
+
+ source << "\n";
+
+ if (std::ranges::any_of(all_elements_, [this, &source](auto* element) {
+ auto sts = get_return_type(*element);
+ return !write_matcher(source, *element, sts);
+ })) {
+ return false;
+ }
+
+ source << "\n"
+ << "std::optional<std::pair<Token, size_t>>"
+ << "TokenMatcher::matchNext(std::string_view str) {\n"
+ << " return match" << grammar.root().name << "(str);\n"
+ << "}"
+ << "\n"
+ << "} // namespace " << ns << "\n"
+ << "\n"
+ << "// NOLINTEND(readability-else-after-return, "
+ "readability-convert-member-functions-to-static)\n"
+ << "#pragma GCC diagnostic pop\n"
+ << "\n";
+
+ return true;
+}
+
+} // namespace
+
+int main(int argc, char** argv) {
+ auto args = Args::create();
+ auto opt_help = args->option('h', "help", "display this text and exit");
+ auto opt_ns = args->option_argument('\0', "namespace", "ARG",
+ "Namespace for tokenizer");
+ auto opt_unicode =
+ args->option_argument('u', "unicode", "ARG", "Unicode version");
+ std::vector<std::string_view> arguments;
+ if (!args->run(argc, argv, &arguments)) {
+ args->print_error(std::cerr);
+ std::cerr << "Try `gen_tokens --help` for usage\n";
+ return 1;
+ }
+ if (opt_help->is_set()) {
+ std::cout << "Usage: `gen_tokens [OPTIONS...] tokens.grammar"
+ << " OUTPUT.hh OUTPUT.cc`\n"
+ << "Generates a tokenizer for grammar.\n"
+ << "\n";
+ args->print_help(std::cout);
+ return 0;
+ }
+ if (!opt_ns->is_set()) {
+ std::cerr << "No namespace given.\n"
+ << "Try `gen_tokens --help` for usage\n";
+ return 1;
+ }
+ if (!opt_unicode->is_set()) {
+ std::cerr << "No unicode version given.\n"
+ << "Try `gen_tokens --help` for usage\n";
+ return 1;
+ }
+ auto ns = opt_ns->argument();
+ auto unicode = opt_unicode->argument();
+ if (arguments.size() != 3) {
+ std::cerr << "Expecting three arguments. No more, no less.\n"
+ << "Try `gen_tokens --help` for usage\n";
+ return 1;
+ }
+
+ auto filename = std::string(arguments[0]);
+ auto reader = io::open(filename);
+ if (!reader.has_value()) {
+ std::cerr << "Unable to open " << filename << '\n';
+ return 1;
+ }
+ auto errors = src::file_errors(std::move(filename));
+ auto grammar =
+ grammar::load(std::move(reader.value()), kCharacterClassNames, *errors);
+ if (!grammar || errors->errors() > 0)
+ return 1;
+
+ Generator generator;
+ if (!generator.generate(arguments[1], arguments[2], ns, unicode, *grammar))
+ return 1;
+ return 0;
+}
diff --git a/src/grammar.cc b/src/grammar.cc
new file mode 100644
index 0000000..25c4d64
--- /dev/null
+++ b/src/grammar.cc
@@ -0,0 +1,268 @@
+#include "grammar.hh"
+
+#include "errors.hh"
+#include "line.hh"
+#include "location.hh"
+#include "str.hh"
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <format>
+#include <functional>
+#include <map>
+#include <memory>
+#include <optional>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+namespace grammar {
+
+namespace {
+
+class GrammarImpl : public Grammar {
+ public:
+ explicit GrammarImpl(std::vector<std::unique_ptr<Element>> elements)
+ : elements_(std::move(elements)) {}
+
+ [[nodiscard]]
+ Element const& root() const override {
+ return *elements_[0];
+ }
+
+ private:
+ std::vector<std::unique_ptr<Element>> elements_;
+};
+
+struct FirstPassElement {
+ src::Location loc;
+ std::vector<std::string> definitions;
+
+ explicit FirstPassElement(src::Location loc) : loc(loc) {}
+};
+
+class GrammarLoader {
+ public:
+ GrammarLoader(std::unique_ptr<io::Reader> reader,
+ std::vector<std::string> const& character_classes,
+ src::Errors& errors)
+ : reader_(line::open(std::move(reader))),
+ character_classes_(character_classes),
+ errors_(errors) {}
+
+ std::unique_ptr<Grammar> load() {
+ // Read whole file in a first pass, before parsing definitions.
+ std::map<std::string, FirstPassElement> first_pass_elements;
+ FirstPassElement* last_element = nullptr;
+
+ std::map<std::string_view, uint8_t, std::less<>> cc_lookup;
+ for (uint8_t i = 0; i < static_cast<uint8_t>(character_classes_.size());
+ ++i) {
+ cc_lookup.emplace(character_classes_[i], i);
+ }
+
+ while (true) {
+ auto line = reader_->read();
+ auto loc = src::Location(reader_->number(), 0);
+ if (!line.has_value()) {
+ if (line.error() == io::ReadError::Eof)
+ break;
+ errors_.err(loc, "Error reading");
+ return nullptr;
+ }
+ if (line.value().empty() || line.value().front() == '#')
+ continue;
+ if (line.value().front() == ' ') {
+ // Continue on last element
+ size_t i = 1;
+ while (i < line.value().size() && line.value()[i] == ' ')
+ ++i;
+ if (i == line.value().size()) {
+ errors_.err(loc, "Unexpected line, only spaces");
+ continue;
+ }
+ if (last_element == nullptr) {
+ errors_.err(loc, "Expected element before indented lines");
+ continue;
+ }
+ last_element->definitions.emplace_back(line.value().substr(i));
+ } else {
+ // New element
+ if (line.value().back() != ':') {
+ errors_.err(
+ loc,
+ "Unexpected line, not indented but also not ending with a ':'");
+ continue;
+ }
+
+ auto name = line.value().substr(0, line.value().size() - 1);
+ if (cc_lookup.contains(name)) {
+ errors_.warn(
+ loc, std::format("Element {} overrides character class", name));
+ }
+ auto pair = first_pass_elements.emplace(name, FirstPassElement(loc));
+ if (!pair.second) {
+ errors_.err(loc, std::format("Duplicate element {}", name));
+ }
+ last_element = &pair.first->second;
+ }
+ }
+
+ if (first_pass_elements.empty()) {
+ errors_.err(src::Location(reader_->number(), 0), "No elements found");
+ return nullptr;
+ }
+
+ std::vector<std::unique_ptr<Element>> second_pass_elements;
+ std::map<std::string_view, size_t, std::less<>> second_pass_lookup;
+ for (auto const& pair : first_pass_elements) {
+ auto element = std::make_unique<Element>();
+ element->name = pair.first;
+ second_pass_lookup.emplace(element->name, second_pass_elements.size());
+ second_pass_elements.emplace_back(std::move(element));
+ }
+
+ auto it = second_pass_elements.begin();
+ for (auto const& pair : first_pass_elements) {
+ auto const& element = *it++;
+ if (pair.second.definitions.empty()) {
+ errors_.err(pair.second.loc,
+ std::format("No definitions for {}", pair.first));
+ continue;
+ }
+ std::vector<std::string_view> in_symbols;
+ for (auto const& in_definition : pair.second.definitions) {
+ str::split(in_definition, in_symbols);
+
+ std::vector<Symbol> out_symbols;
+ bool exclude = false;
+ bool expect_not = false;
+ for (auto in_symbol : in_symbols) {
+ Symbol out_symbol;
+ if (exclude) {
+ if (in_symbol == "or")
+ continue;
+ out_symbol.optional = Symbol::Optional::kExcluded;
+ } else if (expect_not) {
+ expect_not = false;
+ if (in_symbol == "not") {
+ exclude = true;
+ } else {
+ errors_.err(pair.second.loc, "but is not followed by not");
+ }
+ continue;
+ }
+ if (in_symbol == "but") {
+ expect_not = true;
+ continue;
+ }
+ if (in_symbol.front() == '{' && in_symbol.back() == '}') {
+ if (exclude) {
+ errors_.err(pair.second.loc,
+ "Optional and exclude doesn't work together");
+ } else {
+ out_symbol.optional = Symbol::Optional::kZeroOrMore;
+ }
+ in_symbol = in_symbol.substr(1, in_symbol.size() - 2);
+ } else if (in_symbol.front() == '[' && in_symbol.back() == ']') {
+ if (exclude) {
+ errors_.err(pair.second.loc,
+ "Optional and exclude doesn't work together");
+ } else {
+ out_symbol.optional = Symbol::Optional::kZeroOrOne;
+ }
+ in_symbol = in_symbol.substr(1, in_symbol.size() - 2);
+ }
+ auto it2 = second_pass_lookup.find(in_symbol);
+ if (it2 != second_pass_lookup.end()) {
+ out_symbol.type = Symbol::Type::kNonTerminal;
+ out_symbol.element = second_pass_elements[it2->second].get();
+ } else {
+ auto it3 = cc_lookup.find(in_symbol);
+ if (it3 != cc_lookup.end()) {
+ out_symbol.type = Symbol::Type::kCharacterClass;
+ out_symbol.char_class = it3->second;
+ } else {
+ out_symbol.type = Symbol::Type::kTerminal;
+ out_symbol.value = in_symbol;
+ }
+ }
+ out_symbols.emplace_back(std::move(out_symbol));
+ }
+
+ if (expect_not) {
+ errors_.err(pair.second.loc, "but is not followed by not");
+ }
+
+ if (out_symbols.empty()) {
+ errors_.err(pair.second.loc, "no symbols found in definition");
+ continue;
+ }
+
+ element->definitions.emplace_back(
+ Definition{.symbols = std::move(out_symbols)});
+ }
+ }
+
+ // Find root and move it first (if needed)
+ std::vector<size_t> used(second_pass_elements.size(), 0);
+ for (auto const& element : second_pass_elements) {
+ for (auto const& definition : element->definitions) {
+ for (auto const& symbol : definition.symbols) {
+ switch (symbol.type) {
+ case Symbol::Type::kTerminal:
+ case Symbol::Type::kCharacterClass:
+ break;
+ case Symbol::Type::kNonTerminal:
+ used[second_pass_lookup.find(symbol.element->name)->second]++;
+ break;
+ }
+ }
+ }
+ }
+
+ std::optional<size_t> root_index;
+ for (size_t i = 0; i < used.size(); ++i) {
+ if (used[i] == 0) {
+ if (root_index.has_value()) {
+ errors_.warn(first_pass_elements.find(second_pass_elements[i]->name)
+ ->second.loc,
+ "Is not referenced but also not root");
+ } else {
+ root_index = i;
+ }
+ }
+ }
+
+ if (root_index.has_value()) {
+ if (root_index.value() != 0) {
+ std::swap(second_pass_elements[0],
+ second_pass_elements[root_index.value()]);
+ }
+ } else {
+ errors_.err(
+ first_pass_elements.find(second_pass_elements[0]->name)->second.loc,
+ "No root element found");
+ }
+
+ return std::make_unique<GrammarImpl>(std::move(second_pass_elements));
+ }
+
+ private:
+ std::unique_ptr<line::Reader> reader_;
+ std::vector<std::string> const& character_classes_;
+ src::Errors& errors_;
+};
+
+} // namespace
+
+std::unique_ptr<Grammar> load(std::unique_ptr<io::Reader> reader,
+ std::vector<std::string> const& character_classes,
+ src::Errors& errors) {
+ GrammarLoader loader(std::move(reader), character_classes, errors);
+ return loader.load();
+}
+
+} // namespace grammar
diff --git a/src/grammar.hh b/src/grammar.hh
new file mode 100644
index 0000000..13beec4
--- /dev/null
+++ b/src/grammar.hh
@@ -0,0 +1,80 @@
+#ifndef GRAMMAR_HH
+#define GRAMMAR_HH
+
+#include "errors.hh"
+#include "io.hh"
+
+#include <cstdint>
+#include <span>
+#include <string>
+#include <vector>
+
+namespace grammar {
+
+struct Element;
+
+struct Symbol {
+ enum class Type : uint8_t {
+ // value == terminal, as UTF-8
+ kTerminal,
+
+ // element != nullptr
+ kNonTerminal,
+
+ // char_class != 0
+ kCharacterClass,
+ };
+
+ Type type;
+
+ enum class Optional : uint8_t {
+ // Symbol is NOT optional.
+ kRequired,
+ // Symbol is optional.
+ kZeroOrOne,
+ // Symbol is optional and can repeat.
+ kZeroOrMore,
+ // Symbol should be excluded from previous symbol.
+ // Example: InputCharacter but not * or /
+ kExcluded,
+ };
+
+ Optional optional{Optional::kRequired};
+
+ uint8_t char_class{0};
+
+ Element const* element{nullptr};
+ std::string value;
+};
+
+struct Definition {
+ std::vector<Symbol> symbols;
+};
+
+struct Element {
+ std::string name;
+
+ std::vector<Definition> definitions;
+};
+
+class Grammar {
+ public:
+ virtual ~Grammar() = default;
+
+ [[nodiscard]]
+ virtual Element const& root() const = 0;
+
+ protected:
+ Grammar() = default;
+
+ Grammar(Grammar const&) = delete;
+ Grammar& operator=(Grammar const&) = delete;
+};
+
+std::unique_ptr<Grammar> load(std::unique_ptr<io::Reader> reader,
+ std::vector<std::string> const& character_classes,
+ src::Errors& errors);
+
+} // namespace grammar
+
+#endif // GRAMMAR_HH
diff --git a/src/java_tokens.cc b/src/java_tokens.cc
new file mode 100644
index 0000000..1ba40a3
--- /dev/null
+++ b/src/java_tokens.cc
@@ -0,0 +1,394 @@
+#include "java_tokens.hh"
+
+#include "errors.hh"
+#include "java_tokens_java-8.hh"
+#include "java_uescape.hh"
+#include "str.hh"
+#include "u8.hh"
+#include "uline.hh"
+
+#include <cassert>
+#include <charconv>
+#include <cstddef>
+#include <expected>
+#include <format>
+#include <limits>
+#include <memory>
+#include <optional>
+#include <string>
+#include <string_view>
+#include <system_error>
+#include <utility>
+
+namespace java {
+
+namespace {
+
+template <typename TokenMatcher, typename MatchToken>
+class TokensImpl : public Tokens {
+ public:
+ TokensImpl(std::unique_ptr<io::Reader> reader,
+ std::unique_ptr<src::Errors> errors, TokensConfig config)
+ : reader_(u8::line::open(u8::java::open(std::move(reader)))),
+ errors_(std::move(errors)),
+ config_(config) {}
+
+ std::expected<Token, io::ReadError> read() override {
+ while (true) {
+ while (line_.empty()) {
+ auto maybe_line = reader_->read();
+ if (!maybe_line.has_value())
+ return std::unexpected(maybe_line.error());
+ line_ = maybe_line.value();
+ location_.line = reader_->number();
+ location_.column = 0;
+ }
+
+ Token token;
+ token.loc = location_;
+
+ auto maybe_token_pair = matcher_.matchNext(line_);
+
+ while (matcher_.line_end_reached_) {
+ // Matcher wants more lines.
+ matcher_.line_end_reached_ = false;
+
+ bool got_any = false;
+ line_tmp_ = line_;
+ line_tmp_.push_back('\n');
+ while (true) {
+ auto maybe_line = reader_->read();
+ if (!maybe_line.has_value())
+ break;
+ line_tmp_.append(maybe_line.value());
+ got_any = true;
+ // Simple check, it might not actually end the comment but if so tokenizer will complain
+ // about reaching line_end again.
+ if (maybe_line->contains("*/"))
+ break;
+ line_tmp_.push_back('\n');
+ }
+ line_ = line_tmp_;
+ maybe_token_pair = matcher_.matchNext(line_);
+
+ if (!got_any)
+ break;
+ }
+
+ if (maybe_token_pair.has_value()) {
+ token.str = line_.substr(0, maybe_token_pair->second);
+ location_.column += u8len(token.str);
+ line_ = line_.substr(maybe_token_pair->second);
+ switch (maybe_token_pair->first) {
+ case MatchToken::kBinaryIntegerLiteral:
+ handle_int_literal(token, /* base */ 2);
+ break;
+ case MatchToken::kBooleanLiteral:
+ token.type = Token::Type::kLiteralBoolean;
+ token.int_value = token.str != "false";
+ break;
+ case MatchToken::kCharacterLiteral:
+ token.type = Token::Type::kLiteralCharacter;
+ if (token.str[1] == '\\') {
+ token.int_value =
+ unescape(token.str.substr(1, token.str.size() - 2)).first;
+ } else {
+ auto* ptr =
+ reinterpret_cast<uint8_t const*>(token.str.data() + 1);
+ auto* end = ptr + token.str.size() - 2;
+ token.int_value = u8::read(ptr, end).value();
+ }
+ break;
+ case MatchToken::kDecimalFloatingPointLiteral:
+ handle_float_literal(token);
+ break;
+ case MatchToken::kDecimalIntegerLiteral:
+ handle_int_literal(token);
+ break;
+ case MatchToken::kEndOfLineComment:
+ token.type = Token::Type::kComment;
+ token.str = str::trim(token.str.substr(2));
+ break;
+ case MatchToken::kHexIntegerLiteral:
+ handle_int_literal(token, /* base */ 16);
+ break;
+ case MatchToken::kHexadecimalFloatingPointLiteral:
+ handle_float_literal(token, /* base */ 16);
+ break;
+ case MatchToken::kIdentifier:
+ token.type = Token::Type::kIdentifier;
+ break;
+ case MatchToken::kKeyword:
+ token.type = Token::Type::kKeyword;
+ break;
+ case MatchToken::kNullLiteral:
+ token.type = Token::Type::kLiteralNull;
+ break;
+ case MatchToken::kOctalIntegerLiteral:
+ handle_int_literal(token, /* base */ 8);
+ break;
+ case MatchToken::kOperator:
+ token.type = Token::Type::kOperator;
+ break;
+ case MatchToken::kSeparator:
+ token.type = Token::Type::kSeparator;
+ break;
+ case MatchToken::kStringLiteral:
+ token.type = Token::Type::kLiteralString;
+ token.str =
+ unescape_if_needed(token.str.substr(1, token.str.size() - 2));
+ break;
+ case MatchToken::kTraditionalComment: {
+ token.type = Token::Type::kComment;
+ size_t s = 2;
+ while (s < token.str.size() && token.str[s] == '*')
+ ++s;
+ token.str =
+ str::trim(token.str.substr(s, token.str.size() - 2 - s));
+ token.int_value = static_cast<int64_t>(s - 1);
+ break;
+ }
+ case MatchToken::kWhiteSpace:
+ continue;
+ }
+ } else {
+ errors_->err(location_, std::format("Invalid token: {}", line_));
+ token.type = Token::Type::kError;
+ token.str = line_;
+ }
+ return token;
+ }
+ }
+
+ private:
+ void handle_int_literal_error(Token& token, std::string_view str,
+ std::errc err, int base) {
+ if (err == std::errc::result_out_of_range) {
+ // Java assumes two completent (so 0xffff_ffff is -1) and also, negative literals
+ // are read as positive (because the operator '-' is a separate token)
+ uint64_t tmp;
+ auto ret =
+ std::from_chars(str.data(), str.data() + str.size(), tmp, base);
+ if (ret.ec == std::errc()) {
+ token.type = ret.ptr < str.data() + str.size()
+ ? Token::Type::kLiteralLong
+ : Token::Type::kLiteralInt;
+ token.int_value = static_cast<int64_t>(tmp);
+ return;
+ }
+ }
+ errors_->err(location_,
+ std::format("Invalid integer literal: {}", token.str));
+ token.type = Token::Type::kError;
+ }
+
+ void handle_int_literal(Token& token, int base = 10) {
+ size_t prefix;
+ switch (base) {
+ case 16: // 0x
+ case 2: // 0b
+ prefix = 2;
+ break;
+ case 8: // 0
+ prefix = 1;
+ break;
+ default:
+ prefix = 0;
+ break;
+ }
+ std::optional<char> suffix;
+ if (token.str.find('_') == std::string_view::npos) {
+ auto ret = std::from_chars(token.str.data() + prefix,
+ token.str.data() + token.str.size(),
+ token.int_value, base);
+ if (ret.ec != std::errc()) {
+ handle_int_literal_error(token, token.str.substr(prefix), ret.ec, base);
+ return;
+ }
+ if (ret.ptr < token.str.data() + token.str.size())
+ suffix = *ret.ptr;
+ } else {
+ std::string tmp;
+ tmp.reserve(token.str.size() - prefix);
+ for (size_t i = prefix; i < token.str.size(); ++i) {
+ if (token.str[i] != '_') {
+ tmp.push_back(token.str[i]);
+ }
+ }
+ auto ret = std::from_chars(tmp.data(), tmp.data() + tmp.size(),
+ token.int_value, base);
+ if (ret.ec != std::errc()) {
+ handle_int_literal_error(token, tmp, ret.ec, base);
+ return;
+ }
+ if (ret.ptr < tmp.data() + tmp.size())
+ suffix = *ret.ptr;
+ }
+ if (suffix.has_value() &&
+ (suffix.value() == 'l' || suffix.value() == 'L')) {
+ token.type = Token::Type::kLiteralLong;
+ } else {
+ if (base == 10 &&
+ token.int_value >
+ static_cast<int64_t>(1) + std::numeric_limits<int32_t>::max()) {
+ errors_->err(location_,
+ std::format("Invalid integer literal: {}", token.str));
+ token.type = Token::Type::kError;
+ return;
+ }
+ if (std::cmp_greater(token.int_value,
+ std::numeric_limits<uint32_t>::max())) {
+ errors_->err(location_,
+ std::format("Invalid integer literal: {}", token.str));
+ token.type = Token::Type::kError;
+ return;
+ }
+ token.type = Token::Type::kLiteralInt;
+ token.int_value = static_cast<int32_t>(token.int_value);
+ }
+ }
+
+ void handle_float_literal(Token& token, int base = 10) {
+ size_t prefix;
+ std::chars_format fmt;
+ switch (base) {
+ case 16: // 0x
+ fmt = std::chars_format::general | std::chars_format::hex;
+ prefix = 2;
+ break;
+ default:
+ fmt = std::chars_format::general;
+ prefix = 0;
+ break;
+ }
+
+ std::from_chars_result ret;
+ if (token.str.ends_with("f") || token.str.ends_with("F")) {
+ // float and double do not parse exactly the same, so use a float parser for float.
+ float tmp;
+ ret = std::from_chars(token.str.data() + prefix,
+ token.str.data() + token.str.size(), tmp, fmt);
+ token.type = Token::Type::kLiteralFloatingPoint;
+ token.float_value = tmp;
+ } else {
+ ret = std::from_chars(token.str.data() + prefix,
+ token.str.data() + token.str.size(),
+ token.float_value, fmt);
+ token.type = Token::Type::kLiteralDoubleFloatingPoint;
+ }
+
+ if (ret.ec != std::errc()) {
+ // Java allows 0 with just a suffix, std::from_chars does not.
+ if (token.str == "0f" || token.str == "0F") {
+ token.type = Token::Type::kLiteralFloatingPoint;
+ token.float_value = 0;
+ return;
+ }
+ if (token.str == "0d" || token.str == "0D") {
+ token.type = Token::Type::kLiteralDoubleFloatingPoint;
+ token.float_value = 0;
+ return;
+ }
+ errors_->err(location_,
+ std::format("Invalid float literal: {}", token.str));
+ token.type = Token::Type::kError;
+ }
+ }
+
+ std::string_view unescape_if_needed(std::string_view str) {
+ auto back_slash = str.find('\\');
+ if (back_slash == std::string_view::npos)
+ return str;
+ unescape_tmp_.clear();
+ unescape_tmp_.reserve(str.size());
+ size_t last = 0;
+ uint8_t tmp[4];
+ while (true) {
+ unescape_tmp_.append(str, last, back_slash - last);
+ auto ret = unescape(str.substr(back_slash));
+ auto* ptr = tmp;
+ u8::write(ptr, tmp + sizeof(tmp), ret.first);
+ unescape_tmp_.append(
+ std::string_view(reinterpret_cast<char*>(tmp), ptr - tmp));
+ last = back_slash + ret.second;
+ back_slash = str.find('\\', last);
+ if (back_slash == std::string::npos) {
+ unescape_tmp_.append(str, last);
+ break;
+ }
+ }
+ return unescape_tmp_;
+ }
+
+ // Strings coming here have already been validated by the tokenizer
+ static std::pair<uint16_t, size_t> unescape(std::string_view in) {
+ assert(in.front() == '\\');
+ assert(in.size() > 1);
+ switch (in[1]) {
+ case 'b':
+ return std::make_pair(8, 2);
+ case 't':
+ return std::make_pair(9, 2);
+ case 'n':
+ return std::make_pair(10, 2);
+ case 'f':
+ return std::make_pair(12, 2);
+ case 'r':
+ return std::make_pair(13, 2);
+ case '"':
+ return std::make_pair(34, 2);
+ case '\'':
+ return std::make_pair(39, 2);
+ case '\\':
+ return std::make_pair(92, 2);
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7': {
+ uint8_t tmp;
+ auto ret = std::from_chars(in.data() + 1, in.data() + in.size(), tmp,
+ /* base */ 8);
+ return std::make_pair(tmp, ret.ptr - in.data());
+ }
+ default:
+ std::unreachable();
+ }
+ }
+
+ static size_t u8len(std::string_view str) {
+ auto* ptr = reinterpret_cast<uint8_t const*>(str.data());
+ auto* const end = ptr + str.size();
+ size_t count = 0;
+ while (u8::skip(ptr, end))
+ ++count;
+ return count;
+ }
+
+ std::unique_ptr<u8::line::Reader> reader_;
+ std::unique_ptr<src::Errors> errors_;
+ TokensConfig const config_;
+ TokenMatcher matcher_;
+ std::string_view line_;
+ std::string line_tmp_;
+ Location location_;
+ std::string unescape_tmp_;
+};
+
+} // namespace
+
+std::unique_ptr<Tokens> open(std::unique_ptr<io::Reader> reader,
+ std::unique_ptr<src::Errors> errors,
+ TokensConfig config) {
+ switch (config.version) {
+ case Version::kJava8:
+ return std::make_unique<TokensImpl<java_8::TokenMatcher, java_8::Token>>(
+ std::move(reader), std::move(errors), config);
+ }
+ std::unreachable();
+}
+
+} // namespace java
diff --git a/src/java_tokens.hh b/src/java_tokens.hh
new file mode 100644
index 0000000..6fbefcb
--- /dev/null
+++ b/src/java_tokens.hh
@@ -0,0 +1,92 @@
+#ifndef JAVA_TOKENS_HH
+#define JAVA_TOKENS_HH
+
+#include "errors.hh"
+#include "io.hh"
+#include "java_version.hh" // IWYU pragma: export
+#include "location.hh"
+
+#include <expected>
+#include <memory>
+#include <string_view>
+
+namespace java {
+
+using src::Location;
+
+struct Token {
+ enum class Type : uint8_t {
+ // str is content of comment, excluding heading or trailing slash and star.
+ // int_value is number of stars at head, 0 for single line, 1 for /* and
+ // 2 for /** and so on.
+ kComment,
+
+ // str is identifier
+ kIdentifier,
+
+ // str is keyword, int_value is Keyword index
+ kKeyword,
+
+ // str is separator, int_value is Separator index
+ kSeparator,
+
+ // str is operator, int_value is Operator index
+ kOperator,
+
+ // int_value is literal value
+ kLiteralInt,
+
+ // int_value is literal value
+ kLiteralLong,
+
+ // str is literal value
+ kLiteralString,
+
+ // int_value is literal value as unicode code-point
+ kLiteralCharacter,
+
+ kLiteralNull,
+
+ // float_value is literal value
+ kLiteralFloatingPoint,
+
+ // float_value is literal value
+ kLiteralDoubleFloatingPoint,
+
+ // int_value is literal value, 0 = false, anything else = true
+ kLiteralBoolean,
+
+ kError,
+ };
+
+ Type type;
+ Location loc;
+ std::string_view str;
+ int64_t int_value{0};
+ double float_value{0};
+};
+
+struct TokensConfig {
+ // Source version of Java file
+ Version version = Version::kMax;
+};
+
+class Tokens {
+ public:
+ virtual ~Tokens() = default;
+
+ virtual std::expected<Token, io::ReadError> read() = 0;
+
+ protected:
+ Tokens() = default;
+ Tokens(Tokens const&) = delete;
+ Tokens& operator=(Tokens const&) = delete;
+};
+
+[[nodiscard]] std::unique_ptr<Tokens> open(std::unique_ptr<io::Reader> reader,
+ std::unique_ptr<src::Errors>,
+ TokensConfig config = {});
+
+} // namespace java
+
+#endif // JAVA_TOKENS_HH
diff --git a/src/java_version.hh b/src/java_version.hh
new file mode 100644
index 0000000..444ae36
--- /dev/null
+++ b/src/java_version.hh
@@ -0,0 +1,16 @@
+#ifndef JAVA_VERSION_HH
+#define JAVA_VERSION_HH
+
+#include <cstdint>
+
+namespace java {
+
+enum class Version : uint8_t {
+ kJava8 = 8,
+
+ kMax = kJava8,
+};
+
+} // namespace java
+
+#endif // JAVA_VERSION_HH
diff --git a/src/location.cc b/src/location.cc
new file mode 100644
index 0000000..3fa1075
--- /dev/null
+++ b/src/location.cc
@@ -0,0 +1,12 @@
+#include "location.hh"
+
+#include <ostream>
+
+namespace src {
+
+std::ostream& operator<<(std::ostream& out, Location const& loc) {
+ out << loc.line << ':' << loc.column;
+ return out;
+}
+
+} // namespace src
diff --git a/src/location.hh b/src/location.hh
new file mode 100644
index 0000000..1a210cb
--- /dev/null
+++ b/src/location.hh
@@ -0,0 +1,31 @@
+#ifndef LOCATION_HH
+#define LOCATION_HH
+
+#include <compare>
+#include <cstdint>
+#include <iosfwd>
+
+namespace src {
+
+struct Location {
+ uint64_t line;
+ uint16_t column;
+
+ constexpr Location() : line(0), column(0) {}
+
+ Location(uint64_t line, uint16_t column) : line(line), column(column) {}
+
+ [[nodiscard]]
+ std::strong_ordering operator<=>(Location const& loc) const {
+ auto ret = line <=> loc.line;
+ if (ret == std::strong_ordering::equal)
+ return column <=> loc.column;
+ return ret;
+ }
+};
+
+std::ostream& operator<<(std::ostream& out, Location const& loc);
+
+} // namespace src
+
+#endif // LOCATION_HH